1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
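/*
 * Worked example: for a 48-bit guest address width,
 * __DOMAIN_MAX_PFN(48) = (1ULL << 36) - 1.  On a 64-bit kernel
 * DOMAIN_MAX_PFN(48) is the same value; on a 32-bit kernel it is
 * clamped to ULONG_MAX so that PFNs still fit in an unsigned long.
 */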
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
89 * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is a power-of-two number of 4KiB pages and that the
96 * mapping has natural alignment.
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are a power-of-two multiple of 4KiB.
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
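/*
 * ~0xFFFUL sets every bit from 12 upwards, i.e. it advertises every
 * power-of-two size that is a multiple of 4KiB (4KiB, 8KiB, 16KiB, ...),
 * matching the behaviour described above.
 */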
106 static inline int agaw_to_level(int agaw)
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
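/*
 * Worked example: width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2,
 * and agaw_to_width(2) = min(30 + 2 * 9, 64) = 48.  Per the VT-d spec
 * an agaw of 2 corresponds to a 4-level page table (agaw_to_level()).
 */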
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(unsigned long pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 static inline unsigned long level_mask(int level)
133 return -1UL << level_to_offset_bits(level);
136 static inline unsigned long level_size(int level)
138 return 1UL << level_to_offset_bits(level);
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
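/*
 * With the 9-bit stride, level 1 covers a single 4KiB page, level 2
 * covers 1 << 9 = 512 pages (2MiB) and level 3 covers 1 << 18 pages
 * (1GiB); lvl_to_nr_pages() returns exactly these counts.
 */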
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
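/*
 * On x86 PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so the mm_to_dma_pfn()/
 * dma_to_mm_pfn() conversions above are identity operations; with a
 * larger MM page size one MM pfn would span several VT-d pfns.
 */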
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
178 * set to 1 to panic the kernel if VT-d can't be enabled successfully
179 * (used when the kernel is launched with TXT)
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
208 return re->hi & VTD_PAGE_MASK;
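/*
 * Rough field map used by the context-entry helpers below (legacy
 * mode, per the VT-d spec): lo[0] is the Present bit, lo[1] the
 * fault-processing-disable bit, lo[3:2] the translation type and
 * lo[63:12] the second-level page-table pointer; hi[2:0] is the
 * address width and hi[23:8] the domain id.  lo[11] and hi[3] are
 * used here as software markers for PASID enable and "copied from
 * the old kernel".
 */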
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
284 static inline void context_clear_entry(struct context_entry *context)
291 * This domain is a static identity mapping domain.
292 * 1. This domain creates a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu maps to this domain if successful.
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
303 * This is a DMA domain allocated through the iommu domain allocation
304 * interface. But one or more devices belonging to this domain have
305 * been chosen to use a private domain. We should avoid using the
306 * map/unmap/iova_to_phys APIs on it.
308 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
310 #define for_each_domain_iommu(idx, domain) \
311 for (idx = 0; idx < g_num_of_iommus; idx++) \
312 if (domain->iommu_refcnt[idx])
314 struct dmar_rmrr_unit {
315 struct list_head list; /* list of rmrr units */
316 struct acpi_dmar_header *hdr; /* ACPI header */
317 u64 base_address; /* reserved base address*/
318 u64 end_address; /* reserved end address */
319 struct dmar_dev_scope *devices; /* target devices */
320 int devices_cnt; /* target device count */
323 struct dmar_atsr_unit {
324 struct list_head list; /* list of ATSR units */
325 struct acpi_dmar_header *hdr; /* ACPI header */
326 struct dmar_dev_scope *devices; /* target devices */
327 int devices_cnt; /* target device count */
328 u8 include_all:1; /* include all ports */
331 static LIST_HEAD(dmar_atsr_units);
332 static LIST_HEAD(dmar_rmrr_units);
334 #define for_each_rmrr_units(rmrr) \
335 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
337 /* bitmap for indexing intel_iommus */
338 static int g_num_of_iommus;
340 static void domain_exit(struct dmar_domain *domain);
341 static void domain_remove_dev_info(struct dmar_domain *domain);
342 static void dmar_remove_one_dev_info(struct device *dev);
343 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
344 static void domain_context_clear(struct intel_iommu *iommu,
346 static int domain_detach_iommu(struct dmar_domain *domain,
347 struct intel_iommu *iommu);
348 static bool device_is_rmrr_locked(struct device *dev);
349 static int intel_iommu_attach_device(struct iommu_domain *domain,
351 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
354 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
355 int dmar_disabled = 0;
357 int dmar_disabled = 1;
358 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
361 int intel_iommu_enabled = 0;
362 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
364 static int dmar_map_gfx = 1;
365 static int dmar_forcedac;
366 static int intel_iommu_strict;
367 static int intel_iommu_superpage = 1;
368 static int iommu_identity_mapping;
369 static int intel_no_bounce;
371 #define IDENTMAP_ALL 1
372 #define IDENTMAP_GFX 2
373 #define IDENTMAP_AZALIA 4
375 int intel_iommu_gfx_mapped;
376 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
378 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
379 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
380 static DEFINE_SPINLOCK(device_domain_lock);
381 static LIST_HEAD(device_domain_list);
383 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
384 to_pci_dev(d)->untrusted)
387 * Iterate over elements in device_domain_list and call the specified
388 * callback @fn against each element.
390 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
391 void *data), void *data)
395 struct device_domain_info *info;
397 spin_lock_irqsave(&device_domain_lock, flags);
398 list_for_each_entry(info, &device_domain_list, global) {
399 ret = fn(info, data);
401 spin_unlock_irqrestore(&device_domain_lock, flags);
405 spin_unlock_irqrestore(&device_domain_lock, flags);
410 const struct iommu_ops intel_iommu_ops;
412 static bool translation_pre_enabled(struct intel_iommu *iommu)
414 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
419 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422 static void init_translation_status(struct intel_iommu *iommu)
426 gsts = readl(iommu->reg + DMAR_GSTS_REG);
427 if (gsts & DMA_GSTS_TES)
428 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
432 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
434 return container_of(dom, struct dmar_domain, domain);
437 static int __init intel_iommu_setup(char *str)
442 if (!strncmp(str, "on", 2)) {
444 pr_info("IOMMU enabled\n");
445 } else if (!strncmp(str, "off", 3)) {
447 no_platform_optin = 1;
448 pr_info("IOMMU disabled\n");
449 } else if (!strncmp(str, "igfx_off", 8)) {
451 pr_info("Disable GFX device mapping\n");
452 } else if (!strncmp(str, "forcedac", 8)) {
453 pr_info("Forcing DAC for PCI devices\n");
455 } else if (!strncmp(str, "strict", 6)) {
456 pr_info("Disable batched IOTLB flush\n");
457 intel_iommu_strict = 1;
458 } else if (!strncmp(str, "sp_off", 6)) {
459 pr_info("Disable supported super page\n");
460 intel_iommu_superpage = 0;
461 } else if (!strncmp(str, "sm_on", 5)) {
462 pr_info("Intel-IOMMU: scalable mode supported\n");
464 } else if (!strncmp(str, "tboot_noforce", 13)) {
466 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
467 intel_iommu_tboot_noforce = 1;
468 } else if (!strncmp(str, "nobounce", 8)) {
469 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
473 str += strcspn(str, ",");
479 __setup("intel_iommu=", intel_iommu_setup);
481 static struct kmem_cache *iommu_domain_cache;
482 static struct kmem_cache *iommu_devinfo_cache;
484 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
486 struct dmar_domain **domains;
489 domains = iommu->domains[idx];
493 return domains[did & 0xff];
496 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
497 struct dmar_domain *domain)
499 struct dmar_domain **domains;
502 if (!iommu->domains[idx]) {
503 size_t size = 256 * sizeof(struct dmar_domain *);
504 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
507 domains = iommu->domains[idx];
508 if (WARN_ON(!domains))
511 domains[did & 0xff] = domain;
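/*
 * Domain pointers are kept in a two-level table: the top level holds
 * one slot per 256 domain ids and each slot lazily allocates a
 * 256-entry chunk, so get/set_iommu_domain() index it with did >> 8
 * and did & 0xff.
 */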
514 void *alloc_pgtable_page(int node)
519 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
521 vaddr = page_address(page);
525 void free_pgtable_page(void *vaddr)
527 free_page((unsigned long)vaddr);
530 static inline void *alloc_domain_mem(void)
532 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
535 static void free_domain_mem(void *vaddr)
537 kmem_cache_free(iommu_domain_cache, vaddr);
540 static inline void * alloc_devinfo_mem(void)
542 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
545 static inline void free_devinfo_mem(void *vaddr)
547 kmem_cache_free(iommu_devinfo_cache, vaddr);
550 static inline int domain_type_is_si(struct dmar_domain *domain)
552 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
558 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
560 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
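/*
 * Example: a domain with a 48-bit address width has addr_width = 36
 * here, so any pfn that needs more than 36 bits (pfn >> 36 != 0) is
 * reported as unsupported.
 */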
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
568 sagaw = cap_sagaw(iommu->cap);
569 for (agaw = width_to_agaw(max_gaw);
571 if (test_bit(agaw, &sagaw))
579 * Calculate max SAGAW for each iommu.
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
583 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
587 * Calculate agaw for each iommu.
588 * "SAGAW" may differ across iommus; use a default agaw, and
589 * fall back to a smaller supported agaw for iommus that don't support the default.
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
593 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
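/*
 * Example: DEFAULT_DOMAIN_ADDRESS_WIDTH (57) maps to agaw 3 (5-level
 * tables).  If an iommu only advertises 4-level support in SAGAW
 * (bit 2), __iommu_calculate_agaw() walks down and returns agaw 2,
 * i.e. a 48-bit domain address width.
 */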
596 /* This function only returns a single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
601 /* si_domain and vm domain should not get here. */
602 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
605 for_each_domain_iommu(iommu_id, domain)
608 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
611 return g_iommus[iommu_id];
614 static void domain_update_iommu_coherency(struct dmar_domain *domain)
616 struct dmar_drhd_unit *drhd;
617 struct intel_iommu *iommu;
621 domain->iommu_coherency = 1;
623 for_each_domain_iommu(i, domain) {
625 if (!ecap_coherent(g_iommus[i]->ecap)) {
626 domain->iommu_coherency = 0;
633 /* No hardware attached; use lowest common denominator */
635 for_each_active_iommu(iommu, drhd) {
636 if (!ecap_coherent(iommu->ecap)) {
637 domain->iommu_coherency = 0;
644 static int domain_update_iommu_snooping(struct intel_iommu *skip)
646 struct dmar_drhd_unit *drhd;
647 struct intel_iommu *iommu;
651 for_each_active_iommu(iommu, drhd) {
653 if (!ecap_sc_support(iommu->ecap)) {
664 static int domain_update_iommu_superpage(struct intel_iommu *skip)
666 struct dmar_drhd_unit *drhd;
667 struct intel_iommu *iommu;
670 if (!intel_iommu_superpage) {
674 /* set iommu_superpage to the smallest common denominator */
676 for_each_active_iommu(iommu, drhd) {
678 mask &= cap_super_page_val(iommu->cap);
688 /* Some capabilities may be different across iommus */
689 static void domain_update_iommu_cap(struct dmar_domain *domain)
691 domain_update_iommu_coherency(domain);
692 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
693 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
696 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
699 struct root_entry *root = &iommu->root_entry[bus];
700 struct context_entry *context;
704 if (sm_supported(iommu)) {
712 context = phys_to_virt(*entry & VTD_PAGE_MASK);
714 unsigned long phy_addr;
718 context = alloc_pgtable_page(iommu->node);
722 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
723 phy_addr = virt_to_phys((void *)context);
724 *entry = phy_addr | 1;
725 __iommu_flush_cache(iommu, entry, sizeof(*entry));
727 return &context[devfn];
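/*
 * In scalable mode each root entry carries two context-table pointers
 * (root_entry_lctp()/root_entry_uctp() above): devices with
 * devfn < 0x80 use the lower table and devfn >= 0x80 the upper one,
 * which is why free_context_table() below also walks devfn 0x80.
 */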
730 static int iommu_dummy(struct device *dev)
732 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
736 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
737 * sub-hierarchy of a candidate PCI-PCI bridge
738 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
739 * @bridge: the candidate PCI-PCI bridge
741 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
744 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
746 struct pci_dev *pdev, *pbridge;
748 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
751 pdev = to_pci_dev(dev);
752 pbridge = to_pci_dev(bridge);
754 if (pbridge->subordinate &&
755 pbridge->subordinate->number <= pdev->bus->number &&
756 pbridge->subordinate->busn_res.end >= pdev->bus->number)
762 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
764 struct dmar_drhd_unit *drhd = NULL;
765 struct intel_iommu *iommu;
767 struct pci_dev *pdev = NULL;
771 if (iommu_dummy(dev))
774 if (dev_is_pci(dev)) {
775 struct pci_dev *pf_pdev;
777 pdev = to_pci_dev(dev);
780 /* VMD child devices currently cannot be handled individually */
781 if (is_vmd(pdev->bus))
785 /* VFs aren't listed in scope tables; we need to look up
786 * the PF instead to find the IOMMU. */
787 pf_pdev = pci_physfn(pdev);
789 segment = pci_domain_nr(pdev->bus);
790 } else if (has_acpi_companion(dev))
791 dev = &ACPI_COMPANION(dev)->dev;
794 for_each_active_iommu(iommu, drhd) {
795 if (pdev && segment != drhd->segment)
798 for_each_active_dev_scope(drhd->devices,
799 drhd->devices_cnt, i, tmp) {
801 /* For a VF use its original BDF# not that of the PF
802 * which we used for the IOMMU lookup. Strictly speaking
803 * we could do this for all PCI devices; we only need to
804 * get the BDF# from the scope table for ACPI matches. */
805 if (pdev && pdev->is_virtfn)
808 *bus = drhd->devices[i].bus;
809 *devfn = drhd->devices[i].devfn;
813 if (is_downstream_to_pci_bridge(dev, tmp))
817 if (pdev && drhd->include_all) {
819 *bus = pdev->bus->number;
820 *devfn = pdev->devfn;
831 static void domain_flush_cache(struct dmar_domain *domain,
832 void *addr, int size)
834 if (!domain->iommu_coherency)
835 clflush_cache_range(addr, size);
838 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
840 struct context_entry *context;
844 spin_lock_irqsave(&iommu->lock, flags);
845 context = iommu_context_addr(iommu, bus, devfn, 0);
847 ret = context_present(context);
848 spin_unlock_irqrestore(&iommu->lock, flags);
852 static void free_context_table(struct intel_iommu *iommu)
856 struct context_entry *context;
858 spin_lock_irqsave(&iommu->lock, flags);
859 if (!iommu->root_entry) {
862 for (i = 0; i < ROOT_ENTRY_NR; i++) {
863 context = iommu_context_addr(iommu, i, 0, 0);
865 free_pgtable_page(context);
867 if (!sm_supported(iommu))
870 context = iommu_context_addr(iommu, i, 0x80, 0);
872 free_pgtable_page(context);
875 free_pgtable_page(iommu->root_entry);
876 iommu->root_entry = NULL;
878 spin_unlock_irqrestore(&iommu->lock, flags);
881 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
882 unsigned long pfn, int *target_level)
884 struct dma_pte *parent, *pte;
885 int level = agaw_to_level(domain->agaw);
888 BUG_ON(!domain->pgd);
890 if (!domain_pfn_supported(domain, pfn))
891 /* Address beyond IOMMU's addressing capabilities. */
894 parent = domain->pgd;
899 offset = pfn_level_offset(pfn, level);
900 pte = &parent[offset];
901 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
903 if (level == *target_level)
906 if (!dma_pte_present(pte)) {
909 tmp_page = alloc_pgtable_page(domain->nid);
914 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
915 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
916 if (cmpxchg64(&pte->val, 0ULL, pteval))
917 /* Someone else set it while we were thinking; use theirs. */
918 free_pgtable_page(tmp_page);
920 domain_flush_cache(domain, pte, sizeof(*pte));
925 parent = phys_to_virt(dma_pte_addr(pte));
930 *target_level = level;
935 /* return the address's pte at a specific level */
936 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
938 int level, int *large_page)
940 struct dma_pte *parent, *pte;
941 int total = agaw_to_level(domain->agaw);
944 parent = domain->pgd;
945 while (level <= total) {
946 offset = pfn_level_offset(pfn, total);
947 pte = &parent[offset];
951 if (!dma_pte_present(pte)) {
956 if (dma_pte_superpage(pte)) {
961 parent = phys_to_virt(dma_pte_addr(pte));
968 /* clear last level pte; a tlb flush should follow */
968 static void dma_pte_clear_range(struct dmar_domain *domain,
969 unsigned long start_pfn,
970 unsigned long last_pfn)
972 unsigned int large_page;
973 struct dma_pte *first_pte, *pte;
975 BUG_ON(!domain_pfn_supported(domain, start_pfn));
976 BUG_ON(!domain_pfn_supported(domain, last_pfn));
977 BUG_ON(start_pfn > last_pfn);
979 /* we don't need lock here; nobody else touches the iova range */
982 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
984 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
989 start_pfn += lvl_to_nr_pages(large_page);
991 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
993 domain_flush_cache(domain, first_pte,
994 (void *)pte - (void *)first_pte);
996 } while (start_pfn && start_pfn <= last_pfn);
999 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1000 int retain_level, struct dma_pte *pte,
1001 unsigned long pfn, unsigned long start_pfn,
1002 unsigned long last_pfn)
1004 pfn = max(start_pfn, pfn);
1005 pte = &pte[pfn_level_offset(pfn, level)];
1008 unsigned long level_pfn;
1009 struct dma_pte *level_pte;
1011 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1014 level_pfn = pfn & level_mask(level);
1015 level_pte = phys_to_virt(dma_pte_addr(pte));
1018 dma_pte_free_level(domain, level - 1, retain_level,
1019 level_pte, level_pfn, start_pfn,
1024 * Free the page table if we're below the level we want to
1025 * retain and the range covers the entire table.
1027 if (level < retain_level && !(start_pfn > level_pfn ||
1028 last_pfn < level_pfn + level_size(level) - 1)) {
1030 domain_flush_cache(domain, pte, sizeof(*pte));
1031 free_pgtable_page(level_pte);
1034 pfn += level_size(level);
1035 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1039 * clear last level (leaf) ptes and free page table pages below the
1040 * level we wish to keep intact.
1042 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1043 unsigned long start_pfn,
1044 unsigned long last_pfn,
1047 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1048 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1049 BUG_ON(start_pfn > last_pfn);
1051 dma_pte_clear_range(domain, start_pfn, last_pfn);
1053 /* We don't need lock here; nobody else touches the iova range */
1054 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1055 domain->pgd, 0, start_pfn, last_pfn);
1058 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1059 free_pgtable_page(domain->pgd);
1064 /* When a page at a given level is being unlinked from its parent, we don't
1065 need to *modify* it at all. All we need to do is make a list of all the
1066 pages which can be freed just as soon as we've flushed the IOTLB and we
1067 know the hardware page-walk will no longer touch them.
1068 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1070 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1071 int level, struct dma_pte *pte,
1072 struct page *freelist)
1076 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1077 pg->freelist = freelist;
1083 pte = page_address(pg);
1085 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1086 freelist = dma_pte_list_pagetables(domain, level - 1,
1089 } while (!first_pte_in_page(pte));
1094 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1095 struct dma_pte *pte, unsigned long pfn,
1096 unsigned long start_pfn,
1097 unsigned long last_pfn,
1098 struct page *freelist)
1100 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1102 pfn = max(start_pfn, pfn);
1103 pte = &pte[pfn_level_offset(pfn, level)];
1106 unsigned long level_pfn;
1108 if (!dma_pte_present(pte))
1111 level_pfn = pfn & level_mask(level);
1113 /* If range covers entire pagetable, free it */
1114 if (start_pfn <= level_pfn &&
1115 last_pfn >= level_pfn + level_size(level) - 1) {
1116 /* These subordinate page tables are going away entirely. Don't
1117 bother to clear them; we're just going to *free* them. */
1118 if (level > 1 && !dma_pte_superpage(pte))
1119 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1125 } else if (level > 1) {
1126 /* Recurse down into a level that isn't *entirely* obsolete */
1127 freelist = dma_pte_clear_level(domain, level - 1,
1128 phys_to_virt(dma_pte_addr(pte)),
1129 level_pfn, start_pfn, last_pfn,
1133 pfn += level_size(level);
1134 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1137 domain_flush_cache(domain, first_pte,
1138 (void *)++last_pte - (void *)first_pte);
1143 /* We can't just free the pages because the IOMMU may still be walking
1144 the page tables, and may have cached the intermediate levels. The
1145 pages can only be freed after the IOTLB flush has been done. */
1146 static struct page *domain_unmap(struct dmar_domain *domain,
1147 unsigned long start_pfn,
1148 unsigned long last_pfn)
1150 struct page *freelist;
1152 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1153 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1154 BUG_ON(start_pfn > last_pfn);
1156 /* we don't need lock here; nobody else touches the iova range */
1157 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1158 domain->pgd, 0, start_pfn, last_pfn, NULL);
1161 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1162 struct page *pgd_page = virt_to_page(domain->pgd);
1163 pgd_page->freelist = freelist;
1164 freelist = pgd_page;
1172 static void dma_free_pagelist(struct page *freelist)
1176 while ((pg = freelist)) {
1177 freelist = pg->freelist;
1178 free_pgtable_page(page_address(pg));
1182 static void iova_entry_free(unsigned long data)
1184 struct page *freelist = (struct page *)data;
1186 dma_free_pagelist(freelist);
1189 /* iommu handling */
1190 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1192 struct root_entry *root;
1193 unsigned long flags;
1195 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1197 pr_err("Allocating root entry for %s failed\n",
1202 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1204 spin_lock_irqsave(&iommu->lock, flags);
1205 iommu->root_entry = root;
1206 spin_unlock_irqrestore(&iommu->lock, flags);
1211 static void iommu_set_root_entry(struct intel_iommu *iommu)
1217 addr = virt_to_phys(iommu->root_entry);
1218 if (sm_supported(iommu))
1219 addr |= DMA_RTADDR_SMT;
1221 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1222 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1224 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1226 /* Make sure hardware completes it */
1227 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1228 readl, (sts & DMA_GSTS_RTPS), sts);
1230 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1233 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1238 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1241 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1242 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1244 /* Make sure hardware completes it */
1245 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1246 readl, (!(val & DMA_GSTS_WBFS)), val);
1248 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1251 /* return value determines if we need a write buffer flush */
1252 static void __iommu_flush_context(struct intel_iommu *iommu,
1253 u16 did, u16 source_id, u8 function_mask,
1260 case DMA_CCMD_GLOBAL_INVL:
1261 val = DMA_CCMD_GLOBAL_INVL;
1263 case DMA_CCMD_DOMAIN_INVL:
1264 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1266 case DMA_CCMD_DEVICE_INVL:
1267 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1268 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1273 val |= DMA_CCMD_ICC;
1275 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1276 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1278 /* Make sure hardware completes it */
1279 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1280 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1282 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1285 /* return value determines if we need a write buffer flush */
1286 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1287 u64 addr, unsigned int size_order, u64 type)
1289 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1290 u64 val = 0, val_iva = 0;
1294 case DMA_TLB_GLOBAL_FLUSH:
1295 /* global flush doesn't need to set IVA_REG */
1296 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1298 case DMA_TLB_DSI_FLUSH:
1299 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1301 case DMA_TLB_PSI_FLUSH:
1302 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1303 /* IH bit is passed in as part of address */
1304 val_iva = size_order | addr;
1309 /* Note: set drain read/write */
1312 * This is probably meant to be super secure. Looks like we can
1313 * ignore it without any impact.
1315 if (cap_read_drain(iommu->cap))
1316 val |= DMA_TLB_READ_DRAIN;
1318 if (cap_write_drain(iommu->cap))
1319 val |= DMA_TLB_WRITE_DRAIN;
1321 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1322 /* Note: Only uses first TLB reg currently */
1324 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1325 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1327 /* Make sure hardware completes it */
1328 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1329 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1331 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1333 /* check IOTLB invalidation granularity */
1334 if (DMA_TLB_IAIG(val) == 0)
1335 pr_err("Flush IOTLB failed\n");
1336 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1337 pr_debug("TLB flush request %Lx, actual %Lx\n",
1338 (unsigned long long)DMA_TLB_IIRG(type),
1339 (unsigned long long)DMA_TLB_IAIG(val));
1342 static struct device_domain_info *
1343 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1346 struct device_domain_info *info;
1348 assert_spin_locked(&device_domain_lock);
1353 list_for_each_entry(info, &domain->devices, link)
1354 if (info->iommu == iommu && info->bus == bus &&
1355 info->devfn == devfn) {
1356 if (info->ats_supported && info->dev)
1364 static void domain_update_iotlb(struct dmar_domain *domain)
1366 struct device_domain_info *info;
1367 bool has_iotlb_device = false;
1369 assert_spin_locked(&device_domain_lock);
1371 list_for_each_entry(info, &domain->devices, link) {
1372 struct pci_dev *pdev;
1374 if (!info->dev || !dev_is_pci(info->dev))
1377 pdev = to_pci_dev(info->dev);
1378 if (pdev->ats_enabled) {
1379 has_iotlb_device = true;
1384 domain->has_iotlb_device = has_iotlb_device;
1387 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1389 struct pci_dev *pdev;
1391 assert_spin_locked(&device_domain_lock);
1393 if (!info || !dev_is_pci(info->dev))
1396 pdev = to_pci_dev(info->dev);
1397 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1398 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1399 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1400 * reserved, which should be set to 0.
1402 if (!ecap_dit(info->iommu->ecap))
1405 struct pci_dev *pf_pdev;
1407 /* pdev will be returned if the device is not a VF */
1408 pf_pdev = pci_physfn(pdev);
1409 info->pfsid = pci_dev_id(pf_pdev);
1412 #ifdef CONFIG_INTEL_IOMMU_SVM
1413 /* The PCIe spec, in its wisdom, declares that the behaviour of
1414 the device if you enable PASID support after ATS support is
1415 undefined. So always enable PASID support on devices which
1416 have it, even if we can't yet know if we're ever going to
1418 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1419 info->pasid_enabled = 1;
1421 if (info->pri_supported &&
1422 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1423 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1424 info->pri_enabled = 1;
1426 if (!pdev->untrusted && info->ats_supported &&
1427 pci_ats_page_aligned(pdev) &&
1428 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1429 info->ats_enabled = 1;
1430 domain_update_iotlb(info->domain);
1431 info->ats_qdep = pci_ats_queue_depth(pdev);
1435 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1437 struct pci_dev *pdev;
1439 assert_spin_locked(&device_domain_lock);
1441 if (!dev_is_pci(info->dev))
1444 pdev = to_pci_dev(info->dev);
1446 if (info->ats_enabled) {
1447 pci_disable_ats(pdev);
1448 info->ats_enabled = 0;
1449 domain_update_iotlb(info->domain);
1451 #ifdef CONFIG_INTEL_IOMMU_SVM
1452 if (info->pri_enabled) {
1453 pci_disable_pri(pdev);
1454 info->pri_enabled = 0;
1456 if (info->pasid_enabled) {
1457 pci_disable_pasid(pdev);
1458 info->pasid_enabled = 0;
1463 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1464 u64 addr, unsigned mask)
1467 unsigned long flags;
1468 struct device_domain_info *info;
1470 if (!domain->has_iotlb_device)
1473 spin_lock_irqsave(&device_domain_lock, flags);
1474 list_for_each_entry(info, &domain->devices, link) {
1475 if (!info->ats_enabled)
1478 sid = info->bus << 8 | info->devfn;
1479 qdep = info->ats_qdep;
1480 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1483 spin_unlock_irqrestore(&device_domain_lock, flags);
1486 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1487 struct dmar_domain *domain,
1488 unsigned long pfn, unsigned int pages,
1491 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1492 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1493 u16 did = domain->iommu_did[iommu->seq_id];
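/*
 * Example: pages == 3 rounds up to 4, so mask == 2 and the PSI flush
 * below covers a 4-page (16KiB) naturally aligned region.
 */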
1500 * Fallback to domain selective flush if no PSI support or the size is
1502 * PSI requires page size to be 2 ^ x, and the base address is naturally
1503 * aligned to the size
1505 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1506 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1509 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1513 * In caching mode, changes of pages from non-present to present require a
1514 * flush. However, the device IOTLB doesn't need to be flushed in this case.
1516 if (!cap_caching_mode(iommu->cap) || !map)
1517 iommu_flush_dev_iotlb(domain, addr, mask);
1520 /* Notification for newly created mappings */
1521 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1522 struct dmar_domain *domain,
1523 unsigned long pfn, unsigned int pages)
1525 /* It's a non-present to present mapping. Only flush if caching mode */
1526 if (cap_caching_mode(iommu->cap))
1527 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1529 iommu_flush_write_buffer(iommu);
1532 static void iommu_flush_iova(struct iova_domain *iovad)
1534 struct dmar_domain *domain;
1537 domain = container_of(iovad, struct dmar_domain, iovad);
1539 for_each_domain_iommu(idx, domain) {
1540 struct intel_iommu *iommu = g_iommus[idx];
1541 u16 did = domain->iommu_did[iommu->seq_id];
1543 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1545 if (!cap_caching_mode(iommu->cap))
1546 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1547 0, MAX_AGAW_PFN_WIDTH);
1551 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1554 unsigned long flags;
1556 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1559 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1560 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1561 pmen &= ~DMA_PMEN_EPM;
1562 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1564 /* wait for the protected region status bit to clear */
1565 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1566 readl, !(pmen & DMA_PMEN_PRS), pmen);
1568 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1571 static void iommu_enable_translation(struct intel_iommu *iommu)
1574 unsigned long flags;
1576 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1577 iommu->gcmd |= DMA_GCMD_TE;
1578 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1580 /* Make sure hardware completes it */
1581 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1582 readl, (sts & DMA_GSTS_TES), sts);
1584 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1587 static void iommu_disable_translation(struct intel_iommu *iommu)
1592 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1593 iommu->gcmd &= ~DMA_GCMD_TE;
1594 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1596 /* Make sure hardware completes it */
1597 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1598 readl, (!(sts & DMA_GSTS_TES)), sts);
1600 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1603 static int iommu_init_domains(struct intel_iommu *iommu)
1605 u32 ndomains, nlongs;
1608 ndomains = cap_ndoms(iommu->cap);
1609 pr_debug("%s: Number of Domains supported <%d>\n",
1610 iommu->name, ndomains);
1611 nlongs = BITS_TO_LONGS(ndomains);
1613 spin_lock_init(&iommu->lock);
1615 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1616 if (!iommu->domain_ids) {
1617 pr_err("%s: Allocating domain id array failed\n",
1622 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1623 iommu->domains = kzalloc(size, GFP_KERNEL);
1625 if (iommu->domains) {
1626 size = 256 * sizeof(struct dmar_domain *);
1627 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1630 if (!iommu->domains || !iommu->domains[0]) {
1631 pr_err("%s: Allocating domain array failed\n",
1633 kfree(iommu->domain_ids);
1634 kfree(iommu->domains);
1635 iommu->domain_ids = NULL;
1636 iommu->domains = NULL;
1641 * If Caching mode is set, then invalid translations are tagged
1642 * with domain-id 0, hence we need to pre-allocate it. We also
1643 * use domain-id 0 as a marker for non-allocated domain-id, so
1644 * make sure it is not used for a real domain.
1646 set_bit(0, iommu->domain_ids);
1649 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1650 * entry for first-level or pass-through translation modes should
1651 * be programmed with a domain id different from those used for
1652 * second-level or nested translation. We reserve a domain id for
1655 if (sm_supported(iommu))
1656 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1661 static void disable_dmar_iommu(struct intel_iommu *iommu)
1663 struct device_domain_info *info, *tmp;
1664 unsigned long flags;
1666 if (!iommu->domains || !iommu->domain_ids)
1669 spin_lock_irqsave(&device_domain_lock, flags);
1670 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1671 if (info->iommu != iommu)
1674 if (!info->dev || !info->domain)
1677 __dmar_remove_one_dev_info(info);
1679 spin_unlock_irqrestore(&device_domain_lock, flags);
1681 if (iommu->gcmd & DMA_GCMD_TE)
1682 iommu_disable_translation(iommu);
1685 static void free_dmar_iommu(struct intel_iommu *iommu)
1687 if ((iommu->domains) && (iommu->domain_ids)) {
1688 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1691 for (i = 0; i < elems; i++)
1692 kfree(iommu->domains[i]);
1693 kfree(iommu->domains);
1694 kfree(iommu->domain_ids);
1695 iommu->domains = NULL;
1696 iommu->domain_ids = NULL;
1699 g_iommus[iommu->seq_id] = NULL;
1701 /* free context mapping */
1702 free_context_table(iommu);
1704 #ifdef CONFIG_INTEL_IOMMU_SVM
1705 if (pasid_supported(iommu)) {
1706 if (ecap_prs(iommu->ecap))
1707 intel_svm_finish_prq(iommu);
1712 static struct dmar_domain *alloc_domain(int flags)
1714 struct dmar_domain *domain;
1716 domain = alloc_domain_mem();
1720 memset(domain, 0, sizeof(*domain));
1721 domain->nid = NUMA_NO_NODE;
1722 domain->flags = flags;
1723 domain->has_iotlb_device = false;
1724 INIT_LIST_HEAD(&domain->devices);
1729 /* Must be called with iommu->lock */
1730 static int domain_attach_iommu(struct dmar_domain *domain,
1731 struct intel_iommu *iommu)
1733 unsigned long ndomains;
1736 assert_spin_locked(&device_domain_lock);
1737 assert_spin_locked(&iommu->lock);
1739 domain->iommu_refcnt[iommu->seq_id] += 1;
1740 domain->iommu_count += 1;
1741 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1742 ndomains = cap_ndoms(iommu->cap);
1743 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1745 if (num >= ndomains) {
1746 pr_err("%s: No free domain ids\n", iommu->name);
1747 domain->iommu_refcnt[iommu->seq_id] -= 1;
1748 domain->iommu_count -= 1;
1752 set_bit(num, iommu->domain_ids);
1753 set_iommu_domain(iommu, num, domain);
1755 domain->iommu_did[iommu->seq_id] = num;
1756 domain->nid = iommu->node;
1758 domain_update_iommu_cap(domain);
1764 static int domain_detach_iommu(struct dmar_domain *domain,
1765 struct intel_iommu *iommu)
1769 assert_spin_locked(&device_domain_lock);
1770 assert_spin_locked(&iommu->lock);
1772 domain->iommu_refcnt[iommu->seq_id] -= 1;
1773 count = --domain->iommu_count;
1774 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1775 num = domain->iommu_did[iommu->seq_id];
1776 clear_bit(num, iommu->domain_ids);
1777 set_iommu_domain(iommu, num, NULL);
1779 domain_update_iommu_cap(domain);
1780 domain->iommu_did[iommu->seq_id] = 0;
1786 static struct iova_domain reserved_iova_list;
1787 static struct lock_class_key reserved_rbtree_key;
1789 static int dmar_init_reserved_ranges(void)
1791 struct pci_dev *pdev = NULL;
1795 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1797 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1798 &reserved_rbtree_key);
1800 /* IOAPIC ranges shouldn't be accessed by DMA */
1801 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1802 IOVA_PFN(IOAPIC_RANGE_END));
1804 pr_err("Reserve IOAPIC range failed\n");
1808 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1809 for_each_pci_dev(pdev) {
1812 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1813 r = &pdev->resource[i];
1814 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1816 iova = reserve_iova(&reserved_iova_list,
1820 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1828 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1830 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1833 static inline int guestwidth_to_adjustwidth(int gaw)
1836 int r = (gaw - 12) % 9;
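/*
 * Example: for gaw 48, (48 - 12) is already a multiple of the 9-bit
 * stride, so r == 0 and no adjustment is needed; for gaw 40, r == 1
 * and the width is rounded up to the next stride boundary, 48.
 */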
1847 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1850 int adjust_width, agaw;
1851 unsigned long sagaw;
1854 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1856 err = init_iova_flush_queue(&domain->iovad,
1857 iommu_flush_iova, iova_entry_free);
1861 domain_reserve_special_ranges(domain);
1863 /* calculate AGAW */
1864 if (guest_width > cap_mgaw(iommu->cap))
1865 guest_width = cap_mgaw(iommu->cap);
1866 domain->gaw = guest_width;
1867 adjust_width = guestwidth_to_adjustwidth(guest_width);
1868 agaw = width_to_agaw(adjust_width);
1869 sagaw = cap_sagaw(iommu->cap);
1870 if (!test_bit(agaw, &sagaw)) {
1871 /* hardware doesn't support it, choose a bigger one */
1872 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1873 agaw = find_next_bit(&sagaw, 5, agaw);
1877 domain->agaw = agaw;
1879 if (ecap_coherent(iommu->ecap))
1880 domain->iommu_coherency = 1;
1882 domain->iommu_coherency = 0;
1884 if (ecap_sc_support(iommu->ecap))
1885 domain->iommu_snooping = 1;
1887 domain->iommu_snooping = 0;
1889 if (intel_iommu_superpage)
1890 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1892 domain->iommu_superpage = 0;
1894 domain->nid = iommu->node;
1896 /* always allocate the top pgd */
1897 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1900 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1904 static void domain_exit(struct dmar_domain *domain)
1907 /* Remove associated devices and clear attached or cached domains */
1908 domain_remove_dev_info(domain);
1911 put_iova_domain(&domain->iovad);
1914 struct page *freelist;
1916 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1917 dma_free_pagelist(freelist);
1920 free_domain_mem(domain);
1924 * Get the PASID directory size for scalable mode context entry.
1925 * Value of X in the PDTS field of a scalable mode context entry
1926 * indicates PASID directory with 2^(X + 7) entries.
1928 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1932 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1933 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
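/*
 * Example (assuming 64 PASIDs per directory entry): a table sized for
 * 1 << 20 PASIDs gives max_pde = 1 << 14, so the PDTS coding works out
 * to 14 - 7 = 7, i.e. a directory with 2^14 entries.
 */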
1941 * Set the RID_PASID field of a scalable mode context entry. The
1942 * IOMMU hardware will use the PASID value set in this field for
1943 * DMA translations of DMA requests without PASID.
1946 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1948 context->hi |= pasid & ((1 << 20) - 1);
1949 context->hi |= (1 << 20);
1953 * Set the DTE (Device-TLB Enable) field of a scalable mode context
1956 static inline void context_set_sm_dte(struct context_entry *context)
1958 context->lo |= (1 << 2);
1962 * Set the PRE (Page Request Enable) field of a scalable mode context
1965 static inline void context_set_sm_pre(struct context_entry *context)
1967 context->lo |= (1 << 4);
1970 /* Convert value to context PASID directory size field coding. */
1971 #define context_pdts(pds) (((pds) & 0x7) << 9)
1973 static int domain_context_mapping_one(struct dmar_domain *domain,
1974 struct intel_iommu *iommu,
1975 struct pasid_table *table,
1978 u16 did = domain->iommu_did[iommu->seq_id];
1979 int translation = CONTEXT_TT_MULTI_LEVEL;
1980 struct device_domain_info *info = NULL;
1981 struct context_entry *context;
1982 unsigned long flags;
1987 if (hw_pass_through && domain_type_is_si(domain))
1988 translation = CONTEXT_TT_PASS_THROUGH;
1990 pr_debug("Set context mapping for %02x:%02x.%d\n",
1991 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1993 BUG_ON(!domain->pgd);
1995 spin_lock_irqsave(&device_domain_lock, flags);
1996 spin_lock(&iommu->lock);
1999 context = iommu_context_addr(iommu, bus, devfn, 1);
2004 if (context_present(context))
2008 * For kdump cases, old valid entries may be cached due to the
2009 * in-flight DMA and copied pgtable, but there is no unmapping
2010 * behaviour for them, thus we need an explicit cache flush for
2011 * the newly-mapped device. For kdump, at this point, the device
2012 * is supposed to finish reset at its driver probe stage, so no
2013 * in-flight DMA will exist, and we don't need to worry anymore
2016 if (context_copied(context)) {
2017 u16 did_old = context_domain_id(context);
2019 if (did_old < cap_ndoms(iommu->cap)) {
2020 iommu->flush.flush_context(iommu, did_old,
2021 (((u16)bus) << 8) | devfn,
2022 DMA_CCMD_MASK_NOBIT,
2023 DMA_CCMD_DEVICE_INVL);
2024 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2029 context_clear_entry(context);
2031 if (sm_supported(iommu)) {
2036 /* Setup the PASID DIR pointer: */
2037 pds = context_get_sm_pds(table);
2038 context->lo = (u64)virt_to_phys(table->table) |
2041 /* Setup the RID_PASID field: */
2042 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2045 * Setup the Device-TLB enable bit and Page request
2048 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2049 if (info && info->ats_supported)
2050 context_set_sm_dte(context);
2051 if (info && info->pri_supported)
2052 context_set_sm_pre(context);
2054 struct dma_pte *pgd = domain->pgd;
2057 context_set_domain_id(context, did);
2059 if (translation != CONTEXT_TT_PASS_THROUGH) {
2061 * Skip top levels of page tables for iommu which has
2062 * less agaw than default. Unnecessary for PT mode.
2064 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2066 pgd = phys_to_virt(dma_pte_addr(pgd));
2067 if (!dma_pte_present(pgd))
2071 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2072 if (info && info->ats_supported)
2073 translation = CONTEXT_TT_DEV_IOTLB;
2075 translation = CONTEXT_TT_MULTI_LEVEL;
2077 context_set_address_root(context, virt_to_phys(pgd));
2078 context_set_address_width(context, agaw);
2081 * In pass through mode, AW must be programmed to
2082 * indicate the largest AGAW value supported by
2083 * hardware. And ASR is ignored by hardware.
2085 context_set_address_width(context, iommu->msagaw);
2088 context_set_translation_type(context, translation);
2091 context_set_fault_enable(context);
2092 context_set_present(context);
2093 domain_flush_cache(domain, context, sizeof(*context));
2096 * It's a non-present to present mapping. If hardware doesn't cache
2097 * non-present entries we only need to flush the write-buffer. If it
2098 * _does_ cache non-present entries, then it does so in the special
2099 * domain #0, which we have to flush:
2101 if (cap_caching_mode(iommu->cap)) {
2102 iommu->flush.flush_context(iommu, 0,
2103 (((u16)bus) << 8) | devfn,
2104 DMA_CCMD_MASK_NOBIT,
2105 DMA_CCMD_DEVICE_INVL);
2106 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2108 iommu_flush_write_buffer(iommu);
2110 iommu_enable_dev_iotlb(info);
2115 spin_unlock(&iommu->lock);
2116 spin_unlock_irqrestore(&device_domain_lock, flags);
2121 struct domain_context_mapping_data {
2122 struct dmar_domain *domain;
2123 struct intel_iommu *iommu;
2124 struct pasid_table *table;
2127 static int domain_context_mapping_cb(struct pci_dev *pdev,
2128 u16 alias, void *opaque)
2130 struct domain_context_mapping_data *data = opaque;
2132 return domain_context_mapping_one(data->domain, data->iommu,
2133 data->table, PCI_BUS_NUM(alias),
2138 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2140 struct domain_context_mapping_data data;
2141 struct pasid_table *table;
2142 struct intel_iommu *iommu;
2145 iommu = device_to_iommu(dev, &bus, &devfn);
2149 table = intel_pasid_get_table(dev);
2151 if (!dev_is_pci(dev))
2152 return domain_context_mapping_one(domain, iommu, table,
2155 data.domain = domain;
2159 return pci_for_each_dma_alias(to_pci_dev(dev),
2160 &domain_context_mapping_cb, &data);
2163 static int domain_context_mapped_cb(struct pci_dev *pdev,
2164 u16 alias, void *opaque)
2166 struct intel_iommu *iommu = opaque;
2168 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2171 static int domain_context_mapped(struct device *dev)
2173 struct intel_iommu *iommu;
2176 iommu = device_to_iommu(dev, &bus, &devfn);
2180 if (!dev_is_pci(dev))
2181 return device_context_mapped(iommu, bus, devfn);
2183 return !pci_for_each_dma_alias(to_pci_dev(dev),
2184 domain_context_mapped_cb, iommu);
2187 /* Returns a number of VTD pages, but aligned to MM page size */
2188 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2191 host_addr &= ~PAGE_MASK;
2192 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
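/*
 * Example (4KiB MM pages): host_addr 0x1200 with size 0x1000 keeps an
 * in-page offset of 0x200, and PAGE_ALIGN(0x200 + 0x1000) >> 12 = 2,
 * i.e. the buffer straddles two VT-d pages.
 */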
2195 /* Return largest possible superpage level for a given mapping */
2196 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2197 unsigned long iov_pfn,
2198 unsigned long phy_pfn,
2199 unsigned long pages)
2201 int support, level = 1;
2202 unsigned long pfnmerge;
2204 support = domain->iommu_superpage;
2206 /* To use a large page, the virtual *and* physical addresses
2207 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2208 of them will mean we have to use smaller pages. So just
2209 merge them and check both at once. */
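/*
 * Example: if both pfns have their low 9 bits clear (2MiB aligned) and
 * at least 512 pages remain, a level-2 (2MiB) superpage can be used;
 * another 9 aligned bits and 1 << 18 pages allow 1GiB, capped by
 * domain->iommu_superpage.
 */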
2210 pfnmerge = iov_pfn | phy_pfn;
2212 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2213 pages >>= VTD_STRIDE_SHIFT;
2216 pfnmerge >>= VTD_STRIDE_SHIFT;
2223 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2224 struct scatterlist *sg, unsigned long phys_pfn,
2225 unsigned long nr_pages, int prot)
2227 struct dma_pte *first_pte = NULL, *pte = NULL;
2228 phys_addr_t uninitialized_var(pteval);
2229 unsigned long sg_res = 0;
2230 unsigned int largepage_lvl = 0;
2231 unsigned long lvl_pages = 0;
2233 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2235 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2238 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2242 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2245 while (nr_pages > 0) {
2249 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2251 sg_res = aligned_nrpages(sg->offset, sg->length);
2252 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2253 sg->dma_length = sg->length;
2254 pteval = (sg_phys(sg) - pgoff) | prot;
2255 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2259 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2261 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2264 /* It is a large page */
2265 if (largepage_lvl > 1) {
2266 unsigned long nr_superpages, end_pfn;
2268 pteval |= DMA_PTE_LARGE_PAGE;
2269 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2271 nr_superpages = sg_res / lvl_pages;
2272 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2275 * Ensure that old small page tables are
2276 * removed to make room for superpage(s).
2277 * We're adding new large pages, so make sure
2278 * we don't remove their parent tables.
2280 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2283 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2287 /* We don't need lock here, nobody else
2288 * touches the iova range
2290 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2292 static int dumps = 5;
2293 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2294 iov_pfn, tmp, (unsigned long long)pteval);
2297 debug_dma_dump_mappings(NULL);
2302 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2304 BUG_ON(nr_pages < lvl_pages);
2305 BUG_ON(sg_res < lvl_pages);
2307 nr_pages -= lvl_pages;
2308 iov_pfn += lvl_pages;
2309 phys_pfn += lvl_pages;
2310 pteval += lvl_pages * VTD_PAGE_SIZE;
2311 sg_res -= lvl_pages;
2313 /* If the next PTE would be the first in a new page, then we
2314 need to flush the cache on the entries we've just written.
2315 And then we'll need to recalculate 'pte', so clear it and
2316 let it get set again in the if (!pte) block above.
2318 If we're done (!nr_pages) we need to flush the cache too.
2320 Also if we've been setting superpages, we may need to
2321 recalculate 'pte' and switch back to smaller pages for the
2322 end of the mapping, if the trailing size is not enough to
2323 use another superpage (i.e. sg_res < lvl_pages). */
2325 if (!nr_pages || first_pte_in_page(pte) ||
2326 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2327 domain_flush_cache(domain, first_pte,
2328 (void *)pte - (void *)first_pte);
2332 if (!sg_res && nr_pages)
2338 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2339 struct scatterlist *sg, unsigned long phys_pfn,
2340 unsigned long nr_pages, int prot)
2343 struct intel_iommu *iommu;
2345 /* Do the real mapping first */
2346 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2350 for_each_domain_iommu(iommu_id, domain) {
2351 iommu = g_iommus[iommu_id];
2352 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2358 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2359 struct scatterlist *sg, unsigned long nr_pages,
2362 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2365 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2366 unsigned long phys_pfn, unsigned long nr_pages,
2369 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2372 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2374 unsigned long flags;
2375 struct context_entry *context;
2381 spin_lock_irqsave(&iommu->lock, flags);
2382 context = iommu_context_addr(iommu, bus, devfn, 0);
2384 spin_unlock_irqrestore(&iommu->lock, flags);
2387 did_old = context_domain_id(context);
2388 context_clear_entry(context);
2389 __iommu_flush_cache(iommu, context, sizeof(*context));
2390 spin_unlock_irqrestore(&iommu->lock, flags);
2391 iommu->flush.flush_context(iommu,
2393 (((u16)bus) << 8) | devfn,
2394 DMA_CCMD_MASK_NOBIT,
2395 DMA_CCMD_DEVICE_INVL);
2396 iommu->flush.flush_iotlb(iommu,
2403 static inline void unlink_domain_info(struct device_domain_info *info)
2405 assert_spin_locked(&device_domain_lock);
2406 list_del(&info->link);
2407 list_del(&info->global);
2409 info->dev->archdata.iommu = NULL;
2412 static void domain_remove_dev_info(struct dmar_domain *domain)
2414 struct device_domain_info *info, *tmp;
2415 unsigned long flags;
2417 spin_lock_irqsave(&device_domain_lock, flags);
2418 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2419 __dmar_remove_one_dev_info(info);
2420 spin_unlock_irqrestore(&device_domain_lock, flags);
2425 * Note: we use struct device->archdata.iommu to store the info
2427 static struct dmar_domain *find_domain(struct device *dev)
2429 struct device_domain_info *info;
2431 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2432 struct iommu_domain *domain;
2434 dev->archdata.iommu = NULL;
2435 domain = iommu_get_domain_for_dev(dev);
2437 intel_iommu_attach_device(domain, dev);
2440 /* No lock here, assumes no domain exit in normal case */
2441 info = dev->archdata.iommu;
2444 return info->domain;
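/*
 * At this point dev->archdata.iommu holds one of three things: a real
 * device_domain_info pointer, DUMMY_DEVICE_DOMAIN_INFO for devices we have
 * chosen to ignore, or DEFER_DEVICE_DOMAIN_INFO for devices whose domain
 * attachment was deferred and is resolved on first use in find_domain().
 */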
2448 static inline struct device_domain_info *
2449 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2451 struct device_domain_info *info;
2453 list_for_each_entry(info, &device_domain_list, global)
2454 if (info->iommu->segment == segment && info->bus == bus &&
2455 info->devfn == devfn)
2461 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2464 struct dmar_domain *domain)
2466 struct dmar_domain *found = NULL;
2467 struct device_domain_info *info;
2468 unsigned long flags;
2471 info = alloc_devinfo_mem();
2476 info->devfn = devfn;
2477 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2478 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2481 info->domain = domain;
2482 info->iommu = iommu;
2483 info->pasid_table = NULL;
2484 info->auxd_enabled = 0;
2485 INIT_LIST_HEAD(&info->auxiliary_domains);
2487 if (dev && dev_is_pci(dev)) {
2488 struct pci_dev *pdev = to_pci_dev(info->dev);
2490 if (!pdev->untrusted &&
2491 !pci_ats_disabled() &&
2492 ecap_dev_iotlb_support(iommu->ecap) &&
2493 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2494 dmar_find_matched_atsr_unit(pdev))
2495 info->ats_supported = 1;
2497 if (sm_supported(iommu)) {
2498 if (pasid_supported(iommu)) {
2499 int features = pci_pasid_features(pdev);
2501 info->pasid_supported = features | 1;
2504 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2505 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2506 info->pri_supported = 1;
2510 spin_lock_irqsave(&device_domain_lock, flags);
2512 found = find_domain(dev);
2515 struct device_domain_info *info2;
2516 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2518 found = info2->domain;
2524 spin_unlock_irqrestore(&device_domain_lock, flags);
2525 free_devinfo_mem(info);
2526 /* Caller must free the original domain */
2530 spin_lock(&iommu->lock);
2531 ret = domain_attach_iommu(domain, iommu);
2532 spin_unlock(&iommu->lock);
2535 spin_unlock_irqrestore(&device_domain_lock, flags);
2536 free_devinfo_mem(info);
2540 list_add(&info->link, &domain->devices);
2541 list_add(&info->global, &device_domain_list);
2543 dev->archdata.iommu = info;
2544 spin_unlock_irqrestore(&device_domain_lock, flags);
2546 /* PASID table is mandatory for a PCI device in scalable mode. */
2547 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2548 ret = intel_pasid_alloc_table(dev);
2550 dev_err(dev, "PASID table allocation failed\n");
2551 dmar_remove_one_dev_info(dev);
2555 /* Setup the PASID entry for requests without PASID: */
2556 spin_lock(&iommu->lock);
2557 if (hw_pass_through && domain_type_is_si(domain))
2558 ret = intel_pasid_setup_pass_through(iommu, domain,
2559 dev, PASID_RID2PASID);
2561 ret = intel_pasid_setup_second_level(iommu, domain,
2562 dev, PASID_RID2PASID);
2563 spin_unlock(&iommu->lock);
2565 dev_err(dev, "Setup RID2PASID failed\n");
2566 dmar_remove_one_dev_info(dev);
2571 if (dev && domain_context_mapping(domain, dev)) {
2572 dev_err(dev, "Domain context map failed\n");
2573 dmar_remove_one_dev_info(dev);
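/*
 * Summary of dmar_insert_one_dev_info() above: allocate the per-device
 * info, detect a racing attachment under device_domain_lock, attach the
 * domain to the IOMMU, then (in scalable mode) allocate the PASID table
 * and program the RID2PASID entry, and finally write the context entry
 * via domain_context_mapping().
 */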
2580 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2582 *(u16 *)opaque = alias;
2586 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2588 struct device_domain_info *info;
2589 struct dmar_domain *domain = NULL;
2590 struct intel_iommu *iommu;
2592 unsigned long flags;
2595 iommu = device_to_iommu(dev, &bus, &devfn);
2599 if (dev_is_pci(dev)) {
2600 struct pci_dev *pdev = to_pci_dev(dev);
2602 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2604 spin_lock_irqsave(&device_domain_lock, flags);
2605 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2606 PCI_BUS_NUM(dma_alias),
2609 iommu = info->iommu;
2610 domain = info->domain;
2612 spin_unlock_irqrestore(&device_domain_lock, flags);
2614 /* DMA alias already has a domain, use it */
2619 /* Allocate and initialize new domain for the device */
2620 domain = alloc_domain(0);
2623 if (domain_init(domain, iommu, gaw)) {
2624 domain_exit(domain);
2632 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2633 struct dmar_domain *domain)
2635 struct intel_iommu *iommu;
2636 struct dmar_domain *tmp;
2637 u16 req_id, dma_alias;
2640 iommu = device_to_iommu(dev, &bus, &devfn);
2644 req_id = ((u16)bus << 8) | devfn;
2646 if (dev_is_pci(dev)) {
2647 struct pci_dev *pdev = to_pci_dev(dev);
2649 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2651 /* register PCI DMA alias device */
2652 if (req_id != dma_alias) {
2653 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2654 dma_alias & 0xff, NULL, domain);
2656 if (!tmp || tmp != domain)
2661 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2662 if (!tmp || tmp != domain)
2668 static int iommu_domain_identity_map(struct dmar_domain *domain,
2669 unsigned long long start,
2670 unsigned long long end)
2672 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2673 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2675 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2676 dma_to_mm_pfn(last_vpfn))) {
2677 pr_err("Reserving iova failed\n");
2681 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2683 * The RMRR range might overlap a physical memory range, so clear any existing mapping first.
2686 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2688 return __domain_mapping(domain, first_vpfn, NULL,
2689 first_vpfn, last_vpfn - first_vpfn + 1,
2690 DMA_PTE_READ|DMA_PTE_WRITE);
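/*
 * Illustrative sketch, not upstream code: identity-mapping a single RMRR
 * with the helper above, the same way the si_domain_init() loop below does
 * it.  The helper name is hypothetical.
 */
static int __maybe_unused example_map_rmrr(struct dmar_domain *domain,
					   struct dmar_rmrr_unit *rmrr)
{
	/* base_address/end_address describe an inclusive physical range. */
	return iommu_domain_identity_map(domain, rmrr->base_address,
					 rmrr->end_address);
}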
2693 static int domain_prepare_identity_map(struct device *dev,
2694 struct dmar_domain *domain,
2695 unsigned long long start,
2696 unsigned long long end)
2698 /* For _hardware_ passthrough, don't bother. But for software
2699 passthrough, we do it anyway -- it may indicate a memory
2700 range which is reserved in E820 and so didn't get set
2701 up to start with in si_domain */
2702 if (domain == si_domain && hw_pass_through) {
2703 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2708 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2711 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2712 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2713 dmi_get_system_info(DMI_BIOS_VENDOR),
2714 dmi_get_system_info(DMI_BIOS_VERSION),
2715 dmi_get_system_info(DMI_PRODUCT_VERSION));
2719 if (end >> agaw_to_width(domain->agaw)) {
2720 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2721 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2722 agaw_to_width(domain->agaw),
2723 dmi_get_system_info(DMI_BIOS_VENDOR),
2724 dmi_get_system_info(DMI_BIOS_VERSION),
2725 dmi_get_system_info(DMI_PRODUCT_VERSION));
2729 return iommu_domain_identity_map(domain, start, end);
2732 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2734 static int __init si_domain_init(int hw)
2736 struct dmar_rmrr_unit *rmrr;
2740 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2744 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2745 domain_exit(si_domain);
2752 for_each_online_node(nid) {
2753 unsigned long start_pfn, end_pfn;
2756 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2757 ret = iommu_domain_identity_map(si_domain,
2758 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2765 * Identity map the RMRRs so that devices with RMRRs could also use the si_domain.
2768 for_each_rmrr_units(rmrr) {
2769 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2771 unsigned long long start = rmrr->base_address;
2772 unsigned long long end = rmrr->end_address;
2774 if (WARN_ON(end < start ||
2775 end >> agaw_to_width(si_domain->agaw)))
2778 ret = iommu_domain_identity_map(si_domain, start, end);
2787 static int identity_mapping(struct device *dev)
2789 struct device_domain_info *info;
2791 info = dev->archdata.iommu;
2792 if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO)
2793 return (info->domain == si_domain);
2798 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2800 struct dmar_domain *ndomain;
2801 struct intel_iommu *iommu;
2804 iommu = device_to_iommu(dev, &bus, &devfn);
2808 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2809 if (ndomain != domain)
2815 static bool device_has_rmrr(struct device *dev)
2817 struct dmar_rmrr_unit *rmrr;
2822 for_each_rmrr_units(rmrr) {
2824 * Return TRUE if this RMRR contains the device we are looking for.
2827 for_each_active_dev_scope(rmrr->devices,
2828 rmrr->devices_cnt, i, tmp)
2830 is_downstream_to_pci_bridge(dev, tmp)) {
2840 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2841 * is relaxable (ie. is allowed to be not enforced under some conditions)
2842 * @dev: device handle
2844 * We assume that PCI USB devices with RMRRs have them largely
2845 * for historical reasons and that the RMRR space is not actively used post
2846 * boot. This exclusion may change if vendors begin to abuse it.
2848 * The same exception is made for graphics devices, with the requirement that
2849 * any use of the RMRR regions will be torn down before assigning the device to a guest.
2852 * Return: true if the RMRR is relaxable, false otherwise
2854 static bool device_rmrr_is_relaxable(struct device *dev)
2856 struct pci_dev *pdev;
2858 if (!dev_is_pci(dev))
2861 pdev = to_pci_dev(dev);
2862 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2869 * There are a couple cases where we need to restrict the functionality of
2870 * devices associated with RMRRs. The first is when evaluating a device for
2871 * identity mapping because problems exist when devices are moved in and out
2872 * of domains and their respective RMRR information is lost. This means that
2873 * a device with associated RMRRs will never be in a "passthrough" domain.
2874 * The second is use of the device through the IOMMU API. This interface
2875 * expects to have full control of the IOVA space for the device. We cannot
2876 * satisfy both the requirement that RMRR access is maintained and have an
2877 * unencumbered IOVA space. We also have no ability to quiesce the device's
2878 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2879 * We therefore prevent devices associated with an RMRR from participating in
2880 * the IOMMU API, which eliminates them from device assignment.
2882 * In both cases, devices which have relaxable RMRRs are not affected by this
2883 * restriction. See device_rmrr_is_relaxable comment.
2885 static bool device_is_rmrr_locked(struct device *dev)
2887 if (!device_has_rmrr(dev))
2890 if (device_rmrr_is_relaxable(dev))
2897 * Return the required default domain type for a specific device.
2899 * @dev: the device in question
2903 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2904 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2905 * - 0: both identity and dynamic domains work for this device
2907 static int device_def_domain_type(struct device *dev)
2909 if (dev_is_pci(dev)) {
2910 struct pci_dev *pdev = to_pci_dev(dev);
2913 * Prevent any device marked as untrusted from getting
2914 * placed into the statically identity mapping domain.
2916 if (pdev->untrusted)
2917 return IOMMU_DOMAIN_DMA;
2919 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2920 return IOMMU_DOMAIN_IDENTITY;
2922 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2923 return IOMMU_DOMAIN_IDENTITY;
2926 * We want to start off with all devices in the 1:1 domain, and
2927 * take them out later if we find they can't access all of memory.
2929 * However, we can't do this for PCI devices behind bridges,
2930 * because all PCI devices behind the same bridge will end up
2931 * with the same source-id on their transactions.
2933 * Practically speaking, we can't change things around for these
2934 * devices at run-time, because we can't be sure there'll be no
2935 * DMA transactions in flight for any of their siblings.
2937 * So PCI devices (unless they're on the root bus) as well as
2938 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2939 * the 1:1 domain, just in _case_ one of their siblings turns out
2940 * not to be able to map all of memory.
2942 if (!pci_is_pcie(pdev)) {
2943 if (!pci_is_root_bus(pdev->bus))
2944 return IOMMU_DOMAIN_DMA;
2945 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2946 return IOMMU_DOMAIN_DMA;
2947 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2948 return IOMMU_DOMAIN_DMA;
2951 return (iommu_identity_mapping & IDENTMAP_ALL) ?
2952 IOMMU_DOMAIN_IDENTITY : 0;
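/*
 * Concrete examples of the policy above: an untrusted (external facing)
 * PCI device always gets IOMMU_DOMAIN_DMA; integrated graphics or the
 * Azalia audio controller get IOMMU_DOMAIN_IDENTITY when the matching
 * IDENTMAP_* bit is set; and a conventional PCI device off the root bus,
 * or a PCIe-to-PCI bridge, is forced into IOMMU_DOMAIN_DMA because
 * everything behind such a bridge shares one source-id.
 */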
2955 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2958 * Start from the sane iommu hardware state.
2959 * If the queued invalidation is already initialized by us
2960 * (for example, while enabling interrupt-remapping) then
2961 * things are already rolling from a sane state.
2965 * Clear any previous faults.
2967 dmar_fault(-1, iommu);
2969 * Disable queued invalidation if supported and already enabled
2970 * before OS handover.
2972 dmar_disable_qi(iommu);
2975 if (dmar_enable_qi(iommu)) {
2977 * Queued Invalidate not enabled, use Register Based Invalidate
2979 iommu->flush.flush_context = __iommu_flush_context;
2980 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2981 pr_info("%s: Using Register based invalidation\n",
2984 iommu->flush.flush_context = qi_flush_context;
2985 iommu->flush.flush_iotlb = qi_flush_iotlb;
2986 pr_info("%s: Using Queued invalidation\n", iommu->name);
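/*
 * From here on iommu->flush points either at the queued-invalidation
 * callbacks (qi_flush_context/qi_flush_iotlb) or at the legacy
 * register-based ones (__iommu_flush_context/__iommu_flush_iotlb);
 * callers never need to know which mode was selected.
 */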
2990 static int copy_context_table(struct intel_iommu *iommu,
2991 struct root_entry *old_re,
2992 struct context_entry **tbl,
2995 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2996 struct context_entry *new_ce = NULL, ce;
2997 struct context_entry *old_ce = NULL;
2998 struct root_entry re;
2999 phys_addr_t old_ce_phys;
3001 tbl_idx = ext ? bus * 2 : bus;
3002 memcpy(&re, old_re, sizeof(re));
3004 for (devfn = 0; devfn < 256; devfn++) {
3005 /* First calculate the correct index */
3006 idx = (ext ? devfn * 2 : devfn) % 256;
3009 /* First save what we may have and clean up */
3011 tbl[tbl_idx] = new_ce;
3012 __iommu_flush_cache(iommu, new_ce,
3022 old_ce_phys = root_entry_lctp(&re);
3024 old_ce_phys = root_entry_uctp(&re);
3027 if (ext && devfn == 0) {
3028 /* No LCTP, try UCTP */
3037 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3042 new_ce = alloc_pgtable_page(iommu->node);
3049 /* Now copy the context entry */
3050 memcpy(&ce, old_ce + idx, sizeof(ce));
3052 if (!__context_present(&ce))
3055 did = context_domain_id(&ce);
3056 if (did >= 0 && did < cap_ndoms(iommu->cap))
3057 set_bit(did, iommu->domain_ids);
3060 * We need a marker for copied context entries. This
3061 * marker needs to work for the old format as well as
3062 * for extended context entries.
3064 * Bit 67 of the context entry is used. In the old
3065 * format this bit is available to software, in the
3066 * extended format it is the PGE bit, but PGE is ignored
3067 * by HW if PASIDs are disabled (and thus still available).
3070 * So disable PASIDs first and then mark the entry
3071 * copied. This means that we don't copy PASID
3072 * translations from the old kernel, but this is fine as
3073 * faults there are not fatal.
3075 context_clear_pasid_enable(&ce);
3076 context_set_copied(&ce);
3081 tbl[tbl_idx + pos] = new_ce;
3083 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3092 static int copy_translation_tables(struct intel_iommu *iommu)
3094 struct context_entry **ctxt_tbls;
3095 struct root_entry *old_rt;
3096 phys_addr_t old_rt_phys;
3097 int ctxt_table_entries;
3098 unsigned long flags;
3103 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3104 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3105 new_ext = !!ecap_ecs(iommu->ecap);
3108 * The RTT bit can only be changed when translation is disabled,
3109 * but disabling translation would open a window for data
3110 * corruption. So bail out and don't copy anything if we would
3111 * have to change the bit.
3116 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3120 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3124 /* This is too big for the stack - allocate it from slab */
3125 ctxt_table_entries = ext ? 512 : 256;
3127 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3131 for (bus = 0; bus < 256; bus++) {
3132 ret = copy_context_table(iommu, &old_rt[bus],
3133 ctxt_tbls, bus, ext);
3135 pr_err("%s: Failed to copy context table for bus %d\n",
3141 spin_lock_irqsave(&iommu->lock, flags);
3143 /* Context tables are copied, now write them to the root_entry table */
3144 for (bus = 0; bus < 256; bus++) {
3145 int idx = ext ? bus * 2 : bus;
3148 if (ctxt_tbls[idx]) {
3149 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3150 iommu->root_entry[bus].lo = val;
3153 if (!ext || !ctxt_tbls[idx + 1])
3156 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3157 iommu->root_entry[bus].hi = val;
3160 spin_unlock_irqrestore(&iommu->lock, flags);
3164 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
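/*
 * Layout note for the copy above: in extended-root mode each bus owns two
 * context tables, so ctxt_tbls[] holds 512 entries and bus N fills both
 * root_entry[N].lo and root_entry[N].hi; in legacy mode only .lo is used.
 */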
3174 static int __init init_dmars(void)
3176 struct dmar_drhd_unit *drhd;
3177 struct intel_iommu *iommu;
3183 * initialize and program root entry to not present
3186 for_each_drhd_unit(drhd) {
3188 * lock not needed as this is only incremented in the single-
3189 * threaded kernel __init code path; all other accesses are read-only
3192 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3196 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3199 /* Preallocate enough resources for IOMMU hot-addition */
3200 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3201 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3203 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3206 pr_err("Allocating global iommu array failed\n");
3211 for_each_iommu(iommu, drhd) {
3212 if (drhd->ignored) {
3213 iommu_disable_translation(iommu);
3218 * Find the max pasid size of all IOMMUs in the system.
3219 * We need to ensure the system pasid table is no bigger
3220 * than the smallest supported.
3222 if (pasid_supported(iommu)) {
3223 u32 temp = 2 << ecap_pss(iommu->ecap);
3225 intel_pasid_max_id = min_t(u32, temp,
3226 intel_pasid_max_id);
3229 g_iommus[iommu->seq_id] = iommu;
3231 intel_iommu_init_qi(iommu);
3233 ret = iommu_init_domains(iommu);
3237 init_translation_status(iommu);
3239 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3240 iommu_disable_translation(iommu);
3241 clear_translation_pre_enabled(iommu);
3242 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3248 * we could share the same root & context tables
3249 * among all IOMMUs; need to split it later.
3251 ret = iommu_alloc_root_entry(iommu);
3255 if (translation_pre_enabled(iommu)) {
3256 pr_info("Translation already enabled - trying to copy translation structures\n");
3258 ret = copy_translation_tables(iommu);
3261 * We found the IOMMU with translation
3262 * enabled - but failed to copy over the
3263 * old root-entry table. Try to proceed
3264 * by disabling translation now and
3265 * allocating a clean root-entry table.
3266 * This might cause DMAR faults, but
3267 * probably the dump will still succeed.
3269 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3271 iommu_disable_translation(iommu);
3272 clear_translation_pre_enabled(iommu);
3274 pr_info("Copied translation tables from previous kernel for %s\n",
3279 if (!ecap_pass_through(iommu->ecap))
3280 hw_pass_through = 0;
3281 #ifdef CONFIG_INTEL_IOMMU_SVM
3282 if (pasid_supported(iommu))
3283 intel_svm_init(iommu);
3288 * Now that qi is enabled on all iommus, set the root entry and flush
3289 * caches. This is required on some Intel X58 chipsets, otherwise the
3290 * flush_context function will loop forever and the boot hangs.
3292 for_each_active_iommu(iommu, drhd) {
3293 iommu_flush_write_buffer(iommu);
3294 iommu_set_root_entry(iommu);
3295 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3296 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3299 if (iommu_default_passthrough())
3300 iommu_identity_mapping |= IDENTMAP_ALL;
3302 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3307 iommu_identity_mapping |= IDENTMAP_GFX;
3309 check_tylersburg_isoch();
3311 ret = si_domain_init(hw_pass_through);
3318 * global invalidate context cache
3319 * global invalidate iotlb
3320 * enable translation
3322 for_each_iommu(iommu, drhd) {
3323 if (drhd->ignored) {
3325 * we always have to disable PMRs or DMA may fail on this device
3329 iommu_disable_protect_mem_regions(iommu);
3333 iommu_flush_write_buffer(iommu);
3335 #ifdef CONFIG_INTEL_IOMMU_SVM
3336 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3338 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3339 * could cause a lock race, so temporarily drop the lock.
3341 up_write(&dmar_global_lock);
3342 ret = intel_svm_enable_prq(iommu);
3343 down_write(&dmar_global_lock);
3348 ret = dmar_set_interrupt(iommu);
3356 for_each_active_iommu(iommu, drhd) {
3357 disable_dmar_iommu(iommu);
3358 free_dmar_iommu(iommu);
3367 /* This takes a number of _MM_ pages, not VTD pages */
3368 static unsigned long intel_alloc_iova(struct device *dev,
3369 struct dmar_domain *domain,
3370 unsigned long nrpages, uint64_t dma_mask)
3372 unsigned long iova_pfn;
3374 /* Restrict dma_mask to the width that the iommu can handle */
3375 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3376 /* Ensure we reserve the whole size-aligned region */
3377 nrpages = __roundup_pow_of_two(nrpages);
3379 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3381 * First try to allocate an io virtual address in
3382 * DMA_BIT_MASK(32) and if that fails then try allocating from the higher range.
3385 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3386 IOVA_PFN(DMA_BIT_MASK(32)), false);
3390 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3391 IOVA_PFN(dma_mask), true);
3392 if (unlikely(!iova_pfn)) {
3393 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3401 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3403 struct dmar_domain *domain, *tmp;
3404 struct dmar_rmrr_unit *rmrr;
3405 struct device *i_dev;
3408 /* The device shouldn't be attached to any domain yet. */
3409 domain = find_domain(dev);
3413 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3417 /* We have a new domain - setup possible RMRRs for the device */
3419 for_each_rmrr_units(rmrr) {
3420 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3425 ret = domain_prepare_identity_map(dev, domain,
3429 dev_err(dev, "Mapping reserved region failed\n");
3434 tmp = set_domain_for_dev(dev, domain);
3435 if (!tmp || domain != tmp) {
3436 domain_exit(domain);
3442 dev_err(dev, "Allocating domain failed\n");
3444 domain->domain.type = IOMMU_DOMAIN_DMA;
3449 /* Check whether the device needs to go through the non-identity map/unmap process. */
3450 static bool iommu_need_mapping(struct device *dev)
3454 if (iommu_dummy(dev))
3457 ret = identity_mapping(dev);
3459 u64 dma_mask = *dev->dma_mask;
3461 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3462 dma_mask = dev->coherent_dma_mask;
3464 if (dma_mask >= dma_direct_get_required_mask(dev))
3468 * 32-bit DMA is removed from si_domain and falls back to
3469 * non-identity mapping.
3471 dmar_remove_one_dev_info(dev);
3472 ret = iommu_request_dma_domain_for_dev(dev);
3474 struct iommu_domain *domain;
3475 struct dmar_domain *dmar_domain;
3477 domain = iommu_get_domain_for_dev(dev);
3479 dmar_domain = to_dmar_domain(domain);
3480 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3482 dmar_remove_one_dev_info(dev);
3483 get_private_domain_for_dev(dev);
3486 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3492 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3493 size_t size, int dir, u64 dma_mask)
3495 struct dmar_domain *domain;
3496 phys_addr_t start_paddr;
3497 unsigned long iova_pfn;
3500 struct intel_iommu *iommu;
3501 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3503 BUG_ON(dir == DMA_NONE);
3505 domain = find_domain(dev);
3507 return DMA_MAPPING_ERROR;
3509 iommu = domain_get_iommu(domain);
3510 size = aligned_nrpages(paddr, size);
3512 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3517 * Check if DMAR supports zero-length reads on write-only mappings
3520 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3521 !cap_zlr(iommu->cap))
3522 prot |= DMA_PTE_READ;
3523 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3524 prot |= DMA_PTE_WRITE;
3526 * paddr - (paddr + size) might be a partial page, so we should map the whole
3527 * page. Note: if two parts of one page are separately mapped, we
3528 * might have two guest addresses mapping to the same host paddr, but this
3529 * is not a big problem
3531 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3532 mm_to_dma_pfn(paddr_pfn), size, prot);
3536 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3537 start_paddr += paddr & ~PAGE_MASK;
3539 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3545 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3546 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3547 size, (unsigned long long)paddr, dir);
3548 return DMA_MAPPING_ERROR;
3551 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3552 unsigned long offset, size_t size,
3553 enum dma_data_direction dir,
3554 unsigned long attrs)
3556 if (iommu_need_mapping(dev))
3557 return __intel_map_single(dev, page_to_phys(page) + offset,
3558 size, dir, *dev->dma_mask);
3559 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3562 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3563 size_t size, enum dma_data_direction dir,
3564 unsigned long attrs)
3566 if (iommu_need_mapping(dev))
3567 return __intel_map_single(dev, phys_addr, size, dir,
3569 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3572 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3574 struct dmar_domain *domain;
3575 unsigned long start_pfn, last_pfn;
3576 unsigned long nrpages;
3577 unsigned long iova_pfn;
3578 struct intel_iommu *iommu;
3579 struct page *freelist;
3580 struct pci_dev *pdev = NULL;
3582 domain = find_domain(dev);
3585 iommu = domain_get_iommu(domain);
3587 iova_pfn = IOVA_PFN(dev_addr);
3589 nrpages = aligned_nrpages(dev_addr, size);
3590 start_pfn = mm_to_dma_pfn(iova_pfn);
3591 last_pfn = start_pfn + nrpages - 1;
3593 if (dev_is_pci(dev))
3594 pdev = to_pci_dev(dev);
3596 freelist = domain_unmap(domain, start_pfn, last_pfn);
3597 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3598 !has_iova_flush_queue(&domain->iovad)) {
3599 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3600 nrpages, !freelist, 0);
3602 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3603 dma_free_pagelist(freelist);
3605 queue_iova(&domain->iovad, iova_pfn, nrpages,
3606 (unsigned long)freelist);
3608 * queue up the release of the unmap to save roughly 1/6th of the
3609 * CPU time used up by the iotlb flush operation...
3613 trace_unmap_single(dev, dev_addr, size);
3616 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3617 size_t size, enum dma_data_direction dir,
3618 unsigned long attrs)
3620 if (iommu_need_mapping(dev))
3621 intel_unmap(dev, dev_addr, size);
3623 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3626 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3627 size_t size, enum dma_data_direction dir, unsigned long attrs)
3629 if (iommu_need_mapping(dev))
3630 intel_unmap(dev, dev_addr, size);
3633 static void *intel_alloc_coherent(struct device *dev, size_t size,
3634 dma_addr_t *dma_handle, gfp_t flags,
3635 unsigned long attrs)
3637 struct page *page = NULL;
3640 if (!iommu_need_mapping(dev))
3641 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3643 size = PAGE_ALIGN(size);
3644 order = get_order(size);
3646 if (gfpflags_allow_blocking(flags)) {
3647 unsigned int count = size >> PAGE_SHIFT;
3649 page = dma_alloc_from_contiguous(dev, count, order,
3650 flags & __GFP_NOWARN);
3654 page = alloc_pages(flags, order);
3657 memset(page_address(page), 0, size);
3659 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3661 dev->coherent_dma_mask);
3662 if (*dma_handle != DMA_MAPPING_ERROR)
3663 return page_address(page);
3664 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3665 __free_pages(page, order);
3670 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3671 dma_addr_t dma_handle, unsigned long attrs)
3674 struct page *page = virt_to_page(vaddr);
3676 if (!iommu_need_mapping(dev))
3677 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3679 size = PAGE_ALIGN(size);
3680 order = get_order(size);
3682 intel_unmap(dev, dma_handle, size);
3683 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3684 __free_pages(page, order);
3687 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3688 int nelems, enum dma_data_direction dir,
3689 unsigned long attrs)
3691 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3692 unsigned long nrpages = 0;
3693 struct scatterlist *sg;
3696 if (!iommu_need_mapping(dev))
3697 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3699 for_each_sg(sglist, sg, nelems, i) {
3700 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3703 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3705 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3708 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3709 enum dma_data_direction dir, unsigned long attrs)
3712 struct dmar_domain *domain;
3715 unsigned long iova_pfn;
3717 struct scatterlist *sg;
3718 unsigned long start_vpfn;
3719 struct intel_iommu *iommu;
3721 BUG_ON(dir == DMA_NONE);
3722 if (!iommu_need_mapping(dev))
3723 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3725 domain = find_domain(dev);
3729 iommu = domain_get_iommu(domain);
3731 for_each_sg(sglist, sg, nelems, i)
3732 size += aligned_nrpages(sg->offset, sg->length);
3734 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3737 sglist->dma_length = 0;
3742 * Check if DMAR supports zero-length reads on write-only mappings
3745 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3746 !cap_zlr(iommu->cap))
3747 prot |= DMA_PTE_READ;
3748 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3749 prot |= DMA_PTE_WRITE;
3751 start_vpfn = mm_to_dma_pfn(iova_pfn);
3753 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3754 if (unlikely(ret)) {
3755 dma_pte_free_pagetable(domain, start_vpfn,
3756 start_vpfn + size - 1,
3757 agaw_to_level(domain->agaw) + 1);
3758 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3762 trace_map_sg(dev, iova_pfn << PAGE_SHIFT,
3763 sg_phys(sglist), size << VTD_PAGE_SHIFT);
3768 static u64 intel_get_required_mask(struct device *dev)
3770 if (!iommu_need_mapping(dev))
3771 return dma_direct_get_required_mask(dev);
3772 return DMA_BIT_MASK(32);
3775 static const struct dma_map_ops intel_dma_ops = {
3776 .alloc = intel_alloc_coherent,
3777 .free = intel_free_coherent,
3778 .map_sg = intel_map_sg,
3779 .unmap_sg = intel_unmap_sg,
3780 .map_page = intel_map_page,
3781 .unmap_page = intel_unmap_page,
3782 .map_resource = intel_map_resource,
3783 .unmap_resource = intel_unmap_resource,
3784 .dma_supported = dma_direct_supported,
3785 .mmap = dma_common_mmap,
3786 .get_sgtable = dma_common_get_sgtable,
3787 .get_required_mask = intel_get_required_mask,
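/*
 * Illustrative sketch, not part of this file: once a device is left with
 * intel_dma_ops as its dma_map_ops, an ordinary streaming mapping in a
 * driver like the one below ends up in intel_map_page()/intel_unmap_page()
 * above.  The function name, buffer and length are placeholders.
 */
static int __maybe_unused example_driver_dma(struct device *dev, void *buf,
					     size_t len)
{
	dma_addr_t dma = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

	if (dma_mapping_error(dev, dma))
		return -ENOMEM;

	/* ... hand 'dma' to the hardware and wait for completion ... */

	dma_unmap_single(dev, dma, len, DMA_TO_DEVICE);
	return 0;
}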
3791 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3792 enum dma_data_direction dir, enum dma_sync_target target)
3794 struct dmar_domain *domain;
3795 phys_addr_t tlb_addr;
3797 domain = find_domain(dev);
3798 if (WARN_ON(!domain))
3801 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3802 if (is_swiotlb_buffer(tlb_addr))
3803 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3807 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3808 enum dma_data_direction dir, unsigned long attrs,
3811 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3812 struct dmar_domain *domain;
3813 struct intel_iommu *iommu;
3814 unsigned long iova_pfn;
3815 unsigned long nrpages;
3816 phys_addr_t tlb_addr;
3820 domain = find_domain(dev);
3821 if (WARN_ON(dir == DMA_NONE || !domain))
3822 return DMA_MAPPING_ERROR;
3824 iommu = domain_get_iommu(domain);
3825 if (WARN_ON(!iommu))
3826 return DMA_MAPPING_ERROR;
3828 nrpages = aligned_nrpages(0, size);
3829 iova_pfn = intel_alloc_iova(dev, domain,
3830 dma_to_mm_pfn(nrpages), dma_mask);
3832 return DMA_MAPPING_ERROR;
3835 * Check if DMAR supports zero-length reads on write-only mappings
3838 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3839 !cap_zlr(iommu->cap))
3840 prot |= DMA_PTE_READ;
3841 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3842 prot |= DMA_PTE_WRITE;
3845 * If both the physical buffer start address and size are
3846 * page aligned, we don't need to use a bounce page.
3848 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3849 tlb_addr = swiotlb_tbl_map_single(dev,
3850 __phys_to_dma(dev, io_tlb_start),
3851 paddr, size, aligned_size, dir, attrs);
3852 if (tlb_addr == DMA_MAPPING_ERROR) {
3855 /* Cleanup the padding area. */
3856 void *padding_start = phys_to_virt(tlb_addr);
3857 size_t padding_size = aligned_size;
3859 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3860 (dir == DMA_TO_DEVICE ||
3861 dir == DMA_BIDIRECTIONAL)) {
3862 padding_start += size;
3863 padding_size -= size;
3866 memset(padding_start, 0, padding_size);
3872 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3873 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3877 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3879 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3882 if (is_swiotlb_buffer(tlb_addr))
3883 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3884 aligned_size, dir, attrs);
3886 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3887 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3888 size, (unsigned long long)paddr, dir);
3890 return DMA_MAPPING_ERROR;
3894 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3895 enum dma_data_direction dir, unsigned long attrs)
3897 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3898 struct dmar_domain *domain;
3899 phys_addr_t tlb_addr;
3901 domain = find_domain(dev);
3902 if (WARN_ON(!domain))
3905 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3906 if (WARN_ON(!tlb_addr))
3909 intel_unmap(dev, dev_addr, size);
3910 if (is_swiotlb_buffer(tlb_addr))
3911 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3912 aligned_size, dir, attrs);
3914 trace_bounce_unmap_single(dev, dev_addr, size);
3918 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3919 size_t size, enum dma_data_direction dir, unsigned long attrs)
3921 return bounce_map_single(dev, page_to_phys(page) + offset,
3922 size, dir, attrs, *dev->dma_mask);
3926 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3927 enum dma_data_direction dir, unsigned long attrs)
3929 return bounce_map_single(dev, phys_addr, size,
3930 dir, attrs, *dev->dma_mask);
3934 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3935 enum dma_data_direction dir, unsigned long attrs)
3937 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3941 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3942 enum dma_data_direction dir, unsigned long attrs)
3944 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3948 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3949 enum dma_data_direction dir, unsigned long attrs)
3951 struct scatterlist *sg;
3954 for_each_sg(sglist, sg, nelems, i)
3955 bounce_unmap_page(dev, sg->dma_address,
3956 sg_dma_len(sg), dir, attrs);
3960 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3961 enum dma_data_direction dir, unsigned long attrs)
3964 struct scatterlist *sg;
3966 for_each_sg(sglist, sg, nelems, i) {
3967 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3968 sg->offset, sg->length,
3970 if (sg->dma_address == DMA_MAPPING_ERROR)
3972 sg_dma_len(sg) = sg->length;
3978 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3983 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3984 size_t size, enum dma_data_direction dir)
3986 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3990 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3991 size_t size, enum dma_data_direction dir)
3993 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3997 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3998 int nelems, enum dma_data_direction dir)
4000 struct scatterlist *sg;
4003 for_each_sg(sglist, sg, nelems, i)
4004 bounce_sync_single(dev, sg_dma_address(sg),
4005 sg_dma_len(sg), dir, SYNC_FOR_CPU);
4009 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4010 int nelems, enum dma_data_direction dir)
4012 struct scatterlist *sg;
4015 for_each_sg(sglist, sg, nelems, i)
4016 bounce_sync_single(dev, sg_dma_address(sg),
4017 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4020 static const struct dma_map_ops bounce_dma_ops = {
4021 .alloc = intel_alloc_coherent,
4022 .free = intel_free_coherent,
4023 .map_sg = bounce_map_sg,
4024 .unmap_sg = bounce_unmap_sg,
4025 .map_page = bounce_map_page,
4026 .unmap_page = bounce_unmap_page,
4027 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4028 .sync_single_for_device = bounce_sync_single_for_device,
4029 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4030 .sync_sg_for_device = bounce_sync_sg_for_device,
4031 .map_resource = bounce_map_resource,
4032 .unmap_resource = bounce_unmap_resource,
4033 .dma_supported = dma_direct_supported,
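/*
 * bounce_dma_ops still translates DMA through the IOMMU, but any request
 * whose start or size is not VTD_PAGE_SIZE aligned is first copied through
 * a swiotlb bounce buffer (see bounce_map_single() above), so a device
 * using these ops never gets access to unrelated data sharing its pages.
 */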
4036 static inline int iommu_domain_cache_init(void)
4040 iommu_domain_cache = kmem_cache_create("iommu_domain",
4041 sizeof(struct dmar_domain),
4046 if (!iommu_domain_cache) {
4047 pr_err("Couldn't create iommu_domain cache\n");
4054 static inline int iommu_devinfo_cache_init(void)
4058 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4059 sizeof(struct device_domain_info),
4063 if (!iommu_devinfo_cache) {
4064 pr_err("Couldn't create devinfo cache\n");
4071 static int __init iommu_init_mempool(void)
4074 ret = iova_cache_get();
4078 ret = iommu_domain_cache_init();
4082 ret = iommu_devinfo_cache_init();
4086 kmem_cache_destroy(iommu_domain_cache);
4093 static void __init iommu_exit_mempool(void)
4095 kmem_cache_destroy(iommu_devinfo_cache);
4096 kmem_cache_destroy(iommu_domain_cache);
4100 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4102 struct dmar_drhd_unit *drhd;
4106 /* We know that this device on this chipset has its own IOMMU.
4107 * If we find it under a different IOMMU, then the BIOS is lying
4108 * to us. Hope that the IOMMU for this device is actually
4109 * disabled, and it needs no translation...
4111 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4113 /* "can't" happen */
4114 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4117 vtbar &= 0xffff0000;
4119 /* we know that this iommu should be at offset 0xa000 from vtbar */
4120 drhd = dmar_find_matched_drhd_unit(pdev);
4121 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
4122 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4123 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4124 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4127 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4129 static void __init init_no_remapping_devices(void)
4131 struct dmar_drhd_unit *drhd;
4135 for_each_drhd_unit(drhd) {
4136 if (!drhd->include_all) {
4137 for_each_active_dev_scope(drhd->devices,
4138 drhd->devices_cnt, i, dev)
4140 /* ignore DMAR unit if no devices exist */
4141 if (i == drhd->devices_cnt)
4146 for_each_active_drhd_unit(drhd) {
4147 if (drhd->include_all)
4150 for_each_active_dev_scope(drhd->devices,
4151 drhd->devices_cnt, i, dev)
4152 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4154 if (i < drhd->devices_cnt)
4157 /* This IOMMU has *only* gfx devices. Either bypass it or
4158 set the gfx_mapped flag, as appropriate */
4159 if (!dmar_map_gfx) {
4161 for_each_active_dev_scope(drhd->devices,
4162 drhd->devices_cnt, i, dev)
4163 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4168 #ifdef CONFIG_SUSPEND
4169 static int init_iommu_hw(void)
4171 struct dmar_drhd_unit *drhd;
4172 struct intel_iommu *iommu = NULL;
4174 for_each_active_iommu(iommu, drhd)
4176 dmar_reenable_qi(iommu);
4178 for_each_iommu(iommu, drhd) {
4179 if (drhd->ignored) {
4181 * we always have to disable PMRs or DMA may fail on this device
4185 iommu_disable_protect_mem_regions(iommu);
4189 iommu_flush_write_buffer(iommu);
4191 iommu_set_root_entry(iommu);
4193 iommu->flush.flush_context(iommu, 0, 0, 0,
4194 DMA_CCMD_GLOBAL_INVL);
4195 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4196 iommu_enable_translation(iommu);
4197 iommu_disable_protect_mem_regions(iommu);
4203 static void iommu_flush_all(void)
4205 struct dmar_drhd_unit *drhd;
4206 struct intel_iommu *iommu;
4208 for_each_active_iommu(iommu, drhd) {
4209 iommu->flush.flush_context(iommu, 0, 0, 0,
4210 DMA_CCMD_GLOBAL_INVL);
4211 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4212 DMA_TLB_GLOBAL_FLUSH);
4216 static int iommu_suspend(void)
4218 struct dmar_drhd_unit *drhd;
4219 struct intel_iommu *iommu = NULL;
4222 for_each_active_iommu(iommu, drhd) {
4223 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4225 if (!iommu->iommu_state)
4231 for_each_active_iommu(iommu, drhd) {
4232 iommu_disable_translation(iommu);
4234 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4236 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4237 readl(iommu->reg + DMAR_FECTL_REG);
4238 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4239 readl(iommu->reg + DMAR_FEDATA_REG);
4240 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4241 readl(iommu->reg + DMAR_FEADDR_REG);
4242 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4243 readl(iommu->reg + DMAR_FEUADDR_REG);
4245 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4250 for_each_active_iommu(iommu, drhd)
4251 kfree(iommu->iommu_state);
4256 static void iommu_resume(void)
4258 struct dmar_drhd_unit *drhd;
4259 struct intel_iommu *iommu = NULL;
4262 if (init_iommu_hw()) {
4264 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4266 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4270 for_each_active_iommu(iommu, drhd) {
4272 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4274 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4275 iommu->reg + DMAR_FECTL_REG);
4276 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4277 iommu->reg + DMAR_FEDATA_REG);
4278 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4279 iommu->reg + DMAR_FEADDR_REG);
4280 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4281 iommu->reg + DMAR_FEUADDR_REG);
4283 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4286 for_each_active_iommu(iommu, drhd)
4287 kfree(iommu->iommu_state);
4290 static struct syscore_ops iommu_syscore_ops = {
4291 .resume = iommu_resume,
4292 .suspend = iommu_suspend,
4295 static void __init init_iommu_pm_ops(void)
4297 register_syscore_ops(&iommu_syscore_ops);
4301 static inline void init_iommu_pm_ops(void) {}
4302 #endif /* CONFIG_PM */
4304 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4306 struct acpi_dmar_reserved_memory *rmrr;
4307 struct dmar_rmrr_unit *rmrru;
4309 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4313 rmrru->hdr = header;
4314 rmrr = (struct acpi_dmar_reserved_memory *)header;
4315 rmrru->base_address = rmrr->base_address;
4316 rmrru->end_address = rmrr->end_address;
4318 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4319 ((void *)rmrr) + rmrr->header.length,
4320 &rmrru->devices_cnt);
4321 if (rmrru->devices_cnt && rmrru->devices == NULL)
4324 list_add(&rmrru->list, &dmar_rmrr_units);
4333 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4335 struct dmar_atsr_unit *atsru;
4336 struct acpi_dmar_atsr *tmp;
4338 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4340 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4341 if (atsr->segment != tmp->segment)
4343 if (atsr->header.length != tmp->header.length)
4345 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4352 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4354 struct acpi_dmar_atsr *atsr;
4355 struct dmar_atsr_unit *atsru;
4357 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4360 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4361 atsru = dmar_find_atsr(atsr);
4365 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4370 * If memory is allocated from slab by ACPI _DSM method, we need to
4371 * copy the memory content because the memory buffer will be freed on exit
4374 atsru->hdr = (void *)(atsru + 1);
4375 memcpy(atsru->hdr, hdr, hdr->length);
4376 atsru->include_all = atsr->flags & 0x1;
4377 if (!atsru->include_all) {
4378 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4379 (void *)atsr + atsr->header.length,
4380 &atsru->devices_cnt);
4381 if (atsru->devices_cnt && atsru->devices == NULL) {
4387 list_add_rcu(&atsru->list, &dmar_atsr_units);
4392 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4394 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4398 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4400 struct acpi_dmar_atsr *atsr;
4401 struct dmar_atsr_unit *atsru;
4403 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4404 atsru = dmar_find_atsr(atsr);
4406 list_del_rcu(&atsru->list);
4408 intel_iommu_free_atsr(atsru);
4414 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4418 struct acpi_dmar_atsr *atsr;
4419 struct dmar_atsr_unit *atsru;
4421 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4422 atsru = dmar_find_atsr(atsr);
4426 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4427 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4435 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4438 struct intel_iommu *iommu = dmaru->iommu;
4440 if (g_iommus[iommu->seq_id])
4443 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4444 pr_warn("%s: Doesn't support hardware pass through.\n",
4448 if (!ecap_sc_support(iommu->ecap) &&
4449 domain_update_iommu_snooping(iommu)) {
4450 pr_warn("%s: Doesn't support snooping.\n",
4454 sp = domain_update_iommu_superpage(iommu) - 1;
4455 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4456 pr_warn("%s: Doesn't support large page.\n",
4462 * Disable translation if already enabled prior to OS handover.
4464 if (iommu->gcmd & DMA_GCMD_TE)
4465 iommu_disable_translation(iommu);
4467 g_iommus[iommu->seq_id] = iommu;
4468 ret = iommu_init_domains(iommu);
4470 ret = iommu_alloc_root_entry(iommu);
4474 #ifdef CONFIG_INTEL_IOMMU_SVM
4475 if (pasid_supported(iommu))
4476 intel_svm_init(iommu);
4479 if (dmaru->ignored) {
4481 * we always have to disable PMRs or DMA may fail on this device
4484 iommu_disable_protect_mem_regions(iommu);
4488 intel_iommu_init_qi(iommu);
4489 iommu_flush_write_buffer(iommu);
4491 #ifdef CONFIG_INTEL_IOMMU_SVM
4492 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4493 ret = intel_svm_enable_prq(iommu);
4498 ret = dmar_set_interrupt(iommu);
4502 iommu_set_root_entry(iommu);
4503 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4504 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4505 iommu_enable_translation(iommu);
4507 iommu_disable_protect_mem_regions(iommu);
4511 disable_dmar_iommu(iommu);
4513 free_dmar_iommu(iommu);
4517 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4520 struct intel_iommu *iommu = dmaru->iommu;
4522 if (!intel_iommu_enabled)
4528 ret = intel_iommu_add(dmaru);
4530 disable_dmar_iommu(iommu);
4531 free_dmar_iommu(iommu);
4537 static void intel_iommu_free_dmars(void)
4539 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4540 struct dmar_atsr_unit *atsru, *atsr_n;
4542 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4543 list_del(&rmrru->list);
4544 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4548 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4549 list_del(&atsru->list);
4550 intel_iommu_free_atsr(atsru);
4554 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4557 struct pci_bus *bus;
4558 struct pci_dev *bridge = NULL;
4560 struct acpi_dmar_atsr *atsr;
4561 struct dmar_atsr_unit *atsru;
4563 dev = pci_physfn(dev);
4564 for (bus = dev->bus; bus; bus = bus->parent) {
4566 /* If it's an integrated device, allow ATS */
4569 /* Connected via non-PCIe: no ATS */
4570 if (!pci_is_pcie(bridge) ||
4571 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4573 /* If we found the root port, look it up in the ATSR */
4574 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4579 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4580 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4581 if (atsr->segment != pci_domain_nr(dev->bus))
4584 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4585 if (tmp == &bridge->dev)
4588 if (atsru->include_all)
4598 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4601 struct dmar_rmrr_unit *rmrru;
4602 struct dmar_atsr_unit *atsru;
4603 struct acpi_dmar_atsr *atsr;
4604 struct acpi_dmar_reserved_memory *rmrr;
4606 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4609 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4610 rmrr = container_of(rmrru->hdr,
4611 struct acpi_dmar_reserved_memory, header);
4612 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4613 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4614 ((void *)rmrr) + rmrr->header.length,
4615 rmrr->segment, rmrru->devices,
4616 rmrru->devices_cnt);
4619 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4620 dmar_remove_dev_scope(info, rmrr->segment,
4621 rmrru->devices, rmrru->devices_cnt);
4625 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4626 if (atsru->include_all)
4629 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4630 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4631 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4632 (void *)atsr + atsr->header.length,
4633 atsr->segment, atsru->devices,
4634 atsru->devices_cnt);
4639 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4640 if (dmar_remove_dev_scope(info, atsr->segment,
4641 atsru->devices, atsru->devices_cnt))
4649 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4650 unsigned long val, void *v)
4652 struct memory_notify *mhp = v;
4653 unsigned long long start, end;
4654 unsigned long start_vpfn, last_vpfn;
4657 case MEM_GOING_ONLINE:
4658 start = mhp->start_pfn << PAGE_SHIFT;
4659 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4660 if (iommu_domain_identity_map(si_domain, start, end)) {
4661 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4668 case MEM_CANCEL_ONLINE:
4669 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4670 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4671 while (start_vpfn <= last_vpfn) {
4673 struct dmar_drhd_unit *drhd;
4674 struct intel_iommu *iommu;
4675 struct page *freelist;
4677 iova = find_iova(&si_domain->iovad, start_vpfn);
4679 pr_debug("Failed get IOVA for PFN %lx\n",
4684 iova = split_and_remove_iova(&si_domain->iovad, iova,
4685 start_vpfn, last_vpfn);
4687 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4688 start_vpfn, last_vpfn);
4692 freelist = domain_unmap(si_domain, iova->pfn_lo,
4696 for_each_active_iommu(iommu, drhd)
4697 iommu_flush_iotlb_psi(iommu, si_domain,
4698 iova->pfn_lo, iova_size(iova),
4701 dma_free_pagelist(freelist);
4703 start_vpfn = iova->pfn_hi + 1;
4704 free_iova_mem(iova);
4712 static struct notifier_block intel_iommu_memory_nb = {
4713 .notifier_call = intel_iommu_memory_notifier,
4717 static void free_all_cpu_cached_iovas(unsigned int cpu)
4721 for (i = 0; i < g_num_of_iommus; i++) {
4722 struct intel_iommu *iommu = g_iommus[i];
4723 struct dmar_domain *domain;
4729 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4730 domain = get_iommu_domain(iommu, (u16)did);
4734 free_cpu_cached_iovas(cpu, &domain->iovad);
4739 static int intel_iommu_cpu_dead(unsigned int cpu)
4741 free_all_cpu_cached_iovas(cpu);
4745 static void intel_disable_iommus(void)
4747 struct intel_iommu *iommu = NULL;
4748 struct dmar_drhd_unit *drhd;
4750 for_each_iommu(iommu, drhd)
4751 iommu_disable_translation(iommu);
4754 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4756 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4758 return container_of(iommu_dev, struct intel_iommu, iommu);
4761 static ssize_t intel_iommu_show_version(struct device *dev,
4762 struct device_attribute *attr,
4765 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4766 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4767 return sprintf(buf, "%d:%d\n",
4768 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4770 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4772 static ssize_t intel_iommu_show_address(struct device *dev,
4773 struct device_attribute *attr,
4776 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4777 return sprintf(buf, "%llx\n", iommu->reg_phys);
4779 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4781 static ssize_t intel_iommu_show_cap(struct device *dev,
4782 struct device_attribute *attr,
4785 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4786 return sprintf(buf, "%llx\n", iommu->cap);
4788 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4790 static ssize_t intel_iommu_show_ecap(struct device *dev,
4791 struct device_attribute *attr,
4794 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4795 return sprintf(buf, "%llx\n", iommu->ecap);
4797 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4799 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4800 struct device_attribute *attr,
4803 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4804 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4806 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4808 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4809 struct device_attribute *attr,
4812 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4813 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4814 cap_ndoms(iommu->cap)));
4816 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
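/*
 * These attributes are typically visible under
 * /sys/class/iommu/dmar<N>/intel-iommu/ (version, address, cap, ecap,
 * domains_supported, domains_used) once the IOMMU device has been
 * registered with sysfs; the exact path depends on that registration.
 */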
4818 static struct attribute *intel_iommu_attrs[] = {
4819 &dev_attr_version.attr,
4820 &dev_attr_address.attr,
4822 &dev_attr_ecap.attr,
4823 &dev_attr_domains_supported.attr,
4824 &dev_attr_domains_used.attr,
4828 static struct attribute_group intel_iommu_group = {
4829 .name = "intel-iommu",
4830 .attrs = intel_iommu_attrs,
4833 const struct attribute_group *intel_iommu_groups[] = {
4838 static inline bool has_untrusted_dev(void)
4840 struct pci_dev *pdev = NULL;
4842 for_each_pci_dev(pdev)
4843 if (pdev->untrusted)
4849 static int __init platform_optin_force_iommu(void)
4851 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4854 if (no_iommu || dmar_disabled)
4855 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4858 * If Intel-IOMMU is disabled by default, we will apply identity
4859 * map for all devices except those marked as being untrusted.
4862 iommu_identity_mapping |= IDENTMAP_ALL;
4870 static int __init probe_acpi_namespace_devices(void)
4872 struct dmar_drhd_unit *drhd;
4873 /* To avoid a -Wunused-but-set-variable warning. */
4874 struct intel_iommu *iommu __maybe_unused;
4878 for_each_active_iommu(iommu, drhd) {
4879 for_each_active_dev_scope(drhd->devices,
4880 drhd->devices_cnt, i, dev) {
4881 struct acpi_device_physical_node *pn;
4882 struct iommu_group *group;
4883 struct acpi_device *adev;
4885 if (dev->bus != &acpi_bus_type)
4888 adev = to_acpi_device(dev);
4889 mutex_lock(&adev->physical_node_lock);
4890 list_for_each_entry(pn,
4891 &adev->physical_node_list, node) {
4892 group = iommu_group_get(pn->dev);
4894 iommu_group_put(group);
4898 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4899 ret = iommu_probe_device(pn->dev);
4903 mutex_unlock(&adev->physical_node_lock);
4913 int __init intel_iommu_init(void)
4916 struct dmar_drhd_unit *drhd;
4917 struct intel_iommu *iommu;
4920 * Intel IOMMU is required for a TXT/tboot launch or platform
4921 * opt in, so enforce that.
4923 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4925 if (iommu_init_mempool()) {
4927 panic("tboot: Failed to initialize iommu memory\n");
4931 down_write(&dmar_global_lock);
4932 if (dmar_table_init()) {
4934 panic("tboot: Failed to initialize DMAR table\n");
4938 if (dmar_dev_scope_init() < 0) {
4940 panic("tboot: Failed to initialize DMAR device scope\n");
4944 up_write(&dmar_global_lock);
4947 * The bus notifier takes the dmar_global_lock, so lockdep will
4948 * complain later when we register it under the lock.
4950 dmar_register_bus_notifier();
4952 down_write(&dmar_global_lock);
4955 intel_iommu_debugfs_init();
4957 if (no_iommu || dmar_disabled) {
4959 * We exit the function here to ensure IOMMU's remapping and
4960 * mempool aren't setup, which means that the IOMMU's PMRs
4961 * won't be disabled via the call to init_dmars(). So disable
4962 * it explicitly here. The PMRs were setup by tboot prior to
4963 * calling SENTER, but the kernel is expected to reset/tear
4966 if (intel_iommu_tboot_noforce) {
4967 for_each_iommu(iommu, drhd)
4968 iommu_disable_protect_mem_regions(iommu);
4972 * Make sure the IOMMUs are switched off, even when we
4973 * boot into a kexec kernel and the previous kernel left
4976 intel_disable_iommus();
4980 if (list_empty(&dmar_rmrr_units))
4981 pr_info("No RMRR found\n");
4983 if (list_empty(&dmar_atsr_units))
4984 pr_info("No ATSR found\n");
4986 if (dmar_init_reserved_ranges()) {
4988 panic("tboot: Failed to reserve iommu ranges\n");
4989 goto out_free_reserved_range;
4993 intel_iommu_gfx_mapped = 1;
4995 init_no_remapping_devices();
5000 panic("tboot: Failed to initialize DMARs\n");
5001 pr_err("Initialization failed\n");
5002 goto out_free_reserved_range;
5004 up_write(&dmar_global_lock);
5006 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5008 * If the system has no untrusted device or the user has decided
5009 * to disable the bounce page mechanisms, we don't need swiotlb.
5010 * Mark this and the pre-allocated bounce pages will be released
5013 if (!has_untrusted_dev() || intel_no_bounce)
5016 dma_ops = &intel_dma_ops;
5018 init_iommu_pm_ops();
5020 down_read(&dmar_global_lock);
5021 for_each_active_iommu(iommu, drhd) {
5022 iommu_device_sysfs_add(&iommu->iommu, NULL,
5025 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5026 iommu_device_register(&iommu->iommu);
5028 up_read(&dmar_global_lock);
5030 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5031 if (si_domain && !hw_pass_through)
5032 register_memory_notifier(&intel_iommu_memory_nb);
5033 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5034 intel_iommu_cpu_dead);
5036 down_read(&dmar_global_lock);
5037 if (probe_acpi_namespace_devices())
5038 pr_warn("ACPI name space devices didn't probe correctly\n");
5040 /* Finally, we enable the DMA remapping hardware. */
5041 for_each_iommu(iommu, drhd) {
5042 if (!drhd->ignored && !translation_pre_enabled(iommu))
5043 iommu_enable_translation(iommu);
5045 iommu_disable_protect_mem_regions(iommu);
5047 up_read(&dmar_global_lock);
5049 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5051 intel_iommu_enabled = 1;
5055 out_free_reserved_range:
5056 put_iova_domain(&reserved_iova_list);
5058 intel_iommu_free_dmars();
5059 up_write(&dmar_global_lock);
5060 iommu_exit_mempool();
5064 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5066 struct intel_iommu *iommu = opaque;
5068 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5073 * NB - intel-iommu lacks any sort of reference counting for the users of
5074 * dependent devices. If multiple endpoints have intersecting dependent
5075 * devices, unbinding the driver from any one of them will possibly leave
5076 * the others unable to operate.
5078 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5080 if (!iommu || !dev || !dev_is_pci(dev))
5083 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5086 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5088 struct dmar_domain *domain;
5089 struct intel_iommu *iommu;
5090 unsigned long flags;
5092 assert_spin_locked(&device_domain_lock);
5097 iommu = info->iommu;
5098 domain = info->domain;
5101 if (dev_is_pci(info->dev) && sm_supported(iommu))
5102 intel_pasid_tear_down_entry(iommu, info->dev,
5105 iommu_disable_dev_iotlb(info);
5106 domain_context_clear(iommu, info->dev);
5107 intel_pasid_free_table(info->dev);
5110 unlink_domain_info(info);
5112 spin_lock_irqsave(&iommu->lock, flags);
5113 domain_detach_iommu(domain, iommu);
5114 spin_unlock_irqrestore(&iommu->lock, flags);
5116 /* free the private domain */
5117 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5118 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5119 list_empty(&domain->devices))
5120 domain_exit(info->domain);
5122 free_devinfo_mem(info);
5125 static void dmar_remove_one_dev_info(struct device *dev)
5127 struct device_domain_info *info;
5128 unsigned long flags;
5130 spin_lock_irqsave(&device_domain_lock, flags);
5131 info = dev->archdata.iommu;
5132 if (info && info != DEFER_DEVICE_DOMAIN_INFO
5133 && info != DUMMY_DEVICE_DOMAIN_INFO)
5134 __dmar_remove_one_dev_info(info);
5135 spin_unlock_irqrestore(&device_domain_lock, flags);
5138 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5142 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5143 domain_reserve_special_ranges(domain);
5145 /* calculate AGAW */
5146 domain->gaw = guest_width;
5147 adjust_width = guestwidth_to_adjustwidth(guest_width);
5148 domain->agaw = width_to_agaw(adjust_width);
5150 domain->iommu_coherency = 0;
5151 domain->iommu_snooping = 0;
5152 domain->iommu_superpage = 0;
5153 domain->max_addr = 0;
5155 /* always allocate the top pgd */
5156 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5159 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5163 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5165 struct dmar_domain *dmar_domain;
5166 struct iommu_domain *domain;
5169 case IOMMU_DOMAIN_DMA:
5171 case IOMMU_DOMAIN_UNMANAGED:
5172 dmar_domain = alloc_domain(0);
5174 pr_err("Can't allocate dmar_domain\n");
5177 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5178 pr_err("Domain initialization failed\n");
5179 domain_exit(dmar_domain);
5183 if (type == IOMMU_DOMAIN_DMA &&
5184 init_iova_flush_queue(&dmar_domain->iovad,
5185 iommu_flush_iova, iova_entry_free)) {
5186 pr_warn("iova flush queue initialization failed\n");
5187 intel_iommu_strict = 1;
5190 domain_update_iommu_cap(dmar_domain);
5192 domain = &dmar_domain->domain;
5193 domain->geometry.aperture_start = 0;
5194 domain->geometry.aperture_end =
5195 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5196 domain->geometry.force_aperture = true;
5199 case IOMMU_DOMAIN_IDENTITY:
5200 return &si_domain->domain;
5208 static void intel_iommu_domain_free(struct iommu_domain *domain)
5210 if (domain != &si_domain->domain)
5211 domain_exit(to_dmar_domain(domain));
5215 * Check whether a @domain could be attached to the @dev through the
5216 * aux-domain attach/detach APIs.
5219 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5221 struct device_domain_info *info = dev->archdata.iommu;
5223 return info && info->auxd_enabled &&
5224 domain->type == IOMMU_DOMAIN_UNMANAGED;
5227 static void auxiliary_link_device(struct dmar_domain *domain,
5230 struct device_domain_info *info = dev->archdata.iommu;
5232 assert_spin_locked(&device_domain_lock);
5236 domain->auxd_refcnt++;
5237 list_add(&domain->auxd, &info->auxiliary_domains);
5240 static void auxiliary_unlink_device(struct dmar_domain *domain,
5243 struct device_domain_info *info = dev->archdata.iommu;
5245 assert_spin_locked(&device_domain_lock);
5249 list_del(&domain->auxd);
5250 domain->auxd_refcnt--;
5252 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5253 intel_pasid_free_id(domain->default_pasid);
5256 static int aux_domain_add_dev(struct dmar_domain *domain,
5261 unsigned long flags;
5262 struct intel_iommu *iommu;
5264 iommu = device_to_iommu(dev, &bus, &devfn);
5268 if (domain->default_pasid <= 0) {
5271 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
5272 pci_max_pasids(to_pci_dev(dev)),
5275 pr_err("Can't allocate default pasid\n");
5278 domain->default_pasid = pasid;
5281 spin_lock_irqsave(&device_domain_lock, flags);
5283 * iommu->lock must be held to attach domain to iommu and setup the
5284 * pasid entry for second level translation.
5286 spin_lock(&iommu->lock);
5287 ret = domain_attach_iommu(domain, iommu);
5291 /* Setup the PASID entry for mediated devices: */
5292 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5293 domain->default_pasid);
5296 spin_unlock(&iommu->lock);
5298 auxiliary_link_device(domain, dev);
5300 spin_unlock_irqrestore(&device_domain_lock, flags);
5305 domain_detach_iommu(domain, iommu);
5307 spin_unlock(&iommu->lock);
5308 spin_unlock_irqrestore(&device_domain_lock, flags);
5309 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5310 intel_pasid_free_id(domain->default_pasid);
5315 static void aux_domain_remove_dev(struct dmar_domain *domain,
5318 struct device_domain_info *info;
5319 struct intel_iommu *iommu;
5320 unsigned long flags;
5322 if (!is_aux_domain(dev, &domain->domain))
5325 spin_lock_irqsave(&device_domain_lock, flags);
5326 info = dev->archdata.iommu;
5327 iommu = info->iommu;
5329 auxiliary_unlink_device(domain, dev);
5331 spin_lock(&iommu->lock);
5332 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5333 domain_detach_iommu(domain, iommu);
5334 spin_unlock(&iommu->lock);
5336 spin_unlock_irqrestore(&device_domain_lock, flags);
5339 static int prepare_domain_attach_device(struct iommu_domain *domain,
5342 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5343 struct intel_iommu *iommu;
5347 iommu = device_to_iommu(dev, &bus, &devfn);
5351 /* check if this iommu agaw is sufficient for max mapped address */
5352 addr_width = agaw_to_width(iommu->agaw);
5353 if (addr_width > cap_mgaw(iommu->cap))
5354 addr_width = cap_mgaw(iommu->cap);
5356 if (dmar_domain->max_addr > (1LL << addr_width)) {
5357 dev_err(dev, "%s: iommu width (%d) is not "
5358 "sufficient for the mapped address (%llx)\n",
5359 __func__, addr_width, dmar_domain->max_addr);
5362 dmar_domain->gaw = addr_width;
5365 * Knock out extra levels of page tables if necessary
5367 while (iommu->agaw < dmar_domain->agaw) {
5368 struct dma_pte *pte;
5370 pte = dmar_domain->pgd;
5371 if (dma_pte_present(pte)) {
5372 dmar_domain->pgd = (struct dma_pte *)
5373 phys_to_virt(dma_pte_addr(pte));
5374 free_pgtable_page(pte);
5376 dmar_domain->agaw--;
5382 static int intel_iommu_attach_device(struct iommu_domain *domain,
5387 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5388 device_is_rmrr_locked(dev)) {
5389 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5393 if (is_aux_domain(dev, domain))
5396 /* normally dev is not mapped */
5397 if (unlikely(domain_context_mapped(dev))) {
5398 struct dmar_domain *old_domain;
5400 old_domain = find_domain(dev);
5402 dmar_remove_one_dev_info(dev);
5405 ret = prepare_domain_attach_device(domain, dev);
5409 return domain_add_dev_info(to_dmar_domain(domain), dev);
5412 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5417 if (!is_aux_domain(dev, domain))
5420 ret = prepare_domain_attach_device(domain, dev);
5424 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5427 static void intel_iommu_detach_device(struct iommu_domain *domain,
5430 dmar_remove_one_dev_info(dev);
5433 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5436 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5439 static int intel_iommu_map(struct iommu_domain *domain,
5440 unsigned long iova, phys_addr_t hpa,
5441 size_t size, int iommu_prot)
5443 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5448 if (iommu_prot & IOMMU_READ)
5449 prot |= DMA_PTE_READ;
5450 if (iommu_prot & IOMMU_WRITE)
5451 prot |= DMA_PTE_WRITE;
5452 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5453 prot |= DMA_PTE_SNP;
5455 max_addr = iova + size;
5456 if (dmar_domain->max_addr < max_addr) {
5459 /* check if minimum agaw is sufficient for mapped address */
5460 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5461 if (end < max_addr) {
5462 pr_err("%s: iommu width (%d) is not "
5463 "sufficient for the mapped address (%llx)\n",
5464 __func__, dmar_domain->gaw, max_addr);
5467 dmar_domain->max_addr = max_addr;
5469 /* Round up size to next multiple of PAGE_SIZE, if it and
5470 the low bits of hpa would take us onto the next page */
5471 size = aligned_nrpages(hpa, size);
5472 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5473 hpa >> VTD_PAGE_SHIFT, size, prot);
5477 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5478 unsigned long iova, size_t size,
5479 struct iommu_iotlb_gather *gather)
5481 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5482 struct page *freelist = NULL;
5483 unsigned long start_pfn, last_pfn;
5484 unsigned int npages;
5485 int iommu_id, level = 0;
5487 /* Cope with horrid API which requires us to unmap more than the
5488 size argument if it happens to be a large-page mapping. */
5489 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5491 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5492 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5494 start_pfn = iova >> VTD_PAGE_SHIFT;
5495 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5497 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5499 npages = last_pfn - start_pfn + 1;
5501 for_each_domain_iommu(iommu_id, dmar_domain)
5502 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5503 start_pfn, npages, !freelist, 0);
5505 dma_free_pagelist(freelist);
5507 if (dmar_domain->max_addr == iova + size)
5508 dmar_domain->max_addr = iova;
5513 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5516 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5517 struct dma_pte *pte;
5521 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5522 if (pte && dma_pte_present(pte))
5523 phys = dma_pte_addr(pte) +
5524 (iova & (BIT_MASK(level_to_offset_bits(level) +
5525 VTD_PAGE_SHIFT) - 1));
5530 static inline bool scalable_mode_support(void)
5532 struct dmar_drhd_unit *drhd;
5533 struct intel_iommu *iommu;
5537 for_each_active_iommu(iommu, drhd) {
5538 if (!sm_supported(iommu)) {
5548 static inline bool iommu_pasid_support(void)
5550 struct dmar_drhd_unit *drhd;
5551 struct intel_iommu *iommu;
5555 for_each_active_iommu(iommu, drhd) {
5556 if (!pasid_supported(iommu)) {
5566 static bool intel_iommu_capable(enum iommu_cap cap)
5568 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5569 return domain_update_iommu_snooping(NULL) == 1;
5570 if (cap == IOMMU_CAP_INTR_REMAP)
5571 return irq_remapping_enabled == 1;
5576 static int intel_iommu_add_device(struct device *dev)
5578 struct dmar_domain *dmar_domain;
5579 struct iommu_domain *domain;
5580 struct intel_iommu *iommu;
5581 struct iommu_group *group;
5585 iommu = device_to_iommu(dev, &bus, &devfn);
5589 iommu_device_link(&iommu->iommu, dev);
5591 if (translation_pre_enabled(iommu))
5592 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5594 group = iommu_group_get_for_dev(dev);
5596 if (IS_ERR(group)) {
5597 ret = PTR_ERR(group);
5601 iommu_group_put(group);
5603 domain = iommu_get_domain_for_dev(dev);
5604 dmar_domain = to_dmar_domain(domain);
5605 if (domain->type == IOMMU_DOMAIN_DMA) {
5606 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5607 ret = iommu_request_dm_for_dev(dev);
5609 dmar_remove_one_dev_info(dev);
5610 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5611 domain_add_dev_info(si_domain, dev);
5613 "Device uses a private identity domain.\n");
5617 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5618 ret = iommu_request_dma_domain_for_dev(dev);
5620 dmar_remove_one_dev_info(dev);
5621 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5622 if (!get_private_domain_for_dev(dev)) {
5624 "Failed to get a private domain.\n");
5630 "Device uses a private dma domain.\n");
5635 if (device_needs_bounce(dev)) {
5636 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5637 set_dma_ops(dev, &bounce_dma_ops);
5643 iommu_device_unlink(&iommu->iommu, dev);
5647 static void intel_iommu_remove_device(struct device *dev)
5649 struct intel_iommu *iommu;
5652 iommu = device_to_iommu(dev, &bus, &devfn);
5656 dmar_remove_one_dev_info(dev);
5658 iommu_group_remove_device(dev);
5660 iommu_device_unlink(&iommu->iommu, dev);
5662 if (device_needs_bounce(dev))
5663 set_dma_ops(dev, NULL);
5666 static void intel_iommu_get_resv_regions(struct device *device,
5667 struct list_head *head)
5669 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5670 struct iommu_resv_region *reg;
5671 struct dmar_rmrr_unit *rmrr;
5672 struct device *i_dev;
5675 down_read(&dmar_global_lock);
5676 for_each_rmrr_units(rmrr) {
5677 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5679 struct iommu_resv_region *resv;
5680 enum iommu_resv_type type;
5683 if (i_dev != device &&
5684 !is_downstream_to_pci_bridge(device, i_dev))
5687 length = rmrr->end_address - rmrr->base_address + 1;
5689 type = device_rmrr_is_relaxable(device) ?
5690 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5692 resv = iommu_alloc_resv_region(rmrr->base_address,
5693 length, prot, type);
5697 list_add_tail(&resv->list, head);
5700 up_read(&dmar_global_lock);
5702 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5703 if (dev_is_pci(device)) {
5704 struct pci_dev *pdev = to_pci_dev(device);
5706 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5707 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5708 IOMMU_RESV_DIRECT_RELAXABLE);
5710 list_add_tail(®->list, head);
5713 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5715 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5716 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5720 list_add_tail(®->list, head);
5723 static void intel_iommu_put_resv_regions(struct device *dev,
5724 struct list_head *head)
5726 struct iommu_resv_region *entry, *next;
5728 list_for_each_entry_safe(entry, next, head, list)
5732 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5734 struct device_domain_info *info;
5735 struct context_entry *context;
5736 struct dmar_domain *domain;
5737 unsigned long flags;
5741 domain = find_domain(dev);
5745 spin_lock_irqsave(&device_domain_lock, flags);
5746 spin_lock(&iommu->lock);
5749 info = dev->archdata.iommu;
5750 if (!info || !info->pasid_supported)
5753 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5754 if (WARN_ON(!context))
5757 ctx_lo = context[0].lo;
5759 if (!(ctx_lo & CONTEXT_PASIDE)) {
5760 ctx_lo |= CONTEXT_PASIDE;
5761 context[0].lo = ctx_lo;
5763 iommu->flush.flush_context(iommu,
5764 domain->iommu_did[iommu->seq_id],
5765 PCI_DEVID(info->bus, info->devfn),
5766 DMA_CCMD_MASK_NOBIT,
5767 DMA_CCMD_DEVICE_INVL);
5770 /* Enable PASID support in the device, if it wasn't already */
5771 if (!info->pasid_enabled)
5772 iommu_enable_dev_iotlb(info);
5777 spin_unlock(&iommu->lock);
5778 spin_unlock_irqrestore(&device_domain_lock, flags);
5783 static void intel_iommu_apply_resv_region(struct device *dev,
5784 struct iommu_domain *domain,
5785 struct iommu_resv_region *region)
5787 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5788 unsigned long start, end;
5790 start = IOVA_PFN(region->start);
5791 end = IOVA_PFN(region->start + region->length - 1);
5793 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5796 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5798 if (dev_is_pci(dev))
5799 return pci_device_group(dev);
5800 return generic_device_group(dev);
5803 #ifdef CONFIG_INTEL_IOMMU_SVM
5804 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5806 struct intel_iommu *iommu;
5809 if (iommu_dummy(dev)) {
5811 "No IOMMU translation for device; cannot enable SVM\n");
5815 iommu = device_to_iommu(dev, &bus, &devfn);
5817 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5823 #endif /* CONFIG_INTEL_IOMMU_SVM */
5825 static int intel_iommu_enable_auxd(struct device *dev)
5827 struct device_domain_info *info;
5828 struct intel_iommu *iommu;
5829 unsigned long flags;
5833 iommu = device_to_iommu(dev, &bus, &devfn);
5834 if (!iommu || dmar_disabled)
5837 if (!sm_supported(iommu) || !pasid_supported(iommu))
5840 ret = intel_iommu_enable_pasid(iommu, dev);
5844 spin_lock_irqsave(&device_domain_lock, flags);
5845 info = dev->archdata.iommu;
5846 info->auxd_enabled = 1;
5847 spin_unlock_irqrestore(&device_domain_lock, flags);
5852 static int intel_iommu_disable_auxd(struct device *dev)
5854 struct device_domain_info *info;
5855 unsigned long flags;
5857 spin_lock_irqsave(&device_domain_lock, flags);
5858 info = dev->archdata.iommu;
5859 if (!WARN_ON(!info))
5860 info->auxd_enabled = 0;
5861 spin_unlock_irqrestore(&device_domain_lock, flags);
5867 * A PCI express designated vendor specific extended capability is defined
5868 * in the section 3.7 of Intel scalable I/O virtualization technical spec
5869 * for system software and tools to detect endpoint devices supporting the
5870 * Intel scalable IO virtualization without host driver dependency.
5872 * Returns the address of the matching extended capability structure within
5873 * the device's PCI configuration space or 0 if the device does not support
5876 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5881 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5883 pci_read_config_word(pdev, pos + 4, &vendor);
5884 pci_read_config_word(pdev, pos + 8, &id);
5885 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5888 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5895 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5897 if (feat == IOMMU_DEV_FEAT_AUX) {
5900 if (!dev_is_pci(dev) || dmar_disabled ||
5901 !scalable_mode_support() || !iommu_pasid_support())
5904 ret = pci_pasid_features(to_pci_dev(dev));
5908 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5915 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5917 if (feat == IOMMU_DEV_FEAT_AUX)
5918 return intel_iommu_enable_auxd(dev);
5924 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5926 if (feat == IOMMU_DEV_FEAT_AUX)
5927 return intel_iommu_disable_auxd(dev);
5933 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5935 struct device_domain_info *info = dev->archdata.iommu;
5937 if (feat == IOMMU_DEV_FEAT_AUX)
5938 return scalable_mode_support() && info && info->auxd_enabled;
5944 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5946 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5948 return dmar_domain->default_pasid > 0 ?
5949 dmar_domain->default_pasid : -EINVAL;
5952 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5955 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5958 const struct iommu_ops intel_iommu_ops = {
5959 .capable = intel_iommu_capable,
5960 .domain_alloc = intel_iommu_domain_alloc,
5961 .domain_free = intel_iommu_domain_free,
5962 .attach_dev = intel_iommu_attach_device,
5963 .detach_dev = intel_iommu_detach_device,
5964 .aux_attach_dev = intel_iommu_aux_attach_device,
5965 .aux_detach_dev = intel_iommu_aux_detach_device,
5966 .aux_get_pasid = intel_iommu_aux_get_pasid,
5967 .map = intel_iommu_map,
5968 .unmap = intel_iommu_unmap,
5969 .iova_to_phys = intel_iommu_iova_to_phys,
5970 .add_device = intel_iommu_add_device,
5971 .remove_device = intel_iommu_remove_device,
5972 .get_resv_regions = intel_iommu_get_resv_regions,
5973 .put_resv_regions = intel_iommu_put_resv_regions,
5974 .apply_resv_region = intel_iommu_apply_resv_region,
5975 .device_group = intel_iommu_device_group,
5976 .dev_has_feat = intel_iommu_dev_has_feat,
5977 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
5978 .dev_enable_feat = intel_iommu_dev_enable_feat,
5979 .dev_disable_feat = intel_iommu_dev_disable_feat,
5980 .is_attach_deferred = intel_iommu_is_attach_deferred,
5981 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
5984 static void quirk_iommu_igfx(struct pci_dev *dev)
5986 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5990 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5991 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5992 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5993 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5994 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5995 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5996 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5997 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5999 /* Broadwell igfx malfunctions with dmar */
6000 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6001 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6002 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6003 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6004 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6005 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6006 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6007 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6008 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6009 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6010 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6011 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6012 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6013 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6014 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6015 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6016 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6017 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6018 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6019 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6020 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6021 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6022 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6023 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6025 static void quirk_iommu_rwbf(struct pci_dev *dev)
6028 * Mobile 4 Series Chipset neglects to set RWBF capability,
6029 * but needs it. Same seems to hold for the desktop versions.
6031 pci_info(dev, "Forcing write-buffer flush capability\n");
6035 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6036 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6037 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6038 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6039 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6040 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6041 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6044 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6045 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6046 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6047 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6048 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6049 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6050 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6051 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
6053 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6057 if (pci_read_config_word(dev, GGC, &ggc))
6060 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6061 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6063 } else if (dmar_map_gfx) {
6064 /* we have to ensure the gfx device is idle before we flush */
6065 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6066 intel_iommu_strict = 1;
6069 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6070 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6071 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6072 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6074 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6075 ISOCH DMAR unit for the Azalia sound device, but not give it any
6076 TLB entries, which causes it to deadlock. Check for that. We do
6077 this in a function called from init_dmars(), instead of in a PCI
6078 quirk, because we don't want to print the obnoxious "BIOS broken"
6079 message if VT-d is actually disabled.
6081 static void __init check_tylersburg_isoch(void)
6083 struct pci_dev *pdev;
6084 uint32_t vtisochctrl;
6086 /* If there's no Azalia in the system anyway, forget it. */
6087 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6092 /* System Management Registers. Might be hidden, in which case
6093 we can't do the sanity check. But that's OK, because the
6094 known-broken BIOSes _don't_ actually hide it, so far. */
6095 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6099 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6106 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6107 if (vtisochctrl & 1)
6110 /* Drop all bits other than the number of TLB entries */
6111 vtisochctrl &= 0x1c;
6113 /* If we have the recommended number of TLB entries (16), fine. */
6114 if (vtisochctrl == 0x10)
6117 /* Zero TLB entries? You get to ride the short bus to school. */
6119 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6120 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6121 dmi_get_system_info(DMI_BIOS_VENDOR),
6122 dmi_get_system_info(DMI_BIOS_VERSION),
6123 dmi_get_system_info(DMI_PRODUCT_VERSION));
6124 iommu_identity_mapping |= IDENTMAP_AZALIA;
6128 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",