2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
49 #include "irq_remapping.h"
51 #define ROOT_SIZE VTD_PAGE_SIZE
52 #define CONTEXT_SIZE VTD_PAGE_SIZE
54 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
55 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
56 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
58 #define IOAPIC_RANGE_START (0xfee00000)
59 #define IOAPIC_RANGE_END (0xfeefffff)
60 #define IOVA_START_ADDR (0x1000)
62 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
64 #define MAX_AGAW_WIDTH 64
66 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
67 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
69 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
70 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
71 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
72 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
73 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
75 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
76 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
77 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
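/*
 * Editor's illustration (not in the original source): a worked example of
 * the address-width arithmetic above, assuming a 48-bit guest address
 * width and 4KiB VT-d pages (VTD_PAGE_SHIFT == 12):
 *
 *   __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1 == 0xFFFFFFFFFULL
 *   DOMAIN_MAX_PFN(48)   == 0xFFFFFFFFF on 64-bit, clamped to
 *                           0xFFFFFFFF (ULONG_MAX) on 32-bit
 *   DOMAIN_MAX_ADDR(48)  == 0xFFFFFFFFF000ULL
 *   DMA_32BIT_PFN        == IOVA_PFN(0xFFFFFFFF) == 0xFFFFF
 */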
79 /* page table handling */
80 #define LEVEL_STRIDE (9)
81 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
84 * This bitmap is used to advertise the page sizes our hardware supports
85 * to the IOMMU core, which will then use this information to split
86 * physically contiguous memory regions it is mapping into page sizes
89 * Traditionally the IOMMU core just handed us the mappings directly,
90 * after making sure the size is a power-of-two multiple of 4KiB and that the
91 * mapping has natural alignment.
93 * To retain this behavior, we currently advertise that we support
94 * all page sizes that are a power-of-two multiple of 4KiB.
96 * If at some point we'd like to utilize the IOMMU core's new behavior,
97 * we could change this to advertise the real page sizes we support.
99 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
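/*
 * Editor's note (illustration, not in the original source): with the
 * ~0xFFFUL value above, every bit from 12 upward is set, so the bitmap
 * advertises 4KiB, 8KiB, 16KiB, ... as supported IOMMU page sizes.  The
 * IOMMU core therefore hands us any naturally aligned power-of-two sized
 * chunk unchanged, and the split into real hardware page sizes
 * (4KiB/2MiB/1GiB) is done later by hardware_largepage_caps() and
 * __domain_mapping().
 */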
101 static inline int agaw_to_level(int agaw)
106 static inline int agaw_to_width(int agaw)
108 return 30 + agaw * LEVEL_STRIDE;
111 static inline int width_to_agaw(int width)
113 return (width - 30) / LEVEL_STRIDE;
116 static inline unsigned int level_to_offset_bits(int level)
118 return (level - 1) * LEVEL_STRIDE;
121 static inline int pfn_level_offset(unsigned long pfn, int level)
123 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
126 static inline unsigned long level_mask(int level)
128 return -1UL << level_to_offset_bits(level);
131 static inline unsigned long level_size(int level)
133 return 1UL << level_to_offset_bits(level);
136 static inline unsigned long align_to_level(unsigned long pfn, int level)
138 return (pfn + level_size(level) - 1) & level_mask(level);
141 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
143 return 1 << ((lvl - 1) * LEVEL_STRIDE);
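/*
 * Editor's illustration (hypothetical values, not in the original source):
 * each page-table level indexes LEVEL_STRIDE == 9 bits of the DMA pfn, so
 * with the default 48-bit width (agaw 2, a 4-level table):
 *
 *   width_to_agaw(48)            == (48 - 30) / 9 == 2
 *   agaw_to_width(2)             == 30 + 2 * 9    == 48
 *   pfn_level_offset(0x12345, 1) == 0x12345 & 0x1ff        == 0x145
 *   pfn_level_offset(0x12345, 2) == (0x12345 >> 9) & 0x1ff == 0x091
 *   level_size(2)                == 1UL << 9 == 512 pages (2MiB)
 *   lvl_to_nr_pages(2)           == 512
 */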
146 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
147 are never going to work. */
148 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
150 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
153 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
155 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
157 static inline unsigned long page_to_dma_pfn(struct page *pg)
159 return mm_to_dma_pfn(page_to_pfn(pg));
161 static inline unsigned long virt_to_dma_pfn(void *p)
163 return page_to_dma_pfn(virt_to_page(p));
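/*
 * Editor's note (illustration): on x86, PAGE_SHIFT == VTD_PAGE_SHIFT == 12,
 * so dma_to_mm_pfn()/mm_to_dma_pfn() are identity conversions.  On a
 * hypothetical configuration with 64KiB MM pages (PAGE_SHIFT == 16), one
 * MM pfn would cover 16 VT-d pfns:
 *
 *   mm_to_dma_pfn(1)  == 1 << (16 - 12) == 16
 *   dma_to_mm_pfn(16) == 16 >> 4        == 1
 */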
166 /* global iommu list, set NULL for ignored DMAR units */
167 static struct intel_iommu **g_iommus;
169 static void __init check_tylersburg_isoch(void);
170 static int rwbf_quirk;
173 * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
174 * (used when the kernel is launched with TXT)
176 static int force_on = 0;
181 * 12-63: Context Ptr (12 - (haw-1))
188 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
189 static inline bool root_present(struct root_entry *root)
191 return (root->val & 1);
193 static inline void set_root_present(struct root_entry *root)
197 static inline void set_root_value(struct root_entry *root, unsigned long value)
199 root->val |= value & VTD_PAGE_MASK;
202 static inline struct context_entry *
203 get_context_addr_from_root(struct root_entry *root)
205 return (struct context_entry *)
206 (root_present(root)?phys_to_virt(
207 root->val & VTD_PAGE_MASK) :
214 * 1: fault processing disable
215 * 2-3: translation type
216 * 12-63: address space root
222 struct context_entry {
227 static inline bool context_present(struct context_entry *context)
229 return (context->lo & 1);
231 static inline void context_set_present(struct context_entry *context)
236 static inline void context_set_fault_enable(struct context_entry *context)
238 context->lo &= (((u64)-1) << 2) | 1;
241 static inline void context_set_translation_type(struct context_entry *context,
244 context->lo &= (((u64)-1) << 4) | 3;
245 context->lo |= (value & 3) << 2;
248 static inline void context_set_address_root(struct context_entry *context,
251 context->lo |= value & VTD_PAGE_MASK;
254 static inline void context_set_address_width(struct context_entry *context,
257 context->hi |= value & 7;
260 static inline void context_set_domain_id(struct context_entry *context,
263 context->hi |= (value & ((1 << 16) - 1)) << 8;
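/*
 * Editor's illustration (hypothetical values): the helpers above are used
 * later in domain_context_mapping_one() to assemble an entry roughly like
 * this for domain id 5 with a 4-level (agaw 2) page table at physical
 * address 0x12340000:
 *
 *   context_set_domain_id(context, 5);              // hi |= 5 << 8
 *   context_set_address_root(context, 0x12340000);  // lo |= page-aligned ASR
 *   context_set_address_width(context, 2);          // hi |= 2 (48-bit)
 *   context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *   context_set_fault_enable(context);              // clear lo bit 1
 *   context_set_present(context);                   // lo |= 1
 */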
266 static inline void context_clear_entry(struct context_entry *context)
279 * 12-63: Host physical address
285 static inline void dma_clear_pte(struct dma_pte *pte)
290 static inline void dma_set_pte_readable(struct dma_pte *pte)
292 pte->val |= DMA_PTE_READ;
295 static inline void dma_set_pte_writable(struct dma_pte *pte)
297 pte->val |= DMA_PTE_WRITE;
300 static inline void dma_set_pte_snp(struct dma_pte *pte)
302 pte->val |= DMA_PTE_SNP;
305 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
307 pte->val = (pte->val & ~3) | (prot & 3);
310 static inline u64 dma_pte_addr(struct dma_pte *pte)
313 return pte->val & VTD_PAGE_MASK;
315 /* Must have a full atomic 64-bit read */
316 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
320 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
322 pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
325 static inline bool dma_pte_present(struct dma_pte *pte)
327 return (pte->val & 3) != 0;
330 static inline bool dma_pte_superpage(struct dma_pte *pte)
332 return (pte->val & (1 << 7));
335 static inline int first_pte_in_page(struct dma_pte *pte)
337 return !((unsigned long)pte & ~VTD_PAGE_MASK);
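/*
 * Editor's note (illustration): each struct dma_pte is 8 bytes, so one
 * 4KiB page-table page holds 512 entries.  first_pte_in_page() simply
 * checks whether the pte pointer is page aligned, i.e. whether a walk has
 * just crossed into a new page-table page:
 *
 *   first_pte_in_page(table)        -> true  (offset 0x000)
 *   first_pte_in_page(table + 511)  -> false (offset 0xff8)
 */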
341 * This domain is a static identity mapping domain.
342 * 1. This domain creates a static 1:1 mapping to all usable memory.
343 * 2. It maps to each iommu if successful.
344 * 3. Each iommu maps to this domain if successful.
346 static struct dmar_domain *si_domain;
347 static int hw_pass_through = 1;
349 /* devices under the same p2p bridge are owned in one domain */
350 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
352 /* domain represents a virtual machine; more than one device
353 * across iommus may be owned by one domain, e.g. a kvm guest.
355 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
357 /* si_domain contains multiple devices */
358 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2)
360 /* define the limit of IOMMUs supported in each domain */
362 # define IOMMU_UNITS_SUPPORTED MAX_IO_APICS
364 # define IOMMU_UNITS_SUPPORTED 64
368 int id; /* domain id */
369 int nid; /* node id */
370 DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
371 /* bitmap of iommus this domain uses*/
373 struct list_head devices; /* all devices' list */
374 struct iova_domain iovad; /* iova's that belong to this domain */
376 struct dma_pte *pgd; /* virtual address */
377 int gaw; /* max guest address width */
379 /* adjusted guest address width, 0 is level 2 30-bit */
382 int flags; /* flags to find out type of domain */
384 int iommu_coherency;/* indicate coherency of iommu access */
385 int iommu_snooping; /* indicate snooping control feature*/
386 int iommu_count; /* reference count of iommu */
387 int iommu_superpage;/* Level of superpages supported:
388 0 == 4KiB (no superpages), 1 == 2MiB,
389 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
390 spinlock_t iommu_lock; /* protect iommu set in domain */
391 u64 max_addr; /* maximum mapped address */
394 /* PCI domain-device relationship */
395 struct device_domain_info {
396 struct list_head link; /* link to domain siblings */
397 struct list_head global; /* link to global list */
398 int segment; /* PCI domain */
399 u8 bus; /* PCI bus number */
400 u8 devfn; /* PCI devfn number */
401 struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
402 struct intel_iommu *iommu; /* IOMMU used by this device */
403 struct dmar_domain *domain; /* pointer to domain */
406 static void flush_unmaps_timeout(unsigned long data);
408 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
410 #define HIGH_WATER_MARK 250
411 struct deferred_flush_tables {
413 struct iova *iova[HIGH_WATER_MARK];
414 struct dmar_domain *domain[HIGH_WATER_MARK];
417 static struct deferred_flush_tables *deferred_flush;
419 /* bitmap for indexing intel_iommus */
420 static int g_num_of_iommus;
422 static DEFINE_SPINLOCK(async_umap_flush_lock);
423 static LIST_HEAD(unmaps_to_do);
426 static long list_size;
428 static void domain_remove_dev_info(struct dmar_domain *domain);
430 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
431 int dmar_disabled = 0;
433 int dmar_disabled = 1;
434 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
436 int intel_iommu_enabled = 0;
437 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
439 static int dmar_map_gfx = 1;
440 static int dmar_forcedac;
441 static int intel_iommu_strict;
442 static int intel_iommu_superpage = 1;
444 int intel_iommu_gfx_mapped;
445 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
447 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
448 static DEFINE_SPINLOCK(device_domain_lock);
449 static LIST_HEAD(device_domain_list);
451 static struct iommu_ops intel_iommu_ops;
453 static int __init intel_iommu_setup(char *str)
458 if (!strncmp(str, "on", 2)) {
460 printk(KERN_INFO "Intel-IOMMU: enabled\n");
461 } else if (!strncmp(str, "off", 3)) {
463 printk(KERN_INFO "Intel-IOMMU: disabled\n");
464 } else if (!strncmp(str, "igfx_off", 8)) {
467 "Intel-IOMMU: disable GFX device mapping\n");
468 } else if (!strncmp(str, "forcedac", 8)) {
470 "Intel-IOMMU: Forcing DAC for PCI devices\n");
472 } else if (!strncmp(str, "strict", 6)) {
474 "Intel-IOMMU: disable batched IOTLB flush\n");
475 intel_iommu_strict = 1;
476 } else if (!strncmp(str, "sp_off", 6)) {
478 "Intel-IOMMU: disable supported super page\n");
479 intel_iommu_superpage = 0;
482 str += strcspn(str, ",");
488 __setup("intel_iommu=", intel_iommu_setup);
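/*
 * Editor's note (usage illustration, assuming the parser above): options
 * are comma separated on the kernel command line, e.g.
 *
 *   intel_iommu=on,strict,sp_off
 *
 * enables the IOMMU, disables batched IOTLB flushing and disables
 * superpage support in one go.
 */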
490 static struct kmem_cache *iommu_domain_cache;
491 static struct kmem_cache *iommu_devinfo_cache;
492 static struct kmem_cache *iommu_iova_cache;
494 static inline void *alloc_pgtable_page(int node)
499 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
501 vaddr = page_address(page);
505 static inline void free_pgtable_page(void *vaddr)
507 free_page((unsigned long)vaddr);
510 static inline void *alloc_domain_mem(void)
512 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
515 static void free_domain_mem(void *vaddr)
517 kmem_cache_free(iommu_domain_cache, vaddr);
520 static inline void * alloc_devinfo_mem(void)
522 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
525 static inline void free_devinfo_mem(void *vaddr)
527 kmem_cache_free(iommu_devinfo_cache, vaddr);
530 struct iova *alloc_iova_mem(void)
532 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
535 void free_iova_mem(struct iova *iova)
537 kmem_cache_free(iommu_iova_cache, iova);
541 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
546 sagaw = cap_sagaw(iommu->cap);
547 for (agaw = width_to_agaw(max_gaw);
549 if (test_bit(agaw, &sagaw))
557 * Calculate max SAGAW for each iommu.
559 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
561 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
565 * calculate agaw for each iommu.
566 * "SAGAW" may be different across iommus, use a default agaw, and
567 * get a smaller supported agaw for iommus that don't support the default agaw.
569 int iommu_calculate_agaw(struct intel_iommu *iommu)
571 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
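/*
 * Editor's illustration (hypothetical SAGAW values): __iommu_calculate_agaw()
 * walks downwards from the requested width looking for a bit set in the
 * hardware's SAGAW field, where bit 1 = 39-bit/3-level, bit 2 =
 * 48-bit/4-level, bit 3 = 57-bit/5-level:
 *
 *   sagaw == 0x4 (48-bit only)     -> iommu_calculate_agaw() == 2
 *   sagaw == 0x2 (39-bit only)     -> iommu_calculate_agaw() == 1
 *   sagaw == 0x6 (39- and 48-bit)  -> iommu_calculate_agaw() == 2
 */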
574 /* This function only returns a single iommu in a domain */
575 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
579 /* si_domain and vm domain should not get here. */
580 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
581 BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
583 iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
584 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
587 return g_iommus[iommu_id];
590 static void domain_update_iommu_coherency(struct dmar_domain *domain)
594 i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
596 domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
598 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
599 if (!ecap_coherent(g_iommus[i]->ecap)) {
600 domain->iommu_coherency = 0;
606 static void domain_update_iommu_snooping(struct dmar_domain *domain)
610 domain->iommu_snooping = 1;
612 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
613 if (!ecap_sc_support(g_iommus[i]->ecap)) {
614 domain->iommu_snooping = 0;
620 static void domain_update_iommu_superpage(struct dmar_domain *domain)
622 struct dmar_drhd_unit *drhd;
623 struct intel_iommu *iommu = NULL;
626 if (!intel_iommu_superpage) {
627 domain->iommu_superpage = 0;
631 /* set iommu_superpage to the smallest common denominator */
632 for_each_active_iommu(iommu, drhd) {
633 mask &= cap_super_page_val(iommu->cap);
638 domain->iommu_superpage = fls(mask);
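/*
 * Editor's illustration (hypothetical capabilities): cap_super_page_val()
 * is a bitmask of supported superpage sizes (bit 0 = 2MiB, bit 1 = 1GiB).
 * If one active IOMMU reports 0x3 (2MiB and 1GiB) and another only 0x1
 * (2MiB), the loop above ANDs them down to 0x1 and iommu_superpage becomes
 * fls(0x1) == 1, i.e. only 2MiB superpages are used for this domain.
 */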
641 /* Some capabilities may be different across iommus */
642 static void domain_update_iommu_cap(struct dmar_domain *domain)
644 domain_update_iommu_coherency(domain);
645 domain_update_iommu_snooping(domain);
646 domain_update_iommu_superpage(domain);
649 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
651 struct dmar_drhd_unit *drhd = NULL;
654 for_each_drhd_unit(drhd) {
657 if (segment != drhd->segment)
660 for (i = 0; i < drhd->devices_cnt; i++) {
661 if (drhd->devices[i] &&
662 drhd->devices[i]->bus->number == bus &&
663 drhd->devices[i]->devfn == devfn)
665 if (drhd->devices[i] &&
666 drhd->devices[i]->subordinate &&
667 drhd->devices[i]->subordinate->number <= bus &&
668 drhd->devices[i]->subordinate->busn_res.end >= bus)
672 if (drhd->include_all)
679 static void domain_flush_cache(struct dmar_domain *domain,
680 void *addr, int size)
682 if (!domain->iommu_coherency)
683 clflush_cache_range(addr, size);
686 /* Gets context entry for a given bus and devfn */
687 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
690 struct root_entry *root;
691 struct context_entry *context;
692 unsigned long phy_addr;
695 spin_lock_irqsave(&iommu->lock, flags);
696 root = &iommu->root_entry[bus];
697 context = get_context_addr_from_root(root);
699 context = (struct context_entry *)
700 alloc_pgtable_page(iommu->node);
702 spin_unlock_irqrestore(&iommu->lock, flags);
705 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
706 phy_addr = virt_to_phys((void *)context);
707 set_root_value(root, phy_addr);
708 set_root_present(root);
709 __iommu_flush_cache(iommu, root, sizeof(*root));
711 spin_unlock_irqrestore(&iommu->lock, flags);
712 return &context[devfn];
715 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
717 struct root_entry *root;
718 struct context_entry *context;
722 spin_lock_irqsave(&iommu->lock, flags);
723 root = &iommu->root_entry[bus];
724 context = get_context_addr_from_root(root);
729 ret = context_present(&context[devfn]);
731 spin_unlock_irqrestore(&iommu->lock, flags);
735 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
737 struct root_entry *root;
738 struct context_entry *context;
741 spin_lock_irqsave(&iommu->lock, flags);
742 root = &iommu->root_entry[bus];
743 context = get_context_addr_from_root(root);
745 context_clear_entry(&context[devfn]);
746 __iommu_flush_cache(iommu, &context[devfn], \
749 spin_unlock_irqrestore(&iommu->lock, flags);
752 static void free_context_table(struct intel_iommu *iommu)
754 struct root_entry *root;
757 struct context_entry *context;
759 spin_lock_irqsave(&iommu->lock, flags);
760 if (!iommu->root_entry) {
763 for (i = 0; i < ROOT_ENTRY_NR; i++) {
764 root = &iommu->root_entry[i];
765 context = get_context_addr_from_root(root);
767 free_pgtable_page(context);
769 free_pgtable_page(iommu->root_entry);
770 iommu->root_entry = NULL;
772 spin_unlock_irqrestore(&iommu->lock, flags);
775 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
776 unsigned long pfn, int target_level)
778 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
779 struct dma_pte *parent, *pte = NULL;
780 int level = agaw_to_level(domain->agaw);
783 BUG_ON(!domain->pgd);
784 BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
785 parent = domain->pgd;
790 offset = pfn_level_offset(pfn, level);
791 pte = &parent[offset];
792 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
794 if (level == target_level)
797 if (!dma_pte_present(pte)) {
800 tmp_page = alloc_pgtable_page(domain->nid);
805 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
806 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
807 if (cmpxchg64(&pte->val, 0ULL, pteval)) {
808 /* Someone else set it while we were thinking; use theirs. */
809 free_pgtable_page(tmp_page);
812 domain_flush_cache(domain, pte, sizeof(*pte));
815 parent = phys_to_virt(dma_pte_addr(pte));
823 /* return the pte for an address at a specific level */
824 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
826 int level, int *large_page)
828 struct dma_pte *parent, *pte = NULL;
829 int total = agaw_to_level(domain->agaw);
832 parent = domain->pgd;
833 while (level <= total) {
834 offset = pfn_level_offset(pfn, total);
835 pte = &parent[offset];
839 if (!dma_pte_present(pte)) {
844 if (pte->val & DMA_PTE_LARGE_PAGE) {
849 parent = phys_to_virt(dma_pte_addr(pte));
855 /* clear last level pte; a tlb flush should follow */
856 static int dma_pte_clear_range(struct dmar_domain *domain,
857 unsigned long start_pfn,
858 unsigned long last_pfn)
860 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
861 unsigned int large_page = 1;
862 struct dma_pte *first_pte, *pte;
865 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
866 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
867 BUG_ON(start_pfn > last_pfn);
869 /* we don't need lock here; nobody else touches the iova range */
872 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
874 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
879 start_pfn += lvl_to_nr_pages(large_page);
881 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
883 domain_flush_cache(domain, first_pte,
884 (void *)pte - (void *)first_pte);
886 } while (start_pfn && start_pfn <= last_pfn);
888 order = (large_page - 1) * 9;
892 /* free page table pages. last level pte should already be cleared */
893 static void dma_pte_free_pagetable(struct dmar_domain *domain,
894 unsigned long start_pfn,
895 unsigned long last_pfn)
897 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
898 struct dma_pte *first_pte, *pte;
899 int total = agaw_to_level(domain->agaw);
904 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
905 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
906 BUG_ON(start_pfn > last_pfn);
908 /* We don't need lock here; nobody else touches the iova range */
910 while (level <= total) {
911 tmp = align_to_level(start_pfn, level);
913 /* If we can't even clear one PTE at this level, we're done */
914 if (tmp + level_size(level) - 1 > last_pfn)
919 first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
920 if (large_page > level)
921 level = large_page + 1;
923 tmp = align_to_level(tmp + 1, level + 1);
927 if (dma_pte_present(pte)) {
928 free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
932 tmp += level_size(level);
933 } while (!first_pte_in_page(pte) &&
934 tmp + level_size(level) - 1 <= last_pfn);
936 domain_flush_cache(domain, first_pte,
937 (void *)pte - (void *)first_pte);
939 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
943 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
944 free_pgtable_page(domain->pgd);
950 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
952 struct root_entry *root;
955 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
959 __iommu_flush_cache(iommu, root, ROOT_SIZE);
961 spin_lock_irqsave(&iommu->lock, flags);
962 iommu->root_entry = root;
963 spin_unlock_irqrestore(&iommu->lock, flags);
968 static void iommu_set_root_entry(struct intel_iommu *iommu)
974 addr = iommu->root_entry;
976 raw_spin_lock_irqsave(&iommu->register_lock, flag);
977 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
979 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
981 /* Make sure hardware completes it */
982 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
983 readl, (sts & DMA_GSTS_RTPS), sts);
985 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
988 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
993 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
996 raw_spin_lock_irqsave(&iommu->register_lock, flag);
997 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
999 /* Make sure hardware completes it */
1000 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1001 readl, (!(val & DMA_GSTS_WBFS)), val);
1003 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1006 /* return value determines if we need a write buffer flush */
1007 static void __iommu_flush_context(struct intel_iommu *iommu,
1008 u16 did, u16 source_id, u8 function_mask,
1015 case DMA_CCMD_GLOBAL_INVL:
1016 val = DMA_CCMD_GLOBAL_INVL;
1018 case DMA_CCMD_DOMAIN_INVL:
1019 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1021 case DMA_CCMD_DEVICE_INVL:
1022 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1023 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1028 val |= DMA_CCMD_ICC;
1030 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1031 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1033 /* Make sure hardware completes it */
1034 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1035 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1037 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1040 /* return value determines if we need a write buffer flush */
1041 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1042 u64 addr, unsigned int size_order, u64 type)
1044 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1045 u64 val = 0, val_iva = 0;
1049 case DMA_TLB_GLOBAL_FLUSH:
1050 /* global flush doesn't need to set IVA_REG */
1051 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1053 case DMA_TLB_DSI_FLUSH:
1054 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1056 case DMA_TLB_PSI_FLUSH:
1057 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1058 /* Note: always flush non-leaf currently */
1059 val_iva = size_order | addr;
1064 /* Note: set drain read/write */
1067 * This is probably only needed to be extra safe; it looks like we
1068 * can ignore it without any impact.
1070 if (cap_read_drain(iommu->cap))
1071 val |= DMA_TLB_READ_DRAIN;
1073 if (cap_write_drain(iommu->cap))
1074 val |= DMA_TLB_WRITE_DRAIN;
1076 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1077 /* Note: Only uses first TLB reg currently */
1079 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1080 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1082 /* Make sure hardware completes it */
1083 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1084 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1086 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1088 /* check IOTLB invalidation granularity */
1089 if (DMA_TLB_IAIG(val) == 0)
1090 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1091 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1092 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1093 (unsigned long long)DMA_TLB_IIRG(type),
1094 (unsigned long long)DMA_TLB_IAIG(val));
1097 static struct device_domain_info *iommu_support_dev_iotlb(
1098 struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1101 unsigned long flags;
1102 struct device_domain_info *info;
1103 struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1105 if (!ecap_dev_iotlb_support(iommu->ecap))
1111 spin_lock_irqsave(&device_domain_lock, flags);
1112 list_for_each_entry(info, &domain->devices, link)
1113 if (info->bus == bus && info->devfn == devfn) {
1117 spin_unlock_irqrestore(&device_domain_lock, flags);
1119 if (!found || !info->dev)
1122 if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1125 if (!dmar_find_matched_atsr_unit(info->dev))
1128 info->iommu = iommu;
1133 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1138 pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1141 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1143 if (!info->dev || !pci_ats_enabled(info->dev))
1146 pci_disable_ats(info->dev);
1149 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1150 u64 addr, unsigned mask)
1153 unsigned long flags;
1154 struct device_domain_info *info;
1156 spin_lock_irqsave(&device_domain_lock, flags);
1157 list_for_each_entry(info, &domain->devices, link) {
1158 if (!info->dev || !pci_ats_enabled(info->dev))
1161 sid = info->bus << 8 | info->devfn;
1162 qdep = pci_ats_queue_depth(info->dev);
1163 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1165 spin_unlock_irqrestore(&device_domain_lock, flags);
1168 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1169 unsigned long pfn, unsigned int pages, int map)
1171 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1172 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1177 * Fallback to domain selective flush if no PSI support or the size is too big.
1179 * PSI requires page size to be 2 ^ x, and the base address is naturally
1180 * aligned to the size
1182 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1183 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1186 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1190 * In caching mode, changes of pages from non-present to present require
1191 * flush. However, device IOTLB doesn't need to be flushed in this case.
1193 if (!cap_caching_mode(iommu->cap) || !map)
1194 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
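/*
 * Editor's illustration (hypothetical values): the PSI mask is the log2 of
 * the flushed range rounded up to a power of two.  Flushing 10 pages gives
 *
 *   mask = ilog2(__roundup_pow_of_two(10)) = ilog2(16) = 4
 *
 * so the hardware invalidates a 16-page (64KiB) naturally aligned region
 * covering the request; if mask exceeded cap_max_amask_val() we would have
 * fallen back to the domain-selective flush above instead.
 */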
1197 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1200 unsigned long flags;
1202 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1203 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1204 pmen &= ~DMA_PMEN_EPM;
1205 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1207 /* wait for the protected region status bit to clear */
1208 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1209 readl, !(pmen & DMA_PMEN_PRS), pmen);
1211 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1214 static int iommu_enable_translation(struct intel_iommu *iommu)
1217 unsigned long flags;
1219 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1220 iommu->gcmd |= DMA_GCMD_TE;
1221 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1223 /* Make sure hardware completes it */
1224 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1225 readl, (sts & DMA_GSTS_TES), sts);
1227 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1231 static int iommu_disable_translation(struct intel_iommu *iommu)
1236 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1237 iommu->gcmd &= ~DMA_GCMD_TE;
1238 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1240 /* Make sure hardware completes it */
1241 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1242 readl, (!(sts & DMA_GSTS_TES)), sts);
1244 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1249 static int iommu_init_domains(struct intel_iommu *iommu)
1251 unsigned long ndomains;
1252 unsigned long nlongs;
1254 ndomains = cap_ndoms(iommu->cap);
1255 pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1257 nlongs = BITS_TO_LONGS(ndomains);
1259 spin_lock_init(&iommu->lock);
1261 /* TBD: there might be 64K domains,
1262 * consider other allocation for future chip
1264 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1265 if (!iommu->domain_ids) {
1266 printk(KERN_ERR "Allocating domain id array failed\n");
1269 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1271 if (!iommu->domains) {
1272 printk(KERN_ERR "Allocating domain array failed\n");
1277 * if Caching mode is set, then invalid translations are tagged
1278 * with domainid 0. Hence we need to pre-allocate it.
1280 if (cap_caching_mode(iommu->cap))
1281 set_bit(0, iommu->domain_ids);
1286 static void domain_exit(struct dmar_domain *domain);
1287 static void vm_domain_exit(struct dmar_domain *domain);
1289 void free_dmar_iommu(struct intel_iommu *iommu)
1291 struct dmar_domain *domain;
1293 unsigned long flags;
1295 if ((iommu->domains) && (iommu->domain_ids)) {
1296 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1297 domain = iommu->domains[i];
1298 clear_bit(i, iommu->domain_ids);
1300 spin_lock_irqsave(&domain->iommu_lock, flags);
1301 if (--domain->iommu_count == 0) {
1302 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1303 vm_domain_exit(domain);
1305 domain_exit(domain);
1307 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1311 if (iommu->gcmd & DMA_GCMD_TE)
1312 iommu_disable_translation(iommu);
1315 irq_set_handler_data(iommu->irq, NULL);
1316 /* This will mask the irq */
1317 free_irq(iommu->irq, iommu);
1318 destroy_irq(iommu->irq);
1321 kfree(iommu->domains);
1322 kfree(iommu->domain_ids);
1324 g_iommus[iommu->seq_id] = NULL;
1326 /* if all iommus are freed, free g_iommus */
1327 for (i = 0; i < g_num_of_iommus; i++) {
1332 if (i == g_num_of_iommus)
1335 /* free context mapping */
1336 free_context_table(iommu);
1339 static struct dmar_domain *alloc_domain(void)
1341 struct dmar_domain *domain;
1343 domain = alloc_domain_mem();
1348 memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1354 static int iommu_attach_domain(struct dmar_domain *domain,
1355 struct intel_iommu *iommu)
1358 unsigned long ndomains;
1359 unsigned long flags;
1361 ndomains = cap_ndoms(iommu->cap);
1363 spin_lock_irqsave(&iommu->lock, flags);
1365 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1366 if (num >= ndomains) {
1367 spin_unlock_irqrestore(&iommu->lock, flags);
1368 printk(KERN_ERR "IOMMU: no free domain ids\n");
1373 set_bit(num, iommu->domain_ids);
1374 set_bit(iommu->seq_id, domain->iommu_bmp);
1375 iommu->domains[num] = domain;
1376 spin_unlock_irqrestore(&iommu->lock, flags);
1381 static void iommu_detach_domain(struct dmar_domain *domain,
1382 struct intel_iommu *iommu)
1384 unsigned long flags;
1388 spin_lock_irqsave(&iommu->lock, flags);
1389 ndomains = cap_ndoms(iommu->cap);
1390 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1391 if (iommu->domains[num] == domain) {
1398 clear_bit(num, iommu->domain_ids);
1399 clear_bit(iommu->seq_id, domain->iommu_bmp);
1400 iommu->domains[num] = NULL;
1402 spin_unlock_irqrestore(&iommu->lock, flags);
1405 static struct iova_domain reserved_iova_list;
1406 static struct lock_class_key reserved_rbtree_key;
1408 static int dmar_init_reserved_ranges(void)
1410 struct pci_dev *pdev = NULL;
1414 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1416 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1417 &reserved_rbtree_key);
1419 /* IOAPIC ranges shouldn't be accessed by DMA */
1420 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1421 IOVA_PFN(IOAPIC_RANGE_END));
1423 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1427 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1428 for_each_pci_dev(pdev) {
1431 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1432 r = &pdev->resource[i];
1433 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1435 iova = reserve_iova(&reserved_iova_list,
1439 printk(KERN_ERR "Reserve iova failed\n");
1447 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1449 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1452 static inline int guestwidth_to_adjustwidth(int gaw)
1455 int r = (gaw - 12) % 9;
1466 static int domain_init(struct dmar_domain *domain, int guest_width)
1468 struct intel_iommu *iommu;
1469 int adjust_width, agaw;
1470 unsigned long sagaw;
1472 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1473 spin_lock_init(&domain->iommu_lock);
1475 domain_reserve_special_ranges(domain);
1477 /* calculate AGAW */
1478 iommu = domain_get_iommu(domain);
1479 if (guest_width > cap_mgaw(iommu->cap))
1480 guest_width = cap_mgaw(iommu->cap);
1481 domain->gaw = guest_width;
1482 adjust_width = guestwidth_to_adjustwidth(guest_width);
1483 agaw = width_to_agaw(adjust_width);
1484 sagaw = cap_sagaw(iommu->cap);
1485 if (!test_bit(agaw, &sagaw)) {
1486 /* hardware doesn't support it, choose a bigger one */
1487 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1488 agaw = find_next_bit(&sagaw, 5, agaw);
1492 domain->agaw = agaw;
1493 INIT_LIST_HEAD(&domain->devices);
1495 if (ecap_coherent(iommu->ecap))
1496 domain->iommu_coherency = 1;
1498 domain->iommu_coherency = 0;
1500 if (ecap_sc_support(iommu->ecap))
1501 domain->iommu_snooping = 1;
1503 domain->iommu_snooping = 0;
1505 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1506 domain->iommu_count = 1;
1507 domain->nid = iommu->node;
1509 /* always allocate the top pgd */
1510 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1513 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1517 static void domain_exit(struct dmar_domain *domain)
1519 struct dmar_drhd_unit *drhd;
1520 struct intel_iommu *iommu;
1522 /* Domain 0 is reserved, so don't process it */
1526 /* Flush any lazy unmaps that may reference this domain */
1527 if (!intel_iommu_strict)
1528 flush_unmaps_timeout(0);
1530 domain_remove_dev_info(domain);
1532 put_iova_domain(&domain->iovad);
1535 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1537 /* free page tables */
1538 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1540 for_each_active_iommu(iommu, drhd)
1541 if (test_bit(iommu->seq_id, domain->iommu_bmp))
1542 iommu_detach_domain(domain, iommu);
1544 free_domain_mem(domain);
1547 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1548 u8 bus, u8 devfn, int translation)
1550 struct context_entry *context;
1551 unsigned long flags;
1552 struct intel_iommu *iommu;
1553 struct dma_pte *pgd;
1555 unsigned long ndomains;
1558 struct device_domain_info *info = NULL;
1560 pr_debug("Set context mapping for %02x:%02x.%d\n",
1561 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1563 BUG_ON(!domain->pgd);
1564 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1565 translation != CONTEXT_TT_MULTI_LEVEL);
1567 iommu = device_to_iommu(segment, bus, devfn);
1571 context = device_to_context_entry(iommu, bus, devfn);
1574 spin_lock_irqsave(&iommu->lock, flags);
1575 if (context_present(context)) {
1576 spin_unlock_irqrestore(&iommu->lock, flags);
1583 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1584 domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1587 /* find an available domain id for this device in iommu */
1588 ndomains = cap_ndoms(iommu->cap);
1589 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1590 if (iommu->domains[num] == domain) {
1598 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1599 if (num >= ndomains) {
1600 spin_unlock_irqrestore(&iommu->lock, flags);
1601 printk(KERN_ERR "IOMMU: no free domain ids\n");
1605 set_bit(num, iommu->domain_ids);
1606 iommu->domains[num] = domain;
1610 /* Skip top levels of page tables for
1611 * iommus which have a smaller agaw than the default.
1612 * Unnecessary for PT mode.
1614 if (translation != CONTEXT_TT_PASS_THROUGH) {
1615 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1616 pgd = phys_to_virt(dma_pte_addr(pgd));
1617 if (!dma_pte_present(pgd)) {
1618 spin_unlock_irqrestore(&iommu->lock, flags);
1625 context_set_domain_id(context, id);
1627 if (translation != CONTEXT_TT_PASS_THROUGH) {
1628 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1629 translation = info ? CONTEXT_TT_DEV_IOTLB :
1630 CONTEXT_TT_MULTI_LEVEL;
1633 * In pass through mode, AW must be programmed to indicate the largest
1634 * AGAW value supported by hardware. And ASR is ignored by hardware.
1636 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1637 context_set_address_width(context, iommu->msagaw);
1639 context_set_address_root(context, virt_to_phys(pgd));
1640 context_set_address_width(context, iommu->agaw);
1643 context_set_translation_type(context, translation);
1644 context_set_fault_enable(context);
1645 context_set_present(context);
1646 domain_flush_cache(domain, context, sizeof(*context));
1649 * It's a non-present to present mapping. If hardware doesn't cache
1650 * non-present entries we only need to flush the write-buffer. If it
1651 * _does_ cache non-present entries, then it does so in the special
1652 * domain #0, which we have to flush:
1654 if (cap_caching_mode(iommu->cap)) {
1655 iommu->flush.flush_context(iommu, 0,
1656 (((u16)bus) << 8) | devfn,
1657 DMA_CCMD_MASK_NOBIT,
1658 DMA_CCMD_DEVICE_INVL);
1659 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1661 iommu_flush_write_buffer(iommu);
1663 iommu_enable_dev_iotlb(info);
1664 spin_unlock_irqrestore(&iommu->lock, flags);
1666 spin_lock_irqsave(&domain->iommu_lock, flags);
1667 if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1668 domain->iommu_count++;
1669 if (domain->iommu_count == 1)
1670 domain->nid = iommu->node;
1671 domain_update_iommu_cap(domain);
1673 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1678 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1682 struct pci_dev *tmp, *parent;
1684 ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1685 pdev->bus->number, pdev->devfn,
1690 /* dependent device mapping */
1691 tmp = pci_find_upstream_pcie_bridge(pdev);
1694 /* Secondary interface's bus number and devfn 0 */
1695 parent = pdev->bus->self;
1696 while (parent != tmp) {
1697 ret = domain_context_mapping_one(domain,
1698 pci_domain_nr(parent->bus),
1699 parent->bus->number,
1700 parent->devfn, translation);
1703 parent = parent->bus->self;
1705 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1706 return domain_context_mapping_one(domain,
1707 pci_domain_nr(tmp->subordinate),
1708 tmp->subordinate->number, 0,
1710 else /* this is a legacy PCI bridge */
1711 return domain_context_mapping_one(domain,
1712 pci_domain_nr(tmp->bus),
1718 static int domain_context_mapped(struct pci_dev *pdev)
1721 struct pci_dev *tmp, *parent;
1722 struct intel_iommu *iommu;
1724 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1729 ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1732 /* dependent device mapping */
1733 tmp = pci_find_upstream_pcie_bridge(pdev);
1736 /* Secondary interface's bus number and devfn 0 */
1737 parent = pdev->bus->self;
1738 while (parent != tmp) {
1739 ret = device_context_mapped(iommu, parent->bus->number,
1743 parent = parent->bus->self;
1745 if (pci_is_pcie(tmp))
1746 return device_context_mapped(iommu, tmp->subordinate->number,
1749 return device_context_mapped(iommu, tmp->bus->number,
1753 /* Returns a number of VTD pages, but aligned to MM page size */
1754 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1757 host_addr &= ~PAGE_MASK;
1758 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
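/*
 * Editor's illustration (hypothetical values): aligned_nrpages() counts
 * 4KiB VT-d pages but rounds the span out to MM page granularity.  For
 * host_addr == 0x1234 and size == 0x2000 on x86 (4KiB MM pages):
 *
 *   host_addr &= ~PAGE_MASK      -> 0x234
 *   PAGE_ALIGN(0x234 + 0x2000)   -> 0x3000
 *   0x3000 >> VTD_PAGE_SHIFT     -> 3 pages
 */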
1761 /* Return largest possible superpage level for a given mapping */
1762 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1763 unsigned long iov_pfn,
1764 unsigned long phy_pfn,
1765 unsigned long pages)
1767 int support, level = 1;
1768 unsigned long pfnmerge;
1770 support = domain->iommu_superpage;
1772 /* To use a large page, the virtual *and* physical addresses
1773 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1774 of them will mean we have to use smaller pages. So just
1775 merge them and check both at once. */
1776 pfnmerge = iov_pfn | phy_pfn;
1778 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1779 pages >>= VTD_STRIDE_SHIFT;
1782 pfnmerge >>= VTD_STRIDE_SHIFT;
1789 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1790 struct scatterlist *sg, unsigned long phys_pfn,
1791 unsigned long nr_pages, int prot)
1793 struct dma_pte *first_pte = NULL, *pte = NULL;
1794 phys_addr_t uninitialized_var(pteval);
1795 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1796 unsigned long sg_res;
1797 unsigned int largepage_lvl = 0;
1798 unsigned long lvl_pages = 0;
1800 BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1802 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1805 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1810 sg_res = nr_pages + 1;
1811 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1814 while (nr_pages > 0) {
1818 sg_res = aligned_nrpages(sg->offset, sg->length);
1819 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1820 sg->dma_length = sg->length;
1821 pteval = page_to_phys(sg_page(sg)) | prot;
1822 phys_pfn = pteval >> VTD_PAGE_SHIFT;
1826 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1828 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1831 /* It is a large page */
1832 if (largepage_lvl > 1) {
1833 pteval |= DMA_PTE_LARGE_PAGE;
1834 /* Ensure that old small page tables are removed to make room
1835 for superpage, if they exist. */
1836 dma_pte_clear_range(domain, iov_pfn,
1837 iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1838 dma_pte_free_pagetable(domain, iov_pfn,
1839 iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1841 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1845 /* We don't need lock here, nobody else
1846 * touches the iova range
1848 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1850 static int dumps = 5;
1851 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1852 iov_pfn, tmp, (unsigned long long)pteval);
1855 debug_dma_dump_mappings(NULL);
1860 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1862 BUG_ON(nr_pages < lvl_pages);
1863 BUG_ON(sg_res < lvl_pages);
1865 nr_pages -= lvl_pages;
1866 iov_pfn += lvl_pages;
1867 phys_pfn += lvl_pages;
1868 pteval += lvl_pages * VTD_PAGE_SIZE;
1869 sg_res -= lvl_pages;
1871 /* If the next PTE would be the first in a new page, then we
1872 need to flush the cache on the entries we've just written.
1873 And then we'll need to recalculate 'pte', so clear it and
1874 let it get set again in the if (!pte) block above.
1876 If we're done (!nr_pages) we need to flush the cache too.
1878 Also if we've been setting superpages, we may need to
1879 recalculate 'pte' and switch back to smaller pages for the
1880 end of the mapping, if the trailing size is not enough to
1881 use another superpage (i.e. sg_res < lvl_pages). */
1883 if (!nr_pages || first_pte_in_page(pte) ||
1884 (largepage_lvl > 1 && sg_res < lvl_pages)) {
1885 domain_flush_cache(domain, first_pte,
1886 (void *)pte - (void *)first_pte);
1890 if (!sg_res && nr_pages)
1896 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1897 struct scatterlist *sg, unsigned long nr_pages,
1900 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1903 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1904 unsigned long phys_pfn, unsigned long nr_pages,
1907 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1910 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1915 clear_context_table(iommu, bus, devfn);
1916 iommu->flush.flush_context(iommu, 0, 0, 0,
1917 DMA_CCMD_GLOBAL_INVL);
1918 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1921 static inline void unlink_domain_info(struct device_domain_info *info)
1923 assert_spin_locked(&device_domain_lock);
1924 list_del(&info->link);
1925 list_del(&info->global);
1927 info->dev->dev.archdata.iommu = NULL;
1930 static void domain_remove_dev_info(struct dmar_domain *domain)
1932 struct device_domain_info *info;
1933 unsigned long flags;
1934 struct intel_iommu *iommu;
1936 spin_lock_irqsave(&device_domain_lock, flags);
1937 while (!list_empty(&domain->devices)) {
1938 info = list_entry(domain->devices.next,
1939 struct device_domain_info, link);
1940 unlink_domain_info(info);
1941 spin_unlock_irqrestore(&device_domain_lock, flags);
1943 iommu_disable_dev_iotlb(info);
1944 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1945 iommu_detach_dev(iommu, info->bus, info->devfn);
1946 free_devinfo_mem(info);
1948 spin_lock_irqsave(&device_domain_lock, flags);
1950 spin_unlock_irqrestore(&device_domain_lock, flags);
1955 * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1957 static struct dmar_domain *
1958 find_domain(struct pci_dev *pdev)
1960 struct device_domain_info *info;
1962 /* No lock here, assumes no domain exit in normal case */
1963 info = pdev->dev.archdata.iommu;
1965 return info->domain;
1969 /* domain is initialized */
1970 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1972 struct dmar_domain *domain, *found = NULL;
1973 struct intel_iommu *iommu;
1974 struct dmar_drhd_unit *drhd;
1975 struct device_domain_info *info, *tmp;
1976 struct pci_dev *dev_tmp;
1977 unsigned long flags;
1978 int bus = 0, devfn = 0;
1982 domain = find_domain(pdev);
1986 segment = pci_domain_nr(pdev->bus);
1988 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1990 if (pci_is_pcie(dev_tmp)) {
1991 bus = dev_tmp->subordinate->number;
1994 bus = dev_tmp->bus->number;
1995 devfn = dev_tmp->devfn;
1997 spin_lock_irqsave(&device_domain_lock, flags);
1998 list_for_each_entry(info, &device_domain_list, global) {
1999 if (info->segment == segment &&
2000 info->bus == bus && info->devfn == devfn) {
2001 found = info->domain;
2005 spin_unlock_irqrestore(&device_domain_lock, flags);
2006 /* pcie-pci bridge already has a domain, use it */
2013 domain = alloc_domain();
2017 /* Allocate new domain for the device */
2018 drhd = dmar_find_matched_drhd_unit(pdev);
2020 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2022 free_domain_mem(domain);
2025 iommu = drhd->iommu;
2027 ret = iommu_attach_domain(domain, iommu);
2029 free_domain_mem(domain);
2033 if (domain_init(domain, gaw)) {
2034 domain_exit(domain);
2038 /* register pcie-to-pci device */
2040 info = alloc_devinfo_mem();
2042 domain_exit(domain);
2045 info->segment = segment;
2047 info->devfn = devfn;
2049 info->domain = domain;
2050 /* This domain is shared by devices under p2p bridge */
2051 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2053 /* pcie-to-pci bridge already has a domain, use it */
2055 spin_lock_irqsave(&device_domain_lock, flags);
2056 list_for_each_entry(tmp, &device_domain_list, global) {
2057 if (tmp->segment == segment &&
2058 tmp->bus == bus && tmp->devfn == devfn) {
2059 found = tmp->domain;
2064 spin_unlock_irqrestore(&device_domain_lock, flags);
2065 free_devinfo_mem(info);
2066 domain_exit(domain);
2069 list_add(&info->link, &domain->devices);
2070 list_add(&info->global, &device_domain_list);
2071 spin_unlock_irqrestore(&device_domain_lock, flags);
2076 info = alloc_devinfo_mem();
2079 info->segment = segment;
2080 info->bus = pdev->bus->number;
2081 info->devfn = pdev->devfn;
2083 info->domain = domain;
2084 spin_lock_irqsave(&device_domain_lock, flags);
2085 /* somebody is fast */
2086 found = find_domain(pdev);
2087 if (found != NULL) {
2088 spin_unlock_irqrestore(&device_domain_lock, flags);
2089 if (found != domain) {
2090 domain_exit(domain);
2093 free_devinfo_mem(info);
2096 list_add(&info->link, &domain->devices);
2097 list_add(&info->global, &device_domain_list);
2098 pdev->dev.archdata.iommu = info;
2099 spin_unlock_irqrestore(&device_domain_lock, flags);
2102 /* recheck it here, maybe others set it */
2103 return find_domain(pdev);
2106 static int iommu_identity_mapping;
2107 #define IDENTMAP_ALL 1
2108 #define IDENTMAP_GFX 2
2109 #define IDENTMAP_AZALIA 4
2111 static int iommu_domain_identity_map(struct dmar_domain *domain,
2112 unsigned long long start,
2113 unsigned long long end)
2115 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2116 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2118 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2119 dma_to_mm_pfn(last_vpfn))) {
2120 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2124 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2125 start, end, domain->id);
2127 * RMRR range might overlap with the physical memory range; clear it first.
2130 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2132 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2133 last_vpfn - first_vpfn + 1,
2134 DMA_PTE_READ|DMA_PTE_WRITE);
2137 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2138 unsigned long long start,
2139 unsigned long long end)
2141 struct dmar_domain *domain;
2144 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2148 /* For _hardware_ passthrough, don't bother. But for software
2149 passthrough, we do it anyway -- it may indicate a memory
2150 range which is reserved in E820 and so didn't get set
2151 up in si_domain to start with */
2152 if (domain == si_domain && hw_pass_through) {
2153 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2154 pci_name(pdev), start, end);
2159 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2160 pci_name(pdev), start, end);
2163 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2164 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2165 dmi_get_system_info(DMI_BIOS_VENDOR),
2166 dmi_get_system_info(DMI_BIOS_VERSION),
2167 dmi_get_system_info(DMI_PRODUCT_VERSION));
2172 if (end >> agaw_to_width(domain->agaw)) {
2173 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2174 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2175 agaw_to_width(domain->agaw),
2176 dmi_get_system_info(DMI_BIOS_VENDOR),
2177 dmi_get_system_info(DMI_BIOS_VERSION),
2178 dmi_get_system_info(DMI_PRODUCT_VERSION));
2183 ret = iommu_domain_identity_map(domain, start, end);
2187 /* context entry init */
2188 ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2195 domain_exit(domain);
2199 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2200 struct pci_dev *pdev)
2202 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2204 return iommu_prepare_identity_map(pdev, rmrr->base_address,
2208 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2209 static inline void iommu_prepare_isa(void)
2211 struct pci_dev *pdev;
2214 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2218 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2219 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2222 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2223 "floppy might not work\n");
2227 static inline void iommu_prepare_isa(void)
2231 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2233 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2235 static int __init si_domain_init(int hw)
2237 struct dmar_drhd_unit *drhd;
2238 struct intel_iommu *iommu;
2241 si_domain = alloc_domain();
2245 pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2247 for_each_active_iommu(iommu, drhd) {
2248 ret = iommu_attach_domain(si_domain, iommu);
2250 domain_exit(si_domain);
2255 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2256 domain_exit(si_domain);
2260 si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2265 for_each_online_node(nid) {
2266 unsigned long start_pfn, end_pfn;
2269 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2270 ret = iommu_domain_identity_map(si_domain,
2271 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2280 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2281 struct pci_dev *pdev);
2282 static int identity_mapping(struct pci_dev *pdev)
2284 struct device_domain_info *info;
2286 if (likely(!iommu_identity_mapping))
2289 info = pdev->dev.archdata.iommu;
2290 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2291 return (info->domain == si_domain);
2296 static int domain_add_dev_info(struct dmar_domain *domain,
2297 struct pci_dev *pdev,
2300 struct device_domain_info *info;
2301 unsigned long flags;
2304 info = alloc_devinfo_mem();
2308 info->segment = pci_domain_nr(pdev->bus);
2309 info->bus = pdev->bus->number;
2310 info->devfn = pdev->devfn;
2312 info->domain = domain;
2314 spin_lock_irqsave(&device_domain_lock, flags);
2315 list_add(&info->link, &domain->devices);
2316 list_add(&info->global, &device_domain_list);
2317 pdev->dev.archdata.iommu = info;
2318 spin_unlock_irqrestore(&device_domain_lock, flags);
2320 ret = domain_context_mapping(domain, pdev, translation);
2322 spin_lock_irqsave(&device_domain_lock, flags);
2323 unlink_domain_info(info);
2324 spin_unlock_irqrestore(&device_domain_lock, flags);
2325 free_devinfo_mem(info);
2332 static bool device_has_rmrr(struct pci_dev *dev)
2334 struct dmar_rmrr_unit *rmrr;
2337 for_each_rmrr_units(rmrr) {
2338 for (i = 0; i < rmrr->devices_cnt; i++) {
2340 * Return TRUE if this RMRR contains the device that we are looking for.
2343 if (rmrr->devices[i] == dev)
2350 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2354 * We want to prevent any device associated with an RMRR from
2355 * getting placed into the SI Domain. This is done because
2356 * problems exist when devices are moved in and out of domains
2357 * and their respective RMRR info is lost. We exempt USB devices
2358 * from this process due to their usage of RMRRs that are known
2359 * to not be needed after BIOS hand-off to OS.
2361 if (device_has_rmrr(pdev) &&
2362 (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2365 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2368 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2371 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2375 * We want to start off with all devices in the 1:1 domain, and
2376 * take them out later if we find they can't access all of memory.
2378 * However, we can't do this for PCI devices behind bridges,
2379 * because all PCI devices behind the same bridge will end up
2380 * with the same source-id on their transactions.
2382 * Practically speaking, we can't change things around for these
2383 * devices at run-time, because we can't be sure there'll be no
2384 * DMA transactions in flight for any of their siblings.
2386 * So PCI devices (unless they're on the root bus) as well as
2387 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2388 * the 1:1 domain, just in _case_ one of their siblings turns out
2389 * not to be able to map all of memory.
2391 if (!pci_is_pcie(pdev)) {
2392 if (!pci_is_root_bus(pdev->bus))
2394 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2396 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2400 * At boot time, we don't yet know if devices will be 64-bit capable.
2401 * Assume that they will -- if they turn out not to be, then we can
2402 * take them out of the 1:1 domain later.
2406 * If the device's dma_mask is less than the system's memory
2407 * size then this is not a candidate for identity mapping.
2409 u64 dma_mask = pdev->dma_mask;
2411 if (pdev->dev.coherent_dma_mask &&
2412 pdev->dev.coherent_dma_mask < dma_mask)
2413 dma_mask = pdev->dev.coherent_dma_mask;
2415 return dma_mask >= dma_get_required_mask(&pdev->dev);
2421 static int __init iommu_prepare_static_identity_mapping(int hw)
2423 struct pci_dev *pdev = NULL;
2426 ret = si_domain_init(hw);
2430 for_each_pci_dev(pdev) {
2431 if (iommu_should_identity_map(pdev, 1)) {
2432 ret = domain_add_dev_info(si_domain, pdev,
2433 hw ? CONTEXT_TT_PASS_THROUGH :
2434 CONTEXT_TT_MULTI_LEVEL);
2436 /* device not associated with an iommu */
2441 pr_info("IOMMU: %s identity mapping for device %s\n",
2442 hw ? "hardware" : "software", pci_name(pdev));
2449 static int __init init_dmars(void)
2451 struct dmar_drhd_unit *drhd;
2452 struct dmar_rmrr_unit *rmrr;
2453 struct pci_dev *pdev;
2454 struct intel_iommu *iommu;
2460 * initialize and program root entry to not present
2463 for_each_drhd_unit(drhd) {
2465 * lock not needed as this is only incremented in the single-threaded
2466 * kernel __init code path; all other accesses are read-only
2469 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2473 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2474 IOMMU_UNITS_SUPPORTED);
2477 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2480 printk(KERN_ERR "Allocating global iommu array failed\n");
2485 deferred_flush = kzalloc(g_num_of_iommus *
2486 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2487 if (!deferred_flush) {
2492 for_each_drhd_unit(drhd) {
2496 iommu = drhd->iommu;
2497 g_iommus[iommu->seq_id] = iommu;
2499 ret = iommu_init_domains(iommu);
2505 * we could share the same root & context tables
2506 * among all IOMMUs; split this out later.
2508 ret = iommu_alloc_root_entry(iommu);
2510 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2513 if (!ecap_pass_through(iommu->ecap))
2514 hw_pass_through = 0;
2518 * Start from a sane IOMMU hardware state.
2520 for_each_drhd_unit(drhd) {
2524 iommu = drhd->iommu;
2527 * If queued invalidation was already initialized by us
2528 * (for example, while enabling interrupt remapping), then
2529 * things are already rolling from a sane state.
2535 * Clear any previous faults.
2537 dmar_fault(-1, iommu);
2539 * Disable queued invalidation if supported and already enabled
2540 * before OS handover.
2542 dmar_disable_qi(iommu);
2545 for_each_drhd_unit(drhd) {
2549 iommu = drhd->iommu;
2551 if (dmar_enable_qi(iommu)) {
2553 * Queued invalidation not enabled; use register-based invalidation instead.
2556 iommu->flush.flush_context = __iommu_flush_context;
2557 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2558 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2561 (unsigned long long)drhd->reg_base_addr);
2563 iommu->flush.flush_context = qi_flush_context;
2564 iommu->flush.flush_iotlb = qi_flush_iotlb;
2565 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2568 (unsigned long long)drhd->reg_base_addr);
2572 if (iommu_pass_through)
2573 iommu_identity_mapping |= IDENTMAP_ALL;
2575 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2576 iommu_identity_mapping |= IDENTMAP_GFX;
2579 check_tylersburg_isoch();
2582 * If pass-through is not set or not enabled, set up context entries for
2583 * identity mappings for RMRR, GFX and ISA devices, possibly falling back
2584 * to the static identity mapping if iommu_identity_mapping is set.
2586 if (iommu_identity_mapping) {
2587 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2589 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2595 * for each dev attached to rmrr
2597 * locate drhd for dev, alloc domain for dev
2598 * allocate free domain
2599 * allocate page table entries for rmrr
2600 * if context not allocated for bus
2601 * allocate and init context
2602 * set present in root table for this bus
2603 * init context with domain, translation etc
2607 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2608 for_each_rmrr_units(rmrr) {
2609 for (i = 0; i < rmrr->devices_cnt; i++) {
2610 pdev = rmrr->devices[i];
2612 * some BIOSes list non-existent devices in DMAR
2617 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2620 "IOMMU: mapping reserved region failed\n");
2624 iommu_prepare_isa();
2629 * global invalidate context cache
2630 * global invalidate iotlb
2631 * enable translation
2633 for_each_drhd_unit(drhd) {
2634 if (drhd->ignored) {
2636 * we always have to disable PMRs or DMA may fail on this device
2640 iommu_disable_protect_mem_regions(drhd->iommu);
2643 iommu = drhd->iommu;
2645 iommu_flush_write_buffer(iommu);
2647 ret = dmar_set_interrupt(iommu);
2651 iommu_set_root_entry(iommu);
2653 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2654 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2656 ret = iommu_enable_translation(iommu);
2660 iommu_disable_protect_mem_regions(iommu);
2665 for_each_drhd_unit(drhd) {
2668 iommu = drhd->iommu;
2675 /* This takes a number of _MM_ pages, not VTD pages */
2676 static struct iova *intel_alloc_iova(struct device *dev,
2677 struct dmar_domain *domain,
2678 unsigned long nrpages, uint64_t dma_mask)
2680 struct pci_dev *pdev = to_pci_dev(dev);
2681 struct iova *iova = NULL;
2683 /* Restrict dma_mask to the width that the iommu can handle */
2684 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2686 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2688 * First try to allocate an I/O virtual address in
2689 * DMA_BIT_MASK(32); if that fails, fall back to the full 64-bit range.
2692 iova = alloc_iova(&domain->iovad, nrpages,
2693 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2697 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2698 if (unlikely(!iova)) {
2699 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2700 nrpages, pci_name(pdev));
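/*
 * Find (or lazily allocate) the DMA-API domain for a PCI device and make
 * sure its context entry is mapped before the first use.
 */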
2707 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2709 struct dmar_domain *domain;
2712 domain = get_domain_for_dev(pdev,
2713 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2716 "Allocating domain for %s failed", pci_name(pdev));
2720 /* make sure context mapping is ok */
2721 if (unlikely(!domain_context_mapped(pdev))) {
2722 ret = domain_context_mapping(domain, pdev,
2723 CONTEXT_TT_MULTI_LEVEL);
2726 "Domain context map for %s failed",
2735 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2737 struct device_domain_info *info;
2739 /* No lock here, assumes no domain exit in normal case */
2740 info = dev->dev.archdata.iommu;
2742 return info->domain;
2744 return __get_valid_domain_for_dev(dev);
2747 static int iommu_dummy(struct pci_dev *pdev)
2749 return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2752 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2753 static int iommu_no_mapping(struct device *dev)
2755 struct pci_dev *pdev;
2758 if (unlikely(dev->bus != &pci_bus_type))
2761 pdev = to_pci_dev(dev);
2762 if (iommu_dummy(pdev))
2765 if (!iommu_identity_mapping)
2768 found = identity_mapping(pdev);
2770 if (iommu_should_identity_map(pdev, 0))
2774 * A 32-bit DMA device is removed from si_domain and falls back
2775 * to non-identity mapping.
2777 domain_remove_one_dev_info(si_domain, pdev);
2778 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2784 * When a 64-bit DMA device is detached from a VM, the device
2785 * is put back into si_domain for identity mapping.
2787 if (iommu_should_identity_map(pdev, 0)) {
2789 ret = domain_add_dev_info(si_domain, pdev,
2791 CONTEXT_TT_PASS_THROUGH :
2792 CONTEXT_TT_MULTI_LEVEL);
2794 printk(KERN_INFO "64bit %s uses identity mapping\n",
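/*
 * Map one physically contiguous buffer for DMA: allocate an IOVA range for
 * the page-aligned size, install the PTEs, flush the IOTLB only when
 * caching mode requires it, and return the bus address plus the intra-page
 * offset.
 */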
2804 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2805 size_t size, int dir, u64 dma_mask)
2807 struct pci_dev *pdev = to_pci_dev(hwdev);
2808 struct dmar_domain *domain;
2809 phys_addr_t start_paddr;
2813 struct intel_iommu *iommu;
2814 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2816 BUG_ON(dir == DMA_NONE);
2818 if (iommu_no_mapping(hwdev))
2821 domain = get_valid_domain_for_dev(pdev);
2825 iommu = domain_get_iommu(domain);
2826 size = aligned_nrpages(paddr, size);
2828 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2833 * Check if DMAR supports zero-length reads on write-only
2836 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2837 !cap_zlr(iommu->cap))
2838 prot |= DMA_PTE_READ;
2839 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2840 prot |= DMA_PTE_WRITE;
2842 * paddr - (paddr + size) might be a partial page; we should map the whole
2843 * page. Note: if two parts of one page are separately mapped, we
2844 * might have two guest addresses mapping to the same host paddr, but this
2845 * is not a big problem
2847 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2848 mm_to_dma_pfn(paddr_pfn), size, prot);
2852 /* it's a non-present to present mapping. Only flush if caching mode */
2853 if (cap_caching_mode(iommu->cap))
2854 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2856 iommu_flush_write_buffer(iommu);
2858 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2859 start_paddr += paddr & ~PAGE_MASK;
2864 __free_iova(&domain->iovad, iova);
2865 printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2866 pci_name(pdev), size, (unsigned long long)paddr, dir);
2870 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2871 unsigned long offset, size_t size,
2872 enum dma_data_direction dir,
2873 struct dma_attrs *attrs)
2875 return __intel_map_single(dev, page_to_phys(page) + offset, size,
2876 dir, to_pci_dev(dev)->dma_mask);
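/*
 * Drain the deferred-unmap queues: for each IOMMU do one global IOTLB
 * flush (or per-range PSI flushes in caching mode, where global flushes
 * are costly to emulate) and only then return the queued IOVAs to their
 * domains.
 */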
2879 static void flush_unmaps(void)
2885 /* just flush them all */
2886 for (i = 0; i < g_num_of_iommus; i++) {
2887 struct intel_iommu *iommu = g_iommus[i];
2891 if (!deferred_flush[i].next)
2894 /* In caching mode, global flushes make emulation expensive */
2895 if (!cap_caching_mode(iommu->cap))
2896 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2897 DMA_TLB_GLOBAL_FLUSH);
2898 for (j = 0; j < deferred_flush[i].next; j++) {
2900 struct iova *iova = deferred_flush[i].iova[j];
2901 struct dmar_domain *domain = deferred_flush[i].domain[j];
2903 /* On real hardware multiple invalidations are expensive */
2904 if (cap_caching_mode(iommu->cap))
2905 iommu_flush_iotlb_psi(iommu, domain->id,
2906 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2908 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2909 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2910 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2912 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2914 deferred_flush[i].next = 0;
2920 static void flush_unmaps_timeout(unsigned long data)
2922 unsigned long flags;
2924 spin_lock_irqsave(&async_umap_flush_lock, flags);
2926 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
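/*
 * Queue an IOVA for deferred freeing instead of flushing right away; the
 * batch is drained when the timer set below fires or when the number of
 * queued entries reaches HIGH_WATER_MARK.
 */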
2929 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2931 unsigned long flags;
2933 struct intel_iommu *iommu;
2935 spin_lock_irqsave(&async_umap_flush_lock, flags);
2936 if (list_size == HIGH_WATER_MARK)
2939 iommu = domain_get_iommu(dom);
2940 iommu_id = iommu->seq_id;
2942 next = deferred_flush[iommu_id].next;
2943 deferred_flush[iommu_id].domain[next] = dom;
2944 deferred_flush[iommu_id].iova[next] = iova;
2945 deferred_flush[iommu_id].next++;
2948 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2952 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
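/*
 * Unmap a previously mapped range: look up the IOVA, clear and free the
 * page-table entries, then either flush synchronously (intel_iommu_strict)
 * or queue the IOVA for batched release via add_unmap().
 */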
2955 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2956 size_t size, enum dma_data_direction dir,
2957 struct dma_attrs *attrs)
2959 struct pci_dev *pdev = to_pci_dev(dev);
2960 struct dmar_domain *domain;
2961 unsigned long start_pfn, last_pfn;
2963 struct intel_iommu *iommu;
2965 if (iommu_no_mapping(dev))
2968 domain = find_domain(pdev);
2971 iommu = domain_get_iommu(domain);
2973 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2974 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2975 (unsigned long long)dev_addr))
2978 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2979 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2981 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2982 pci_name(pdev), start_pfn, last_pfn);
2984 /* clear the whole mapped range */
2985 dma_pte_clear_range(domain, start_pfn, last_pfn);
2987 /* free page tables */
2988 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2990 if (intel_iommu_strict) {
2991 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2992 last_pfn - start_pfn + 1, 0);
2994 __free_iova(&domain->iovad, iova);
2996 add_unmap(domain, iova);
2998 * queue up the release of the unmap to save the roughly 1/6th of the
2999 * CPU time used up by the IOTLB flush operation...
3004 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3005 dma_addr_t *dma_handle, gfp_t flags,
3006 struct dma_attrs *attrs)
3011 size = PAGE_ALIGN(size);
3012 order = get_order(size);
3014 if (!iommu_no_mapping(hwdev))
3015 flags &= ~(GFP_DMA | GFP_DMA32);
3016 else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3017 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3023 vaddr = (void *)__get_free_pages(flags, order);
3026 memset(vaddr, 0, size);
3028 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3030 hwdev->coherent_dma_mask);
3033 free_pages((unsigned long)vaddr, order);
3037 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3038 dma_addr_t dma_handle, struct dma_attrs *attrs)
3042 size = PAGE_ALIGN(size);
3043 order = get_order(size);
3045 intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3046 free_pages((unsigned long)vaddr, order);
3049 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3050 int nelems, enum dma_data_direction dir,
3051 struct dma_attrs *attrs)
3053 struct pci_dev *pdev = to_pci_dev(hwdev);
3054 struct dmar_domain *domain;
3055 unsigned long start_pfn, last_pfn;
3057 struct intel_iommu *iommu;
3059 if (iommu_no_mapping(hwdev))
3062 domain = find_domain(pdev);
3065 iommu = domain_get_iommu(domain);
3067 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3068 if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3069 (unsigned long long)sglist[0].dma_address))
3072 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3073 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3075 /* clear the whole mapped range */
3076 dma_pte_clear_range(domain, start_pfn, last_pfn);
3078 /* free page tables */
3079 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3081 if (intel_iommu_strict) {
3082 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3083 last_pfn - start_pfn + 1, 0);
3085 __free_iova(&domain->iovad, iova);
3087 add_unmap(domain, iova);
3089 * queue up the release of the unmap to save the roughly 1/6th of the
3090 * CPU time used up by the IOTLB flush operation...
3095 static int intel_nontranslate_map_sg(struct device *hddev,
3096 struct scatterlist *sglist, int nelems, int dir)
3099 struct scatterlist *sg;
3101 for_each_sg(sglist, sg, nelems, i) {
3102 BUG_ON(!sg_page(sg));
3103 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3104 sg->dma_length = sg->length;
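/*
 * Map a scatterlist: allocate a single IOVA range covering all segments,
 * map them contiguously in DMA address space, and flush only when caching
 * mode requires it.
 */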
3109 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3110 enum dma_data_direction dir, struct dma_attrs *attrs)
3113 struct pci_dev *pdev = to_pci_dev(hwdev);
3114 struct dmar_domain *domain;
3117 struct iova *iova = NULL;
3119 struct scatterlist *sg;
3120 unsigned long start_vpfn;
3121 struct intel_iommu *iommu;
3123 BUG_ON(dir == DMA_NONE);
3124 if (iommu_no_mapping(hwdev))
3125 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3127 domain = get_valid_domain_for_dev(pdev);
3131 iommu = domain_get_iommu(domain);
3133 for_each_sg(sglist, sg, nelems, i)
3134 size += aligned_nrpages(sg->offset, sg->length);
3136 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3139 sglist->dma_length = 0;
3144 * Check if DMAR supports zero-length reads on write-only
3147 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3148 !cap_zlr(iommu->cap))
3149 prot |= DMA_PTE_READ;
3150 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3151 prot |= DMA_PTE_WRITE;
3153 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3155 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3156 if (unlikely(ret)) {
3157 /* clear the mapped range */
3158 dma_pte_clear_range(domain, start_vpfn,
3159 start_vpfn + size - 1);
3160 /* free page tables */
3161 dma_pte_free_pagetable(domain, start_vpfn,
3162 start_vpfn + size - 1);
3164 __free_iova(&domain->iovad, iova);
3168 /* it's a non-present to present mapping. Only flush if caching mode */
3169 if (cap_caching_mode(iommu->cap))
3170 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3172 iommu_flush_write_buffer(iommu);
3177 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
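/*
 * These callbacks back the generic DMA API for devices behind VT-d; for
 * example, a driver's dma_map_page(dev, page, offset, size, DMA_TO_DEVICE)
 * ends up in intel_map_page() once intel_dma_ops is installed as dma_ops.
 */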
3182 struct dma_map_ops intel_dma_ops = {
3183 .alloc = intel_alloc_coherent,
3184 .free = intel_free_coherent,
3185 .map_sg = intel_map_sg,
3186 .unmap_sg = intel_unmap_sg,
3187 .map_page = intel_map_page,
3188 .unmap_page = intel_unmap_page,
3189 .mapping_error = intel_mapping_error,
3192 static inline int iommu_domain_cache_init(void)
3196 iommu_domain_cache = kmem_cache_create("iommu_domain",
3197 sizeof(struct dmar_domain),
3202 if (!iommu_domain_cache) {
3203 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3210 static inline int iommu_devinfo_cache_init(void)
3214 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3215 sizeof(struct device_domain_info),
3219 if (!iommu_devinfo_cache) {
3220 printk(KERN_ERR "Couldn't create devinfo cache\n");
3227 static inline int iommu_iova_cache_init(void)
3231 iommu_iova_cache = kmem_cache_create("iommu_iova",
3232 sizeof(struct iova),
3236 if (!iommu_iova_cache) {
3237 printk(KERN_ERR "Couldn't create iova cache\n");
3244 static int __init iommu_init_mempool(void)
3247 ret = iommu_iova_cache_init();
3251 ret = iommu_domain_cache_init();
3255 ret = iommu_devinfo_cache_init();
3259 kmem_cache_destroy(iommu_domain_cache);
3261 kmem_cache_destroy(iommu_iova_cache);
3266 static void __init iommu_exit_mempool(void)
3268 kmem_cache_destroy(iommu_devinfo_cache);
3269 kmem_cache_destroy(iommu_domain_cache);
3270 kmem_cache_destroy(iommu_iova_cache);
3274 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3276 struct dmar_drhd_unit *drhd;
3280 /* We know that this device on this chipset has its own IOMMU.
3281 * If we find it under a different IOMMU, then the BIOS is lying
3282 * to us. Hope that the IOMMU for this device is actually
3283 * disabled, and it needs no translation...
3285 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3287 /* "can't" happen */
3288 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3291 vtbar &= 0xffff0000;
3293 /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3294 drhd = dmar_find_matched_drhd_unit(pdev);
3295 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3296 TAINT_FIRMWARE_WORKAROUND,
3297 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3298 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3300 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
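/*
 * Decide which DMAR units to bypass: units whose device scope contains no
 * PCI devices are ignored, and units covering only graphics devices either
 * set intel_iommu_gfx_mapped or have their devices bypassed entirely,
 * depending on whether graphics mapping is enabled.
 */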
3302 static void __init init_no_remapping_devices(void)
3304 struct dmar_drhd_unit *drhd;
3306 for_each_drhd_unit(drhd) {
3307 if (!drhd->include_all) {
3309 for (i = 0; i < drhd->devices_cnt; i++)
3310 if (drhd->devices[i] != NULL)
3312 /* ignore the DMAR unit if no PCI devices exist */
3313 if (i == drhd->devices_cnt)
3318 for_each_drhd_unit(drhd) {
3320 if (drhd->ignored || drhd->include_all)
3323 for (i = 0; i < drhd->devices_cnt; i++)
3324 if (drhd->devices[i] &&
3325 !IS_GFX_DEVICE(drhd->devices[i]))
3328 if (i < drhd->devices_cnt)
3331 /* This IOMMU has *only* gfx devices. Either bypass it or
3332 set the gfx_mapped flag, as appropriate */
3334 intel_iommu_gfx_mapped = 1;
3337 for (i = 0; i < drhd->devices_cnt; i++) {
3338 if (!drhd->devices[i])
3340 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3346 #ifdef CONFIG_SUSPEND
3347 static int init_iommu_hw(void)
3349 struct dmar_drhd_unit *drhd;
3350 struct intel_iommu *iommu = NULL;
3352 for_each_active_iommu(iommu, drhd)
3354 dmar_reenable_qi(iommu);
3356 for_each_iommu(iommu, drhd) {
3357 if (drhd->ignored) {
3359 * we always have to disable PMRs or DMA may fail on this device
3363 iommu_disable_protect_mem_regions(iommu);
3367 iommu_flush_write_buffer(iommu);
3369 iommu_set_root_entry(iommu);
3371 iommu->flush.flush_context(iommu, 0, 0, 0,
3372 DMA_CCMD_GLOBAL_INVL);
3373 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3374 DMA_TLB_GLOBAL_FLUSH);
3375 if (iommu_enable_translation(iommu))
3377 iommu_disable_protect_mem_regions(iommu);
3383 static void iommu_flush_all(void)
3385 struct dmar_drhd_unit *drhd;
3386 struct intel_iommu *iommu;
3388 for_each_active_iommu(iommu, drhd) {
3389 iommu->flush.flush_context(iommu, 0, 0, 0,
3390 DMA_CCMD_GLOBAL_INVL);
3391 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3392 DMA_TLB_GLOBAL_FLUSH);
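/*
 * Suspend: disable translation and save the fault-event registers of every
 * active IOMMU so iommu_resume() can restore them after S3/S4.
 */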
3396 static int iommu_suspend(void)
3398 struct dmar_drhd_unit *drhd;
3399 struct intel_iommu *iommu = NULL;
3402 for_each_active_iommu(iommu, drhd) {
3403 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3405 if (!iommu->iommu_state)
3411 for_each_active_iommu(iommu, drhd) {
3412 iommu_disable_translation(iommu);
3414 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3416 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3417 readl(iommu->reg + DMAR_FECTL_REG);
3418 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3419 readl(iommu->reg + DMAR_FEDATA_REG);
3420 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3421 readl(iommu->reg + DMAR_FEADDR_REG);
3422 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3423 readl(iommu->reg + DMAR_FEUADDR_REG);
3425 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3430 for_each_active_iommu(iommu, drhd)
3431 kfree(iommu->iommu_state);
3436 static void iommu_resume(void)
3438 struct dmar_drhd_unit *drhd;
3439 struct intel_iommu *iommu = NULL;
3442 if (init_iommu_hw()) {
3444 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3446 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3450 for_each_active_iommu(iommu, drhd) {
3452 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3454 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3455 iommu->reg + DMAR_FECTL_REG);
3456 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3457 iommu->reg + DMAR_FEDATA_REG);
3458 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3459 iommu->reg + DMAR_FEADDR_REG);
3460 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3461 iommu->reg + DMAR_FEUADDR_REG);
3463 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3466 for_each_active_iommu(iommu, drhd)
3467 kfree(iommu->iommu_state);
3470 static struct syscore_ops iommu_syscore_ops = {
3471 .resume = iommu_resume,
3472 .suspend = iommu_suspend,
3475 static void __init init_iommu_pm_ops(void)
3477 register_syscore_ops(&iommu_syscore_ops);
3481 static inline void init_iommu_pm_ops(void) {}
3482 #endif /* CONFIG_PM */
3484 LIST_HEAD(dmar_rmrr_units);
3486 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3488 list_add(&rmrr->list, &dmar_rmrr_units);
3492 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3494 struct acpi_dmar_reserved_memory *rmrr;
3495 struct dmar_rmrr_unit *rmrru;
3497 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3501 rmrru->hdr = header;
3502 rmrr = (struct acpi_dmar_reserved_memory *)header;
3503 rmrru->base_address = rmrr->base_address;
3504 rmrru->end_address = rmrr->end_address;
3506 dmar_register_rmrr_unit(rmrru);
3511 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3513 struct acpi_dmar_reserved_memory *rmrr;
3516 rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3517 ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3518 ((void *)rmrr) + rmrr->header.length,
3519 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3521 if (ret || (rmrru->devices_cnt == 0)) {
3522 list_del(&rmrru->list);
3528 static LIST_HEAD(dmar_atsr_units);
3530 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3532 struct acpi_dmar_atsr *atsr;
3533 struct dmar_atsr_unit *atsru;
3535 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3536 atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3541 atsru->include_all = atsr->flags & 0x1;
3543 list_add(&atsru->list, &dmar_atsr_units);
3548 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3551 struct acpi_dmar_atsr *atsr;
3553 if (atsru->include_all)
3556 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3557 rc = dmar_parse_dev_scope((void *)(atsr + 1),
3558 (void *)atsr + atsr->header.length,
3559 &atsru->devices_cnt, &atsru->devices,
3561 if (rc || !atsru->devices_cnt) {
3562 list_del(&atsru->list);
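/*
 * Return non-zero if ATS may be used for this device: walk up the PCIe
 * hierarchy and check whether a root port above the device is listed in a
 * matching ATSR unit (or the unit is flagged include_all).
 */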
3569 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3572 struct pci_bus *bus;
3573 struct acpi_dmar_atsr *atsr;
3574 struct dmar_atsr_unit *atsru;
3576 dev = pci_physfn(dev);
3578 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3579 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3580 if (atsr->segment == pci_domain_nr(dev->bus))
3587 for (bus = dev->bus; bus; bus = bus->parent) {
3588 struct pci_dev *bridge = bus->self;
3590 if (!bridge || !pci_is_pcie(bridge) ||
3591 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3594 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) {
3595 for (i = 0; i < atsru->devices_cnt; i++)
3596 if (atsru->devices[i] == bridge)
3602 if (atsru->include_all)
3608 int __init dmar_parse_rmrr_atsr_dev(void)
3610 struct dmar_rmrr_unit *rmrr, *rmrr_n;
3611 struct dmar_atsr_unit *atsr, *atsr_n;
3614 list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3615 ret = rmrr_parse_dev(rmrr);
3620 list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3621 ret = atsr_parse_dev(atsr);
3630 * Here we only respond to a device being unbound from its driver.
3632 * A newly added device is not attached to its DMAR domain here yet; that
3633 * happens when the device is mapped to an iova.
3635 static int device_notifier(struct notifier_block *nb,
3636 unsigned long action, void *data)
3638 struct device *dev = data;
3639 struct pci_dev *pdev = to_pci_dev(dev);
3640 struct dmar_domain *domain;
3642 if (iommu_no_mapping(dev))
3645 domain = find_domain(pdev);
3649 if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3650 domain_remove_one_dev_info(domain, pdev);
3652 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3653 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3654 list_empty(&domain->devices))
3655 domain_exit(domain);
3661 static struct notifier_block device_nb = {
3662 .notifier_call = device_notifier,
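/*
 * Main entry point: parse the DMAR table and device scopes, initialize all
 * DMAR units, install intel_dma_ops as the DMA API backend, and register
 * with the generic IOMMU layer and the PCI bus notifier.
 */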
3665 int __init intel_iommu_init(void)
3669 /* VT-d is required for a TXT/tboot launch, so enforce that */
3670 force_on = tboot_force_iommu();
3672 if (dmar_table_init()) {
3674 panic("tboot: Failed to initialize DMAR table\n");
3678 if (dmar_dev_scope_init() < 0) {
3680 panic("tboot: Failed to initialize DMAR device scope\n");
3684 if (no_iommu || dmar_disabled)
3687 if (iommu_init_mempool()) {
3689 panic("tboot: Failed to initialize iommu memory\n");
3693 if (list_empty(&dmar_rmrr_units))
3694 printk(KERN_INFO "DMAR: No RMRR found\n");
3696 if (list_empty(&dmar_atsr_units))
3697 printk(KERN_INFO "DMAR: No ATSR found\n");
3699 if (dmar_init_reserved_ranges()) {
3701 panic("tboot: Failed to reserve iommu ranges\n");
3705 init_no_remapping_devices();
3710 panic("tboot: Failed to initialize DMARs\n");
3711 printk(KERN_ERR "IOMMU: dmar init failed\n");
3712 put_iova_domain(&reserved_iova_list);
3713 iommu_exit_mempool();
3717 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3719 init_timer(&unmap_timer);
3720 #ifdef CONFIG_SWIOTLB
3723 dma_ops = &intel_dma_ops;
3725 init_iommu_pm_ops();
3727 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3729 bus_register_notifier(&pci_bus_type, &device_nb);
3731 intel_iommu_enabled = 1;
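/*
 * A device behind a PCIe-to-PCI bridge shares the bridge's source-id, so
 * detaching it also requires tearing down the context entries of the
 * upstream bridges between the device and that bridge.
 */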
3736 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3737 struct pci_dev *pdev)
3739 struct pci_dev *tmp, *parent;
3741 if (!iommu || !pdev)
3744 /* dependent device detach */
3745 tmp = pci_find_upstream_pcie_bridge(pdev);
3746 /* Secondary interface's bus number and devfn 0 */
3748 parent = pdev->bus->self;
3749 while (parent != tmp) {
3750 iommu_detach_dev(iommu, parent->bus->number,
3752 parent = parent->bus->self;
3754 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3755 iommu_detach_dev(iommu,
3756 tmp->subordinate->number, 0);
3757 else /* this is a legacy PCI bridge */
3758 iommu_detach_dev(iommu, tmp->bus->number,
3763 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3764 struct pci_dev *pdev)
3766 struct device_domain_info *info;
3767 struct intel_iommu *iommu;
3768 unsigned long flags;
3770 struct list_head *entry, *tmp;
3772 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3777 spin_lock_irqsave(&device_domain_lock, flags);
3778 list_for_each_safe(entry, tmp, &domain->devices) {
3779 info = list_entry(entry, struct device_domain_info, link);
3780 if (info->segment == pci_domain_nr(pdev->bus) &&
3781 info->bus == pdev->bus->number &&
3782 info->devfn == pdev->devfn) {
3783 unlink_domain_info(info);
3784 spin_unlock_irqrestore(&device_domain_lock, flags);
3786 iommu_disable_dev_iotlb(info);
3787 iommu_detach_dev(iommu, info->bus, info->devfn);
3788 iommu_detach_dependent_devices(iommu, pdev);
3789 free_devinfo_mem(info);
3791 spin_lock_irqsave(&device_domain_lock, flags);
3799 /* if there are no other devices under the same iommu
3800 * owned by this domain, clear this iommu in iommu_bmp and
3801 * update the iommu count and coherency
3803 if (iommu == device_to_iommu(info->segment, info->bus,
3808 spin_unlock_irqrestore(&device_domain_lock, flags);
3811 unsigned long tmp_flags;
3812 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3813 clear_bit(iommu->seq_id, domain->iommu_bmp);
3814 domain->iommu_count--;
3815 domain_update_iommu_cap(domain);
3816 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3818 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3819 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3820 spin_lock_irqsave(&iommu->lock, tmp_flags);
3821 clear_bit(domain->id, iommu->domain_ids);
3822 iommu->domains[domain->id] = NULL;
3823 spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3828 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3830 struct device_domain_info *info;
3831 struct intel_iommu *iommu;
3832 unsigned long flags1, flags2;
3834 spin_lock_irqsave(&device_domain_lock, flags1);
3835 while (!list_empty(&domain->devices)) {
3836 info = list_entry(domain->devices.next,
3837 struct device_domain_info, link);
3838 unlink_domain_info(info);
3839 spin_unlock_irqrestore(&device_domain_lock, flags1);
3841 iommu_disable_dev_iotlb(info);
3842 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3843 iommu_detach_dev(iommu, info->bus, info->devfn);
3844 iommu_detach_dependent_devices(iommu, info->dev);
3846 /* clear this iommu in iommu_bmp, update iommu count and coherency */
3849 spin_lock_irqsave(&domain->iommu_lock, flags2);
3850 if (test_and_clear_bit(iommu->seq_id,
3851 domain->iommu_bmp)) {
3852 domain->iommu_count--;
3853 domain_update_iommu_cap(domain);
3855 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3857 free_devinfo_mem(info);
3858 spin_lock_irqsave(&device_domain_lock, flags1);
3860 spin_unlock_irqrestore(&device_domain_lock, flags1);
3863 /* domain id for a virtual machine domain; it won't be set in a context entry */
3864 static unsigned long vm_domid;
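/*
 * Allocate a domain for use through the IOMMU API ("virtual machine"
 * domain). Such domains take their IDs from vm_domid rather than from a
 * particular IOMMU's domain_ids bitmap.
 */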
3866 static struct dmar_domain *iommu_alloc_vm_domain(void)
3868 struct dmar_domain *domain;
3870 domain = alloc_domain_mem();
3874 domain->id = vm_domid++;
3876 memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3877 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3882 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3886 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3887 spin_lock_init(&domain->iommu_lock);
3889 domain_reserve_special_ranges(domain);
3891 /* calculate AGAW */
3892 domain->gaw = guest_width;
3893 adjust_width = guestwidth_to_adjustwidth(guest_width);
3894 domain->agaw = width_to_agaw(adjust_width);
3896 INIT_LIST_HEAD(&domain->devices);
3898 domain->iommu_count = 0;
3899 domain->iommu_coherency = 0;
3900 domain->iommu_snooping = 0;
3901 domain->iommu_superpage = 0;
3902 domain->max_addr = 0;
3905 /* always allocate the top pgd */
3906 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3909 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3913 static void iommu_free_vm_domain(struct dmar_domain *domain)
3915 unsigned long flags;
3916 struct dmar_drhd_unit *drhd;
3917 struct intel_iommu *iommu;
3919 unsigned long ndomains;
3921 for_each_drhd_unit(drhd) {
3924 iommu = drhd->iommu;
3926 ndomains = cap_ndoms(iommu->cap);
3927 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3928 if (iommu->domains[i] == domain) {
3929 spin_lock_irqsave(&iommu->lock, flags);
3930 clear_bit(i, iommu->domain_ids);
3931 iommu->domains[i] = NULL;
3932 spin_unlock_irqrestore(&iommu->lock, flags);
3939 static void vm_domain_exit(struct dmar_domain *domain)
3941 /* Domain 0 is reserved, so don't process it */
3945 vm_domain_remove_all_dev_info(domain);
3947 put_iova_domain(&domain->iovad);
3950 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3952 /* free page tables */
3953 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3955 iommu_free_vm_domain(domain);
3956 free_domain_mem(domain);
3959 static int intel_iommu_domain_init(struct iommu_domain *domain)
3961 struct dmar_domain *dmar_domain;
3963 dmar_domain = iommu_alloc_vm_domain();
3966 "intel_iommu_domain_init: dmar_domain == NULL\n");
3969 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3971 "intel_iommu_domain_init() failed\n");
3972 vm_domain_exit(dmar_domain);
3975 domain_update_iommu_cap(dmar_domain);
3976 domain->priv = dmar_domain;
3978 domain->geometry.aperture_start = 0;
3979 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3980 domain->geometry.force_aperture = true;
3985 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3987 struct dmar_domain *dmar_domain = domain->priv;
3989 domain->priv = NULL;
3990 vm_domain_exit(dmar_domain);
3993 static int intel_iommu_attach_device(struct iommu_domain *domain,
3996 struct dmar_domain *dmar_domain = domain->priv;
3997 struct pci_dev *pdev = to_pci_dev(dev);
3998 struct intel_iommu *iommu;
4001 /* normally pdev is not mapped */
4002 if (unlikely(domain_context_mapped(pdev))) {
4003 struct dmar_domain *old_domain;
4005 old_domain = find_domain(pdev);
4007 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4008 dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4009 domain_remove_one_dev_info(old_domain, pdev);
4011 domain_remove_dev_info(old_domain);
4015 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4020 /* check if this iommu agaw is sufficient for max mapped address */
4021 addr_width = agaw_to_width(iommu->agaw);
4022 if (addr_width > cap_mgaw(iommu->cap))
4023 addr_width = cap_mgaw(iommu->cap);
4025 if (dmar_domain->max_addr > (1LL << addr_width)) {
4026 printk(KERN_ERR "%s: iommu width (%d) is not "
4027 "sufficient for the mapped address (%llx)\n",
4028 __func__, addr_width, dmar_domain->max_addr);
4031 dmar_domain->gaw = addr_width;
4034 * Knock out extra levels of page tables if necessary
4036 while (iommu->agaw < dmar_domain->agaw) {
4037 struct dma_pte *pte;
4039 pte = dmar_domain->pgd;
4040 if (dma_pte_present(pte)) {
4041 dmar_domain->pgd = (struct dma_pte *)
4042 phys_to_virt(dma_pte_addr(pte));
4043 free_pgtable_page(pte);
4045 dmar_domain->agaw--;
4048 return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4051 static void intel_iommu_detach_device(struct iommu_domain *domain,
4054 struct dmar_domain *dmar_domain = domain->priv;
4055 struct pci_dev *pdev = to_pci_dev(dev);
4057 domain_remove_one_dev_info(dmar_domain, pdev);
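/*
 * IOMMU-API map callback, reached through the generic API, e.g. (sketch,
 * not taken from this file):
 *
 *	dom = iommu_domain_alloc(&pci_bus_type);
 *	iommu_attach_device(dom, &pdev->dev);
 *	iommu_map(dom, iova, phys, SZ_4K, IOMMU_READ | IOMMU_WRITE);
 *
 * IOMMU_READ/WRITE/CACHE are translated into DMA PTE bits below, and the
 * domain's max_addr bookkeeping is grown to cover the new mapping.
 */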
4060 static int intel_iommu_map(struct iommu_domain *domain,
4061 unsigned long iova, phys_addr_t hpa,
4062 size_t size, int iommu_prot)
4064 struct dmar_domain *dmar_domain = domain->priv;
4069 if (iommu_prot & IOMMU_READ)
4070 prot |= DMA_PTE_READ;
4071 if (iommu_prot & IOMMU_WRITE)
4072 prot |= DMA_PTE_WRITE;
4073 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4074 prot |= DMA_PTE_SNP;
4076 max_addr = iova + size;
4077 if (dmar_domain->max_addr < max_addr) {
4080 /* check if minimum agaw is sufficient for mapped address */
4081 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4082 if (end < max_addr) {
4083 printk(KERN_ERR "%s: iommu width (%d) is not "
4084 "sufficient for the mapped address (%llx)\n",
4085 __func__, dmar_domain->gaw, max_addr);
4088 dmar_domain->max_addr = max_addr;
4090 /* Round up size to next multiple of PAGE_SIZE, if it and
4091 the low bits of hpa would take us onto the next page */
4092 size = aligned_nrpages(hpa, size);
4093 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4094 hpa >> VTD_PAGE_SHIFT, size, prot);
4098 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4099 unsigned long iova, size_t size)
4101 struct dmar_domain *dmar_domain = domain->priv;
4104 order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4105 (iova + size - 1) >> VTD_PAGE_SHIFT);
4107 if (dmar_domain->max_addr == iova + size)
4108 dmar_domain->max_addr = iova;
4110 return PAGE_SIZE << order;
4113 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4116 struct dmar_domain *dmar_domain = domain->priv;
4117 struct dma_pte *pte;
4120 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4122 phys = dma_pte_addr(pte);
4127 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4130 struct dmar_domain *dmar_domain = domain->priv;
4132 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4133 return dmar_domain->iommu_snooping;
4134 if (cap == IOMMU_CAP_INTR_REMAP)
4135 return irq_remapping_enabled;
4140 static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
4146 #define REQ_ACS_FLAGS (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4148 static int intel_iommu_add_device(struct device *dev)
4150 struct pci_dev *pdev = to_pci_dev(dev);
4151 struct pci_dev *bridge, *dma_pdev = NULL;
4152 struct iommu_group *group;
4155 if (!device_to_iommu(pci_domain_nr(pdev->bus),
4156 pdev->bus->number, pdev->devfn))
4159 bridge = pci_find_upstream_pcie_bridge(pdev);
4161 if (pci_is_pcie(bridge))
4162 dma_pdev = pci_get_domain_bus_and_slot(
4163 pci_domain_nr(pdev->bus),
4164 bridge->subordinate->number, 0);
4166 dma_pdev = pci_dev_get(bridge);
4168 dma_pdev = pci_dev_get(pdev);
4170 /* Account for quirked devices */
4171 swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4174 * If it's a multifunction device that does not support our
4175 * required ACS flags, add to the same group as function 0.
4177 if (dma_pdev->multifunction &&
4178 !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS))
4179 swap_pci_ref(&dma_pdev,
4180 pci_get_slot(dma_pdev->bus,
4181 PCI_DEVFN(PCI_SLOT(dma_pdev->devfn),
4185 * Devices on the root bus go through the iommu. If that's not us,
4186 * find the next upstream device and test ACS up to the root bus.
4187 * Finding the next device may require skipping virtual buses.
4189 while (!pci_is_root_bus(dma_pdev->bus)) {
4190 struct pci_bus *bus = dma_pdev->bus;
4192 while (!bus->self) {
4193 if (!pci_is_root_bus(bus))
4199 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4202 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4206 group = iommu_group_get(&dma_pdev->dev);
4207 pci_dev_put(dma_pdev);
4209 group = iommu_group_alloc();
4211 return PTR_ERR(group);
4214 ret = iommu_group_add_device(group, dev);
4216 iommu_group_put(group);
4220 static void intel_iommu_remove_device(struct device *dev)
4222 iommu_group_remove_device(dev);
4225 static struct iommu_ops intel_iommu_ops = {
4226 .domain_init = intel_iommu_domain_init,
4227 .domain_destroy = intel_iommu_domain_destroy,
4228 .attach_dev = intel_iommu_attach_device,
4229 .detach_dev = intel_iommu_detach_device,
4230 .map = intel_iommu_map,
4231 .unmap = intel_iommu_unmap,
4232 .iova_to_phys = intel_iommu_iova_to_phys,
4233 .domain_has_cap = intel_iommu_domain_has_cap,
4234 .add_device = intel_iommu_add_device,
4235 .remove_device = intel_iommu_remove_device,
4236 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
4239 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4241 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4242 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4248 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4249 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4250 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4251 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4252 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4254 static void quirk_iommu_rwbf(struct pci_dev *dev)
4257 * Mobile 4 Series Chipset neglects to set RWBF capability,
4258 * but needs it. Same seems to hold for the desktop versions.
4260 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4264 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4265 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4266 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4267 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4268 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4269 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4270 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4273 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4274 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4275 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4276 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4277 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4278 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4279 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4280 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4282 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4286 if (pci_read_config_word(dev, GGC, &ggc))
4289 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4290 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4292 } else if (dmar_map_gfx) {
4293 /* we have to ensure the gfx device is idle before we flush */
4294 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4295 intel_iommu_strict = 1;
4298 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4299 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4300 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4301 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4303 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4304 ISOCH DMAR unit for the Azalia sound device, but not give it any
4305 TLB entries, which causes it to deadlock. Check for that. We do
4306 this in a function called from init_dmars(), instead of in a PCI
4307 quirk, because we don't want to print the obnoxious "BIOS broken"
4308 message if VT-d is actually disabled.
4310 static void __init check_tylersburg_isoch(void)
4312 struct pci_dev *pdev;
4313 uint32_t vtisochctrl;
4315 /* If there's no Azalia in the system anyway, forget it. */
4316 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4321 /* System Management Registers. Might be hidden, in which case
4322 we can't do the sanity check. But that's OK, because the
4323 known-broken BIOSes _don't_ actually hide it, so far. */
4324 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4328 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4335 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4336 if (vtisochctrl & 1)
4339 /* Drop all bits other than the number of TLB entries */
4340 vtisochctrl &= 0x1c;
4342 /* If we have the recommended number of TLB entries (16), fine. */
4343 if (vtisochctrl == 0x10)
4346 /* Zero TLB entries? You get to ride the short bus to school. */
4348 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4349 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4350 dmi_get_system_info(DMI_BIOS_VENDOR),
4351 dmi_get_system_info(DMI_BIOS_VERSION),
4352 dmi_get_system_info(DMI_PRODUCT_VERSION));
4353 iommu_identity_mapping |= IDENTMAP_AZALIA;
4357 printk(KERN_WARNING "DMAR: Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",