/*
 * Copyright (c) 2006, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 *
 * Copyright (C) 2006-2008 Intel Corporation
 * Author: Ashok Raj <ashok.raj@intel.com>
 * Author: Shaohua Li <shaohua.li@intel.com>
 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
 * Author: Fenghua Yu <fenghua.yu@intel.com>
 */
#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/sysdev.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define MAX_AGAW_WIDTH 64

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
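/*
 * Illustrative arithmetic (not part of the original source): with
 * VTD_PAGE_SHIFT == 12 and the default 48-bit guest address width,
 * __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1 == 0xFFFFFFFFF, so
 * DOMAIN_MAX_ADDR(48) == 0xFFFFFFFFF000, the start of the last mappable
 * VT-d page. On a 32-bit kernel, DOMAIN_MAX_PFN() clamps the PFN to
 * 0xFFFFFFFF so it still fits in an unsigned long.
 */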
#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
#define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32))
#define DMA_64BIT_PFN		IOVA_PFN(DMA_BIT_MASK(64))

/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}

static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
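/*
 * Illustrative example (not part of the original source): on x86 with
 * 4KiB kernel pages, PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so both
 * conversions shift by zero and MM and DMA PFNs are identical. With a
 * larger MM page size, e.g. 64KiB (PAGE_SHIFT == 16), one MM PFN spans
 * 16 VT-d PFNs: mm_to_dma_pfn(3) == 48 and dma_to_mm_pfn(48) == 3.
 */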
/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * 12-63: Context Ptr (12 - (haw-1))
 */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

static inline bool root_present(struct root_entry *root)
{
	return (root->val & 1);
}

static inline void set_root_present(struct root_entry *root)
{
	root->val |= 1;
}

static inline void set_root_value(struct root_entry *root, unsigned long value)
{
	root->val |= value & VTD_PAGE_MASK;
}

static inline struct context_entry *
get_context_addr_from_root(struct root_entry *root)
{
	return (struct context_entry *)
		(root_present(root) ? phys_to_virt(
		root->val & VTD_PAGE_MASK) :
		NULL);
}

/*
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 */
struct context_entry {
	u64 lo;
	u64 hi;
};

static inline bool context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
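/*
 * Worked example (illustrative, not from the original source): a context
 * entry for domain id 5, pointing at a 4-level (agaw 2, 48-bit) page
 * table whose root sits at physical address 0x12345000, using
 * multi-level translation (type 0), is composed by the setters above as:
 *
 *	lo = 0x12345000 | 1	(address space root | present) == 0x12345001
 *	hi = 2 | (5 << 8)	(address width | domain id)    == 0x502
 */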
/*
 * 12-63: Host physical address
 */

static inline void dma_clear_pte(struct dma_pte *pte)
{
	pte->val = 0;
}

static inline void dma_set_pte_readable(struct dma_pte *pte)
{
	pte->val |= DMA_PTE_READ;
}

static inline void dma_set_pte_writable(struct dma_pte *pte)
{
	pte->val |= DMA_PTE_WRITE;
}

static inline void dma_set_pte_snp(struct dma_pte *pte)
{
	pte->val |= DMA_PTE_SNP;
}

static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
{
	pte->val = (pte->val & ~3) | (prot & 3);
}

static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
	return pte->val & VTD_PAGE_MASK;
#else
	/* Must have a full atomic 64-bit read */
	return __cmpxchg64(pte, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}

static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
{
	pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
}

static inline bool dma_pte_present(struct dma_pte *pte)
{
	return (pte->val & 3) != 0;
}

static inline int first_pte_in_page(struct dma_pte *pte)
{
	return !((unsigned long)pte & ~VTD_PAGE_MASK);
}
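/*
 * Illustrative note (not from the original source): struct dma_pte is a
 * single u64, so one 4KiB VT-d page holds 512 PTEs. first_pte_in_page()
 * tests the low 12 bits of the pointer (~VTD_PAGE_MASK): a pte at
 * virtual address ...345000 is the first entry of its table page, while
 * one at ...345008 is the second and the test returns false.
 */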
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/* devices under the same p2p bridge are owned in one domain */
#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)

/* domain represents a virtual machine; more than one device
 * across iommus may be owned in one domain, e.g. kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 1)

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 2)
struct dmar_domain {
	int	id;			/* domain id */
	int	nid;			/* node id */
	unsigned long iommu_bmp;	/* bitmap of iommus this domain uses */

	struct list_head devices;	/* all devices' list */
	struct iova_domain iovad;	/* iova's that belong to this domain */

	struct dma_pte	*pgd;		/* virtual address */
	int		gaw;		/* max guest address width */

	/* adjusted guest address width, 0 is level 2 30-bit */
	int		agaw;

	int		flags;		/* flags to find out type of domain */

	int		iommu_coherency;/* indicate coherency of iommu access */
	int		iommu_snooping; /* indicate snooping control feature */
	int		iommu_count;	/* reference count of iommu */
	spinlock_t	iommu_lock;	/* protect iommu set in domain */
	u64		max_addr;	/* maximum mapped address */
};

/* PCI domain-device relationship */
struct device_domain_info {
	struct list_head link;	/* link to domain siblings */
	struct list_head global; /* link to global list */
	int segment;		/* PCI domain */
	u8 bus;			/* PCI bus number */
	u8 devfn;		/* PCI devfn number */
	struct pci_dev *dev;	/* it's NULL for PCIe-to-PCI bridge */
	struct intel_iommu *iommu; /* IOMMU used by this device */
	struct dmar_domain *domain; /* pointer to domain */
};

static void flush_unmaps_timeout(unsigned long data);

DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);

#define HIGH_WATER_MARK 250
struct deferred_flush_tables {
	int next;
	struct iova *iova[HIGH_WATER_MARK];
	struct dmar_domain *domain[HIGH_WATER_MARK];
};

static struct deferred_flush_tables *deferred_flush;

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static DEFINE_SPINLOCK(async_umap_flush_lock);
static LIST_HEAD(unmaps_to_do);

static int timer_on;
static long list_size;
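/*
 * Sketch of the deferred-unmap scheme implied by the structures above
 * (my summary, not original text): rather than flushing the IOTLB on
 * every unmap, freed IOVAs are queued per-iommu in deferred_flush[].
 * flush_unmaps() later performs one global IOTLB flush per iommu and
 * releases the queued IOVAs in a batch, either when a queue approaches
 * HIGH_WATER_MARK entries or when unmap_timer fires. Booting with
 * intel_iommu=strict disables the batching and flushes synchronously.
 */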
static void domain_remove_dev_info(struct dmar_domain *domain);

#ifdef CONFIG_DMAR_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /*CONFIG_DMAR_DEFAULT_ON*/

static int __initdata dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

static struct iommu_ops intel_iommu_ops;

static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			printk(KERN_INFO "Intel-IOMMU: enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			printk(KERN_INFO "Intel-IOMMU: disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			printk(KERN_INFO
				"Intel-IOMMU: disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			printk(KERN_INFO
				"Intel-IOMMU: Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
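/*
 * Usage example (mine, not original text): the options above combine as
 * a comma-separated list on the kernel command line, e.g.
 *
 *	intel_iommu=on,igfx_off,strict
 *
 * which enables the IOMMU, skips mapping the integrated graphics device,
 * and makes IOTLB flushes synchronous.
 */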
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
static struct kmem_cache *iommu_iova_cache;

static inline void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

struct iova *alloc_iova_mem(void)
{
	return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
}

void free_iova_mem(struct iova *iova)
{
	kmem_cache_free(iommu_iova_cache, iova);
}

static inline int width_to_agaw(int width);

static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw = -1;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus; use a default agaw, and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}

/* This function only returns a single iommu in a domain */
static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
	BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);

	iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}

static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	int i;

	domain->iommu_coherency = 1;

	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
		if (!ecap_coherent(g_iommus[i]->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
}

static void domain_update_iommu_snooping(struct dmar_domain *domain)
{
	int i;

	domain->iommu_snooping = 1;

	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
		if (!ecap_sc_support(g_iommus[i]->ecap)) {
			domain->iommu_snooping = 0;
			break;
		}
	}
}

/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain_update_iommu_snooping(domain);
}

static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	int i;

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		if (segment != drhd->segment)
			continue;

		for (i = 0; i < drhd->devices_cnt; i++) {
			if (drhd->devices[i] &&
			    drhd->devices[i]->bus->number == bus &&
			    drhd->devices[i]->devfn == devfn)
				return drhd->iommu;
			if (drhd->devices[i] &&
			    drhd->devices[i]->subordinate &&
			    drhd->devices[i]->subordinate->number <= bus &&
			    drhd->devices[i]->subordinate->subordinate >= bus)
				return drhd->iommu;
		}

		if (drhd->include_all)
			return drhd->iommu;
	}

	return NULL;
}

static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
{
	if (!domain->iommu_coherency)
		clflush_cache_range(addr, size);
}

/* Gets context entry for a given bus and devfn */
static struct context_entry *device_to_context_entry(struct intel_iommu *iommu,
						     u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long phy_addr;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (!context) {
		context = (struct context_entry *)
				alloc_pgtable_page(iommu->node);
		if (!context) {
			spin_unlock_irqrestore(&iommu->lock, flags);
			return NULL;
		}
		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		set_root_value(root, phy_addr);
		set_root_present(root);
		__iommu_flush_cache(iommu, root, sizeof(*root));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
	return &context[devfn];
}

static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (context)
		ret = context_present(&context[devfn]);
	spin_unlock_irqrestore(&iommu->lock, flags);
	return ret;
}

static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (context) {
		context_clear_entry(&context[devfn]);
		__iommu_flush_cache(iommu, &context[devfn],
				    sizeof(*context));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
}

static void free_context_table(struct intel_iommu *iommu)
{
	struct root_entry *root;
	int i;
	unsigned long flags;
	struct context_entry *context;

	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry) {
		goto out;
	}
	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		root = &iommu->root_entry[i];
		context = get_context_addr_from_root(root);
		if (context)
			free_pgtable_page(context);
	}
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
}

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return 30 + agaw * LEVEL_STRIDE;
}

static inline int width_to_agaw(int width)
{
	return (width - 30) / LEVEL_STRIDE;
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(unsigned long pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline unsigned long level_mask(int level)
{
	return -1UL << level_to_offset_bits(level);
}

static inline unsigned long level_size(int level)
{
	return 1UL << level_to_offset_bits(level);
}

static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}
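/*
 * Worked example (illustrative, not from the original source): each
 * table level decodes LEVEL_STRIDE == 9 bits of the VT-d PFN, i.e. 512
 * entries per 4KiB table page. For agaw 2 (48-bit width, 4 levels) and
 * pfn 0x12345:
 *
 *	pfn_level_offset(0x12345, 1) == 0x12345 & 0x1ff        == 0x145
 *	pfn_level_offset(0x12345, 2) == (0x12345 >> 9) & 0x1ff == 0x091
 *	level_size(2) == 512 pfns == 2MiB of address space
 *	align_to_level(0x12345, 2) == 0x12400
 */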
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	struct dma_pte *parent, *pte = NULL;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);
	BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
	parent = domain->pgd;

	while (level > 0) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (level == 1)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);
			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (cmpxchg64(&pte->val, 0ULL, pteval)) {
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			}
			domain_flush_cache(domain, pte, sizeof(*pte));
		}
		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	return pte;
}

/* return address's pte at a specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level)
{
	struct dma_pte *parent, *pte = NULL;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte))
			break;
		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}

/* clear last level pte; a tlb flush should follow */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	struct dma_pte *first_pte, *pte;

	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
		if (!pte) {
			start_pfn = align_to_level(start_pfn + 1, 2);
			continue;
		}
		do {
			dma_clear_pte(pte);
			start_pfn++;
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);
}

/* free page table pages. last level pte should already be cleared */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	struct dma_pte *first_pte, *pte;
	int total = agaw_to_level(domain->agaw);
	int level;
	unsigned long tmp;

	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
	BUG_ON(start_pfn > last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	level = 2;
	while (level <= total) {
		tmp = align_to_level(start_pfn, level);

		/* If we can't even clear one PTE at this level, we're done */
		if (tmp + level_size(level) - 1 > last_pfn)
			return;

		do {
			first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
			if (!pte) {
				tmp = align_to_level(tmp + 1, level + 1);
				continue;
			}
			do {
				if (dma_pte_present(pte)) {
					free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
					dma_clear_pte(pte);
				}
				pte++;
				tmp += level_size(level);
			} while (!first_pte_in_page(pte) &&
				 tmp + level_size(level) - 1 <= last_pfn);

			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);

		} while (tmp && tmp + level_size(level) - 1 <= last_pfn);
		level++;
	}
	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
	if (!root)
		return -ENOMEM;

	__iommu_flush_cache(iommu, root, ROOT_SIZE);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}

static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	void *addr;
	u32 sts;
	unsigned long flag;

	addr = iommu->root_entry;

	spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

static void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/* return value determines whether we need a write-buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		      dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/* return value determines whether we need a write-buffer flush */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* a global flush doesn't need to set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* Note: always flush non-leaf currently */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
	/*
	 * This is probably meant to be extra-safe. It looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;

	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		      dmar_readq, (!(val & DMA_TLB_IVT)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
			 (unsigned long long)DMA_TLB_IIRG(type),
			 (unsigned long long)DMA_TLB_IAIG(val));
}
static struct device_domain_info *iommu_support_dev_iotlb(
	struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
{
	int found = 0;
	unsigned long flags;
	struct device_domain_info *info;
	struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);

	if (!ecap_dev_iotlb_support(iommu->ecap))
		return NULL;

	if (!iommu->qi)
		return NULL;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link)
		if (info->bus == bus && info->devfn == devfn) {
			found = 1;
			break;
		}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (!found || !info->dev)
		return NULL;

	if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
		return NULL;

	if (!dmar_find_matched_atsr_unit(info->dev))
		return NULL;

	info->iommu = iommu;

	return info;
}

static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
	if (!info)
		return;

	pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
}

static void iommu_disable_dev_iotlb(struct device_domain_info *info)
{
	if (!info->dev || !pci_ats_enabled(info->dev))
		return;

	pci_disable_ats(info->dev);
}

static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	u16 sid, qdep;
	unsigned long flags;
	struct device_domain_info *info;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		if (!info->dev || !pci_ats_enabled(info->dev))
			continue;

		sid = info->bus << 8 | info->devfn;
		qdep = pci_ats_queue_depth(info->dev);
		qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}

static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
				  unsigned long pfn, unsigned int pages)
{
	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;

	BUG_ON(pages == 0);

	/*
	 * Fallback to domain selective flush if no PSI support or the size
	 * is too big.
	 * PSI requires page size to be 2 ^ x, and the base address is
	 * naturally aligned to the size.
	 */
	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
		iommu->flush.flush_iotlb(iommu, did, 0, 0,
					 DMA_TLB_DSI_FLUSH);
	else
		iommu->flush.flush_iotlb(iommu, did, addr, mask,
					 DMA_TLB_PSI_FLUSH);

	/*
	 * In caching mode, domain ID 0 is reserved for non-present to present
	 * mapping flush. Device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || did)
		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
}
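/*
 * Illustrative example (not from the original source): flushing 9 pages
 * gives mask = ilog2(__roundup_pow_of_two(9)) = ilog2(16) = 4, so the
 * PSI request invalidates a naturally aligned 16-page region containing
 * the target range; if the iommu cannot cover mask 4, the code above
 * falls back to flushing the whole domain (DSI).
 */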
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		      readl, !(pmen & DMA_PMEN_PRS), pmen);

	spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static int iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	spin_lock_irqsave(&iommu->register_lock, flags);
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	spin_unlock_irqrestore(&iommu->register_lock, flags);
	return 0;
}

static int iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
	return 0;
}

static int iommu_init_domains(struct intel_iommu *iommu)
{
	unsigned long ndomains;
	unsigned long nlongs;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("Number of Domains supported <%ld>\n", ndomains);
	nlongs = BITS_TO_LONGS(ndomains);

	spin_lock_init(&iommu->lock);

	/* TBD: there might be 64K domains;
	 * consider other allocation for future chips
	 */
	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids) {
		printk(KERN_ERR "Allocating domain id array failed\n");
		return -ENOMEM;
	}
	iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
				 GFP_KERNEL);
	if (!iommu->domains) {
		printk(KERN_ERR "Allocating domain array failed\n");
		return -ENOMEM;
	}

	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain id 0. Hence we need to pre-allocate it.
	 */
	if (cap_caching_mode(iommu->cap))
		set_bit(0, iommu->domain_ids);
	return 0;
}
static void domain_exit(struct dmar_domain *domain);
static void vm_domain_exit(struct dmar_domain *domain);

void free_dmar_iommu(struct intel_iommu *iommu)
{
	struct dmar_domain *domain;
	int i;
	unsigned long flags;

	if ((iommu->domains) && (iommu->domain_ids)) {
		for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
			domain = iommu->domains[i];
			clear_bit(i, iommu->domain_ids);

			spin_lock_irqsave(&domain->iommu_lock, flags);
			if (--domain->iommu_count == 0) {
				if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
					vm_domain_exit(domain);
				else
					domain_exit(domain);
			}
			spin_unlock_irqrestore(&domain->iommu_lock, flags);
		}
	}

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	if (iommu->irq) {
		set_irq_data(iommu->irq, NULL);
		/* This will mask the irq */
		free_irq(iommu->irq, iommu);
		destroy_irq(iommu->irq);
	}

	kfree(iommu->domains);
	kfree(iommu->domain_ids);

	g_iommus[iommu->seq_id] = NULL;

	/* if all iommus are freed, free g_iommus */
	for (i = 0; i < g_num_of_iommus; i++) {
		if (g_iommus[i])
			break;
	}

	if (i == g_num_of_iommus)
		kfree(g_iommus);

	/* free context mapping */
	free_context_table(iommu);
}

static struct dmar_domain *alloc_domain(void)
{
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	domain->nid = -1;
	memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
	domain->flags = 0;

	return domain;
}

static int iommu_attach_domain(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	int num;
	unsigned long ndomains;
	unsigned long flags;

	ndomains = cap_ndoms(iommu->cap);

	spin_lock_irqsave(&iommu->lock, flags);

	num = find_first_zero_bit(iommu->domain_ids, ndomains);
	if (num >= ndomains) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		printk(KERN_ERR "IOMMU: no free domain ids\n");
		return -ENOMEM;
	}

	domain->id = num;
	set_bit(num, iommu->domain_ids);
	set_bit(iommu->seq_id, &domain->iommu_bmp);
	iommu->domains[num] = domain;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}

static void iommu_detach_domain(struct dmar_domain *domain,
				struct intel_iommu *iommu)
{
	unsigned long flags;
	int num, ndomains;
	int found = 0;

	spin_lock_irqsave(&iommu->lock, flags);
	ndomains = cap_ndoms(iommu->cap);
	for_each_set_bit(num, iommu->domain_ids, ndomains) {
		if (iommu->domains[num] == domain) {
			found = 1;
			break;
		}
	}

	if (found) {
		clear_bit(num, iommu->domain_ids);
		clear_bit(iommu->seq_id, &domain->iommu_bmp);
		iommu->domains[num] = NULL;
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
}

static struct iova_domain reserved_iova_list;
static struct lock_class_key reserved_rbtree_key;

static void dmar_init_reserved_ranges(void)
{
	struct pci_dev *pdev = NULL;
	struct iova *iova;
	int i;

	init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);

	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
			  &reserved_rbtree_key);

	/* IOAPIC ranges shouldn't be accessed by DMA */
	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
			    IOVA_PFN(IOAPIC_RANGE_END));
	if (!iova)
		printk(KERN_ERR "Reserve IOAPIC range failed\n");

	/* Reserve all PCI MMIO to avoid peer-to-peer access */
	for_each_pci_dev(pdev) {
		struct resource *r;

		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
			r = &pdev->resource[i];
			if (!r->flags || !(r->flags & IORESOURCE_MEM))
				continue;
			iova = reserve_iova(&reserved_iova_list,
					    IOVA_PFN(r->start),
					    IOVA_PFN(r->end));
			if (!iova)
				printk(KERN_ERR "Reserve iova failed\n");
		}
	}
}

static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}

static int domain_init(struct dmar_domain *domain, int guest_width)
{
	struct intel_iommu *iommu;
	int adjust_width, agaw;
	unsigned long sagaw;

	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
	spin_lock_init(&domain->iommu_lock);

	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	iommu = domain_get_iommu(domain);
	if (guest_width > cap_mgaw(iommu->cap))
		guest_width = cap_mgaw(iommu->cap);
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	agaw = width_to_agaw(adjust_width);
	sagaw = cap_sagaw(iommu->cap);
	if (!test_bit(agaw, &sagaw)) {
		/* hardware doesn't support it, choose a bigger one */
		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
		agaw = find_next_bit(&sagaw, 5, agaw);
		if (agaw >= 5)
			return -ENODEV;
	}
	domain->agaw = agaw;
	INIT_LIST_HEAD(&domain->devices);

	if (ecap_coherent(iommu->ecap))
		domain->iommu_coherency = 1;
	else
		domain->iommu_coherency = 0;

	if (ecap_sc_support(iommu->ecap))
		domain->iommu_snooping = 1;
	else
		domain->iommu_snooping = 0;

	domain->iommu_count = 1;
	domain->nid = iommu->node;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
	return 0;
}

static void domain_exit(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/* Domain 0 is reserved, so don't process it */
	if (!domain)
		return;

	domain_remove_dev_info(domain);
	/* destroy iovas */
	put_iova_domain(&domain->iovad);

	/* clear ptes */
	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	/* free page tables */
	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	for_each_active_iommu(iommu, drhd)
		if (test_bit(iommu->seq_id, &domain->iommu_bmp))
			iommu_detach_domain(domain, iommu);

	free_domain_mem(domain);
}
static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
				      u8 bus, u8 devfn, int translation)
{
	struct context_entry *context;
	unsigned long flags;
	struct intel_iommu *iommu;
	struct dma_pte *pgd;
	unsigned long num;
	unsigned long ndomains;
	int id;
	int agaw;
	struct device_domain_info *info = NULL;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	BUG_ON(!domain->pgd);
	BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
	       translation != CONTEXT_TT_MULTI_LEVEL);

	iommu = device_to_iommu(segment, bus, devfn);
	if (!iommu)
		return -ENODEV;

	context = device_to_context_entry(iommu, bus, devfn);
	if (!context)
		return -ENOMEM;
	spin_lock_irqsave(&iommu->lock, flags);
	if (context_present(context)) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		return 0;
	}

	id = domain->id;
	pgd = domain->pgd;

	if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
	    domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
		int found = 0;

		/* find an available domain id for this device in iommu */
		ndomains = cap_ndoms(iommu->cap);
		for_each_set_bit(num, iommu->domain_ids, ndomains) {
			if (iommu->domains[num] == domain) {
				id = num;
				found = 1;
				break;
			}
		}

		if (found == 0) {
			num = find_first_zero_bit(iommu->domain_ids, ndomains);
			if (num >= ndomains) {
				spin_unlock_irqrestore(&iommu->lock, flags);
				printk(KERN_ERR "IOMMU: no free domain ids\n");
				return -EFAULT;
			}

			set_bit(num, iommu->domain_ids);
			iommu->domains[num] = domain;
			id = num;
		}

		/* Skip top levels of page tables for
		 * an iommu which has a smaller agaw than the default.
		 * Unnecessary for PT mode.
		 */
		if (translation != CONTEXT_TT_PASS_THROUGH) {
			for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd)) {
					spin_unlock_irqrestore(&iommu->lock, flags);
					return -ENOMEM;
				}
			}
		}
	}

	context_set_domain_id(context, id);

	if (translation != CONTEXT_TT_PASS_THROUGH) {
		info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
		translation = info ? CONTEXT_TT_DEV_IOTLB :
				     CONTEXT_TT_MULTI_LEVEL;
	}
	/*
	 * In pass through mode, AW must be programmed to indicate the largest
	 * AGAW value supported by hardware. And ASR is ignored by hardware.
	 */
	if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
		context_set_address_width(context, iommu->msagaw);
	else {
		context_set_address_root(context, virt_to_phys(pgd));
		context_set_address_width(context, iommu->agaw);
	}

	context_set_translation_type(context, translation);
	context_set_fault_enable(context);
	context_set_present(context);
	domain_flush_cache(domain, context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entry we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}
	iommu_enable_dev_iotlb(info);
	spin_unlock_irqrestore(&iommu->lock, flags);

	spin_lock_irqsave(&domain->iommu_lock, flags);
	if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
		domain->iommu_count++;
		if (domain->iommu_count == 1)
			domain->nid = iommu->node;
		domain_update_iommu_cap(domain);
	}
	spin_unlock_irqrestore(&domain->iommu_lock, flags);
	return 0;
}
static int
domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
		       int translation)
{
	int ret;
	struct pci_dev *tmp, *parent;

	ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
					 pdev->bus->number, pdev->devfn,
					 translation);
	if (ret)
		return ret;

	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return 0;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	while (parent != tmp) {
		ret = domain_context_mapping_one(domain,
						 pci_domain_nr(parent->bus),
						 parent->bus->number,
						 parent->devfn, translation);
		if (ret)
			return ret;
		parent = parent->bus->self;
	}
	if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
		return domain_context_mapping_one(domain,
					pci_domain_nr(tmp->subordinate),
					tmp->subordinate->number, 0,
					translation);
	else /* this is a legacy PCI bridge */
		return domain_context_mapping_one(domain,
						  pci_domain_nr(tmp->bus),
						  tmp->bus->number,
						  tmp->devfn,
						  translation);
}

static int domain_context_mapped(struct pci_dev *pdev)
{
	int ret;
	struct pci_dev *tmp, *parent;
	struct intel_iommu *iommu;

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return -ENODEV;

	ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
	if (!ret)
		return ret;
	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return ret;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	while (parent != tmp) {
		ret = device_context_mapped(iommu, parent->bus->number,
					    parent->devfn);
		if (!ret)
			return ret;
		parent = parent->bus->self;
	}
	if (pci_is_pcie(tmp))
		return device_context_mapped(iommu, tmp->subordinate->number,
					     0);
	else
		return device_context_mapped(iommu, tmp->bus->number,
					     tmp->devfn);
}

/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
					    size_t size)
{
	host_addr &= ~PAGE_MASK;
	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}
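/*
 * Illustrative arithmetic (not from the original source): with 4KiB MM
 * pages, aligned_nrpages(0x1800, 0x1000) masks the address down to its
 * in-page offset 0x800, rounds 0x800 + 0x1000 up to 0x2000, and returns
 * 2 VT-d pages. With 64KiB MM pages the same call rounds up to one full
 * MM page and returns 16 VT-d pages.
 */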
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	phys_addr_t uninitialized_var(pteval);
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	unsigned long sg_res;

	BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

	if (sg)
		sg_res = 0;
	else {
		sg_res = nr_pages + 1;
		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
	}

	while (nr_pages--) {
		uint64_t tmp;

		if (!sg_res) {
			sg_res = aligned_nrpages(sg->offset, sg->length);
			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
			sg->dma_length = sg->length;
			pteval = page_to_phys(sg_page(sg)) | prot;
		}
		if (!pte) {
			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
			if (!pte)
				return -ENOMEM;
		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			static int dumps = 5;
			printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
			       iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}
		pte++;
		if (!nr_pages || first_pte_in_page(pte)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}
		iov_pfn++;
		pteval += VTD_PAGE_SIZE;
		sg_res--;
		if (!sg_res)
			sg = sg_next(sg);
	}
	return 0;
}

static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}

static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				     unsigned long phys_pfn, unsigned long nr_pages,
				     int prot)
{
	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
}

static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	if (!iommu)
		return;

	clear_context_table(iommu, bus, devfn);
	iommu->flush.flush_context(iommu, 0, 0, 0,
				   DMA_CCMD_GLOBAL_INVL);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}

static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	unsigned long flags;
	struct intel_iommu *iommu;

	spin_lock_irqsave(&device_domain_lock, flags);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
				  struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
		if (info->dev)
			info->dev->dev.archdata.iommu = NULL;
		spin_unlock_irqrestore(&device_domain_lock, flags);

		iommu_disable_dev_iotlb(info);
		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
		iommu_detach_dev(iommu, info->bus, info->devfn);
		free_devinfo_mem(info);

		spin_lock_irqsave(&device_domain_lock, flags);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
/*
 * find_domain
 * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
 */
static struct dmar_domain *
find_domain(struct pci_dev *pdev)
{
	struct device_domain_info *info;

	/* No lock here, assumes no domain exit in normal case */
	info = pdev->dev.archdata.iommu;
	if (info)
		return info->domain;
	return NULL;
}

/* domain is initialized */
static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
{
	struct dmar_domain *domain, *found = NULL;
	struct intel_iommu *iommu;
	struct dmar_drhd_unit *drhd;
	struct device_domain_info *info, *tmp;
	struct pci_dev *dev_tmp;
	unsigned long flags;
	int bus = 0, devfn = 0;
	int segment;
	int ret;

	domain = find_domain(pdev);
	if (domain)
		return domain;

	segment = pci_domain_nr(pdev->bus);

	dev_tmp = pci_find_upstream_pcie_bridge(pdev);
	if (dev_tmp) {
		if (pci_is_pcie(dev_tmp)) {
			bus = dev_tmp->subordinate->number;
			devfn = 0;
		} else {
			bus = dev_tmp->bus->number;
			devfn = dev_tmp->devfn;
		}
		spin_lock_irqsave(&device_domain_lock, flags);
		list_for_each_entry(info, &device_domain_list, global) {
			if (info->segment == segment &&
			    info->bus == bus && info->devfn == devfn) {
				found = info->domain;
				break;
			}
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);
		/* pcie-pci bridge already has a domain, use it */
		if (found) {
			domain = found;
			goto found_domain;
		}
	}

	domain = alloc_domain();
	if (!domain)
		goto error;

	/* Allocate new domain for the device */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd) {
		printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
		       pci_name(pdev));
		return NULL;
	}
	iommu = drhd->iommu;

	ret = iommu_attach_domain(domain, iommu);
	if (ret) {
		domain_exit(domain);
		goto error;
	}

	if (domain_init(domain, gaw)) {
		domain_exit(domain);
		goto error;
	}

	/* register pcie-to-pci device */
	if (dev_tmp) {
		info = alloc_devinfo_mem();
		if (!info) {
			domain_exit(domain);
			goto error;
		}
		info->segment = segment;
		info->bus = bus;
		info->devfn = devfn;
		info->dev = NULL;
		info->domain = domain;
		/* This domain is shared by devices under p2p bridge */
		domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;

		/* pcie-to-pci bridge already has a domain, use it */
		found = NULL;
		spin_lock_irqsave(&device_domain_lock, flags);
		list_for_each_entry(tmp, &device_domain_list, global) {
			if (tmp->segment == segment &&
			    tmp->bus == bus && tmp->devfn == devfn) {
				found = tmp->domain;
				break;
			}
		}
		if (found) {
			spin_unlock_irqrestore(&device_domain_lock, flags);
			free_devinfo_mem(info);
			domain_exit(domain);
			domain = found;
			goto found_domain;
		}
		list_add(&info->link, &domain->devices);
		list_add(&info->global, &device_domain_list);
		spin_unlock_irqrestore(&device_domain_lock, flags);
	}

found_domain:
	info = alloc_devinfo_mem();
	if (!info)
		goto error;
	info->segment = segment;
	info->bus = pdev->bus->number;
	info->devfn = pdev->devfn;
	info->dev = pdev;
	info->domain = domain;
	spin_lock_irqsave(&device_domain_lock, flags);
	/* somebody is fast */
	found = find_domain(pdev);
	if (found != NULL) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		if (found != domain) {
			domain_exit(domain);
			domain = found;
		}
		free_devinfo_mem(info);
		return domain;
	}
	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	pdev->dev.archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);
	return domain;
error:
	/* recheck it here, maybe others set it */
	return find_domain(pdev);
}
static int iommu_identity_mapping;

#define IDENTMAP_ALL		1
#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4
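/*
 * Note (mine, summarizing later code): iommu_identity_mapping is a
 * policy bitmask. IDENTMAP_ALL is set when pass-through is requested
 * (iommu_pass_through), IDENTMAP_GFX under CONFIG_DMAR_BROKEN_GFX_WA,
 * and IDENTMAP_AZALIA by the Tylersburg isochronous quirk so the
 * integrated audio device keeps a 1:1 mapping.
 */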
static int iommu_domain_identity_map(struct dmar_domain *domain,
				     unsigned long long start,
				     unsigned long long end)
{
	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;

	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
			  dma_to_mm_pfn(last_vpfn))) {
		printk(KERN_ERR "IOMMU: reserve iova failed\n");
		return -ENOMEM;
	}

	pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
		 start, end, domain->id);
	/*
	 * RMRR range might overlap with the physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, first_vpfn, last_vpfn);

	return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
				  last_vpfn - first_vpfn + 1,
				  DMA_PTE_READ|DMA_PTE_WRITE);
}

static int iommu_prepare_identity_map(struct pci_dev *pdev,
				      unsigned long long start,
				      unsigned long long end)
{
	struct dmar_domain *domain;
	int ret;

	domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain)
		return -ENOMEM;

	/* For _hardware_ passthrough, don't bother. But for software
	   passthrough, we do it anyway -- it may indicate a memory
	   range which is reserved in E820, and so didn't get set
	   up to start with in si_domain */
	if (domain == si_domain && hw_pass_through) {
		printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
		       pci_name(pdev), start, end);
		return 0;
	}

	printk(KERN_INFO
	       "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
	       pci_name(pdev), start, end);

	if (end < start) {
		WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
			"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		ret = -EIO;
		goto error;
	}

	if (end >> agaw_to_width(domain->agaw)) {
		WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     agaw_to_width(domain->agaw),
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		ret = -EIO;
		goto error;
	}

	ret = iommu_domain_identity_map(domain, start, end);
	if (ret)
		goto error;

	/* context entry init */
	ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
	if (ret)
		goto error;

	return 0;

error:
	domain_exit(domain);
	return ret;
}

static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
					 struct pci_dev *pdev)
{
	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return 0;
	return iommu_prepare_identity_map(pdev, rmrr->base_address,
					  rmrr->end_address + 1);
}

#ifdef CONFIG_DMAR_FLOPPY_WA
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);

	if (ret)
		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
		       "floppy might not work\n");
}
#else
static inline void iommu_prepare_isa(void)
{
}
#endif /* !CONFIG_DMAR_FLOPPY_WA */
static int md_domain_init(struct dmar_domain *domain, int guest_width);

static int __init si_domain_work_fn(unsigned long start_pfn,
				    unsigned long end_pfn, void *datax)
{
	int *ret = datax;

	*ret = iommu_domain_identity_map(si_domain,
					 (uint64_t)start_pfn << PAGE_SHIFT,
					 (uint64_t)end_pfn << PAGE_SHIFT);
	return *ret;
}

static int __init si_domain_init(int hw)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int nid, ret = 0;

	si_domain = alloc_domain();
	if (!si_domain)
		return -EFAULT;

	pr_debug("Identity mapping domain is domain %d\n", si_domain->id);

	for_each_active_iommu(iommu, drhd) {
		ret = iommu_attach_domain(si_domain, iommu);
		if (ret) {
			domain_exit(si_domain);
			return -EFAULT;
		}
	}

	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		domain_exit(si_domain);
		return -EFAULT;
	}

	si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;

	if (hw)
		return 0;

	for_each_online_node(nid) {
		work_with_active_regions(nid, si_domain_work_fn, &ret);
		if (ret)
			return ret;
	}

	return 0;
}

static void domain_remove_one_dev_info(struct dmar_domain *domain,
				       struct pci_dev *pdev);
static int identity_mapping(struct pci_dev *pdev)
{
	struct device_domain_info *info;

	if (likely(!iommu_identity_mapping))
		return 0;

	list_for_each_entry(info, &si_domain->devices, link)
		if (info->dev == pdev)
			return 1;
	return 0;
}

static int domain_add_dev_info(struct dmar_domain *domain,
			       struct pci_dev *pdev,
			       int translation)
{
	struct device_domain_info *info;
	unsigned long flags;
	int ret;

	info = alloc_devinfo_mem();
	if (!info)
		return -ENOMEM;

	ret = domain_context_mapping(domain, pdev, translation);
	if (ret) {
		free_devinfo_mem(info);
		return ret;
	}

	info->segment = pci_domain_nr(pdev->bus);
	info->bus = pdev->bus->number;
	info->devfn = pdev->devfn;
	info->dev = pdev;
	info->domain = domain;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	pdev->dev.archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}

static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
{
	if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
		return 1;

	if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
		return 1;

	if (!(iommu_identity_mapping & IDENTMAP_ALL))
		return 0;

	/*
	 * We want to start off with all devices in the 1:1 domain, and
	 * take them out later if we find they can't access all of memory.
	 *
	 * However, we can't do this for PCI devices behind bridges,
	 * because all PCI devices behind the same bridge will end up
	 * with the same source-id on their transactions.
	 *
	 * Practically speaking, we can't change things around for these
	 * devices at run-time, because we can't be sure there'll be no
	 * DMA transactions in flight for any of their siblings.
	 *
	 * So PCI devices (unless they're on the root bus) as well as
	 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
	 * the 1:1 domain, just in _case_ one of their siblings turns out
	 * not to be able to map all of memory.
	 */
	if (!pci_is_pcie(pdev)) {
		if (!pci_is_root_bus(pdev->bus))
			return 0;
		if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
			return 0;
	} else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
		return 0;

	/*
	 * At boot time, we don't yet know if devices will be 64-bit capable.
	 * Assume that they will -- if they turn out not to be, then we can
	 * take them out of the 1:1 domain later.
	 */
	if (!startup)
		return pdev->dma_mask > DMA_BIT_MASK(32);

	return 1;
}
static int __init iommu_prepare_static_identity_mapping(int hw)
{
	struct pci_dev *pdev = NULL;
	int ret;

	ret = si_domain_init(hw);
	if (ret)
		return -EFAULT;

	for_each_pci_dev(pdev) {
		if (iommu_should_identity_map(pdev, 1)) {
			printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
			       hw ? "hardware" : "software", pci_name(pdev));

			ret = domain_add_dev_info(si_domain, pdev,
						  hw ? CONTEXT_TT_PASS_THROUGH :
						       CONTEXT_TT_MULTI_LEVEL);
			if (ret)
				return ret;
		}
	}

	return 0;
}

int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct dmar_rmrr_unit *rmrr;
	struct pci_dev *pdev;
	struct intel_iommu *iommu;
	int i, ret;

	/*
	 * for each drhd
	 *    allocate root
	 *    initialize and program root entry to not present
	 * endfor
	 */
	for_each_drhd_unit(drhd) {
		g_num_of_iommus++;
		/*
		 * lock not needed as this is only incremented in the single
		 * threaded kernel __init code path; all other accesses are
		 * read-only.
		 */
	}

	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
			   GFP_KERNEL);
	if (!g_iommus) {
		printk(KERN_ERR "Allocating global iommu array failed\n");
		ret = -ENOMEM;
		goto error;
	}

	deferred_flush = kzalloc(g_num_of_iommus *
		sizeof(struct deferred_flush_tables), GFP_KERNEL);
	if (!deferred_flush) {
		ret = -ENOMEM;
		goto error;
	}

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;

		iommu = drhd->iommu;
		g_iommus[iommu->seq_id] = iommu;

		ret = iommu_init_domains(iommu);
		if (ret)
			goto error;

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMUs. Need to split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret) {
			printk(KERN_ERR "IOMMU: allocate root entry failed\n");
			goto error;
		}
		if (!ecap_pass_through(iommu->ecap))
			hw_pass_through = 0;
	}

	/*
	 * Start from a sane iommu hardware state.
	 */
	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;

		iommu = drhd->iommu;
		/*
		 * If the queued invalidation is already initialized by us
		 * (for example, while enabling interrupt-remapping) then
		 * we got the things already rolling from a sane state.
		 */
		if (iommu->qi)
			continue;

		/*
		 * Clear any previous faults.
		 */
		dmar_fault(-1, iommu);
		/*
		 * Disable queued invalidation if supported and already enabled
		 * before OS handover.
		 */
		dmar_disable_qi(iommu);
	}
	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;

		iommu = drhd->iommu;

		if (dmar_enable_qi(iommu)) {
			/*
			 * Queued Invalidate not enabled, use Register Based
			 * Invalidate
			 */
			iommu->flush.flush_context = __iommu_flush_context;
			iommu->flush.flush_iotlb = __iommu_flush_iotlb;
			printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
			       "invalidation\n",
			       (unsigned long long)drhd->reg_base_addr);
		} else {
			iommu->flush.flush_context = qi_flush_context;
			iommu->flush.flush_iotlb = qi_flush_iotlb;
			printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
			       "invalidation\n",
			       (unsigned long long)drhd->reg_base_addr);
		}
	}

	if (iommu_pass_through)
		iommu_identity_mapping |= IDENTMAP_ALL;

#ifdef CONFIG_DMAR_BROKEN_GFX_WA
	iommu_identity_mapping |= IDENTMAP_GFX;
#endif

	check_tylersburg_isoch();

	/*
	 * If pass through is not set or not enabled, setup context entries for
	 * identity mappings for rmrr, gfx, and isa, and may fall back to static
	 * identity mapping if iommu_identity_mapping is set.
	 */
	if (iommu_identity_mapping) {
		ret = iommu_prepare_static_identity_mapping(hw_pass_through);
		if (ret) {
			printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
			goto error;
		}
	}
	/*
	 * For each rmrr
	 *   for each dev attached to rmrr
	 *   do
	 *     locate drhd for dev, alloc domain for dev
	 *     allocate free domain
	 *     allocate page table entries for rmrr
	 *     if context not allocated for bus
	 *           allocate and init context
	 *           set present in root table for this bus
	 *     init context with domain, translation etc
	 *   endfor
	 * endfor
	 */
	printk(KERN_INFO "IOMMU: Setting RMRR:\n");
	for_each_rmrr_units(rmrr) {
		for (i = 0; i < rmrr->devices_cnt; i++) {
			pdev = rmrr->devices[i];
			/*
			 * some BIOSes list non-existent devices in the DMAR
			 * table; just skip them.
			 */
			if (!pdev)
				continue;
			ret = iommu_prepare_rmrr_dev(rmrr, pdev);
			if (ret)
				printk(KERN_ERR
				       "IOMMU: mapping reserved region failed\n");
		}
	}

	iommu_prepare_isa();
2391 * global invalidate context cache
2392 * global invalidate iotlb
2393 * enable translation
2395 for_each_drhd_unit(drhd) {
2398 iommu = drhd->iommu;
2400 iommu_flush_write_buffer(iommu);
2402 ret = dmar_set_interrupt(iommu);
2406 iommu_set_root_entry(iommu);
2408 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2409 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2411 ret = iommu_enable_translation(iommu);
2415 iommu_disable_protect_mem_regions(iommu);
2420 for_each_drhd_unit(drhd) {
2423 iommu = drhd->iommu;
2430 /* This takes a number of _MM_ pages, not VTD pages */
2431 static struct iova *intel_alloc_iova(struct device *dev,
2432 struct dmar_domain *domain,
2433 unsigned long nrpages, uint64_t dma_mask)
2435 struct pci_dev *pdev = to_pci_dev(dev);
2436 struct iova *iova = NULL;
2438 /* Restrict dma_mask to the width that the iommu can handle */
2439 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2441 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2443 * First try to allocate an io virtual address in
2444 * DMA_BIT_MASK(32) and if that fails then try allocating
2447 iova = alloc_iova(&domain->iovad, nrpages,
2448 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2452 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2453 if (unlikely(!iova)) {
2454 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2455 nrpages, pci_name(pdev));
2462 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2464 struct dmar_domain *domain;
2467 domain = get_domain_for_dev(pdev,
2468 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2471 "Allocating domain for %s failed", pci_name(pdev));
2475 /* make sure context mapping is ok */
2476 if (unlikely(!domain_context_mapped(pdev))) {
2477 ret = domain_context_mapping(domain, pdev,
2478 CONTEXT_TT_MULTI_LEVEL);
2481 "Domain context map for %s failed",
2490 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2492 struct device_domain_info *info;
2494 /* No lock here, assumes no domain exit in normal case */
2495 info = dev->dev.archdata.iommu;
2497 return info->domain;
2499 return __get_valid_domain_for_dev(dev);
static int iommu_dummy(struct pci_dev *pdev)
{
	return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
}
/* Check if the pdev needs to go through the non-identity map and unmap process. */
static int iommu_no_mapping(struct device *dev)
{
	struct pci_dev *pdev;
	int found;

	if (unlikely(dev->bus != &pci_bus_type))
		return 1;

	pdev = to_pci_dev(dev);
	if (iommu_dummy(pdev))
		return 1;

	if (!iommu_identity_mapping)
		return 0;

	found = identity_mapping(pdev);
	if (found) {
		if (iommu_should_identity_map(pdev, 0))
			return 1;
		/*
		 * The 32-bit device is removed from si_domain and falls
		 * back to non-identity mapping.
		 */
		domain_remove_one_dev_info(si_domain, pdev);
		printk(KERN_INFO "32bit %s uses non-identity mapping\n",
		       pci_name(pdev));
		return 0;
	}

	/*
	 * In case a 64-bit DMA device was detached from a vm, the device
	 * is put back into si_domain for identity mapping.
	 */
	if (iommu_should_identity_map(pdev, 0)) {
		int ret;
		ret = domain_add_dev_info(si_domain, pdev,
					  hw_pass_through ?
					  CONTEXT_TT_PASS_THROUGH :
					  CONTEXT_TT_MULTI_LEVEL);
		if (!ret) {
			printk(KERN_INFO "64bit %s uses identity mapping\n",
			       pci_name(pdev));
			return 1;
		}
	}

	return 0;
}
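
/*
 * A non-zero return from iommu_no_mapping() means the device bypasses DMA
 * remapping entirely (identity-mapped, or not handled by VT-d at all), so
 * the streaming DMA entry points below simply hand back the physical
 * address. A zero return means the device must go through the IOVA
 * allocator and the domain page tables.
 */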
static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
				     size_t size, int dir, u64 dma_mask)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	phys_addr_t start_paddr;
	struct iova *iova;
	int prot = 0;
	int ret;
	struct intel_iommu *iommu;
	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;

	BUG_ON(dir == DMA_NONE);

	if (iommu_no_mapping(hwdev))
		return paddr;

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);
	size = aligned_nrpages(paddr, size);

	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
				dma_mask);
	if (!iova)
		goto error;

	/*
	 * Check if DMAR supports zero-length reads on write-only
	 * mappings.
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
	    !cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * paddr..(paddr + size) may span partial pages; we map whole pages.
	 * Note: if two parts of one page are mapped separately, we may have
	 * two guest addresses mapping to the same host paddr, but that is
	 * not a big problem.
	 */
	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
				 mm_to_dma_pfn(paddr_pfn), size, prot);
	if (ret)
		goto error;

	/* It's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, 0, mm_to_dma_pfn(iova->pfn_lo), size);
	else
		iommu_flush_write_buffer(iommu);

	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
	start_paddr += paddr & ~PAGE_MASK;
	return start_paddr;

error:
	if (iova)
		__free_iova(&domain->iovad, iova);
	printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
	       pci_name(pdev), size, (unsigned long long)paddr, dir);
	return 0;
}
static dma_addr_t intel_map_page(struct device *dev, struct page *page,
				 unsigned long offset, size_t size,
				 enum dma_data_direction dir,
				 struct dma_attrs *attrs)
{
	return __intel_map_single(dev, page_to_phys(page) + offset, size,
				  dir, to_pci_dev(dev)->dma_mask);
}
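
/*
 * Illustrative only: a PCI driver never calls intel_map_page() directly;
 * it reaches it through the generic DMA API once intel_dma_ops has been
 * installed. A minimal sketch (the device and buffer are hypothetical):
 *
 *	dma_addr_t handle = dma_map_page(&pdev->dev, page, 0,
 *					 PAGE_SIZE, DMA_TO_DEVICE);
 *	if (dma_mapping_error(&pdev->dev, handle))
 *		return -EIO;
 *	... program the device with 'handle' ...
 *	dma_unmap_page(&pdev->dev, handle, PAGE_SIZE, DMA_TO_DEVICE);
 */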
static void flush_unmaps(void)
{
	int i, j;

	timer_on = 0;

	/* just flush them all */
	for (i = 0; i < g_num_of_iommus; i++) {
		struct intel_iommu *iommu = g_iommus[i];
		if (!iommu)
			continue;

		if (!deferred_flush[i].next)
			continue;

		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
		for (j = 0; j < deferred_flush[i].next; j++) {
			unsigned long mask;
			struct iova *iova = deferred_flush[i].iova[j];

			mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
			iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
					(uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
		}
		deferred_flush[i].next = 0;
	}

	list_size = 0;
}
static void flush_unmaps_timeout(unsigned long data)
{
	unsigned long flags;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	flush_unmaps();
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
static void add_unmap(struct dmar_domain *dom, struct iova *iova)
{
	unsigned long flags;
	int next, iommu_id;
	struct intel_iommu *iommu;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	if (list_size == HIGH_WATER_MARK)
		flush_unmaps();

	iommu = domain_get_iommu(dom);
	iommu_id = iommu->seq_id;

	next = deferred_flush[iommu_id].next;
	deferred_flush[iommu_id].domain[next] = dom;
	deferred_flush[iommu_id].iova[next] = iova;
	deferred_flush[iommu_id].next++;

	if (!timer_on) {
		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
		timer_on = 1;
	}
	list_size++;
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
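
/*
 * The deferred-unmap path batches IOTLB invalidations: entries queue up
 * per IOMMU until either the list hits HIGH_WATER_MARK (flushed inline
 * above) or the 10ms unmap_timer fires, so one global IOTLB flush can
 * retire many unmaps instead of paying for a flush per dma_unmap_*()
 * call. The trade-off is a short window in which stale translations
 * remain valid; the intel_iommu_strict path in intel_unmap_page() below
 * flushes synchronously instead.
 */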
static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
			     size_t size, enum dma_data_direction dir,
			     struct dma_attrs *attrs)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	struct iova *iova;
	struct intel_iommu *iommu;

	if (iommu_no_mapping(dev))
		return;

	domain = find_domain(pdev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
	if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
		      (unsigned long long)dev_addr))
		return;

	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;

	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
		 pci_name(pdev), start_pfn, last_pfn);

	/* clear the whole page */
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* free page tables */
	dma_pte_free_pagetable(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
				      last_pfn - start_pfn + 1);
		/* free iova */
		__free_iova(&domain->iovad, iova);
	} else {
		add_unmap(domain, iova);
		/*
		 * Queue up the release of the unmap to save the roughly
		 * 1/6th of the cpu otherwise used up by the iotlb flush
		 * operation.
		 */
	}
}
static void *intel_alloc_coherent(struct device *hwdev, size_t size,
				  dma_addr_t *dma_handle, gfp_t flags)
{
	void *vaddr;
	int order;

	size = PAGE_ALIGN(size);
	order = get_order(size);

	if (!iommu_no_mapping(hwdev))
		flags &= ~(GFP_DMA | GFP_DMA32);
	else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
		if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
			flags |= GFP_DMA;
		else
			flags |= GFP_DMA32;
	}

	vaddr = (void *)__get_free_pages(flags, order);
	if (!vaddr)
		return NULL;
	memset(vaddr, 0, size);

	*dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
					 DMA_BIDIRECTIONAL,
					 hwdev->coherent_dma_mask);
	if (*dma_handle)
		return vaddr;
	free_pages((unsigned long)vaddr, order);
	return NULL;
}
static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
				dma_addr_t dma_handle)
{
	int order;

	size = PAGE_ALIGN(size);
	order = get_order(size);

	intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
	free_pages((unsigned long)vaddr, order);
}
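
/*
 * Illustrative only: drivers obtain coherent memory through the generic
 * DMA API, which dispatches to the two functions above. A minimal,
 * hypothetical sketch:
 *
 *	dma_addr_t handle;
 *	void *ring = dma_alloc_coherent(&pdev->dev, 4096, &handle,
 *					GFP_KERNEL);
 *	if (!ring)
 *		return -ENOMEM;
 *	... tell the device the ring lives at 'handle' ...
 *	dma_free_coherent(&pdev->dev, 4096, ring, handle);
 */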
static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
			   int nelems, enum dma_data_direction dir,
			   struct dma_attrs *attrs)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	struct iova *iova;
	struct intel_iommu *iommu;

	if (iommu_no_mapping(hwdev))
		return;

	domain = find_domain(pdev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
	if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
		      (unsigned long long)sglist[0].dma_address))
		return;

	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;

	/* clear the whole page */
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* free page tables */
	dma_pte_free_pagetable(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
				      last_pfn - start_pfn + 1);
		/* free iova */
		__free_iova(&domain->iovad, iova);
	} else {
		add_unmap(domain, iova);
		/*
		 * Queue up the release of the unmap to save the roughly
		 * 1/6th of the cpu otherwise used up by the iotlb flush
		 * operation.
		 */
	}
}
static int intel_nontranslate_map_sg(struct device *hwdev,
	struct scatterlist *sglist, int nelems, int dir)
{
	int i;
	struct scatterlist *sg;

	for_each_sg(sglist, sg, nelems, i) {
		BUG_ON(!sg_page(sg));
		sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
		sg->dma_length = sg->length;
	}
	return nelems;
}
static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
			enum dma_data_direction dir, struct dma_attrs *attrs)
{
	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	size_t size = 0;
	int prot = 0;
	size_t offset_pfn = 0;
	struct iova *iova = NULL;
	int ret;
	struct scatterlist *sg;
	unsigned long start_vpfn;
	struct intel_iommu *iommu;

	BUG_ON(dir == DMA_NONE);
	if (iommu_no_mapping(hwdev))
		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);

	for_each_sg(sglist, sg, nelems, i)
		size += aligned_nrpages(sg->offset, sg->length);

	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
				pdev->dma_mask);
	if (!iova) {
		sglist->dma_length = 0;
		return 0;
	}

	/*
	 * Check if DMAR supports zero-length reads on write-only
	 * mappings.
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
	    !cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	start_vpfn = mm_to_dma_pfn(iova->pfn_lo);

	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
	if (unlikely(ret)) {
		/* clear the page */
		dma_pte_clear_range(domain, start_vpfn,
				    start_vpfn + size - 1);
		/* free page tables */
		dma_pte_free_pagetable(domain, start_vpfn,
				       start_vpfn + size - 1);
		/* free iova */
		__free_iova(&domain->iovad, iova);
		return 0;
	}

	/* It's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, 0, start_vpfn, offset_pfn);
	else
		iommu_flush_write_buffer(iommu);

	return nelems;
}
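
/*
 * Illustrative only: the scatter-gather pair above is reached through the
 * generic DMA API. A minimal, hypothetical sketch for an 'nents'-entry
 * table that was set up with sg_init_table()/sg_set_page():
 *
 *	int mapped = dma_map_sg(&pdev->dev, sgl, nents, DMA_FROM_DEVICE);
 *	if (!mapped)
 *		return -EIO;
 *	... hand sg_dma_address()/sg_dma_len() of each entry to the device ...
 *	dma_unmap_sg(&pdev->dev, sgl, nents, DMA_FROM_DEVICE);
 */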
static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
	return !dma_addr;
}
struct dma_map_ops intel_dma_ops = {
	.alloc_coherent = intel_alloc_coherent,
	.free_coherent = intel_free_coherent,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
	.map_page = intel_map_page,
	.unmap_page = intel_unmap_page,
	.mapping_error = intel_mapping_error,
};
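
/*
 * This ops table becomes the system-wide 'dma_ops' in intel_iommu_init()
 * below once VT-d translation is up, so every dma_map_*()/dma_alloc_*()
 * call from that point on is routed through the functions above rather
 * than the swiotlb or nommu implementations.
 */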
static inline int iommu_domain_cache_init(void)
{
	int ret = 0;

	iommu_domain_cache = kmem_cache_create("iommu_domain",
					       sizeof(struct dmar_domain),
					       0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_domain_cache) {
		printk(KERN_ERR "Couldn't create iommu_domain cache\n");
		ret = -ENOMEM;
	}
	return ret;
}

static inline int iommu_devinfo_cache_init(void)
{
	int ret = 0;

	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
						sizeof(struct device_domain_info),
						0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_devinfo_cache) {
		printk(KERN_ERR "Couldn't create devinfo cache\n");
		ret = -ENOMEM;
	}
	return ret;
}

static inline int iommu_iova_cache_init(void)
{
	int ret = 0;

	iommu_iova_cache = kmem_cache_create("iommu_iova",
					     sizeof(struct iova),
					     0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_iova_cache) {
		printk(KERN_ERR "Couldn't create iova cache\n");
		ret = -ENOMEM;
	}
	return ret;
}
static int __init iommu_init_mempool(void)
{
	int ret;

	ret = iommu_iova_cache_init();
	if (ret)
		return ret;

	ret = iommu_domain_cache_init();
	if (ret)
		goto domain_error;

	ret = iommu_devinfo_cache_init();
	if (!ret)
		return ret;

	kmem_cache_destroy(iommu_domain_cache);
domain_error:
	kmem_cache_destroy(iommu_iova_cache);

	return -ENOMEM;
}
static void __init iommu_exit_mempool(void)
{
	kmem_cache_destroy(iommu_devinfo_cache);
	kmem_cache_destroy(iommu_domain_cache);
	kmem_cache_destroy(iommu_iova_cache);
}
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			int i;
			for (i = 0; i < drhd->devices_cnt; i++)
				if (drhd->devices[i] != NULL)
					break;
			/* ignore DMAR unit if no pci devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	if (dmar_map_gfx)
		return;

	for_each_drhd_unit(drhd) {
		int i;
		if (drhd->ignored || drhd->include_all)
			continue;

		for (i = 0; i < drhd->devices_cnt; i++)
			if (drhd->devices[i] &&
			    !IS_GFX_DEVICE(drhd->devices[i]))
				break;

		if (i < drhd->devices_cnt)
			continue;

		/* bypass the IOMMU if it is just for gfx devices */
		drhd->ignored = 1;
		for (i = 0; i < drhd->devices_cnt; i++) {
			if (!drhd->devices[i])
				continue;
			drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
		}
	}
}
#ifdef CONFIG_SUSPEND
static int init_iommu_hw(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;

	for_each_active_iommu(iommu, drhd)
		if (iommu->qi)
			dmar_reenable_qi(iommu);

	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);

		iommu_set_root_entry(iommu);

		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
		iommu_enable_translation(iommu);
		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;
}
static void iommu_flush_all(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	for_each_active_iommu(iommu, drhd) {
		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
	}
}
static int iommu_suspend(struct sys_device *dev, pm_message_t state)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	for_each_active_iommu(iommu, drhd) {
		iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
					     GFP_ATOMIC);
		if (!iommu->iommu_state)
			goto nomem;
	}

	iommu_flush_all();

	for_each_active_iommu(iommu, drhd) {
		iommu_disable_translation(iommu);

		spin_lock_irqsave(&iommu->register_lock, flag);

		iommu->iommu_state[SR_DMAR_FECTL_REG] =
			readl(iommu->reg + DMAR_FECTL_REG);
		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
			readl(iommu->reg + DMAR_FEDATA_REG);
		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
			readl(iommu->reg + DMAR_FEADDR_REG);
		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
			readl(iommu->reg + DMAR_FEUADDR_REG);

		spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
	return 0;

nomem:
	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);

	return -ENOMEM;
}
static int iommu_resume(struct sys_device *dev)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	if (init_iommu_hw()) {
		WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
		return -EIO;
	}

	for_each_active_iommu(iommu, drhd) {

		spin_lock_irqsave(&iommu->register_lock, flag);

		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
		       iommu->reg + DMAR_FECTL_REG);
		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
		       iommu->reg + DMAR_FEDATA_REG);
		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
		       iommu->reg + DMAR_FEADDR_REG);
		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
		       iommu->reg + DMAR_FEUADDR_REG);

		spin_unlock_irqrestore(&iommu->register_lock, flag);
	}

	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);

	return 0;
}
static struct sysdev_class iommu_sysclass = {
	.name		= "iommu",
	.resume		= iommu_resume,
	.suspend	= iommu_suspend,
};

static struct sys_device device_iommu = {
	.cls	= &iommu_sysclass,
};

static int __init init_iommu_sysfs(void)
{
	int error;

	error = sysdev_class_register(&iommu_sysclass);
	if (error)
		return error;

	error = sysdev_register(&device_iommu);
	if (error)
		sysdev_class_unregister(&iommu_sysclass);

	return error;
}

#else
static int __init init_iommu_sysfs(void)
{
	return 0;
}
#endif	/* CONFIG_SUSPEND */
/*
 * Here we only respond to a device being unbound from its driver.
 *
 * A newly added device is not attached to its DMAR domain here yet; that
 * happens when the device is first mapped to an iova.
 */
static int device_notifier(struct notifier_block *nb,
			   unsigned long action, void *data)
{
	struct device *dev = data;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct dmar_domain *domain;

	if (iommu_no_mapping(dev))
		return 0;

	domain = find_domain(pdev);
	if (!domain)
		return 0;

	if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through)
		domain_remove_one_dev_info(domain, pdev);

	return 0;
}

static struct notifier_block device_nb = {
	.notifier_call = device_notifier,
};
int __init intel_iommu_init(void)
{
	int ret = 0;
	int force_on = 0;

	/* VT-d is required for a TXT/tboot launch, so enforce that */
	force_on = tboot_force_iommu();

	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		return -ENODEV;
	}

	if (dmar_dev_scope_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		return -ENODEV;
	}

	/*
	 * Check the need for DMA-remapping initialization now.
	 * The initialization above will also be used by Interrupt-remapping.
	 */
	if (no_iommu || dmar_disabled)
		return -ENODEV;

	iommu_init_mempool();
	dmar_init_reserved_ranges();

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		printk(KERN_ERR "IOMMU: dmar init failed\n");
		put_iova_domain(&reserved_iova_list);
		iommu_exit_mempool();
		return ret;
	}
	printk(KERN_INFO
	       "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");

	init_timer(&unmap_timer);
#ifdef CONFIG_SWIOTLB
	swiotlb = 0;
#endif
	dma_ops = &intel_dma_ops;

	init_iommu_sysfs();

	register_iommu(&intel_iommu_ops);

	bus_register_notifier(&pci_bus_type, &device_nb);

	return 0;
}
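
/*
 * For reference, the flags tested in this driver are typically set from
 * the kernel command line (assumed parameter names, inferred from the
 * variables used in this file): dmar_disabled via "intel_iommu=off",
 * intel_iommu_strict via "intel_iommu=strict", dmar_forcedac via
 * "intel_iommu=forcedac", and iommu_pass_through via "iommu=pt".
 */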
static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
					   struct pci_dev *pdev)
{
	struct pci_dev *tmp, *parent;

	if (!iommu || !pdev)
		return;

	/* dependent device detach */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	/* Secondary interface's bus number and devfn 0 */
	if (tmp) {
		parent = pdev->bus->self;
		while (parent != tmp) {
			iommu_detach_dev(iommu, parent->bus->number,
					 parent->devfn);
			parent = parent->bus->self;
		}
		if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
			iommu_detach_dev(iommu,
					 tmp->subordinate->number, 0);
		else /* this is a legacy PCI bridge */
			iommu_detach_dev(iommu, tmp->bus->number,
					 tmp->devfn);
	}
}
static void domain_remove_one_dev_info(struct dmar_domain *domain,
				       struct pci_dev *pdev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;
	int found = 0;
	struct list_head *entry, *tmp;

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_safe(entry, tmp, &domain->devices) {
		info = list_entry(entry, struct device_domain_info, link);
		/* No need to compare PCI domain; it has to be the same */
		if (info->bus == pdev->bus->number &&
		    info->devfn == pdev->devfn) {
			list_del(&info->link);
			list_del(&info->global);
			if (info->dev)
				info->dev->dev.archdata.iommu = NULL;
			spin_unlock_irqrestore(&device_domain_lock, flags);

			iommu_disable_dev_iotlb(info);
			iommu_detach_dev(iommu, info->bus, info->devfn);
			iommu_detach_dependent_devices(iommu, pdev);
			free_devinfo_mem(info);

			spin_lock_irqsave(&device_domain_lock, flags);

			if (found)
				break;
			else
				continue;
		}

		/*
		 * If there are no other devices under the same iommu owned
		 * by this domain, clear this iommu in iommu_bmp and update
		 * the iommu count and coherency.
		 */
		if (iommu == device_to_iommu(info->segment, info->bus,
					     info->devfn))
			found = 1;
	}

	if (found == 0) {
		unsigned long tmp_flags;
		spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
		clear_bit(iommu->seq_id, &domain->iommu_bmp);
		domain->iommu_count--;
		domain_update_iommu_cap(domain);
		spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
	}

	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags1, flags2;

	spin_lock_irqsave(&device_domain_lock, flags1);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
				  struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
		if (info->dev)
			info->dev->dev.archdata.iommu = NULL;

		spin_unlock_irqrestore(&device_domain_lock, flags1);

		iommu_disable_dev_iotlb(info);
		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
		iommu_detach_dev(iommu, info->bus, info->devfn);
		iommu_detach_dependent_devices(iommu, info->dev);

		/* clear this iommu in iommu_bmp, update iommu count
		 * and capabilities
		 */
		spin_lock_irqsave(&domain->iommu_lock, flags2);
		if (test_and_clear_bit(iommu->seq_id,
				       &domain->iommu_bmp)) {
			domain->iommu_count--;
			domain_update_iommu_cap(domain);
		}
		spin_unlock_irqrestore(&domain->iommu_lock, flags2);

		free_devinfo_mem(info);
		spin_lock_irqsave(&device_domain_lock, flags1);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags1);
}
/* domain ids for virtual machines; they are never written into a context entry */
static unsigned long vm_domid;

static int vm_domain_min_agaw(struct dmar_domain *domain)
{
	int i;
	int min_agaw = domain->agaw;

	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
		if (min_agaw > g_iommus[i]->agaw)
			min_agaw = g_iommus[i]->agaw;
	}

	return min_agaw;
}
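
/*
 * Example: if a VM domain spans two IOMMUs whose agaw values are 2 and 3
 * (agaw_to_width() gives 48 and 57 bits respectively), the function above
 * returns 2, since every mapping must fit the least-capable unit.
 */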
static struct dmar_domain *iommu_alloc_vm_domain(void)
{
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	domain->id = vm_domid++;
	domain->nid = -1;
	memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
	domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;

	return domain;
}
static int md_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
	spin_lock_init(&domain->iommu_lock);

	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	INIT_LIST_HEAD(&domain->devices);

	domain->iommu_count = 0;
	domain->iommu_coherency = 0;
	domain->iommu_snooping = 0;
	domain->max_addr = 0;
	domain->nid = -1;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}
static void iommu_free_vm_domain(struct dmar_domain *domain)
{
	unsigned long flags;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	unsigned long i;
	unsigned long ndomains;

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;

		ndomains = cap_ndoms(iommu->cap);
		for_each_set_bit(i, iommu->domain_ids, ndomains) {
			if (iommu->domains[i] == domain) {
				spin_lock_irqsave(&iommu->lock, flags);
				clear_bit(i, iommu->domain_ids);
				iommu->domains[i] = NULL;
				spin_unlock_irqrestore(&iommu->lock, flags);
				break;
			}
		}
	}
}
static void vm_domain_exit(struct dmar_domain *domain)
{
	/* Domain 0 is reserved, so don't process it */
	if (!domain)
		return;

	vm_domain_remove_all_dev_info(domain);
	/* destroy iovas */
	put_iova_domain(&domain->iovad);

	/* clear ptes */
	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	/* free page tables */
	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	iommu_free_vm_domain(domain);
	free_domain_mem(domain);
}
static int intel_iommu_domain_init(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain;

	dmar_domain = iommu_alloc_vm_domain();
	if (!dmar_domain) {
		printk(KERN_ERR
		       "intel_iommu_domain_init: dmar_domain == NULL\n");
		return -ENOMEM;
	}
	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		printk(KERN_ERR
		       "intel_iommu_domain_init() failed\n");
		vm_domain_exit(dmar_domain);
		return -ENOMEM;
	}
	domain->priv = dmar_domain;

	return 0;
}

static void intel_iommu_domain_destroy(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = domain->priv;

	domain->priv = NULL;
	vm_domain_exit(dmar_domain);
}
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct intel_iommu *iommu;
	int addr_width;
	u64 end;

	/* normally pdev is not mapped */
	if (unlikely(domain_context_mapped(pdev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(pdev);
		if (old_domain) {
			if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
			    dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
				domain_remove_one_dev_info(old_domain, pdev);
			else
				domain_remove_dev_info(old_domain);
		}
	}

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu's agaw is sufficient for the max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	end = DOMAIN_MAX_ADDR(addr_width);
	end = end & VTD_PAGE_MASK;
	if (end < dmar_domain->max_addr) {
		printk(KERN_ERR "%s: iommu agaw (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, iommu->agaw, dmar_domain->max_addr);
		return -EFAULT;
	}

	return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
}
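
/*
 * Illustrative only: a VMM-side user (e.g. KVM device assignment) reaches
 * the attach/detach pair above through the generic IOMMU API. A minimal,
 * hypothetical sketch:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc();
 *	if (!dom)
 *		return -ENOMEM;
 *	if (iommu_attach_device(dom, &pdev->dev))
 *		goto out_free;
 *	... map guest memory, run the guest ...
 *	iommu_detach_device(dom, &pdev->dev);
 * out_free:
 *	iommu_domain_free(dom);
 */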
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);

	domain_remove_one_dev_info(dmar_domain, pdev);
}
static int intel_iommu_map_range(struct iommu_domain *domain,
				 unsigned long iova, phys_addr_t hpa,
				 size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = domain->priv;
	u64 max_addr;
	int addr_width;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		int min_agaw;
		u64 end;

		/* check if the minimum agaw is sufficient for the mapped address */
		min_agaw = vm_domain_min_agaw(dmar_domain);
		addr_width = agaw_to_width(min_agaw);
		end = DOMAIN_MAX_ADDR(addr_width);
		end = end & VTD_PAGE_MASK;
		if (end < max_addr) {
			printk(KERN_ERR "%s: iommu agaw (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, min_agaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to the next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
static void intel_iommu_unmap_range(struct iommu_domain *domain,
				    unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = domain->priv;

	if (!size)
		return;

	dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
			    (iova + size - 1) >> VTD_PAGE_SHIFT);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;
}
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    unsigned long iova)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct dma_pte *pte;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}

static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
				      unsigned long cap)
{
	struct dmar_domain *dmar_domain = domain->priv;

	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return dmar_domain->iommu_snooping;

	return 0;
}
static struct iommu_ops intel_iommu_ops = {
	.domain_init = intel_iommu_domain_init,
	.domain_destroy = intel_iommu_domain_destroy,
	.attach_dev = intel_iommu_attach_device,
	.detach_dev = intel_iommu_detach_device,
	.map = intel_iommu_map_range,
	.unmap = intel_iommu_unmap_range,
	.iova_to_phys = intel_iommu_iova_to_phys,
	.domain_has_cap = intel_iommu_domain_has_cap,
};
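
/*
 * Illustrative only: callers use the generic wrappers, which dispatch
 * through the ops table above. A minimal, hypothetical sketch for an
 * already-attached domain:
 *
 *	if (iommu_map_range(dom, iova, hpa, PAGE_SIZE,
 *			    IOMMU_READ | IOMMU_WRITE))
 *		return -EFAULT;
 *	WARN_ON(iommu_iova_to_phys(dom, iova) != hpa);
 *	if (iommu_domain_has_cap(dom, IOMMU_CAP_CACHE_COHERENCY))
 *		; // mappings may additionally request IOMMU_CACHE
 *	iommu_unmap_range(dom, iova, PAGE_SIZE);
 */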
static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it:
	 */
	printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
	       vtisochctrl);
}