drivers/iommu/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48
49 #define ROOT_SIZE               VTD_PAGE_SIZE
50 #define CONTEXT_SIZE            VTD_PAGE_SIZE
51
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63
64 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
65 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66
67 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
68    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
69 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
70                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
71 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
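
/*
 * Worked example (illustrative): with gaw == 48 and VTD_PAGE_SHIFT == 12,
 * __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1 == 0xFFFFFFFFF.  A 64-bit kernel
 * keeps that value for DOMAIN_MAX_PFN(48); a 32-bit kernel clamps it to
 * ULONG_MAX so PFN arithmetic still fits in an unsigned long.  The matching
 * byte address, DOMAIN_MAX_ADDR(48), is 0xFFFFFFFFF000.
 */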
72
73 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
74 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
75 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
76
77 /* page table handling */
78 #define LEVEL_STRIDE            (9)
79 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
80
81 /*
82  * This bitmap is used to advertise the page sizes our hardware supports
83  * to the IOMMU core, which will then use this information to split
84  * physically contiguous memory regions it is mapping into page sizes
85  * that we support.
86  *
87  * Traditionally the IOMMU core just handed us the mappings directly,
88  * after making sure the size is an order of a 4KiB page and that the
89  * mapping has natural alignment.
90  *
91  * To retain this behavior, we currently advertise that we support
92  * all page sizes that are an order of 4KiB.
93  *
94  * If at some point we'd like to utilize the IOMMU core's new behavior,
95  * we could change this to advertise the real page sizes we support.
96  */
97 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
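
/*
 * Illustrative sketch (not used by this driver): the IOMMU core consumes a
 * pgsize bitmap such as INTEL_IOMMU_PGSIZES by picking, for each chunk it
 * maps, the largest advertised page size that fits both the remaining
 * length and the natural alignment of the addresses involved.  The helper
 * below is a simplified model of that selection; its name and signature are
 * hypothetical and exist only to show the arithmetic.
 */
static inline unsigned long example_pick_pgsize(unsigned long pgsize_bitmap,
						unsigned long addr_merge,
						size_t size)
{
	/* All page sizes no larger than the remaining length. */
	unsigned long candidates = (1UL << (__fls(size) + 1)) - 1;

	/* addr_merge is iova | paddr; its alignment caps the page size. */
	if (addr_merge)
		candidates &= (1UL << (__ffs(addr_merge) + 1)) - 1;

	/* Keep only sizes the hardware advertised and take the largest. */
	candidates &= pgsize_bitmap;
	return candidates ? 1UL << __fls(candidates) : 0;
}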
98
99 static inline int agaw_to_level(int agaw)
100 {
101         return agaw + 2;
102 }
103
104 static inline int agaw_to_width(int agaw)
105 {
106         return 30 + agaw * LEVEL_STRIDE;
107 }
108
109 static inline int width_to_agaw(int width)
110 {
111         return (width - 30) / LEVEL_STRIDE;
112 }
113
114 static inline unsigned int level_to_offset_bits(int level)
115 {
116         return (level - 1) * LEVEL_STRIDE;
117 }
118
119 static inline int pfn_level_offset(unsigned long pfn, int level)
120 {
121         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
122 }
123
124 static inline unsigned long level_mask(int level)
125 {
126         return -1UL << level_to_offset_bits(level);
127 }
128
129 static inline unsigned long level_size(int level)
130 {
131         return 1UL << level_to_offset_bits(level);
132 }
133
134 static inline unsigned long align_to_level(unsigned long pfn, int level)
135 {
136         return (pfn + level_size(level) - 1) & level_mask(level);
137 }
138
139 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
140 {
141         return  1 << ((lvl - 1) * LEVEL_STRIDE);
142 }
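
/*
 * Worked example (illustrative): each page-table level decodes LEVEL_STRIDE
 * (9) bits of the DMA PFN.  For a 4-level walk of pfn 0x12345678 the
 * offsets are pfn_level_offset(pfn, 4) == (pfn >> 27) & 0x1ff down to
 * pfn_level_offset(pfn, 1) == pfn & 0x1ff, which indexes the 4KiB leaf
 * entries.  level_size(2) == lvl_to_nr_pages(2) == 512, i.e. one level-2
 * entry spans 512 * 4KiB == 2MiB of IOVA space.
 */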
143
144 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
145    are never going to work. */
146 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
147 {
148         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
149 }
150
151 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
152 {
153         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
154 }
155 static inline unsigned long page_to_dma_pfn(struct page *pg)
156 {
157         return mm_to_dma_pfn(page_to_pfn(pg));
158 }
159 static inline unsigned long virt_to_dma_pfn(void *p)
160 {
161         return page_to_dma_pfn(virt_to_page(p));
162 }
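
/*
 * Worked example (illustrative): on a kernel built with 64KiB pages
 * (PAGE_SHIFT == 16) and 4KiB VT-d pages (VTD_PAGE_SHIFT == 12), one MM
 * page covers 16 DMA pages, so mm_to_dma_pfn(pfn) == pfn << 4 and
 * dma_to_mm_pfn(dma_pfn) == dma_pfn >> 4.  With 4KiB MM pages the two PFN
 * spaces coincide and both conversions are no-ops.
 */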
163
164 /* global iommu list, set NULL for ignored DMAR units */
165 static struct intel_iommu **g_iommus;
166
167 static void __init check_tylersburg_isoch(void);
168 static int rwbf_quirk;
169
170 /*
171  * set to 1 to panic kernel if can't successfully enable VT-d
172  * (used when kernel is launched w/ TXT)
173  */
174 static int force_on = 0;
175
176 /*
177  * 0: Present
178  * 1-11: Reserved
179  * 12-63: Context Ptr (12 - (haw-1))
180  * 64-127: Reserved
181  */
182 struct root_entry {
183         u64     val;
184         u64     rsvd1;
185 };
186 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
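
/*
 * Note: with a 4KiB root table and 16-byte entries, ROOT_ENTRY_NR is
 * 4096 / 16 == 256: one root entry per PCI bus number, each pointing
 * (when present) at a 256-entry context table indexed by devfn.
 */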
187 static inline bool root_present(struct root_entry *root)
188 {
189         return (root->val & 1);
190 }
191 static inline void set_root_present(struct root_entry *root)
192 {
193         root->val |= 1;
194 }
195 static inline void set_root_value(struct root_entry *root, unsigned long value)
196 {
197         root->val |= value & VTD_PAGE_MASK;
198 }
199
200 static inline struct context_entry *
201 get_context_addr_from_root(struct root_entry *root)
202 {
203         return (struct context_entry *)
204                 (root_present(root)?phys_to_virt(
205                 root->val & VTD_PAGE_MASK) :
206                 NULL);
207 }
208
209 /*
210  * low 64 bits:
211  * 0: present
212  * 1: fault processing disable
213  * 2-3: translation type
214  * 12-63: address space root
215  * high 64 bits:
216  * 0-2: address width
217  * 3-6: aval
218  * 8-23: domain id
219  */
220 struct context_entry {
221         u64 lo;
222         u64 hi;
223 };
224
225 static inline bool context_present(struct context_entry *context)
226 {
227         return (context->lo & 1);
228 }
229 static inline void context_set_present(struct context_entry *context)
230 {
231         context->lo |= 1;
232 }
233
234 static inline void context_set_fault_enable(struct context_entry *context)
235 {
236         context->lo &= (((u64)-1) << 2) | 1;
237 }
238
239 static inline void context_set_translation_type(struct context_entry *context,
240                                                 unsigned long value)
241 {
242         context->lo &= (((u64)-1) << 4) | 3;
243         context->lo |= (value & 3) << 2;
244 }
245
246 static inline void context_set_address_root(struct context_entry *context,
247                                             unsigned long value)
248 {
249         context->lo |= value & VTD_PAGE_MASK;
250 }
251
252 static inline void context_set_address_width(struct context_entry *context,
253                                              unsigned long value)
254 {
255         context->hi |= value & 7;
256 }
257
258 static inline void context_set_domain_id(struct context_entry *context,
259                                          unsigned long value)
260 {
261         context->hi |= (value & ((1 << 16) - 1)) << 8;
262 }
263
264 static inline void context_clear_entry(struct context_entry *context)
265 {
266         context->lo = 0;
267         context->hi = 0;
268 }
269
270 /*
271  * 0: readable
272  * 1: writable
273  * 2-6: reserved
274  * 7: super page
275  * 8-10: available
276  * 11: snoop behavior
277  * 12-63: Host physical address
278  */
279 struct dma_pte {
280         u64 val;
281 };
282
283 static inline void dma_clear_pte(struct dma_pte *pte)
284 {
285         pte->val = 0;
286 }
287
288 static inline void dma_set_pte_readable(struct dma_pte *pte)
289 {
290         pte->val |= DMA_PTE_READ;
291 }
292
293 static inline void dma_set_pte_writable(struct dma_pte *pte)
294 {
295         pte->val |= DMA_PTE_WRITE;
296 }
297
298 static inline void dma_set_pte_snp(struct dma_pte *pte)
299 {
300         pte->val |= DMA_PTE_SNP;
301 }
302
303 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
304 {
305         pte->val = (pte->val & ~3) | (prot & 3);
306 }
307
308 static inline u64 dma_pte_addr(struct dma_pte *pte)
309 {
310 #ifdef CONFIG_64BIT
311         return pte->val & VTD_PAGE_MASK;
312 #else
313         /* Must have a full atomic 64-bit read */
314         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
315 #endif
316 }
317
318 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
319 {
320         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
321 }
322
323 static inline bool dma_pte_present(struct dma_pte *pte)
324 {
325         return (pte->val & 3) != 0;
326 }
327
328 static inline bool dma_pte_superpage(struct dma_pte *pte)
329 {
330         return (pte->val & (1 << 7));
331 }
332
333 static inline int first_pte_in_page(struct dma_pte *pte)
334 {
335         return !((unsigned long)pte & ~VTD_PAGE_MASK);
336 }
337
338 /*
339  * This domain is a statically identity mapping domain.
340  *      1. This domain creates a static 1:1 mapping to all usable memory.
341  *      2. It maps to each iommu if successful.
342  *      3. Each iommu maps to this domain if successful.
343  */
344 static struct dmar_domain *si_domain;
345 static int hw_pass_through = 1;
346
347 /* devices under the same p2p bridge are owned in one domain */
348 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
349
350 /* domain represents a virtual machine, more than one device
351  * across iommus may be owned in one domain, e.g. kvm guest.
352  */
353 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
354
355 /* si_domain contains multiple devices */
356 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
357
358 /* define the limit of IOMMUs supported in each domain */
359 #ifdef  CONFIG_X86
360 # define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
361 #else
362 # define        IOMMU_UNITS_SUPPORTED   64
363 #endif
364
365 struct dmar_domain {
366         int     id;                     /* domain id */
367         int     nid;                    /* node id */
368         DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
369                                         /* bitmap of iommus this domain uses*/
370
371         struct list_head devices;       /* all devices' list */
372         struct iova_domain iovad;       /* iova's that belong to this domain */
373
374         struct dma_pte  *pgd;           /* virtual address */
375         int             gaw;            /* max guest address width */
376
377         /* adjusted guest address width, 0 is level 2 30-bit */
378         int             agaw;
379
380         int             flags;          /* flags to find out type of domain */
381
382         int             iommu_coherency;/* indicate coherency of iommu access */
383         int             iommu_snooping; /* indicate snooping control feature*/
384         int             iommu_count;    /* reference count of iommu */
385         int             iommu_superpage;/* Level of superpages supported:
386                                            0 == 4KiB (no superpages), 1 == 2MiB,
387                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
388         spinlock_t      iommu_lock;     /* protect iommu set in domain */
389         u64             max_addr;       /* maximum mapped address */
390 };
391
392 /* PCI domain-device relationship */
393 struct device_domain_info {
394         struct list_head link;  /* link to domain siblings */
395         struct list_head global; /* link to global list */
396         int segment;            /* PCI domain */
397         u8 bus;                 /* PCI bus number */
398         u8 devfn;               /* PCI devfn number */
399         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
400         struct intel_iommu *iommu; /* IOMMU used by this device */
401         struct dmar_domain *domain; /* pointer to domain */
402 };
403
404 static void flush_unmaps_timeout(unsigned long data);
405
406 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
407
408 #define HIGH_WATER_MARK 250
409 struct deferred_flush_tables {
410         int next;
411         struct iova *iova[HIGH_WATER_MARK];
412         struct dmar_domain *domain[HIGH_WATER_MARK];
413 };
414
415 static struct deferred_flush_tables *deferred_flush;
416
417 /* number of registered IOMMUs, used to size and index g_iommus */
418 static int g_num_of_iommus;
419
420 static DEFINE_SPINLOCK(async_umap_flush_lock);
421 static LIST_HEAD(unmaps_to_do);
422
423 static int timer_on;
424 static long list_size;
425
426 static void domain_remove_dev_info(struct dmar_domain *domain);
427
428 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
429 int dmar_disabled = 0;
430 #else
431 int dmar_disabled = 1;
432 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
433
434 int intel_iommu_enabled = 0;
435 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
436
437 static int dmar_map_gfx = 1;
438 static int dmar_forcedac;
439 static int intel_iommu_strict;
440 static int intel_iommu_superpage = 1;
441
442 int intel_iommu_gfx_mapped;
443 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
444
445 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
446 static DEFINE_SPINLOCK(device_domain_lock);
447 static LIST_HEAD(device_domain_list);
448
449 static struct iommu_ops intel_iommu_ops;
450
451 static int __init intel_iommu_setup(char *str)
452 {
453         if (!str)
454                 return -EINVAL;
455         while (*str) {
456                 if (!strncmp(str, "on", 2)) {
457                         dmar_disabled = 0;
458                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
459                 } else if (!strncmp(str, "off", 3)) {
460                         dmar_disabled = 1;
461                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
462                 } else if (!strncmp(str, "igfx_off", 8)) {
463                         dmar_map_gfx = 0;
464                         printk(KERN_INFO
465                                 "Intel-IOMMU: disable GFX device mapping\n");
466                 } else if (!strncmp(str, "forcedac", 8)) {
467                         printk(KERN_INFO
468                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
469                         dmar_forcedac = 1;
470                 } else if (!strncmp(str, "strict", 6)) {
471                         printk(KERN_INFO
472                                 "Intel-IOMMU: disable batched IOTLB flush\n");
473                         intel_iommu_strict = 1;
474                 } else if (!strncmp(str, "sp_off", 6)) {
475                         printk(KERN_INFO
476                                 "Intel-IOMMU: disable supported super page\n");
477                         intel_iommu_superpage = 0;
478                 }
479
480                 str += strcspn(str, ",");
481                 while (*str == ',')
482                         str++;
483         }
484         return 0;
485 }
486 __setup("intel_iommu=", intel_iommu_setup);
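
/*
 * Example (illustrative): the options parsed above are given as a
 * comma-separated list on the kernel command line, e.g.
 *
 *     intel_iommu=on,strict,sp_off
 *
 * which enables the IOMMU, disables batched IOTLB flushing and disables
 * super-page support in one go.
 */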
487
488 static struct kmem_cache *iommu_domain_cache;
489 static struct kmem_cache *iommu_devinfo_cache;
490 static struct kmem_cache *iommu_iova_cache;
491
492 static inline void *alloc_pgtable_page(int node)
493 {
494         struct page *page;
495         void *vaddr = NULL;
496
497         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
498         if (page)
499                 vaddr = page_address(page);
500         return vaddr;
501 }
502
503 static inline void free_pgtable_page(void *vaddr)
504 {
505         free_page((unsigned long)vaddr);
506 }
507
508 static inline void *alloc_domain_mem(void)
509 {
510         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
511 }
512
513 static void free_domain_mem(void *vaddr)
514 {
515         kmem_cache_free(iommu_domain_cache, vaddr);
516 }
517
518 static inline void * alloc_devinfo_mem(void)
519 {
520         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
521 }
522
523 static inline void free_devinfo_mem(void *vaddr)
524 {
525         kmem_cache_free(iommu_devinfo_cache, vaddr);
526 }
527
528 struct iova *alloc_iova_mem(void)
529 {
530         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
531 }
532
533 void free_iova_mem(struct iova *iova)
534 {
535         kmem_cache_free(iommu_iova_cache, iova);
536 }
537
538
539 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
540 {
541         unsigned long sagaw;
542         int agaw = -1;
543
544         sagaw = cap_sagaw(iommu->cap);
545         for (agaw = width_to_agaw(max_gaw);
546              agaw >= 0; agaw--) {
547                 if (test_bit(agaw, &sagaw))
548                         break;
549         }
550
551         return agaw;
552 }
553
554 /*
555  * Calculate max SAGAW for each iommu.
556  */
557 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
558 {
559         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
560 }
561
562 /*
563  * calculate agaw for each iommu.
564  * "SAGAW" may be different across iommus, use a default agaw, and
565  * get a supported smaller agaw for iommus that don't support the default agaw.
566  */
567 int iommu_calculate_agaw(struct intel_iommu *iommu)
568 {
569         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
570 }
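
/*
 * Worked example (illustrative): with DEFAULT_DOMAIN_ADDRESS_WIDTH == 48,
 * width_to_agaw(48) == 2.  An iommu whose SAGAW field has bit 2 set runs
 * the domain with agaw 2, i.e. agaw_to_level(2) == 4 page-table levels for
 * a 48-bit address space.  If only bit 1 were supported, the search above
 * would fall back to agaw 1: a 3-level table covering agaw_to_width(1) ==
 * 39 bits.
 */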
571
572 /* This function only returns a single iommu in a domain */
573 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
574 {
575         int iommu_id;
576
577         /* si_domain and vm domain should not get here. */
578         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
579         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
580
581         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
582         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
583                 return NULL;
584
585         return g_iommus[iommu_id];
586 }
587
588 static void domain_update_iommu_coherency(struct dmar_domain *domain)
589 {
590         int i;
591
592         domain->iommu_coherency = 1;
593
594         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
595                 if (!ecap_coherent(g_iommus[i]->ecap)) {
596                         domain->iommu_coherency = 0;
597                         break;
598                 }
599         }
600 }
601
602 static void domain_update_iommu_snooping(struct dmar_domain *domain)
603 {
604         int i;
605
606         domain->iommu_snooping = 1;
607
608         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
609                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
610                         domain->iommu_snooping = 0;
611                         break;
612                 }
613         }
614 }
615
616 static void domain_update_iommu_superpage(struct dmar_domain *domain)
617 {
618         struct dmar_drhd_unit *drhd;
619         struct intel_iommu *iommu = NULL;
620         int mask = 0xf;
621
622         if (!intel_iommu_superpage) {
623                 domain->iommu_superpage = 0;
624                 return;
625         }
626
627         /* set iommu_superpage to the smallest common denominator */
628         for_each_active_iommu(iommu, drhd) {
629                 mask &= cap_super_page_val(iommu->cap);
630                 if (!mask) {
631                         break;
632                 }
633         }
634         domain->iommu_superpage = fls(mask);
635 }
636
637 /* Some capabilities may be different across iommus */
638 static void domain_update_iommu_cap(struct dmar_domain *domain)
639 {
640         domain_update_iommu_coherency(domain);
641         domain_update_iommu_snooping(domain);
642         domain_update_iommu_superpage(domain);
643 }
644
645 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
646 {
647         struct dmar_drhd_unit *drhd = NULL;
648         int i;
649
650         for_each_drhd_unit(drhd) {
651                 if (drhd->ignored)
652                         continue;
653                 if (segment != drhd->segment)
654                         continue;
655
656                 for (i = 0; i < drhd->devices_cnt; i++) {
657                         if (drhd->devices[i] &&
658                             drhd->devices[i]->bus->number == bus &&
659                             drhd->devices[i]->devfn == devfn)
660                                 return drhd->iommu;
661                         if (drhd->devices[i] &&
662                             drhd->devices[i]->subordinate &&
663                             drhd->devices[i]->subordinate->number <= bus &&
664                             drhd->devices[i]->subordinate->busn_res.end >= bus)
665                                 return drhd->iommu;
666                 }
667
668                 if (drhd->include_all)
669                         return drhd->iommu;
670         }
671
672         return NULL;
673 }
674
675 static void domain_flush_cache(struct dmar_domain *domain,
676                                void *addr, int size)
677 {
678         if (!domain->iommu_coherency)
679                 clflush_cache_range(addr, size);
680 }
681
682 /* Gets context entry for a given bus and devfn */
683 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
684                 u8 bus, u8 devfn)
685 {
686         struct root_entry *root;
687         struct context_entry *context;
688         unsigned long phy_addr;
689         unsigned long flags;
690
691         spin_lock_irqsave(&iommu->lock, flags);
692         root = &iommu->root_entry[bus];
693         context = get_context_addr_from_root(root);
694         if (!context) {
695                 context = (struct context_entry *)
696                                 alloc_pgtable_page(iommu->node);
697                 if (!context) {
698                         spin_unlock_irqrestore(&iommu->lock, flags);
699                         return NULL;
700                 }
701                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
702                 phy_addr = virt_to_phys((void *)context);
703                 set_root_value(root, phy_addr);
704                 set_root_present(root);
705                 __iommu_flush_cache(iommu, root, sizeof(*root));
706         }
707         spin_unlock_irqrestore(&iommu->lock, flags);
708         return &context[devfn];
709 }
710
711 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
712 {
713         struct root_entry *root;
714         struct context_entry *context;
715         int ret;
716         unsigned long flags;
717
718         spin_lock_irqsave(&iommu->lock, flags);
719         root = &iommu->root_entry[bus];
720         context = get_context_addr_from_root(root);
721         if (!context) {
722                 ret = 0;
723                 goto out;
724         }
725         ret = context_present(&context[devfn]);
726 out:
727         spin_unlock_irqrestore(&iommu->lock, flags);
728         return ret;
729 }
730
731 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
732 {
733         struct root_entry *root;
734         struct context_entry *context;
735         unsigned long flags;
736
737         spin_lock_irqsave(&iommu->lock, flags);
738         root = &iommu->root_entry[bus];
739         context = get_context_addr_from_root(root);
740         if (context) {
741                 context_clear_entry(&context[devfn]);
742                 __iommu_flush_cache(iommu, &context[devfn], \
743                         sizeof(*context));
744         }
745         spin_unlock_irqrestore(&iommu->lock, flags);
746 }
747
748 static void free_context_table(struct intel_iommu *iommu)
749 {
750         struct root_entry *root;
751         int i;
752         unsigned long flags;
753         struct context_entry *context;
754
755         spin_lock_irqsave(&iommu->lock, flags);
756         if (!iommu->root_entry) {
757                 goto out;
758         }
759         for (i = 0; i < ROOT_ENTRY_NR; i++) {
760                 root = &iommu->root_entry[i];
761                 context = get_context_addr_from_root(root);
762                 if (context)
763                         free_pgtable_page(context);
764         }
765         free_pgtable_page(iommu->root_entry);
766         iommu->root_entry = NULL;
767 out:
768         spin_unlock_irqrestore(&iommu->lock, flags);
769 }
770
771 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
772                                       unsigned long pfn, int target_level)
773 {
774         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
775         struct dma_pte *parent, *pte = NULL;
776         int level = agaw_to_level(domain->agaw);
777         int offset;
778
779         BUG_ON(!domain->pgd);
780         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
781         parent = domain->pgd;
782
783         while (level > 0) {
784                 void *tmp_page;
785
786                 offset = pfn_level_offset(pfn, level);
787                 pte = &parent[offset];
788                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
789                         break;
790                 if (level == target_level)
791                         break;
792
793                 if (!dma_pte_present(pte)) {
794                         uint64_t pteval;
795
796                         tmp_page = alloc_pgtable_page(domain->nid);
797
798                         if (!tmp_page)
799                                 return NULL;
800
801                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
802                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
803                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
804                                 /* Someone else set it while we were thinking; use theirs. */
805                                 free_pgtable_page(tmp_page);
806                         } else {
807                                 dma_pte_addr(pte);
808                                 domain_flush_cache(domain, pte, sizeof(*pte));
809                         }
810                 }
811                 parent = phys_to_virt(dma_pte_addr(pte));
812                 level--;
813         }
814
815         return pte;
816 }
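
/*
 * Note on the cmpxchg64() above (illustrative summary): page-table pages
 * are populated without holding a lock, so two CPUs may race to fill the
 * same empty entry.  Whichever cmpxchg64() still sees the entry as zero
 * installs its page and flushes it; the loser just frees the page it
 * allocated and continues the walk through the winner's table, so no
 * extra flush is needed on that path.
 */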
817
818
819 /* return address's pte at specific level */
820 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
821                                          unsigned long pfn,
822                                          int level, int *large_page)
823 {
824         struct dma_pte *parent, *pte = NULL;
825         int total = agaw_to_level(domain->agaw);
826         int offset;
827
828         parent = domain->pgd;
829         while (level <= total) {
830                 offset = pfn_level_offset(pfn, total);
831                 pte = &parent[offset];
832                 if (level == total)
833                         return pte;
834
835                 if (!dma_pte_present(pte)) {
836                         *large_page = total;
837                         break;
838                 }
839
840                 if (pte->val & DMA_PTE_LARGE_PAGE) {
841                         *large_page = total;
842                         return pte;
843                 }
844
845                 parent = phys_to_virt(dma_pte_addr(pte));
846                 total--;
847         }
848         return NULL;
849 }
850
851 /* clear last level pte; a tlb flush should follow */
852 static int dma_pte_clear_range(struct dmar_domain *domain,
853                                 unsigned long start_pfn,
854                                 unsigned long last_pfn)
855 {
856         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
857         unsigned int large_page = 1;
858         struct dma_pte *first_pte, *pte;
859         int order;
860
861         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
862         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
863         BUG_ON(start_pfn > last_pfn);
864
865         /* we don't need lock here; nobody else touches the iova range */
866         do {
867                 large_page = 1;
868                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
869                 if (!pte) {
870                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
871                         continue;
872                 }
873                 do {
874                         dma_clear_pte(pte);
875                         start_pfn += lvl_to_nr_pages(large_page);
876                         pte++;
877                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
878
879                 domain_flush_cache(domain, first_pte,
880                                    (void *)pte - (void *)first_pte);
881
882         } while (start_pfn && start_pfn <= last_pfn);
883
884         order = (large_page - 1) * 9;
885         return order;
886 }
887
888 /* free page table pages. last level pte should already be cleared */
889 static void dma_pte_free_pagetable(struct dmar_domain *domain,
890                                    unsigned long start_pfn,
891                                    unsigned long last_pfn)
892 {
893         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
894         struct dma_pte *first_pte, *pte;
895         int total = agaw_to_level(domain->agaw);
896         int level;
897         unsigned long tmp;
898         int large_page = 2;
899
900         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
901         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
902         BUG_ON(start_pfn > last_pfn);
903
904         /* We don't need lock here; nobody else touches the iova range */
905         level = 2;
906         while (level <= total) {
907                 tmp = align_to_level(start_pfn, level);
908
909                 /* If we can't even clear one PTE at this level, we're done */
910                 if (tmp + level_size(level) - 1 > last_pfn)
911                         return;
912
913                 do {
914                         large_page = level;
915                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
916                         if (large_page > level)
917                                 level = large_page + 1;
918                         if (!pte) {
919                                 tmp = align_to_level(tmp + 1, level + 1);
920                                 continue;
921                         }
922                         do {
923                                 if (dma_pte_present(pte)) {
924                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
925                                         dma_clear_pte(pte);
926                                 }
927                                 pte++;
928                                 tmp += level_size(level);
929                         } while (!first_pte_in_page(pte) &&
930                                  tmp + level_size(level) - 1 <= last_pfn);
931
932                         domain_flush_cache(domain, first_pte,
933                                            (void *)pte - (void *)first_pte);
934                         
935                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
936                 level++;
937         }
938         /* free pgd */
939         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
940                 free_pgtable_page(domain->pgd);
941                 domain->pgd = NULL;
942         }
943 }
944
945 /* iommu handling */
946 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
947 {
948         struct root_entry *root;
949         unsigned long flags;
950
951         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
952         if (!root)
953                 return -ENOMEM;
954
955         __iommu_flush_cache(iommu, root, ROOT_SIZE);
956
957         spin_lock_irqsave(&iommu->lock, flags);
958         iommu->root_entry = root;
959         spin_unlock_irqrestore(&iommu->lock, flags);
960
961         return 0;
962 }
963
964 static void iommu_set_root_entry(struct intel_iommu *iommu)
965 {
966         void *addr;
967         u32 sts;
968         unsigned long flag;
969
970         addr = iommu->root_entry;
971
972         raw_spin_lock_irqsave(&iommu->register_lock, flag);
973         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
974
975         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
976
977         /* Make sure hardware completes it */
978         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
979                       readl, (sts & DMA_GSTS_RTPS), sts);
980
981         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
982 }
983
984 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
985 {
986         u32 val;
987         unsigned long flag;
988
989         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
990                 return;
991
992         raw_spin_lock_irqsave(&iommu->register_lock, flag);
993         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
994
995         /* Make sure hardware completes it */
996         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
997                       readl, (!(val & DMA_GSTS_WBFS)), val);
998
999         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1000 }
1001
1002 /* return value determines if we need a write buffer flush */
1003 static void __iommu_flush_context(struct intel_iommu *iommu,
1004                                   u16 did, u16 source_id, u8 function_mask,
1005                                   u64 type)
1006 {
1007         u64 val = 0;
1008         unsigned long flag;
1009
1010         switch (type) {
1011         case DMA_CCMD_GLOBAL_INVL:
1012                 val = DMA_CCMD_GLOBAL_INVL;
1013                 break;
1014         case DMA_CCMD_DOMAIN_INVL:
1015                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1016                 break;
1017         case DMA_CCMD_DEVICE_INVL:
1018                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1019                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1020                 break;
1021         default:
1022                 BUG();
1023         }
1024         val |= DMA_CCMD_ICC;
1025
1026         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1027         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1028
1029         /* Make sure hardware completes it */
1030         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1031                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1032
1033         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1034 }
1035
1036 /* return value determines if we need a write buffer flush */
1037 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1038                                 u64 addr, unsigned int size_order, u64 type)
1039 {
1040         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1041         u64 val = 0, val_iva = 0;
1042         unsigned long flag;
1043
1044         switch (type) {
1045         case DMA_TLB_GLOBAL_FLUSH:
1046                 /* global flush doesn't need set IVA_REG */
1047                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1048                 break;
1049         case DMA_TLB_DSI_FLUSH:
1050                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1051                 break;
1052         case DMA_TLB_PSI_FLUSH:
1053                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1054                 /* Note: always flush non-leaf currently */
1055                 val_iva = size_order | addr;
1056                 break;
1057         default:
1058                 BUG();
1059         }
1060         /* Note: set drain read/write */
1061 #if 0
1062         /*
1063          * This is probably to be super secure.. Looks like we can
1064          * ignore it without any impact.
1065          */
1066         if (cap_read_drain(iommu->cap))
1067                 val |= DMA_TLB_READ_DRAIN;
1068 #endif
1069         if (cap_write_drain(iommu->cap))
1070                 val |= DMA_TLB_WRITE_DRAIN;
1071
1072         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1073         /* Note: Only uses first TLB reg currently */
1074         if (val_iva)
1075                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1076         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1077
1078         /* Make sure hardware completes it */
1079         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1080                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1081
1082         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1083
1084         /* check IOTLB invalidation granularity */
1085         if (DMA_TLB_IAIG(val) == 0)
1086                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1087         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1088                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1089                         (unsigned long long)DMA_TLB_IIRG(type),
1090                         (unsigned long long)DMA_TLB_IAIG(val));
1091 }
1092
1093 static struct device_domain_info *iommu_support_dev_iotlb(
1094         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1095 {
1096         int found = 0;
1097         unsigned long flags;
1098         struct device_domain_info *info;
1099         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1100
1101         if (!ecap_dev_iotlb_support(iommu->ecap))
1102                 return NULL;
1103
1104         if (!iommu->qi)
1105                 return NULL;
1106
1107         spin_lock_irqsave(&device_domain_lock, flags);
1108         list_for_each_entry(info, &domain->devices, link)
1109                 if (info->bus == bus && info->devfn == devfn) {
1110                         found = 1;
1111                         break;
1112                 }
1113         spin_unlock_irqrestore(&device_domain_lock, flags);
1114
1115         if (!found || !info->dev)
1116                 return NULL;
1117
1118         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1119                 return NULL;
1120
1121         if (!dmar_find_matched_atsr_unit(info->dev))
1122                 return NULL;
1123
1124         info->iommu = iommu;
1125
1126         return info;
1127 }
1128
1129 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1130 {
1131         if (!info)
1132                 return;
1133
1134         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1135 }
1136
1137 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1138 {
1139         if (!info->dev || !pci_ats_enabled(info->dev))
1140                 return;
1141
1142         pci_disable_ats(info->dev);
1143 }
1144
1145 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1146                                   u64 addr, unsigned mask)
1147 {
1148         u16 sid, qdep;
1149         unsigned long flags;
1150         struct device_domain_info *info;
1151
1152         spin_lock_irqsave(&device_domain_lock, flags);
1153         list_for_each_entry(info, &domain->devices, link) {
1154                 if (!info->dev || !pci_ats_enabled(info->dev))
1155                         continue;
1156
1157                 sid = info->bus << 8 | info->devfn;
1158                 qdep = pci_ats_queue_depth(info->dev);
1159                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1160         }
1161         spin_unlock_irqrestore(&device_domain_lock, flags);
1162 }
1163
1164 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1165                                   unsigned long pfn, unsigned int pages, int map)
1166 {
1167         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1168         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1169
1170         BUG_ON(pages == 0);
1171
1172         /*
1173          * Fall back to domain selective flush if no PSI support or the size is
1174          * too big.
1175          * PSI requires page size to be 2 ^ x, and the base address is naturally
1176          * aligned to the size
1177          */
1178         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1179                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1180                                                 DMA_TLB_DSI_FLUSH);
1181         else
1182                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1183                                                 DMA_TLB_PSI_FLUSH);
1184
1185         /*
1186          * In caching mode, changes of pages from non-present to present require
1187          * flush. However, device IOTLB doesn't need to be flushed in this case.
1188          */
1189         if (!cap_caching_mode(iommu->cap) || !map)
1190                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1191 }
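
/*
 * Worked example (illustrative): for pages == 9 the mask computed above is
 * ilog2(__roundup_pow_of_two(9)) == ilog2(16) == 4, so the PSI request
 * covers 16 contiguous 4KiB pages (64KiB) starting at addr, which the
 * hardware expects to be naturally aligned to that size.  If the mask
 * exceeded cap_max_amask_val(iommu->cap), the code would instead issue the
 * domain-selective flush.
 */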
1192
1193 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1194 {
1195         u32 pmen;
1196         unsigned long flags;
1197
1198         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1199         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1200         pmen &= ~DMA_PMEN_EPM;
1201         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1202
1203         /* wait for the protected region status bit to clear */
1204         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1205                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1206
1207         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1208 }
1209
1210 static int iommu_enable_translation(struct intel_iommu *iommu)
1211 {
1212         u32 sts;
1213         unsigned long flags;
1214
1215         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1216         iommu->gcmd |= DMA_GCMD_TE;
1217         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1218
1219         /* Make sure hardware completes it */
1220         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1221                       readl, (sts & DMA_GSTS_TES), sts);
1222
1223         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1224         return 0;
1225 }
1226
1227 static int iommu_disable_translation(struct intel_iommu *iommu)
1228 {
1229         u32 sts;
1230         unsigned long flag;
1231
1232         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1233         iommu->gcmd &= ~DMA_GCMD_TE;
1234         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1235
1236         /* Make sure hardware completes it */
1237         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1238                       readl, (!(sts & DMA_GSTS_TES)), sts);
1239
1240         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1241         return 0;
1242 }
1243
1244
1245 static int iommu_init_domains(struct intel_iommu *iommu)
1246 {
1247         unsigned long ndomains;
1248         unsigned long nlongs;
1249
1250         ndomains = cap_ndoms(iommu->cap);
1251         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1252                         ndomains);
1253         nlongs = BITS_TO_LONGS(ndomains);
1254
1255         spin_lock_init(&iommu->lock);
1256
1257         /* TBD: there might be 64K domains,
1258          * consider other allocation for future chip
1259          */
1260         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1261         if (!iommu->domain_ids) {
1262                 printk(KERN_ERR "Allocating domain id array failed\n");
1263                 return -ENOMEM;
1264         }
1265         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1266                         GFP_KERNEL);
1267         if (!iommu->domains) {
1268                 printk(KERN_ERR "Allocating domain array failed\n");
1269                 return -ENOMEM;
1270         }
1271
1272         /*
1273          * if Caching mode is set, then invalid translations are tagged
1274          * with domainid 0. Hence we need to pre-allocate it.
1275          */
1276         if (cap_caching_mode(iommu->cap))
1277                 set_bit(0, iommu->domain_ids);
1278         return 0;
1279 }
1280
1281
1282 static void domain_exit(struct dmar_domain *domain);
1283 static void vm_domain_exit(struct dmar_domain *domain);
1284
1285 void free_dmar_iommu(struct intel_iommu *iommu)
1286 {
1287         struct dmar_domain *domain;
1288         int i;
1289         unsigned long flags;
1290
1291         if ((iommu->domains) && (iommu->domain_ids)) {
1292                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1293                         domain = iommu->domains[i];
1294                         clear_bit(i, iommu->domain_ids);
1295
1296                         spin_lock_irqsave(&domain->iommu_lock, flags);
1297                         if (--domain->iommu_count == 0) {
1298                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1299                                         vm_domain_exit(domain);
1300                                 else
1301                                         domain_exit(domain);
1302                         }
1303                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1304                 }
1305         }
1306
1307         if (iommu->gcmd & DMA_GCMD_TE)
1308                 iommu_disable_translation(iommu);
1309
1310         if (iommu->irq) {
1311                 irq_set_handler_data(iommu->irq, NULL);
1312                 /* This will mask the irq */
1313                 free_irq(iommu->irq, iommu);
1314                 destroy_irq(iommu->irq);
1315         }
1316
1317         kfree(iommu->domains);
1318         kfree(iommu->domain_ids);
1319
1320         g_iommus[iommu->seq_id] = NULL;
1321
1322         /* if all iommus are freed, free g_iommus */
1323         for (i = 0; i < g_num_of_iommus; i++) {
1324                 if (g_iommus[i])
1325                         break;
1326         }
1327
1328         if (i == g_num_of_iommus)
1329                 kfree(g_iommus);
1330
1331         /* free context mapping */
1332         free_context_table(iommu);
1333 }
1334
1335 static struct dmar_domain *alloc_domain(void)
1336 {
1337         struct dmar_domain *domain;
1338
1339         domain = alloc_domain_mem();
1340         if (!domain)
1341                 return NULL;
1342
1343         domain->nid = -1;
1344         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1345         domain->flags = 0;
1346
1347         return domain;
1348 }
1349
1350 static int iommu_attach_domain(struct dmar_domain *domain,
1351                                struct intel_iommu *iommu)
1352 {
1353         int num;
1354         unsigned long ndomains;
1355         unsigned long flags;
1356
1357         ndomains = cap_ndoms(iommu->cap);
1358
1359         spin_lock_irqsave(&iommu->lock, flags);
1360
1361         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1362         if (num >= ndomains) {
1363                 spin_unlock_irqrestore(&iommu->lock, flags);
1364                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1365                 return -ENOMEM;
1366         }
1367
1368         domain->id = num;
1369         set_bit(num, iommu->domain_ids);
1370         set_bit(iommu->seq_id, domain->iommu_bmp);
1371         iommu->domains[num] = domain;
1372         spin_unlock_irqrestore(&iommu->lock, flags);
1373
1374         return 0;
1375 }
1376
1377 static void iommu_detach_domain(struct dmar_domain *domain,
1378                                 struct intel_iommu *iommu)
1379 {
1380         unsigned long flags;
1381         int num, ndomains;
1382         int found = 0;
1383
1384         spin_lock_irqsave(&iommu->lock, flags);
1385         ndomains = cap_ndoms(iommu->cap);
1386         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1387                 if (iommu->domains[num] == domain) {
1388                         found = 1;
1389                         break;
1390                 }
1391         }
1392
1393         if (found) {
1394                 clear_bit(num, iommu->domain_ids);
1395                 clear_bit(iommu->seq_id, domain->iommu_bmp);
1396                 iommu->domains[num] = NULL;
1397         }
1398         spin_unlock_irqrestore(&iommu->lock, flags);
1399 }
1400
1401 static struct iova_domain reserved_iova_list;
1402 static struct lock_class_key reserved_rbtree_key;
1403
1404 static int dmar_init_reserved_ranges(void)
1405 {
1406         struct pci_dev *pdev = NULL;
1407         struct iova *iova;
1408         int i;
1409
1410         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1411
1412         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1413                 &reserved_rbtree_key);
1414
1415         /* IOAPIC ranges shouldn't be accessed by DMA */
1416         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1417                 IOVA_PFN(IOAPIC_RANGE_END));
1418         if (!iova) {
1419                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1420                 return -ENODEV;
1421         }
1422
1423         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1424         for_each_pci_dev(pdev) {
1425                 struct resource *r;
1426
1427                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1428                         r = &pdev->resource[i];
1429                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1430                                 continue;
1431                         iova = reserve_iova(&reserved_iova_list,
1432                                             IOVA_PFN(r->start),
1433                                             IOVA_PFN(r->end));
1434                         if (!iova) {
1435                                 printk(KERN_ERR "Reserve iova failed\n");
1436                                 return -ENODEV;
1437                         }
1438                 }
1439         }
1440         return 0;
1441 }
1442
1443 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1444 {
1445         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1446 }
1447
1448 static inline int guestwidth_to_adjustwidth(int gaw)
1449 {
1450         int agaw;
1451         int r = (gaw - 12) % 9;
1452
1453         if (r == 0)
1454                 agaw = gaw;
1455         else
1456                 agaw = gaw + 9 - r;
1457         if (agaw > 64)
1458                 agaw = 64;
1459         return agaw;
1460 }
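
/*
 * Worked example (illustrative): the adjusted width rounds the guest width
 * up so the bits above the 4KiB page offset split into whole 9-bit levels.
 * guestwidth_to_adjustwidth(48) == 48 (36 index bits, exactly four levels),
 * while guestwidth_to_adjustwidth(36) == 39, since 24 index bits round up
 * to three full 9-bit levels.
 */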
1461
1462 static int domain_init(struct dmar_domain *domain, int guest_width)
1463 {
1464         struct intel_iommu *iommu;
1465         int adjust_width, agaw;
1466         unsigned long sagaw;
1467
1468         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1469         spin_lock_init(&domain->iommu_lock);
1470
1471         domain_reserve_special_ranges(domain);
1472
1473         /* calculate AGAW */
1474         iommu = domain_get_iommu(domain);
1475         if (guest_width > cap_mgaw(iommu->cap))
1476                 guest_width = cap_mgaw(iommu->cap);
1477         domain->gaw = guest_width;
1478         adjust_width = guestwidth_to_adjustwidth(guest_width);
1479         agaw = width_to_agaw(adjust_width);
1480         sagaw = cap_sagaw(iommu->cap);
1481         if (!test_bit(agaw, &sagaw)) {
1482                 /* hardware doesn't support it, choose a bigger one */
1483                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1484                 agaw = find_next_bit(&sagaw, 5, agaw);
1485                 if (agaw >= 5)
1486                         return -ENODEV;
1487         }
1488         domain->agaw = agaw;
1489         INIT_LIST_HEAD(&domain->devices);
1490
1491         if (ecap_coherent(iommu->ecap))
1492                 domain->iommu_coherency = 1;
1493         else
1494                 domain->iommu_coherency = 0;
1495
1496         if (ecap_sc_support(iommu->ecap))
1497                 domain->iommu_snooping = 1;
1498         else
1499                 domain->iommu_snooping = 0;
1500
1501         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1502         domain->iommu_count = 1;
1503         domain->nid = iommu->node;
1504
1505         /* always allocate the top pgd */
1506         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1507         if (!domain->pgd)
1508                 return -ENOMEM;
1509         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1510         return 0;
1511 }
1512
1513 static void domain_exit(struct dmar_domain *domain)
1514 {
1515         struct dmar_drhd_unit *drhd;
1516         struct intel_iommu *iommu;
1517
1518         /* Domain 0 is reserved, so don't process it */
1519         if (!domain)
1520                 return;
1521
1522         /* Flush any lazy unmaps that may reference this domain */
1523         if (!intel_iommu_strict)
1524                 flush_unmaps_timeout(0);
1525
1526         domain_remove_dev_info(domain);
1527         /* destroy iovas */
1528         put_iova_domain(&domain->iovad);
1529
1530         /* clear ptes */
1531         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1532
1533         /* free page tables */
1534         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1535
1536         for_each_active_iommu(iommu, drhd)
1537                 if (test_bit(iommu->seq_id, domain->iommu_bmp))
1538                         iommu_detach_domain(domain, iommu);
1539
1540         free_domain_mem(domain);
1541 }
1542
1543 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1544                                  u8 bus, u8 devfn, int translation)
1545 {
1546         struct context_entry *context;
1547         unsigned long flags;
1548         struct intel_iommu *iommu;
1549         struct dma_pte *pgd;
1550         unsigned long num;
1551         unsigned long ndomains;
1552         int id;
1553         int agaw;
1554         struct device_domain_info *info = NULL;
1555
1556         pr_debug("Set context mapping for %02x:%02x.%d\n",
1557                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1558
1559         BUG_ON(!domain->pgd);
1560         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1561                translation != CONTEXT_TT_MULTI_LEVEL);
1562
1563         iommu = device_to_iommu(segment, bus, devfn);
1564         if (!iommu)
1565                 return -ENODEV;
1566
1567         context = device_to_context_entry(iommu, bus, devfn);
1568         if (!context)
1569                 return -ENOMEM;
1570         spin_lock_irqsave(&iommu->lock, flags);
1571         if (context_present(context)) {
1572                 spin_unlock_irqrestore(&iommu->lock, flags);
1573                 return 0;
1574         }
1575
1576         id = domain->id;
1577         pgd = domain->pgd;
1578
1579         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1580             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1581                 int found = 0;
1582
1583                 /* find an available domain id for this device in iommu */
1584                 ndomains = cap_ndoms(iommu->cap);
1585                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1586                         if (iommu->domains[num] == domain) {
1587                                 id = num;
1588                                 found = 1;
1589                                 break;
1590                         }
1591                 }
1592
1593                 if (found == 0) {
1594                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1595                         if (num >= ndomains) {
1596                                 spin_unlock_irqrestore(&iommu->lock, flags);
1597                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1598                                 return -EFAULT;
1599                         }
1600
1601                         set_bit(num, iommu->domain_ids);
1602                         iommu->domains[num] = domain;
1603                         id = num;
1604                 }
1605
1606                 /* Skip top levels of page tables for
1607                  * iommus which have less agaw than the default.
1608                  * Unnecessary for PT mode.
1609                  */
1610                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1611                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1612                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1613                                 if (!dma_pte_present(pgd)) {
1614                                         spin_unlock_irqrestore(&iommu->lock, flags);
1615                                         return -ENOMEM;
1616                                 }
1617                         }
1618                 }
1619         }
1620
1621         context_set_domain_id(context, id);
1622
1623         if (translation != CONTEXT_TT_PASS_THROUGH) {
1624                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1625                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1626                                      CONTEXT_TT_MULTI_LEVEL;
1627         }
1628         /*
1629          * In pass through mode, AW must be programmed to indicate the largest
1630          * AGAW value supported by hardware. And ASR is ignored by hardware.
1631          */
1632         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1633                 context_set_address_width(context, iommu->msagaw);
1634         else {
1635                 context_set_address_root(context, virt_to_phys(pgd));
1636                 context_set_address_width(context, iommu->agaw);
1637         }
1638
1639         context_set_translation_type(context, translation);
1640         context_set_fault_enable(context);
1641         context_set_present(context);
1642         domain_flush_cache(domain, context, sizeof(*context));
1643
1644         /*
1645          * It's a non-present to present mapping. If hardware doesn't cache
1646          * non-present entries we only need to flush the write-buffer. If it
1647          * _does_ cache non-present entries, then it does so in the special
1648          * domain #0, which we have to flush:
1649          */
1650         if (cap_caching_mode(iommu->cap)) {
1651                 iommu->flush.flush_context(iommu, 0,
1652                                            (((u16)bus) << 8) | devfn,
1653                                            DMA_CCMD_MASK_NOBIT,
1654                                            DMA_CCMD_DEVICE_INVL);
1655                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1656         } else {
1657                 iommu_flush_write_buffer(iommu);
1658         }
1659         iommu_enable_dev_iotlb(info);
1660         spin_unlock_irqrestore(&iommu->lock, flags);
1661
1662         spin_lock_irqsave(&domain->iommu_lock, flags);
1663         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1664                 domain->iommu_count++;
1665                 if (domain->iommu_count == 1)
1666                         domain->nid = iommu->node;
1667                 domain_update_iommu_cap(domain);
1668         }
1669         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1670         return 0;
1671 }
1672
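/*
 * Install context entries for the device itself and, if it sits behind a
 * PCIe-to-PCI bridge, for every bridge on the path up to and including that
 * bridge, so DMA from any aliased source-id is translated by this domain.
 */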
1673 static int
1674 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1675                         int translation)
1676 {
1677         int ret;
1678         struct pci_dev *tmp, *parent;
1679
1680         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1681                                          pdev->bus->number, pdev->devfn,
1682                                          translation);
1683         if (ret)
1684                 return ret;
1685
1686         /* dependent device mapping */
1687         tmp = pci_find_upstream_pcie_bridge(pdev);
1688         if (!tmp)
1689                 return 0;
1690         /* Secondary interface's bus number and devfn 0 */
1691         parent = pdev->bus->self;
1692         while (parent != tmp) {
1693                 ret = domain_context_mapping_one(domain,
1694                                                  pci_domain_nr(parent->bus),
1695                                                  parent->bus->number,
1696                                                  parent->devfn, translation);
1697                 if (ret)
1698                         return ret;
1699                 parent = parent->bus->self;
1700         }
1701         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1702                 return domain_context_mapping_one(domain,
1703                                         pci_domain_nr(tmp->subordinate),
1704                                         tmp->subordinate->number, 0,
1705                                         translation);
1706         else /* this is a legacy PCI bridge */
1707                 return domain_context_mapping_one(domain,
1708                                                   pci_domain_nr(tmp->bus),
1709                                                   tmp->bus->number,
1710                                                   tmp->devfn,
1711                                                   translation);
1712 }
1713
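/*
 * Check whether context entries already exist for the device and for every
 * bridge on its path up to the topmost PCIe-to-PCI bridge (if any).
 */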
1714 static int domain_context_mapped(struct pci_dev *pdev)
1715 {
1716         int ret;
1717         struct pci_dev *tmp, *parent;
1718         struct intel_iommu *iommu;
1719
1720         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1721                                 pdev->devfn);
1722         if (!iommu)
1723                 return -ENODEV;
1724
1725         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1726         if (!ret)
1727                 return ret;
1728         /* dependent device mapping */
1729         tmp = pci_find_upstream_pcie_bridge(pdev);
1730         if (!tmp)
1731                 return ret;
1732         /* Secondary interface's bus number and devfn 0 */
1733         parent = pdev->bus->self;
1734         while (parent != tmp) {
1735                 ret = device_context_mapped(iommu, parent->bus->number,
1736                                             parent->devfn);
1737                 if (!ret)
1738                         return ret;
1739                 parent = parent->bus->self;
1740         }
1741         if (pci_is_pcie(tmp))
1742                 return device_context_mapped(iommu, tmp->subordinate->number,
1743                                              0);
1744         else
1745                 return device_context_mapped(iommu, tmp->bus->number,
1746                                              tmp->devfn);
1747 }
1748
1749 /* Returns a number of VTD pages, but aligned to MM page size */
1750 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1751                                             size_t size)
1752 {
1753         host_addr &= ~PAGE_MASK;
1754         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1755 }
1756
1757 /* Return largest possible superpage level for a given mapping */
1758 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1759                                           unsigned long iov_pfn,
1760                                           unsigned long phy_pfn,
1761                                           unsigned long pages)
1762 {
1763         int support, level = 1;
1764         unsigned long pfnmerge;
1765
1766         support = domain->iommu_superpage;
1767
1768         /* To use a large page, the virtual *and* physical addresses
1769            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1770            of them will mean we have to use smaller pages. So just
1771            merge them and check both at once. */
1772         pfnmerge = iov_pfn | phy_pfn;
1773
1774         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1775                 pages >>= VTD_STRIDE_SHIFT;
1776                 if (!pages)
1777                         break;
1778                 pfnmerge >>= VTD_STRIDE_SHIFT;
1779                 level++;
1780                 support--;
1781         }
1782         return level;
1783 }
1784
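/*
 * Populate the PTEs covering nr_pages starting at iov_pfn, taking physical
 * addresses either from the scatterlist (sg != NULL) or from the contiguous
 * range starting at phys_pfn.  Superpage PTEs are used whenever alignment and
 * remaining length allow it; the CPU cache is flushed as each page of PTEs is
 * completed.
 */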
1785 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1786                             struct scatterlist *sg, unsigned long phys_pfn,
1787                             unsigned long nr_pages, int prot)
1788 {
1789         struct dma_pte *first_pte = NULL, *pte = NULL;
1790         phys_addr_t uninitialized_var(pteval);
1791         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1792         unsigned long sg_res;
1793         unsigned int largepage_lvl = 0;
1794         unsigned long lvl_pages = 0;
1795
1796         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1797
1798         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1799                 return -EINVAL;
1800
1801         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1802
1803         if (sg)
1804                 sg_res = 0;
1805         else {
1806                 sg_res = nr_pages + 1;
1807                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1808         }
1809
1810         while (nr_pages > 0) {
1811                 uint64_t tmp;
1812
1813                 if (!sg_res) {
1814                         sg_res = aligned_nrpages(sg->offset, sg->length);
1815                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1816                         sg->dma_length = sg->length;
1817                         pteval = page_to_phys(sg_page(sg)) | prot;
1818                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1819                 }
1820
1821                 if (!pte) {
1822                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1823
1824                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1825                         if (!pte)
1826                                 return -ENOMEM;
1827                         /* It is a large page */
1828                         if (largepage_lvl > 1)
1829                                 pteval |= DMA_PTE_LARGE_PAGE;
1830                         else
1831                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1832
1833                 }
1834                 /* We don't need a lock here; nobody else
1835                  * touches the iova range.
1836                  */
1837                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1838                 if (tmp) {
1839                         static int dumps = 5;
1840                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1841                                iov_pfn, tmp, (unsigned long long)pteval);
1842                         if (dumps) {
1843                                 dumps--;
1844                                 debug_dma_dump_mappings(NULL);
1845                         }
1846                         WARN_ON(1);
1847                 }
1848
1849                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1850
1851                 BUG_ON(nr_pages < lvl_pages);
1852                 BUG_ON(sg_res < lvl_pages);
1853
1854                 nr_pages -= lvl_pages;
1855                 iov_pfn += lvl_pages;
1856                 phys_pfn += lvl_pages;
1857                 pteval += lvl_pages * VTD_PAGE_SIZE;
1858                 sg_res -= lvl_pages;
1859
1860                 /* If the next PTE would be the first in a new page, then we
1861                    need to flush the cache on the entries we've just written.
1862                    And then we'll need to recalculate 'pte', so clear it and
1863                    let it get set again in the if (!pte) block above.
1864
1865                    If we're done (!nr_pages) we need to flush the cache too.
1866
1867                    Also if we've been setting superpages, we may need to
1868                    recalculate 'pte' and switch back to smaller pages for the
1869                    end of the mapping, if the trailing size is not enough to
1870                    use another superpage (i.e. sg_res < lvl_pages). */
1871                 pte++;
1872                 if (!nr_pages || first_pte_in_page(pte) ||
1873                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1874                         domain_flush_cache(domain, first_pte,
1875                                            (void *)pte - (void *)first_pte);
1876                         pte = NULL;
1877                 }
1878
1879                 if (!sg_res && nr_pages)
1880                         sg = sg_next(sg);
1881         }
1882         return 0;
1883 }
1884
1885 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1886                                     struct scatterlist *sg, unsigned long nr_pages,
1887                                     int prot)
1888 {
1889         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1890 }
1891
1892 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1893                                      unsigned long phys_pfn, unsigned long nr_pages,
1894                                      int prot)
1895 {
1896         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1897 }
1898
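/*
 * Tear down the context entry for a single device and invalidate the context
 * and IOTLB caches globally on that iommu.
 */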
1899 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1900 {
1901         if (!iommu)
1902                 return;
1903
1904         clear_context_table(iommu, bus, devfn);
1905         iommu->flush.flush_context(iommu, 0, 0, 0,
1906                                            DMA_CCMD_GLOBAL_INVL);
1907         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1908 }
1909
1910 static inline void unlink_domain_info(struct device_domain_info *info)
1911 {
1912         assert_spin_locked(&device_domain_lock);
1913         list_del(&info->link);
1914         list_del(&info->global);
1915         if (info->dev)
1916                 info->dev->dev.archdata.iommu = NULL;
1917 }
1918
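/*
 * Detach every device currently attached to the domain: unlink its
 * device_domain_info, disable its device-IOTLB and clear its context entry.
 */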
1919 static void domain_remove_dev_info(struct dmar_domain *domain)
1920 {
1921         struct device_domain_info *info;
1922         unsigned long flags;
1923         struct intel_iommu *iommu;
1924
1925         spin_lock_irqsave(&device_domain_lock, flags);
1926         while (!list_empty(&domain->devices)) {
1927                 info = list_entry(domain->devices.next,
1928                         struct device_domain_info, link);
1929                 unlink_domain_info(info);
1930                 spin_unlock_irqrestore(&device_domain_lock, flags);
1931
1932                 iommu_disable_dev_iotlb(info);
1933                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1934                 iommu_detach_dev(iommu, info->bus, info->devfn);
1935                 free_devinfo_mem(info);
1936
1937                 spin_lock_irqsave(&device_domain_lock, flags);
1938         }
1939         spin_unlock_irqrestore(&device_domain_lock, flags);
1940 }
1941
1942 /*
1943  * find_domain
1944  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1945  */
1946 static struct dmar_domain *
1947 find_domain(struct pci_dev *pdev)
1948 {
1949         struct device_domain_info *info;
1950
1951         /* No lock here, assumes no domain exit in normal case */
1952         info = pdev->dev.archdata.iommu;
1953         if (info)
1954                 return info->domain;
1955         return NULL;
1956 }
1957
1958 /* Find or allocate the domain for a device; the returned domain is initialized */
1959 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1960 {
1961         struct dmar_domain *domain, *found = NULL;
1962         struct intel_iommu *iommu;
1963         struct dmar_drhd_unit *drhd;
1964         struct device_domain_info *info, *tmp;
1965         struct pci_dev *dev_tmp;
1966         unsigned long flags;
1967         int bus = 0, devfn = 0;
1968         int segment;
1969         int ret;
1970
1971         domain = find_domain(pdev);
1972         if (domain)
1973                 return domain;
1974
1975         segment = pci_domain_nr(pdev->bus);
1976
1977         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1978         if (dev_tmp) {
1979                 if (pci_is_pcie(dev_tmp)) {
1980                         bus = dev_tmp->subordinate->number;
1981                         devfn = 0;
1982                 } else {
1983                         bus = dev_tmp->bus->number;
1984                         devfn = dev_tmp->devfn;
1985                 }
1986                 spin_lock_irqsave(&device_domain_lock, flags);
1987                 list_for_each_entry(info, &device_domain_list, global) {
1988                         if (info->segment == segment &&
1989                             info->bus == bus && info->devfn == devfn) {
1990                                 found = info->domain;
1991                                 break;
1992                         }
1993                 }
1994                 spin_unlock_irqrestore(&device_domain_lock, flags);
1995                 /* pcie-pci bridge already has a domain, use it */
1996                 if (found) {
1997                         domain = found;
1998                         goto found_domain;
1999                 }
2000         }
2001
2002         domain = alloc_domain();
2003         if (!domain)
2004                 goto error;
2005
2006         /* Allocate new domain for the device */
2007         drhd = dmar_find_matched_drhd_unit(pdev);
2008         if (!drhd) {
2009                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2010                         pci_name(pdev));
2011                 free_domain_mem(domain);
2012                 return NULL;
2013         }
2014         iommu = drhd->iommu;
2015
2016         ret = iommu_attach_domain(domain, iommu);
2017         if (ret) {
2018                 free_domain_mem(domain);
2019                 goto error;
2020         }
2021
2022         if (domain_init(domain, gaw)) {
2023                 domain_exit(domain);
2024                 goto error;
2025         }
2026
2027         /* register the pcie-to-pci bridge so devices behind it share the domain */
2028         if (dev_tmp) {
2029                 info = alloc_devinfo_mem();
2030                 if (!info) {
2031                         domain_exit(domain);
2032                         goto error;
2033                 }
2034                 info->segment = segment;
2035                 info->bus = bus;
2036                 info->devfn = devfn;
2037                 info->dev = NULL;
2038                 info->domain = domain;
2039                 /* This domain is shared by devices under p2p bridge */
2040                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2041
2042                 /* pcie-to-pci bridge already has a domain, use it */
2043                 found = NULL;
2044                 spin_lock_irqsave(&device_domain_lock, flags);
2045                 list_for_each_entry(tmp, &device_domain_list, global) {
2046                         if (tmp->segment == segment &&
2047                             tmp->bus == bus && tmp->devfn == devfn) {
2048                                 found = tmp->domain;
2049                                 break;
2050                         }
2051                 }
2052                 if (found) {
2053                         spin_unlock_irqrestore(&device_domain_lock, flags);
2054                         free_devinfo_mem(info);
2055                         domain_exit(domain);
2056                         domain = found;
2057                 } else {
2058                         list_add(&info->link, &domain->devices);
2059                         list_add(&info->global, &device_domain_list);
2060                         spin_unlock_irqrestore(&device_domain_lock, flags);
2061                 }
2062         }
2063
2064 found_domain:
2065         info = alloc_devinfo_mem();
2066         if (!info)
2067                 goto error;
2068         info->segment = segment;
2069         info->bus = pdev->bus->number;
2070         info->devfn = pdev->devfn;
2071         info->dev = pdev;
2072         info->domain = domain;
2073         spin_lock_irqsave(&device_domain_lock, flags);
2074         /* somebody else was faster and already set up the domain */
2075         found = find_domain(pdev);
2076         if (found != NULL) {
2077                 spin_unlock_irqrestore(&device_domain_lock, flags);
2078                 if (found != domain) {
2079                         domain_exit(domain);
2080                         domain = found;
2081                 }
2082                 free_devinfo_mem(info);
2083                 return domain;
2084         }
2085         list_add(&info->link, &domain->devices);
2086         list_add(&info->global, &device_domain_list);
2087         pdev->dev.archdata.iommu = info;
2088         spin_unlock_irqrestore(&device_domain_lock, flags);
2089         return domain;
2090 error:
2091         /* recheck it here, maybe others set it */
2092         return find_domain(pdev);
2093 }
2094
2095 static int iommu_identity_mapping;
2096 #define IDENTMAP_ALL            1
2097 #define IDENTMAP_GFX            2
2098 #define IDENTMAP_AZALIA         4
2099
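/*
 * Reserve the IOVA range [start, end] in the domain and install a 1:1
 * (identity) mapping for it, clearing any PTEs that already cover the range.
 */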
2100 static int iommu_domain_identity_map(struct dmar_domain *domain,
2101                                      unsigned long long start,
2102                                      unsigned long long end)
2103 {
2104         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2105         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2106
2107         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2108                           dma_to_mm_pfn(last_vpfn))) {
2109                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2110                 return -ENOMEM;
2111         }
2112
2113         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2114                  start, end, domain->id);
2115         /*
2116          * RMRR range might have overlap with physical memory range,
2117          * clear it first
2118          */
2119         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2120
2121         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2122                                   last_vpfn - first_vpfn + 1,
2123                                   DMA_PTE_READ|DMA_PTE_WRITE);
2124 }
2125
2126 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2127                                       unsigned long long start,
2128                                       unsigned long long end)
2129 {
2130         struct dmar_domain *domain;
2131         int ret;
2132
2133         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2134         if (!domain)
2135                 return -ENOMEM;
2136
2137         /* For _hardware_ passthrough, don't bother. But for software
2138            passthrough, we do it anyway -- it may indicate a memory
2139            range which is reserved in E820 and so didn't get set
2140            up in si_domain to start with */
2141         if (domain == si_domain && hw_pass_through) {
2142                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2143                        pci_name(pdev), start, end);
2144                 return 0;
2145         }
2146
2147         printk(KERN_INFO
2148                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2149                pci_name(pdev), start, end);
2150
2151         if (end < start) {
2152                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2153                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2154                         dmi_get_system_info(DMI_BIOS_VENDOR),
2155                         dmi_get_system_info(DMI_BIOS_VERSION),
2156                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2157                 ret = -EIO;
2158                 goto error;
2159         }
2160
2161         if (end >> agaw_to_width(domain->agaw)) {
2162                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2163                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2164                      agaw_to_width(domain->agaw),
2165                      dmi_get_system_info(DMI_BIOS_VENDOR),
2166                      dmi_get_system_info(DMI_BIOS_VERSION),
2167                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2168                 ret = -EIO;
2169                 goto error;
2170         }
2171
2172         ret = iommu_domain_identity_map(domain, start, end);
2173         if (ret)
2174                 goto error;
2175
2176         /* context entry init */
2177         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2178         if (ret)
2179                 goto error;
2180
2181         return 0;
2182
2183  error:
2184         domain_exit(domain);
2185         return ret;
2186 }
2187
2188 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2189         struct pci_dev *pdev)
2190 {
2191         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2192                 return 0;
2193         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2194                 rmrr->end_address);
2195 }
2196
2197 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2198 static inline void iommu_prepare_isa(void)
2199 {
2200         struct pci_dev *pdev;
2201         int ret;
2202
2203         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2204         if (!pdev)
2205                 return;
2206
2207         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2208         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2209
2210         if (ret)
2211                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2212                        "floppy might not work\n");
2213
2214 }
2215 #else
2216 static inline void iommu_prepare_isa(void)
2217 {
2218         return;
2219 }
2220 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2221
2222 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2223
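/*
 * Create the static identity (si) domain, attach it to every active iommu
 * and, unless hardware pass-through is used, identity-map all usable
 * physical memory into it.
 */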
2224 static int __init si_domain_init(int hw)
2225 {
2226         struct dmar_drhd_unit *drhd;
2227         struct intel_iommu *iommu;
2228         int nid, ret = 0;
2229
2230         si_domain = alloc_domain();
2231         if (!si_domain)
2232                 return -EFAULT;
2233
2234         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2235
2236         for_each_active_iommu(iommu, drhd) {
2237                 ret = iommu_attach_domain(si_domain, iommu);
2238                 if (ret) {
2239                         domain_exit(si_domain);
2240                         return -EFAULT;
2241                 }
2242         }
2243
2244         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2245                 domain_exit(si_domain);
2246                 return -EFAULT;
2247         }
2248
2249         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2250
2251         if (hw)
2252                 return 0;
2253
2254         for_each_online_node(nid) {
2255                 unsigned long start_pfn, end_pfn;
2256                 int i;
2257
2258                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2259                         ret = iommu_domain_identity_map(si_domain,
2260                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2261                         if (ret)
2262                                 return ret;
2263                 }
2264         }
2265
2266         return 0;
2267 }
2268
2269 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2270                                           struct pci_dev *pdev);
2271 static int identity_mapping(struct pci_dev *pdev)
2272 {
2273         struct device_domain_info *info;
2274
2275         if (likely(!iommu_identity_mapping))
2276                 return 0;
2277
2278         info = pdev->dev.archdata.iommu;
2279         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2280                 return (info->domain == si_domain);
2281
2282         return 0;
2283 }
2284
2285 static int domain_add_dev_info(struct dmar_domain *domain,
2286                                struct pci_dev *pdev,
2287                                int translation)
2288 {
2289         struct device_domain_info *info;
2290         unsigned long flags;
2291         int ret;
2292
2293         info = alloc_devinfo_mem();
2294         if (!info)
2295                 return -ENOMEM;
2296
2297         info->segment = pci_domain_nr(pdev->bus);
2298         info->bus = pdev->bus->number;
2299         info->devfn = pdev->devfn;
2300         info->dev = pdev;
2301         info->domain = domain;
2302
2303         spin_lock_irqsave(&device_domain_lock, flags);
2304         list_add(&info->link, &domain->devices);
2305         list_add(&info->global, &device_domain_list);
2306         pdev->dev.archdata.iommu = info;
2307         spin_unlock_irqrestore(&device_domain_lock, flags);
2308
2309         ret = domain_context_mapping(domain, pdev, translation);
2310         if (ret) {
2311                 spin_lock_irqsave(&device_domain_lock, flags);
2312                 unlink_domain_info(info);
2313                 spin_unlock_irqrestore(&device_domain_lock, flags);
2314                 free_devinfo_mem(info);
2315                 return ret;
2316         }
2317
2318         return 0;
2319 }
2320
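/*
 * Decide whether a device should be placed in the static identity (1:1)
 * domain, based on the identity-mapping policy, the device's position in
 * the PCI topology and (after boot) its DMA mask.
 */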
2321 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2322 {
2323         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2324                 return 1;
2325
2326         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2327                 return 1;
2328
2329         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2330                 return 0;
2331
2332         /*
2333          * We want to start off with all devices in the 1:1 domain, and
2334          * take them out later if we find they can't access all of memory.
2335          *
2336          * However, we can't do this for PCI devices behind bridges,
2337          * because all PCI devices behind the same bridge will end up
2338          * with the same source-id on their transactions.
2339          *
2340          * Practically speaking, we can't change things around for these
2341          * devices at run-time, because we can't be sure there'll be no
2342          * DMA transactions in flight for any of their siblings.
2343          * 
2344          * So PCI devices (unless they're on the root bus) as well as
2345          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2346          * the 1:1 domain, just in _case_ one of their siblings turns out
2347          * not to be able to map all of memory.
2348          */
2349         if (!pci_is_pcie(pdev)) {
2350                 if (!pci_is_root_bus(pdev->bus))
2351                         return 0;
2352                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2353                         return 0;
2354         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2355                 return 0;
2356
2357         /* 
2358          * At boot time, we don't yet know if devices will be 64-bit capable.
2359          * Assume that they will -- if they turn out not to be, then we can 
2360          * take them out of the 1:1 domain later.
2361          */
2362         if (!startup) {
2363                 /*
2364                  * If the device's dma_mask is less than the system's memory
2365                  * size then this is not a candidate for identity mapping.
2366                  */
2367                 u64 dma_mask = pdev->dma_mask;
2368
2369                 if (pdev->dev.coherent_dma_mask &&
2370                     pdev->dev.coherent_dma_mask < dma_mask)
2371                         dma_mask = pdev->dev.coherent_dma_mask;
2372
2373                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2374         }
2375
2376         return 1;
2377 }
2378
2379 static int __init iommu_prepare_static_identity_mapping(int hw)
2380 {
2381         struct pci_dev *pdev = NULL;
2382         int ret;
2383
2384         ret = si_domain_init(hw);
2385         if (ret)
2386                 return -EFAULT;
2387
2388         for_each_pci_dev(pdev) {
2389                 if (iommu_should_identity_map(pdev, 1)) {
2390                         ret = domain_add_dev_info(si_domain, pdev,
2391                                              hw ? CONTEXT_TT_PASS_THROUGH :
2392                                                   CONTEXT_TT_MULTI_LEVEL);
2393                         if (ret) {
2394                                 /* device not associated with an iommu */
2395                                 if (ret == -ENODEV)
2396                                         continue;
2397                                 return ret;
2398                         }
2399                         pr_info("IOMMU: %s identity mapping for device %s\n",
2400                                 hw ? "hardware" : "software", pci_name(pdev));
2401                 }
2402         }
2403
2404         return 0;
2405 }
2406
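/*
 * Main initialization: allocate per-iommu state and root/context tables,
 * pick an invalidation method, set up identity, RMRR and ISA mappings, and
 * finally enable DMA translation on every DRHD unit.
 */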
2407 static int __init init_dmars(void)
2408 {
2409         struct dmar_drhd_unit *drhd;
2410         struct dmar_rmrr_unit *rmrr;
2411         struct pci_dev *pdev;
2412         struct intel_iommu *iommu;
2413         int i, ret;
2414
2415         /*
2416          * for each drhd
2417          *    allocate root
2418          *    initialize and program root entry to not present
2419          * endfor
2420          */
2421         for_each_drhd_unit(drhd) {
2422                 /*
2423                  * lock not needed as this is only incremented in the
2424                  * single-threaded kernel __init code path; all other
2425                  * accesses are read-only
2426                  */
2427                 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2428                         g_num_of_iommus++;
2429                         continue;
2430                 }
2431                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2432                           IOMMU_UNITS_SUPPORTED);
2433         }
2434
2435         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2436                         GFP_KERNEL);
2437         if (!g_iommus) {
2438                 printk(KERN_ERR "Allocating global iommu array failed\n");
2439                 ret = -ENOMEM;
2440                 goto error;
2441         }
2442
2443         deferred_flush = kzalloc(g_num_of_iommus *
2444                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2445         if (!deferred_flush) {
2446                 ret = -ENOMEM;
2447                 goto error;
2448         }
2449
2450         for_each_drhd_unit(drhd) {
2451                 if (drhd->ignored)
2452                         continue;
2453
2454                 iommu = drhd->iommu;
2455                 g_iommus[iommu->seq_id] = iommu;
2456
2457                 ret = iommu_init_domains(iommu);
2458                 if (ret)
2459                         goto error;
2460
2461                 /*
2462                  * TBD:
2463                  * we could share the same root & context tables
2464                  * among all IOMMUs; need to split it later.
2465                  */
2466                 ret = iommu_alloc_root_entry(iommu);
2467                 if (ret) {
2468                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2469                         goto error;
2470                 }
2471                 if (!ecap_pass_through(iommu->ecap))
2472                         hw_pass_through = 0;
2473         }
2474
2475         /*
2476          * Start from a sane iommu hardware state.
2477          */
2478         for_each_drhd_unit(drhd) {
2479                 if (drhd->ignored)
2480                         continue;
2481
2482                 iommu = drhd->iommu;
2483
2484                 /*
2485                  * If the queued invalidation is already initialized by us
2486                  * (for example, while enabling interrupt-remapping) then
2487                  * things are already rolling from a sane state.
2488                  */
2489                 if (iommu->qi)
2490                         continue;
2491
2492                 /*
2493                  * Clear any previous faults.
2494                  */
2495                 dmar_fault(-1, iommu);
2496                 /*
2497                  * Disable queued invalidation if supported and already enabled
2498                  * before OS handover.
2499                  */
2500                 dmar_disable_qi(iommu);
2501         }
2502
2503         for_each_drhd_unit(drhd) {
2504                 if (drhd->ignored)
2505                         continue;
2506
2507                 iommu = drhd->iommu;
2508
2509                 if (dmar_enable_qi(iommu)) {
2510                         /*
2511                          * Queued Invalidate not enabled, use Register Based
2512                          * Invalidate
2513                          */
2514                         iommu->flush.flush_context = __iommu_flush_context;
2515                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2516                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2517                                "invalidation\n",
2518                                 iommu->seq_id,
2519                                (unsigned long long)drhd->reg_base_addr);
2520                 } else {
2521                         iommu->flush.flush_context = qi_flush_context;
2522                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2523                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2524                                "invalidation\n",
2525                                 iommu->seq_id,
2526                                (unsigned long long)drhd->reg_base_addr);
2527                 }
2528         }
2529
2530         if (iommu_pass_through)
2531                 iommu_identity_mapping |= IDENTMAP_ALL;
2532
2533 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2534         iommu_identity_mapping |= IDENTMAP_GFX;
2535 #endif
2536
2537         check_tylersburg_isoch();
2538
2539         /*
2540          * If any identity mapping was requested (IDENTMAP_ALL, IDENTMAP_GFX
2541          * or IDENTMAP_AZALIA), set up the static identity domain and its
2542          * context entries now, using hardware pass-through when available.
2543          */
2544         if (iommu_identity_mapping) {
2545                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2546                 if (ret) {
2547                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2548                         goto error;
2549                 }
2550         }
2551         /*
2552          * For each rmrr
2553          *   for each dev attached to rmrr
2554          *   do
2555          *     locate drhd for dev, alloc domain for dev
2556          *     allocate free domain
2557          *     allocate page table entries for rmrr
2558          *     if context not allocated for bus
2559          *           allocate and init context
2560          *           set present in root table for this bus
2561          *     init context with domain, translation etc
2562          *    endfor
2563          * endfor
2564          */
2565         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2566         for_each_rmrr_units(rmrr) {
2567                 for (i = 0; i < rmrr->devices_cnt; i++) {
2568                         pdev = rmrr->devices[i];
2569                         /*
2570                          * some BIOSes list non-existent devices in the
2571                          * DMAR table.
2572                          */
2573                         if (!pdev)
2574                                 continue;
2575                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2576                         if (ret)
2577                                 printk(KERN_ERR
2578                                        "IOMMU: mapping reserved region failed\n");
2579                 }
2580         }
2581
2582         iommu_prepare_isa();
2583
2584         /*
2585          * for each drhd
2586          *   enable fault log
2587          *   global invalidate context cache
2588          *   global invalidate iotlb
2589          *   enable translation
2590          */
2591         for_each_drhd_unit(drhd) {
2592                 if (drhd->ignored) {
2593                         /*
2594                          * we always have to disable PMRs or DMA may fail on
2595                          * this device
2596                          */
2597                         if (force_on)
2598                                 iommu_disable_protect_mem_regions(drhd->iommu);
2599                         continue;
2600                 }
2601                 iommu = drhd->iommu;
2602
2603                 iommu_flush_write_buffer(iommu);
2604
2605                 ret = dmar_set_interrupt(iommu);
2606                 if (ret)
2607                         goto error;
2608
2609                 iommu_set_root_entry(iommu);
2610
2611                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2612                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2613
2614                 ret = iommu_enable_translation(iommu);
2615                 if (ret)
2616                         goto error;
2617
2618                 iommu_disable_protect_mem_regions(iommu);
2619         }
2620
2621         return 0;
2622 error:
2623         for_each_drhd_unit(drhd) {
2624                 if (drhd->ignored)
2625                         continue;
2626                 iommu = drhd->iommu;
2627                 free_iommu(iommu);
2628         }
2629         kfree(g_iommus);
2630         return ret;
2631 }
2632
2633 /* This takes a number of _MM_ pages, not VTD pages */
2634 static struct iova *intel_alloc_iova(struct device *dev,
2635                                      struct dmar_domain *domain,
2636                                      unsigned long nrpages, uint64_t dma_mask)
2637 {
2638         struct pci_dev *pdev = to_pci_dev(dev);
2639         struct iova *iova = NULL;
2640
2641         /* Restrict dma_mask to the width that the iommu can handle */
2642         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2643
2644         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2645                 /*
2646                  * First try to allocate an io virtual address in
2647                  * DMA_BIT_MASK(32) and if that fails then try allocating
2648                  * from the higher range.
2649                  */
2650                 iova = alloc_iova(&domain->iovad, nrpages,
2651                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2652                 if (iova)
2653                         return iova;
2654         }
2655         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2656         if (unlikely(!iova)) {
2657                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2658                        nrpages, pci_name(pdev));
2659                 return NULL;
2660         }
2661
2662         return iova;
2663 }
2664
2665 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2666 {
2667         struct dmar_domain *domain;
2668         int ret;
2669
2670         domain = get_domain_for_dev(pdev,
2671                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2672         if (!domain) {
2673                 printk(KERN_ERR
2674                         "Allocating domain for %s failed\n", pci_name(pdev));
2675                 return NULL;
2676         }
2677
2678         /* make sure context mapping is ok */
2679         if (unlikely(!domain_context_mapped(pdev))) {
2680                 ret = domain_context_mapping(domain, pdev,
2681                                              CONTEXT_TT_MULTI_LEVEL);
2682                 if (ret) {
2683                         printk(KERN_ERR
2684                                 "Domain context map for %s failed\n",
2685                                 pci_name(pdev));
2686                         return NULL;
2687                 }
2688         }
2689
2690         return domain;
2691 }
2692
2693 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2694 {
2695         struct device_domain_info *info;
2696
2697         /* No lock here, assumes no domain exit in normal case */
2698         info = dev->dev.archdata.iommu;
2699         if (likely(info))
2700                 return info->domain;
2701
2702         return __get_valid_domain_for_dev(dev);
2703 }
2704
2705 static int iommu_dummy(struct pci_dev *pdev)
2706 {
2707         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2708 }
2709
2710 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2711 static int iommu_no_mapping(struct device *dev)
2712 {
2713         struct pci_dev *pdev;
2714         int found;
2715
2716         if (unlikely(dev->bus != &pci_bus_type))
2717                 return 1;
2718
2719         pdev = to_pci_dev(dev);
2720         if (iommu_dummy(pdev))
2721                 return 1;
2722
2723         if (!iommu_identity_mapping)
2724                 return 0;
2725
2726         found = identity_mapping(pdev);
2727         if (found) {
2728                 if (iommu_should_identity_map(pdev, 0))
2729                         return 1;
2730                 else {
2731                         /*
2732                          * The 32-bit DMA device is removed from si_domain
2733                          * and falls back to non-identity mapping.
2734                          */
2735                         domain_remove_one_dev_info(si_domain, pdev);
2736                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2737                                pci_name(pdev));
2738                         return 0;
2739                 }
2740         } else {
2741                 /*
2742                  * If a 64-bit DMA device was detached from a VM, the device
2743                  * is put back into si_domain for identity mapping.
2744                  */
2745                 if (iommu_should_identity_map(pdev, 0)) {
2746                         int ret;
2747                         ret = domain_add_dev_info(si_domain, pdev,
2748                                                   hw_pass_through ?
2749                                                   CONTEXT_TT_PASS_THROUGH :
2750                                                   CONTEXT_TT_MULTI_LEVEL);
2751                         if (!ret) {
2752                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2753                                        pci_name(pdev));
2754                                 return 1;
2755                         }
2756                 }
2757         }
2758
2759         return 0;
2760 }
2761
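/*
 * Map a physically contiguous buffer for DMA: allocate an IOVA range that
 * fits the device's DMA mask, install the page-table entries and return the
 * resulting bus address (or paddr itself for identity-mapped devices).
 */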
2762 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2763                                      size_t size, int dir, u64 dma_mask)
2764 {
2765         struct pci_dev *pdev = to_pci_dev(hwdev);
2766         struct dmar_domain *domain;
2767         phys_addr_t start_paddr;
2768         struct iova *iova;
2769         int prot = 0;
2770         int ret;
2771         struct intel_iommu *iommu;
2772         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2773
2774         BUG_ON(dir == DMA_NONE);
2775
2776         if (iommu_no_mapping(hwdev))
2777                 return paddr;
2778
2779         domain = get_valid_domain_for_dev(pdev);
2780         if (!domain)
2781                 return 0;
2782
2783         iommu = domain_get_iommu(domain);
2784         size = aligned_nrpages(paddr, size);
2785
2786         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2787         if (!iova)
2788                 goto error;
2789
2790         /*
2791          * Check if DMAR supports zero-length reads on write-only
2792          * mappings.
2793          */
2794         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2795                         !cap_zlr(iommu->cap))
2796                 prot |= DMA_PTE_READ;
2797         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2798                 prot |= DMA_PTE_WRITE;
2799         /*
2800          * paddr to (paddr + size) might be a partial page; we should map the
2801          * whole page.  Note: if two parts of one page are mapped separately, we
2802          * might have two guest addresses mapping to the same host paddr, but
2803          * this is not a big problem
2804          */
2805         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2806                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2807         if (ret)
2808                 goto error;
2809
2810         /* it's a non-present to present mapping. Only flush if caching mode */
2811         if (cap_caching_mode(iommu->cap))
2812                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2813         else
2814                 iommu_flush_write_buffer(iommu);
2815
2816         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2817         start_paddr += paddr & ~PAGE_MASK;
2818         return start_paddr;
2819
2820 error:
2821         if (iova)
2822                 __free_iova(&domain->iovad, iova);
2823         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2824                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2825         return 0;
2826 }
2827
2828 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2829                                  unsigned long offset, size_t size,
2830                                  enum dma_data_direction dir,
2831                                  struct dma_attrs *attrs)
2832 {
2833         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2834                                   dir, to_pci_dev(dev)->dma_mask);
2835 }
2836
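/*
 * Drain the deferred-unmap queues: invalidate the IOTLB entries for the
 * queued ranges and free their IOVAs.  Caller holds async_umap_flush_lock.
 */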
2837 static void flush_unmaps(void)
2838 {
2839         int i, j;
2840
2841         timer_on = 0;
2842
2843         /* just flush them all */
2844         for (i = 0; i < g_num_of_iommus; i++) {
2845                 struct intel_iommu *iommu = g_iommus[i];
2846                 if (!iommu)
2847                         continue;
2848
2849                 if (!deferred_flush[i].next)
2850                         continue;
2851
2852                 /* In caching mode, global flushes make emulation expensive */
2853                 if (!cap_caching_mode(iommu->cap))
2854                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2855                                          DMA_TLB_GLOBAL_FLUSH);
2856                 for (j = 0; j < deferred_flush[i].next; j++) {
2857                         unsigned long mask;
2858                         struct iova *iova = deferred_flush[i].iova[j];
2859                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2860
2861                         /* On real hardware multiple invalidations are expensive */
2862                         if (cap_caching_mode(iommu->cap))
2863                                 iommu_flush_iotlb_psi(iommu, domain->id,
2864                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2865                         else {
2866                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2867                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2868                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2869                         }
2870                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2871                 }
2872                 deferred_flush[i].next = 0;
2873         }
2874
2875         list_size = 0;
2876 }
2877
2878 static void flush_unmaps_timeout(unsigned long data)
2879 {
2880         unsigned long flags;
2881
2882         spin_lock_irqsave(&async_umap_flush_lock, flags);
2883         flush_unmaps();
2884         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2885 }
2886
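/*
 * Queue an IOVA for deferred freeing.  The queue is drained either when it
 * reaches HIGH_WATER_MARK entries or when the unmap timer fires (10ms),
 * batching the expensive IOTLB flushes.
 */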
2887 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2888 {
2889         unsigned long flags;
2890         int next, iommu_id;
2891         struct intel_iommu *iommu;
2892
2893         spin_lock_irqsave(&async_umap_flush_lock, flags);
2894         if (list_size == HIGH_WATER_MARK)
2895                 flush_unmaps();
2896
2897         iommu = domain_get_iommu(dom);
2898         iommu_id = iommu->seq_id;
2899
2900         next = deferred_flush[iommu_id].next;
2901         deferred_flush[iommu_id].domain[next] = dom;
2902         deferred_flush[iommu_id].iova[next] = iova;
2903         deferred_flush[iommu_id].next++;
2904
2905         if (!timer_on) {
2906                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2907                 timer_on = 1;
2908         }
2909         list_size++;
2910         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2911 }
2912
2913 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2914                              size_t size, enum dma_data_direction dir,
2915                              struct dma_attrs *attrs)
2916 {
2917         struct pci_dev *pdev = to_pci_dev(dev);
2918         struct dmar_domain *domain;
2919         unsigned long start_pfn, last_pfn;
2920         struct iova *iova;
2921         struct intel_iommu *iommu;
2922
2923         if (iommu_no_mapping(dev))
2924                 return;
2925
2926         domain = find_domain(pdev);
2927         BUG_ON(!domain);
2928
2929         iommu = domain_get_iommu(domain);
2930
2931         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2932         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2933                       (unsigned long long)dev_addr))
2934                 return;
2935
2936         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2937         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2938
2939         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2940                  pci_name(pdev), start_pfn, last_pfn);
2941
2942         /*  clear the whole page */
2943         dma_pte_clear_range(domain, start_pfn, last_pfn);
2944
2945         /* free page tables */
2946         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2947
2948         if (intel_iommu_strict) {
2949                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2950                                       last_pfn - start_pfn + 1, 0);
2951                 /* free iova */
2952                 __free_iova(&domain->iovad, iova);
2953         } else {
2954                 add_unmap(domain, iova);
2955                 /*
2956                  * queue up the release of the unmap to save the 1/6th of the
2957                  * cpu used up by the iotlb flush operation...
2958                  */
2959         }
2960 }
2961
2962 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2963                                   dma_addr_t *dma_handle, gfp_t flags,
2964                                   struct dma_attrs *attrs)
2965 {
2966         void *vaddr;
2967         int order;
2968
2969         size = PAGE_ALIGN(size);
2970         order = get_order(size);
2971
2972         if (!iommu_no_mapping(hwdev))
2973                 flags &= ~(GFP_DMA | GFP_DMA32);
2974         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2975                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2976                         flags |= GFP_DMA;
2977                 else
2978                         flags |= GFP_DMA32;
2979         }
2980
2981         vaddr = (void *)__get_free_pages(flags, order);
2982         if (!vaddr)
2983                 return NULL;
2984         memset(vaddr, 0, size);
2985
2986         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2987                                          DMA_BIDIRECTIONAL,
2988                                          hwdev->coherent_dma_mask);
2989         if (*dma_handle)
2990                 return vaddr;
2991         free_pages((unsigned long)vaddr, order);
2992         return NULL;
2993 }
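/*
 * Note on the GFP handling above: when the device really sits behind
 * the IOMMU, GFP_DMA/GFP_DMA32 are stripped because the IOMMU can map
 * any physical page into the device's DMA window.  Only in the
 * identity/pass-through (no-mapping) case are zone restrictions derived
 * from coherent_dma_mask.
 */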
2994
2995 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2996                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
2997 {
2998         int order;
2999
3000         size = PAGE_ALIGN(size);
3001         order = get_order(size);
3002
3003         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3004         free_pages((unsigned long)vaddr, order);
3005 }
3006
3007 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3008                            int nelems, enum dma_data_direction dir,
3009                            struct dma_attrs *attrs)
3010 {
3011         struct pci_dev *pdev = to_pci_dev(hwdev);
3012         struct dmar_domain *domain;
3013         unsigned long start_pfn, last_pfn;
3014         struct iova *iova;
3015         struct intel_iommu *iommu;
3016
3017         if (iommu_no_mapping(hwdev))
3018                 return;
3019
3020         domain = find_domain(pdev);
3021         BUG_ON(!domain);
3022
3023         iommu = domain_get_iommu(domain);
3024
3025         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3026         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3027                       (unsigned long long)sglist[0].dma_address))
3028                 return;
3029
3030         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3031         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3032
3033         /*  clear the whole page */
3034         dma_pte_clear_range(domain, start_pfn, last_pfn);
3035
3036         /* free page tables */
3037         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3038
3039         if (intel_iommu_strict) {
3040                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3041                                       last_pfn - start_pfn + 1, 0);
3042                 /* free iova */
3043                 __free_iova(&domain->iovad, iova);
3044         } else {
3045                 add_unmap(domain, iova);
3046                 /*
3047                  * queue up the release of the unmap to save roughly 1/6th of
3048                  * the CPU time otherwise spent on the IOTLB flush operation.
3049                  */
3050         }
3051 }
3052
3053 static int intel_nontranslate_map_sg(struct device *hwdev,
3054         struct scatterlist *sglist, int nelems, int dir)
3055 {
3056         int i;
3057         struct scatterlist *sg;
3058
3059         for_each_sg(sglist, sg, nelems, i) {
3060                 BUG_ON(!sg_page(sg));
3061                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3062                 sg->dma_length = sg->length;
3063         }
3064         return nelems;
3065 }
3066
3067 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3068                         enum dma_data_direction dir, struct dma_attrs *attrs)
3069 {
3070         int i;
3071         struct pci_dev *pdev = to_pci_dev(hwdev);
3072         struct dmar_domain *domain;
3073         size_t size = 0;
3074         int prot = 0;
3075         struct iova *iova = NULL;
3076         int ret;
3077         struct scatterlist *sg;
3078         unsigned long start_vpfn;
3079         struct intel_iommu *iommu;
3080
3081         BUG_ON(dir == DMA_NONE);
3082         if (iommu_no_mapping(hwdev))
3083                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3084
3085         domain = get_valid_domain_for_dev(pdev);
3086         if (!domain)
3087                 return 0;
3088
3089         iommu = domain_get_iommu(domain);
3090
3091         for_each_sg(sglist, sg, nelems, i)
3092                 size += aligned_nrpages(sg->offset, sg->length);
3093
3094         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3095                                 pdev->dma_mask);
3096         if (!iova) {
3097                 sglist->dma_length = 0;
3098                 return 0;
3099         }
3100
3101         /*
3102          * Check if the DMAR unit supports zero-length reads on write-only
3103          * mappings.
3104          */
3105         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3106                         !cap_zlr(iommu->cap))
3107                 prot |= DMA_PTE_READ;
3108         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3109                 prot |= DMA_PTE_WRITE;
3110
3111         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3112
3113         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3114         if (unlikely(ret)) {
3115                 /*  clear the page */
3116                 dma_pte_clear_range(domain, start_vpfn,
3117                                     start_vpfn + size - 1);
3118                 /* free page tables */
3119                 dma_pte_free_pagetable(domain, start_vpfn,
3120                                        start_vpfn + size - 1);
3121                 /* free iova */
3122                 __free_iova(&domain->iovad, iova);
3123                 return 0;
3124         }
3125
3126         /* it's a non-present to present mapping. Only flush if caching mode */
3127         if (cap_caching_mode(iommu->cap))
3128                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3129         else
3130                 iommu_flush_write_buffer(iommu);
3131
3132         return nelems;
3133 }
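/*
 * intel_map_sg() above allocates a single IOVA range covering the whole
 * scatterlist (each element rounded up by aligned_nrpages()) and maps it
 * with one domain_sg_mapping() call.  On failure it unwinds completely --
 * PTEs, page tables and the IOVA -- and returns 0, which the DMA API
 * treats as "no elements mapped".
 */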
3134
3135 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3136 {
3137         return !dma_addr;
3138 }
3139
3140 struct dma_map_ops intel_dma_ops = {
3141         .alloc = intel_alloc_coherent,
3142         .free = intel_free_coherent,
3143         .map_sg = intel_map_sg,
3144         .unmap_sg = intel_unmap_sg,
3145         .map_page = intel_map_page,
3146         .unmap_page = intel_unmap_page,
3147         .mapping_error = intel_mapping_error,
3148 };
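/*
 * These ops are installed as the global dma_ops in intel_iommu_init(),
 * so ordinary drivers reach them through the generic DMA API.  A rough,
 * purely illustrative caller (hypothetical driver code, not part of this
 * file; 'pdev' and 'page' are placeholders) might look like:
 *
 *	dma_addr_t dma = dma_map_page(&pdev->dev, page, 0, PAGE_SIZE,
 *				      DMA_TO_DEVICE);
 *	if (dma_mapping_error(&pdev->dev, dma))
 *		return -ENOMEM;
 *	...
 *	dma_unmap_page(&pdev->dev, dma, PAGE_SIZE, DMA_TO_DEVICE);
 *
 * dma_map_page()/dma_unmap_page() dispatch to .map_page/.unmap_page here,
 * and dma_mapping_error() to .mapping_error.
 */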
3149
3150 static inline int iommu_domain_cache_init(void)
3151 {
3152         int ret = 0;
3153
3154         iommu_domain_cache = kmem_cache_create("iommu_domain",
3155                                          sizeof(struct dmar_domain),
3156                                          0,
3157                                          SLAB_HWCACHE_ALIGN,
3158                                          NULL);
3159
3160         if (!iommu_domain_cache) {
3161                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3162                 ret = -ENOMEM;
3163         }
3164
3165         return ret;
3166 }
3167
3168 static inline int iommu_devinfo_cache_init(void)
3169 {
3170         int ret = 0;
3171
3172         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3173                                          sizeof(struct device_domain_info),
3174                                          0,
3175                                          SLAB_HWCACHE_ALIGN,
3176                                          NULL);
3177         if (!iommu_devinfo_cache) {
3178                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3179                 ret = -ENOMEM;
3180         }
3181
3182         return ret;
3183 }
3184
3185 static inline int iommu_iova_cache_init(void)
3186 {
3187         int ret = 0;
3188
3189         iommu_iova_cache = kmem_cache_create("iommu_iova",
3190                                          sizeof(struct iova),
3191                                          0,
3192                                          SLAB_HWCACHE_ALIGN,
3193                                          NULL);
3194         if (!iommu_iova_cache) {
3195                 printk(KERN_ERR "Couldn't create iova cache\n");
3196                 ret = -ENOMEM;
3197         }
3198
3199         return ret;
3200 }
3201
3202 static int __init iommu_init_mempool(void)
3203 {
3204         int ret;
3205         ret = iommu_iova_cache_init();
3206         if (ret)
3207                 return ret;
3208
3209         ret = iommu_domain_cache_init();
3210         if (ret)
3211                 goto domain_error;
3212
3213         ret = iommu_devinfo_cache_init();
3214         if (!ret)
3215                 return ret;
3216
3217         kmem_cache_destroy(iommu_domain_cache);
3218 domain_error:
3219         kmem_cache_destroy(iommu_iova_cache);
3220
3221         return -ENOMEM;
3222 }
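/*
 * The error handling above unwinds in reverse order of creation: a failed
 * devinfo cache tears down the domain cache, and a failed domain cache
 * tears down the iova cache, so no half-initialised caches are left behind.
 */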
3223
3224 static void __init iommu_exit_mempool(void)
3225 {
3226         kmem_cache_destroy(iommu_devinfo_cache);
3227         kmem_cache_destroy(iommu_domain_cache);
3228         kmem_cache_destroy(iommu_iova_cache);
3229
3230 }
3231
3232 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3233 {
3234         struct dmar_drhd_unit *drhd;
3235         u32 vtbar;
3236         int rc;
3237
3238         /* We know that this device on this chipset has its own IOMMU.
3239          * If we find it under a different IOMMU, then the BIOS is lying
3240          * to us. Hope that the IOMMU for this device is actually
3241          * disabled, and it needs no translation...
3242          */
3243         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3244         if (rc) {
3245                 /* "can't" happen */
3246                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3247                 return;
3248         }
3249         vtbar &= 0xffff0000;
3250
3251         /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3252         drhd = dmar_find_matched_drhd_unit(pdev);
3253         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3254                             TAINT_FIRMWARE_WORKAROUND,
3255                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3256                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3257 }
3258 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
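/*
 * The quirk above reads VTBAR from device 00.0 on the IOAT device's bus
 * (config offset 0xb0) and checks that the DRHD unit the DMAR tables assign
 * to this device really lives at VTBAR + 0xa000.  If not, the firmware
 * tables are wrong: the kernel taints with TAINT_FIRMWARE_WORKAROUND and
 * marks the device with DUMMY_DEVICE_DOMAIN_INFO so it is never translated.
 */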
3259
3260 static void __init init_no_remapping_devices(void)
3261 {
3262         struct dmar_drhd_unit *drhd;
3263
3264         for_each_drhd_unit(drhd) {
3265                 if (!drhd->include_all) {
3266                         int i;
3267                         for (i = 0; i < drhd->devices_cnt; i++)
3268                                 if (drhd->devices[i] != NULL)
3269                                         break;
3270                         /* ignore DMAR unit if no pci devices exist */
3271                         if (i == drhd->devices_cnt)
3272                                 drhd->ignored = 1;
3273                 }
3274         }
3275
3276         for_each_drhd_unit(drhd) {
3277                 int i;
3278                 if (drhd->ignored || drhd->include_all)
3279                         continue;
3280
3281                 for (i = 0; i < drhd->devices_cnt; i++)
3282                         if (drhd->devices[i] &&
3283                             !IS_GFX_DEVICE(drhd->devices[i]))
3284                                 break;
3285
3286                 if (i < drhd->devices_cnt)
3287                         continue;
3288
3289                 /* This IOMMU has *only* gfx devices. Either bypass it or
3290                    set the gfx_mapped flag, as appropriate */
3291                 if (dmar_map_gfx) {
3292                         intel_iommu_gfx_mapped = 1;
3293                 } else {
3294                         drhd->ignored = 1;
3295                         for (i = 0; i < drhd->devices_cnt; i++) {
3296                                 if (!drhd->devices[i])
3297                                         continue;
3298                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3299                         }
3300                 }
3301         }
3302 }
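/*
 * Summary of the above: a DRHD unit whose device scope is empty is ignored
 * outright.  A unit that covers only graphics devices is either kept (and
 * intel_iommu_gfx_mapped set) or, when dmar_map_gfx is clear, ignored with
 * all of its devices flagged DUMMY_DEVICE_DOMAIN_INFO so they bypass
 * translation entirely.
 */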
3303
3304 #ifdef CONFIG_SUSPEND
3305 static int init_iommu_hw(void)
3306 {
3307         struct dmar_drhd_unit *drhd;
3308         struct intel_iommu *iommu = NULL;
3309
3310         for_each_active_iommu(iommu, drhd)
3311                 if (iommu->qi)
3312                         dmar_reenable_qi(iommu);
3313
3314         for_each_iommu(iommu, drhd) {
3315                 if (drhd->ignored) {
3316                         /*
3317                          * we always have to disable PMRs or DMA may fail on
3318                          * this device
3319                          */
3320                         if (force_on)
3321                                 iommu_disable_protect_mem_regions(iommu);
3322                         continue;
3323                 }
3324
3325                 iommu_flush_write_buffer(iommu);
3326
3327                 iommu_set_root_entry(iommu);
3328
3329                 iommu->flush.flush_context(iommu, 0, 0, 0,
3330                                            DMA_CCMD_GLOBAL_INVL);
3331                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3332                                          DMA_TLB_GLOBAL_FLUSH);
3333                 if (iommu_enable_translation(iommu))
3334                         return 1;
3335                 iommu_disable_protect_mem_regions(iommu);
3336         }
3337
3338         return 0;
3339 }
3340
3341 static void iommu_flush_all(void)
3342 {
3343         struct dmar_drhd_unit *drhd;
3344         struct intel_iommu *iommu;
3345
3346         for_each_active_iommu(iommu, drhd) {
3347                 iommu->flush.flush_context(iommu, 0, 0, 0,
3348                                            DMA_CCMD_GLOBAL_INVL);
3349                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3350                                          DMA_TLB_GLOBAL_FLUSH);
3351         }
3352 }
3353
3354 static int iommu_suspend(void)
3355 {
3356         struct dmar_drhd_unit *drhd;
3357         struct intel_iommu *iommu = NULL;
3358         unsigned long flag;
3359
3360         for_each_active_iommu(iommu, drhd) {
3361                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3362                                                  GFP_ATOMIC);
3363                 if (!iommu->iommu_state)
3364                         goto nomem;
3365         }
3366
3367         iommu_flush_all();
3368
3369         for_each_active_iommu(iommu, drhd) {
3370                 iommu_disable_translation(iommu);
3371
3372                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3373
3374                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3375                         readl(iommu->reg + DMAR_FECTL_REG);
3376                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3377                         readl(iommu->reg + DMAR_FEDATA_REG);
3378                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3379                         readl(iommu->reg + DMAR_FEADDR_REG);
3380                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3381                         readl(iommu->reg + DMAR_FEUADDR_REG);
3382
3383                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3384         }
3385         return 0;
3386
3387 nomem:
3388         for_each_active_iommu(iommu, drhd)
3389                 kfree(iommu->iommu_state);
3390
3391         return -ENOMEM;
3392 }
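/*
 * Only the fault-event registers (FECTL/FEDATA/FEADDR/FEUADDR) are saved
 * here; the root table, context entries and page tables live in memory and
 * survive suspend, and the rest of the hardware state is reprogrammed by
 * init_iommu_hw() on resume.
 */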
3393
3394 static void iommu_resume(void)
3395 {
3396         struct dmar_drhd_unit *drhd;
3397         struct intel_iommu *iommu = NULL;
3398         unsigned long flag;
3399
3400         if (init_iommu_hw()) {
3401                 if (force_on)
3402                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3403                 else
3404                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3405                 return;
3406         }
3407
3408         for_each_active_iommu(iommu, drhd) {
3409
3410                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3411
3412                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3413                         iommu->reg + DMAR_FECTL_REG);
3414                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3415                         iommu->reg + DMAR_FEDATA_REG);
3416                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3417                         iommu->reg + DMAR_FEADDR_REG);
3418                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3419                         iommu->reg + DMAR_FEUADDR_REG);
3420
3421                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3422         }
3423
3424         for_each_active_iommu(iommu, drhd)
3425                 kfree(iommu->iommu_state);
3426 }
3427
3428 static struct syscore_ops iommu_syscore_ops = {
3429         .resume         = iommu_resume,
3430         .suspend        = iommu_suspend,
3431 };
3432
3433 static void __init init_iommu_pm_ops(void)
3434 {
3435         register_syscore_ops(&iommu_syscore_ops);
3436 }
3437
3438 #else
3439 static inline void init_iommu_pm_ops(void) {}
3440 #endif  /* CONFIG_SUSPEND */
3441
3442 LIST_HEAD(dmar_rmrr_units);
3443
3444 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3445 {
3446         list_add(&rmrr->list, &dmar_rmrr_units);
3447 }
3448
3449
3450 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3451 {
3452         struct acpi_dmar_reserved_memory *rmrr;
3453         struct dmar_rmrr_unit *rmrru;
3454
3455         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3456         if (!rmrru)
3457                 return -ENOMEM;
3458
3459         rmrru->hdr = header;
3460         rmrr = (struct acpi_dmar_reserved_memory *)header;
3461         rmrru->base_address = rmrr->base_address;
3462         rmrru->end_address = rmrr->end_address;
3463
3464         dmar_register_rmrr_unit(rmrru);
3465         return 0;
3466 }
3467
3468 static int __init
3469 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3470 {
3471         struct acpi_dmar_reserved_memory *rmrr;
3472         int ret;
3473
3474         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3475         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3476                 ((void *)rmrr) + rmrr->header.length,
3477                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3478
3479         if (ret || (rmrru->devices_cnt == 0)) {
3480                 list_del(&rmrru->list);
3481                 kfree(rmrru);
3482         }
3483         return ret;
3484 }
3485
3486 static LIST_HEAD(dmar_atsr_units);
3487
3488 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3489 {
3490         struct acpi_dmar_atsr *atsr;
3491         struct dmar_atsr_unit *atsru;
3492
3493         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3494         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3495         if (!atsru)
3496                 return -ENOMEM;
3497
3498         atsru->hdr = hdr;
3499         atsru->include_all = atsr->flags & 0x1;
3500
3501         list_add(&atsru->list, &dmar_atsr_units);
3502
3503         return 0;
3504 }
3505
3506 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3507 {
3508         int rc;
3509         struct acpi_dmar_atsr *atsr;
3510
3511         if (atsru->include_all)
3512                 return 0;
3513
3514         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3515         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3516                                 (void *)atsr + atsr->header.length,
3517                                 &atsru->devices_cnt, &atsru->devices,
3518                                 atsr->segment);
3519         if (rc || !atsru->devices_cnt) {
3520                 list_del(&atsru->list);
3521                 kfree(atsru);
3522         }
3523
3524         return rc;
3525 }
3526
3527 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3528 {
3529         int i;
3530         struct pci_bus *bus;
3531         struct acpi_dmar_atsr *atsr;
3532         struct dmar_atsr_unit *atsru;
3533
3534         dev = pci_physfn(dev);
3535
3536         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3537                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3538                 if (atsr->segment == pci_domain_nr(dev->bus))
3539                         goto found;
3540         }
3541
3542         return 0;
3543
3544 found:
3545         for (bus = dev->bus; bus; bus = bus->parent) {
3546                 struct pci_dev *bridge = bus->self;
3547
3548                 if (!bridge || !pci_is_pcie(bridge) ||
3549                     bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3550                         return 0;
3551
3552                 if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3553                         for (i = 0; i < atsru->devices_cnt; i++)
3554                                 if (atsru->devices[i] == bridge)
3555                                         return 1;
3556                         break;
3557                 }
3558         }
3559
3560         if (atsru->include_all)
3561                 return 1;
3562
3563         return 0;
3564 }
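/*
 * dmar_find_matched_atsr_unit() effectively answers "may ATS be used for
 * this device?": it walks from the (physical) device up to the PCIe root
 * port and returns 1 if that root port appears in an ATSR device scope for
 * the device's segment, or if the segment's ATSR is flagged include_all.
 */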
3565
3566 int __init dmar_parse_rmrr_atsr_dev(void)
3567 {
3568         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3569         struct dmar_atsr_unit *atsr, *atsr_n;
3570         int ret = 0;
3571
3572         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3573                 ret = rmrr_parse_dev(rmrr);
3574                 if (ret)
3575                         return ret;
3576         }
3577
3578         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3579                 ret = atsr_parse_dev(atsr);
3580                 if (ret)
3581                         return ret;
3582         }
3583
3584         return ret;
3585 }
3586
3587 /*
3588  * Here we only respond to a device being unbound from its driver.
3589  *
3590  * A newly added device is not attached to its DMAR domain here yet; that
3591  * happens when the device is first mapped to an iova.
3592  */
3593 static int device_notifier(struct notifier_block *nb,
3594                                   unsigned long action, void *data)
3595 {
3596         struct device *dev = data;
3597         struct pci_dev *pdev = to_pci_dev(dev);
3598         struct dmar_domain *domain;
3599
3600         if (iommu_no_mapping(dev))
3601                 return 0;
3602
3603         domain = find_domain(pdev);
3604         if (!domain)
3605                 return 0;
3606
3607         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3608                 domain_remove_one_dev_info(domain, pdev);
3609
3610                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3611                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3612                     list_empty(&domain->devices))
3613                         domain_exit(domain);
3614         }
3615
3616         return 0;
3617 }
3618
3619 static struct notifier_block device_nb = {
3620         .notifier_call = device_notifier,
3621 };
3622
3623 int __init intel_iommu_init(void)
3624 {
3625         int ret = 0;
3626
3627         /* VT-d is required for a TXT/tboot launch, so enforce that */
3628         force_on = tboot_force_iommu();
3629
3630         if (dmar_table_init()) {
3631                 if (force_on)
3632                         panic("tboot: Failed to initialize DMAR table\n");
3633                 return  -ENODEV;
3634         }
3635
3636         if (dmar_dev_scope_init() < 0) {
3637                 if (force_on)
3638                         panic("tboot: Failed to initialize DMAR device scope\n");
3639                 return  -ENODEV;
3640         }
3641
3642         if (no_iommu || dmar_disabled)
3643                 return -ENODEV;
3644
3645         if (iommu_init_mempool()) {
3646                 if (force_on)
3647                         panic("tboot: Failed to initialize iommu memory\n");
3648                 return  -ENODEV;
3649         }
3650
3651         if (list_empty(&dmar_rmrr_units))
3652                 printk(KERN_INFO "DMAR: No RMRR found\n");
3653
3654         if (list_empty(&dmar_atsr_units))
3655                 printk(KERN_INFO "DMAR: No ATSR found\n");
3656
3657         if (dmar_init_reserved_ranges()) {
3658                 if (force_on)
3659                         panic("tboot: Failed to reserve iommu ranges\n");
3660                 return  -ENODEV;
3661         }
3662
3663         init_no_remapping_devices();
3664
3665         ret = init_dmars();
3666         if (ret) {
3667                 if (force_on)
3668                         panic("tboot: Failed to initialize DMARs\n");
3669                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3670                 put_iova_domain(&reserved_iova_list);
3671                 iommu_exit_mempool();
3672                 return ret;
3673         }
3674         printk(KERN_INFO
3675         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3676
3677         init_timer(&unmap_timer);
3678 #ifdef CONFIG_SWIOTLB
3679         swiotlb = 0;
3680 #endif
3681         dma_ops = &intel_dma_ops;
3682
3683         init_iommu_pm_ops();
3684
3685         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3686
3687         bus_register_notifier(&pci_bus_type, &device_nb);
3688
3689         intel_iommu_enabled = 1;
3690
3691         return 0;
3692 }
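/*
 * Initialisation order above: parse the DMAR and device-scope tables, set
 * up the slab caches and reserved IOVA ranges, weed out DRHD units that
 * need no remapping, program the hardware via init_dmars(), and only then
 * install intel_dma_ops, the suspend/resume hooks, the iommu_ops for the
 * PCI bus and the bus notifier.  Under a tboot/TXT launch (force_on) any
 * failure is fatal.
 */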
3693
3694 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3695                                            struct pci_dev *pdev)
3696 {
3697         struct pci_dev *tmp, *parent;
3698
3699         if (!iommu || !pdev)
3700                 return;
3701
3702         /* dependent device detach */
3703         tmp = pci_find_upstream_pcie_bridge(pdev);
3704         /* Secondary interface's bus number and devfn 0 */
3705         if (tmp) {
3706                 parent = pdev->bus->self;
3707                 while (parent != tmp) {
3708                         iommu_detach_dev(iommu, parent->bus->number,
3709                                          parent->devfn);
3710                         parent = parent->bus->self;
3711                 }
3712                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3713                         iommu_detach_dev(iommu,
3714                                 tmp->subordinate->number, 0);
3715                 else /* this is a legacy PCI bridge */
3716                         iommu_detach_dev(iommu, tmp->bus->number,
3717                                          tmp->devfn);
3718         }
3719 }
3720
3721 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3722                                           struct pci_dev *pdev)
3723 {
3724         struct device_domain_info *info;
3725         struct intel_iommu *iommu;
3726         unsigned long flags;
3727         int found = 0;
3728         struct list_head *entry, *tmp;
3729
3730         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3731                                 pdev->devfn);
3732         if (!iommu)
3733                 return;
3734
3735         spin_lock_irqsave(&device_domain_lock, flags);
3736         list_for_each_safe(entry, tmp, &domain->devices) {
3737                 info = list_entry(entry, struct device_domain_info, link);
3738                 if (info->segment == pci_domain_nr(pdev->bus) &&
3739                     info->bus == pdev->bus->number &&
3740                     info->devfn == pdev->devfn) {
3741                         unlink_domain_info(info);
3742                         spin_unlock_irqrestore(&device_domain_lock, flags);
3743
3744                         iommu_disable_dev_iotlb(info);
3745                         iommu_detach_dev(iommu, info->bus, info->devfn);
3746                         iommu_detach_dependent_devices(iommu, pdev);
3747                         free_devinfo_mem(info);
3748
3749                         spin_lock_irqsave(&device_domain_lock, flags);
3750
3751                         if (found)
3752                                 break;
3753                         else
3754                                 continue;
3755                 }
3756
3757                 /* If there are no other devices under the same iommu
3758                  * owned by this domain, clear this iommu in iommu_bmp and
3759                  * update the iommu count and coherency.
3760                  */
3761                 if (iommu == device_to_iommu(info->segment, info->bus,
3762                                             info->devfn))
3763                         found = 1;
3764         }
3765
3766         spin_unlock_irqrestore(&device_domain_lock, flags);
3767
3768         if (found == 0) {
3769                 unsigned long tmp_flags;
3770                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3771                 clear_bit(iommu->seq_id, domain->iommu_bmp);
3772                 domain->iommu_count--;
3773                 domain_update_iommu_cap(domain);
3774                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3775
3776                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3777                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3778                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3779                         clear_bit(domain->id, iommu->domain_ids);
3780                         iommu->domains[domain->id] = NULL;
3781                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3782                 }
3783         }
3784 }
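/*
 * Once the last device behind a given IOMMU has left the domain
 * (found == 0), that IOMMU's bit is cleared from iommu_bmp and the domain
 * capabilities are recomputed; for ordinary DMA domains (neither VM nor
 * static identity) the domain id is also released on that IOMMU.
 */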
3785
3786 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3787 {
3788         struct device_domain_info *info;
3789         struct intel_iommu *iommu;
3790         unsigned long flags1, flags2;
3791
3792         spin_lock_irqsave(&device_domain_lock, flags1);
3793         while (!list_empty(&domain->devices)) {
3794                 info = list_entry(domain->devices.next,
3795                         struct device_domain_info, link);
3796                 unlink_domain_info(info);
3797                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3798
3799                 iommu_disable_dev_iotlb(info);
3800                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3801                 iommu_detach_dev(iommu, info->bus, info->devfn);
3802                 iommu_detach_dependent_devices(iommu, info->dev);
3803
3804                 /* clear this iommu in iommu_bmp, update iommu count
3805                  * and capabilities
3806                  */
3807                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3808                 if (test_and_clear_bit(iommu->seq_id,
3809                                        domain->iommu_bmp)) {
3810                         domain->iommu_count--;
3811                         domain_update_iommu_cap(domain);
3812                 }
3813                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3814
3815                 free_devinfo_mem(info);
3816                 spin_lock_irqsave(&device_domain_lock, flags1);
3817         }
3818         spin_unlock_irqrestore(&device_domain_lock, flags1);
3819 }
3820
3821 /* Domain id for a virtual machine; it won't be set in a context entry */
3822 static unsigned long vm_domid;
3823
3824 static struct dmar_domain *iommu_alloc_vm_domain(void)
3825 {
3826         struct dmar_domain *domain;
3827
3828         domain = alloc_domain_mem();
3829         if (!domain)
3830                 return NULL;
3831
3832         domain->id = vm_domid++;
3833         domain->nid = -1;
3834         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3835         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3836
3837         return domain;
3838 }
3839
3840 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3841 {
3842         int adjust_width;
3843
3844         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3845         spin_lock_init(&domain->iommu_lock);
3846
3847         domain_reserve_special_ranges(domain);
3848
3849         /* calculate AGAW */
3850         domain->gaw = guest_width;
3851         adjust_width = guestwidth_to_adjustwidth(guest_width);
3852         domain->agaw = width_to_agaw(adjust_width);
3853
3854         INIT_LIST_HEAD(&domain->devices);
3855
3856         domain->iommu_count = 0;
3857         domain->iommu_coherency = 0;
3858         domain->iommu_snooping = 0;
3859         domain->iommu_superpage = 0;
3860         domain->max_addr = 0;
3861         domain->nid = -1;
3862
3863         /* always allocate the top pgd */
3864         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3865         if (!domain->pgd)
3866                 return -ENOMEM;
3867         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3868         return 0;
3869 }
3870
3871 static void iommu_free_vm_domain(struct dmar_domain *domain)
3872 {
3873         unsigned long flags;
3874         struct dmar_drhd_unit *drhd;
3875         struct intel_iommu *iommu;
3876         unsigned long i;
3877         unsigned long ndomains;
3878
3879         for_each_drhd_unit(drhd) {
3880                 if (drhd->ignored)
3881                         continue;
3882                 iommu = drhd->iommu;
3883
3884                 ndomains = cap_ndoms(iommu->cap);
3885                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3886                         if (iommu->domains[i] == domain) {
3887                                 spin_lock_irqsave(&iommu->lock, flags);
3888                                 clear_bit(i, iommu->domain_ids);
3889                                 iommu->domains[i] = NULL;
3890                                 spin_unlock_irqrestore(&iommu->lock, flags);
3891                                 break;
3892                         }
3893                 }
3894         }
3895 }
3896
3897 static void vm_domain_exit(struct dmar_domain *domain)
3898 {
3899         /* Domain 0 is reserved, so don't process it */
3900         if (!domain)
3901                 return;
3902
3903         vm_domain_remove_all_dev_info(domain);
3904         /* destroy iovas */
3905         put_iova_domain(&domain->iovad);
3906
3907         /* clear ptes */
3908         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3909
3910         /* free page tables */
3911         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3912
3913         iommu_free_vm_domain(domain);
3914         free_domain_mem(domain);
3915 }
3916
3917 static int intel_iommu_domain_init(struct iommu_domain *domain)
3918 {
3919         struct dmar_domain *dmar_domain;
3920
3921         dmar_domain = iommu_alloc_vm_domain();
3922         if (!dmar_domain) {
3923                 printk(KERN_ERR
3924                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3925                 return -ENOMEM;
3926         }
3927         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3928                 printk(KERN_ERR
3929                         "intel_iommu_domain_init() failed\n");
3930                 vm_domain_exit(dmar_domain);
3931                 return -ENOMEM;
3932         }
3933         domain_update_iommu_cap(dmar_domain);
3934         domain->priv = dmar_domain;
3935
3936         domain->geometry.aperture_start = 0;
3937         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3938         domain->geometry.force_aperture = true;
3939
3940         return 0;
3941 }
3942
3943 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3944 {
3945         struct dmar_domain *dmar_domain = domain->priv;
3946
3947         domain->priv = NULL;
3948         vm_domain_exit(dmar_domain);
3949 }
3950
3951 static int intel_iommu_attach_device(struct iommu_domain *domain,
3952                                      struct device *dev)
3953 {
3954         struct dmar_domain *dmar_domain = domain->priv;
3955         struct pci_dev *pdev = to_pci_dev(dev);
3956         struct intel_iommu *iommu;
3957         int addr_width;
3958
3959         /* normally pdev is not mapped */
3960         if (unlikely(domain_context_mapped(pdev))) {
3961                 struct dmar_domain *old_domain;
3962
3963                 old_domain = find_domain(pdev);
3964                 if (old_domain) {
3965                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3966                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3967                                 domain_remove_one_dev_info(old_domain, pdev);
3968                         else
3969                                 domain_remove_dev_info(old_domain);
3970                 }
3971         }
3972
3973         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3974                                 pdev->devfn);
3975         if (!iommu)
3976                 return -ENODEV;
3977
3978         /* check if this iommu agaw is sufficient for max mapped address */
3979         addr_width = agaw_to_width(iommu->agaw);
3980         if (addr_width > cap_mgaw(iommu->cap))
3981                 addr_width = cap_mgaw(iommu->cap);
3982
3983         if (dmar_domain->max_addr > (1LL << addr_width)) {
3984                 printk(KERN_ERR "%s: iommu width (%d) is not "
3985                        "sufficient for the mapped address (%llx)\n",
3986                        __func__, addr_width, dmar_domain->max_addr);
3987                 return -EFAULT;
3988         }
3989         dmar_domain->gaw = addr_width;
3990
3991         /*
3992          * Knock out extra levels of page tables if necessary
3993          */
3994         while (iommu->agaw < dmar_domain->agaw) {
3995                 struct dma_pte *pte;
3996
3997                 pte = dmar_domain->pgd;
3998                 if (dma_pte_present(pte)) {
3999                         dmar_domain->pgd = (struct dma_pte *)
4000                                 phys_to_virt(dma_pte_addr(pte));
4001                         free_pgtable_page(pte);
4002                 }
4003                 dmar_domain->agaw--;
4004         }
4005
4006         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4007 }
4008
4009 static void intel_iommu_detach_device(struct iommu_domain *domain,
4010                                       struct device *dev)
4011 {
4012         struct dmar_domain *dmar_domain = domain->priv;
4013         struct pci_dev *pdev = to_pci_dev(dev);
4014
4015         domain_remove_one_dev_info(dmar_domain, pdev);
4016 }
4017
4018 static int intel_iommu_map(struct iommu_domain *domain,
4019                            unsigned long iova, phys_addr_t hpa,
4020                            size_t size, int iommu_prot)
4021 {
4022         struct dmar_domain *dmar_domain = domain->priv;
4023         u64 max_addr;
4024         int prot = 0;
4025         int ret;
4026
4027         if (iommu_prot & IOMMU_READ)
4028                 prot |= DMA_PTE_READ;
4029         if (iommu_prot & IOMMU_WRITE)
4030                 prot |= DMA_PTE_WRITE;
4031         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4032                 prot |= DMA_PTE_SNP;
4033
4034         max_addr = iova + size;
4035         if (dmar_domain->max_addr < max_addr) {
4036                 u64 end;
4037
4038                 /* check if minimum agaw is sufficient for mapped address */
4039                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4040                 if (end < max_addr) {
4041                         printk(KERN_ERR "%s: iommu width (%d) is not "
4042                                "sufficient for the mapped address (%llx)\n",
4043                                __func__, dmar_domain->gaw, max_addr);
4044                         return -EFAULT;
4045                 }
4046                 dmar_domain->max_addr = max_addr;
4047         }
4048         /* Round up size to next multiple of PAGE_SIZE, if it and
4049            the low bits of hpa would take us onto the next page */
4050         size = aligned_nrpages(hpa, size);
4051         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4052                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4053         return ret;
4054 }
4055
4056 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4057                              unsigned long iova, size_t size)
4058 {
4059         struct dmar_domain *dmar_domain = domain->priv;
4060         int order;
4061
4062         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4063                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4064
4065         if (dmar_domain->max_addr == iova + size)
4066                 dmar_domain->max_addr = iova;
4067
4068         return PAGE_SIZE << order;
4069 }
4070
4071 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4072                                             unsigned long iova)
4073 {
4074         struct dmar_domain *dmar_domain = domain->priv;
4075         struct dma_pte *pte;
4076         u64 phys = 0;
4077
4078         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4079         if (pte)
4080                 phys = dma_pte_addr(pte);
4081
4082         return phys;
4083 }
4084
4085 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4086                                       unsigned long cap)
4087 {
4088         struct dmar_domain *dmar_domain = domain->priv;
4089
4090         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4091                 return dmar_domain->iommu_snooping;
4092         if (cap == IOMMU_CAP_INTR_REMAP)
4093                 return irq_remapping_enabled;
4094
4095         return 0;
4096 }
4097
4098 static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
4099 {
4100         pci_dev_put(*from);
4101         *from = to;
4102 }
4103
4104 #define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4105
4106 static int intel_iommu_add_device(struct device *dev)
4107 {
4108         struct pci_dev *pdev = to_pci_dev(dev);
4109         struct pci_dev *bridge, *dma_pdev;
4110         struct iommu_group *group;
4111         int ret;
4112
4113         if (!device_to_iommu(pci_domain_nr(pdev->bus),
4114                              pdev->bus->number, pdev->devfn))
4115                 return -ENODEV;
4116
4117         bridge = pci_find_upstream_pcie_bridge(pdev);
4118         if (bridge) {
4119                 if (pci_is_pcie(bridge))
4120                         dma_pdev = pci_get_domain_bus_and_slot(
4121                                                 pci_domain_nr(pdev->bus),
4122                                                 bridge->subordinate->number, 0);
4123                 else
4124                         dma_pdev = pci_dev_get(bridge);
4125         } else
4126                 dma_pdev = pci_dev_get(pdev);
4127
4128         swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4129
4130         if (dma_pdev->multifunction &&
4131             !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS))
4132                 swap_pci_ref(&dma_pdev,
4133                              pci_get_slot(dma_pdev->bus,
4134                                           PCI_DEVFN(PCI_SLOT(dma_pdev->devfn),
4135                                           0)));
4136
4137         while (!pci_is_root_bus(dma_pdev->bus)) {
4138                 if (pci_acs_path_enabled(dma_pdev->bus->self,
4139                                          NULL, REQ_ACS_FLAGS))
4140                         break;
4141
4142                 swap_pci_ref(&dma_pdev, pci_dev_get(dma_pdev->bus->self));
4143         }
4144
4145         group = iommu_group_get(&dma_pdev->dev);
4146         pci_dev_put(dma_pdev);
4147         if (!group) {
4148                 group = iommu_group_alloc();
4149                 if (IS_ERR(group))
4150                         return PTR_ERR(group);
4151         }
4152
4153         ret = iommu_group_add_device(group, dev);
4154
4155         iommu_group_put(group);
4156         return ret;
4157 }
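/*
 * Group construction above: start from the device that actually issues DMA
 * (the upstream PCIe-to-PCI bridge or the DMA-source alias), then walk
 * towards the root bus until ACS (REQ_ACS_FLAGS) guarantees the path is
 * isolated.  Devices that resolve to the same dma_pdev end up sharing one
 * iommu_group.
 */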
4158
4159 static void intel_iommu_remove_device(struct device *dev)
4160 {
4161         iommu_group_remove_device(dev);
4162 }
4163
4164 static struct iommu_ops intel_iommu_ops = {
4165         .domain_init    = intel_iommu_domain_init,
4166         .domain_destroy = intel_iommu_domain_destroy,
4167         .attach_dev     = intel_iommu_attach_device,
4168         .detach_dev     = intel_iommu_detach_device,
4169         .map            = intel_iommu_map,
4170         .unmap          = intel_iommu_unmap,
4171         .iova_to_phys   = intel_iommu_iova_to_phys,
4172         .domain_has_cap = intel_iommu_domain_has_cap,
4173         .add_device     = intel_iommu_add_device,
4174         .remove_device  = intel_iommu_remove_device,
4175         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4176 };
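/*
 * These ops back the generic IOMMU API for devices on the PCI bus (see the
 * bus_set_iommu() call in intel_iommu_init()).  A rough, illustrative
 * sequence for a caller such as device assignment (hypothetical code, not
 * part of this file; 'pdev', 'iova', 'phys' and 'size' are placeholders):
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *	if (!dom)
 *		return -ENOMEM;
 *	if (iommu_attach_device(dom, &pdev->dev))
 *		goto err;
 *	if (iommu_map(dom, iova, phys, size, IOMMU_READ | IOMMU_WRITE))
 *		goto err_detach;
 *	...
 *	iommu_unmap(dom, iova, size);
 *	iommu_detach_device(dom, &pdev->dev);
 *	iommu_domain_free(dom);
 *
 * which lands in intel_iommu_domain_init/attach_device/map/unmap above.
 */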
4177
4178 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4179 {
4180         /*
4181          * Mobile 4 Series Chipset neglects to set RWBF capability,
4182          * but needs it:
4183          */
4184         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4185         rwbf_quirk = 1;
4186
4187         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4188         if (dev->revision == 0x07) {
4189                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4190                 dmar_map_gfx = 0;
4191         }
4192 }
4193
4194 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4195
4196 #define GGC 0x52
4197 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4198 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4199 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4200 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4201 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4202 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4203 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4204 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4205
4206 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4207 {
4208         unsigned short ggc;
4209
4210         if (pci_read_config_word(dev, GGC, &ggc))
4211                 return;
4212
4213         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4214                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4215                 dmar_map_gfx = 0;
4216         } else if (dmar_map_gfx) {
4217                 /* we have to ensure the gfx device is idle before we flush */
4218                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4219                 intel_iommu_strict = 1;
4220         }
4221 }
4222 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4223 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4224 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4225 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4226
4227 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4228    ISOCH DMAR unit for the Azalia sound device, but not give it any
4229    TLB entries, which causes it to deadlock. Check for that.  We do
4230    this in a function called from init_dmars(), instead of in a PCI
4231    quirk, because we don't want to print the obnoxious "BIOS broken"
4232    message if VT-d is actually disabled.
4233 */
4234 static void __init check_tylersburg_isoch(void)
4235 {
4236         struct pci_dev *pdev;
4237         uint32_t vtisochctrl;
4238
4239         /* If there's no Azalia in the system anyway, forget it. */
4240         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4241         if (!pdev)
4242                 return;
4243         pci_dev_put(pdev);
4244
4245         /* System Management Registers. Might be hidden, in which case
4246            we can't do the sanity check. But that's OK, because the
4247            known-broken BIOSes _don't_ actually hide it, so far. */
4248         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4249         if (!pdev)
4250                 return;
4251
4252         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4253                 pci_dev_put(pdev);
4254                 return;
4255         }
4256
4257         pci_dev_put(pdev);
4258
4259         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4260         if (vtisochctrl & 1)
4261                 return;
4262
4263         /* Drop all bits other than the number of TLB entries */
4264         vtisochctrl &= 0x1c;
4265
4266         /* If we have the recommended number of TLB entries (16), fine. */
4267         if (vtisochctrl == 0x10)
4268                 return;
4269
4270         /* Zero TLB entries? The BIOS is clearly broken; work around it. */
4271         if (!vtisochctrl) {
4272                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4273                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4274                      dmi_get_system_info(DMI_BIOS_VENDOR),
4275                      dmi_get_system_info(DMI_BIOS_VERSION),
4276                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4277                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4278                 return;
4279         }
4280
4281         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4282                vtisochctrl);
4283 }