drivers/iommu/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48
49 #include "irq_remapping.h"
50
51 #define ROOT_SIZE               VTD_PAGE_SIZE
52 #define CONTEXT_SIZE            VTD_PAGE_SIZE
53
54 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
55 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
56 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
57
58 #define IOAPIC_RANGE_START      (0xfee00000)
59 #define IOAPIC_RANGE_END        (0xfeefffff)
60 #define IOVA_START_ADDR         (0x1000)
61
62 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
63
64 #define MAX_AGAW_WIDTH 64
65
66 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
67 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
68
69 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
70    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
71 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
72                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
73 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
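
/*
 * Worked example (illustrative): with the default gaw of 48 and
 * VTD_PAGE_SHIFT == 12,
 *
 *	__DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1 == 0xFFFFFFFFFULL
 *	DOMAIN_MAX_ADDR(48)  == 0xFFFFFFFFFULL << 12 == 0xFFFFFFFFF000ULL
 *
 * On 64-bit the min_t() clamp in DOMAIN_MAX_PFN() is a no-op; on 32-bit it
 * caps the PFN at ULONG_MAX so PFNs always fit in an unsigned long.
 */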
74
75 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
76 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
77 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
78
79 /* page table handling */
80 #define LEVEL_STRIDE            (9)
81 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
82
83 /*
84  * This bitmap is used to advertise the page sizes our hardware supports
85  * to the IOMMU core, which will then use this information to split
86  * physically contiguous memory regions it is mapping into page sizes
87  * that we support.
88  *
89  * Traditionally the IOMMU core just handed us the mappings directly,
90  * after making sure the size is a power-of-two multiple of 4KiB and that the
91  * mapping has natural alignment.
92  *
93  * To retain this behavior, we currently advertise that we support
94  * all page sizes that are a power-of-two multiple of 4KiB.
95  *
96  * If at some point we'd like to utilize the IOMMU core's new behavior,
97  * we could change this to advertise the real page sizes we support.
98  */
99 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
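
/*
 * Illustrative note: ~0xFFFUL has every bit from bit 12 upwards set, so this
 * advertises 4KiB, 8KiB, 16KiB, ... as supported sizes and the IOMMU core may
 * hand us any power-of-two multiple of 4KiB with natural alignment, e.g.
 * 0x1000 (4KiB), 0x2000 (8KiB) or 0x200000 (2MiB).  Whether a large mapping
 * actually becomes a hardware superpage is still decided separately, based on
 * cap_super_page_val(), when the PTEs are built.
 */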
100
101 static inline int agaw_to_level(int agaw)
102 {
103         return agaw + 2;
104 }
105
106 static inline int agaw_to_width(int agaw)
107 {
108         return 30 + agaw * LEVEL_STRIDE;
109 }
110
111 static inline int width_to_agaw(int width)
112 {
113         return (width - 30) / LEVEL_STRIDE;
114 }
115
116 static inline unsigned int level_to_offset_bits(int level)
117 {
118         return (level - 1) * LEVEL_STRIDE;
119 }
120
121 static inline int pfn_level_offset(unsigned long pfn, int level)
122 {
123         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
124 }
125
126 static inline unsigned long level_mask(int level)
127 {
128         return -1UL << level_to_offset_bits(level);
129 }
130
131 static inline unsigned long level_size(int level)
132 {
133         return 1UL << level_to_offset_bits(level);
134 }
135
136 static inline unsigned long align_to_level(unsigned long pfn, int level)
137 {
138         return (pfn + level_size(level) - 1) & level_mask(level);
139 }
140
141 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
142 {
143         return  1 << ((lvl - 1) * LEVEL_STRIDE);
144 }
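
/*
 * Worked example for the helpers above (illustrative): with LEVEL_STRIDE == 9,
 * agaw 2 means a 4-level table (agaw_to_level(2) == 4) covering a 48-bit
 * address space (agaw_to_width(2) == 30 + 2 * 9 == 48).  At level 2,
 * level_to_offset_bits(2) == 9, so pfn_level_offset(pfn, 2) selects pfn
 * bits 9-17, and lvl_to_nr_pages(2) == 512 pages, i.e. one 2MiB superpage.
 */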
145
146 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
147    are never going to work. */
148 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
149 {
150         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
151 }
152
153 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
154 {
155         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157 static inline unsigned long page_to_dma_pfn(struct page *pg)
158 {
159         return mm_to_dma_pfn(page_to_pfn(pg));
160 }
161 static inline unsigned long virt_to_dma_pfn(void *p)
162 {
163         return page_to_dma_pfn(virt_to_page(p));
164 }
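
/*
 * Illustrative example: on x86 with 4KiB pages, PAGE_SHIFT == VTD_PAGE_SHIFT
 * and both conversions above are identities.  On a hypothetical configuration
 * with 64KiB MM pages (PAGE_SHIFT == 16), one MM page spans 16 VT-d pages,
 * so mm_to_dma_pfn(3) == 48 and dma_to_mm_pfn(50) == 3.
 */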
165
166 /* global iommu list, set NULL for ignored DMAR units */
167 static struct intel_iommu **g_iommus;
168
169 static void __init check_tylersburg_isoch(void);
170 static int rwbf_quirk;
171
172 /*
173  * set to 1 to panic the kernel if VT-d can't be successfully enabled
174  * (used when kernel is launched w/ TXT)
175  */
176 static int force_on = 0;
177
178 /*
179  * 0: Present
180  * 1-11: Reserved
181  * 12-63: Context Ptr (12 - (haw-1))
182  * 64-127: Reserved
183  */
184 struct root_entry {
185         u64     val;
186         u64     rsvd1;
187 };
188 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
189 static inline bool root_present(struct root_entry *root)
190 {
191         return (root->val & 1);
192 }
193 static inline void set_root_present(struct root_entry *root)
194 {
195         root->val |= 1;
196 }
197 static inline void set_root_value(struct root_entry *root, unsigned long value)
198 {
199         root->val |= value & VTD_PAGE_MASK;
200 }
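
/*
 * Sketch of how the helpers above compose a root entry (the physical address
 * is made up): for a context table allocated at 0x12345000,
 *
 *	set_root_value(root, 0x12345000);	// val |= addr & VTD_PAGE_MASK
 *	set_root_present(root);			// val |= 1
 *
 * leaves root->val == 0x12345001: bit 0 is the present bit and bits 12-63
 * hold the 4KiB-aligned context-table pointer, matching the layout comment.
 */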
201
202 static inline struct context_entry *
203 get_context_addr_from_root(struct root_entry *root)
204 {
205         return (struct context_entry *)
206                 (root_present(root)?phys_to_virt(
207                 root->val & VTD_PAGE_MASK) :
208                 NULL);
209 }
210
211 /*
212  * low 64 bits:
213  * 0: present
214  * 1: fault processing disable
215  * 2-3: translation type
216  * 12-63: address space root
217  * high 64 bits:
218  * 0-2: address width
219  * 3-6: available (software use)
220  * 8-23: domain id
221  */
222 struct context_entry {
223         u64 lo;
224         u64 hi;
225 };
226
227 static inline bool context_present(struct context_entry *context)
228 {
229         return (context->lo & 1);
230 }
231 static inline void context_set_present(struct context_entry *context)
232 {
233         context->lo |= 1;
234 }
235
236 static inline void context_set_fault_enable(struct context_entry *context)
237 {
238         context->lo &= (((u64)-1) << 2) | 1;
239 }
240
241 static inline void context_set_translation_type(struct context_entry *context,
242                                                 unsigned long value)
243 {
244         context->lo &= (((u64)-1) << 4) | 3;
245         context->lo |= (value & 3) << 2;
246 }
247
248 static inline void context_set_address_root(struct context_entry *context,
249                                             unsigned long value)
250 {
251         context->lo |= value & VTD_PAGE_MASK;
252 }
253
254 static inline void context_set_address_width(struct context_entry *context,
255                                              unsigned long value)
256 {
257         context->hi |= value & 7;
258 }
259
260 static inline void context_set_domain_id(struct context_entry *context,
261                                          unsigned long value)
262 {
263         context->hi |= (value & ((1 << 16) - 1)) << 8;
264 }
265
266 static inline void context_clear_entry(struct context_entry *context)
267 {
268         context->lo = 0;
269         context->hi = 0;
270 }
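
/*
 * Illustrative sketch (addresses and ids are made up): a context entry for a
 * device behind a 4-level (agaw 2) page table at physical 0x2000000, domain
 * id 42, in multi-level translation mode could be assembled as
 *
 *	context_set_domain_id(ctx, 42);			// hi |= 42 << 8
 *	context_set_address_width(ctx, 2);		// hi |= 2 (48-bit agaw)
 *	context_set_address_root(ctx, 0x2000000);	// lo |= table address
 *	context_set_translation_type(ctx, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(ctx);			// clear bit 1
 *	context_set_present(ctx);			// lo |= 1
 *
 * which mirrors what domain_context_mapping_one() does further down.
 */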
271
272 /*
273  * 0: readable
274  * 1: writable
275  * 2-6: reserved
276  * 7: super page
277  * 8-10: available
278  * 11: snoop behavior
279  * 12-63: Host physical address
280  */
281 struct dma_pte {
282         u64 val;
283 };
284
285 static inline void dma_clear_pte(struct dma_pte *pte)
286 {
287         pte->val = 0;
288 }
289
290 static inline void dma_set_pte_readable(struct dma_pte *pte)
291 {
292         pte->val |= DMA_PTE_READ;
293 }
294
295 static inline void dma_set_pte_writable(struct dma_pte *pte)
296 {
297         pte->val |= DMA_PTE_WRITE;
298 }
299
300 static inline void dma_set_pte_snp(struct dma_pte *pte)
301 {
302         pte->val |= DMA_PTE_SNP;
303 }
304
305 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
306 {
307         pte->val = (pte->val & ~3) | (prot & 3);
308 }
309
310 static inline u64 dma_pte_addr(struct dma_pte *pte)
311 {
312 #ifdef CONFIG_64BIT
313         return pte->val & VTD_PAGE_MASK;
314 #else
315         /* Must have a full atomic 64-bit read */
316         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
317 #endif
318 }
319
320 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
321 {
322         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
323 }
324
325 static inline bool dma_pte_present(struct dma_pte *pte)
326 {
327         return (pte->val & 3) != 0;
328 }
329
330 static inline bool dma_pte_superpage(struct dma_pte *pte)
331 {
332         return (pte->val & (1 << 7));
333 }
334
335 static inline int first_pte_in_page(struct dma_pte *pte)
336 {
337         return !((unsigned long)pte & ~VTD_PAGE_MASK);
338 }
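
/*
 * Illustrative example (the pfn is made up): a 4KiB read/write mapping to
 * page frame 0x89abc is built up as
 *
 *	dma_set_pte_readable(pte);		// val |= DMA_PTE_READ  (bit 0)
 *	dma_set_pte_writable(pte);		// val |= DMA_PTE_WRITE (bit 1)
 *	dma_set_pte_pfn(pte, 0x89abc);		// val |= 0x89abc << 12
 *
 * giving pte->val == 0x89abc003; dma_pte_present() then returns true because
 * at least one of the R/W bits is set.
 */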
339
340 /*
341  * This domain is a statically identity mapping domain.
342  *      1. This domain creates a static 1:1 mapping to all usable memory.
343  *      2. It maps to each iommu if successful.
344  *      3. Each iommu maps to this domain if successful.
345  */
346 static struct dmar_domain *si_domain;
347 static int hw_pass_through = 1;
348
349 /* devices under the same p2p bridge are owned in one domain */
350 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
351
352 /* domain represents a virtual machine; more than one device
353  * across iommus may be owned in one domain, e.g. kvm guest.
354  */
355 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
356
357 /* si_domain contains multiple devices */
358 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
359
360 /* define the limit of IOMMUs supported in each domain */
361 #ifdef  CONFIG_X86
362 # define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
363 #else
364 # define        IOMMU_UNITS_SUPPORTED   64
365 #endif
366
367 struct dmar_domain {
368         int     id;                     /* domain id */
369         int     nid;                    /* node id */
370         DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
371                                         /* bitmap of iommus this domain uses*/
372
373         struct list_head devices;       /* all devices' list */
374         struct iova_domain iovad;       /* iova's that belong to this domain */
375
376         struct dma_pte  *pgd;           /* virtual address */
377         int             gaw;            /* max guest address width */
378
379         /* adjusted guest address width, 0 is level 2 30-bit */
380         int             agaw;
381
382         int             flags;          /* flags to find out type of domain */
383
384         int             iommu_coherency;/* indicate coherency of iommu access */
385         int             iommu_snooping; /* indicate snooping control feature*/
386         int             iommu_count;    /* reference count of iommu */
387         int             iommu_superpage;/* Level of superpages supported:
388                                            0 == 4KiB (no superpages), 1 == 2MiB,
389                                            2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
390         spinlock_t      iommu_lock;     /* protect iommu set in domain */
391         u64             max_addr;       /* maximum mapped address */
392 };
393
394 /* PCI domain-device relationship */
395 struct device_domain_info {
396         struct list_head link;  /* link to domain siblings */
397         struct list_head global; /* link to global list */
398         int segment;            /* PCI domain */
399         u8 bus;                 /* PCI bus number */
400         u8 devfn;               /* PCI devfn number */
401         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
402         struct intel_iommu *iommu; /* IOMMU used by this device */
403         struct dmar_domain *domain; /* pointer to domain */
404 };
405
406 static void flush_unmaps_timeout(unsigned long data);
407
408 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
409
410 #define HIGH_WATER_MARK 250
411 struct deferred_flush_tables {
412         int next;
413         struct iova *iova[HIGH_WATER_MARK];
414         struct dmar_domain *domain[HIGH_WATER_MARK];
415 };
416
417 static struct deferred_flush_tables *deferred_flush;
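
/*
 * Illustrative note on how these tables are used (a sketch of the lazy-unmap
 * path later in this file): freed IOVAs are parked in deferred_flush[] until
 * either flush_unmaps_timeout() fires or HIGH_WATER_MARK entries accumulate,
 * so one IOTLB flush can retire many unmaps instead of flushing per unmap.
 * Booting with "intel_iommu=strict" bypasses this batching.
 */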
418
419 /* number of intel_iommus, used to size and index g_iommus */
420 static int g_num_of_iommus;
421
422 static DEFINE_SPINLOCK(async_umap_flush_lock);
423 static LIST_HEAD(unmaps_to_do);
424
425 static int timer_on;
426 static long list_size;
427
428 static void domain_remove_dev_info(struct dmar_domain *domain);
429
430 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
431 int dmar_disabled = 0;
432 #else
433 int dmar_disabled = 1;
434 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
435
436 int intel_iommu_enabled = 0;
437 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
438
439 static int dmar_map_gfx = 1;
440 static int dmar_forcedac;
441 static int intel_iommu_strict;
442 static int intel_iommu_superpage = 1;
443
444 int intel_iommu_gfx_mapped;
445 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
446
447 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
448 static DEFINE_SPINLOCK(device_domain_lock);
449 static LIST_HEAD(device_domain_list);
450
451 static struct iommu_ops intel_iommu_ops;
452
453 static int __init intel_iommu_setup(char *str)
454 {
455         if (!str)
456                 return -EINVAL;
457         while (*str) {
458                 if (!strncmp(str, "on", 2)) {
459                         dmar_disabled = 0;
460                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
461                 } else if (!strncmp(str, "off", 3)) {
462                         dmar_disabled = 1;
463                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
464                 } else if (!strncmp(str, "igfx_off", 8)) {
465                         dmar_map_gfx = 0;
466                         printk(KERN_INFO
467                                 "Intel-IOMMU: disable GFX device mapping\n");
468                 } else if (!strncmp(str, "forcedac", 8)) {
469                         printk(KERN_INFO
470                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
471                         dmar_forcedac = 1;
472                 } else if (!strncmp(str, "strict", 6)) {
473                         printk(KERN_INFO
474                                 "Intel-IOMMU: disable batched IOTLB flush\n");
475                         intel_iommu_strict = 1;
476                 } else if (!strncmp(str, "sp_off", 6)) {
477                         printk(KERN_INFO
478                                 "Intel-IOMMU: disable supported super page\n");
479                         intel_iommu_superpage = 0;
480                 }
481
482                 str += strcspn(str, ",");
483                 while (*str == ',')
484                         str++;
485         }
486         return 0;
487 }
488 __setup("intel_iommu=", intel_iommu_setup);
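
/*
 * Example usage (illustrative): options are comma-separated on the kernel
 * command line, e.g.
 *
 *	intel_iommu=on,strict,sp_off
 *
 * enables DMA remapping, disables batched IOTLB flushing and disables
 * superpage use, while "intel_iommu=off" disables the driver entirely.
 */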
489
490 static struct kmem_cache *iommu_domain_cache;
491 static struct kmem_cache *iommu_devinfo_cache;
492 static struct kmem_cache *iommu_iova_cache;
493
494 static inline void *alloc_pgtable_page(int node)
495 {
496         struct page *page;
497         void *vaddr = NULL;
498
499         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
500         if (page)
501                 vaddr = page_address(page);
502         return vaddr;
503 }
504
505 static inline void free_pgtable_page(void *vaddr)
506 {
507         free_page((unsigned long)vaddr);
508 }
509
510 static inline void *alloc_domain_mem(void)
511 {
512         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
513 }
514
515 static void free_domain_mem(void *vaddr)
516 {
517         kmem_cache_free(iommu_domain_cache, vaddr);
518 }
519
520 static inline void * alloc_devinfo_mem(void)
521 {
522         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
523 }
524
525 static inline void free_devinfo_mem(void *vaddr)
526 {
527         kmem_cache_free(iommu_devinfo_cache, vaddr);
528 }
529
530 struct iova *alloc_iova_mem(void)
531 {
532         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
533 }
534
535 void free_iova_mem(struct iova *iova)
536 {
537         kmem_cache_free(iommu_iova_cache, iova);
538 }
539
540
541 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
542 {
543         unsigned long sagaw;
544         int agaw = -1;
545
546         sagaw = cap_sagaw(iommu->cap);
547         for (agaw = width_to_agaw(max_gaw);
548              agaw >= 0; agaw--) {
549                 if (test_bit(agaw, &sagaw))
550                         break;
551         }
552
553         return agaw;
554 }
555
556 /*
557  * Calculate max SAGAW for each iommu.
558  */
559 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
560 {
561         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
562 }
563
564 /*
565  * calculate agaw for each iommu.
566  * "SAGAW" may be different across iommus, use a default agaw, and
567  * get a supported less agaw for iommus that don't support the default agaw.
568  */
569 int iommu_calculate_agaw(struct intel_iommu *iommu)
570 {
571         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
572 }
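
/*
 * Worked example (illustrative): DEFAULT_DOMAIN_ADDRESS_WIDTH == 48 gives
 * width_to_agaw(48) == 2, so the search starts at SAGAW bit 2 (4-level
 * tables).  If the hardware only advertises 3-level support (bit 1, a 39-bit
 * address space), the loop falls back to agaw 1; if nothing at or below the
 * requested width is supported it returns -1.
 */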
573
574 /* This function only returns a single iommu in a domain */
575 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
576 {
577         int iommu_id;
578
579         /* si_domain and vm domain should not get here. */
580         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
581         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
582
583         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
584         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
585                 return NULL;
586
587         return g_iommus[iommu_id];
588 }
589
590 static void domain_update_iommu_coherency(struct dmar_domain *domain)
591 {
592         int i;
593
594         i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
595
596         domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
597
598         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
599                 if (!ecap_coherent(g_iommus[i]->ecap)) {
600                         domain->iommu_coherency = 0;
601                         break;
602                 }
603         }
604 }
605
606 static void domain_update_iommu_snooping(struct dmar_domain *domain)
607 {
608         int i;
609
610         domain->iommu_snooping = 1;
611
612         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
613                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
614                         domain->iommu_snooping = 0;
615                         break;
616                 }
617         }
618 }
619
620 static void domain_update_iommu_superpage(struct dmar_domain *domain)
621 {
622         struct dmar_drhd_unit *drhd;
623         struct intel_iommu *iommu = NULL;
624         int mask = 0xf;
625
626         if (!intel_iommu_superpage) {
627                 domain->iommu_superpage = 0;
628                 return;
629         }
630
631         /* set iommu_superpage to the smallest common denominator */
632         for_each_active_iommu(iommu, drhd) {
633                 mask &= cap_super_page_val(iommu->cap);
634                 if (!mask) {
635                         break;
636                 }
637         }
638         domain->iommu_superpage = fls(mask);
639 }
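
/*
 * Worked example (illustrative): if one active iommu reports 2MiB and 1GiB
 * superpages (cap_super_page_val() == 0x3) and another reports only 2MiB
 * (0x1), the intersection is 0x1 and fls(0x1) == 1, so the domain is limited
 * to 2MiB superpages; an empty intersection would leave it at 0 (4KiB only).
 */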
640
641 /* Some capabilities may be different across iommus */
642 static void domain_update_iommu_cap(struct dmar_domain *domain)
643 {
644         domain_update_iommu_coherency(domain);
645         domain_update_iommu_snooping(domain);
646         domain_update_iommu_superpage(domain);
647 }
648
649 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
650 {
651         struct dmar_drhd_unit *drhd = NULL;
652         int i;
653
654         for_each_drhd_unit(drhd) {
655                 if (drhd->ignored)
656                         continue;
657                 if (segment != drhd->segment)
658                         continue;
659
660                 for (i = 0; i < drhd->devices_cnt; i++) {
661                         if (drhd->devices[i] &&
662                             drhd->devices[i]->bus->number == bus &&
663                             drhd->devices[i]->devfn == devfn)
664                                 return drhd->iommu;
665                         if (drhd->devices[i] &&
666                             drhd->devices[i]->subordinate &&
667                             drhd->devices[i]->subordinate->number <= bus &&
668                             drhd->devices[i]->subordinate->busn_res.end >= bus)
669                                 return drhd->iommu;
670                 }
671
672                 if (drhd->include_all)
673                         return drhd->iommu;
674         }
675
676         return NULL;
677 }
678
679 static void domain_flush_cache(struct dmar_domain *domain,
680                                void *addr, int size)
681 {
682         if (!domain->iommu_coherency)
683                 clflush_cache_range(addr, size);
684 }
685
686 /* Gets context entry for a given bus and devfn */
687 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
688                 u8 bus, u8 devfn)
689 {
690         struct root_entry *root;
691         struct context_entry *context;
692         unsigned long phy_addr;
693         unsigned long flags;
694
695         spin_lock_irqsave(&iommu->lock, flags);
696         root = &iommu->root_entry[bus];
697         context = get_context_addr_from_root(root);
698         if (!context) {
699                 context = (struct context_entry *)
700                                 alloc_pgtable_page(iommu->node);
701                 if (!context) {
702                         spin_unlock_irqrestore(&iommu->lock, flags);
703                         return NULL;
704                 }
705                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
706                 phy_addr = virt_to_phys((void *)context);
707                 set_root_value(root, phy_addr);
708                 set_root_present(root);
709                 __iommu_flush_cache(iommu, root, sizeof(*root));
710         }
711         spin_unlock_irqrestore(&iommu->lock, flags);
712         return &context[devfn];
713 }
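
/*
 * Illustrative example (the device address is made up): for 0000:03:00.1 the
 * lookup above indexes root_entry[0x03] to find (or lazily allocate) the
 * bus's context table, then returns &context[PCI_DEVFN(0, 1)], i.e. entry 1
 * of the 256 context entries for bus 3.
 */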
714
715 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
716 {
717         struct root_entry *root;
718         struct context_entry *context;
719         int ret;
720         unsigned long flags;
721
722         spin_lock_irqsave(&iommu->lock, flags);
723         root = &iommu->root_entry[bus];
724         context = get_context_addr_from_root(root);
725         if (!context) {
726                 ret = 0;
727                 goto out;
728         }
729         ret = context_present(&context[devfn]);
730 out:
731         spin_unlock_irqrestore(&iommu->lock, flags);
732         return ret;
733 }
734
735 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
736 {
737         struct root_entry *root;
738         struct context_entry *context;
739         unsigned long flags;
740
741         spin_lock_irqsave(&iommu->lock, flags);
742         root = &iommu->root_entry[bus];
743         context = get_context_addr_from_root(root);
744         if (context) {
745                 context_clear_entry(&context[devfn]);
746                 __iommu_flush_cache(iommu, &context[devfn], \
747                         sizeof(*context));
748         }
749         spin_unlock_irqrestore(&iommu->lock, flags);
750 }
751
752 static void free_context_table(struct intel_iommu *iommu)
753 {
754         struct root_entry *root;
755         int i;
756         unsigned long flags;
757         struct context_entry *context;
758
759         spin_lock_irqsave(&iommu->lock, flags);
760         if (!iommu->root_entry) {
761                 goto out;
762         }
763         for (i = 0; i < ROOT_ENTRY_NR; i++) {
764                 root = &iommu->root_entry[i];
765                 context = get_context_addr_from_root(root);
766                 if (context)
767                         free_pgtable_page(context);
768         }
769         free_pgtable_page(iommu->root_entry);
770         iommu->root_entry = NULL;
771 out:
772         spin_unlock_irqrestore(&iommu->lock, flags);
773 }
774
775 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
776                                       unsigned long pfn, int target_level)
777 {
778         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
779         struct dma_pte *parent, *pte = NULL;
780         int level = agaw_to_level(domain->agaw);
781         int offset;
782
783         BUG_ON(!domain->pgd);
784         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
785         parent = domain->pgd;
786
787         while (level > 0) {
788                 void *tmp_page;
789
790                 offset = pfn_level_offset(pfn, level);
791                 pte = &parent[offset];
792                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
793                         break;
794                 if (level == target_level)
795                         break;
796
797                 if (!dma_pte_present(pte)) {
798                         uint64_t pteval;
799
800                         tmp_page = alloc_pgtable_page(domain->nid);
801
802                         if (!tmp_page)
803                                 return NULL;
804
805                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
806                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
807                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
808                                 /* Someone else set it while we were thinking; use theirs. */
809                                 free_pgtable_page(tmp_page);
810                         } else {
811                                 dma_pte_addr(pte);
812                                 domain_flush_cache(domain, pte, sizeof(*pte));
813                         }
814                 }
815                 parent = phys_to_virt(dma_pte_addr(pte));
816                 level--;
817         }
818
819         return pte;
820 }
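
/*
 * Sketch of the walk above (the pfn is made up): for a 4-level table (agaw 2)
 * and pfn 0x12345 with target_level 1, the loop visits
 *
 *	level 4: offset (0x12345 >> 27) & 0x1ff == 0x000
 *	level 3: offset (0x12345 >> 18) & 0x1ff == 0x000
 *	level 2: offset (0x12345 >>  9) & 0x1ff == 0x091
 *	level 1: offset  0x12345        & 0x1ff == 0x145
 *
 * allocating any missing intermediate tables on the way down; the cmpxchg64()
 * ensures that two CPUs racing on the same empty slot keep exactly one of the
 * freshly allocated tables and free the other.
 */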
821
822
823 /* return the address's pte at a specific level */
824 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
825                                          unsigned long pfn,
826                                          int level, int *large_page)
827 {
828         struct dma_pte *parent, *pte = NULL;
829         int total = agaw_to_level(domain->agaw);
830         int offset;
831
832         parent = domain->pgd;
833         while (level <= total) {
834                 offset = pfn_level_offset(pfn, total);
835                 pte = &parent[offset];
836                 if (level == total)
837                         return pte;
838
839                 if (!dma_pte_present(pte)) {
840                         *large_page = total;
841                         break;
842                 }
843
844                 if (pte->val & DMA_PTE_LARGE_PAGE) {
845                         *large_page = total;
846                         return pte;
847                 }
848
849                 parent = phys_to_virt(dma_pte_addr(pte));
850                 total--;
851         }
852         return NULL;
853 }
854
855 /* clear last level pte; a tlb flush should follow */
856 static int dma_pte_clear_range(struct dmar_domain *domain,
857                                 unsigned long start_pfn,
858                                 unsigned long last_pfn)
859 {
860         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
861         unsigned int large_page = 1;
862         struct dma_pte *first_pte, *pte;
863         int order;
864
865         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
866         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
867         BUG_ON(start_pfn > last_pfn);
868
869         /* we don't need lock here; nobody else touches the iova range */
870         do {
871                 large_page = 1;
872                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
873                 if (!pte) {
874                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
875                         continue;
876                 }
877                 do {
878                         dma_clear_pte(pte);
879                         start_pfn += lvl_to_nr_pages(large_page);
880                         pte++;
881                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
882
883                 domain_flush_cache(domain, first_pte,
884                                    (void *)pte - (void *)first_pte);
885
886         } while (start_pfn && start_pfn <= last_pfn);
887
888         order = (large_page - 1) * 9;
889         return order;
890 }
891
892 /* free page table pages. last level pte should already be cleared */
893 static void dma_pte_free_pagetable(struct dmar_domain *domain,
894                                    unsigned long start_pfn,
895                                    unsigned long last_pfn)
896 {
897         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
898         struct dma_pte *first_pte, *pte;
899         int total = agaw_to_level(domain->agaw);
900         int level;
901         unsigned long tmp;
902         int large_page = 2;
903
904         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
905         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
906         BUG_ON(start_pfn > last_pfn);
907
908         /* We don't need lock here; nobody else touches the iova range */
909         level = 2;
910         while (level <= total) {
911                 tmp = align_to_level(start_pfn, level);
912
913                 /* If we can't even clear one PTE at this level, we're done */
914                 if (tmp + level_size(level) - 1 > last_pfn)
915                         return;
916
917                 do {
918                         large_page = level;
919                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
920                         if (large_page > level)
921                                 level = large_page + 1;
922                         if (!pte) {
923                                 tmp = align_to_level(tmp + 1, level + 1);
924                                 continue;
925                         }
926                         do {
927                                 if (dma_pte_present(pte)) {
928                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
929                                         dma_clear_pte(pte);
930                                 }
931                                 pte++;
932                                 tmp += level_size(level);
933                         } while (!first_pte_in_page(pte) &&
934                                  tmp + level_size(level) - 1 <= last_pfn);
935
936                         domain_flush_cache(domain, first_pte,
937                                            (void *)pte - (void *)first_pte);
938                         
939                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
940                 level++;
941         }
942         /* free pgd */
943         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
944                 free_pgtable_page(domain->pgd);
945                 domain->pgd = NULL;
946         }
947 }
948
949 /* iommu handling */
950 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
951 {
952         struct root_entry *root;
953         unsigned long flags;
954
955         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
956         if (!root)
957                 return -ENOMEM;
958
959         __iommu_flush_cache(iommu, root, ROOT_SIZE);
960
961         spin_lock_irqsave(&iommu->lock, flags);
962         iommu->root_entry = root;
963         spin_unlock_irqrestore(&iommu->lock, flags);
964
965         return 0;
966 }
967
968 static void iommu_set_root_entry(struct intel_iommu *iommu)
969 {
970         void *addr;
971         u32 sts;
972         unsigned long flag;
973
974         addr = iommu->root_entry;
975
976         raw_spin_lock_irqsave(&iommu->register_lock, flag);
977         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
978
979         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
980
981         /* Make sure hardware completes it */
982         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
983                       readl, (sts & DMA_GSTS_RTPS), sts);
984
985         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
986 }
987
988 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
989 {
990         u32 val;
991         unsigned long flag;
992
993         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
994                 return;
995
996         raw_spin_lock_irqsave(&iommu->register_lock, flag);
997         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
998
999         /* Make sure hardware completes it */
1000         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1001                       readl, (!(val & DMA_GSTS_WBFS)), val);
1002
1003         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1004 }
1005
1006 /* return value determines if we need a write buffer flush */
1007 static void __iommu_flush_context(struct intel_iommu *iommu,
1008                                   u16 did, u16 source_id, u8 function_mask,
1009                                   u64 type)
1010 {
1011         u64 val = 0;
1012         unsigned long flag;
1013
1014         switch (type) {
1015         case DMA_CCMD_GLOBAL_INVL:
1016                 val = DMA_CCMD_GLOBAL_INVL;
1017                 break;
1018         case DMA_CCMD_DOMAIN_INVL:
1019                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1020                 break;
1021         case DMA_CCMD_DEVICE_INVL:
1022                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1023                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1024                 break;
1025         default:
1026                 BUG();
1027         }
1028         val |= DMA_CCMD_ICC;
1029
1030         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1031         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1032
1033         /* Make sure hardware completes it */
1034         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1035                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1036
1037         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1038 }
1039
1040 /* return value determines if we need a write buffer flush */
1041 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1042                                 u64 addr, unsigned int size_order, u64 type)
1043 {
1044         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1045         u64 val = 0, val_iva = 0;
1046         unsigned long flag;
1047
1048         switch (type) {
1049         case DMA_TLB_GLOBAL_FLUSH:
1050                 /* global flush doesn't need to set IVA_REG */
1051                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1052                 break;
1053         case DMA_TLB_DSI_FLUSH:
1054                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1055                 break;
1056         case DMA_TLB_PSI_FLUSH:
1057                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1058                 /* Note: always flush non-leaf currently */
1059                 val_iva = size_order | addr;
1060                 break;
1061         default:
1062                 BUG();
1063         }
1064         /* Note: set drain read/write */
1065 #if 0
1066         /*
1067          * This is probably just to be extra safe. Looks like we can
1068          * ignore it without any impact.
1069          */
1070         if (cap_read_drain(iommu->cap))
1071                 val |= DMA_TLB_READ_DRAIN;
1072 #endif
1073         if (cap_write_drain(iommu->cap))
1074                 val |= DMA_TLB_WRITE_DRAIN;
1075
1076         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1077         /* Note: Only uses first TLB reg currently */
1078         if (val_iva)
1079                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1080         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1081
1082         /* Make sure hardware completes it */
1083         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1084                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1085
1086         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1087
1088         /* check IOTLB invalidation granularity */
1089         if (DMA_TLB_IAIG(val) == 0)
1090                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1091         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1092                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1093                         (unsigned long long)DMA_TLB_IIRG(type),
1094                         (unsigned long long)DMA_TLB_IAIG(val));
1095 }
1096
1097 static struct device_domain_info *iommu_support_dev_iotlb(
1098         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1099 {
1100         int found = 0;
1101         unsigned long flags;
1102         struct device_domain_info *info;
1103         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1104
1105         if (!ecap_dev_iotlb_support(iommu->ecap))
1106                 return NULL;
1107
1108         if (!iommu->qi)
1109                 return NULL;
1110
1111         spin_lock_irqsave(&device_domain_lock, flags);
1112         list_for_each_entry(info, &domain->devices, link)
1113                 if (info->bus == bus && info->devfn == devfn) {
1114                         found = 1;
1115                         break;
1116                 }
1117         spin_unlock_irqrestore(&device_domain_lock, flags);
1118
1119         if (!found || !info->dev)
1120                 return NULL;
1121
1122         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1123                 return NULL;
1124
1125         if (!dmar_find_matched_atsr_unit(info->dev))
1126                 return NULL;
1127
1128         info->iommu = iommu;
1129
1130         return info;
1131 }
1132
1133 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1134 {
1135         if (!info)
1136                 return;
1137
1138         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1139 }
1140
1141 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1142 {
1143         if (!info->dev || !pci_ats_enabled(info->dev))
1144                 return;
1145
1146         pci_disable_ats(info->dev);
1147 }
1148
1149 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1150                                   u64 addr, unsigned mask)
1151 {
1152         u16 sid, qdep;
1153         unsigned long flags;
1154         struct device_domain_info *info;
1155
1156         spin_lock_irqsave(&device_domain_lock, flags);
1157         list_for_each_entry(info, &domain->devices, link) {
1158                 if (!info->dev || !pci_ats_enabled(info->dev))
1159                         continue;
1160
1161                 sid = info->bus << 8 | info->devfn;
1162                 qdep = pci_ats_queue_depth(info->dev);
1163                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1164         }
1165         spin_unlock_irqrestore(&device_domain_lock, flags);
1166 }
1167
1168 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1169                                   unsigned long pfn, unsigned int pages, int map)
1170 {
1171         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1172         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1173
1174         BUG_ON(pages == 0);
1175
1176         /*
1177          * Fall back to domain-selective flush if there is no PSI support or
1178          * the size is too big.
1179          * PSI requires the page size to be a power of two, and the base
1180          * address to be naturally aligned to that size.
1181          */
1182         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1183                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1184                                                 DMA_TLB_DSI_FLUSH);
1185         else
1186                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1187                                                 DMA_TLB_PSI_FLUSH);
1188
1189         /*
1190          * In caching mode, changes of pages from non-present to present require
1191          * flush. However, device IOTLB doesn't need to be flushed in this case.
1192          */
1193         if (!cap_caching_mode(iommu->cap) || !map)
1194                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1195 }
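
/*
 * Worked example (illustrative): flushing 9 pages rounds up to 16, giving
 * mask == ilog2(16) == 4, so the hardware invalidates a naturally aligned
 * 16-page (64KiB) region around addr.  Were mask to exceed
 * cap_max_amask_val(), the code above would instead have issued a
 * domain-selective flush.
 */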
1196
1197 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1198 {
1199         u32 pmen;
1200         unsigned long flags;
1201
1202         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1203         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1204         pmen &= ~DMA_PMEN_EPM;
1205         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1206
1207         /* wait for the protected region status bit to clear */
1208         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1209                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1210
1211         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1212 }
1213
1214 static int iommu_enable_translation(struct intel_iommu *iommu)
1215 {
1216         u32 sts;
1217         unsigned long flags;
1218
1219         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1220         iommu->gcmd |= DMA_GCMD_TE;
1221         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1222
1223         /* Make sure hardware completes it */
1224         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1225                       readl, (sts & DMA_GSTS_TES), sts);
1226
1227         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1228         return 0;
1229 }
1230
1231 static int iommu_disable_translation(struct intel_iommu *iommu)
1232 {
1233         u32 sts;
1234         unsigned long flag;
1235
1236         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1237         iommu->gcmd &= ~DMA_GCMD_TE;
1238         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1239
1240         /* Make sure hardware completes it */
1241         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1242                       readl, (!(sts & DMA_GSTS_TES)), sts);
1243
1244         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1245         return 0;
1246 }
1247
1248
1249 static int iommu_init_domains(struct intel_iommu *iommu)
1250 {
1251         unsigned long ndomains;
1252         unsigned long nlongs;
1253
1254         ndomains = cap_ndoms(iommu->cap);
1255         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1256                         ndomains);
1257         nlongs = BITS_TO_LONGS(ndomains);
1258
1259         spin_lock_init(&iommu->lock);
1260
1261         /* TBD: there might be 64K domains,
1262          * consider other allocation for future chip
1263          */
1264         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1265         if (!iommu->domain_ids) {
1266                 printk(KERN_ERR "Allocating domain id array failed\n");
1267                 return -ENOMEM;
1268         }
1269         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1270                         GFP_KERNEL);
1271         if (!iommu->domains) {
1272                 printk(KERN_ERR "Allocating domain array failed\n");
1273                 return -ENOMEM;
1274         }
1275
1276         /*
1277          * if Caching mode is set, then invalid translations are tagged
1278          * with domain id 0. Hence we need to pre-allocate it.
1279          */
1280         if (cap_caching_mode(iommu->cap))
1281                 set_bit(0, iommu->domain_ids);
1282         return 0;
1283 }
1284
1285
1286 static void domain_exit(struct dmar_domain *domain);
1287 static void vm_domain_exit(struct dmar_domain *domain);
1288
1289 void free_dmar_iommu(struct intel_iommu *iommu)
1290 {
1291         struct dmar_domain *domain;
1292         int i;
1293         unsigned long flags;
1294
1295         if ((iommu->domains) && (iommu->domain_ids)) {
1296                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1297                         domain = iommu->domains[i];
1298                         clear_bit(i, iommu->domain_ids);
1299
1300                         spin_lock_irqsave(&domain->iommu_lock, flags);
1301                         if (--domain->iommu_count == 0) {
1302                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1303                                         vm_domain_exit(domain);
1304                                 else
1305                                         domain_exit(domain);
1306                         }
1307                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1308                 }
1309         }
1310
1311         if (iommu->gcmd & DMA_GCMD_TE)
1312                 iommu_disable_translation(iommu);
1313
1314         if (iommu->irq) {
1315                 irq_set_handler_data(iommu->irq, NULL);
1316                 /* This will mask the irq */
1317                 free_irq(iommu->irq, iommu);
1318                 destroy_irq(iommu->irq);
1319         }
1320
1321         kfree(iommu->domains);
1322         kfree(iommu->domain_ids);
1323
1324         g_iommus[iommu->seq_id] = NULL;
1325
1326         /* if all iommus are freed, free g_iommus */
1327         for (i = 0; i < g_num_of_iommus; i++) {
1328                 if (g_iommus[i])
1329                         break;
1330         }
1331
1332         if (i == g_num_of_iommus)
1333                 kfree(g_iommus);
1334
1335         /* free context mapping */
1336         free_context_table(iommu);
1337 }
1338
1339 static struct dmar_domain *alloc_domain(void)
1340 {
1341         struct dmar_domain *domain;
1342
1343         domain = alloc_domain_mem();
1344         if (!domain)
1345                 return NULL;
1346
1347         domain->nid = -1;
1348         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1349         domain->flags = 0;
1350
1351         return domain;
1352 }
1353
1354 static int iommu_attach_domain(struct dmar_domain *domain,
1355                                struct intel_iommu *iommu)
1356 {
1357         int num;
1358         unsigned long ndomains;
1359         unsigned long flags;
1360
1361         ndomains = cap_ndoms(iommu->cap);
1362
1363         spin_lock_irqsave(&iommu->lock, flags);
1364
1365         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1366         if (num >= ndomains) {
1367                 spin_unlock_irqrestore(&iommu->lock, flags);
1368                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1369                 return -ENOMEM;
1370         }
1371
1372         domain->id = num;
1373         set_bit(num, iommu->domain_ids);
1374         set_bit(iommu->seq_id, domain->iommu_bmp);
1375         iommu->domains[num] = domain;
1376         spin_unlock_irqrestore(&iommu->lock, flags);
1377
1378         return 0;
1379 }
1380
1381 static void iommu_detach_domain(struct dmar_domain *domain,
1382                                 struct intel_iommu *iommu)
1383 {
1384         unsigned long flags;
1385         int num, ndomains;
1386         int found = 0;
1387
1388         spin_lock_irqsave(&iommu->lock, flags);
1389         ndomains = cap_ndoms(iommu->cap);
1390         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1391                 if (iommu->domains[num] == domain) {
1392                         found = 1;
1393                         break;
1394                 }
1395         }
1396
1397         if (found) {
1398                 clear_bit(num, iommu->domain_ids);
1399                 clear_bit(iommu->seq_id, domain->iommu_bmp);
1400                 iommu->domains[num] = NULL;
1401         }
1402         spin_unlock_irqrestore(&iommu->lock, flags);
1403 }
1404
1405 static struct iova_domain reserved_iova_list;
1406 static struct lock_class_key reserved_rbtree_key;
1407
1408 static int dmar_init_reserved_ranges(void)
1409 {
1410         struct pci_dev *pdev = NULL;
1411         struct iova *iova;
1412         int i;
1413
1414         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1415
1416         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1417                 &reserved_rbtree_key);
1418
1419         /* IOAPIC ranges shouldn't be accessed by DMA */
1420         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1421                 IOVA_PFN(IOAPIC_RANGE_END));
1422         if (!iova) {
1423                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1424                 return -ENODEV;
1425         }
1426
1427         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1428         for_each_pci_dev(pdev) {
1429                 struct resource *r;
1430
1431                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1432                         r = &pdev->resource[i];
1433                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1434                                 continue;
1435                         iova = reserve_iova(&reserved_iova_list,
1436                                             IOVA_PFN(r->start),
1437                                             IOVA_PFN(r->end));
1438                         if (!iova) {
1439                                 printk(KERN_ERR "Reserve iova failed\n");
1440                                 return -ENODEV;
1441                         }
1442                 }
1443         }
1444         return 0;
1445 }
1446
1447 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1448 {
1449         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1450 }
1451
1452 static inline int guestwidth_to_adjustwidth(int gaw)
1453 {
1454         int agaw;
1455         int r = (gaw - 12) % 9;
1456
1457         if (r == 0)
1458                 agaw = gaw;
1459         else
1460                 agaw = gaw + 9 - r;
1461         if (agaw > 64)
1462                 agaw = 64;
1463         return agaw;
1464 }
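
/*
 * Worked example (illustrative): guestwidth_to_adjustwidth(48) == 48 because
 * (48 - 12) is an exact multiple of 9, while guestwidth_to_adjustwidth(40)
 * rounds up to 48, since a 40-bit space does not decompose into whole 9-bit
 * page-table levels above the 12-bit page offset.
 */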
1465
1466 static int domain_init(struct dmar_domain *domain, int guest_width)
1467 {
1468         struct intel_iommu *iommu;
1469         int adjust_width, agaw;
1470         unsigned long sagaw;
1471
1472         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1473         spin_lock_init(&domain->iommu_lock);
1474
1475         domain_reserve_special_ranges(domain);
1476
1477         /* calculate AGAW */
1478         iommu = domain_get_iommu(domain);
1479         if (guest_width > cap_mgaw(iommu->cap))
1480                 guest_width = cap_mgaw(iommu->cap);
1481         domain->gaw = guest_width;
1482         adjust_width = guestwidth_to_adjustwidth(guest_width);
1483         agaw = width_to_agaw(adjust_width);
1484         sagaw = cap_sagaw(iommu->cap);
1485         if (!test_bit(agaw, &sagaw)) {
1486                 /* hardware doesn't support it, choose a bigger one */
1487                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1488                 agaw = find_next_bit(&sagaw, 5, agaw);
1489                 if (agaw >= 5)
1490                         return -ENODEV;
1491         }
1492         domain->agaw = agaw;
1493         INIT_LIST_HEAD(&domain->devices);
1494
1495         if (ecap_coherent(iommu->ecap))
1496                 domain->iommu_coherency = 1;
1497         else
1498                 domain->iommu_coherency = 0;
1499
1500         if (ecap_sc_support(iommu->ecap))
1501                 domain->iommu_snooping = 1;
1502         else
1503                 domain->iommu_snooping = 0;
1504
1505         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1506         domain->iommu_count = 1;
1507         domain->nid = iommu->node;
1508
1509         /* always allocate the top pgd */
1510         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1511         if (!domain->pgd)
1512                 return -ENOMEM;
1513         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1514         return 0;
1515 }
1516
1517 static void domain_exit(struct dmar_domain *domain)
1518 {
1519         struct dmar_drhd_unit *drhd;
1520         struct intel_iommu *iommu;
1521
1522         /* Domain 0 is reserved, so don't process it */
1523         if (!domain)
1524                 return;
1525
1526         /* Flush any lazy unmaps that may reference this domain */
1527         if (!intel_iommu_strict)
1528                 flush_unmaps_timeout(0);
1529
1530         domain_remove_dev_info(domain);
1531         /* destroy iovas */
1532         put_iova_domain(&domain->iovad);
1533
1534         /* clear ptes */
1535         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1536
1537         /* free page tables */
1538         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1539
1540         for_each_active_iommu(iommu, drhd)
1541                 if (test_bit(iommu->seq_id, domain->iommu_bmp))
1542                         iommu_detach_domain(domain, iommu);
1543
1544         free_domain_mem(domain);
1545 }
1546
1547 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1548                                  u8 bus, u8 devfn, int translation)
1549 {
1550         struct context_entry *context;
1551         unsigned long flags;
1552         struct intel_iommu *iommu;
1553         struct dma_pte *pgd;
1554         unsigned long num;
1555         unsigned long ndomains;
1556         int id;
1557         int agaw;
1558         struct device_domain_info *info = NULL;
1559
1560         pr_debug("Set context mapping for %02x:%02x.%d\n",
1561                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1562
1563         BUG_ON(!domain->pgd);
1564         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1565                translation != CONTEXT_TT_MULTI_LEVEL);
1566
1567         iommu = device_to_iommu(segment, bus, devfn);
1568         if (!iommu)
1569                 return -ENODEV;
1570
1571         context = device_to_context_entry(iommu, bus, devfn);
1572         if (!context)
1573                 return -ENOMEM;
1574         spin_lock_irqsave(&iommu->lock, flags);
1575         if (context_present(context)) {
1576                 spin_unlock_irqrestore(&iommu->lock, flags);
1577                 return 0;
1578         }
1579
1580         id = domain->id;
1581         pgd = domain->pgd;
1582
1583         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1584             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1585                 int found = 0;
1586
1587                 /* find an available domain id for this device in iommu */
1588                 ndomains = cap_ndoms(iommu->cap);
1589                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1590                         if (iommu->domains[num] == domain) {
1591                                 id = num;
1592                                 found = 1;
1593                                 break;
1594                         }
1595                 }
1596
1597                 if (found == 0) {
1598                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1599                         if (num >= ndomains) {
1600                                 spin_unlock_irqrestore(&iommu->lock, flags);
1601                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1602                                 return -EFAULT;
1603                         }
1604
1605                         set_bit(num, iommu->domain_ids);
1606                         iommu->domains[num] = domain;
1607                         id = num;
1608                 }
1609
1610                 /* Skip the top levels of the page tables for an
1611                  * iommu whose agaw is less than the default.
1612                  * Unnecessary for PT mode.
1613                  */
1614                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1615                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1616                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1617                                 if (!dma_pte_present(pgd)) {
1618                                         spin_unlock_irqrestore(&iommu->lock, flags);
1619                                         return -ENOMEM;
1620                                 }
1621                         }
1622                 }
1623         }
1624
1625         context_set_domain_id(context, id);
1626
1627         if (translation != CONTEXT_TT_PASS_THROUGH) {
1628                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1629                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1630                                      CONTEXT_TT_MULTI_LEVEL;
1631         }
1632         /*
1633          * In pass through mode, AW must be programmed to indicate the largest
1634          * AGAW value supported by hardware. And ASR is ignored by hardware.
1635          */
1636         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1637                 context_set_address_width(context, iommu->msagaw);
1638         else {
1639                 context_set_address_root(context, virt_to_phys(pgd));
1640                 context_set_address_width(context, iommu->agaw);
1641         }
1642
1643         context_set_translation_type(context, translation);
1644         context_set_fault_enable(context);
1645         context_set_present(context);
1646         domain_flush_cache(domain, context, sizeof(*context));
1647
1648         /*
1649          * It's a non-present to present mapping. If hardware doesn't cache
1650          * non-present entries we only need to flush the write-buffer. If it
1651          * _does_ cache non-present entries, then it does so in the special
1652          * domain #0, which we have to flush:
1653          */
1654         if (cap_caching_mode(iommu->cap)) {
1655                 iommu->flush.flush_context(iommu, 0,
1656                                            (((u16)bus) << 8) | devfn,
1657                                            DMA_CCMD_MASK_NOBIT,
1658                                            DMA_CCMD_DEVICE_INVL);
1659                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1660         } else {
1661                 iommu_flush_write_buffer(iommu);
1662         }
1663         iommu_enable_dev_iotlb(info);
1664         spin_unlock_irqrestore(&iommu->lock, flags);
1665
1666         spin_lock_irqsave(&domain->iommu_lock, flags);
1667         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1668                 domain->iommu_count++;
1669                 if (domain->iommu_count == 1)
1670                         domain->nid = iommu->node;
1671                 domain_update_iommu_cap(domain);
1672         }
1673         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1674         return 0;
1675 }
1676
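     /*
      * Set up context entries for the device itself and, if it sits behind a
      * PCIe-to-PCI bridge, for every bridge on the path up to that bridge as
      * well, since all devices behind the bridge share the same source-id.
      */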
1677 static int
1678 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1679                         int translation)
1680 {
1681         int ret;
1682         struct pci_dev *tmp, *parent;
1683
1684         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1685                                          pdev->bus->number, pdev->devfn,
1686                                          translation);
1687         if (ret)
1688                 return ret;
1689
1690         /* dependent device mapping */
1691         tmp = pci_find_upstream_pcie_bridge(pdev);
1692         if (!tmp)
1693                 return 0;
1694         /* Secondary interface's bus number and devfn 0 */
1695         parent = pdev->bus->self;
1696         while (parent != tmp) {
1697                 ret = domain_context_mapping_one(domain,
1698                                                  pci_domain_nr(parent->bus),
1699                                                  parent->bus->number,
1700                                                  parent->devfn, translation);
1701                 if (ret)
1702                         return ret;
1703                 parent = parent->bus->self;
1704         }
1705         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1706                 return domain_context_mapping_one(domain,
1707                                         pci_domain_nr(tmp->subordinate),
1708                                         tmp->subordinate->number, 0,
1709                                         translation);
1710         else /* this is a legacy PCI bridge */
1711                 return domain_context_mapping_one(domain,
1712                                                   pci_domain_nr(tmp->bus),
1713                                                   tmp->bus->number,
1714                                                   tmp->devfn,
1715                                                   translation);
1716 }
1717
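     /*
      * Check whether context entries are already present for the device and,
      * if it is behind a PCIe-to-PCI bridge, for all of its upstream bridges.
      */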
1718 static int domain_context_mapped(struct pci_dev *pdev)
1719 {
1720         int ret;
1721         struct pci_dev *tmp, *parent;
1722         struct intel_iommu *iommu;
1723
1724         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1725                                 pdev->devfn);
1726         if (!iommu)
1727                 return -ENODEV;
1728
1729         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1730         if (!ret)
1731                 return ret;
1732         /* dependent device mapping */
1733         tmp = pci_find_upstream_pcie_bridge(pdev);
1734         if (!tmp)
1735                 return ret;
1736         /* Secondary interface's bus number and devfn 0 */
1737         parent = pdev->bus->self;
1738         while (parent != tmp) {
1739                 ret = device_context_mapped(iommu, parent->bus->number,
1740                                             parent->devfn);
1741                 if (!ret)
1742                         return ret;
1743                 parent = parent->bus->self;
1744         }
1745         if (pci_is_pcie(tmp))
1746                 return device_context_mapped(iommu, tmp->subordinate->number,
1747                                              0);
1748         else
1749                 return device_context_mapped(iommu, tmp->bus->number,
1750                                              tmp->devfn);
1751 }
1752
1753 /* Returns the number of VT-d pages needed, aligned up to the MM page size */
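     /*
      * Example (assuming 4KiB MM and VT-d pages): host_addr offset 0x200 with
      * size 0x1000 gives PAGE_ALIGN(0x1200) = 0x2000, i.e. two VT-d pages.
      */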
1754 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1755                                             size_t size)
1756 {
1757         host_addr &= ~PAGE_MASK;
1758         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1759 }
1760
1761 /* Return largest possible superpage level for a given mapping */
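     /*
      * With the 9-bit stride, level 1 means 4KiB pages, level 2 means 2MiB
      * superpages and level 3 means 1GiB superpages; e.g. a mapping whose
      * IOVA and physical PFNs are both 2MiB aligned and which spans at least
      * 512 pages can use level 2, provided the hardware advertises support.
      */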
1762 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1763                                           unsigned long iov_pfn,
1764                                           unsigned long phy_pfn,
1765                                           unsigned long pages)
1766 {
1767         int support, level = 1;
1768         unsigned long pfnmerge;
1769
1770         support = domain->iommu_superpage;
1771
1772         /* To use a large page, the virtual *and* physical addresses
1773            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1774            of them will mean we have to use smaller pages. So just
1775            merge them and check both at once. */
1776         pfnmerge = iov_pfn | phy_pfn;
1777
1778         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1779                 pages >>= VTD_STRIDE_SHIFT;
1780                 if (!pages)
1781                         break;
1782                 pfnmerge >>= VTD_STRIDE_SHIFT;
1783                 level++;
1784                 support--;
1785         }
1786         return level;
1787 }
1788
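     /*
      * Install PTEs for nr_pages starting at iov_pfn, taking the physical
      * pages either from the scatterlist sg or from the contiguous range
      * starting at phys_pfn.  Superpage PTEs are used whenever alignment and
      * remaining length allow, and the written PTEs are flushed to memory one
      * page-table page at a time.
      */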
1789 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1790                             struct scatterlist *sg, unsigned long phys_pfn,
1791                             unsigned long nr_pages, int prot)
1792 {
1793         struct dma_pte *first_pte = NULL, *pte = NULL;
1794         phys_addr_t uninitialized_var(pteval);
1795         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1796         unsigned long sg_res;
1797         unsigned int largepage_lvl = 0;
1798         unsigned long lvl_pages = 0;
1799
1800         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1801
1802         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1803                 return -EINVAL;
1804
1805         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1806
1807         if (sg)
1808                 sg_res = 0;
1809         else {
1810                 sg_res = nr_pages + 1;
1811                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1812         }
1813
1814         while (nr_pages > 0) {
1815                 uint64_t tmp;
1816
1817                 if (!sg_res) {
1818                         sg_res = aligned_nrpages(sg->offset, sg->length);
1819                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1820                         sg->dma_length = sg->length;
1821                         pteval = page_to_phys(sg_page(sg)) | prot;
1822                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1823                 }
1824
1825                 if (!pte) {
1826                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1827
1828                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1829                         if (!pte)
1830                                 return -ENOMEM;
1831                         /* It is a large page */
1832                         if (largepage_lvl > 1) {
1833                                 pteval |= DMA_PTE_LARGE_PAGE;
1834                                 /* Ensure that old small page tables are removed to make room
1835                                    for superpage, if they exist. */
1836                                 dma_pte_clear_range(domain, iov_pfn,
1837                                                     iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1838                                 dma_pte_free_pagetable(domain, iov_pfn,
1839                                                        iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1840                         } else {
1841                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1842                         }
1843
1844                 }
1845                 /* We don't need a lock here; nobody else
1846                  * touches this iova range.
1847                  */
1848                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1849                 if (tmp) {
1850                         static int dumps = 5;
1851                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1852                                iov_pfn, tmp, (unsigned long long)pteval);
1853                         if (dumps) {
1854                                 dumps--;
1855                                 debug_dma_dump_mappings(NULL);
1856                         }
1857                         WARN_ON(1);
1858                 }
1859
1860                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1861
1862                 BUG_ON(nr_pages < lvl_pages);
1863                 BUG_ON(sg_res < lvl_pages);
1864
1865                 nr_pages -= lvl_pages;
1866                 iov_pfn += lvl_pages;
1867                 phys_pfn += lvl_pages;
1868                 pteval += lvl_pages * VTD_PAGE_SIZE;
1869                 sg_res -= lvl_pages;
1870
1871                 /* If the next PTE would be the first in a new page, then we
1872                    need to flush the cache on the entries we've just written.
1873                    And then we'll need to recalculate 'pte', so clear it and
1874                    let it get set again in the if (!pte) block above.
1875
1876                    If we're done (!nr_pages) we need to flush the cache too.
1877
1878                    Also if we've been setting superpages, we may need to
1879                    recalculate 'pte' and switch back to smaller pages for the
1880                    end of the mapping, if the trailing size is not enough to
1881                    use another superpage (i.e. sg_res < lvl_pages). */
1882                 pte++;
1883                 if (!nr_pages || first_pte_in_page(pte) ||
1884                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1885                         domain_flush_cache(domain, first_pte,
1886                                            (void *)pte - (void *)first_pte);
1887                         pte = NULL;
1888                 }
1889
1890                 if (!sg_res && nr_pages)
1891                         sg = sg_next(sg);
1892         }
1893         return 0;
1894 }
1895
1896 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1897                                     struct scatterlist *sg, unsigned long nr_pages,
1898                                     int prot)
1899 {
1900         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1901 }
1902
1903 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1904                                      unsigned long phys_pfn, unsigned long nr_pages,
1905                                      int prot)
1906 {
1907         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1908 }
1909
1910 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1911 {
1912         if (!iommu)
1913                 return;
1914
1915         clear_context_table(iommu, bus, devfn);
1916         iommu->flush.flush_context(iommu, 0, 0, 0,
1917                                            DMA_CCMD_GLOBAL_INVL);
1918         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1919 }
1920
1921 static inline void unlink_domain_info(struct device_domain_info *info)
1922 {
1923         assert_spin_locked(&device_domain_lock);
1924         list_del(&info->link);
1925         list_del(&info->global);
1926         if (info->dev)
1927                 info->dev->dev.archdata.iommu = NULL;
1928 }
1929
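     /*
      * Detach every device from the domain: unlink its device_domain_info,
      * disable its device IOTLB, clear its context entry and free the info
      * structure.  The lock is dropped around the hardware operations.
      */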
1930 static void domain_remove_dev_info(struct dmar_domain *domain)
1931 {
1932         struct device_domain_info *info;
1933         unsigned long flags;
1934         struct intel_iommu *iommu;
1935
1936         spin_lock_irqsave(&device_domain_lock, flags);
1937         while (!list_empty(&domain->devices)) {
1938                 info = list_entry(domain->devices.next,
1939                         struct device_domain_info, link);
1940                 unlink_domain_info(info);
1941                 spin_unlock_irqrestore(&device_domain_lock, flags);
1942
1943                 iommu_disable_dev_iotlb(info);
1944                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1945                 iommu_detach_dev(iommu, info->bus, info->devfn);
1946                 free_devinfo_mem(info);
1947
1948                 spin_lock_irqsave(&device_domain_lock, flags);
1949         }
1950         spin_unlock_irqrestore(&device_domain_lock, flags);
1951 }
1952
1953 /*
1954  * find_domain
1955  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1956  */
1957 static struct dmar_domain *
1958 find_domain(struct pci_dev *pdev)
1959 {
1960         struct device_domain_info *info;
1961
1962         /* No lock here; we assume no domain exits in the normal case */
1963         info = pdev->dev.archdata.iommu;
1964         if (info)
1965                 return info->domain;
1966         return NULL;
1967 }
1968
1969 /* Find or allocate an initialized domain for the device */
1970 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1971 {
1972         struct dmar_domain *domain, *found = NULL;
1973         struct intel_iommu *iommu;
1974         struct dmar_drhd_unit *drhd;
1975         struct device_domain_info *info, *tmp;
1976         struct pci_dev *dev_tmp;
1977         unsigned long flags;
1978         int bus = 0, devfn = 0;
1979         int segment;
1980         int ret;
1981
1982         domain = find_domain(pdev);
1983         if (domain)
1984                 return domain;
1985
1986         segment = pci_domain_nr(pdev->bus);
1987
1988         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1989         if (dev_tmp) {
1990                 if (pci_is_pcie(dev_tmp)) {
1991                         bus = dev_tmp->subordinate->number;
1992                         devfn = 0;
1993                 } else {
1994                         bus = dev_tmp->bus->number;
1995                         devfn = dev_tmp->devfn;
1996                 }
1997                 spin_lock_irqsave(&device_domain_lock, flags);
1998                 list_for_each_entry(info, &device_domain_list, global) {
1999                         if (info->segment == segment &&
2000                             info->bus == bus && info->devfn == devfn) {
2001                                 found = info->domain;
2002                                 break;
2003                         }
2004                 }
2005                 spin_unlock_irqrestore(&device_domain_lock, flags);
2006                 /* pcie-pci bridge already has a domain, use it */
2007                 if (found) {
2008                         domain = found;
2009                         goto found_domain;
2010                 }
2011         }
2012
2013         domain = alloc_domain();
2014         if (!domain)
2015                 goto error;
2016
2017         /* Allocate new domain for the device */
2018         drhd = dmar_find_matched_drhd_unit(pdev);
2019         if (!drhd) {
2020                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2021                         pci_name(pdev));
2022                 free_domain_mem(domain);
2023                 return NULL;
2024         }
2025         iommu = drhd->iommu;
2026
2027         ret = iommu_attach_domain(domain, iommu);
2028         if (ret) {
2029                 free_domain_mem(domain);
2030                 goto error;
2031         }
2032
2033         if (domain_init(domain, gaw)) {
2034                 domain_exit(domain);
2035                 goto error;
2036         }
2037
2038         /* register pcie-to-pci device */
2039         if (dev_tmp) {
2040                 info = alloc_devinfo_mem();
2041                 if (!info) {
2042                         domain_exit(domain);
2043                         goto error;
2044                 }
2045                 info->segment = segment;
2046                 info->bus = bus;
2047                 info->devfn = devfn;
2048                 info->dev = NULL;
2049                 info->domain = domain;
2050                 /* This domain is shared by devices under the p2p bridge */
2051                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2052
2053                 /* pcie-to-pci bridge already has a domain, use it */
2054                 found = NULL;
2055                 spin_lock_irqsave(&device_domain_lock, flags);
2056                 list_for_each_entry(tmp, &device_domain_list, global) {
2057                         if (tmp->segment == segment &&
2058                             tmp->bus == bus && tmp->devfn == devfn) {
2059                                 found = tmp->domain;
2060                                 break;
2061                         }
2062                 }
2063                 if (found) {
2064                         spin_unlock_irqrestore(&device_domain_lock, flags);
2065                         free_devinfo_mem(info);
2066                         domain_exit(domain);
2067                         domain = found;
2068                 } else {
2069                         list_add(&info->link, &domain->devices);
2070                         list_add(&info->global, &device_domain_list);
2071                         spin_unlock_irqrestore(&device_domain_lock, flags);
2072                 }
2073         }
2074
2075 found_domain:
2076         info = alloc_devinfo_mem();
2077         if (!info)
2078                 goto error;
2079         info->segment = segment;
2080         info->bus = pdev->bus->number;
2081         info->devfn = pdev->devfn;
2082         info->dev = pdev;
2083         info->domain = domain;
2084         spin_lock_irqsave(&device_domain_lock, flags);
2085         /* somebody else was faster and set up the domain already */
2086         found = find_domain(pdev);
2087         if (found != NULL) {
2088                 spin_unlock_irqrestore(&device_domain_lock, flags);
2089                 if (found != domain) {
2090                         domain_exit(domain);
2091                         domain = found;
2092                 }
2093                 free_devinfo_mem(info);
2094                 return domain;
2095         }
2096         list_add(&info->link, &domain->devices);
2097         list_add(&info->global, &device_domain_list);
2098         pdev->dev.archdata.iommu = info;
2099         spin_unlock_irqrestore(&device_domain_lock, flags);
2100         return domain;
2101 error:
2102         /* recheck it here, maybe somebody else set it */
2103         return find_domain(pdev);
2104 }
2105
2106 static int iommu_identity_mapping;
2107 #define IDENTMAP_ALL            1
2108 #define IDENTMAP_GFX            2
2109 #define IDENTMAP_AZALIA         4
2110
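     /*
      * Reserve the IOVA range [start, end] in the domain and map it 1:1,
      * i.e. with the I/O virtual PFN equal to the physical PFN.
      */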
2111 static int iommu_domain_identity_map(struct dmar_domain *domain,
2112                                      unsigned long long start,
2113                                      unsigned long long end)
2114 {
2115         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2116         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2117
2118         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2119                           dma_to_mm_pfn(last_vpfn))) {
2120                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2121                 return -ENOMEM;
2122         }
2123
2124         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2125                  start, end, domain->id);
2126         /*
2127          * The RMRR range might overlap with the physical memory range,
2128          * so clear it first
2129          */
2130         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2131
2132         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2133                                   last_vpfn - first_vpfn + 1,
2134                                   DMA_PTE_READ|DMA_PTE_WRITE);
2135 }
2136
2137 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2138                                       unsigned long long start,
2139                                       unsigned long long end)
2140 {
2141         struct dmar_domain *domain;
2142         int ret;
2143
2144         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2145         if (!domain)
2146                 return -ENOMEM;
2147
2148         /* For _hardware_ passthrough, don't bother. But for software
2149            passthrough, we do it anyway -- it may indicate a memory
2150            range which is reserved in E820 and so didn't get set
2151            up in si_domain to start with */
2152         if (domain == si_domain && hw_pass_through) {
2153                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2154                        pci_name(pdev), start, end);
2155                 return 0;
2156         }
2157
2158         printk(KERN_INFO
2159                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2160                pci_name(pdev), start, end);
2161
2162         if (end < start) {
2163                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2164                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2165                         dmi_get_system_info(DMI_BIOS_VENDOR),
2166                         dmi_get_system_info(DMI_BIOS_VERSION),
2167                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2168                 ret = -EIO;
2169                 goto error;
2170         }
2171
2172         if (end >> agaw_to_width(domain->agaw)) {
2173                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2174                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2175                      agaw_to_width(domain->agaw),
2176                      dmi_get_system_info(DMI_BIOS_VENDOR),
2177                      dmi_get_system_info(DMI_BIOS_VERSION),
2178                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2179                 ret = -EIO;
2180                 goto error;
2181         }
2182
2183         ret = iommu_domain_identity_map(domain, start, end);
2184         if (ret)
2185                 goto error;
2186
2187         /* context entry init */
2188         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2189         if (ret)
2190                 goto error;
2191
2192         return 0;
2193
2194  error:
2195         domain_exit(domain);
2196         return ret;
2197 }
2198
2199 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2200         struct pci_dev *pdev)
2201 {
2202         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2203                 return 0;
2204         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2205                 rmrr->end_address);
2206 }
2207
2208 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2209 static inline void iommu_prepare_isa(void)
2210 {
2211         struct pci_dev *pdev;
2212         int ret;
2213
2214         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2215         if (!pdev)
2216                 return;
2217
2218         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2219         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2220
2221         if (ret)
2222                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2223                        "floppy might not work\n");
2224
2225 }
2226 #else
2227 static inline void iommu_prepare_isa(void)
2228 {
2229         return;
2230 }
2231 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2232
2233 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2234
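     /*
      * si_domain is the static identity (1:1) domain shared by all devices
      * that use identity mapping.  Attach it to every IOMMU and, unless
      * hardware pass-through is used, populate it with 1:1 mappings for all
      * usable physical memory ranges.
      */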
2235 static int __init si_domain_init(int hw)
2236 {
2237         struct dmar_drhd_unit *drhd;
2238         struct intel_iommu *iommu;
2239         int nid, ret = 0;
2240
2241         si_domain = alloc_domain();
2242         if (!si_domain)
2243                 return -EFAULT;
2244
2245         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2246
2247         for_each_active_iommu(iommu, drhd) {
2248                 ret = iommu_attach_domain(si_domain, iommu);
2249                 if (ret) {
2250                         domain_exit(si_domain);
2251                         return -EFAULT;
2252                 }
2253         }
2254
2255         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2256                 domain_exit(si_domain);
2257                 return -EFAULT;
2258         }
2259
2260         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2261
2262         if (hw)
2263                 return 0;
2264
2265         for_each_online_node(nid) {
2266                 unsigned long start_pfn, end_pfn;
2267                 int i;
2268
2269                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2270                         ret = iommu_domain_identity_map(si_domain,
2271                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2272                         if (ret)
2273                                 return ret;
2274                 }
2275         }
2276
2277         return 0;
2278 }
2279
2280 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2281                                           struct pci_dev *pdev);
2282 static int identity_mapping(struct pci_dev *pdev)
2283 {
2284         struct device_domain_info *info;
2285
2286         if (likely(!iommu_identity_mapping))
2287                 return 0;
2288
2289         info = pdev->dev.archdata.iommu;
2290         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2291                 return (info->domain == si_domain);
2292
2293         return 0;
2294 }
2295
2296 static int domain_add_dev_info(struct dmar_domain *domain,
2297                                struct pci_dev *pdev,
2298                                int translation)
2299 {
2300         struct device_domain_info *info;
2301         unsigned long flags;
2302         int ret;
2303
2304         info = alloc_devinfo_mem();
2305         if (!info)
2306                 return -ENOMEM;
2307
2308         info->segment = pci_domain_nr(pdev->bus);
2309         info->bus = pdev->bus->number;
2310         info->devfn = pdev->devfn;
2311         info->dev = pdev;
2312         info->domain = domain;
2313
2314         spin_lock_irqsave(&device_domain_lock, flags);
2315         list_add(&info->link, &domain->devices);
2316         list_add(&info->global, &device_domain_list);
2317         pdev->dev.archdata.iommu = info;
2318         spin_unlock_irqrestore(&device_domain_lock, flags);
2319
2320         ret = domain_context_mapping(domain, pdev, translation);
2321         if (ret) {
2322                 spin_lock_irqsave(&device_domain_lock, flags);
2323                 unlink_domain_info(info);
2324                 spin_unlock_irqrestore(&device_domain_lock, flags);
2325                 free_devinfo_mem(info);
2326                 return ret;
2327         }
2328
2329         return 0;
2330 }
2331
2332 static bool device_has_rmrr(struct pci_dev *dev)
2333 {
2334         struct dmar_rmrr_unit *rmrr;
2335         int i;
2336
2337         for_each_rmrr_units(rmrr) {
2338                 for (i = 0; i < rmrr->devices_cnt; i++) {
2339                         /*
2340                          * Return TRUE if this RMRR contains the device that
2341                          * is passed in.
2342                          */
2343                         if (rmrr->devices[i] == dev)
2344                                 return true;
2345                 }
2346         }
2347         return false;
2348 }
2349
2350 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2351 {
2352
2353         /*
2354          * We want to prevent any device associated with an RMRR from
2355          * getting placed into the SI Domain. This is done because
2356          * problems exist when devices are moved in and out of domains
2357          * and their respective RMRR info is lost. We exempt USB devices
2358          * from this process due to their usage of RMRRs that are known
2359          * to not be needed after BIOS hand-off to OS.
2360          */
2361         if (device_has_rmrr(pdev) &&
2362             (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2363                 return 0;
2364
2365         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2366                 return 1;
2367
2368         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2369                 return 1;
2370
2371         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2372                 return 0;
2373
2374         /*
2375          * We want to start off with all devices in the 1:1 domain, and
2376          * take them out later if we find they can't access all of memory.
2377          *
2378          * However, we can't do this for PCI devices behind bridges,
2379          * because all PCI devices behind the same bridge will end up
2380          * with the same source-id on their transactions.
2381          *
2382          * Practically speaking, we can't change things around for these
2383          * devices at run-time, because we can't be sure there'll be no
2384          * DMA transactions in flight for any of their siblings.
2385          * 
2386          * So PCI devices (unless they're on the root bus) as well as
2387          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2388          * the 1:1 domain, just in _case_ one of their siblings turns out
2389          * not to be able to map all of memory.
2390          */
2391         if (!pci_is_pcie(pdev)) {
2392                 if (!pci_is_root_bus(pdev->bus))
2393                         return 0;
2394                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2395                         return 0;
2396         } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2397                 return 0;
2398
2399         /* 
2400          * At boot time, we don't yet know if devices will be 64-bit capable.
2401          * Assume that they will -- if they turn out not to be, then we can 
2402          * take them out of the 1:1 domain later.
2403          */
2404         if (!startup) {
2405                 /*
2406                  * If the device's dma_mask is less than the system's memory
2407                  * size then this is not a candidate for identity mapping.
2408                  */
2409                 u64 dma_mask = pdev->dma_mask;
2410
2411                 if (pdev->dev.coherent_dma_mask &&
2412                     pdev->dev.coherent_dma_mask < dma_mask)
2413                         dma_mask = pdev->dev.coherent_dma_mask;
2414
2415                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2416         }
2417
2418         return 1;
2419 }
2420
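     /*
      * At boot, put every PCI device that qualifies for identity mapping into
      * si_domain, using pass-through context entries when the hardware
      * supports it and multi-level translation otherwise.
      */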
2421 static int __init iommu_prepare_static_identity_mapping(int hw)
2422 {
2423         struct pci_dev *pdev = NULL;
2424         int ret;
2425
2426         ret = si_domain_init(hw);
2427         if (ret)
2428                 return -EFAULT;
2429
2430         for_each_pci_dev(pdev) {
2431                 if (iommu_should_identity_map(pdev, 1)) {
2432                         ret = domain_add_dev_info(si_domain, pdev,
2433                                              hw ? CONTEXT_TT_PASS_THROUGH :
2434                                                   CONTEXT_TT_MULTI_LEVEL);
2435                         if (ret) {
2436                                 /* device not associated with an iommu */
2437                                 if (ret == -ENODEV)
2438                                         continue;
2439                                 return ret;
2440                         }
2441                         pr_info("IOMMU: %s identity mapping for device %s\n",
2442                                 hw ? "hardware" : "software", pci_name(pdev));
2443                 }
2444         }
2445
2446         return 0;
2447 }
2448
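     /*
      * Main VT-d initialisation: count the DRHD units, allocate the global
      * iommu and deferred-flush arrays, set up per-iommu domains and root
      * entries, pick queued or register based invalidation, create the
      * identity/RMRR/ISA mappings, and finally enable fault reporting and
      * DMA translation on each unit.
      */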
2449 static int __init init_dmars(void)
2450 {
2451         struct dmar_drhd_unit *drhd;
2452         struct dmar_rmrr_unit *rmrr;
2453         struct pci_dev *pdev;
2454         struct intel_iommu *iommu;
2455         int i, ret;
2456
2457         /*
2458          * for each drhd
2459          *    allocate root
2460          *    initialize and program root entry to not present
2461          * endfor
2462          */
2463         for_each_drhd_unit(drhd) {
2464                 /*
2465                  * lock not needed as this is only incremented in the single
2466                  * threaded kernel __init code path; all other accesses are
2467                  * read only
2468                  */
2469                 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2470                         g_num_of_iommus++;
2471                         continue;
2472                 }
2473                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2474                           IOMMU_UNITS_SUPPORTED);
2475         }
2476
2477         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2478                         GFP_KERNEL);
2479         if (!g_iommus) {
2480                 printk(KERN_ERR "Allocating global iommu array failed\n");
2481                 ret = -ENOMEM;
2482                 goto error;
2483         }
2484
2485         deferred_flush = kzalloc(g_num_of_iommus *
2486                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2487         if (!deferred_flush) {
2488                 ret = -ENOMEM;
2489                 goto error;
2490         }
2491
2492         for_each_drhd_unit(drhd) {
2493                 if (drhd->ignored)
2494                         continue;
2495
2496                 iommu = drhd->iommu;
2497                 g_iommus[iommu->seq_id] = iommu;
2498
2499                 ret = iommu_init_domains(iommu);
2500                 if (ret)
2501                         goto error;
2502
2503                 /*
2504                  * TBD:
2505                  * we could share the same root & context tables
2506                  * among all IOMMUs. Need to split it later.
2507                  */
2508                 ret = iommu_alloc_root_entry(iommu);
2509                 if (ret) {
2510                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2511                         goto error;
2512                 }
2513                 if (!ecap_pass_through(iommu->ecap))
2514                         hw_pass_through = 0;
2515         }
2516
2517         /*
2518          * Start from a sane iommu hardware state.
2519          */
2520         for_each_drhd_unit(drhd) {
2521                 if (drhd->ignored)
2522                         continue;
2523
2524                 iommu = drhd->iommu;
2525
2526                 /*
2527                  * If the queued invalidation is already initialized by us
2528                  * (for example, while enabling interrupt-remapping) then
2529                  * things are already rolling from a sane state.
2530                  */
2531                 if (iommu->qi)
2532                         continue;
2533
2534                 /*
2535                  * Clear any previous faults.
2536                  */
2537                 dmar_fault(-1, iommu);
2538                 /*
2539                  * Disable queued invalidation if supported and already enabled
2540                  * before OS handover.
2541                  */
2542                 dmar_disable_qi(iommu);
2543         }
2544
2545         for_each_drhd_unit(drhd) {
2546                 if (drhd->ignored)
2547                         continue;
2548
2549                 iommu = drhd->iommu;
2550
2551                 if (dmar_enable_qi(iommu)) {
2552                         /*
2553                          * Queued Invalidate not enabled, use Register Based
2554                          * Invalidate
2555                          */
2556                         iommu->flush.flush_context = __iommu_flush_context;
2557                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2558                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2559                                "invalidation\n",
2560                                 iommu->seq_id,
2561                                (unsigned long long)drhd->reg_base_addr);
2562                 } else {
2563                         iommu->flush.flush_context = qi_flush_context;
2564                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2565                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2566                                "invalidation\n",
2567                                 iommu->seq_id,
2568                                (unsigned long long)drhd->reg_base_addr);
2569                 }
2570         }
2571
2572         if (iommu_pass_through)
2573                 iommu_identity_mapping |= IDENTMAP_ALL;
2574
2575 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2576         iommu_identity_mapping |= IDENTMAP_GFX;
2577 #endif
2578
2579         check_tylersburg_isoch();
2580
2581         /*
2582          * If pass through is not set or not enabled, set up context entries
2583          * for identity mappings of rmrr, gfx and isa devices; this may fall
2584          * back to static identity mapping if iommu_identity_mapping is set.
2585          */
2586         if (iommu_identity_mapping) {
2587                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2588                 if (ret) {
2589                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2590                         goto error;
2591                 }
2592         }
2593         /*
2594          * For each rmrr
2595          *   for each dev attached to rmrr
2596          *   do
2597          *     locate drhd for dev, alloc domain for dev
2598          *     allocate free domain
2599          *     allocate page table entries for rmrr
2600          *     if context not allocated for bus
2601          *           allocate and init context
2602          *           set present in root table for this bus
2603          *     init context with domain, translation etc
2604          *    endfor
2605          * endfor
2606          */
2607         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2608         for_each_rmrr_units(rmrr) {
2609                 for (i = 0; i < rmrr->devices_cnt; i++) {
2610                         pdev = rmrr->devices[i];
2611                         /*
2612                          * some BIOSes list non-existent devices in the
2613                          * DMAR table.
2614                          */
2615                         if (!pdev)
2616                                 continue;
2617                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2618                         if (ret)
2619                                 printk(KERN_ERR
2620                                        "IOMMU: mapping reserved region failed\n");
2621                 }
2622         }
2623
2624         iommu_prepare_isa();
2625
2626         /*
2627          * for each drhd
2628          *   enable fault log
2629          *   global invalidate context cache
2630          *   global invalidate iotlb
2631          *   enable translation
2632          */
2633         for_each_drhd_unit(drhd) {
2634                 if (drhd->ignored) {
2635                         /*
2636                          * we always have to disable PMRs or DMA may fail on
2637                          * this device
2638                          */
2639                         if (force_on)
2640                                 iommu_disable_protect_mem_regions(drhd->iommu);
2641                         continue;
2642                 }
2643                 iommu = drhd->iommu;
2644
2645                 iommu_flush_write_buffer(iommu);
2646
2647                 ret = dmar_set_interrupt(iommu);
2648                 if (ret)
2649                         goto error;
2650
2651                 iommu_set_root_entry(iommu);
2652
2653                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2654                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2655
2656                 ret = iommu_enable_translation(iommu);
2657                 if (ret)
2658                         goto error;
2659
2660                 iommu_disable_protect_mem_regions(iommu);
2661         }
2662
2663         return 0;
2664 error:
2665         for_each_drhd_unit(drhd) {
2666                 if (drhd->ignored)
2667                         continue;
2668                 iommu = drhd->iommu;
2669                 free_iommu(iommu);
2670         }
2671         kfree(g_iommus);
2672         return ret;
2673 }
2674
2675 /* This takes a number of _MM_ pages, not VTD pages */
2676 static struct iova *intel_alloc_iova(struct device *dev,
2677                                      struct dmar_domain *domain,
2678                                      unsigned long nrpages, uint64_t dma_mask)
2679 {
2680         struct pci_dev *pdev = to_pci_dev(dev);
2681         struct iova *iova = NULL;
2682
2683         /* Restrict dma_mask to the width that the iommu can handle */
2684         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2685
2686         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2687                 /*
2688                  * First try to allocate an io virtual address below
2689                  * DMA_BIT_MASK(32); if that fails, try allocating
2690                  * from the higher range
2691                  */
2692                 iova = alloc_iova(&domain->iovad, nrpages,
2693                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2694                 if (iova)
2695                         return iova;
2696         }
2697         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2698         if (unlikely(!iova)) {
2699                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2700                        nrpages, pci_name(pdev));
2701                 return NULL;
2702         }
2703
2704         return iova;
2705 }
2706
2707 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2708 {
2709         struct dmar_domain *domain;
2710         int ret;
2711
2712         domain = get_domain_for_dev(pdev,
2713                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2714         if (!domain) {
2715                 printk(KERN_ERR
2716                         "Allocating domain for %s failed", pci_name(pdev));
2717                 return NULL;
2718         }
2719
2720         /* make sure context mapping is ok */
2721         if (unlikely(!domain_context_mapped(pdev))) {
2722                 ret = domain_context_mapping(domain, pdev,
2723                                              CONTEXT_TT_MULTI_LEVEL);
2724                 if (ret) {
2725                         printk(KERN_ERR
2726                                 "Domain context map for %s failed",
2727                                 pci_name(pdev));
2728                         return NULL;
2729                 }
2730         }
2731
2732         return domain;
2733 }
2734
2735 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2736 {
2737         struct device_domain_info *info;
2738
2739         /* No lock here; we assume no domain exits in the normal case */
2740         info = dev->dev.archdata.iommu;
2741         if (likely(info))
2742                 return info->domain;
2743
2744         return __get_valid_domain_for_dev(dev);
2745 }
2746
2747 static int iommu_dummy(struct pci_dev *pdev)
2748 {
2749         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2750 }
2751
2752 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2753 static int iommu_no_mapping(struct device *dev)
2754 {
2755         struct pci_dev *pdev;
2756         int found;
2757
2758         if (unlikely(dev->bus != &pci_bus_type))
2759                 return 1;
2760
2761         pdev = to_pci_dev(dev);
2762         if (iommu_dummy(pdev))
2763                 return 1;
2764
2765         if (!iommu_identity_mapping)
2766                 return 0;
2767
2768         found = identity_mapping(pdev);
2769         if (found) {
2770                 if (iommu_should_identity_map(pdev, 0))
2771                         return 1;
2772                 else {
2773                         /*
2774                          * The 32 bit DMA device is removed from si_domain
2775                          * and falls back to non-identity mapping.
2776                          */
2777                         domain_remove_one_dev_info(si_domain, pdev);
2778                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2779                                pci_name(pdev));
2780                         return 0;
2781                 }
2782         } else {
2783                 /*
2784                  * If a 64 bit DMA device was detached from a VM, the device
2785                  * is put back into si_domain for identity mapping.
2786                  */
2787                 if (iommu_should_identity_map(pdev, 0)) {
2788                         int ret;
2789                         ret = domain_add_dev_info(si_domain, pdev,
2790                                                   hw_pass_through ?
2791                                                   CONTEXT_TT_PASS_THROUGH :
2792                                                   CONTEXT_TT_MULTI_LEVEL);
2793                         if (!ret) {
2794                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2795                                        pci_name(pdev));
2796                                 return 1;
2797                         }
2798                 }
2799         }
2800
2801         return 0;
2802 }
2803
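     /*
      * Core map path for a single physically contiguous buffer: allocate an
      * IOVA range big enough for the buffer, install the PTEs, flush the
      * IOTLB (caching mode) or the write buffer, and return the bus address
      * including the offset within the first page.
      */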
2804 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2805                                      size_t size, int dir, u64 dma_mask)
2806 {
2807         struct pci_dev *pdev = to_pci_dev(hwdev);
2808         struct dmar_domain *domain;
2809         phys_addr_t start_paddr;
2810         struct iova *iova;
2811         int prot = 0;
2812         int ret;
2813         struct intel_iommu *iommu;
2814         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2815
2816         BUG_ON(dir == DMA_NONE);
2817
2818         if (iommu_no_mapping(hwdev))
2819                 return paddr;
2820
2821         domain = get_valid_domain_for_dev(pdev);
2822         if (!domain)
2823                 return 0;
2824
2825         iommu = domain_get_iommu(domain);
2826         size = aligned_nrpages(paddr, size);
2827
2828         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2829         if (!iova)
2830                 goto error;
2831
2832         /*
2833          * Check if DMAR supports zero-length reads on write only
2834          * mappings.
2835          */
2836         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2837                         !cap_zlr(iommu->cap))
2838                 prot |= DMA_PTE_READ;
2839         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2840                 prot |= DMA_PTE_WRITE;
2841         /*
2842          * paddr - (paddr + size) might span a partial page, so we should map
2843          * the whole page.  Note: if two parts of one page are mapped
2844          * separately, we might have two guest_addrs mapping to the same host
2845          * paddr, but this is not a big problem
2846          */
2847         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2848                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2849         if (ret)
2850                 goto error;
2851
2852         /* it's a non-present to present mapping. Only flush if caching mode */
2853         if (cap_caching_mode(iommu->cap))
2854                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2855         else
2856                 iommu_flush_write_buffer(iommu);
2857
2858         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2859         start_paddr += paddr & ~PAGE_MASK;
2860         return start_paddr;
2861
2862 error:
2863         if (iova)
2864                 __free_iova(&domain->iovad, iova);
2865         printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2866                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2867         return 0;
2868 }
2869
2870 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2871                                  unsigned long offset, size_t size,
2872                                  enum dma_data_direction dir,
2873                                  struct dma_attrs *attrs)
2874 {
2875         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2876                                   dir, to_pci_dev(dev)->dma_mask);
2877 }
2878
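     /*
      * Lazy unmap support: intel_unmap_page() queues freed IOVAs per IOMMU
      * via add_unmap(); flush_unmaps() later performs one IOTLB invalidation
      * per IOMMU (or per-IOVA PSI flushes in caching mode) and only then
      * returns the IOVAs to the allocator.
      */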
2879 static void flush_unmaps(void)
2880 {
2881         int i, j;
2882
2883         timer_on = 0;
2884
2885         /* just flush them all */
2886         for (i = 0; i < g_num_of_iommus; i++) {
2887                 struct intel_iommu *iommu = g_iommus[i];
2888                 if (!iommu)
2889                         continue;
2890
2891                 if (!deferred_flush[i].next)
2892                         continue;
2893
2894                 /* In caching mode, global flushes make emulation expensive */
2895                 if (!cap_caching_mode(iommu->cap))
2896                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2897                                          DMA_TLB_GLOBAL_FLUSH);
2898                 for (j = 0; j < deferred_flush[i].next; j++) {
2899                         unsigned long mask;
2900                         struct iova *iova = deferred_flush[i].iova[j];
2901                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2902
2903                         /* On real hardware multiple invalidations are expensive */
2904                         if (cap_caching_mode(iommu->cap))
2905                                 iommu_flush_iotlb_psi(iommu, domain->id,
2906                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2907                         else {
2908                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2909                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2910                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2911                         }
2912                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2913                 }
2914                 deferred_flush[i].next = 0;
2915         }
2916
2917         list_size = 0;
2918 }
2919
2920 static void flush_unmaps_timeout(unsigned long data)
2921 {
2922         unsigned long flags;
2923
2924         spin_lock_irqsave(&async_umap_flush_lock, flags);
2925         flush_unmaps();
2926         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2927 }
2928
2929 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2930 {
2931         unsigned long flags;
2932         int next, iommu_id;
2933         struct intel_iommu *iommu;
2934
2935         spin_lock_irqsave(&async_umap_flush_lock, flags);
2936         if (list_size == HIGH_WATER_MARK)
2937                 flush_unmaps();
2938
2939         iommu = domain_get_iommu(dom);
2940         iommu_id = iommu->seq_id;
2941
2942         next = deferred_flush[iommu_id].next;
2943         deferred_flush[iommu_id].domain[next] = dom;
2944         deferred_flush[iommu_id].iova[next] = iova;
2945         deferred_flush[iommu_id].next++;
2946
2947         if (!timer_on) {
2948                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2949                 timer_on = 1;
2950         }
2951         list_size++;
2952         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2953 }
2954
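     /*
      * Unmap path: look up the IOVA covering dev_addr, clear its PTEs and
      * free the page-table pages, then either flush the IOTLB synchronously
      * (intel_iommu_strict) or defer the flush and the IOVA release to the
      * batched path above.
      */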
2955 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2956                              size_t size, enum dma_data_direction dir,
2957                              struct dma_attrs *attrs)
2958 {
2959         struct pci_dev *pdev = to_pci_dev(dev);
2960         struct dmar_domain *domain;
2961         unsigned long start_pfn, last_pfn;
2962         struct iova *iova;
2963         struct intel_iommu *iommu;
2964
2965         if (iommu_no_mapping(dev))
2966                 return;
2967
2968         domain = find_domain(pdev);
2969         BUG_ON(!domain);
2970
2971         iommu = domain_get_iommu(domain);
2972
2973         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2974         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2975                       (unsigned long long)dev_addr))
2976                 return;
2977
2978         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2979         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2980
2981         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2982                  pci_name(pdev), start_pfn, last_pfn);
2983
2984         /* clear the PTEs covering the whole range */
2985         dma_pte_clear_range(domain, start_pfn, last_pfn);
2986
2987         /* free page tables */
2988         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2989
2990         if (intel_iommu_strict) {
2991                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2992                                       last_pfn - start_pfn + 1, 0);
2993                 /* free iova */
2994                 __free_iova(&domain->iovad, iova);
2995         } else {
2996                 add_unmap(domain, iova);
2997                 /*
2998                  * queue up the release of the unmap; batching saves the
2999                  * ~1/6th of a CPU otherwise spent on per-unmap IOTLB flushes.
3000                  */
3001         }
3002 }
3003
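/*
 * Coherent allocation: when the device is translated by the IOMMU any page
 * can be remapped, so GFP_DMA/GFP_DMA32 are dropped; for identity-mapped
 * (iommu_no_mapping) devices the zone is chosen to satisfy the device's
 * coherent_dma_mask directly.
 */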
3004 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3005                                   dma_addr_t *dma_handle, gfp_t flags,
3006                                   struct dma_attrs *attrs)
3007 {
3008         void *vaddr;
3009         int order;
3010
3011         size = PAGE_ALIGN(size);
3012         order = get_order(size);
3013
3014         if (!iommu_no_mapping(hwdev))
3015                 flags &= ~(GFP_DMA | GFP_DMA32);
3016         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3017                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3018                         flags |= GFP_DMA;
3019                 else
3020                         flags |= GFP_DMA32;
3021         }
3022
3023         vaddr = (void *)__get_free_pages(flags, order);
3024         if (!vaddr)
3025                 return NULL;
3026         memset(vaddr, 0, size);
3027
3028         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3029                                          DMA_BIDIRECTIONAL,
3030                                          hwdev->coherent_dma_mask);
3031         if (*dma_handle)
3032                 return vaddr;
3033         free_pages((unsigned long)vaddr, order);
3034         return NULL;
3035 }
3036
3037 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3038                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3039 {
3040         int order;
3041
3042         size = PAGE_ALIGN(size);
3043         order = get_order(size);
3044
3045         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3046         free_pages((unsigned long)vaddr, order);
3047 }
3048
3049 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3050                            int nelems, enum dma_data_direction dir,
3051                            struct dma_attrs *attrs)
3052 {
3053         struct pci_dev *pdev = to_pci_dev(hwdev);
3054         struct dmar_domain *domain;
3055         unsigned long start_pfn, last_pfn;
3056         struct iova *iova;
3057         struct intel_iommu *iommu;
3058
3059         if (iommu_no_mapping(hwdev))
3060                 return;
3061
3062         domain = find_domain(pdev);
3063         BUG_ON(!domain);
3064
3065         iommu = domain_get_iommu(domain);
3066
3067         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3068         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3069                       (unsigned long long)sglist[0].dma_address))
3070                 return;
3071
3072         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3073         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3074
3075         /* clear the PTEs covering the whole range */
3076         dma_pte_clear_range(domain, start_pfn, last_pfn);
3077
3078         /* free page tables */
3079         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3080
3081         if (intel_iommu_strict) {
3082                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3083                                       last_pfn - start_pfn + 1, 0);
3084                 /* free iova */
3085                 __free_iova(&domain->iovad, iova);
3086         } else {
3087                 add_unmap(domain, iova);
3088                 /*
3089                  * queue up the release of the unmap; batching saves the
3090                  * ~1/6th of a CPU otherwise spent on per-unmap IOTLB flushes.
3091                  */
3092         }
3093 }
3094
3095 static int intel_nontranslate_map_sg(struct device *hwdev,
3096         struct scatterlist *sglist, int nelems, int dir)
3097 {
3098         int i;
3099         struct scatterlist *sg;
3100
3101         for_each_sg(sglist, sg, nelems, i) {
3102                 BUG_ON(!sg_page(sg));
3103                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3104                 sg->dma_length = sg->length;
3105         }
3106         return nelems;
3107 }
3108
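/*
 * Map a scatterlist: a single IOVA range big enough for every segment is
 * allocated up front, domain_sg_mapping() lays the segments out
 * contiguously inside it, and one IOTLB (or write-buffer) flush makes the
 * new mappings visible.
 */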
3109 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3110                         enum dma_data_direction dir, struct dma_attrs *attrs)
3111 {
3112         int i;
3113         struct pci_dev *pdev = to_pci_dev(hwdev);
3114         struct dmar_domain *domain;
3115         size_t size = 0;
3116         int prot = 0;
3117         struct iova *iova = NULL;
3118         int ret;
3119         struct scatterlist *sg;
3120         unsigned long start_vpfn;
3121         struct intel_iommu *iommu;
3122
3123         BUG_ON(dir == DMA_NONE);
3124         if (iommu_no_mapping(hwdev))
3125                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3126
3127         domain = get_valid_domain_for_dev(pdev);
3128         if (!domain)
3129                 return 0;
3130
3131         iommu = domain_get_iommu(domain);
3132
3133         for_each_sg(sglist, sg, nelems, i)
3134                 size += aligned_nrpages(sg->offset, sg->length);
3135
3136         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3137                                 pdev->dma_mask);
3138         if (!iova) {
3139                 sglist->dma_length = 0;
3140                 return 0;
3141         }
3142
3143         /*
3144          * Check if the DMAR unit supports zero-length reads on
3145          * write-only mappings.
3146          */
3147         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3148                         !cap_zlr(iommu->cap))
3149                 prot |= DMA_PTE_READ;
3150         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3151                 prot |= DMA_PTE_WRITE;
3152
3153         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3154
3155         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3156         if (unlikely(ret)) {
3157                 /* clear the PTEs */
3158                 dma_pte_clear_range(domain, start_vpfn,
3159                                     start_vpfn + size - 1);
3160                 /* free page tables */
3161                 dma_pte_free_pagetable(domain, start_vpfn,
3162                                        start_vpfn + size - 1);
3163                 /* free iova */
3164                 __free_iova(&domain->iovad, iova);
3165                 return 0;
3166         }
3167
3168         /* it's a non-present to present mapping; only flush the IOTLB in caching mode */
3169         if (cap_caching_mode(iommu->cap))
3170                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3171         else
3172                 iommu_flush_write_buffer(iommu);
3173
3174         return nelems;
3175 }
3176
3177 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3178 {
3179         return !dma_addr;
3180 }
3181
3182 struct dma_map_ops intel_dma_ops = {
3183         .alloc = intel_alloc_coherent,
3184         .free = intel_free_coherent,
3185         .map_sg = intel_map_sg,
3186         .unmap_sg = intel_unmap_sg,
3187         .map_page = intel_map_page,
3188         .unmap_page = intel_unmap_page,
3189         .mapping_error = intel_mapping_error,
3190 };
3191
3192 static inline int iommu_domain_cache_init(void)
3193 {
3194         int ret = 0;
3195
3196         iommu_domain_cache = kmem_cache_create("iommu_domain",
3197                                          sizeof(struct dmar_domain),
3198                                          0,
3199                                          SLAB_HWCACHE_ALIGN,
3200                                          NULL);
3202         if (!iommu_domain_cache) {
3203                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3204                 ret = -ENOMEM;
3205         }
3206
3207         return ret;
3208 }
3209
3210 static inline int iommu_devinfo_cache_init(void)
3211 {
3212         int ret = 0;
3213
3214         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3215                                          sizeof(struct device_domain_info),
3216                                          0,
3217                                          SLAB_HWCACHE_ALIGN,
3218                                          NULL);
3219         if (!iommu_devinfo_cache) {
3220                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3221                 ret = -ENOMEM;
3222         }
3223
3224         return ret;
3225 }
3226
3227 static inline int iommu_iova_cache_init(void)
3228 {
3229         int ret = 0;
3230
3231         iommu_iova_cache = kmem_cache_create("iommu_iova",
3232                                          sizeof(struct iova),
3233                                          0,
3234                                          SLAB_HWCACHE_ALIGN,
3235                                          NULL);
3236         if (!iommu_iova_cache) {
3237                 printk(KERN_ERR "Couldn't create iova cache\n");
3238                 ret = -ENOMEM;
3239         }
3240
3241         return ret;
3242 }
3243
3244 static int __init iommu_init_mempool(void)
3245 {
3246         int ret;
3247         ret = iommu_iova_cache_init();
3248         if (ret)
3249                 return ret;
3250
3251         ret = iommu_domain_cache_init();
3252         if (ret)
3253                 goto domain_error;
3254
3255         ret = iommu_devinfo_cache_init();
3256         if (!ret)
3257                 return ret;
3258
3259         kmem_cache_destroy(iommu_domain_cache);
3260 domain_error:
3261         kmem_cache_destroy(iommu_iova_cache);
3262
3263         return -ENOMEM;
3264 }
3265
3266 static void __init iommu_exit_mempool(void)
3267 {
3268         kmem_cache_destroy(iommu_devinfo_cache);
3269         kmem_cache_destroy(iommu_domain_cache);
3270         kmem_cache_destroy(iommu_iova_cache);
3271
3272 }
3273
3274 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3275 {
3276         struct dmar_drhd_unit *drhd;
3277         u32 vtbar;
3278         int rc;
3279
3280         /* We know that this device on this chipset has its own IOMMU.
3281          * If we find it under a different IOMMU, then the BIOS is lying
3282          * to us. Hope that the IOMMU for this device is actually
3283          * disabled, and it needs no translation...
3284          */
3285         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3286         if (rc) {
3287                 /* "can't" happen */
3288                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3289                 return;
3290         }
3291         vtbar &= 0xffff0000;
3292
3293         /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3294         drhd = dmar_find_matched_drhd_unit(pdev);
3295         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3296                             TAINT_FIRMWARE_WORKAROUND,
3297                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3298                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3299 }
3300 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3301
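/*
 * Two passes over the DRHD units: units whose device scope contains no PCI
 * devices at all are marked ignored; units that cover *only* graphics
 * devices are either kept (setting intel_iommu_gfx_mapped) or, when
 * dmar_map_gfx is off, bypassed by pointing their devices at the dummy
 * identity domain.
 */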
3302 static void __init init_no_remapping_devices(void)
3303 {
3304         struct dmar_drhd_unit *drhd;
3305
3306         for_each_drhd_unit(drhd) {
3307                 if (!drhd->include_all) {
3308                         int i;
3309                         for (i = 0; i < drhd->devices_cnt; i++)
3310                                 if (drhd->devices[i] != NULL)
3311                                         break;
3312                         /* ignore the DMAR unit if no PCI devices exist */
3313                         if (i == drhd->devices_cnt)
3314                                 drhd->ignored = 1;
3315                 }
3316         }
3317
3318         for_each_drhd_unit(drhd) {
3319                 int i;
3320                 if (drhd->ignored || drhd->include_all)
3321                         continue;
3322
3323                 for (i = 0; i < drhd->devices_cnt; i++)
3324                         if (drhd->devices[i] &&
3325                             !IS_GFX_DEVICE(drhd->devices[i]))
3326                                 break;
3327
3328                 if (i < drhd->devices_cnt)
3329                         continue;
3330
3331                 /* This IOMMU has *only* gfx devices. Either bypass it or
3332                    set the gfx_mapped flag, as appropriate */
3333                 if (dmar_map_gfx) {
3334                         intel_iommu_gfx_mapped = 1;
3335                 } else {
3336                         drhd->ignored = 1;
3337                         for (i = 0; i < drhd->devices_cnt; i++) {
3338                                 if (!drhd->devices[i])
3339                                         continue;
3340                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3341                         }
3342                 }
3343         }
3344 }
3345
3346 #ifdef CONFIG_SUSPEND
3347 static int init_iommu_hw(void)
3348 {
3349         struct dmar_drhd_unit *drhd;
3350         struct intel_iommu *iommu = NULL;
3351
3352         for_each_active_iommu(iommu, drhd)
3353                 if (iommu->qi)
3354                         dmar_reenable_qi(iommu);
3355
3356         for_each_iommu(iommu, drhd) {
3357                 if (drhd->ignored) {
3358                         /*
3359                          * we always have to disable PMRs or DMA may fail on
3360                          * this device
3361                          */
3362                         if (force_on)
3363                                 iommu_disable_protect_mem_regions(iommu);
3364                         continue;
3365                 }
3366
3367                 iommu_flush_write_buffer(iommu);
3368
3369                 iommu_set_root_entry(iommu);
3370
3371                 iommu->flush.flush_context(iommu, 0, 0, 0,
3372                                            DMA_CCMD_GLOBAL_INVL);
3373                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3374                                          DMA_TLB_GLOBAL_FLUSH);
3375                 if (iommu_enable_translation(iommu))
3376                         return 1;
3377                 iommu_disable_protect_mem_regions(iommu);
3378         }
3379
3380         return 0;
3381 }
3382
3383 static void iommu_flush_all(void)
3384 {
3385         struct dmar_drhd_unit *drhd;
3386         struct intel_iommu *iommu;
3387
3388         for_each_active_iommu(iommu, drhd) {
3389                 iommu->flush.flush_context(iommu, 0, 0, 0,
3390                                            DMA_CCMD_GLOBAL_INVL);
3391                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3392                                          DMA_TLB_GLOBAL_FLUSH);
3393         }
3394 }
3395
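/*
 * Suspend: disable translation and save the fault-event registers
 * (FECTL/FEDATA/FEADDR/FEUADDR) of every active IOMMU; iommu_resume()
 * re-enables the hardware through init_iommu_hw() and restores those
 * registers.
 */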
3396 static int iommu_suspend(void)
3397 {
3398         struct dmar_drhd_unit *drhd;
3399         struct intel_iommu *iommu = NULL;
3400         unsigned long flag;
3401
3402         for_each_active_iommu(iommu, drhd) {
3403                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3404                                                  GFP_ATOMIC);
3405                 if (!iommu->iommu_state)
3406                         goto nomem;
3407         }
3408
3409         iommu_flush_all();
3410
3411         for_each_active_iommu(iommu, drhd) {
3412                 iommu_disable_translation(iommu);
3413
3414                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3415
3416                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3417                         readl(iommu->reg + DMAR_FECTL_REG);
3418                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3419                         readl(iommu->reg + DMAR_FEDATA_REG);
3420                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3421                         readl(iommu->reg + DMAR_FEADDR_REG);
3422                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3423                         readl(iommu->reg + DMAR_FEUADDR_REG);
3424
3425                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3426         }
3427         return 0;
3428
3429 nomem:
3430         for_each_active_iommu(iommu, drhd)
3431                 kfree(iommu->iommu_state);
3432
3433         return -ENOMEM;
3434 }
3435
3436 static void iommu_resume(void)
3437 {
3438         struct dmar_drhd_unit *drhd;
3439         struct intel_iommu *iommu = NULL;
3440         unsigned long flag;
3441
3442         if (init_iommu_hw()) {
3443                 if (force_on)
3444                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3445                 else
3446                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3447                 return;
3448         }
3449
3450         for_each_active_iommu(iommu, drhd) {
3451
3452                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3453
3454                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3455                         iommu->reg + DMAR_FECTL_REG);
3456                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3457                         iommu->reg + DMAR_FEDATA_REG);
3458                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3459                         iommu->reg + DMAR_FEADDR_REG);
3460                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3461                         iommu->reg + DMAR_FEUADDR_REG);
3462
3463                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3464         }
3465
3466         for_each_active_iommu(iommu, drhd)
3467                 kfree(iommu->iommu_state);
3468 }
3469
3470 static struct syscore_ops iommu_syscore_ops = {
3471         .resume         = iommu_resume,
3472         .suspend        = iommu_suspend,
3473 };
3474
3475 static void __init init_iommu_pm_ops(void)
3476 {
3477         register_syscore_ops(&iommu_syscore_ops);
3478 }
3479
3480 #else
3481 static inline void init_iommu_pm_ops(void) {}
3482 #endif  /* CONFIG_SUSPEND */
3483
3484 LIST_HEAD(dmar_rmrr_units);
3485
3486 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3487 {
3488         list_add(&rmrr->list, &dmar_rmrr_units);
3489 }
3490
3491
3492 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3493 {
3494         struct acpi_dmar_reserved_memory *rmrr;
3495         struct dmar_rmrr_unit *rmrru;
3496
3497         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3498         if (!rmrru)
3499                 return -ENOMEM;
3500
3501         rmrru->hdr = header;
3502         rmrr = (struct acpi_dmar_reserved_memory *)header;
3503         rmrru->base_address = rmrr->base_address;
3504         rmrru->end_address = rmrr->end_address;
3505
3506         dmar_register_rmrr_unit(rmrru);
3507         return 0;
3508 }
3509
3510 static int __init
3511 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3512 {
3513         struct acpi_dmar_reserved_memory *rmrr;
3514         int ret;
3515
3516         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3517         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3518                 ((void *)rmrr) + rmrr->header.length,
3519                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3520
3521         if (ret || (rmrru->devices_cnt == 0)) {
3522                 list_del(&rmrru->list);
3523                 kfree(rmrru);
3524         }
3525         return ret;
3526 }
3527
3528 static LIST_HEAD(dmar_atsr_units);
3529
3530 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3531 {
3532         struct acpi_dmar_atsr *atsr;
3533         struct dmar_atsr_unit *atsru;
3534
3535         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3536         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3537         if (!atsru)
3538                 return -ENOMEM;
3539
3540         atsru->hdr = hdr;
3541         atsru->include_all = atsr->flags & 0x1;
3542
3543         list_add(&atsru->list, &dmar_atsr_units);
3544
3545         return 0;
3546 }
3547
3548 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3549 {
3550         int rc;
3551         struct acpi_dmar_atsr *atsr;
3552
3553         if (atsru->include_all)
3554                 return 0;
3555
3556         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3557         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3558                                 (void *)atsr + atsr->header.length,
3559                                 &atsru->devices_cnt, &atsru->devices,
3560                                 atsr->segment);
3561         if (rc || !atsru->devices_cnt) {
3562                 list_del(&atsru->list);
3563                 kfree(atsru);
3564         }
3565
3566         return rc;
3567 }
3568
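/*
 * Decide whether ATS may be used for @dev: look up the ATSR unit for the
 * device's PCI segment, then walk up the bus hierarchy; the device
 * qualifies if its root port appears in the ATSR device scope, or if the
 * ATSR's include_all flag is set.
 */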
3569 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3570 {
3571         int i;
3572         struct pci_bus *bus;
3573         struct acpi_dmar_atsr *atsr;
3574         struct dmar_atsr_unit *atsru;
3575
3576         dev = pci_physfn(dev);
3577
3578         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3579                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3580                 if (atsr->segment == pci_domain_nr(dev->bus))
3581                         goto found;
3582         }
3583
3584         return 0;
3585
3586 found:
3587         for (bus = dev->bus; bus; bus = bus->parent) {
3588                 struct pci_dev *bridge = bus->self;
3589
3590                 if (!bridge || !pci_is_pcie(bridge) ||
3591                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3592                         return 0;
3593
3594                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) {
3595                         for (i = 0; i < atsru->devices_cnt; i++)
3596                                 if (atsru->devices[i] == bridge)
3597                                         return 1;
3598                         break;
3599                 }
3600         }
3601
3602         if (atsru->include_all)
3603                 return 1;
3604
3605         return 0;
3606 }
3607
3608 int __init dmar_parse_rmrr_atsr_dev(void)
3609 {
3610         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3611         struct dmar_atsr_unit *atsr, *atsr_n;
3612         int ret = 0;
3613
3614         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3615                 ret = rmrr_parse_dev(rmrr);
3616                 if (ret)
3617                         return ret;
3618         }
3619
3620         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3621                 ret = atsr_parse_dev(atsr);
3622                 if (ret)
3623                         return ret;
3624         }
3625
3626         return ret;
3627 }
3628
3629 /*
3630  * Here we only respond to the device being unbound from its driver.
3631  *
3632  * A newly added device is not attached to its DMAR domain here yet; that
3633  * happens when the device is first mapped to an iova.
3634  */
3635 static int device_notifier(struct notifier_block *nb,
3636                                   unsigned long action, void *data)
3637 {
3638         struct device *dev = data;
3639         struct pci_dev *pdev = to_pci_dev(dev);
3640         struct dmar_domain *domain;
3641
3642         if (iommu_no_mapping(dev))
3643                 return 0;
3644
3645         domain = find_domain(pdev);
3646         if (!domain)
3647                 return 0;
3648
3649         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3650                 domain_remove_one_dev_info(domain, pdev);
3651
3652                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3653                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3654                     list_empty(&domain->devices))
3655                         domain_exit(domain);
3656         }
3657
3658         return 0;
3659 }
3660
3661 static struct notifier_block device_nb = {
3662         .notifier_call = device_notifier,
3663 };
3664
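/*
 * Driver entry point: parse the DMAR tables and device scopes, build the
 * per-IOMMU state via init_dmars(), install intel_dma_ops as the DMA API
 * backend and register with the generic IOMMU layer.  Under a TXT/tboot
 * launch (force_on) any failure here is fatal.
 */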
3665 int __init intel_iommu_init(void)
3666 {
3667         int ret = 0;
3668
3669         /* VT-d is required for a TXT/tboot launch, so enforce that */
3670         force_on = tboot_force_iommu();
3671
3672         if (dmar_table_init()) {
3673                 if (force_on)
3674                         panic("tboot: Failed to initialize DMAR table\n");
3675                 return  -ENODEV;
3676         }
3677
3678         if (dmar_dev_scope_init() < 0) {
3679                 if (force_on)
3680                         panic("tboot: Failed to initialize DMAR device scope\n");
3681                 return  -ENODEV;
3682         }
3683
3684         if (no_iommu || dmar_disabled)
3685                 return -ENODEV;
3686
3687         if (iommu_init_mempool()) {
3688                 if (force_on)
3689                         panic("tboot: Failed to initialize iommu memory\n");
3690                 return  -ENODEV;
3691         }
3692
3693         if (list_empty(&dmar_rmrr_units))
3694                 printk(KERN_INFO "DMAR: No RMRR found\n");
3695
3696         if (list_empty(&dmar_atsr_units))
3697                 printk(KERN_INFO "DMAR: No ATSR found\n");
3698
3699         if (dmar_init_reserved_ranges()) {
3700                 if (force_on)
3701                         panic("tboot: Failed to reserve iommu ranges\n");
3702                 return  -ENODEV;
3703         }
3704
3705         init_no_remapping_devices();
3706
3707         ret = init_dmars();
3708         if (ret) {
3709                 if (force_on)
3710                         panic("tboot: Failed to initialize DMARs\n");
3711                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3712                 put_iova_domain(&reserved_iova_list);
3713                 iommu_exit_mempool();
3714                 return ret;
3715         }
3716         printk(KERN_INFO
3717         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3718
3719         init_timer(&unmap_timer);
3720 #ifdef CONFIG_SWIOTLB
3721         swiotlb = 0;
3722 #endif
3723         dma_ops = &intel_dma_ops;
3724
3725         init_iommu_pm_ops();
3726
3727         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3728
3729         bus_register_notifier(&pci_bus_type, &device_nb);
3730
3731         intel_iommu_enabled = 1;
3732
3733         return 0;
3734 }
3735
3736 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3737                                            struct pci_dev *pdev)
3738 {
3739         struct pci_dev *tmp, *parent;
3740
3741         if (!iommu || !pdev)
3742                 return;
3743
3744         /* dependent device detach */
3745         tmp = pci_find_upstream_pcie_bridge(pdev);
3746         /* Secondary interface's bus number and devfn 0 */
3747         if (tmp) {
3748                 parent = pdev->bus->self;
3749                 while (parent != tmp) {
3750                         iommu_detach_dev(iommu, parent->bus->number,
3751                                          parent->devfn);
3752                         parent = parent->bus->self;
3753                 }
3754                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3755                         iommu_detach_dev(iommu,
3756                                 tmp->subordinate->number, 0);
3757                 else /* this is a legacy PCI bridge */
3758                         iommu_detach_dev(iommu, tmp->bus->number,
3759                                          tmp->devfn);
3760         }
3761 }
3762
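/*
 * Detach one PCI device (and its dependent bridges) from @domain.  If no
 * other device on the same IOMMU remains in the domain, that IOMMU is
 * cleared from the domain's iommu_bmp and, for DMA-API domains, the domain
 * id is released on that IOMMU as well.
 */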
3763 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3764                                           struct pci_dev *pdev)
3765 {
3766         struct device_domain_info *info;
3767         struct intel_iommu *iommu;
3768         unsigned long flags;
3769         int found = 0;
3770         struct list_head *entry, *tmp;
3771
3772         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3773                                 pdev->devfn);
3774         if (!iommu)
3775                 return;
3776
3777         spin_lock_irqsave(&device_domain_lock, flags);
3778         list_for_each_safe(entry, tmp, &domain->devices) {
3779                 info = list_entry(entry, struct device_domain_info, link);
3780                 if (info->segment == pci_domain_nr(pdev->bus) &&
3781                     info->bus == pdev->bus->number &&
3782                     info->devfn == pdev->devfn) {
3783                         unlink_domain_info(info);
3784                         spin_unlock_irqrestore(&device_domain_lock, flags);
3785
3786                         iommu_disable_dev_iotlb(info);
3787                         iommu_detach_dev(iommu, info->bus, info->devfn);
3788                         iommu_detach_dependent_devices(iommu, pdev);
3789                         free_devinfo_mem(info);
3790
3791                         spin_lock_irqsave(&device_domain_lock, flags);
3792
3793                         if (found)
3794                                 break;
3795                         else
3796                                 continue;
3797                 }
3798
3799                 /* if there are no other devices under the same iommu
3800                  * owned by this domain, clear this iommu in iommu_bmp,
3801                  * update the iommu count and coherency
3802                  */
3803                 if (iommu == device_to_iommu(info->segment, info->bus,
3804                                             info->devfn))
3805                         found = 1;
3806         }
3807
3808         spin_unlock_irqrestore(&device_domain_lock, flags);
3809
3810         if (found == 0) {
3811                 unsigned long tmp_flags;
3812                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3813                 clear_bit(iommu->seq_id, domain->iommu_bmp);
3814                 domain->iommu_count--;
3815                 domain_update_iommu_cap(domain);
3816                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3817
3818                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3819                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3820                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3821                         clear_bit(domain->id, iommu->domain_ids);
3822                         iommu->domains[domain->id] = NULL;
3823                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3824                 }
3825         }
3826 }
3827
3828 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3829 {
3830         struct device_domain_info *info;
3831         struct intel_iommu *iommu;
3832         unsigned long flags1, flags2;
3833
3834         spin_lock_irqsave(&device_domain_lock, flags1);
3835         while (!list_empty(&domain->devices)) {
3836                 info = list_entry(domain->devices.next,
3837                         struct device_domain_info, link);
3838                 unlink_domain_info(info);
3839                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3840
3841                 iommu_disable_dev_iotlb(info);
3842                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3843                 iommu_detach_dev(iommu, info->bus, info->devfn);
3844                 iommu_detach_dependent_devices(iommu, info->dev);
3845
3846                 /* clear this iommu in iommu_bmp, update iommu count
3847                  * and capabilities
3848                  */
3849                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3850                 if (test_and_clear_bit(iommu->seq_id,
3851                                        domain->iommu_bmp)) {
3852                         domain->iommu_count--;
3853                         domain_update_iommu_cap(domain);
3854                 }
3855                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3856
3857                 free_devinfo_mem(info);
3858                 spin_lock_irqsave(&device_domain_lock, flags1);
3859         }
3860         spin_unlock_irqrestore(&device_domain_lock, flags1);
3861 }
3862
3863 /* domain id for a virtual machine; it won't be set in any context entry */
3864 static unsigned long vm_domid;
3865
3866 static struct dmar_domain *iommu_alloc_vm_domain(void)
3867 {
3868         struct dmar_domain *domain;
3869
3870         domain = alloc_domain_mem();
3871         if (!domain)
3872                 return NULL;
3873
3874         domain->id = vm_domid++;
3875         domain->nid = -1;
3876         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3877         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3878
3879         return domain;
3880 }
3881
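/*
 * Minimal setup for domains created through the IOMMU API: initialize the
 * IOVA allocator, reserve the special ranges, compute the adjusted guest
 * address width and allocate the top-level page directory; no hardware
 * state is touched until a device is attached.
 */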
3882 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3883 {
3884         int adjust_width;
3885
3886         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3887         spin_lock_init(&domain->iommu_lock);
3888
3889         domain_reserve_special_ranges(domain);
3890
3891         /* calculate AGAW */
3892         domain->gaw = guest_width;
3893         adjust_width = guestwidth_to_adjustwidth(guest_width);
3894         domain->agaw = width_to_agaw(adjust_width);
3895
3896         INIT_LIST_HEAD(&domain->devices);
3897
3898         domain->iommu_count = 0;
3899         domain->iommu_coherency = 0;
3900         domain->iommu_snooping = 0;
3901         domain->iommu_superpage = 0;
3902         domain->max_addr = 0;
3903         domain->nid = -1;
3904
3905         /* always allocate the top pgd */
3906         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3907         if (!domain->pgd)
3908                 return -ENOMEM;
3909         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3910         return 0;
3911 }
3912
3913 static void iommu_free_vm_domain(struct dmar_domain *domain)
3914 {
3915         unsigned long flags;
3916         struct dmar_drhd_unit *drhd;
3917         struct intel_iommu *iommu;
3918         unsigned long i;
3919         unsigned long ndomains;
3920
3921         for_each_drhd_unit(drhd) {
3922                 if (drhd->ignored)
3923                         continue;
3924                 iommu = drhd->iommu;
3925
3926                 ndomains = cap_ndoms(iommu->cap);
3927                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3928                         if (iommu->domains[i] == domain) {
3929                                 spin_lock_irqsave(&iommu->lock, flags);
3930                                 clear_bit(i, iommu->domain_ids);
3931                                 iommu->domains[i] = NULL;
3932                                 spin_unlock_irqrestore(&iommu->lock, flags);
3933                                 break;
3934                         }
3935                 }
3936         }
3937 }
3938
3939 static void vm_domain_exit(struct dmar_domain *domain)
3940 {
3941         /* Domain 0 is reserved, so don't process it */
3942         if (!domain)
3943                 return;
3944
3945         vm_domain_remove_all_dev_info(domain);
3946         /* destroy iovas */
3947         put_iova_domain(&domain->iovad);
3948
3949         /* clear ptes */
3950         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3951
3952         /* free page tables */
3953         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3954
3955         iommu_free_vm_domain(domain);
3956         free_domain_mem(domain);
3957 }
3958
3959 static int intel_iommu_domain_init(struct iommu_domain *domain)
3960 {
3961         struct dmar_domain *dmar_domain;
3962
3963         dmar_domain = iommu_alloc_vm_domain();
3964         if (!dmar_domain) {
3965                 printk(KERN_ERR
3966                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3967                 return -ENOMEM;
3968         }
3969         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3970                 printk(KERN_ERR
3971                         "intel_iommu_domain_init() failed\n");
3972                 vm_domain_exit(dmar_domain);
3973                 return -ENOMEM;
3974         }
3975         domain_update_iommu_cap(dmar_domain);
3976         domain->priv = dmar_domain;
3977
3978         domain->geometry.aperture_start = 0;
3979         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3980         domain->geometry.force_aperture = true;
3981
3982         return 0;
3983 }
3984
3985 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3986 {
3987         struct dmar_domain *dmar_domain = domain->priv;
3988
3989         domain->priv = NULL;
3990         vm_domain_exit(dmar_domain);
3991 }
3992
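/*
 * Attach a device to an IOMMU-API domain: any previous DMA-API mapping of
 * the device is torn down first, the domain's address width is clamped to
 * what the backing IOMMU supports (dropping unused page-table levels if
 * necessary), and the device is then added to the domain.
 */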
3993 static int intel_iommu_attach_device(struct iommu_domain *domain,
3994                                      struct device *dev)
3995 {
3996         struct dmar_domain *dmar_domain = domain->priv;
3997         struct pci_dev *pdev = to_pci_dev(dev);
3998         struct intel_iommu *iommu;
3999         int addr_width;
4000
4001         /* normally pdev is not mapped */
4002         if (unlikely(domain_context_mapped(pdev))) {
4003                 struct dmar_domain *old_domain;
4004
4005                 old_domain = find_domain(pdev);
4006                 if (old_domain) {
4007                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4008                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4009                                 domain_remove_one_dev_info(old_domain, pdev);
4010                         else
4011                                 domain_remove_dev_info(old_domain);
4012                 }
4013         }
4014
4015         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4016                                 pdev->devfn);
4017         if (!iommu)
4018                 return -ENODEV;
4019
4020         /* check if this iommu agaw is sufficient for max mapped address */
4021         addr_width = agaw_to_width(iommu->agaw);
4022         if (addr_width > cap_mgaw(iommu->cap))
4023                 addr_width = cap_mgaw(iommu->cap);
4024
4025         if (dmar_domain->max_addr > (1LL << addr_width)) {
4026                 printk(KERN_ERR "%s: iommu width (%d) is not "
4027                        "sufficient for the mapped address (%llx)\n",
4028                        __func__, addr_width, dmar_domain->max_addr);
4029                 return -EFAULT;
4030         }
4031         dmar_domain->gaw = addr_width;
4032
4033         /*
4034          * Knock out extra levels of page tables if necessary
4035          */
4036         while (iommu->agaw < dmar_domain->agaw) {
4037                 struct dma_pte *pte;
4038
4039                 pte = dmar_domain->pgd;
4040                 if (dma_pte_present(pte)) {
4041                         dmar_domain->pgd = (struct dma_pte *)
4042                                 phys_to_virt(dma_pte_addr(pte));
4043                         free_pgtable_page(pte);
4044                 }
4045                 dmar_domain->agaw--;
4046         }
4047
4048         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4049 }
4050
4051 static void intel_iommu_detach_device(struct iommu_domain *domain,
4052                                       struct device *dev)
4053 {
4054         struct dmar_domain *dmar_domain = domain->priv;
4055         struct pci_dev *pdev = to_pci_dev(dev);
4056
4057         domain_remove_one_dev_info(dmar_domain, pdev);
4058 }
4059
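/*
 * IOMMU API map: translate IOMMU_READ/WRITE/CACHE into DMA_PTE flags, grow
 * the domain's max_addr (bounded by its guest address width), and install
 * the PTEs with domain_pfn_mapping().
 */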
4060 static int intel_iommu_map(struct iommu_domain *domain,
4061                            unsigned long iova, phys_addr_t hpa,
4062                            size_t size, int iommu_prot)
4063 {
4064         struct dmar_domain *dmar_domain = domain->priv;
4065         u64 max_addr;
4066         int prot = 0;
4067         int ret;
4068
4069         if (iommu_prot & IOMMU_READ)
4070                 prot |= DMA_PTE_READ;
4071         if (iommu_prot & IOMMU_WRITE)
4072                 prot |= DMA_PTE_WRITE;
4073         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4074                 prot |= DMA_PTE_SNP;
4075
4076         max_addr = iova + size;
4077         if (dmar_domain->max_addr < max_addr) {
4078                 u64 end;
4079
4080                 /* check if minimum agaw is sufficient for mapped address */
4081                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4082                 if (end < max_addr) {
4083                         printk(KERN_ERR "%s: iommu width (%d) is not "
4084                                "sufficient for the mapped address (%llx)\n",
4085                                __func__, dmar_domain->gaw, max_addr);
4086                         return -EFAULT;
4087                 }
4088                 dmar_domain->max_addr = max_addr;
4089         }
4090         /* Round up size to next multiple of PAGE_SIZE, if it and
4091            the low bits of hpa would take us onto the next page */
4092         size = aligned_nrpages(hpa, size);
4093         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4094                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4095         return ret;
4096 }
4097
4098 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4099                              unsigned long iova, size_t size)
4100 {
4101         struct dmar_domain *dmar_domain = domain->priv;
4102         int order;
4103
4104         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4105                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4106
4107         if (dmar_domain->max_addr == iova + size)
4108                 dmar_domain->max_addr = iova;
4109
4110         return PAGE_SIZE << order;
4111 }
4112
4113 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4114                                             unsigned long iova)
4115 {
4116         struct dmar_domain *dmar_domain = domain->priv;
4117         struct dma_pte *pte;
4118         u64 phys = 0;
4119
4120         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4121         if (pte)
4122                 phys = dma_pte_addr(pte);
4123
4124         return phys;
4125 }
4126
4127 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4128                                       unsigned long cap)
4129 {
4130         struct dmar_domain *dmar_domain = domain->priv;
4131
4132         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4133                 return dmar_domain->iommu_snooping;
4134         if (cap == IOMMU_CAP_INTR_REMAP)
4135                 return irq_remapping_enabled;
4136
4137         return 0;
4138 }
4139
4140 static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
4141 {
4142         pci_dev_put(*from);
4143         *from = to;
4144 }
4145
4146 #define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4147
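/*
 * IOMMU group assignment: walk from the device towards the root bus,
 * looking for the closest point where DMA can no longer be isolated
 * (legacy PCI bridges, quirked DMA sources, multifunction devices or
 * bridge paths lacking the required ACS flags), and join the group of that
 * device.
 */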
4148 static int intel_iommu_add_device(struct device *dev)
4149 {
4150         struct pci_dev *pdev = to_pci_dev(dev);
4151         struct pci_dev *bridge, *dma_pdev = NULL;
4152         struct iommu_group *group;
4153         int ret;
4154
4155         if (!device_to_iommu(pci_domain_nr(pdev->bus),
4156                              pdev->bus->number, pdev->devfn))
4157                 return -ENODEV;
4158
4159         bridge = pci_find_upstream_pcie_bridge(pdev);
4160         if (bridge) {
4161                 if (pci_is_pcie(bridge))
4162                         dma_pdev = pci_get_domain_bus_and_slot(
4163                                                 pci_domain_nr(pdev->bus),
4164                                                 bridge->subordinate->number, 0);
4165                 if (!dma_pdev)
4166                         dma_pdev = pci_dev_get(bridge);
4167         } else
4168                 dma_pdev = pci_dev_get(pdev);
4169
4170         /* Account for quirked devices */
4171         swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4172
4173         /*
4174          * If it's a multifunction device that does not support our
4175          * required ACS flags, add to the same group as function 0.
4176          */
4177         if (dma_pdev->multifunction &&
4178             !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS))
4179                 swap_pci_ref(&dma_pdev,
4180                              pci_get_slot(dma_pdev->bus,
4181                                           PCI_DEVFN(PCI_SLOT(dma_pdev->devfn),
4182                                           0)));
4183
4184         /*
4185          * Devices on the root bus go through the iommu.  If that's not us,
4186          * find the next upstream device and test ACS up to the root bus.
4187          * Finding the next device may require skipping virtual buses.
4188          */
4189         while (!pci_is_root_bus(dma_pdev->bus)) {
4190                 struct pci_bus *bus = dma_pdev->bus;
4191
4192                 while (!bus->self) {
4193                         if (!pci_is_root_bus(bus))
4194                                 bus = bus->parent;
4195                         else
4196                                 goto root_bus;
4197                 }
4198
4199                 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4200                         break;
4201
4202                 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4203         }
4204
4205 root_bus:
4206         group = iommu_group_get(&dma_pdev->dev);
4207         pci_dev_put(dma_pdev);
4208         if (!group) {
4209                 group = iommu_group_alloc();
4210                 if (IS_ERR(group))
4211                         return PTR_ERR(group);
4212         }
4213
4214         ret = iommu_group_add_device(group, dev);
4215
4216         iommu_group_put(group);
4217         return ret;
4218 }
4219
4220 static void intel_iommu_remove_device(struct device *dev)
4221 {
4222         iommu_group_remove_device(dev);
4223 }
4224
4225 static struct iommu_ops intel_iommu_ops = {
4226         .domain_init    = intel_iommu_domain_init,
4227         .domain_destroy = intel_iommu_domain_destroy,
4228         .attach_dev     = intel_iommu_attach_device,
4229         .detach_dev     = intel_iommu_detach_device,
4230         .map            = intel_iommu_map,
4231         .unmap          = intel_iommu_unmap,
4232         .iova_to_phys   = intel_iommu_iova_to_phys,
4233         .domain_has_cap = intel_iommu_domain_has_cap,
4234         .add_device     = intel_iommu_add_device,
4235         .remove_device  = intel_iommu_remove_device,
4236         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4237 };
4238
4239 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4240 {
4241         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4242         printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4243         dmar_map_gfx = 0;
4244 }
4245
4246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4248 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4249 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4250 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4251 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4252 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4253
4254 static void quirk_iommu_rwbf(struct pci_dev *dev)
4255 {
4256         /*
4257          * Mobile 4 Series Chipset neglects to set RWBF capability,
4258          * but needs it. Same seems to hold for the desktop versions.
4259          */
4260         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4261         rwbf_quirk = 1;
4262 }
4263
4264 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4265 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4266 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4267 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4268 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4269 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4270 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4271
4272 #define GGC 0x52
4273 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4274 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4275 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4276 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4277 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4278 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4279 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4280 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4281
4282 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4283 {
4284         unsigned short ggc;
4285
4286         if (pci_read_config_word(dev, GGC, &ggc))
4287                 return;
4288
4289         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4290                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4291                 dmar_map_gfx = 0;
4292         } else if (dmar_map_gfx) {
4293                 /* we have to ensure the gfx device is idle before we flush */
4294                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4295                 intel_iommu_strict = 1;
4296         }
4297 }
4298 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4299 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4300 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4301 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4302
4303 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4304    ISOCH DMAR unit for the Azalia sound device, but not give it any
4305    TLB entries, which causes it to deadlock. Check for that.  We do
4306    this in a function called from init_dmars(), instead of in a PCI
4307    quirk, because we don't want to print the obnoxious "BIOS broken"
4308    message if VT-d is actually disabled.
4309 */
4310 static void __init check_tylersburg_isoch(void)
4311 {
4312         struct pci_dev *pdev;
4313         uint32_t vtisochctrl;
4314
4315         /* If there's no Azalia in the system anyway, forget it. */
4316         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4317         if (!pdev)
4318                 return;
4319         pci_dev_put(pdev);
4320
4321         /* System Management Registers. Might be hidden, in which case
4322            we can't do the sanity check. But that's OK, because the
4323            known-broken BIOSes _don't_ actually hide it, so far. */
4324         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4325         if (!pdev)
4326                 return;
4327
4328         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4329                 pci_dev_put(pdev);
4330                 return;
4331         }
4332
4333         pci_dev_put(pdev);
4334
4335         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4336         if (vtisochctrl & 1)
4337                 return;
4338
4339         /* Drop all bits other than the number of TLB entries */
4340         vtisochctrl &= 0x1c;
4341
4342         /* If we have the recommended number of TLB entries (16), fine. */
4343         if (vtisochctrl == 0x10)
4344                 return;
4345
4346         /* Zero TLB entries? You get to ride the short bus to school. */
4347         if (!vtisochctrl) {
4348                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4349                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4350                      dmi_get_system_info(DMI_BIOS_VENDOR),
4351                      dmi_get_system_info(DMI_BIOS_VERSION),
4352                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4353                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4354                 return;
4355         }
4356
4357         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4358                vtisochctrl);
4359 }