intel-iommu: Avoid global flushes with caching mode.
[platform/adaptation/renesas_rcar/renesas_kernel.git] drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
45
46 #define ROOT_SIZE               VTD_PAGE_SIZE
47 #define CONTEXT_SIZE            VTD_PAGE_SIZE
48
49 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
50 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
51 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
52
53 #define IOAPIC_RANGE_START      (0xfee00000)
54 #define IOAPIC_RANGE_END        (0xfeefffff)
55 #define IOVA_START_ADDR         (0x1000)
56
57 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
58
59 #define MAX_AGAW_WIDTH 64
60
61 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
62 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
63
64 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
65    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
66 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
67                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
68 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
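/*
 * Worked example (illustration only): with gaw == 48 and VTD_PAGE_SHIFT == 12,
 * __DOMAIN_MAX_PFN(48) is (1ULL << 36) - 1.  On a 64-bit kernel
 * DOMAIN_MAX_PFN(48) keeps that value; on a 32-bit kernel it is clamped to
 * 0xffffffff so that PFNs still fit in an unsigned long.
 */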
69
70 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
71 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
72 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
73
74
75 /* VT-d pages must never be _larger_ than MM pages. Otherwise these
76    pfn conversions are never going to work. */
77 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
78 {
79         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
80 }
81
82 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
83 {
84         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
85 }
86 static inline unsigned long page_to_dma_pfn(struct page *pg)
87 {
88         return mm_to_dma_pfn(page_to_pfn(pg));
89 }
90 static inline unsigned long virt_to_dma_pfn(void *p)
91 {
92         return page_to_dma_pfn(virt_to_page(p));
93 }
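/*
 * On x86, PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so the conversions above are
 * the identity.  On a hypothetical configuration with 64KiB MM pages the
 * shift would be 4, i.e. one MM pfn would correspond to 16 DMA (VT-d) pfns.
 */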
94
95 /* global iommu list, set NULL for ignored DMAR units */
96 static struct intel_iommu **g_iommus;
97
98 static void __init check_tylersburg_isoch(void);
99 static int rwbf_quirk;
100
101 /*
102  * 0: Present
103  * 1-11: Reserved
104  * 12-63: Context Ptr (12 - (haw-1))
105  * 64-127: Reserved
106  */
107 struct root_entry {
108         u64     val;
109         u64     rsvd1;
110 };
111 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
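/*
 * With 4KiB pages and 16-byte root entries, ROOT_ENTRY_NR is 256: one root
 * entry per PCI bus number.  Each present root entry points to a context
 * table holding one 16-byte context entry per devfn on that bus.
 */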
112 static inline bool root_present(struct root_entry *root)
113 {
114         return (root->val & 1);
115 }
116 static inline void set_root_present(struct root_entry *root)
117 {
118         root->val |= 1;
119 }
120 static inline void set_root_value(struct root_entry *root, unsigned long value)
121 {
122         root->val |= value & VTD_PAGE_MASK;
123 }
124
125 static inline struct context_entry *
126 get_context_addr_from_root(struct root_entry *root)
127 {
128         return (struct context_entry *)
129                 (root_present(root)?phys_to_virt(
130                 root->val & VTD_PAGE_MASK) :
131                 NULL);
132 }
133
134 /*
135  * low 64 bits:
136  * 0: present
137  * 1: fault processing disable
138  * 2-3: translation type
139  * 12-63: address space root
140  * high 64 bits:
141  * 0-2: address width
142  * 3-6: avail
143  * 8-23: domain id
144  */
145 struct context_entry {
146         u64 lo;
147         u64 hi;
148 };
149
150 static inline bool context_present(struct context_entry *context)
151 {
152         return (context->lo & 1);
153 }
154 static inline void context_set_present(struct context_entry *context)
155 {
156         context->lo |= 1;
157 }
158
159 static inline void context_set_fault_enable(struct context_entry *context)
160 {
161         context->lo &= (((u64)-1) << 2) | 1;
162 }
163
164 static inline void context_set_translation_type(struct context_entry *context,
165                                                 unsigned long value)
166 {
167         context->lo &= (((u64)-1) << 4) | 3;
168         context->lo |= (value & 3) << 2;
169 }
170
171 static inline void context_set_address_root(struct context_entry *context,
172                                             unsigned long value)
173 {
174         context->lo |= value & VTD_PAGE_MASK;
175 }
176
177 static inline void context_set_address_width(struct context_entry *context,
178                                              unsigned long value)
179 {
180         context->hi |= value & 7;
181 }
182
183 static inline void context_set_domain_id(struct context_entry *context,
184                                          unsigned long value)
185 {
186         context->hi |= (value & ((1 << 16) - 1)) << 8;
187 }
188
189 static inline void context_clear_entry(struct context_entry *context)
190 {
191         context->lo = 0;
192         context->hi = 0;
193 }
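/*
 * Illustrative example (not a real entry used anywhere): a present,
 * multi-level entry for domain 42 using a 4-level (agaw 2) page table at
 * physical address P would end up, via the helpers above, with
 * lo == (P & VTD_PAGE_MASK) | 1 and hi == 2 | (42 << 8).
 */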
194
195 /*
196  * 0: readable
197  * 1: writable
198  * 2-6: reserved
199  * 7: super page
200  * 8-10: available
201  * 11: snoop behavior
202  * 12-63: Host physical address
203  */
204 struct dma_pte {
205         u64 val;
206 };
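/*
 * Illustrative example: a leaf PTE mapping host pfn 0x12345 for DMA read and
 * write would hold ((u64)0x12345 << VTD_PAGE_SHIFT) | DMA_PTE_READ |
 * DMA_PTE_WRITE, composed bit by bit with the helpers below.
 */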
207
208 static inline void dma_clear_pte(struct dma_pte *pte)
209 {
210         pte->val = 0;
211 }
212
213 static inline void dma_set_pte_readable(struct dma_pte *pte)
214 {
215         pte->val |= DMA_PTE_READ;
216 }
217
218 static inline void dma_set_pte_writable(struct dma_pte *pte)
219 {
220         pte->val |= DMA_PTE_WRITE;
221 }
222
223 static inline void dma_set_pte_snp(struct dma_pte *pte)
224 {
225         pte->val |= DMA_PTE_SNP;
226 }
227
228 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
229 {
230         pte->val = (pte->val & ~3) | (prot & 3);
231 }
232
233 static inline u64 dma_pte_addr(struct dma_pte *pte)
234 {
235 #ifdef CONFIG_64BIT
236         return pte->val & VTD_PAGE_MASK;
237 #else
238         /* Must have a full atomic 64-bit read */
239         return  __cmpxchg64(pte, 0ULL, 0ULL) & VTD_PAGE_MASK;
240 #endif
241 }
242
243 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
244 {
245         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
246 }
247
248 static inline bool dma_pte_present(struct dma_pte *pte)
249 {
250         return (pte->val & 3) != 0;
251 }
252
253 static inline int first_pte_in_page(struct dma_pte *pte)
254 {
255         return !((unsigned long)pte & ~VTD_PAGE_MASK);
256 }
257
258 /*
259  * This domain is a static identity-mapping domain.
260  *      1. This domain creates a static 1:1 mapping to all usable memory.
261  *      2. It maps to each iommu if successful.
262  *      3. Each iommu maps to this domain if successful.
263  */
264 static struct dmar_domain *si_domain;
265 static int hw_pass_through = 1;
266
267 /* devices under the same p2p bridge are owned in one domain */
268 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
269
270 /* The domain represents a virtual machine; more than one device
271  * across iommus may be owned by one domain, e.g. a kvm guest.
272  */
273 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
274
275 /* si_domain contains multiple devices */
276 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
277
278 struct dmar_domain {
279         int     id;                     /* domain id */
280         int     nid;                    /* node id */
281         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
282
283         struct list_head devices;       /* all devices' list */
284         struct iova_domain iovad;       /* iova's that belong to this domain */
285
286         struct dma_pte  *pgd;           /* virtual address */
287         int             gaw;            /* max guest address width */
288
289         /* adjusted guest address width, 0 is level 2 30-bit */
290         int             agaw;
291
292         int             flags;          /* flags to find out type of domain */
293
294         int             iommu_coherency;/* indicate coherency of iommu access */
295         int             iommu_snooping; /* indicate snooping control feature*/
296         int             iommu_count;    /* reference count of iommu */
297         spinlock_t      iommu_lock;     /* protect iommu set in domain */
298         u64             max_addr;       /* maximum mapped address */
299 };
300
301 /* PCI domain-device relationship */
302 struct device_domain_info {
303         struct list_head link;  /* link to domain siblings */
304         struct list_head global; /* link to global list */
305         int segment;            /* PCI domain */
306         u8 bus;                 /* PCI bus number */
307         u8 devfn;               /* PCI devfn number */
308         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
309         struct intel_iommu *iommu; /* IOMMU used by this device */
310         struct dmar_domain *domain; /* pointer to domain */
311 };
312
313 static void flush_unmaps_timeout(unsigned long data);
314
315 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
316
317 #define HIGH_WATER_MARK 250
318 struct deferred_flush_tables {
319         int next;
320         struct iova *iova[HIGH_WATER_MARK];
321         struct dmar_domain *domain[HIGH_WATER_MARK];
322 };
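/*
 * These tables batch up IOVAs whose mappings have been torn down, so that
 * their IOTLB invalidations can be deferred and issued in bulk from the
 * timer above instead of one at a time.  Booting with intel_iommu=strict
 * (see intel_iommu_setup() below) disables this batching.
 */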
323
324 static struct deferred_flush_tables *deferred_flush;
325
326 /* bitmap for indexing intel_iommus */
327 static int g_num_of_iommus;
328
329 static DEFINE_SPINLOCK(async_umap_flush_lock);
330 static LIST_HEAD(unmaps_to_do);
331
332 static int timer_on;
333 static long list_size;
334
335 static void domain_remove_dev_info(struct dmar_domain *domain);
336
337 #ifdef CONFIG_DMAR_DEFAULT_ON
338 int dmar_disabled = 0;
339 #else
340 int dmar_disabled = 1;
341 #endif /*CONFIG_DMAR_DEFAULT_ON*/
342
343 static int __initdata dmar_map_gfx = 1;
344 static int dmar_forcedac;
345 static int intel_iommu_strict;
346
347 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
348 static DEFINE_SPINLOCK(device_domain_lock);
349 static LIST_HEAD(device_domain_list);
350
351 static struct iommu_ops intel_iommu_ops;
352
353 static int __init intel_iommu_setup(char *str)
354 {
355         if (!str)
356                 return -EINVAL;
357         while (*str) {
358                 if (!strncmp(str, "on", 2)) {
359                         dmar_disabled = 0;
360                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
361                 } else if (!strncmp(str, "off", 3)) {
362                         dmar_disabled = 1;
363                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
364                 } else if (!strncmp(str, "igfx_off", 8)) {
365                         dmar_map_gfx = 0;
366                         printk(KERN_INFO
367                                 "Intel-IOMMU: disable GFX device mapping\n");
368                 } else if (!strncmp(str, "forcedac", 8)) {
369                         printk(KERN_INFO
370                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
371                         dmar_forcedac = 1;
372                 } else if (!strncmp(str, "strict", 6)) {
373                         printk(KERN_INFO
374                                 "Intel-IOMMU: disable batched IOTLB flush\n");
375                         intel_iommu_strict = 1;
376                 }
377
378                 str += strcspn(str, ",");
379                 while (*str == ',')
380                         str++;
381         }
382         return 0;
383 }
384 __setup("intel_iommu=", intel_iommu_setup);
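/*
 * Example kernel command lines accepted by the parser above (options may be
 * combined with commas, e.g. "intel_iommu=on,strict"):
 *   intel_iommu=on         enable DMA remapping
 *   intel_iommu=off        disable DMA remapping
 *   intel_iommu=igfx_off   leave the graphics device untranslated
 *   intel_iommu=forcedac   force DAC (64-bit) addressing for PCI devices
 *   intel_iommu=strict     disable batched IOTLB flushing
 */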
385
386 static struct kmem_cache *iommu_domain_cache;
387 static struct kmem_cache *iommu_devinfo_cache;
388 static struct kmem_cache *iommu_iova_cache;
389
390 static inline void *alloc_pgtable_page(int node)
391 {
392         struct page *page;
393         void *vaddr = NULL;
394
395         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
396         if (page)
397                 vaddr = page_address(page);
398         return vaddr;
399 }
400
401 static inline void free_pgtable_page(void *vaddr)
402 {
403         free_page((unsigned long)vaddr);
404 }
405
406 static inline void *alloc_domain_mem(void)
407 {
408         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
409 }
410
411 static void free_domain_mem(void *vaddr)
412 {
413         kmem_cache_free(iommu_domain_cache, vaddr);
414 }
415
416 static inline void * alloc_devinfo_mem(void)
417 {
418         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
419 }
420
421 static inline void free_devinfo_mem(void *vaddr)
422 {
423         kmem_cache_free(iommu_devinfo_cache, vaddr);
424 }
425
426 struct iova *alloc_iova_mem(void)
427 {
428         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
429 }
430
431 void free_iova_mem(struct iova *iova)
432 {
433         kmem_cache_free(iommu_iova_cache, iova);
434 }
435
436
437 static inline int width_to_agaw(int width);
438
439 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
440 {
441         unsigned long sagaw;
442         int agaw = -1;
443
444         sagaw = cap_sagaw(iommu->cap);
445         for (agaw = width_to_agaw(max_gaw);
446              agaw >= 0; agaw--) {
447                 if (test_bit(agaw, &sagaw))
448                         break;
449         }
450
451         return agaw;
452 }
453
454 /*
455  * Calculate max SAGAW for each iommu.
456  */
457 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
458 {
459         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
460 }
461
462 /*
463  * Calculate the agaw for each iommu.
464  * "SAGAW" may differ across iommus, so use a default agaw and fall back
465  * to a smaller supported agaw for iommus that don't support the default.
466  */
467 int iommu_calculate_agaw(struct intel_iommu *iommu)
468 {
469         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
470 }
471
472 /* This function only returns a single iommu in a domain */
473 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
474 {
475         int iommu_id;
476
477         /* si_domain and vm domain should not get here. */
478         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
479         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
480
481         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
482         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
483                 return NULL;
484
485         return g_iommus[iommu_id];
486 }
487
488 static void domain_update_iommu_coherency(struct dmar_domain *domain)
489 {
490         int i;
491
492         domain->iommu_coherency = 1;
493
494         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
495                 if (!ecap_coherent(g_iommus[i]->ecap)) {
496                         domain->iommu_coherency = 0;
497                         break;
498                 }
499         }
500 }
501
502 static void domain_update_iommu_snooping(struct dmar_domain *domain)
503 {
504         int i;
505
506         domain->iommu_snooping = 1;
507
508         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
509                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
510                         domain->iommu_snooping = 0;
511                         break;
512                 }
513         }
514 }
515
516 /* Some capabilities may be different across iommus */
517 static void domain_update_iommu_cap(struct dmar_domain *domain)
518 {
519         domain_update_iommu_coherency(domain);
520         domain_update_iommu_snooping(domain);
521 }
522
523 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
524 {
525         struct dmar_drhd_unit *drhd = NULL;
526         int i;
527
528         for_each_drhd_unit(drhd) {
529                 if (drhd->ignored)
530                         continue;
531                 if (segment != drhd->segment)
532                         continue;
533
534                 for (i = 0; i < drhd->devices_cnt; i++) {
535                         if (drhd->devices[i] &&
536                             drhd->devices[i]->bus->number == bus &&
537                             drhd->devices[i]->devfn == devfn)
538                                 return drhd->iommu;
539                         if (drhd->devices[i] &&
540                             drhd->devices[i]->subordinate &&
541                             drhd->devices[i]->subordinate->number <= bus &&
542                             drhd->devices[i]->subordinate->subordinate >= bus)
543                                 return drhd->iommu;
544                 }
545
546                 if (drhd->include_all)
547                         return drhd->iommu;
548         }
549
550         return NULL;
551 }
552
553 static void domain_flush_cache(struct dmar_domain *domain,
554                                void *addr, int size)
555 {
556         if (!domain->iommu_coherency)
557                 clflush_cache_range(addr, size);
558 }
559
560 /* Gets context entry for a given bus and devfn */
561 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
562                 u8 bus, u8 devfn)
563 {
564         struct root_entry *root;
565         struct context_entry *context;
566         unsigned long phy_addr;
567         unsigned long flags;
568
569         spin_lock_irqsave(&iommu->lock, flags);
570         root = &iommu->root_entry[bus];
571         context = get_context_addr_from_root(root);
572         if (!context) {
573                 context = (struct context_entry *)
574                                 alloc_pgtable_page(iommu->node);
575                 if (!context) {
576                         spin_unlock_irqrestore(&iommu->lock, flags);
577                         return NULL;
578                 }
579                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
580                 phy_addr = virt_to_phys((void *)context);
581                 set_root_value(root, phy_addr);
582                 set_root_present(root);
583                 __iommu_flush_cache(iommu, root, sizeof(*root));
584         }
585         spin_unlock_irqrestore(&iommu->lock, flags);
586         return &context[devfn];
587 }
588
589 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
590 {
591         struct root_entry *root;
592         struct context_entry *context;
593         int ret;
594         unsigned long flags;
595
596         spin_lock_irqsave(&iommu->lock, flags);
597         root = &iommu->root_entry[bus];
598         context = get_context_addr_from_root(root);
599         if (!context) {
600                 ret = 0;
601                 goto out;
602         }
603         ret = context_present(&context[devfn]);
604 out:
605         spin_unlock_irqrestore(&iommu->lock, flags);
606         return ret;
607 }
608
609 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
610 {
611         struct root_entry *root;
612         struct context_entry *context;
613         unsigned long flags;
614
615         spin_lock_irqsave(&iommu->lock, flags);
616         root = &iommu->root_entry[bus];
617         context = get_context_addr_from_root(root);
618         if (context) {
619                 context_clear_entry(&context[devfn]);
620                 __iommu_flush_cache(iommu, &context[devfn], \
621                         sizeof(*context));
622         }
623         spin_unlock_irqrestore(&iommu->lock, flags);
624 }
625
626 static void free_context_table(struct intel_iommu *iommu)
627 {
628         struct root_entry *root;
629         int i;
630         unsigned long flags;
631         struct context_entry *context;
632
633         spin_lock_irqsave(&iommu->lock, flags);
634         if (!iommu->root_entry) {
635                 goto out;
636         }
637         for (i = 0; i < ROOT_ENTRY_NR; i++) {
638                 root = &iommu->root_entry[i];
639                 context = get_context_addr_from_root(root);
640                 if (context)
641                         free_pgtable_page(context);
642         }
643         free_pgtable_page(iommu->root_entry);
644         iommu->root_entry = NULL;
645 out:
646         spin_unlock_irqrestore(&iommu->lock, flags);
647 }
648
649 /* page table handling */
650 #define LEVEL_STRIDE            (9)
651 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
652
653 static inline int agaw_to_level(int agaw)
654 {
655         return agaw + 2;
656 }
657
658 static inline int agaw_to_width(int agaw)
659 {
660         return 30 + agaw * LEVEL_STRIDE;
661
662 }
663
664 static inline int width_to_agaw(int width)
665 {
666         return (width - 30) / LEVEL_STRIDE;
667 }
668
669 static inline unsigned int level_to_offset_bits(int level)
670 {
671         return (level - 1) * LEVEL_STRIDE;
672 }
673
674 static inline int pfn_level_offset(unsigned long pfn, int level)
675 {
676         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
677 }
678
679 static inline unsigned long level_mask(int level)
680 {
681         return -1UL << level_to_offset_bits(level);
682 }
683
684 static inline unsigned long level_size(int level)
685 {
686         return 1UL << level_to_offset_bits(level);
687 }
688
689 static inline unsigned long align_to_level(unsigned long pfn, int level)
690 {
691         return (pfn + level_size(level) - 1) & level_mask(level);
692 }
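/*
 * Worked example: agaw == 2 means a 4-level page table (agaw_to_level() == 4)
 * covering a 48-bit address width (agaw_to_width() == 48).  For a DMA pfn,
 * level 1 indexes bits 0-8, level 2 bits 9-17, level 3 bits 18-26 and
 * level 4 bits 27-35.
 */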
693
694 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
695                                       unsigned long pfn)
696 {
697         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
698         struct dma_pte *parent, *pte = NULL;
699         int level = agaw_to_level(domain->agaw);
700         int offset;
701
702         BUG_ON(!domain->pgd);
703         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
704         parent = domain->pgd;
705
706         while (level > 0) {
707                 void *tmp_page;
708
709                 offset = pfn_level_offset(pfn, level);
710                 pte = &parent[offset];
711                 if (level == 1)
712                         break;
713
714                 if (!dma_pte_present(pte)) {
715                         uint64_t pteval;
716
717                         tmp_page = alloc_pgtable_page(domain->nid);
718
719                         if (!tmp_page)
720                                 return NULL;
721
722                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
723                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
724                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
725                                 /* Someone else set it while we were thinking; use theirs. */
726                                 free_pgtable_page(tmp_page);
727                         } else {
728                                 dma_pte_addr(pte);
729                                 domain_flush_cache(domain, pte, sizeof(*pte));
730                         }
731                 }
732                 parent = phys_to_virt(dma_pte_addr(pte));
733                 level--;
734         }
735
736         return pte;
737 }
738
739 /* return the address's pte at a specific level */
740 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
741                                          unsigned long pfn,
742                                          int level)
743 {
744         struct dma_pte *parent, *pte = NULL;
745         int total = agaw_to_level(domain->agaw);
746         int offset;
747
748         parent = domain->pgd;
749         while (level <= total) {
750                 offset = pfn_level_offset(pfn, total);
751                 pte = &parent[offset];
752                 if (level == total)
753                         return pte;
754
755                 if (!dma_pte_present(pte))
756                         break;
757                 parent = phys_to_virt(dma_pte_addr(pte));
758                 total--;
759         }
760         return NULL;
761 }
762
763 /* clear last level ptes; a tlb flush should follow */
764 static void dma_pte_clear_range(struct dmar_domain *domain,
765                                 unsigned long start_pfn,
766                                 unsigned long last_pfn)
767 {
768         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
769         struct dma_pte *first_pte, *pte;
770
771         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
772         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
773         BUG_ON(start_pfn > last_pfn);
774
775         /* we don't need lock here; nobody else touches the iova range */
776         do {
777                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
778                 if (!pte) {
779                         start_pfn = align_to_level(start_pfn + 1, 2);
780                         continue;
781                 }
782                 do {
783                         dma_clear_pte(pte);
784                         start_pfn++;
785                         pte++;
786                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
787
788                 domain_flush_cache(domain, first_pte,
789                                    (void *)pte - (void *)first_pte);
790
791         } while (start_pfn && start_pfn <= last_pfn);
792 }
793
794 /* free page table pages; last level ptes should already be cleared */
795 static void dma_pte_free_pagetable(struct dmar_domain *domain,
796                                    unsigned long start_pfn,
797                                    unsigned long last_pfn)
798 {
799         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
800         struct dma_pte *first_pte, *pte;
801         int total = agaw_to_level(domain->agaw);
802         int level;
803         unsigned long tmp;
804
805         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
806         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
807         BUG_ON(start_pfn > last_pfn);
808
809         /* We don't need lock here; nobody else touches the iova range */
810         level = 2;
811         while (level <= total) {
812                 tmp = align_to_level(start_pfn, level);
813
814                 /* If we can't even clear one PTE at this level, we're done */
815                 if (tmp + level_size(level) - 1 > last_pfn)
816                         return;
817
818                 do {
819                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
820                         if (!pte) {
821                                 tmp = align_to_level(tmp + 1, level + 1);
822                                 continue;
823                         }
824                         do {
825                                 if (dma_pte_present(pte)) {
826                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
827                                         dma_clear_pte(pte);
828                                 }
829                                 pte++;
830                                 tmp += level_size(level);
831                         } while (!first_pte_in_page(pte) &&
832                                  tmp + level_size(level) - 1 <= last_pfn);
833
834                         domain_flush_cache(domain, first_pte,
835                                            (void *)pte - (void *)first_pte);
836
837                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
838                 level++;
839         }
840         /* free pgd */
841         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
842                 free_pgtable_page(domain->pgd);
843                 domain->pgd = NULL;
844         }
845 }
846
847 /* iommu handling */
848 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
849 {
850         struct root_entry *root;
851         unsigned long flags;
852
853         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
854         if (!root)
855                 return -ENOMEM;
856
857         __iommu_flush_cache(iommu, root, ROOT_SIZE);
858
859         spin_lock_irqsave(&iommu->lock, flags);
860         iommu->root_entry = root;
861         spin_unlock_irqrestore(&iommu->lock, flags);
862
863         return 0;
864 }
865
866 static void iommu_set_root_entry(struct intel_iommu *iommu)
867 {
868         void *addr;
869         u32 sts;
870         unsigned long flag;
871
872         addr = iommu->root_entry;
873
874         spin_lock_irqsave(&iommu->register_lock, flag);
875         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
876
877         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
878
879         /* Make sure hardware completes it */
880         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
881                       readl, (sts & DMA_GSTS_RTPS), sts);
882
883         spin_unlock_irqrestore(&iommu->register_lock, flag);
884 }
885
886 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
887 {
888         u32 val;
889         unsigned long flag;
890
891         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
892                 return;
893
894         spin_lock_irqsave(&iommu->register_lock, flag);
895         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
896
897         /* Make sure hardware completes it */
898         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
899                       readl, (!(val & DMA_GSTS_WBFS)), val);
900
901         spin_unlock_irqrestore(&iommu->register_lock, flag);
902 }
903
904 /* return value determines whether we need a write buffer flush */
905 static void __iommu_flush_context(struct intel_iommu *iommu,
906                                   u16 did, u16 source_id, u8 function_mask,
907                                   u64 type)
908 {
909         u64 val = 0;
910         unsigned long flag;
911
912         switch (type) {
913         case DMA_CCMD_GLOBAL_INVL:
914                 val = DMA_CCMD_GLOBAL_INVL;
915                 break;
916         case DMA_CCMD_DOMAIN_INVL:
917                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
918                 break;
919         case DMA_CCMD_DEVICE_INVL:
920                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
921                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
922                 break;
923         default:
924                 BUG();
925         }
926         val |= DMA_CCMD_ICC;
927
928         spin_lock_irqsave(&iommu->register_lock, flag);
929         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
930
931         /* Make sure hardware completes it */
932         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
933                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
934
935         spin_unlock_irqrestore(&iommu->register_lock, flag);
936 }
937
938 /* return value determines whether we need a write buffer flush */
939 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
940                                 u64 addr, unsigned int size_order, u64 type)
941 {
942         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
943         u64 val = 0, val_iva = 0;
944         unsigned long flag;
945
946         switch (type) {
947         case DMA_TLB_GLOBAL_FLUSH:
948                 /* global flush doesn't need to set IVA_REG */
949                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
950                 break;
951         case DMA_TLB_DSI_FLUSH:
952                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
953                 break;
954         case DMA_TLB_PSI_FLUSH:
955                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
956                 /* Note: always flush non-leaf currently */
957                 val_iva = size_order | addr;
958                 break;
959         default:
960                 BUG();
961         }
962         /* Note: set drain read/write */
963 #if 0
964         /*
965          * This is probably just to be extra safe. Looks like we can
966          * ignore it without any impact.
967          */
968         if (cap_read_drain(iommu->cap))
969                 val |= DMA_TLB_READ_DRAIN;
970 #endif
971         if (cap_write_drain(iommu->cap))
972                 val |= DMA_TLB_WRITE_DRAIN;
973
974         spin_lock_irqsave(&iommu->register_lock, flag);
975         /* Note: Only uses first TLB reg currently */
976         if (val_iva)
977                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
978         dmar_writeq(iommu->reg + tlb_offset + 8, val);
979
980         /* Make sure hardware completes it */
981         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
982                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
983
984         spin_unlock_irqrestore(&iommu->register_lock, flag);
985
986         /* check IOTLB invalidation granularity */
987         if (DMA_TLB_IAIG(val) == 0)
988                 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
989         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
990                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
991                         (unsigned long long)DMA_TLB_IIRG(type),
992                         (unsigned long long)DMA_TLB_IAIG(val));
993 }
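/*
 * Example (illustration only): a PSI flush for domain 5 covering 2^4 pages at
 * address 0x100000 writes (0x100000 | 4) into the IVA register at tlb_offset
 * and DMA_TLB_PSI_FLUSH | DMA_TLB_IVT | DMA_TLB_DID(5) (plus the optional
 * drain bits) into the IOTLB register at tlb_offset + 8.
 */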
994
995 static struct device_domain_info *iommu_support_dev_iotlb(
996         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
997 {
998         int found = 0;
999         unsigned long flags;
1000         struct device_domain_info *info;
1001         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1002
1003         if (!ecap_dev_iotlb_support(iommu->ecap))
1004                 return NULL;
1005
1006         if (!iommu->qi)
1007                 return NULL;
1008
1009         spin_lock_irqsave(&device_domain_lock, flags);
1010         list_for_each_entry(info, &domain->devices, link)
1011                 if (info->bus == bus && info->devfn == devfn) {
1012                         found = 1;
1013                         break;
1014                 }
1015         spin_unlock_irqrestore(&device_domain_lock, flags);
1016
1017         if (!found || !info->dev)
1018                 return NULL;
1019
1020         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1021                 return NULL;
1022
1023         if (!dmar_find_matched_atsr_unit(info->dev))
1024                 return NULL;
1025
1026         info->iommu = iommu;
1027
1028         return info;
1029 }
1030
1031 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1032 {
1033         if (!info)
1034                 return;
1035
1036         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1037 }
1038
1039 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1040 {
1041         if (!info->dev || !pci_ats_enabled(info->dev))
1042                 return;
1043
1044         pci_disable_ats(info->dev);
1045 }
1046
1047 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1048                                   u64 addr, unsigned mask)
1049 {
1050         u16 sid, qdep;
1051         unsigned long flags;
1052         struct device_domain_info *info;
1053
1054         spin_lock_irqsave(&device_domain_lock, flags);
1055         list_for_each_entry(info, &domain->devices, link) {
1056                 if (!info->dev || !pci_ats_enabled(info->dev))
1057                         continue;
1058
1059                 sid = info->bus << 8 | info->devfn;
1060                 qdep = pci_ats_queue_depth(info->dev);
1061                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1062         }
1063         spin_unlock_irqrestore(&device_domain_lock, flags);
1064 }
1065
1066 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1067                                   unsigned long pfn, unsigned int pages, int map)
1068 {
1069         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1070         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1071
1072         BUG_ON(pages == 0);
1073
1074         /*
1075          * Fall back to a domain-selective flush if there is no PSI support
1076          * or the size is too big.
1077          * PSI requires the page count to be a power of two, and the base
1078          * address to be naturally aligned to that size.
1079          */
1080         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1081                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1082                                                 DMA_TLB_DSI_FLUSH);
1083         else
1084                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1085                                                 DMA_TLB_PSI_FLUSH);
1086
1087         /*
1088          * In caching mode, changes of pages from non-present to present require
1089          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1090          */
1091         if (!cap_caching_mode(iommu->cap) || !map)
1092                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1093 }
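/*
 * Example: pages == 9 is rounded up to 16, so mask == 4 and the PSI flush
 * above covers a naturally aligned block of 16 VT-d pages (64KiB).
 */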
1094
1095 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1096 {
1097         u32 pmen;
1098         unsigned long flags;
1099
1100         spin_lock_irqsave(&iommu->register_lock, flags);
1101         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1102         pmen &= ~DMA_PMEN_EPM;
1103         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1104
1105         /* wait for the protected region status bit to clear */
1106         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1107                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1108
1109         spin_unlock_irqrestore(&iommu->register_lock, flags);
1110 }
1111
1112 static int iommu_enable_translation(struct intel_iommu *iommu)
1113 {
1114         u32 sts;
1115         unsigned long flags;
1116
1117         spin_lock_irqsave(&iommu->register_lock, flags);
1118         iommu->gcmd |= DMA_GCMD_TE;
1119         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1120
1121         /* Make sure hardware completes it */
1122         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1123                       readl, (sts & DMA_GSTS_TES), sts);
1124
1125         spin_unlock_irqrestore(&iommu->register_lock, flags);
1126         return 0;
1127 }
1128
1129 static int iommu_disable_translation(struct intel_iommu *iommu)
1130 {
1131         u32 sts;
1132         unsigned long flag;
1133
1134         spin_lock_irqsave(&iommu->register_lock, flag);
1135         iommu->gcmd &= ~DMA_GCMD_TE;
1136         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1137
1138         /* Make sure hardware completes it */
1139         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1140                       readl, (!(sts & DMA_GSTS_TES)), sts);
1141
1142         spin_unlock_irqrestore(&iommu->register_lock, flag);
1143         return 0;
1144 }
1145
1146
1147 static int iommu_init_domains(struct intel_iommu *iommu)
1148 {
1149         unsigned long ndomains;
1150         unsigned long nlongs;
1151
1152         ndomains = cap_ndoms(iommu->cap);
1153         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1154         nlongs = BITS_TO_LONGS(ndomains);
1155
1156         spin_lock_init(&iommu->lock);
1157
1158         /* TBD: there might be 64K domains;
1159          * consider a different allocation scheme for future chips
1160          */
1161         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1162         if (!iommu->domain_ids) {
1163                 printk(KERN_ERR "Allocating domain id array failed\n");
1164                 return -ENOMEM;
1165         }
1166         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1167                         GFP_KERNEL);
1168         if (!iommu->domains) {
1169                 printk(KERN_ERR "Allocating domain array failed\n");
1170                 return -ENOMEM;
1171         }
1172
1173         /*
1174          * If Caching mode is set, then invalid translations are tagged
1175          * with domain id 0. Hence we need to reserve it.
1176          */
1177         if (cap_caching_mode(iommu->cap))
1178                 set_bit(0, iommu->domain_ids);
1179         return 0;
1180 }
1181
1182
1183 static void domain_exit(struct dmar_domain *domain);
1184 static void vm_domain_exit(struct dmar_domain *domain);
1185
1186 void free_dmar_iommu(struct intel_iommu *iommu)
1187 {
1188         struct dmar_domain *domain;
1189         int i;
1190         unsigned long flags;
1191
1192         if ((iommu->domains) && (iommu->domain_ids)) {
1193                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1194                         domain = iommu->domains[i];
1195                         clear_bit(i, iommu->domain_ids);
1196
1197                         spin_lock_irqsave(&domain->iommu_lock, flags);
1198                         if (--domain->iommu_count == 0) {
1199                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1200                                         vm_domain_exit(domain);
1201                                 else
1202                                         domain_exit(domain);
1203                         }
1204                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1205                 }
1206         }
1207
1208         if (iommu->gcmd & DMA_GCMD_TE)
1209                 iommu_disable_translation(iommu);
1210
1211         if (iommu->irq) {
1212                 set_irq_data(iommu->irq, NULL);
1213                 /* This will mask the irq */
1214                 free_irq(iommu->irq, iommu);
1215                 destroy_irq(iommu->irq);
1216         }
1217
1218         kfree(iommu->domains);
1219         kfree(iommu->domain_ids);
1220
1221         g_iommus[iommu->seq_id] = NULL;
1222
1223         /* if all iommus are freed, free g_iommus */
1224         for (i = 0; i < g_num_of_iommus; i++) {
1225                 if (g_iommus[i])
1226                         break;
1227         }
1228
1229         if (i == g_num_of_iommus)
1230                 kfree(g_iommus);
1231
1232         /* free context mapping */
1233         free_context_table(iommu);
1234 }
1235
1236 static struct dmar_domain *alloc_domain(void)
1237 {
1238         struct dmar_domain *domain;
1239
1240         domain = alloc_domain_mem();
1241         if (!domain)
1242                 return NULL;
1243
1244         domain->nid = -1;
1245         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1246         domain->flags = 0;
1247
1248         return domain;
1249 }
1250
1251 static int iommu_attach_domain(struct dmar_domain *domain,
1252                                struct intel_iommu *iommu)
1253 {
1254         int num;
1255         unsigned long ndomains;
1256         unsigned long flags;
1257
1258         ndomains = cap_ndoms(iommu->cap);
1259
1260         spin_lock_irqsave(&iommu->lock, flags);
1261
1262         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1263         if (num >= ndomains) {
1264                 spin_unlock_irqrestore(&iommu->lock, flags);
1265                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1266                 return -ENOMEM;
1267         }
1268
1269         domain->id = num;
1270         set_bit(num, iommu->domain_ids);
1271         set_bit(iommu->seq_id, &domain->iommu_bmp);
1272         iommu->domains[num] = domain;
1273         spin_unlock_irqrestore(&iommu->lock, flags);
1274
1275         return 0;
1276 }
1277
1278 static void iommu_detach_domain(struct dmar_domain *domain,
1279                                 struct intel_iommu *iommu)
1280 {
1281         unsigned long flags;
1282         int num, ndomains;
1283         int found = 0;
1284
1285         spin_lock_irqsave(&iommu->lock, flags);
1286         ndomains = cap_ndoms(iommu->cap);
1287         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1288                 if (iommu->domains[num] == domain) {
1289                         found = 1;
1290                         break;
1291                 }
1292         }
1293
1294         if (found) {
1295                 clear_bit(num, iommu->domain_ids);
1296                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1297                 iommu->domains[num] = NULL;
1298         }
1299         spin_unlock_irqrestore(&iommu->lock, flags);
1300 }
1301
1302 static struct iova_domain reserved_iova_list;
1303 static struct lock_class_key reserved_rbtree_key;
1304
1305 static void dmar_init_reserved_ranges(void)
1306 {
1307         struct pci_dev *pdev = NULL;
1308         struct iova *iova;
1309         int i;
1310
1311         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1312
1313         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1314                 &reserved_rbtree_key);
1315
1316         /* IOAPIC ranges shouldn't be accessed by DMA */
1317         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1318                 IOVA_PFN(IOAPIC_RANGE_END));
1319         if (!iova)
1320                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1321
1322         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1323         for_each_pci_dev(pdev) {
1324                 struct resource *r;
1325
1326                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1327                         r = &pdev->resource[i];
1328                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1329                                 continue;
1330                         iova = reserve_iova(&reserved_iova_list,
1331                                             IOVA_PFN(r->start),
1332                                             IOVA_PFN(r->end));
1333                         if (!iova)
1334                                 printk(KERN_ERR "Reserve iova failed\n");
1335                 }
1336         }
1337
1338 }
1339
1340 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1341 {
1342         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1343 }
1344
1345 static inline int guestwidth_to_adjustwidth(int gaw)
1346 {
1347         int agaw;
1348         int r = (gaw - 12) % 9;
1349
1350         if (r == 0)
1351                 agaw = gaw;
1352         else
1353                 agaw = gaw + 9 - r;
1354         if (agaw > 64)
1355                 agaw = 64;
1356         return agaw;
1357 }
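/*
 * Examples: gaw == 39 and gaw == 48 already sit on a page-table level
 * boundary ((gaw - 12) % 9 == 0) and are returned unchanged, while gaw == 40
 * is rounded up to the next boundary, 48.
 */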
1358
1359 static int domain_init(struct dmar_domain *domain, int guest_width)
1360 {
1361         struct intel_iommu *iommu;
1362         int adjust_width, agaw;
1363         unsigned long sagaw;
1364
1365         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1366         spin_lock_init(&domain->iommu_lock);
1367
1368         domain_reserve_special_ranges(domain);
1369
1370         /* calculate AGAW */
1371         iommu = domain_get_iommu(domain);
1372         if (guest_width > cap_mgaw(iommu->cap))
1373                 guest_width = cap_mgaw(iommu->cap);
1374         domain->gaw = guest_width;
1375         adjust_width = guestwidth_to_adjustwidth(guest_width);
1376         agaw = width_to_agaw(adjust_width);
1377         sagaw = cap_sagaw(iommu->cap);
1378         if (!test_bit(agaw, &sagaw)) {
1379                 /* hardware doesn't support it, choose a bigger one */
1380                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1381                 agaw = find_next_bit(&sagaw, 5, agaw);
1382                 if (agaw >= 5)
1383                         return -ENODEV;
1384         }
1385         domain->agaw = agaw;
1386         INIT_LIST_HEAD(&domain->devices);
1387
1388         if (ecap_coherent(iommu->ecap))
1389                 domain->iommu_coherency = 1;
1390         else
1391                 domain->iommu_coherency = 0;
1392
1393         if (ecap_sc_support(iommu->ecap))
1394                 domain->iommu_snooping = 1;
1395         else
1396                 domain->iommu_snooping = 0;
1397
1398         domain->iommu_count = 1;
1399         domain->nid = iommu->node;
1400
1401         /* always allocate the top pgd */
1402         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1403         if (!domain->pgd)
1404                 return -ENOMEM;
1405         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1406         return 0;
1407 }
1408
1409 static void domain_exit(struct dmar_domain *domain)
1410 {
1411         struct dmar_drhd_unit *drhd;
1412         struct intel_iommu *iommu;
1413
1414         /* Domain 0 is reserved, so don't process it */
1415         if (!domain)
1416                 return;
1417
1418         domain_remove_dev_info(domain);
1419         /* destroy iovas */
1420         put_iova_domain(&domain->iovad);
1421
1422         /* clear ptes */
1423         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1424
1425         /* free page tables */
1426         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1427
1428         for_each_active_iommu(iommu, drhd)
1429                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1430                         iommu_detach_domain(domain, iommu);
1431
1432         free_domain_mem(domain);
1433 }
1434
1435 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1436                                  u8 bus, u8 devfn, int translation)
1437 {
1438         struct context_entry *context;
1439         unsigned long flags;
1440         struct intel_iommu *iommu;
1441         struct dma_pte *pgd;
1442         unsigned long num;
1443         unsigned long ndomains;
1444         int id;
1445         int agaw;
1446         struct device_domain_info *info = NULL;
1447
1448         pr_debug("Set context mapping for %02x:%02x.%d\n",
1449                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1450
1451         BUG_ON(!domain->pgd);
1452         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1453                translation != CONTEXT_TT_MULTI_LEVEL);
1454
1455         iommu = device_to_iommu(segment, bus, devfn);
1456         if (!iommu)
1457                 return -ENODEV;
1458
1459         context = device_to_context_entry(iommu, bus, devfn);
1460         if (!context)
1461                 return -ENOMEM;
1462         spin_lock_irqsave(&iommu->lock, flags);
1463         if (context_present(context)) {
1464                 spin_unlock_irqrestore(&iommu->lock, flags);
1465                 return 0;
1466         }
1467
1468         id = domain->id;
1469         pgd = domain->pgd;
1470
1471         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1472             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1473                 int found = 0;
1474
1475                 /* find an available domain id for this device in iommu */
1476                 ndomains = cap_ndoms(iommu->cap);
1477                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1478                         if (iommu->domains[num] == domain) {
1479                                 id = num;
1480                                 found = 1;
1481                                 break;
1482                         }
1483                 }
1484
1485                 if (found == 0) {
1486                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1487                         if (num >= ndomains) {
1488                                 spin_unlock_irqrestore(&iommu->lock, flags);
1489                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1490                                 return -EFAULT;
1491                         }
1492
1493                         set_bit(num, iommu->domain_ids);
1494                         iommu->domains[num] = domain;
1495                         id = num;
1496                 }
1497
1498                 /* Skip top levels of page tables for
1499                  * iommus which have a smaller agaw than the default.
1500                  * Unnecessary for PT mode.
1501                  */
1502                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1503                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1504                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1505                                 if (!dma_pte_present(pgd)) {
1506                                         spin_unlock_irqrestore(&iommu->lock, flags);
1507                                         return -ENOMEM;
1508                                 }
1509                         }
1510                 }
1511         }
1512
1513         context_set_domain_id(context, id);
1514
1515         if (translation != CONTEXT_TT_PASS_THROUGH) {
1516                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1517                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1518                                      CONTEXT_TT_MULTI_LEVEL;
1519         }
1520         /*
1521          * In pass-through mode, AW must be programmed to indicate the largest
1522          * AGAW value supported by hardware, and ASR is ignored by hardware.
1523          */
1524         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1525                 context_set_address_width(context, iommu->msagaw);
1526         else {
1527                 context_set_address_root(context, virt_to_phys(pgd));
1528                 context_set_address_width(context, iommu->agaw);
1529         }
1530
1531         context_set_translation_type(context, translation);
1532         context_set_fault_enable(context);
1533         context_set_present(context);
1534         domain_flush_cache(domain, context, sizeof(*context));
1535
1536         /*
1537          * It's a non-present to present mapping. If hardware doesn't cache
1538          * non-present entries we only need to flush the write-buffer. If it
1539          * _does_ cache non-present entries, then it does so in the special
1540          * domain #0, which we have to flush:
1541          */
1542         if (cap_caching_mode(iommu->cap)) {
1543                 iommu->flush.flush_context(iommu, 0,
1544                                            (((u16)bus) << 8) | devfn,
1545                                            DMA_CCMD_MASK_NOBIT,
1546                                            DMA_CCMD_DEVICE_INVL);
1547                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1548         } else {
1549                 iommu_flush_write_buffer(iommu);
1550         }
1551         iommu_enable_dev_iotlb(info);
1552         spin_unlock_irqrestore(&iommu->lock, flags);
1553
1554         spin_lock_irqsave(&domain->iommu_lock, flags);
1555         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1556                 domain->iommu_count++;
1557                 if (domain->iommu_count == 1)
1558                         domain->nid = iommu->node;
1559                 domain_update_iommu_cap(domain);
1560         }
1561         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1562         return 0;
1563 }
1564
1565 static int
1566 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1567                         int translation)
1568 {
1569         int ret;
1570         struct pci_dev *tmp, *parent;
1571
1572         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1573                                          pdev->bus->number, pdev->devfn,
1574                                          translation);
1575         if (ret)
1576                 return ret;
1577
1578         /* dependent device mapping */
1579         tmp = pci_find_upstream_pcie_bridge(pdev);
1580         if (!tmp)
1581                 return 0;
1582         /* Secondary interface's bus number and devfn 0 */
1583         parent = pdev->bus->self;
1584         while (parent != tmp) {
1585                 ret = domain_context_mapping_one(domain,
1586                                                  pci_domain_nr(parent->bus),
1587                                                  parent->bus->number,
1588                                                  parent->devfn, translation);
1589                 if (ret)
1590                         return ret;
1591                 parent = parent->bus->self;
1592         }
1593         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1594                 return domain_context_mapping_one(domain,
1595                                         pci_domain_nr(tmp->subordinate),
1596                                         tmp->subordinate->number, 0,
1597                                         translation);
1598         else /* this is a legacy PCI bridge */
1599                 return domain_context_mapping_one(domain,
1600                                                   pci_domain_nr(tmp->bus),
1601                                                   tmp->bus->number,
1602                                                   tmp->devfn,
1603                                                   translation);
1604 }
1605
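/*
 * Mirror of domain_context_mapping(): returns non-zero only if the
 * device *and* every bridge on its upstream path already have a
 * context entry programmed.
 */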
1606 static int domain_context_mapped(struct pci_dev *pdev)
1607 {
1608         int ret;
1609         struct pci_dev *tmp, *parent;
1610         struct intel_iommu *iommu;
1611
1612         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1613                                 pdev->devfn);
1614         if (!iommu)
1615                 return -ENODEV;
1616
1617         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1618         if (!ret)
1619                 return ret;
1620         /* dependent device mapping */
1621         tmp = pci_find_upstream_pcie_bridge(pdev);
1622         if (!tmp)
1623                 return ret;
1624         /* Secondary interface's bus number and devfn 0 */
1625         parent = pdev->bus->self;
1626         while (parent != tmp) {
1627                 ret = device_context_mapped(iommu, parent->bus->number,
1628                                             parent->devfn);
1629                 if (!ret)
1630                         return ret;
1631                 parent = parent->bus->self;
1632         }
1633         if (pci_is_pcie(tmp))
1634                 return device_context_mapped(iommu, tmp->subordinate->number,
1635                                              0);
1636         else
1637                 return device_context_mapped(iommu, tmp->bus->number,
1638                                              tmp->devfn);
1639 }
1640
1641 /* Returns a number of VTD pages, but aligned to MM page size */
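/*
 * Worked example (assuming 4KiB pages, i.e. PAGE_SHIFT == VTD_PAGE_SHIFT
 * == 12): host_addr = 0x1100, size = 0x2000 gives
 * PAGE_ALIGN(0x100 + 0x2000) >> 12 = 0x3000 >> 12 = 3 pages, i.e. the
 * partially-used first and last pages are both counted.
 */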
1642 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1643                                             size_t size)
1644 {
1645         host_addr &= ~PAGE_MASK;
1646         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1647 }
1648
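/*
 * Core mapping routine, called either with a scatterlist (sg != NULL,
 * phys_pfn ignored) or with a physically contiguous range (sg == NULL,
 * starting at phys_pfn).  In the contiguous case sg_res is primed to
 * nr_pages + 1 so that it can never reach zero inside the loop and the
 * NULL sg pointer is never passed to sg_next().
 */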
1649 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1650                             struct scatterlist *sg, unsigned long phys_pfn,
1651                             unsigned long nr_pages, int prot)
1652 {
1653         struct dma_pte *first_pte = NULL, *pte = NULL;
1654         phys_addr_t uninitialized_var(pteval);
1655         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1656         unsigned long sg_res;
1657
1658         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1659
1660         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1661                 return -EINVAL;
1662
1663         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1664
1665         if (sg)
1666                 sg_res = 0;
1667         else {
1668                 sg_res = nr_pages + 1;
1669                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1670         }
1671
1672         while (nr_pages--) {
1673                 uint64_t tmp;
1674
1675                 if (!sg_res) {
1676                         sg_res = aligned_nrpages(sg->offset, sg->length);
1677                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1678                         sg->dma_length = sg->length;
1679                         pteval = page_to_phys(sg_page(sg)) | prot;
1680                 }
1681                 if (!pte) {
1682                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1683                         if (!pte)
1684                                 return -ENOMEM;
1685                 }
1686                 /* We don't need a lock here; nobody else
1687                  * touches this iova range.
1688                  */
1689                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1690                 if (tmp) {
1691                         static int dumps = 5;
1692                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1693                                iov_pfn, tmp, (unsigned long long)pteval);
1694                         if (dumps) {
1695                                 dumps--;
1696                                 debug_dma_dump_mappings(NULL);
1697                         }
1698                         WARN_ON(1);
1699                 }
1700                 pte++;
1701                 if (!nr_pages || first_pte_in_page(pte)) {
1702                         domain_flush_cache(domain, first_pte,
1703                                            (void *)pte - (void *)first_pte);
1704                         pte = NULL;
1705                 }
1706                 iov_pfn++;
1707                 pteval += VTD_PAGE_SIZE;
1708                 sg_res--;
1709                 if (!sg_res)
1710                         sg = sg_next(sg);
1711         }
1712         return 0;
1713 }
1714
1715 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1716                                     struct scatterlist *sg, unsigned long nr_pages,
1717                                     int prot)
1718 {
1719         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1720 }
1721
1722 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1723                                      unsigned long phys_pfn, unsigned long nr_pages,
1724                                      int prot)
1725 {
1726         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1727 }
1728
1729 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1730 {
1731         if (!iommu)
1732                 return;
1733
1734         clear_context_table(iommu, bus, devfn);
1735         iommu->flush.flush_context(iommu, 0, 0, 0,
1736                                            DMA_CCMD_GLOBAL_INVL);
1737         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1738 }
1739
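/*
 * Note that device_domain_lock is dropped around the per-device
 * teardown (iommu_disable_dev_iotlb()/iommu_detach_dev()) and
 * re-taken before looking at the next list entry, so the list head is
 * re-checked on every iteration.
 */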
1740 static void domain_remove_dev_info(struct dmar_domain *domain)
1741 {
1742         struct device_domain_info *info;
1743         unsigned long flags;
1744         struct intel_iommu *iommu;
1745
1746         spin_lock_irqsave(&device_domain_lock, flags);
1747         while (!list_empty(&domain->devices)) {
1748                 info = list_entry(domain->devices.next,
1749                         struct device_domain_info, link);
1750                 list_del(&info->link);
1751                 list_del(&info->global);
1752                 if (info->dev)
1753                         info->dev->dev.archdata.iommu = NULL;
1754                 spin_unlock_irqrestore(&device_domain_lock, flags);
1755
1756                 iommu_disable_dev_iotlb(info);
1757                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1758                 iommu_detach_dev(iommu, info->bus, info->devfn);
1759                 free_devinfo_mem(info);
1760
1761                 spin_lock_irqsave(&device_domain_lock, flags);
1762         }
1763         spin_unlock_irqrestore(&device_domain_lock, flags);
1764 }
1765
1766 /*
1767  * find_domain
1768  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1769  */
1770 static struct dmar_domain *
1771 find_domain(struct pci_dev *pdev)
1772 {
1773         struct device_domain_info *info;
1774
1775         /* No lock here, assumes no domain exit in normal case */
1776         info = pdev->dev.archdata.iommu;
1777         if (info)
1778                 return info->domain;
1779         return NULL;
1780 }
1781
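/*
 * get_domain_for_dev() flow: reuse the device's existing domain if it
 * has one; if the device sits behind a PCIe-to-PCI bridge, share the
 * domain already used by other devices behind that bridge (they all
 * share one source-id); otherwise allocate a fresh domain, attach it
 * to the DRHD unit that scopes this device and initialise its page
 * tables.
 */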
1782 /* domain is initialized */
1783 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1784 {
1785         struct dmar_domain *domain, *found = NULL;
1786         struct intel_iommu *iommu;
1787         struct dmar_drhd_unit *drhd;
1788         struct device_domain_info *info, *tmp;
1789         struct pci_dev *dev_tmp;
1790         unsigned long flags;
1791         int bus = 0, devfn = 0;
1792         int segment;
1793         int ret;
1794
1795         domain = find_domain(pdev);
1796         if (domain)
1797                 return domain;
1798
1799         segment = pci_domain_nr(pdev->bus);
1800
1801         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1802         if (dev_tmp) {
1803                 if (pci_is_pcie(dev_tmp)) {
1804                         bus = dev_tmp->subordinate->number;
1805                         devfn = 0;
1806                 } else {
1807                         bus = dev_tmp->bus->number;
1808                         devfn = dev_tmp->devfn;
1809                 }
1810                 spin_lock_irqsave(&device_domain_lock, flags);
1811                 list_for_each_entry(info, &device_domain_list, global) {
1812                         if (info->segment == segment &&
1813                             info->bus == bus && info->devfn == devfn) {
1814                                 found = info->domain;
1815                                 break;
1816                         }
1817                 }
1818                 spin_unlock_irqrestore(&device_domain_lock, flags);
1819                 /* pcie-pci bridge already has a domain, use it */
1820                 if (found) {
1821                         domain = found;
1822                         goto found_domain;
1823                 }
1824         }
1825
1826         domain = alloc_domain();
1827         if (!domain)
1828                 goto error;
1829
1830         /* Allocate new domain for the device */
1831         drhd = dmar_find_matched_drhd_unit(pdev);
1832         if (!drhd) {
1833                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1834                         pci_name(pdev));
1835                 return NULL;
1836         }
1837         iommu = drhd->iommu;
1838
1839         ret = iommu_attach_domain(domain, iommu);
1840         if (ret) {
1841                 domain_exit(domain);
1842                 goto error;
1843         }
1844
1845         if (domain_init(domain, gaw)) {
1846                 domain_exit(domain);
1847                 goto error;
1848         }
1849
1850         /* register pcie-to-pci device */
1851         if (dev_tmp) {
1852                 info = alloc_devinfo_mem();
1853                 if (!info) {
1854                         domain_exit(domain);
1855                         goto error;
1856                 }
1857                 info->segment = segment;
1858                 info->bus = bus;
1859                 info->devfn = devfn;
1860                 info->dev = NULL;
1861                 info->domain = domain;
1862                 /* This domain is shared by devices under p2p bridge */
1863                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1864
1865                 /* pcie-to-pci bridge already has a domain, use it */
1866                 found = NULL;
1867                 spin_lock_irqsave(&device_domain_lock, flags);
1868                 list_for_each_entry(tmp, &device_domain_list, global) {
1869                         if (tmp->segment == segment &&
1870                             tmp->bus == bus && tmp->devfn == devfn) {
1871                                 found = tmp->domain;
1872                                 break;
1873                         }
1874                 }
1875                 if (found) {
1876                         free_devinfo_mem(info);
1877                         domain_exit(domain);
1878                         domain = found;
1879                 } else {
1880                         list_add(&info->link, &domain->devices);
1881                         list_add(&info->global, &device_domain_list);
1882                 }
1883                 spin_unlock_irqrestore(&device_domain_lock, flags);
1884         }
1885
1886 found_domain:
1887         info = alloc_devinfo_mem();
1888         if (!info)
1889                 goto error;
1890         info->segment = segment;
1891         info->bus = pdev->bus->number;
1892         info->devfn = pdev->devfn;
1893         info->dev = pdev;
1894         info->domain = domain;
1895         spin_lock_irqsave(&device_domain_lock, flags);
1896         /* somebody else was faster and already set it up */
1897         found = find_domain(pdev);
1898         if (found != NULL) {
1899                 spin_unlock_irqrestore(&device_domain_lock, flags);
1900                 if (found != domain) {
1901                         domain_exit(domain);
1902                         domain = found;
1903                 }
1904                 free_devinfo_mem(info);
1905                 return domain;
1906         }
1907         list_add(&info->link, &domain->devices);
1908         list_add(&info->global, &device_domain_list);
1909         pdev->dev.archdata.iommu = info;
1910         spin_unlock_irqrestore(&device_domain_lock, flags);
1911         return domain;
1912 error:
1913         /* recheck it here, maybe others set it */
1914         return find_domain(pdev);
1915 }
1916
1917 static int iommu_identity_mapping;
1918 #define IDENTMAP_ALL            1
1919 #define IDENTMAP_GFX            2
1920 #define IDENTMAP_AZALIA         4
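/*
 * iommu_identity_mapping is a bitmask selecting which classes of
 * devices get placed in the static identity (1:1) domain: all devices,
 * graphics devices, and/or the Azalia audio device covered by the
 * Tylersburg isochronous quirk (see check_tylersburg_isoch()).
 */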
1921
1922 static int iommu_domain_identity_map(struct dmar_domain *domain,
1923                                      unsigned long long start,
1924                                      unsigned long long end)
1925 {
1926         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1927         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1928
1929         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1930                           dma_to_mm_pfn(last_vpfn))) {
1931                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1932                 return -ENOMEM;
1933         }
1934
1935         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1936                  start, end, domain->id);
1937         /*
1938          * The RMRR range might overlap an already-mapped physical memory
1939          * range, so clear it first.
1940          */
1941         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1942
1943         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1944                                   last_vpfn - first_vpfn + 1,
1945                                   DMA_PTE_READ|DMA_PTE_WRITE);
1946 }
1947
1948 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1949                                       unsigned long long start,
1950                                       unsigned long long end)
1951 {
1952         struct dmar_domain *domain;
1953         int ret;
1954
1955         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1956         if (!domain)
1957                 return -ENOMEM;
1958
1959         /* For _hardware_ passthrough, don't bother. But for software
1960            passthrough, we do it anyway -- it may indicate a memory
1961            range which is reserved in E820 and so didn't get set
1962            up in si_domain to start with */
1963         if (domain == si_domain && hw_pass_through) {
1964                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1965                        pci_name(pdev), start, end);
1966                 return 0;
1967         }
1968
1969         printk(KERN_INFO
1970                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1971                pci_name(pdev), start, end);
1972         
1973         if (end < start) {
1974                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1975                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1976                         dmi_get_system_info(DMI_BIOS_VENDOR),
1977                         dmi_get_system_info(DMI_BIOS_VERSION),
1978                      dmi_get_system_info(DMI_PRODUCT_VERSION));
1979                         dmi_get_system_info(DMI_PRODUCT_VERSION));
1980                 goto error;
1981         }
1982
1983         if (end >> agaw_to_width(domain->agaw)) {
1984                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1985                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1986                      agaw_to_width(domain->agaw),
1987                      dmi_get_system_info(DMI_BIOS_VENDOR),
1988                      dmi_get_system_info(DMI_BIOS_VERSION),
1989                      dmi_get_system_info(DMI_PRODUCT_VERSION));
1990                 ret = -EIO;
1991                 goto error;
1992         }
1993
1994         ret = iommu_domain_identity_map(domain, start, end);
1995         if (ret)
1996                 goto error;
1997
1998         /* context entry init */
1999         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2000         if (ret)
2001                 goto error;
2002
2003         return 0;
2004
2005  error:
2006         domain_exit(domain);
2007         return ret;
2008 }
2009
2010 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2011         struct pci_dev *pdev)
2012 {
2013         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2014                 return 0;
2015         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2016                 rmrr->end_address + 1);
2017 }
2018
2019 #ifdef CONFIG_DMAR_FLOPPY_WA
2020 static inline void iommu_prepare_isa(void)
2021 {
2022         struct pci_dev *pdev;
2023         int ret;
2024
2025         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2026         if (!pdev)
2027                 return;
2028
2029         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2030         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2031
2032         if (ret)
2033                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2034                        "floppy might not work\n");
2035
2036 }
2037 #else
2038 static inline void iommu_prepare_isa(void)
2039 {
2040         return;
2041 }
2042 #endif /* CONFIG_DMAR_FLOPPY_WA */
2043
2044 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2045
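/*
 * si_domain is the single shared "static identity" domain used when
 * devices are identity-mapped in software: si_domain_init() attaches
 * it to every active IOMMU and, unless hardware pass-through is in
 * use, maps each online node's memory regions 1:1 via
 * si_domain_work_fn().
 */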
2046 static int __init si_domain_work_fn(unsigned long start_pfn,
2047                                     unsigned long end_pfn, void *datax)
2048 {
2049         int *ret = datax;
2050
2051         *ret = iommu_domain_identity_map(si_domain,
2052                                          (uint64_t)start_pfn << PAGE_SHIFT,
2053                                          (uint64_t)end_pfn << PAGE_SHIFT);
2054         return *ret;
2055
2056 }
2057
2058 static int __init si_domain_init(int hw)
2059 {
2060         struct dmar_drhd_unit *drhd;
2061         struct intel_iommu *iommu;
2062         int nid, ret = 0;
2063
2064         si_domain = alloc_domain();
2065         if (!si_domain)
2066                 return -EFAULT;
2067
2068         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2069
2070         for_each_active_iommu(iommu, drhd) {
2071                 ret = iommu_attach_domain(si_domain, iommu);
2072                 if (ret) {
2073                         domain_exit(si_domain);
2074                         return -EFAULT;
2075                 }
2076         }
2077
2078         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2079                 domain_exit(si_domain);
2080                 return -EFAULT;
2081         }
2082
2083         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2084
2085         if (hw)
2086                 return 0;
2087
2088         for_each_online_node(nid) {
2089                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2090                 if (ret)
2091                         return ret;
2092         }
2093
2094         return 0;
2095 }
2096
2097 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2098                                           struct pci_dev *pdev);
2099 static int identity_mapping(struct pci_dev *pdev)
2100 {
2101         struct device_domain_info *info;
2102
2103         if (likely(!iommu_identity_mapping))
2104                 return 0;
2105
2107         list_for_each_entry(info, &si_domain->devices, link)
2108                 if (info->dev == pdev)
2109                         return 1;
2110         return 0;
2111 }
2112
2113 static int domain_add_dev_info(struct dmar_domain *domain,
2114                                struct pci_dev *pdev,
2115                                int translation)
2116 {
2117         struct device_domain_info *info;
2118         unsigned long flags;
2119         int ret;
2120
2121         info = alloc_devinfo_mem();
2122         if (!info)
2123                 return -ENOMEM;
2124
2125         ret = domain_context_mapping(domain, pdev, translation);
2126         if (ret) {
2127                 free_devinfo_mem(info);
2128                 return ret;
2129         }
2130
2131         info->segment = pci_domain_nr(pdev->bus);
2132         info->bus = pdev->bus->number;
2133         info->devfn = pdev->devfn;
2134         info->dev = pdev;
2135         info->domain = domain;
2136
2137         spin_lock_irqsave(&device_domain_lock, flags);
2138         list_add(&info->link, &domain->devices);
2139         list_add(&info->global, &device_domain_list);
2140         pdev->dev.archdata.iommu = info;
2141         spin_unlock_irqrestore(&device_domain_lock, flags);
2142
2143         return 0;
2144 }
2145
2146 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2147 {
2148         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2149                 return 1;
2150
2151         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2152                 return 1;
2153
2154         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2155                 return 0;
2156
2157         /*
2158          * We want to start off with all devices in the 1:1 domain, and
2159          * take them out later if we find they can't access all of memory.
2160          *
2161          * However, we can't do this for PCI devices behind bridges,
2162          * because all PCI devices behind the same bridge will end up
2163          * with the same source-id on their transactions.
2164          *
2165          * Practically speaking, we can't change things around for these
2166          * devices at run-time, because we can't be sure there'll be no
2167          * DMA transactions in flight for any of their siblings.
2168          * 
2169          * So PCI devices (unless they're on the root bus) as well as
2170          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2171          * the 1:1 domain, just in _case_ one of their siblings turns out
2172          * not to be able to map all of memory.
2173          */
2174         if (!pci_is_pcie(pdev)) {
2175                 if (!pci_is_root_bus(pdev->bus))
2176                         return 0;
2177                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2178                         return 0;
2179         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2180                 return 0;
2181
2182         /* 
2183          * At boot time, we don't yet know if devices will be 64-bit capable.
2184          * Assume that they will -- if they turn out not to be, then we can 
2185          * take them out of the 1:1 domain later.
2186          */
2187         if (!startup)
2188                 return pdev->dma_mask > DMA_BIT_MASK(32);
2189
2190         return 1;
2191 }
2192
2193 static int __init iommu_prepare_static_identity_mapping(int hw)
2194 {
2195         struct pci_dev *pdev = NULL;
2196         int ret;
2197
2198         ret = si_domain_init(hw);
2199         if (ret)
2200                 return -EFAULT;
2201
2202         for_each_pci_dev(pdev) {
2203                 if (iommu_should_identity_map(pdev, 1)) {
2204                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2205                                hw ? "hardware" : "software", pci_name(pdev));
2206
2207                         ret = domain_add_dev_info(si_domain, pdev,
2208                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2209                                                      CONTEXT_TT_MULTI_LEVEL);
2210                         if (ret)
2211                                 return ret;
2212                 }
2213         }
2214
2215         return 0;
2216 }
2217
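/*
 * init_dmars() takes each DRHD unit from whatever state the firmware
 * left it in to a fully programmed one: root/context tables are
 * allocated, queued invalidation is enabled where available, static
 * identity, RMRR and ISA mappings are installed, and translation is
 * finally switched on per IOMMU.
 */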
2218 int __init init_dmars(void)
2219 {
2220         struct dmar_drhd_unit *drhd;
2221         struct dmar_rmrr_unit *rmrr;
2222         struct pci_dev *pdev;
2223         struct intel_iommu *iommu;
2224         int i, ret;
2225
2226         /*
2227          * for each drhd
2228          *    allocate root
2229          *    initialize and program root entry to not present
2230          * endfor
2231          */
2232         for_each_drhd_unit(drhd) {
2233                 g_num_of_iommus++;
2234                 /*
2235                  * lock not needed as this is only incremented in the
2236                  * single-threaded kernel __init code path; all other
2237                  * accesses are read-only
2238                  */
2239         }
2240
2241         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2242                         GFP_KERNEL);
2243         if (!g_iommus) {
2244                 printk(KERN_ERR "Allocating global iommu array failed\n");
2245                 ret = -ENOMEM;
2246                 goto error;
2247         }
2248
2249         deferred_flush = kzalloc(g_num_of_iommus *
2250                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2251         if (!deferred_flush) {
2252                 ret = -ENOMEM;
2253                 goto error;
2254         }
2255
2256         for_each_drhd_unit(drhd) {
2257                 if (drhd->ignored)
2258                         continue;
2259
2260                 iommu = drhd->iommu;
2261                 g_iommus[iommu->seq_id] = iommu;
2262
2263                 ret = iommu_init_domains(iommu);
2264                 if (ret)
2265                         goto error;
2266
2267                 /*
2268                  * TBD:
2269                  * we could share the same root & context tables
2270                  * among all IOMMUs. Needs to be split out later.
2271                  */
2272                 ret = iommu_alloc_root_entry(iommu);
2273                 if (ret) {
2274                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2275                         goto error;
2276                 }
2277                 if (!ecap_pass_through(iommu->ecap))
2278                         hw_pass_through = 0;
2279         }
2280
2281         /*
2282          * Start from a sane IOMMU hardware state.
2283          */
2284         for_each_drhd_unit(drhd) {
2285                 if (drhd->ignored)
2286                         continue;
2287
2288                 iommu = drhd->iommu;
2289
2290                 /*
2291                  * If queued invalidation was already initialized by us
2292                  * (for example, while enabling interrupt-remapping), then
2293                  * things are already rolling from a sane state.
2294                  */
2295                 if (iommu->qi)
2296                         continue;
2297
2298                 /*
2299                  * Clear any previous faults.
2300                  */
2301                 dmar_fault(-1, iommu);
2302                 /*
2303                  * Disable queued invalidation if supported and already enabled
2304                  * before OS handover.
2305                  */
2306                 dmar_disable_qi(iommu);
2307         }
2308
2309         for_each_drhd_unit(drhd) {
2310                 if (drhd->ignored)
2311                         continue;
2312
2313                 iommu = drhd->iommu;
2314
2315                 if (dmar_enable_qi(iommu)) {
2316                         /*
2317                          * Queued Invalidation could not be enabled, use
2318                          * Register-Based Invalidation instead
2319                          */
2320                         iommu->flush.flush_context = __iommu_flush_context;
2321                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2322                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2323                                "invalidation\n",
2324                                (unsigned long long)drhd->reg_base_addr);
2325                 } else {
2326                         iommu->flush.flush_context = qi_flush_context;
2327                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2328                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2329                                "invalidation\n",
2330                                (unsigned long long)drhd->reg_base_addr);
2331                 }
2332         }
2333
2334         if (iommu_pass_through)
2335                 iommu_identity_mapping |= IDENTMAP_ALL;
2336
2337 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2338         iommu_identity_mapping |= IDENTMAP_GFX;
2339 #endif
2340
2341         check_tylersburg_isoch();
2342
2343         /*
2344          * If any identity mapping is requested (pass-through, or the gfx
2345          * and azalia workarounds), set up the static identity domain and
2346          * add the qualifying devices to it before the RMRR/ISA setup below.
2347          */
2348         if (iommu_identity_mapping) {
2349                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2350                 if (ret) {
2351                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2352                         goto error;
2353                 }
2354         }
2355         /*
2356          * For each rmrr
2357          *   for each dev attached to rmrr
2358          *   do
2359          *     locate drhd for dev, alloc domain for dev
2360          *     allocate free domain
2361          *     allocate page table entries for rmrr
2362          *     if context not allocated for bus
2363          *           allocate and init context
2364          *           set present in root table for this bus
2365          *     init context with domain, translation etc
2366          *    endfor
2367          * endfor
2368          */
2369         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2370         for_each_rmrr_units(rmrr) {
2371                 for (i = 0; i < rmrr->devices_cnt; i++) {
2372                         pdev = rmrr->devices[i];
2373                         /*
2374                          * some BIOSes list non-existent devices in the
2375                          * DMAR table.
2376                          */
2377                         if (!pdev)
2378                                 continue;
2379                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2380                         if (ret)
2381                                 printk(KERN_ERR
2382                                        "IOMMU: mapping reserved region failed\n");
2383                 }
2384         }
2385
2386         iommu_prepare_isa();
2387
2388         /*
2389          * for each drhd
2390          *   enable fault log
2391          *   global invalidate context cache
2392          *   global invalidate iotlb
2393          *   enable translation
2394          */
2395         for_each_drhd_unit(drhd) {
2396                 if (drhd->ignored)
2397                         continue;
2398                 iommu = drhd->iommu;
2399
2400                 iommu_flush_write_buffer(iommu);
2401
2402                 ret = dmar_set_interrupt(iommu);
2403                 if (ret)
2404                         goto error;
2405
2406                 iommu_set_root_entry(iommu);
2407
2408                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2409                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2410
2411                 ret = iommu_enable_translation(iommu);
2412                 if (ret)
2413                         goto error;
2414
2415                 iommu_disable_protect_mem_regions(iommu);
2416         }
2417
2418         return 0;
2419 error:
2420         for_each_drhd_unit(drhd) {
2421                 if (drhd->ignored)
2422                         continue;
2423                 iommu = drhd->iommu;
2424                 free_iommu(iommu);
2425         }
2426         kfree(g_iommus);
2427         return ret;
2428 }
2429
2430 /* This takes a number of _MM_ pages, not VTD pages */
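/* (The trailing '1' passed to alloc_iova() below requests a
 * size-aligned allocation, so large mappings stay naturally aligned.) */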
2431 static struct iova *intel_alloc_iova(struct device *dev,
2432                                      struct dmar_domain *domain,
2433                                      unsigned long nrpages, uint64_t dma_mask)
2434 {
2435         struct pci_dev *pdev = to_pci_dev(dev);
2436         struct iova *iova = NULL;
2437
2438         /* Restrict dma_mask to the width that the iommu can handle */
2439         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2440
2441         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2442                 /*
2443                  * First try to allocate an io virtual address in
2444                  * DMA_BIT_MASK(32) and if that fails then try allocating
2445                  * from higher range
2446                  */
2447                 iova = alloc_iova(&domain->iovad, nrpages,
2448                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2449                 if (iova)
2450                         return iova;
2451         }
2452         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2453         if (unlikely(!iova)) {
2454                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2455                        nrpages, pci_name(pdev));
2456                 return NULL;
2457         }
2458
2459         return iova;
2460 }
2461
2462 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2463 {
2464         struct dmar_domain *domain;
2465         int ret;
2466
2467         domain = get_domain_for_dev(pdev,
2468                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2469         if (!domain) {
2470                 printk(KERN_ERR
2471                         "Allocating domain for %s failed\n", pci_name(pdev));
2472                 return NULL;
2473         }
2474
2475         /* make sure context mapping is ok */
2476         if (unlikely(!domain_context_mapped(pdev))) {
2477                 ret = domain_context_mapping(domain, pdev,
2478                                              CONTEXT_TT_MULTI_LEVEL);
2479                 if (ret) {
2480                         printk(KERN_ERR
2481                                 "Domain context map for %s failed\n",
2482                                 pci_name(pdev));
2483                         return NULL;
2484                 }
2485         }
2486
2487         return domain;
2488 }
2489
2490 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2491 {
2492         struct device_domain_info *info;
2493
2494         /* No lock here, assumes no domain exit in normal case */
2495         info = dev->dev.archdata.iommu;
2496         if (likely(info))
2497                 return info->domain;
2498
2499         return __get_valid_domain_for_dev(dev);
2500 }
2501
2502 static int iommu_dummy(struct pci_dev *pdev)
2503 {
2504         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2505 }
2506
2507 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2508 static int iommu_no_mapping(struct device *dev)
2509 {
2510         struct pci_dev *pdev;
2511         int found;
2512
2513         if (unlikely(dev->bus != &pci_bus_type))
2514                 return 1;
2515
2516         pdev = to_pci_dev(dev);
2517         if (iommu_dummy(pdev))
2518                 return 1;
2519
2520         if (!iommu_identity_mapping)
2521                 return 0;
2522
2523         found = identity_mapping(pdev);
2524         if (found) {
2525                 if (iommu_should_identity_map(pdev, 0))
2526                         return 1;
2527                 else {
2528                         /*
2529                          * A 32-bit DMA device is removed from si_domain
2530                          * and falls back to non-identity mapping.
2531                          */
2532                         domain_remove_one_dev_info(si_domain, pdev);
2533                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2534                                pci_name(pdev));
2535                         return 0;
2536                 }
2537         } else {
2538                 /*
2539                  * When a 64-bit DMA device is detached from a VM, it is
2540                  * put back into si_domain for identity mapping.
2541                  */
2542                 if (iommu_should_identity_map(pdev, 0)) {
2543                         int ret;
2544                         ret = domain_add_dev_info(si_domain, pdev,
2545                                                   hw_pass_through ?
2546                                                   CONTEXT_TT_PASS_THROUGH :
2547                                                   CONTEXT_TT_MULTI_LEVEL);
2548                         if (!ret) {
2549                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2550                                        pci_name(pdev));
2551                                 return 1;
2552                         }
2553                 }
2554         }
2555
2556         return 0;
2557 }
2558
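/*
 * Common map path for intel_map_page(): identity-mapped devices get
 * the physical address handed straight back; everything else gets an
 * IOVA sized in whole VT-d pages, PTEs installed via
 * domain_pfn_mapping(), and then either a page-selective IOTLB flush
 * (caching mode) or just a write-buffer flush.  A return value of 0
 * signals failure (see intel_mapping_error()).
 */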
2559 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2560                                      size_t size, int dir, u64 dma_mask)
2561 {
2562         struct pci_dev *pdev = to_pci_dev(hwdev);
2563         struct dmar_domain *domain;
2564         phys_addr_t start_paddr;
2565         struct iova *iova;
2566         int prot = 0;
2567         int ret;
2568         struct intel_iommu *iommu;
2569         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2570
2571         BUG_ON(dir == DMA_NONE);
2572
2573         if (iommu_no_mapping(hwdev))
2574                 return paddr;
2575
2576         domain = get_valid_domain_for_dev(pdev);
2577         if (!domain)
2578                 return 0;
2579
2580         iommu = domain_get_iommu(domain);
2581         size = aligned_nrpages(paddr, size);
2582
2583         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2584                                 pdev->dma_mask);
2585         if (!iova)
2586                 goto error;
2587
2588         /*
2589          * Check if DMAR supports zero-length reads on write-only
2590          * mappings.
2591          */
2592         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2593                         !cap_zlr(iommu->cap))
2594                 prot |= DMA_PTE_READ;
2595         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2596                 prot |= DMA_PTE_WRITE;
2597         /*
2598          * paddr to (paddr + size) might span a partial page, so we map the
2599          * whole page.  Note: if two parts of one page are mapped separately,
2600          * we might have two guest addresses mapping to the same host paddr,
2601          * but this is not a big problem.
2602          */
2603         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2604                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2605         if (ret)
2606                 goto error;
2607
2608         /* it's a non-present to present mapping. Only flush if caching mode */
2609         if (cap_caching_mode(iommu->cap))
2610                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2611         else
2612                 iommu_flush_write_buffer(iommu);
2613
2614         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2615         start_paddr += paddr & ~PAGE_MASK;
2616         return start_paddr;
2617
2618 error:
2619         if (iova)
2620                 __free_iova(&domain->iovad, iova);
2621         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2622                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2623         return 0;
2624 }
2625
2626 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2627                                  unsigned long offset, size_t size,
2628                                  enum dma_data_direction dir,
2629                                  struct dma_attrs *attrs)
2630 {
2631         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2632                                   dir, to_pci_dev(dev)->dma_mask);
2633 }
2634
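/*
 * Deferred-unmap machinery: the non-strict unmap paths queue freed
 * IOVAs per IOMMU in deferred_flush[] rather than flushing the IOTLB
 * synchronously.  flush_unmaps(), called with async_umap_flush_lock
 * held, drains the queues: with caching mode it issues one
 * page-selective flush per entry (global flushes are what make
 * emulation expensive), while on real hardware it does a single global
 * IOTLB flush per IOMMU plus per-entry device-IOTLB flushes.
 */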
2635 static void flush_unmaps(void)
2636 {
2637         int i, j;
2638
2639         timer_on = 0;
2640
2641         /* just flush them all */
2642         for (i = 0; i < g_num_of_iommus; i++) {
2643                 struct intel_iommu *iommu = g_iommus[i];
2644                 if (!iommu)
2645                         continue;
2646
2647                 if (!deferred_flush[i].next)
2648                         continue;
2649
2650                 /* In caching mode, global flushes make emulation expensive */
2651                 if (!cap_caching_mode(iommu->cap))
2652                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2653                                          DMA_TLB_GLOBAL_FLUSH);
2654                 for (j = 0; j < deferred_flush[i].next; j++) {
2655                         unsigned long mask;
2656                         struct iova *iova = deferred_flush[i].iova[j];
2657                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2658
2659                         /* On real hardware multiple invalidations are expensive */
2660                         if (cap_caching_mode(iommu->cap))
2661                                 iommu_flush_iotlb_psi(iommu, domain->id,
2662                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2663                         else {
2664                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2665                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2666                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2667                         }
2668                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2669                 }
2670                 deferred_flush[i].next = 0;
2671         }
2672
2673         list_size = 0;
2674 }
2675
2676 static void flush_unmaps_timeout(unsigned long data)
2677 {
2678         unsigned long flags;
2679
2680         spin_lock_irqsave(&async_umap_flush_lock, flags);
2681         flush_unmaps();
2682         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2683 }
2684
2685 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2686 {
2687         unsigned long flags;
2688         int next, iommu_id;
2689         struct intel_iommu *iommu;
2690
2691         spin_lock_irqsave(&async_umap_flush_lock, flags);
2692         if (list_size == HIGH_WATER_MARK)
2693                 flush_unmaps();
2694
2695         iommu = domain_get_iommu(dom);
2696         iommu_id = iommu->seq_id;
2697
2698         next = deferred_flush[iommu_id].next;
2699         deferred_flush[iommu_id].domain[next] = dom;
2700         deferred_flush[iommu_id].iova[next] = iova;
2701         deferred_flush[iommu_id].next++;
2702
2703         if (!timer_on) {
2704                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2705                 timer_on = 1;
2706         }
2707         list_size++;
2708         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2709 }
2710
2711 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2712                              size_t size, enum dma_data_direction dir,
2713                              struct dma_attrs *attrs)
2714 {
2715         struct pci_dev *pdev = to_pci_dev(dev);
2716         struct dmar_domain *domain;
2717         unsigned long start_pfn, last_pfn;
2718         struct iova *iova;
2719         struct intel_iommu *iommu;
2720
2721         if (iommu_no_mapping(dev))
2722                 return;
2723
2724         domain = find_domain(pdev);
2725         BUG_ON(!domain);
2726
2727         iommu = domain_get_iommu(domain);
2728
2729         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2730         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2731                       (unsigned long long)dev_addr))
2732                 return;
2733
2734         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2735         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2736
2737         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2738                  pci_name(pdev), start_pfn, last_pfn);
2739
2740         /*  clear the whole page */
2741         dma_pte_clear_range(domain, start_pfn, last_pfn);
2742
2743         /* free page tables */
2744         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2745
2746         if (intel_iommu_strict) {
2747                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2748                                       last_pfn - start_pfn + 1, 0);
2749                 /* free iova */
2750                 __free_iova(&domain->iovad, iova);
2751         } else {
2752                 add_unmap(domain, iova);
2753                 /*
2754                  * queue up the release of the unmap to save the roughly 1/6th
2755                  * of the cpu time otherwise used up by the iotlb flush operation...
2756                  */
2757         }
2758 }
2759
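/*
 * For translated devices the GFP_DMA/GFP_DMA32 hints are dropped,
 * since the IOMMU can map any page into the device's reach; only
 * identity-mapped devices with a narrow coherent mask still need the
 * restricted zones.
 */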
2760 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2761                                   dma_addr_t *dma_handle, gfp_t flags)
2762 {
2763         void *vaddr;
2764         int order;
2765
2766         size = PAGE_ALIGN(size);
2767         order = get_order(size);
2768
2769         if (!iommu_no_mapping(hwdev))
2770                 flags &= ~(GFP_DMA | GFP_DMA32);
2771         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2772                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2773                         flags |= GFP_DMA;
2774                 else
2775                         flags |= GFP_DMA32;
2776         }
2777
2778         vaddr = (void *)__get_free_pages(flags, order);
2779         if (!vaddr)
2780                 return NULL;
2781         memset(vaddr, 0, size);
2782
2783         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2784                                          DMA_BIDIRECTIONAL,
2785                                          hwdev->coherent_dma_mask);
2786         if (*dma_handle)
2787                 return vaddr;
2788         free_pages((unsigned long)vaddr, order);
2789         return NULL;
2790 }
2791
2792 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2793                                 dma_addr_t dma_handle)
2794 {
2795         int order;
2796
2797         size = PAGE_ALIGN(size);
2798         order = get_order(size);
2799
2800         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2801         free_pages((unsigned long)vaddr, order);
2802 }
2803
2804 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2805                            int nelems, enum dma_data_direction dir,
2806                            struct dma_attrs *attrs)
2807 {
2808         struct pci_dev *pdev = to_pci_dev(hwdev);
2809         struct dmar_domain *domain;
2810         unsigned long start_pfn, last_pfn;
2811         struct iova *iova;
2812         struct intel_iommu *iommu;
2813
2814         if (iommu_no_mapping(hwdev))
2815                 return;
2816
2817         domain = find_domain(pdev);
2818         BUG_ON(!domain);
2819
2820         iommu = domain_get_iommu(domain);
2821
2822         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2823         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2824                       (unsigned long long)sglist[0].dma_address))
2825                 return;
2826
2827         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2828         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2829
2830         /*  clear the whole page */
2831         dma_pte_clear_range(domain, start_pfn, last_pfn);
2832
2833         /* free page tables */
2834         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2835
2836         if (intel_iommu_strict) {
2837                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2838                                       last_pfn - start_pfn + 1, 0);
2839                 /* free iova */
2840                 __free_iova(&domain->iovad, iova);
2841         } else {
2842                 add_unmap(domain, iova);
2843                 /*
2844                  * queue up the release of the unmap to save the roughly 1/6th
2845                  * of the cpu time otherwise used up by the iotlb flush operation...
2846                  */
2847         }
2848 }
2849
2850 static int intel_nontranslate_map_sg(struct device *hddev,
2851         struct scatterlist *sglist, int nelems, int dir)
2852 {
2853         int i;
2854         struct scatterlist *sg;
2855
2856         for_each_sg(sglist, sg, nelems, i) {
2857                 BUG_ON(!sg_page(sg));
2858                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2859                 sg->dma_length = sg->length;
2860         }
2861         return nelems;
2862 }
2863
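/*
 * Scatter-gather variant of the map path: one IOVA range big enough
 * for the whole list is allocated and domain_sg_mapping() fills in the
 * PTEs while rewriting each entry's dma_address/dma_length.
 */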
2864 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2865                         enum dma_data_direction dir, struct dma_attrs *attrs)
2866 {
2867         int i;
2868         struct pci_dev *pdev = to_pci_dev(hwdev);
2869         struct dmar_domain *domain;
2870         size_t size = 0;
2871         int prot = 0;
2872         struct iova *iova = NULL;
2873         int ret;
2874         struct scatterlist *sg;
2875         unsigned long start_vpfn;
2876         struct intel_iommu *iommu;
2877
2878         BUG_ON(dir == DMA_NONE);
2879         if (iommu_no_mapping(hwdev))
2880                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2881
2882         domain = get_valid_domain_for_dev(pdev);
2883         if (!domain)
2884                 return 0;
2885
2886         iommu = domain_get_iommu(domain);
2887
2888         for_each_sg(sglist, sg, nelems, i)
2889                 size += aligned_nrpages(sg->offset, sg->length);
2890
2891         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2892                                 pdev->dma_mask);
2893         if (!iova) {
2894                 sglist->dma_length = 0;
2895                 return 0;
2896         }
2897
2898         /*
2899          * Check if DMAR supports zero-length reads on write-only
2900          * mappings.
2901          */
2902         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2903                         !cap_zlr(iommu->cap))
2904                 prot |= DMA_PTE_READ;
2905         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2906                 prot |= DMA_PTE_WRITE;
2907
2908         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2909
2910         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2911         if (unlikely(ret)) {
2912                 /*  clear the page */
2913                 dma_pte_clear_range(domain, start_vpfn,
2914                                     start_vpfn + size - 1);
2915                 /* free page tables */
2916                 dma_pte_free_pagetable(domain, start_vpfn,
2917                                        start_vpfn + size - 1);
2918                 /* free iova */
2919                 __free_iova(&domain->iovad, iova);
2920                 return 0;
2921         }
2922
2923         /* it's a non-present to present mapping. Only flush if caching mode */
2924         if (cap_caching_mode(iommu->cap))
2925                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
2926         else
2927                 iommu_flush_write_buffer(iommu);
2928
2929         return nelems;
2930 }
2931
2932 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2933 {
2934         return !dma_addr;
2935 }
2936
2937 struct dma_map_ops intel_dma_ops = {
2938         .alloc_coherent = intel_alloc_coherent,
2939         .free_coherent = intel_free_coherent,
2940         .map_sg = intel_map_sg,
2941         .unmap_sg = intel_unmap_sg,
2942         .map_page = intel_map_page,
2943         .unmap_page = intel_unmap_page,
2944         .mapping_error = intel_mapping_error,
2945 };
2946
2947 static inline int iommu_domain_cache_init(void)
2948 {
2949         int ret = 0;
2950
2951         iommu_domain_cache = kmem_cache_create("iommu_domain",
2952                                          sizeof(struct dmar_domain),
2953                                          0,
2954                                          SLAB_HWCACHE_ALIGN,
2955                                          NULL);
2957         if (!iommu_domain_cache) {
2958                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2959                 ret = -ENOMEM;
2960         }
2961
2962         return ret;
2963 }
2964
2965 static inline int iommu_devinfo_cache_init(void)
2966 {
2967         int ret = 0;
2968
2969         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2970                                          sizeof(struct device_domain_info),
2971                                          0,
2972                                          SLAB_HWCACHE_ALIGN,
2973                                          NULL);
2974         if (!iommu_devinfo_cache) {
2975                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2976                 ret = -ENOMEM;
2977         }
2978
2979         return ret;
2980 }
2981
2982 static inline int iommu_iova_cache_init(void)
2983 {
2984         int ret = 0;
2985
2986         iommu_iova_cache = kmem_cache_create("iommu_iova",
2987                                          sizeof(struct iova),
2988                                          0,
2989                                          SLAB_HWCACHE_ALIGN,
2990                                          NULL);
2991         if (!iommu_iova_cache) {
2992                 printk(KERN_ERR "Couldn't create iova cache\n");
2993                 ret = -ENOMEM;
2994         }
2995
2996         return ret;
2997 }
2998
2999 static int __init iommu_init_mempool(void)
3000 {
3001         int ret;
3002         ret = iommu_iova_cache_init();
3003         if (ret)
3004                 return ret;
3005
3006         ret = iommu_domain_cache_init();
3007         if (ret)
3008                 goto domain_error;
3009
3010         ret = iommu_devinfo_cache_init();
3011         if (!ret)
3012                 return ret;
3013
3014         kmem_cache_destroy(iommu_domain_cache);
3015 domain_error:
3016         kmem_cache_destroy(iommu_iova_cache);
3017
3018         return -ENOMEM;
3019 }
3020
3021 static void __init iommu_exit_mempool(void)
3022 {
3023         kmem_cache_destroy(iommu_devinfo_cache);
3024         kmem_cache_destroy(iommu_domain_cache);
3025         kmem_cache_destroy(iommu_iova_cache);
3026
3027 }
3028
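/*
 * Mark DRHD units that can be ignored: units whose device scope
 * contains no present PCI devices and, when dmar_map_gfx is clear,
 * units covering only graphics devices; the devices under such units
 * are then tagged with DUMMY_DEVICE_DOMAIN_INFO so the DMA ops bypass
 * translation for them.
 */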
3029 static void __init init_no_remapping_devices(void)
3030 {
3031         struct dmar_drhd_unit *drhd;
3032
3033         for_each_drhd_unit(drhd) {
3034                 if (!drhd->include_all) {
3035                         int i;
3036                         for (i = 0; i < drhd->devices_cnt; i++)
3037                                 if (drhd->devices[i] != NULL)
3038                                         break;
3039                         /* ignore DMAR unit if no pci devices exist */
3040                         if (i == drhd->devices_cnt)
3041                                 drhd->ignored = 1;
3042                 }
3043         }
3044
3045         if (dmar_map_gfx)
3046                 return;
3047
3048         for_each_drhd_unit(drhd) {
3049                 int i;
3050                 if (drhd->ignored || drhd->include_all)
3051                         continue;
3052
3053                 for (i = 0; i < drhd->devices_cnt; i++)
3054                         if (drhd->devices[i] &&
3055                                 !IS_GFX_DEVICE(drhd->devices[i]))
3056                                 break;
3057
3058                 if (i < drhd->devices_cnt)
3059                         continue;
3060
3061                 /* bypass IOMMU if it is just for gfx devices */
3062                 drhd->ignored = 1;
3063                 for (i = 0; i < drhd->devices_cnt; i++) {
3064                         if (!drhd->devices[i])
3065                                 continue;
3066                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3067                 }
3068         }
3069 }
3070
3071 #ifdef CONFIG_SUSPEND
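     /*
      * Bring the hardware back up after suspend: re-enable queued
      * invalidation where it was in use, reload the root entry, do a global
      * context-cache and IOTLB invalidation, and re-enable translation.
      */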
3072 static int init_iommu_hw(void)
3073 {
3074         struct dmar_drhd_unit *drhd;
3075         struct intel_iommu *iommu = NULL;
3076
3077         for_each_active_iommu(iommu, drhd)
3078                 if (iommu->qi)
3079                         dmar_reenable_qi(iommu);
3080
3081         for_each_active_iommu(iommu, drhd) {
3082                 iommu_flush_write_buffer(iommu);
3083
3084                 iommu_set_root_entry(iommu);
3085
3086                 iommu->flush.flush_context(iommu, 0, 0, 0,
3087                                            DMA_CCMD_GLOBAL_INVL);
3088                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3089                                          DMA_TLB_GLOBAL_FLUSH);
3090                 iommu_enable_translation(iommu);
3091                 iommu_disable_protect_mem_regions(iommu);
3092         }
3093
3094         return 0;
3095 }
3096
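     /* Globally invalidate the context cache and IOTLB on every active IOMMU. */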
3097 static void iommu_flush_all(void)
3098 {
3099         struct dmar_drhd_unit *drhd;
3100         struct intel_iommu *iommu;
3101
3102         for_each_active_iommu(iommu, drhd) {
3103                 iommu->flush.flush_context(iommu, 0, 0, 0,
3104                                            DMA_CCMD_GLOBAL_INVL);
3105                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3106                                          DMA_TLB_GLOBAL_FLUSH);
3107         }
3108 }
3109
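     /*
      * Flush all caches, disable translation and save the fault-event
      * registers of every active IOMMU before entering suspend.
      */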
3110 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
3111 {
3112         struct dmar_drhd_unit *drhd;
3113         struct intel_iommu *iommu = NULL;
3114         unsigned long flag;
3115
3116         for_each_active_iommu(iommu, drhd) {
3117                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3118                                                  GFP_ATOMIC);
3119                 if (!iommu->iommu_state)
3120                         goto nomem;
3121         }
3122
3123         iommu_flush_all();
3124
3125         for_each_active_iommu(iommu, drhd) {
3126                 iommu_disable_translation(iommu);
3127
3128                 spin_lock_irqsave(&iommu->register_lock, flag);
3129
3130                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3131                         readl(iommu->reg + DMAR_FECTL_REG);
3132                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3133                         readl(iommu->reg + DMAR_FEDATA_REG);
3134                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3135                         readl(iommu->reg + DMAR_FEADDR_REG);
3136                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3137                         readl(iommu->reg + DMAR_FEUADDR_REG);
3138
3139                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3140         }
3141         return 0;
3142
3143 nomem:
3144         for_each_active_iommu(iommu, drhd)
3145                 kfree(iommu->iommu_state);
3146
3147         return -ENOMEM;
3148 }
3149
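     /*
      * Re-initialize the hardware via init_iommu_hw() and restore the saved
      * fault-event registers on resume.
      */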
3150 static int iommu_resume(struct sys_device *dev)
3151 {
3152         struct dmar_drhd_unit *drhd;
3153         struct intel_iommu *iommu = NULL;
3154         unsigned long flag;
3155
3156         if (init_iommu_hw()) {
3157                 WARN(1, "IOMMU setup failed, DMAR cannot resume!\n");
3158                 return -EIO;
3159         }
3160
3161         for_each_active_iommu(iommu, drhd) {
3162
3163                 spin_lock_irqsave(&iommu->register_lock, flag);
3164
3165                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3166                         iommu->reg + DMAR_FECTL_REG);
3167                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3168                         iommu->reg + DMAR_FEDATA_REG);
3169                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3170                         iommu->reg + DMAR_FEADDR_REG);
3171                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3172                         iommu->reg + DMAR_FEUADDR_REG);
3173
3174                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3175         }
3176
3177         for_each_active_iommu(iommu, drhd)
3178                 kfree(iommu->iommu_state);
3179
3180         return 0;
3181 }
3182
3183 static struct sysdev_class iommu_sysclass = {
3184         .name           = "iommu",
3185         .resume         = iommu_resume,
3186         .suspend        = iommu_suspend,
3187 };
3188
3189 static struct sys_device device_iommu = {
3190         .cls    = &iommu_sysclass,
3191 };
3192
3193 static int __init init_iommu_sysfs(void)
3194 {
3195         int error;
3196
3197         error = sysdev_class_register(&iommu_sysclass);
3198         if (error)
3199                 return error;
3200
3201         error = sysdev_register(&device_iommu);
3202         if (error)
3203                 sysdev_class_unregister(&iommu_sysclass);
3204
3205         return error;
3206 }
3207
3208 #else
3209 static int __init init_iommu_sysfs(void)
3210 {
3211         return 0;
3212 }
3213 #endif  /* CONFIG_SUSPEND */
3214
3215 /*
3216  * Here we only respond to a device being unbound from its driver.
3217  *
3218  * A newly added device is not attached to its DMAR domain here yet; that
3219  * happens when the device is mapped to an iova.
3220  */
3221 static int device_notifier(struct notifier_block *nb,
3222                                   unsigned long action, void *data)
3223 {
3224         struct device *dev = data;
3225         struct pci_dev *pdev = to_pci_dev(dev);
3226         struct dmar_domain *domain;
3227
3228         if (iommu_no_mapping(dev))
3229                 return 0;
3230
3231         domain = find_domain(pdev);
3232         if (!domain)
3233                 return 0;
3234
3235         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through)
3236                 domain_remove_one_dev_info(domain, pdev);
3237
3238         return 0;
3239 }
3240
3241 static struct notifier_block device_nb = {
3242         .notifier_call = device_notifier,
3243 };
3244
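     /*
      * Main initialization entry point: parse the DMAR table and device
      * scopes, set up the slab caches and reserved IOVA ranges, initialize
      * the DMAR units, install intel_dma_ops as the DMA API backend and
      * register the IOMMU-API ops and the PCI bus notifier.
      */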
3245 int __init intel_iommu_init(void)
3246 {
3247         int ret = 0;
3248         int force_on = 0;
3249
3250         /* VT-d is required for a TXT/tboot launch, so enforce that */
3251         force_on = tboot_force_iommu();
3252
3253         if (dmar_table_init()) {
3254                 if (force_on)
3255                         panic("tboot: Failed to initialize DMAR table\n");
3256                 return  -ENODEV;
3257         }
3258
3259         if (dmar_dev_scope_init()) {
3260                 if (force_on)
3261                         panic("tboot: Failed to initialize DMAR device scope\n");
3262                 return  -ENODEV;
3263         }
3264
3265         /*
3266          * Check the need for DMA-remapping initialization now.
3267          * The initialization above is also used by interrupt remapping.
3268          */
3269         if (no_iommu || dmar_disabled)
3270                 return -ENODEV;
3271
3272         iommu_init_mempool();
3273         dmar_init_reserved_ranges();
3274
3275         init_no_remapping_devices();
3276
3277         ret = init_dmars();
3278         if (ret) {
3279                 if (force_on)
3280                         panic("tboot: Failed to initialize DMARs\n");
3281                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3282                 put_iova_domain(&reserved_iova_list);
3283                 iommu_exit_mempool();
3284                 return ret;
3285         }
3286         printk(KERN_INFO
3287                "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3288
3289         init_timer(&unmap_timer);
3290 #ifdef CONFIG_SWIOTLB
3291         swiotlb = 0;
3292 #endif
3293         dma_ops = &intel_dma_ops;
3294
3295         init_iommu_sysfs();
3296
3297         register_iommu(&intel_iommu_ops);
3298
3299         bus_register_notifier(&pci_bus_type, &device_nb);
3300
3301         return 0;
3302 }
3303
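     /*
      * Devices behind a PCIe-to-PCI bridge issue requests with the bridge's
      * source-id, so context entries were also set up for the upstream
      * bridges; tear those down here by walking up from the device.
      */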
3304 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3305                                            struct pci_dev *pdev)
3306 {
3307         struct pci_dev *tmp, *parent;
3308
3309         if (!iommu || !pdev)
3310                 return;
3311
3312         /* dependent device detach */
3313         tmp = pci_find_upstream_pcie_bridge(pdev);
3314         /* Secondary interface's bus number and devfn 0 */
3315         if (tmp) {
3316                 parent = pdev->bus->self;
3317                 while (parent != tmp) {
3318                         iommu_detach_dev(iommu, parent->bus->number,
3319                                          parent->devfn);
3320                         parent = parent->bus->self;
3321                 }
3322                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3323                         iommu_detach_dev(iommu,
3324                                 tmp->subordinate->number, 0);
3325                 else /* this is a legacy PCI bridge */
3326                         iommu_detach_dev(iommu, tmp->bus->number,
3327                                          tmp->devfn);
3328         }
3329 }
3330
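     /*
      * Detach one device from a domain: unlink its device_domain_info, clear
      * its context entry (and those of dependent bridges) and, if it was the
      * last device on that IOMMU, drop the IOMMU from the domain's bitmap
      * and recompute the domain's capabilities.
      */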
3331 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3332                                           struct pci_dev *pdev)
3333 {
3334         struct device_domain_info *info;
3335         struct intel_iommu *iommu;
3336         unsigned long flags;
3337         int found = 0;
3338         struct list_head *entry, *tmp;
3339
3340         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3341                                 pdev->devfn);
3342         if (!iommu)
3343                 return;
3344
3345         spin_lock_irqsave(&device_domain_lock, flags);
3346         list_for_each_safe(entry, tmp, &domain->devices) {
3347                 info = list_entry(entry, struct device_domain_info, link);
3348                 /* No need to compare PCI domain; it has to be the same */
3349                 if (info->bus == pdev->bus->number &&
3350                     info->devfn == pdev->devfn) {
3351                         list_del(&info->link);
3352                         list_del(&info->global);
3353                         if (info->dev)
3354                                 info->dev->dev.archdata.iommu = NULL;
3355                         spin_unlock_irqrestore(&device_domain_lock, flags);
3356
3357                         iommu_disable_dev_iotlb(info);
3358                         iommu_detach_dev(iommu, info->bus, info->devfn);
3359                         iommu_detach_dependent_devices(iommu, pdev);
3360                         free_devinfo_mem(info);
3361
3362                         spin_lock_irqsave(&device_domain_lock, flags);
3363
3364                         if (found)
3365                                 break;
3366                         else
3367                                 continue;
3368                 }
3369
3370                 /* if there are no other devices under the same iommu
3371                  * owned by this domain, clear this iommu in iommu_bmp,
3372                  * update the iommu count and coherency
3373                  */
3374                 if (iommu == device_to_iommu(info->segment, info->bus,
3375                                             info->devfn))
3376                         found = 1;
3377         }
3378
3379         if (found == 0) {
3380                 unsigned long tmp_flags;
3381                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3382                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3383                 domain->iommu_count--;
3384                 domain_update_iommu_cap(domain);
3385                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3386         }
3387
3388         spin_unlock_irqrestore(&device_domain_lock, flags);
3389 }
3390
3391 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3392 {
3393         struct device_domain_info *info;
3394         struct intel_iommu *iommu;
3395         unsigned long flags1, flags2;
3396
3397         spin_lock_irqsave(&device_domain_lock, flags1);
3398         while (!list_empty(&domain->devices)) {
3399                 info = list_entry(domain->devices.next,
3400                         struct device_domain_info, link);
3401                 list_del(&info->link);
3402                 list_del(&info->global);
3403                 if (info->dev)
3404                         info->dev->dev.archdata.iommu = NULL;
3405
3406                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3407
3408                 iommu_disable_dev_iotlb(info);
3409                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3410                 iommu_detach_dev(iommu, info->bus, info->devfn);
3411                 iommu_detach_dependent_devices(iommu, info->dev);
3412
3413                 /* clear this iommu in iommu_bmp, update iommu count
3414                  * and capabilities
3415                  */
3416                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3417                 if (test_and_clear_bit(iommu->seq_id,
3418                                        &domain->iommu_bmp)) {
3419                         domain->iommu_count--;
3420                         domain_update_iommu_cap(domain);
3421                 }
3422                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3423
3424                 free_devinfo_mem(info);
3425                 spin_lock_irqsave(&device_domain_lock, flags1);
3426         }
3427         spin_unlock_irqrestore(&device_domain_lock, flags1);
3428 }
3429
3430 /* domain id for virtual machine; it won't be set in context entries */
3431 static unsigned long vm_domid;
3432
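     /* Return the smallest AGAW supported by any IOMMU this domain spans. */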
3433 static int vm_domain_min_agaw(struct dmar_domain *domain)
3434 {
3435         int i;
3436         int min_agaw = domain->agaw;
3437
3438         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
3439                 if (min_agaw > g_iommus[i]->agaw)
3440                         min_agaw = g_iommus[i]->agaw;
3441         }
3442
3443         return min_agaw;
3444 }
3445
3446 static struct dmar_domain *iommu_alloc_vm_domain(void)
3447 {
3448         struct dmar_domain *domain;
3449
3450         domain = alloc_domain_mem();
3451         if (!domain)
3452                 return NULL;
3453
3454         domain->id = vm_domid++;
3455         domain->nid = -1;
3456         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3457         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3458
3459         return domain;
3460 }
3461
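     /*
      * Minimal domain setup for the IOMMU-API path: reserve the special IOVA
      * ranges, size the domain for the requested guest width and allocate
      * the top-level page directory.
      */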
3462 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3463 {
3464         int adjust_width;
3465
3466         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3467         spin_lock_init(&domain->iommu_lock);
3468
3469         domain_reserve_special_ranges(domain);
3470
3471         /* calculate AGAW */
3472         domain->gaw = guest_width;
3473         adjust_width = guestwidth_to_adjustwidth(guest_width);
3474         domain->agaw = width_to_agaw(adjust_width);
3475
3476         INIT_LIST_HEAD(&domain->devices);
3477
3478         domain->iommu_count = 0;
3479         domain->iommu_coherency = 0;
3480         domain->iommu_snooping = 0;
3481         domain->max_addr = 0;
3482         domain->nid = -1;
3483
3484         /* always allocate the top pgd */
3485         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3486         if (!domain->pgd)
3487                 return -ENOMEM;
3488         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3489         return 0;
3490 }
3491
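     /* Release every per-IOMMU domain id still referring to this domain. */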
3492 static void iommu_free_vm_domain(struct dmar_domain *domain)
3493 {
3494         unsigned long flags;
3495         struct dmar_drhd_unit *drhd;
3496         struct intel_iommu *iommu;
3497         unsigned long i;
3498         unsigned long ndomains;
3499
3500         for_each_drhd_unit(drhd) {
3501                 if (drhd->ignored)
3502                         continue;
3503                 iommu = drhd->iommu;
3504
3505                 ndomains = cap_ndoms(iommu->cap);
3506                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3507                         if (iommu->domains[i] == domain) {
3508                                 spin_lock_irqsave(&iommu->lock, flags);
3509                                 clear_bit(i, iommu->domain_ids);
3510                                 iommu->domains[i] = NULL;
3511                                 spin_unlock_irqrestore(&iommu->lock, flags);
3512                                 break;
3513                         }
3514                 }
3515         }
3516 }
3517
3518 static void vm_domain_exit(struct dmar_domain *domain)
3519 {
3520         /* Domain 0 is reserved, so don't process it */
3521         if (!domain)
3522                 return;
3523
3524         vm_domain_remove_all_dev_info(domain);
3525         /* destroy iovas */
3526         put_iova_domain(&domain->iovad);
3527
3528         /* clear ptes */
3529         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3530
3531         /* free page tables */
3532         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3533
3534         iommu_free_vm_domain(domain);
3535         free_domain_mem(domain);
3536 }
3537
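     /* The callbacks below implement the generic IOMMU API (intel_iommu_ops). */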
3538 static int intel_iommu_domain_init(struct iommu_domain *domain)
3539 {
3540         struct dmar_domain *dmar_domain;
3541
3542         dmar_domain = iommu_alloc_vm_domain();
3543         if (!dmar_domain) {
3544                 printk(KERN_ERR
3545                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3546                 return -ENOMEM;
3547         }
3548         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3549                 printk(KERN_ERR
3550                         "intel_iommu_domain_init() failed\n");
3551                 vm_domain_exit(dmar_domain);
3552                 return -ENOMEM;
3553         }
3554         domain->priv = dmar_domain;
3555
3556         return 0;
3557 }
3558
3559 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3560 {
3561         struct dmar_domain *dmar_domain = domain->priv;
3562
3563         domain->priv = NULL;
3564         vm_domain_exit(dmar_domain);
3565 }
3566
3567 static int intel_iommu_attach_device(struct iommu_domain *domain,
3568                                      struct device *dev)
3569 {
3570         struct dmar_domain *dmar_domain = domain->priv;
3571         struct pci_dev *pdev = to_pci_dev(dev);
3572         struct intel_iommu *iommu;
3573         int addr_width;
3574         u64 end;
3575
3576         /* normally pdev is not mapped */
3577         if (unlikely(domain_context_mapped(pdev))) {
3578                 struct dmar_domain *old_domain;
3579
3580                 old_domain = find_domain(pdev);
3581                 if (old_domain) {
3582                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3583                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3584                                 domain_remove_one_dev_info(old_domain, pdev);
3585                         else
3586                                 domain_remove_dev_info(old_domain);
3587                 }
3588         }
3589
3590         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3591                                 pdev->devfn);
3592         if (!iommu)
3593                 return -ENODEV;
3594
3595         /* check if this iommu agaw is sufficient for max mapped address */
3596         addr_width = agaw_to_width(iommu->agaw);
3597         end = DOMAIN_MAX_ADDR(addr_width);
3598         end = end & VTD_PAGE_MASK;
3599         if (end < dmar_domain->max_addr) {
3600                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3601                        "sufficient for the mapped address (%llx)\n",
3602                        __func__, iommu->agaw, dmar_domain->max_addr);
3603                 return -EFAULT;
3604         }
3605
3606         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3607 }
3608
3609 static void intel_iommu_detach_device(struct iommu_domain *domain,
3610                                       struct device *dev)
3611 {
3612         struct dmar_domain *dmar_domain = domain->priv;
3613         struct pci_dev *pdev = to_pci_dev(dev);
3614
3615         domain_remove_one_dev_info(dmar_domain, pdev);
3616 }
3617
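     /*
      * Map [iova, iova + size) to hpa for an IOMMU-API domain.  The mapping
      * is refused if it would exceed what the smallest AGAW among the
      * attached IOMMUs can address; otherwise max_addr is grown to cover it.
      */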
3618 static int intel_iommu_map_range(struct iommu_domain *domain,
3619                                  unsigned long iova, phys_addr_t hpa,
3620                                  size_t size, int iommu_prot)
3621 {
3622         struct dmar_domain *dmar_domain = domain->priv;
3623         u64 max_addr;
3624         int addr_width;
3625         int prot = 0;
3626         int ret;
3627
3628         if (iommu_prot & IOMMU_READ)
3629                 prot |= DMA_PTE_READ;
3630         if (iommu_prot & IOMMU_WRITE)
3631                 prot |= DMA_PTE_WRITE;
3632         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3633                 prot |= DMA_PTE_SNP;
3634
3635         max_addr = iova + size;
3636         if (dmar_domain->max_addr < max_addr) {
3637                 int min_agaw;
3638                 u64 end;
3639
3640                 /* check if minimum agaw is sufficient for mapped address */
3641                 min_agaw = vm_domain_min_agaw(dmar_domain);
3642                 addr_width = agaw_to_width(min_agaw);
3643                 end = DOMAIN_MAX_ADDR(addr_width);
3644                 end = end & VTD_PAGE_MASK;
3645                 if (end < max_addr) {
3646                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3647                                "sufficient for the mapped address (%llx)\n",
3648                                __func__, min_agaw, max_addr);
3649                         return -EFAULT;
3650                 }
3651                 dmar_domain->max_addr = max_addr;
3652         }
3653         /* Round up size to next multiple of PAGE_SIZE, if it and
3654            the low bits of hpa would take us onto the next page */
3655         size = aligned_nrpages(hpa, size);
3656         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3657                                  hpa >> VTD_PAGE_SHIFT, size, prot);
3658         return ret;
3659 }
3660
3661 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3662                                     unsigned long iova, size_t size)
3663 {
3664         struct dmar_domain *dmar_domain = domain->priv;
3665
3666         if (!size)
3667                 return;
3668
3669         dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3670                             (iova + size - 1) >> VTD_PAGE_SHIFT);
3671
3672         if (dmar_domain->max_addr == iova + size)
3673                 dmar_domain->max_addr = iova;
3674 }
3675
3676 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3677                                             unsigned long iova)
3678 {
3679         struct dmar_domain *dmar_domain = domain->priv;
3680         struct dma_pte *pte;
3681         u64 phys = 0;
3682
3683         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3684         if (pte)
3685                 phys = dma_pte_addr(pte);
3686
3687         return phys;
3688 }
3689
3690 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3691                                       unsigned long cap)
3692 {
3693         struct dmar_domain *dmar_domain = domain->priv;
3694
3695         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3696                 return dmar_domain->iommu_snooping;
3697
3698         return 0;
3699 }
3700
3701 static struct iommu_ops intel_iommu_ops = {
3702         .domain_init    = intel_iommu_domain_init,
3703         .domain_destroy = intel_iommu_domain_destroy,
3704         .attach_dev     = intel_iommu_attach_device,
3705         .detach_dev     = intel_iommu_detach_device,
3706         .map            = intel_iommu_map_range,
3707         .unmap          = intel_iommu_unmap_range,
3708         .iova_to_phys   = intel_iommu_iova_to_phys,
3709         .domain_has_cap = intel_iommu_domain_has_cap,
3710 };
3711
3712 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3713 {
3714         /*
3715          * Mobile 4 Series Chipset neglects to set RWBF capability,
3716          * but needs it:
3717          */
3718         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3719         rwbf_quirk = 1;
3720 }
3721
3722 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3723
3724 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3725    ISOCH DMAR unit for the Azalia sound device, but not give it any
3726    TLB entries, which causes it to deadlock. Check for that.  We do
3727    this in a function called from init_dmars(), instead of in a PCI
3728    quirk, because we don't want to print the obnoxious "BIOS broken"
3729    message if VT-d is actually disabled.
3730 */
3731 static void __init check_tylersburg_isoch(void)
3732 {
3733         struct pci_dev *pdev;
3734         uint32_t vtisochctrl;
3735
3736         /* If there's no Azalia in the system anyway, forget it. */
3737         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3738         if (!pdev)
3739                 return;
3740         pci_dev_put(pdev);
3741
3742         /* System Management Registers. Might be hidden, in which case
3743            we can't do the sanity check. But that's OK, because the
3744            known-broken BIOSes _don't_ actually hide it, so far. */
3745         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3746         if (!pdev)
3747                 return;
3748
3749         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3750                 pci_dev_put(pdev);
3751                 return;
3752         }
3753
3754         pci_dev_put(pdev);
3755
3756         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3757         if (vtisochctrl & 1)
3758                 return;
3759
3760         /* Drop all bits other than the number of TLB entries */
3761         vtisochctrl &= 0x1c;
3762
3763         /* If we have the recommended number of TLB entries (16), fine. */
3764         if (vtisochctrl == 0x10)
3765                 return;
3766
3767         /* Zero TLB entries? You get to ride the short bus to school. */
3768         if (!vtisochctrl) {
3769                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3770                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3771                      dmi_get_system_info(DMI_BIOS_VENDOR),
3772                      dmi_get_system_info(DMI_BIOS_VERSION),
3773                      dmi_get_system_info(DMI_PRODUCT_VERSION));
3774                 iommu_identity_mapping |= IDENTMAP_AZALIA;
3775                 return;
3776         }
3777
3778         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3779                vtisochctrl);
3780 }