intel-iommu: use for_each_set_bit()
[platform/kernel/linux-starfive.git] / drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
45
46 #define ROOT_SIZE               VTD_PAGE_SIZE
47 #define CONTEXT_SIZE            VTD_PAGE_SIZE
48
49 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
50 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
51 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
52
53 #define IOAPIC_RANGE_START      (0xfee00000)
54 #define IOAPIC_RANGE_END        (0xfeefffff)
55 #define IOVA_START_ADDR         (0x1000)
56
57 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
58
59 #define MAX_AGAW_WIDTH 64
60
61 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
62 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
63
64 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
65    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
66 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
67                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
68 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
69
70 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
71 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
72 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
73
74
75 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
76    are never going to work. */
77 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
78 {
79         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
80 }
81
82 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
83 {
84         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
85 }
86 static inline unsigned long page_to_dma_pfn(struct page *pg)
87 {
88         return mm_to_dma_pfn(page_to_pfn(pg));
89 }
90 static inline unsigned long virt_to_dma_pfn(void *p)
91 {
92         return page_to_dma_pfn(virt_to_page(p));
93 }
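
/*
 * Worked example (illustrative): how the PFN helpers above behave under
 * two assumed configurations. On x86, PAGE_SHIFT and VTD_PAGE_SHIFT are
 * both 12, so the shift is zero and MM and DMA PFNs coincide:
 *
 *     mm_to_dma_pfn(0x1234) == 0x1234;
 *     dma_to_mm_pfn(0x1234) == 0x1234;
 *
 * On a hypothetical architecture with 64KiB MM pages (PAGE_SHIFT == 16)
 * and 4KiB VT-d pages, one MM page covers 16 VT-d pages:
 *
 *     mm_to_dma_pfn(1)  == 16;
 *     dma_to_mm_pfn(16) == 1;    dma_to_mm_pfn(31) == 1;
 */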
94
95 /* global iommu list, set NULL for ignored DMAR units */
96 static struct intel_iommu **g_iommus;
97
98 static void __init check_tylersburg_isoch(void);
99 static int rwbf_quirk;
100
101 /*
102  * 0: Present
103  * 1-11: Reserved
104  * 12-63: Context Ptr (12 - (haw-1))
105  * 64-127: Reserved
106  */
107 struct root_entry {
108         u64     val;
109         u64     rsvd1;
110 };
111 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
112 static inline bool root_present(struct root_entry *root)
113 {
114         return (root->val & 1);
115 }
116 static inline void set_root_present(struct root_entry *root)
117 {
118         root->val |= 1;
119 }
120 static inline void set_root_value(struct root_entry *root, unsigned long value)
121 {
122         root->val |= value & VTD_PAGE_MASK;
123 }
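
/*
 * Worked example (illustrative): composing a root entry for a context
 * table whose physical address is, say, 0x12345000, assuming the entry
 * starts out zeroed:
 *
 *     set_root_value(root, 0x12345000);   // val == 0x0000000012345000
 *     set_root_present(root);             // val == 0x0000000012345001
 *
 * get_context_addr_from_root() then returns phys_to_virt(0x12345000)
 * because the present bit (bit 0) is set.
 */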
124
125 static inline struct context_entry *
126 get_context_addr_from_root(struct root_entry *root)
127 {
128         return (struct context_entry *)
129                 (root_present(root)?phys_to_virt(
130                 root->val & VTD_PAGE_MASK) :
131                 NULL);
132 }
133
134 /*
135  * low 64 bits:
136  * 0: present
137  * 1: fault processing disable
138  * 2-3: translation type
139  * 12-63: address space root
140  * high 64 bits:
141  * 0-2: address width
142  * 3-6: avail
143  * 8-23: domain id
144  */
145 struct context_entry {
146         u64 lo;
147         u64 hi;
148 };
149
150 static inline bool context_present(struct context_entry *context)
151 {
152         return (context->lo & 1);
153 }
154 static inline void context_set_present(struct context_entry *context)
155 {
156         context->lo |= 1;
157 }
158
159 static inline void context_set_fault_enable(struct context_entry *context)
160 {
161         context->lo &= (((u64)-1) << 2) | 1;
162 }
163
164 static inline void context_set_translation_type(struct context_entry *context,
165                                                 unsigned long value)
166 {
167         context->lo &= (((u64)-1) << 4) | 3;
168         context->lo |= (value & 3) << 2;
169 }
170
171 static inline void context_set_address_root(struct context_entry *context,
172                                             unsigned long value)
173 {
174         context->lo |= value & VTD_PAGE_MASK;
175 }
176
177 static inline void context_set_address_width(struct context_entry *context,
178                                              unsigned long value)
179 {
180         context->hi |= value & 7;
181 }
182
183 static inline void context_set_domain_id(struct context_entry *context,
184                                          unsigned long value)
185 {
186         context->hi |= (value & ((1 << 16) - 1)) << 8;
187 }
188
189 static inline void context_clear_entry(struct context_entry *context)
190 {
191         context->lo = 0;
192         context->hi = 0;
193 }
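
/*
 * Worked example (illustrative): building a context entry for domain id 5
 * with a 4-level (agaw == 2) page table rooted at physical 0xabcd0000,
 * assuming the entry starts out zeroed and a translation type of 0
 * (multi-level):
 *
 *     context_clear_entry(c);
 *     context_set_domain_id(c, 5);               // hi == 0x0000000000000500
 *     context_set_address_width(c, 2);           // hi == 0x0000000000000502
 *     context_set_address_root(c, 0xabcd0000);   // lo == 0x00000000abcd0000
 *     context_set_translation_type(c, 0);        // bits 2-3 stay clear
 *     context_set_fault_enable(c);               // bit 1 stays clear
 *     context_set_present(c);                    // lo == 0x00000000abcd0001
 */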
194
195 /*
196  * 0: readable
197  * 1: writable
198  * 2-6: reserved
199  * 7: super page
200  * 8-10: available
201  * 11: snoop behavior
202  * 12-63: Host physical address
203  */
204 struct dma_pte {
205         u64 val;
206 };
207
208 static inline void dma_clear_pte(struct dma_pte *pte)
209 {
210         pte->val = 0;
211 }
212
213 static inline void dma_set_pte_readable(struct dma_pte *pte)
214 {
215         pte->val |= DMA_PTE_READ;
216 }
217
218 static inline void dma_set_pte_writable(struct dma_pte *pte)
219 {
220         pte->val |= DMA_PTE_WRITE;
221 }
222
223 static inline void dma_set_pte_snp(struct dma_pte *pte)
224 {
225         pte->val |= DMA_PTE_SNP;
226 }
227
228 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
229 {
230         pte->val = (pte->val & ~3) | (prot & 3);
231 }
232
233 static inline u64 dma_pte_addr(struct dma_pte *pte)
234 {
235 #ifdef CONFIG_64BIT
236         return pte->val & VTD_PAGE_MASK;
237 #else
238         /* Must have a full atomic 64-bit read */
239         return  __cmpxchg64(pte, 0ULL, 0ULL) & VTD_PAGE_MASK;
240 #endif
241 }
242
243 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
244 {
245         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
246 }
247
248 static inline bool dma_pte_present(struct dma_pte *pte)
249 {
250         return (pte->val & 3) != 0;
251 }
252
253 static inline int first_pte_in_page(struct dma_pte *pte)
254 {
255         return !((unsigned long)pte & ~VTD_PAGE_MASK);
256 }
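
/*
 * Worked example (illustrative): filling a leaf PTE so that it points at
 * host pfn 0x1000 with read/write permission, assuming DMA_PTE_READ is
 * bit 0 and DMA_PTE_WRITE is bit 1:
 *
 *     dma_clear_pte(pte);              // val == 0
 *     dma_set_pte_readable(pte);       // val == 0x1
 *     dma_set_pte_writable(pte);       // val == 0x3
 *     dma_set_pte_pfn(pte, 0x1000);    // val == 0x0000000001000003
 *
 *     dma_pte_present(pte);            // true: low two bits are non-zero
 *     dma_pte_addr(pte);               // 0x1000000
 */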
257
258 /*
259  * This domain is a static identity mapping domain.
260  *      1. This domain creates a static 1:1 mapping to all usable memory.
261  *      2. It maps to each iommu if successful.
262  *      3. Each iommu maps to this domain if successful.
263  */
264 static struct dmar_domain *si_domain;
265 static int hw_pass_through = 1;
266
267 /* devices under the same p2p bridge are owned in one domain */
268 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
269
270 /* domain represents a virtual machine; more than one device
271  * across iommus may be owned in one domain, e.g. a kvm guest.
272  */
273 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
274
275 /* si_domain contains multiple devices */
276 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
277
278 struct dmar_domain {
279         int     id;                     /* domain id */
280         int     nid;                    /* node id */
281         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
282
283         struct list_head devices;       /* all devices' list */
284         struct iova_domain iovad;       /* iova's that belong to this domain */
285
286         struct dma_pte  *pgd;           /* virtual address */
287         int             gaw;            /* max guest address width */
288
289         /* adjusted guest address width, 0 is level 2 30-bit */
290         int             agaw;
291
292         int             flags;          /* flags to find out type of domain */
293
294         int             iommu_coherency;/* indicate coherency of iommu access */
295         int             iommu_snooping; /* indicate snooping control feature*/
296         int             iommu_count;    /* reference count of iommu */
297         spinlock_t      iommu_lock;     /* protect iommu set in domain */
298         u64             max_addr;       /* maximum mapped address */
299 };
300
301 /* PCI domain-device relationship */
302 struct device_domain_info {
303         struct list_head link;  /* link to domain siblings */
304         struct list_head global; /* link to global list */
305         int segment;            /* PCI domain */
306         u8 bus;                 /* PCI bus number */
307         u8 devfn;               /* PCI devfn number */
308         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
309         struct intel_iommu *iommu; /* IOMMU used by this device */
310         struct dmar_domain *domain; /* pointer to domain */
311 };
312
313 static void flush_unmaps_timeout(unsigned long data);
314
315 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
316
317 #define HIGH_WATER_MARK 250
318 struct deferred_flush_tables {
319         int next;
320         struct iova *iova[HIGH_WATER_MARK];
321         struct dmar_domain *domain[HIGH_WATER_MARK];
322 };
323
324 static struct deferred_flush_tables *deferred_flush;
325
326 /* number of registered intel_iommus, used to size per-iommu bitmaps */
327 static int g_num_of_iommus;
328
329 static DEFINE_SPINLOCK(async_umap_flush_lock);
330 static LIST_HEAD(unmaps_to_do);
331
332 static int timer_on;
333 static long list_size;
334
335 static void domain_remove_dev_info(struct dmar_domain *domain);
336
337 #ifdef CONFIG_DMAR_DEFAULT_ON
338 int dmar_disabled = 0;
339 #else
340 int dmar_disabled = 1;
341 #endif /*CONFIG_DMAR_DEFAULT_ON*/
342
343 static int __initdata dmar_map_gfx = 1;
344 static int dmar_forcedac;
345 static int intel_iommu_strict;
346
347 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
348 static DEFINE_SPINLOCK(device_domain_lock);
349 static LIST_HEAD(device_domain_list);
350
351 static struct iommu_ops intel_iommu_ops;
352
353 static int __init intel_iommu_setup(char *str)
354 {
355         if (!str)
356                 return -EINVAL;
357         while (*str) {
358                 if (!strncmp(str, "on", 2)) {
359                         dmar_disabled = 0;
360                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
361                 } else if (!strncmp(str, "off", 3)) {
362                         dmar_disabled = 1;
363                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
364                 } else if (!strncmp(str, "igfx_off", 8)) {
365                         dmar_map_gfx = 0;
366                         printk(KERN_INFO
367                                 "Intel-IOMMU: disable GFX device mapping\n");
368                 } else if (!strncmp(str, "forcedac", 8)) {
369                         printk(KERN_INFO
370                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
371                         dmar_forcedac = 1;
372                 } else if (!strncmp(str, "strict", 6)) {
373                         printk(KERN_INFO
374                                 "Intel-IOMMU: disable batched IOTLB flush\n");
375                         intel_iommu_strict = 1;
376                 }
377
378                 str += strcspn(str, ",");
379                 while (*str == ',')
380                         str++;
381         }
382         return 0;
383 }
384 __setup("intel_iommu=", intel_iommu_setup);
385
386 static struct kmem_cache *iommu_domain_cache;
387 static struct kmem_cache *iommu_devinfo_cache;
388 static struct kmem_cache *iommu_iova_cache;
389
390 static inline void *alloc_pgtable_page(int node)
391 {
392         struct page *page;
393         void *vaddr = NULL;
394
395         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
396         if (page)
397                 vaddr = page_address(page);
398         return vaddr;
399 }
400
401 static inline void free_pgtable_page(void *vaddr)
402 {
403         free_page((unsigned long)vaddr);
404 }
405
406 static inline void *alloc_domain_mem(void)
407 {
408         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
409 }
410
411 static void free_domain_mem(void *vaddr)
412 {
413         kmem_cache_free(iommu_domain_cache, vaddr);
414 }
415
416 static inline void * alloc_devinfo_mem(void)
417 {
418         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
419 }
420
421 static inline void free_devinfo_mem(void *vaddr)
422 {
423         kmem_cache_free(iommu_devinfo_cache, vaddr);
424 }
425
426 struct iova *alloc_iova_mem(void)
427 {
428         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
429 }
430
431 void free_iova_mem(struct iova *iova)
432 {
433         kmem_cache_free(iommu_iova_cache, iova);
434 }
435
436
437 static inline int width_to_agaw(int width);
438
439 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
440 {
441         unsigned long sagaw;
442         int agaw = -1;
443
444         sagaw = cap_sagaw(iommu->cap);
445         for (agaw = width_to_agaw(max_gaw);
446              agaw >= 0; agaw--) {
447                 if (test_bit(agaw, &sagaw))
448                         break;
449         }
450
451         return agaw;
452 }
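
/*
 * Worked example (illustrative): cap_sagaw() yields a small bitmap of the
 * AGAWs the hardware supports (roughly: bit 1 == 3-level/39-bit, bit 2 ==
 * 4-level/48-bit). If sagaw == 0x6 (bits 1 and 2 set) and max_gaw == 48,
 * then width_to_agaw(48) == 2 and bit 2 is set, so the loop above returns
 * agaw 2 immediately. If the hardware only set bit 1, the loop would walk
 * down from 2 and return agaw 1 (a 39-bit, 3-level table).
 */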
453
454 /*
455  * Calculate max SAGAW for each iommu.
456  */
457 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
458 {
459         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
460 }
461
462 /*
463  * calculate agaw for each iommu.
464  * "SAGAW" may be different across iommus, use a default agaw, and
465  * get a supported less agaw for iommus that don't support the default agaw.
466  */
467 int iommu_calculate_agaw(struct intel_iommu *iommu)
468 {
469         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
470 }
471
472 /* This function only returns a single iommu in a domain */
473 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
474 {
475         int iommu_id;
476
477         /* si_domain and vm domain should not get here. */
478         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
479         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
480
481         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
482         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
483                 return NULL;
484
485         return g_iommus[iommu_id];
486 }
487
488 static void domain_update_iommu_coherency(struct dmar_domain *domain)
489 {
490         int i;
491
492         domain->iommu_coherency = 1;
493
494         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
495                 if (!ecap_coherent(g_iommus[i]->ecap)) {
496                         domain->iommu_coherency = 0;
497                         break;
498                 }
499         }
500 }
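
/*
 * The for_each_set_bit() iteration above (and in the other bitmap walks
 * in this file) is shorthand for the open-coded pattern it replaced,
 * roughly:
 *
 *     for (i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
 *          i < g_num_of_iommus;
 *          i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i + 1))
 *
 * i.e. it visits only the indices whose bits are set, in ascending order.
 */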
501
502 static void domain_update_iommu_snooping(struct dmar_domain *domain)
503 {
504         int i;
505
506         domain->iommu_snooping = 1;
507
508         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
509                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
510                         domain->iommu_snooping = 0;
511                         break;
512                 }
513         }
514 }
515
516 /* Some capabilities may be different across iommus */
517 static void domain_update_iommu_cap(struct dmar_domain *domain)
518 {
519         domain_update_iommu_coherency(domain);
520         domain_update_iommu_snooping(domain);
521 }
522
523 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
524 {
525         struct dmar_drhd_unit *drhd = NULL;
526         int i;
527
528         for_each_drhd_unit(drhd) {
529                 if (drhd->ignored)
530                         continue;
531                 if (segment != drhd->segment)
532                         continue;
533
534                 for (i = 0; i < drhd->devices_cnt; i++) {
535                         if (drhd->devices[i] &&
536                             drhd->devices[i]->bus->number == bus &&
537                             drhd->devices[i]->devfn == devfn)
538                                 return drhd->iommu;
539                         if (drhd->devices[i] &&
540                             drhd->devices[i]->subordinate &&
541                             drhd->devices[i]->subordinate->number <= bus &&
542                             drhd->devices[i]->subordinate->subordinate >= bus)
543                                 return drhd->iommu;
544                 }
545
546                 if (drhd->include_all)
547                         return drhd->iommu;
548         }
549
550         return NULL;
551 }
552
553 static void domain_flush_cache(struct dmar_domain *domain,
554                                void *addr, int size)
555 {
556         if (!domain->iommu_coherency)
557                 clflush_cache_range(addr, size);
558 }
559
560 /* Gets context entry for a given bus and devfn */
561 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
562                 u8 bus, u8 devfn)
563 {
564         struct root_entry *root;
565         struct context_entry *context;
566         unsigned long phy_addr;
567         unsigned long flags;
568
569         spin_lock_irqsave(&iommu->lock, flags);
570         root = &iommu->root_entry[bus];
571         context = get_context_addr_from_root(root);
572         if (!context) {
573                 context = (struct context_entry *)
574                                 alloc_pgtable_page(iommu->node);
575                 if (!context) {
576                         spin_unlock_irqrestore(&iommu->lock, flags);
577                         return NULL;
578                 }
579                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
580                 phy_addr = virt_to_phys((void *)context);
581                 set_root_value(root, phy_addr);
582                 set_root_present(root);
583                 __iommu_flush_cache(iommu, root, sizeof(*root));
584         }
585         spin_unlock_irqrestore(&iommu->lock, flags);
586         return &context[devfn];
587 }
588
589 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
590 {
591         struct root_entry *root;
592         struct context_entry *context;
593         int ret;
594         unsigned long flags;
595
596         spin_lock_irqsave(&iommu->lock, flags);
597         root = &iommu->root_entry[bus];
598         context = get_context_addr_from_root(root);
599         if (!context) {
600                 ret = 0;
601                 goto out;
602         }
603         ret = context_present(&context[devfn]);
604 out:
605         spin_unlock_irqrestore(&iommu->lock, flags);
606         return ret;
607 }
608
609 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
610 {
611         struct root_entry *root;
612         struct context_entry *context;
613         unsigned long flags;
614
615         spin_lock_irqsave(&iommu->lock, flags);
616         root = &iommu->root_entry[bus];
617         context = get_context_addr_from_root(root);
618         if (context) {
619                 context_clear_entry(&context[devfn]);
620                 __iommu_flush_cache(iommu, &context[devfn], \
621                         sizeof(*context));
622         }
623         spin_unlock_irqrestore(&iommu->lock, flags);
624 }
625
626 static void free_context_table(struct intel_iommu *iommu)
627 {
628         struct root_entry *root;
629         int i;
630         unsigned long flags;
631         struct context_entry *context;
632
633         spin_lock_irqsave(&iommu->lock, flags);
634         if (!iommu->root_entry) {
635                 goto out;
636         }
637         for (i = 0; i < ROOT_ENTRY_NR; i++) {
638                 root = &iommu->root_entry[i];
639                 context = get_context_addr_from_root(root);
640                 if (context)
641                         free_pgtable_page(context);
642         }
643         free_pgtable_page(iommu->root_entry);
644         iommu->root_entry = NULL;
645 out:
646         spin_unlock_irqrestore(&iommu->lock, flags);
647 }
648
649 /* page table handling */
650 #define LEVEL_STRIDE            (9)
651 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
652
653 static inline int agaw_to_level(int agaw)
654 {
655         return agaw + 2;
656 }
657
658 static inline int agaw_to_width(int agaw)
659 {
660         return 30 + agaw * LEVEL_STRIDE;
661
662 }
663
664 static inline int width_to_agaw(int width)
665 {
666         return (width - 30) / LEVEL_STRIDE;
667 }
668
669 static inline unsigned int level_to_offset_bits(int level)
670 {
671         return (level - 1) * LEVEL_STRIDE;
672 }
673
674 static inline int pfn_level_offset(unsigned long pfn, int level)
675 {
676         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
677 }
678
679 static inline unsigned long level_mask(int level)
680 {
681         return -1UL << level_to_offset_bits(level);
682 }
683
684 static inline unsigned long level_size(int level)
685 {
686         return 1UL << level_to_offset_bits(level);
687 }
688
689 static inline unsigned long align_to_level(unsigned long pfn, int level)
690 {
691         return (pfn + level_size(level) - 1) & level_mask(level);
692 }
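
/*
 * Worked example (illustrative): with agaw == 2 the domain uses a 4-level
 * table covering a 48-bit address space (agaw_to_level(2) == 4,
 * agaw_to_width(2) == 48). Each level indexes 9 bits of the DMA pfn, so
 * for pfn 0x12345 (IOVA 0x12345000 with 4KiB VT-d pages):
 *
 *     pfn_level_offset(0x12345, 1) == 0x145;   // bits 0-8
 *     pfn_level_offset(0x12345, 2) == 0x91;    // bits 9-17
 *     pfn_level_offset(0x12345, 3) == 0;
 *     pfn_level_offset(0x12345, 4) == 0;
 *
 *     level_size(2)              == 512;       // pfns per level-2 entry
 *     align_to_level(0x12345, 2) == 0x12400;   // next 512-pfn boundary
 */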
693
694 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
695                                       unsigned long pfn)
696 {
697         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
698         struct dma_pte *parent, *pte = NULL;
699         int level = agaw_to_level(domain->agaw);
700         int offset;
701
702         BUG_ON(!domain->pgd);
703         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
704         parent = domain->pgd;
705
706         while (level > 0) {
707                 void *tmp_page;
708
709                 offset = pfn_level_offset(pfn, level);
710                 pte = &parent[offset];
711                 if (level == 1)
712                         break;
713
714                 if (!dma_pte_present(pte)) {
715                         uint64_t pteval;
716
717                         tmp_page = alloc_pgtable_page(domain->nid);
718
719                         if (!tmp_page)
720                                 return NULL;
721
722                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
723                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
724                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
725                                 /* Someone else set it while we were thinking; use theirs. */
726                                 free_pgtable_page(tmp_page);
727                         } else {
728                                 dma_pte_addr(pte);
729                                 domain_flush_cache(domain, pte, sizeof(*pte));
730                         }
731                 }
732                 parent = phys_to_virt(dma_pte_addr(pte));
733                 level--;
734         }
735
736         return pte;
737 }
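
/*
 * Simplified sketch (illustrative, not the actual mapping path): a caller
 * that wanted to map a single page could fetch the leaf slot with
 * pfn_to_dma_pte() and fill it with the PTE helpers above, e.g.:
 *
 *     struct dma_pte *pte = pfn_to_dma_pte(domain, iov_pfn);
 *     if (pte) {
 *             dma_set_pte_pfn(pte, phys_pfn);
 *             dma_set_pte_prot(pte, DMA_PTE_READ | DMA_PTE_WRITE);
 *             domain_flush_cache(domain, pte, sizeof(*pte));
 *     }
 *
 * The real mapping path (__domain_mapping(), below) does this over whole
 * pfn ranges rather than one page at a time.
 */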
738
739 /* return the address's pte at a specific level */
740 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
741                                          unsigned long pfn,
742                                          int level)
743 {
744         struct dma_pte *parent, *pte = NULL;
745         int total = agaw_to_level(domain->agaw);
746         int offset;
747
748         parent = domain->pgd;
749         while (level <= total) {
750                 offset = pfn_level_offset(pfn, total);
751                 pte = &parent[offset];
752                 if (level == total)
753                         return pte;
754
755                 if (!dma_pte_present(pte))
756                         break;
757                 parent = phys_to_virt(dma_pte_addr(pte));
758                 total--;
759         }
760         return NULL;
761 }
762
763 /* clear last level pte; a tlb flush should follow */
764 static void dma_pte_clear_range(struct dmar_domain *domain,
765                                 unsigned long start_pfn,
766                                 unsigned long last_pfn)
767 {
768         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
769         struct dma_pte *first_pte, *pte;
770
771         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
772         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
773         BUG_ON(start_pfn > last_pfn);
774
775         /* we don't need lock here; nobody else touches the iova range */
776         do {
777                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
778                 if (!pte) {
779                         start_pfn = align_to_level(start_pfn + 1, 2);
780                         continue;
781                 }
782                 do { 
783                         dma_clear_pte(pte);
784                         start_pfn++;
785                         pte++;
786                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
787
788                 domain_flush_cache(domain, first_pte,
789                                    (void *)pte - (void *)first_pte);
790
791         } while (start_pfn && start_pfn <= last_pfn);
792 }
793
794 /* free page table pages. last level pte should already be cleared */
795 static void dma_pte_free_pagetable(struct dmar_domain *domain,
796                                    unsigned long start_pfn,
797                                    unsigned long last_pfn)
798 {
799         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
800         struct dma_pte *first_pte, *pte;
801         int total = agaw_to_level(domain->agaw);
802         int level;
803         unsigned long tmp;
804
805         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
806         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
807         BUG_ON(start_pfn > last_pfn);
808
809         /* We don't need lock here; nobody else touches the iova range */
810         level = 2;
811         while (level <= total) {
812                 tmp = align_to_level(start_pfn, level);
813
814                 /* If we can't even clear one PTE at this level, we're done */
815                 if (tmp + level_size(level) - 1 > last_pfn)
816                         return;
817
818                 do {
819                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
820                         if (!pte) {
821                                 tmp = align_to_level(tmp + 1, level + 1);
822                                 continue;
823                         }
824                         do {
825                                 if (dma_pte_present(pte)) {
826                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
827                                         dma_clear_pte(pte);
828                                 }
829                                 pte++;
830                                 tmp += level_size(level);
831                         } while (!first_pte_in_page(pte) &&
832                                  tmp + level_size(level) - 1 <= last_pfn);
833
834                         domain_flush_cache(domain, first_pte,
835                                            (void *)pte - (void *)first_pte);
836                         
837                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
838                 level++;
839         }
840         /* free pgd */
841         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
842                 free_pgtable_page(domain->pgd);
843                 domain->pgd = NULL;
844         }
845 }
846
847 /* iommu handling */
848 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
849 {
850         struct root_entry *root;
851         unsigned long flags;
852
853         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
854         if (!root)
855                 return -ENOMEM;
856
857         __iommu_flush_cache(iommu, root, ROOT_SIZE);
858
859         spin_lock_irqsave(&iommu->lock, flags);
860         iommu->root_entry = root;
861         spin_unlock_irqrestore(&iommu->lock, flags);
862
863         return 0;
864 }
865
866 static void iommu_set_root_entry(struct intel_iommu *iommu)
867 {
868         void *addr;
869         u32 sts;
870         unsigned long flag;
871
872         addr = iommu->root_entry;
873
874         spin_lock_irqsave(&iommu->register_lock, flag);
875         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
876
877         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
878
879         /* Make sure hardware completes it */
880         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
881                       readl, (sts & DMA_GSTS_RTPS), sts);
882
883         spin_unlock_irqrestore(&iommu->register_lock, flag);
884 }
885
886 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
887 {
888         u32 val;
889         unsigned long flag;
890
891         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
892                 return;
893
894         spin_lock_irqsave(&iommu->register_lock, flag);
895         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
896
897         /* Make sure hardware completes it */
898         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
899                       readl, (!(val & DMA_GSTS_WBFS)), val);
900
901         spin_unlock_irqrestore(&iommu->register_lock, flag);
902 }
903
904 /* the return value determines if we need a write buffer flush */
905 static void __iommu_flush_context(struct intel_iommu *iommu,
906                                   u16 did, u16 source_id, u8 function_mask,
907                                   u64 type)
908 {
909         u64 val = 0;
910         unsigned long flag;
911
912         switch (type) {
913         case DMA_CCMD_GLOBAL_INVL:
914                 val = DMA_CCMD_GLOBAL_INVL;
915                 break;
916         case DMA_CCMD_DOMAIN_INVL:
917                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
918                 break;
919         case DMA_CCMD_DEVICE_INVL:
920                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
921                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
922                 break;
923         default:
924                 BUG();
925         }
926         val |= DMA_CCMD_ICC;
927
928         spin_lock_irqsave(&iommu->register_lock, flag);
929         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
930
931         /* Make sure hardware completes it */
932         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
933                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
934
935         spin_unlock_irqrestore(&iommu->register_lock, flag);
936 }
937
938 /* the return value determines if we need a write buffer flush */
939 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
940                                 u64 addr, unsigned int size_order, u64 type)
941 {
942         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
943         u64 val = 0, val_iva = 0;
944         unsigned long flag;
945
946         switch (type) {
947         case DMA_TLB_GLOBAL_FLUSH:
948                 /* global flush doesn't need to set IVA_REG */
949                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
950                 break;
951         case DMA_TLB_DSI_FLUSH:
952                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
953                 break;
954         case DMA_TLB_PSI_FLUSH:
955                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
956                 /* Note: always flush non-leaf currently */
957                 val_iva = size_order | addr;
958                 break;
959         default:
960                 BUG();
961         }
962         /* Note: set drain read/write */
963 #if 0
964         /*
965          * This is probably meant to be extra safe. Looks like we can
966          * ignore it without any impact.
967          */
968         if (cap_read_drain(iommu->cap))
969                 val |= DMA_TLB_READ_DRAIN;
970 #endif
971         if (cap_write_drain(iommu->cap))
972                 val |= DMA_TLB_WRITE_DRAIN;
973
974         spin_lock_irqsave(&iommu->register_lock, flag);
975         /* Note: Only uses first TLB reg currently */
976         if (val_iva)
977                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
978         dmar_writeq(iommu->reg + tlb_offset + 8, val);
979
980         /* Make sure hardware completes it */
981         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
982                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
983
984         spin_unlock_irqrestore(&iommu->register_lock, flag);
985
986         /* check IOTLB invalidation granularity */
987         if (DMA_TLB_IAIG(val) == 0)
988                 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
989         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
990                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
991                         (unsigned long long)DMA_TLB_IIRG(type),
992                         (unsigned long long)DMA_TLB_IAIG(val));
993 }
994
995 static struct device_domain_info *iommu_support_dev_iotlb(
996         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
997 {
998         int found = 0;
999         unsigned long flags;
1000         struct device_domain_info *info;
1001         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1002
1003         if (!ecap_dev_iotlb_support(iommu->ecap))
1004                 return NULL;
1005
1006         if (!iommu->qi)
1007                 return NULL;
1008
1009         spin_lock_irqsave(&device_domain_lock, flags);
1010         list_for_each_entry(info, &domain->devices, link)
1011                 if (info->bus == bus && info->devfn == devfn) {
1012                         found = 1;
1013                         break;
1014                 }
1015         spin_unlock_irqrestore(&device_domain_lock, flags);
1016
1017         if (!found || !info->dev)
1018                 return NULL;
1019
1020         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1021                 return NULL;
1022
1023         if (!dmar_find_matched_atsr_unit(info->dev))
1024                 return NULL;
1025
1026         info->iommu = iommu;
1027
1028         return info;
1029 }
1030
1031 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1032 {
1033         if (!info)
1034                 return;
1035
1036         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1037 }
1038
1039 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1040 {
1041         if (!info->dev || !pci_ats_enabled(info->dev))
1042                 return;
1043
1044         pci_disable_ats(info->dev);
1045 }
1046
1047 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1048                                   u64 addr, unsigned mask)
1049 {
1050         u16 sid, qdep;
1051         unsigned long flags;
1052         struct device_domain_info *info;
1053
1054         spin_lock_irqsave(&device_domain_lock, flags);
1055         list_for_each_entry(info, &domain->devices, link) {
1056                 if (!info->dev || !pci_ats_enabled(info->dev))
1057                         continue;
1058
1059                 sid = info->bus << 8 | info->devfn;
1060                 qdep = pci_ats_queue_depth(info->dev);
1061                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1062         }
1063         spin_unlock_irqrestore(&device_domain_lock, flags);
1064 }
1065
1066 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1067                                   unsigned long pfn, unsigned int pages)
1068 {
1069         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1070         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1071
1072         BUG_ON(pages == 0);
1073
1074         /*
1075          * Fallback to domain selective flush if no PSI support or the size is
1076          * too big.
1077          * PSI requires the region size to be a power of 2 pages, and the base
1078          * address to be naturally aligned to that size.
1079          */
1080         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1081                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1082                                                 DMA_TLB_DSI_FLUSH);
1083         else
1084                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1085                                                 DMA_TLB_PSI_FLUSH);
1086
1087         /*
1088          * In caching mode, domain ID 0 is reserved for non-present to present
1089          * mapping flush. Device IOTLB doesn't need to be flushed in this case.
1090          */
1091         if (!cap_caching_mode(iommu->cap) || did)
1092                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1093 }
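
/*
 * Worked example (illustrative): the PSI mask is the log2 of the page
 * count rounded up to a power of two, so
 *
 *     pages == 1  ->  mask == 0   (one 4KiB page invalidated)
 *     pages == 9  ->  mask == 4   (a 16-page / 64KiB aligned region)
 *
 * If that mask exceeds cap_max_amask_val(), or PSI isn't supported at
 * all, the code above falls back to a domain-selective flush.
 */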
1094
1095 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1096 {
1097         u32 pmen;
1098         unsigned long flags;
1099
1100         spin_lock_irqsave(&iommu->register_lock, flags);
1101         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1102         pmen &= ~DMA_PMEN_EPM;
1103         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1104
1105         /* wait for the protected region status bit to clear */
1106         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1107                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1108
1109         spin_unlock_irqrestore(&iommu->register_lock, flags);
1110 }
1111
1112 static int iommu_enable_translation(struct intel_iommu *iommu)
1113 {
1114         u32 sts;
1115         unsigned long flags;
1116
1117         spin_lock_irqsave(&iommu->register_lock, flags);
1118         iommu->gcmd |= DMA_GCMD_TE;
1119         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1120
1121         /* Make sure hardware completes it */
1122         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1123                       readl, (sts & DMA_GSTS_TES), sts);
1124
1125         spin_unlock_irqrestore(&iommu->register_lock, flags);
1126         return 0;
1127 }
1128
1129 static int iommu_disable_translation(struct intel_iommu *iommu)
1130 {
1131         u32 sts;
1132         unsigned long flag;
1133
1134         spin_lock_irqsave(&iommu->register_lock, flag);
1135         iommu->gcmd &= ~DMA_GCMD_TE;
1136         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1137
1138         /* Make sure hardware completes it */
1139         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1140                       readl, (!(sts & DMA_GSTS_TES)), sts);
1141
1142         spin_unlock_irqrestore(&iommu->register_lock, flag);
1143         return 0;
1144 }
1145
1146
1147 static int iommu_init_domains(struct intel_iommu *iommu)
1148 {
1149         unsigned long ndomains;
1150         unsigned long nlongs;
1151
1152         ndomains = cap_ndoms(iommu->cap);
1153         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1154         nlongs = BITS_TO_LONGS(ndomains);
1155
1156         spin_lock_init(&iommu->lock);
1157
1158         /* TBD: there might be 64K domains,
1159          * consider another allocation scheme for future chips
1160          */
1161         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1162         if (!iommu->domain_ids) {
1163                 printk(KERN_ERR "Allocating domain id array failed\n");
1164                 return -ENOMEM;
1165         }
1166         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1167                         GFP_KERNEL);
1168         if (!iommu->domains) {
1169                 printk(KERN_ERR "Allocating domain array failed\n");
1170                 return -ENOMEM;
1171         }
1172
1173         /*
1174          * if Caching mode is set, then invalid translations are tagged
1175          * with domain id 0. Hence we need to pre-allocate it.
1176          */
1177         if (cap_caching_mode(iommu->cap))
1178                 set_bit(0, iommu->domain_ids);
1179         return 0;
1180 }
1181
1182
1183 static void domain_exit(struct dmar_domain *domain);
1184 static void vm_domain_exit(struct dmar_domain *domain);
1185
1186 void free_dmar_iommu(struct intel_iommu *iommu)
1187 {
1188         struct dmar_domain *domain;
1189         int i;
1190         unsigned long flags;
1191
1192         if ((iommu->domains) && (iommu->domain_ids)) {
1193                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1194                         domain = iommu->domains[i];
1195                         clear_bit(i, iommu->domain_ids);
1196
1197                         spin_lock_irqsave(&domain->iommu_lock, flags);
1198                         if (--domain->iommu_count == 0) {
1199                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1200                                         vm_domain_exit(domain);
1201                                 else
1202                                         domain_exit(domain);
1203                         }
1204                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1205                 }
1206         }
1207
1208         if (iommu->gcmd & DMA_GCMD_TE)
1209                 iommu_disable_translation(iommu);
1210
1211         if (iommu->irq) {
1212                 set_irq_data(iommu->irq, NULL);
1213                 /* This will mask the irq */
1214                 free_irq(iommu->irq, iommu);
1215                 destroy_irq(iommu->irq);
1216         }
1217
1218         kfree(iommu->domains);
1219         kfree(iommu->domain_ids);
1220
1221         g_iommus[iommu->seq_id] = NULL;
1222
1223         /* if all iommus are freed, free g_iommus */
1224         for (i = 0; i < g_num_of_iommus; i++) {
1225                 if (g_iommus[i])
1226                         break;
1227         }
1228
1229         if (i == g_num_of_iommus)
1230                 kfree(g_iommus);
1231
1232         /* free context mapping */
1233         free_context_table(iommu);
1234 }
1235
1236 static struct dmar_domain *alloc_domain(void)
1237 {
1238         struct dmar_domain *domain;
1239
1240         domain = alloc_domain_mem();
1241         if (!domain)
1242                 return NULL;
1243
1244         domain->nid = -1;
1245         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1246         domain->flags = 0;
1247
1248         return domain;
1249 }
1250
1251 static int iommu_attach_domain(struct dmar_domain *domain,
1252                                struct intel_iommu *iommu)
1253 {
1254         int num;
1255         unsigned long ndomains;
1256         unsigned long flags;
1257
1258         ndomains = cap_ndoms(iommu->cap);
1259
1260         spin_lock_irqsave(&iommu->lock, flags);
1261
1262         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1263         if (num >= ndomains) {
1264                 spin_unlock_irqrestore(&iommu->lock, flags);
1265                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1266                 return -ENOMEM;
1267         }
1268
1269         domain->id = num;
1270         set_bit(num, iommu->domain_ids);
1271         set_bit(iommu->seq_id, &domain->iommu_bmp);
1272         iommu->domains[num] = domain;
1273         spin_unlock_irqrestore(&iommu->lock, flags);
1274
1275         return 0;
1276 }
1277
1278 static void iommu_detach_domain(struct dmar_domain *domain,
1279                                 struct intel_iommu *iommu)
1280 {
1281         unsigned long flags;
1282         int num, ndomains;
1283         int found = 0;
1284
1285         spin_lock_irqsave(&iommu->lock, flags);
1286         ndomains = cap_ndoms(iommu->cap);
1287         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1288                 if (iommu->domains[num] == domain) {
1289                         found = 1;
1290                         break;
1291                 }
1292         }
1293
1294         if (found) {
1295                 clear_bit(num, iommu->domain_ids);
1296                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1297                 iommu->domains[num] = NULL;
1298         }
1299         spin_unlock_irqrestore(&iommu->lock, flags);
1300 }
1301
1302 static struct iova_domain reserved_iova_list;
1303 static struct lock_class_key reserved_rbtree_key;
1304
1305 static void dmar_init_reserved_ranges(void)
1306 {
1307         struct pci_dev *pdev = NULL;
1308         struct iova *iova;
1309         int i;
1310
1311         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1312
1313         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1314                 &reserved_rbtree_key);
1315
1316         /* IOAPIC ranges shouldn't be accessed by DMA */
1317         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1318                 IOVA_PFN(IOAPIC_RANGE_END));
1319         if (!iova)
1320                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1321
1322         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1323         for_each_pci_dev(pdev) {
1324                 struct resource *r;
1325
1326                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1327                         r = &pdev->resource[i];
1328                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1329                                 continue;
1330                         iova = reserve_iova(&reserved_iova_list,
1331                                             IOVA_PFN(r->start),
1332                                             IOVA_PFN(r->end));
1333                         if (!iova)
1334                                 printk(KERN_ERR "Reserve iova failed\n");
1335                 }
1336         }
1337
1338 }
1339
1340 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1341 {
1342         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1343 }
1344
1345 static inline int guestwidth_to_adjustwidth(int gaw)
1346 {
1347         int agaw;
1348         int r = (gaw - 12) % 9;
1349
1350         if (r == 0)
1351                 agaw = gaw;
1352         else
1353                 agaw = gaw + 9 - r;
1354         if (agaw > 64)
1355                 agaw = 64;
1356         return agaw;
1357 }
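
/*
 * Worked example (illustrative): the adjusted width is the guest width
 * rounded up so that (gaw - 12) is a multiple of 9, i.e. so it fits a
 * whole number of 9-bit page-table levels above the 4KiB page offset:
 *
 *     guestwidth_to_adjustwidth(48) == 48;   // (48-12) % 9 == 0
 *     guestwidth_to_adjustwidth(40) == 48;   // r == 1, so 40 + 9 - 1
 *     guestwidth_to_adjustwidth(39) == 39;
 */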
1358
1359 static int domain_init(struct dmar_domain *domain, int guest_width)
1360 {
1361         struct intel_iommu *iommu;
1362         int adjust_width, agaw;
1363         unsigned long sagaw;
1364
1365         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1366         spin_lock_init(&domain->iommu_lock);
1367
1368         domain_reserve_special_ranges(domain);
1369
1370         /* calculate AGAW */
1371         iommu = domain_get_iommu(domain);
1372         if (guest_width > cap_mgaw(iommu->cap))
1373                 guest_width = cap_mgaw(iommu->cap);
1374         domain->gaw = guest_width;
1375         adjust_width = guestwidth_to_adjustwidth(guest_width);
1376         agaw = width_to_agaw(adjust_width);
1377         sagaw = cap_sagaw(iommu->cap);
1378         if (!test_bit(agaw, &sagaw)) {
1379                 /* hardware doesn't support it, choose a bigger one */
1380                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1381                 agaw = find_next_bit(&sagaw, 5, agaw);
1382                 if (agaw >= 5)
1383                         return -ENODEV;
1384         }
1385         domain->agaw = agaw;
1386         INIT_LIST_HEAD(&domain->devices);
1387
1388         if (ecap_coherent(iommu->ecap))
1389                 domain->iommu_coherency = 1;
1390         else
1391                 domain->iommu_coherency = 0;
1392
1393         if (ecap_sc_support(iommu->ecap))
1394                 domain->iommu_snooping = 1;
1395         else
1396                 domain->iommu_snooping = 0;
1397
1398         domain->iommu_count = 1;
1399         domain->nid = iommu->node;
1400
1401         /* always allocate the top pgd */
1402         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1403         if (!domain->pgd)
1404                 return -ENOMEM;
1405         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1406         return 0;
1407 }
1408
1409 static void domain_exit(struct dmar_domain *domain)
1410 {
1411         struct dmar_drhd_unit *drhd;
1412         struct intel_iommu *iommu;
1413
1414         /* Domain 0 is reserved, so don't process it */
1415         if (!domain)
1416                 return;
1417
1418         domain_remove_dev_info(domain);
1419         /* destroy iovas */
1420         put_iova_domain(&domain->iovad);
1421
1422         /* clear ptes */
1423         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1424
1425         /* free page tables */
1426         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1427
1428         for_each_active_iommu(iommu, drhd)
1429                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1430                         iommu_detach_domain(domain, iommu);
1431
1432         free_domain_mem(domain);
1433 }
1434
1435 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1436                                  u8 bus, u8 devfn, int translation)
1437 {
1438         struct context_entry *context;
1439         unsigned long flags;
1440         struct intel_iommu *iommu;
1441         struct dma_pte *pgd;
1442         unsigned long num;
1443         unsigned long ndomains;
1444         int id;
1445         int agaw;
1446         struct device_domain_info *info = NULL;
1447
1448         pr_debug("Set context mapping for %02x:%02x.%d\n",
1449                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1450
1451         BUG_ON(!domain->pgd);
1452         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1453                translation != CONTEXT_TT_MULTI_LEVEL);
1454
1455         iommu = device_to_iommu(segment, bus, devfn);
1456         if (!iommu)
1457                 return -ENODEV;
1458
1459         context = device_to_context_entry(iommu, bus, devfn);
1460         if (!context)
1461                 return -ENOMEM;
1462         spin_lock_irqsave(&iommu->lock, flags);
1463         if (context_present(context)) {
1464                 spin_unlock_irqrestore(&iommu->lock, flags);
1465                 return 0;
1466         }
1467
1468         id = domain->id;
1469         pgd = domain->pgd;
1470
1471         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1472             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1473                 int found = 0;
1474
1475                 /* find an available domain id for this device in iommu */
1476                 ndomains = cap_ndoms(iommu->cap);
1477                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1478                         if (iommu->domains[num] == domain) {
1479                                 id = num;
1480                                 found = 1;
1481                                 break;
1482                         }
1483                 }
1484
1485                 if (found == 0) {
1486                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1487                         if (num >= ndomains) {
1488                                 spin_unlock_irqrestore(&iommu->lock, flags);
1489                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1490                                 return -EFAULT;
1491                         }
1492
1493                         set_bit(num, iommu->domain_ids);
1494                         iommu->domains[num] = domain;
1495                         id = num;
1496                 }
1497
1498                 /* Skip top levels of page tables for an
1499                  * iommu which has a smaller agaw than the default.
1500                  * Unnecessary for PT mode.
1501                  */
1502                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1503                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1504                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1505                                 if (!dma_pte_present(pgd)) {
1506                                         spin_unlock_irqrestore(&iommu->lock, flags);
1507                                         return -ENOMEM;
1508                                 }
1509                         }
1510                 }
1511         }
1512
1513         context_set_domain_id(context, id);
1514
1515         if (translation != CONTEXT_TT_PASS_THROUGH) {
1516                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1517                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1518                                      CONTEXT_TT_MULTI_LEVEL;
1519         }
1520         /*
1521          * In pass through mode, AW must be programmed to indicate the largest
1522          * AGAW value supported by hardware. And ASR is ignored by hardware.
1523          */
1524         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1525                 context_set_address_width(context, iommu->msagaw);
1526         else {
1527                 context_set_address_root(context, virt_to_phys(pgd));
1528                 context_set_address_width(context, iommu->agaw);
1529         }
1530
1531         context_set_translation_type(context, translation);
1532         context_set_fault_enable(context);
1533         context_set_present(context);
1534         domain_flush_cache(domain, context, sizeof(*context));
1535
1536         /*
1537          * It's a non-present to present mapping. If hardware doesn't cache
1538          * non-present entries, we only need to flush the write-buffer. If it
1539          * _does_ cache non-present entries, then it does so in the special
1540          * domain #0, which we have to flush:
1541          */
1542         if (cap_caching_mode(iommu->cap)) {
1543                 iommu->flush.flush_context(iommu, 0,
1544                                            (((u16)bus) << 8) | devfn,
1545                                            DMA_CCMD_MASK_NOBIT,
1546                                            DMA_CCMD_DEVICE_INVL);
1547                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH);
1548         } else {
1549                 iommu_flush_write_buffer(iommu);
1550         }
1551         iommu_enable_dev_iotlb(info);
1552         spin_unlock_irqrestore(&iommu->lock, flags);
1553
1554         spin_lock_irqsave(&domain->iommu_lock, flags);
1555         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1556                 domain->iommu_count++;
1557                 if (domain->iommu_count == 1)
1558                         domain->nid = iommu->node;
1559                 domain_update_iommu_cap(domain);
1560         }
1561         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1562         return 0;
1563 }
1564
1565 static int
1566 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1567                         int translation)
1568 {
1569         int ret;
1570         struct pci_dev *tmp, *parent;
1571
1572         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1573                                          pdev->bus->number, pdev->devfn,
1574                                          translation);
1575         if (ret)
1576                 return ret;
1577
1578         /* dependent device mapping */
1579         tmp = pci_find_upstream_pcie_bridge(pdev);
1580         if (!tmp)
1581                 return 0;
1582         /* Secondary interface's bus number and devfn 0 */
1583         parent = pdev->bus->self;
1584         while (parent != tmp) {
1585                 ret = domain_context_mapping_one(domain,
1586                                                  pci_domain_nr(parent->bus),
1587                                                  parent->bus->number,
1588                                                  parent->devfn, translation);
1589                 if (ret)
1590                         return ret;
1591                 parent = parent->bus->self;
1592         }
1593         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1594                 return domain_context_mapping_one(domain,
1595                                         pci_domain_nr(tmp->subordinate),
1596                                         tmp->subordinate->number, 0,
1597                                         translation);
1598         else /* this is a legacy PCI bridge */
1599                 return domain_context_mapping_one(domain,
1600                                                   pci_domain_nr(tmp->bus),
1601                                                   tmp->bus->number,
1602                                                   tmp->devfn,
1603                                                   translation);
1604 }
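
/*
 * Illustrative sketch of the walk above (hypothetical topology, not taken
 * from the source): for a conventional PCI device 0000:05:02.0 sitting
 * behind a PCIe-to-PCI bridge at 0000:00:1e.0 whose secondary bus is 5,
 * domain_context_mapping() programs context entries for
 *
 *      - 05:02.0 itself,
 *      - every P2P bridge walked via parent->bus->self up to 00:1e.0,
 *      - (bus 5, devfn 0), the PCIe bridge's secondary interface,
 *
 * so DMA tagged with any of those source-ids hits the same domain.
 */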
1605
1606 static int domain_context_mapped(struct pci_dev *pdev)
1607 {
1608         int ret;
1609         struct pci_dev *tmp, *parent;
1610         struct intel_iommu *iommu;
1611
1612         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1613                                 pdev->devfn);
1614         if (!iommu)
1615                 return -ENODEV;
1616
1617         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1618         if (!ret)
1619                 return ret;
1620         /* dependent device mapping */
1621         tmp = pci_find_upstream_pcie_bridge(pdev);
1622         if (!tmp)
1623                 return ret;
1624         /* Secondary interface's bus number and devfn 0 */
1625         parent = pdev->bus->self;
1626         while (parent != tmp) {
1627                 ret = device_context_mapped(iommu, parent->bus->number,
1628                                             parent->devfn);
1629                 if (!ret)
1630                         return ret;
1631                 parent = parent->bus->self;
1632         }
1633         if (pci_is_pcie(tmp))
1634                 return device_context_mapped(iommu, tmp->subordinate->number,
1635                                              0);
1636         else
1637                 return device_context_mapped(iommu, tmp->bus->number,
1638                                              tmp->devfn);
1639 }
1640
1641 /* Returns a number of VTD pages, but aligned to MM page size */
1642 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1643                                             size_t size)
1644 {
1645         host_addr &= ~PAGE_MASK;
1646         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1647 }
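
/*
 * Worked example (illustrative numbers only): with 4KiB MM pages and 4KiB
 * VT-d pages, aligned_nrpages(0x800, 0x1000) keeps the 0x800 in-page
 * offset, rounds 0x800 + 0x1000 up to 0x2000 and returns 2 VT-d pages.
 * With 64KiB MM pages the same request rounds up to 0x10000 and returns
 * 16 VT-d pages -- the count is always aligned to the MM page size.
 */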
1648
1649 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1650                             struct scatterlist *sg, unsigned long phys_pfn,
1651                             unsigned long nr_pages, int prot)
1652 {
1653         struct dma_pte *first_pte = NULL, *pte = NULL;
1654         phys_addr_t uninitialized_var(pteval);
1655         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1656         unsigned long sg_res;
1657
1658         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1659
1660         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1661                 return -EINVAL;
1662
1663         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1664
1665         if (sg)
1666                 sg_res = 0;
1667         else {
1668                 sg_res = nr_pages + 1;
1669                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1670         }
1671
1672         while (nr_pages--) {
1673                 uint64_t tmp;
1674
1675                 if (!sg_res) {
1676                         sg_res = aligned_nrpages(sg->offset, sg->length);
1677                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1678                         sg->dma_length = sg->length;
1679                         pteval = page_to_phys(sg_page(sg)) | prot;
1680                 }
1681                 if (!pte) {
1682                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1683                         if (!pte)
1684                                 return -ENOMEM;
1685                 }
1686                 /* We don't need a lock here; nobody else
1687                  * touches the iova range
1688                  */
1689                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1690                 if (tmp) {
1691                         static int dumps = 5;
1692                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1693                                iov_pfn, tmp, (unsigned long long)pteval);
1694                         if (dumps) {
1695                                 dumps--;
1696                                 debug_dma_dump_mappings(NULL);
1697                         }
1698                         WARN_ON(1);
1699                 }
1700                 pte++;
1701                 if (!nr_pages || first_pte_in_page(pte)) {
1702                         domain_flush_cache(domain, first_pte,
1703                                            (void *)pte - (void *)first_pte);
1704                         pte = NULL;
1705                 }
1706                 iov_pfn++;
1707                 pteval += VTD_PAGE_SIZE;
1708                 sg_res--;
1709                 if (!sg_res)
1710                         sg = sg_next(sg);
1711         }
1712         return 0;
1713 }
1714
1715 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1716                                     struct scatterlist *sg, unsigned long nr_pages,
1717                                     int prot)
1718 {
1719         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1720 }
1721
1722 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1723                                      unsigned long phys_pfn, unsigned long nr_pages,
1724                                      int prot)
1725 {
1726         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1727 }
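
/*
 * Usage sketch for the two wrappers above (values are placeholders):
 *
 *      domain_pfn_mapping(domain, iov_pfn, phys_pfn, npages, prot);
 *              one physically contiguous run, e.g. an RMRR identity map
 *
 *      domain_sg_mapping(domain, iov_pfn, sglist, npages, prot);
 *              one run of PTEs per scatterlist element, as intel_map_sg()
 *              does below
 *
 * In both cases iov_pfn and npages are VT-d page frames/counts, not MM
 * pages.
 */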
1728
1729 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1730 {
1731         if (!iommu)
1732                 return;
1733
1734         clear_context_table(iommu, bus, devfn);
1735         iommu->flush.flush_context(iommu, 0, 0, 0,
1736                                            DMA_CCMD_GLOBAL_INVL);
1737         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1738 }
1739
1740 static void domain_remove_dev_info(struct dmar_domain *domain)
1741 {
1742         struct device_domain_info *info;
1743         unsigned long flags;
1744         struct intel_iommu *iommu;
1745
1746         spin_lock_irqsave(&device_domain_lock, flags);
1747         while (!list_empty(&domain->devices)) {
1748                 info = list_entry(domain->devices.next,
1749                         struct device_domain_info, link);
1750                 list_del(&info->link);
1751                 list_del(&info->global);
1752                 if (info->dev)
1753                         info->dev->dev.archdata.iommu = NULL;
1754                 spin_unlock_irqrestore(&device_domain_lock, flags);
1755
1756                 iommu_disable_dev_iotlb(info);
1757                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1758                 iommu_detach_dev(iommu, info->bus, info->devfn);
1759                 free_devinfo_mem(info);
1760
1761                 spin_lock_irqsave(&device_domain_lock, flags);
1762         }
1763         spin_unlock_irqrestore(&device_domain_lock, flags);
1764 }
1765
1766 /*
1767  * find_domain
1768  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1769  */
1770 static struct dmar_domain *
1771 find_domain(struct pci_dev *pdev)
1772 {
1773         struct device_domain_info *info;
1774
1775         /* No lock here, assumes no domain exit in normal case */
1776         info = pdev->dev.archdata.iommu;
1777         if (info)
1778                 return info->domain;
1779         return NULL;
1780 }
1781
1782 /* domain is initialized */
1783 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1784 {
1785         struct dmar_domain *domain, *found = NULL;
1786         struct intel_iommu *iommu;
1787         struct dmar_drhd_unit *drhd;
1788         struct device_domain_info *info, *tmp;
1789         struct pci_dev *dev_tmp;
1790         unsigned long flags;
1791         int bus = 0, devfn = 0;
1792         int segment;
1793         int ret;
1794
1795         domain = find_domain(pdev);
1796         if (domain)
1797                 return domain;
1798
1799         segment = pci_domain_nr(pdev->bus);
1800
1801         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1802         if (dev_tmp) {
1803                 if (pci_is_pcie(dev_tmp)) {
1804                         bus = dev_tmp->subordinate->number;
1805                         devfn = 0;
1806                 } else {
1807                         bus = dev_tmp->bus->number;
1808                         devfn = dev_tmp->devfn;
1809                 }
1810                 spin_lock_irqsave(&device_domain_lock, flags);
1811                 list_for_each_entry(info, &device_domain_list, global) {
1812                         if (info->segment == segment &&
1813                             info->bus == bus && info->devfn == devfn) {
1814                                 found = info->domain;
1815                                 break;
1816                         }
1817                 }
1818                 spin_unlock_irqrestore(&device_domain_lock, flags);
1819                 /* pcie-pci bridge already has a domain, use it */
1820                 if (found) {
1821                         domain = found;
1822                         goto found_domain;
1823                 }
1824         }
1825
1826         domain = alloc_domain();
1827         if (!domain)
1828                 goto error;
1829
1830         /* Allocate new domain for the device */
1831         drhd = dmar_find_matched_drhd_unit(pdev);
1832         if (!drhd) {
1833                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1834                         pci_name(pdev));
1835                 return NULL;
1836         }
1837         iommu = drhd->iommu;
1838
1839         ret = iommu_attach_domain(domain, iommu);
1840         if (ret) {
1841                 domain_exit(domain);
1842                 goto error;
1843         }
1844
1845         if (domain_init(domain, gaw)) {
1846                 domain_exit(domain);
1847                 goto error;
1848         }
1849
1850         /* register pcie-to-pci device */
1851         if (dev_tmp) {
1852                 info = alloc_devinfo_mem();
1853                 if (!info) {
1854                         domain_exit(domain);
1855                         goto error;
1856                 }
1857                 info->segment = segment;
1858                 info->bus = bus;
1859                 info->devfn = devfn;
1860                 info->dev = NULL;
1861                 info->domain = domain;
1862                 /* This domain is shared by devices under p2p bridge */
1863                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1864
1865                 /* pcie-to-pci bridge already has a domain, use it */
1866                 found = NULL;
1867                 spin_lock_irqsave(&device_domain_lock, flags);
1868                 list_for_each_entry(tmp, &device_domain_list, global) {
1869                         if (tmp->segment == segment &&
1870                             tmp->bus == bus && tmp->devfn == devfn) {
1871                                 found = tmp->domain;
1872                                 break;
1873                         }
1874                 }
1875                 if (found) {
1876                         free_devinfo_mem(info);
1877                         domain_exit(domain);
1878                         domain = found;
1879                 } else {
1880                         list_add(&info->link, &domain->devices);
1881                         list_add(&info->global, &device_domain_list);
1882                 }
1883                 spin_unlock_irqrestore(&device_domain_lock, flags);
1884         }
1885
1886 found_domain:
1887         info = alloc_devinfo_mem();
1888         if (!info)
1889                 goto error;
1890         info->segment = segment;
1891         info->bus = pdev->bus->number;
1892         info->devfn = pdev->devfn;
1893         info->dev = pdev;
1894         info->domain = domain;
1895         spin_lock_irqsave(&device_domain_lock, flags);
1896         /* somebody else was faster and already set up the domain */
1897         found = find_domain(pdev);
1898         if (found != NULL) {
1899                 spin_unlock_irqrestore(&device_domain_lock, flags);
1900                 if (found != domain) {
1901                         domain_exit(domain);
1902                         domain = found;
1903                 }
1904                 free_devinfo_mem(info);
1905                 return domain;
1906         }
1907         list_add(&info->link, &domain->devices);
1908         list_add(&info->global, &device_domain_list);
1909         pdev->dev.archdata.iommu = info;
1910         spin_unlock_irqrestore(&device_domain_lock, flags);
1911         return domain;
1912 error:
1913         /* recheck it here, maybe others set it */
1914         return find_domain(pdev);
1915 }
1916
1917 static int iommu_identity_mapping;
1918 #define IDENTMAP_ALL            1
1919 #define IDENTMAP_GFX            2
1920 #define IDENTMAP_AZALIA         4
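
/*
 * iommu_identity_mapping is a bitmask of the reasons devices get the
 * static 1:1 (si_domain) mapping: init_dmars() below ORs in IDENTMAP_ALL
 * when pass-through is requested and IDENTMAP_GFX under
 * CONFIG_DMAR_BROKEN_GFX_WA; IDENTMAP_AZALIA covers the Azalia quirk
 * tested in iommu_should_identity_map().
 */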
1921
1922 static int iommu_domain_identity_map(struct dmar_domain *domain,
1923                                      unsigned long long start,
1924                                      unsigned long long end)
1925 {
1926         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1927         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1928
1929         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1930                           dma_to_mm_pfn(last_vpfn))) {
1931                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1932                 return -ENOMEM;
1933         }
1934
1935         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1936                  start, end, domain->id);
1937         /*
1938          * The RMRR range might overlap a physical memory range that is
1939          * already mapped, so clear it first.
1940          */
1941         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1942
1943         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1944                                   last_vpfn - first_vpfn + 1,
1945                                   DMA_PTE_READ|DMA_PTE_WRITE);
1946 }
1947
1948 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1949                                       unsigned long long start,
1950                                       unsigned long long end)
1951 {
1952         struct dmar_domain *domain;
1953         int ret;
1954
1955         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1956         if (!domain)
1957                 return -ENOMEM;
1958
1959         /* For _hardware_ passthrough, don't bother. But for software
1960            passthrough, we do it anyway -- it may indicate a memory
1961            range which is reserved in E820 and so didn't get set
1962            up in si_domain to start with */
1963         if (domain == si_domain && hw_pass_through) {
1964                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1965                        pci_name(pdev), start, end);
1966                 return 0;
1967         }
1968
1969         printk(KERN_INFO
1970                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1971                pci_name(pdev), start, end);
1972
1973         if (end < start) {
1974                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1975                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1976                         dmi_get_system_info(DMI_BIOS_VENDOR),
1977                         dmi_get_system_info(DMI_BIOS_VERSION),
1978                         dmi_get_system_info(DMI_PRODUCT_VERSION));
1979                 ret = -EIO;
1980                 goto error;
1981         }
1982
1983         if (end >> agaw_to_width(domain->agaw)) {
1984                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1985                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1986                      agaw_to_width(domain->agaw),
1987                      dmi_get_system_info(DMI_BIOS_VENDOR),
1988                      dmi_get_system_info(DMI_BIOS_VERSION),
1989                      dmi_get_system_info(DMI_PRODUCT_VERSION));
1990                 ret = -EIO;
1991                 goto error;
1992         }
1993
1994         ret = iommu_domain_identity_map(domain, start, end);
1995         if (ret)
1996                 goto error;
1997
1998         /* context entry init */
1999         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2000         if (ret)
2001                 goto error;
2002
2003         return 0;
2004
2005  error:
2006         domain_exit(domain);
2007         return ret;
2008 }
2009
2010 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2011         struct pci_dev *pdev)
2012 {
2013         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2014                 return 0;
2015         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2016                 rmrr->end_address + 1);
2017 }
2018
2019 #ifdef CONFIG_DMAR_FLOPPY_WA
2020 static inline void iommu_prepare_isa(void)
2021 {
2022         struct pci_dev *pdev;
2023         int ret;
2024
2025         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2026         if (!pdev)
2027                 return;
2028
2029         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2030         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2031
2032         if (ret)
2033                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2034                        "floppy might not work\n");
2035
2036 }
2037 #else
2038 static inline void iommu_prepare_isa(void)
2039 {
2040         return;
2041 }
2042 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2043
2044 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2045
2046 static int __init si_domain_work_fn(unsigned long start_pfn,
2047                                     unsigned long end_pfn, void *datax)
2048 {
2049         int *ret = datax;
2050
2051         *ret = iommu_domain_identity_map(si_domain,
2052                                          (uint64_t)start_pfn << PAGE_SHIFT,
2053                                          (uint64_t)end_pfn << PAGE_SHIFT);
2054         return *ret;
2055
2056 }
2057
2058 static int __init si_domain_init(int hw)
2059 {
2060         struct dmar_drhd_unit *drhd;
2061         struct intel_iommu *iommu;
2062         int nid, ret = 0;
2063
2064         si_domain = alloc_domain();
2065         if (!si_domain)
2066                 return -EFAULT;
2067
2068         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2069
2070         for_each_active_iommu(iommu, drhd) {
2071                 ret = iommu_attach_domain(si_domain, iommu);
2072                 if (ret) {
2073                         domain_exit(si_domain);
2074                         return -EFAULT;
2075                 }
2076         }
2077
2078         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2079                 domain_exit(si_domain);
2080                 return -EFAULT;
2081         }
2082
2083         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2084
2085         if (hw)
2086                 return 0;
2087
2088         for_each_online_node(nid) {
2089                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2090                 if (ret)
2091                         return ret;
2092         }
2093
2094         return 0;
2095 }
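
/*
 * Summary of si_domain_init() above: the static identity domain is
 * attached to every active IOMMU and, unless hardware pass-through is in
 * use (hw != 0), every active memory region of every online node is
 * identity mapped into it via si_domain_work_fn(), so phys == iova for
 * all of RAM.
 */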
2096
2097 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2098                                           struct pci_dev *pdev);
2099 static int identity_mapping(struct pci_dev *pdev)
2100 {
2101         struct device_domain_info *info;
2102
2103         if (likely(!iommu_identity_mapping))
2104                 return 0;
2105
2107         list_for_each_entry(info, &si_domain->devices, link)
2108                 if (info->dev == pdev)
2109                         return 1;
2110         return 0;
2111 }
2112
2113 static int domain_add_dev_info(struct dmar_domain *domain,
2114                                struct pci_dev *pdev,
2115                                int translation)
2116 {
2117         struct device_domain_info *info;
2118         unsigned long flags;
2119         int ret;
2120
2121         info = alloc_devinfo_mem();
2122         if (!info)
2123                 return -ENOMEM;
2124
2125         ret = domain_context_mapping(domain, pdev, translation);
2126         if (ret) {
2127                 free_devinfo_mem(info);
2128                 return ret;
2129         }
2130
2131         info->segment = pci_domain_nr(pdev->bus);
2132         info->bus = pdev->bus->number;
2133         info->devfn = pdev->devfn;
2134         info->dev = pdev;
2135         info->domain = domain;
2136
2137         spin_lock_irqsave(&device_domain_lock, flags);
2138         list_add(&info->link, &domain->devices);
2139         list_add(&info->global, &device_domain_list);
2140         pdev->dev.archdata.iommu = info;
2141         spin_unlock_irqrestore(&device_domain_lock, flags);
2142
2143         return 0;
2144 }
2145
2146 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2147 {
2148         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2149                 return 1;
2150
2151         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2152                 return 1;
2153
2154         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2155                 return 0;
2156
2157         /*
2158          * We want to start off with all devices in the 1:1 domain, and
2159          * take them out later if we find they can't access all of memory.
2160          *
2161          * However, we can't do this for PCI devices behind bridges,
2162          * because all PCI devices behind the same bridge will end up
2163          * with the same source-id on their transactions.
2164          *
2165          * Practically speaking, we can't change things around for these
2166          * devices at run-time, because we can't be sure there'll be no
2167          * DMA transactions in flight for any of their siblings.
2168          * 
2169          * So PCI devices (unless they're on the root bus) as well as
2170          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2171          * the 1:1 domain, just in _case_ one of their siblings turns out
2172          * not to be able to map all of memory.
2173          */
2174         if (!pci_is_pcie(pdev)) {
2175                 if (!pci_is_root_bus(pdev->bus))
2176                         return 0;
2177                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2178                         return 0;
2179         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2180                 return 0;
2181
2182         /* 
2183          * At boot time, we don't yet know if devices will be 64-bit capable.
2184          * Assume that they will -- if they turn out not to be, then we can 
2185          * take them out of the 1:1 domain later.
2186          */
2187         if (!startup)
2188                 return pdev->dma_mask > DMA_BIT_MASK(32);
2189
2190         return 1;
2191 }
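
/*
 * Examples of the policy above (hypothetical devices, for illustration):
 * a PCIe NIC on the root complex with a 64-bit dma_mask is identity
 * mapped; a conventional PCI device behind a PCIe-to-PCI bridge is not,
 * because every device behind that bridge shares one source-id; and at
 * run time (startup == 0) a device advertising only a 32-bit dma_mask is
 * kept out of the 1:1 domain.
 */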
2192
2193 static int __init iommu_prepare_static_identity_mapping(int hw)
2194 {
2195         struct pci_dev *pdev = NULL;
2196         int ret;
2197
2198         ret = si_domain_init(hw);
2199         if (ret)
2200                 return -EFAULT;
2201
2202         for_each_pci_dev(pdev) {
2203                 if (iommu_should_identity_map(pdev, 1)) {
2204                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2205                                hw ? "hardware" : "software", pci_name(pdev));
2206
2207                         ret = domain_add_dev_info(si_domain, pdev,
2208                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2209                                                      CONTEXT_TT_MULTI_LEVEL);
2210                         if (ret)
2211                                 return ret;
2212                 }
2213         }
2214
2215         return 0;
2216 }
2217
2218 int __init init_dmars(void)
2219 {
2220         struct dmar_drhd_unit *drhd;
2221         struct dmar_rmrr_unit *rmrr;
2222         struct pci_dev *pdev;
2223         struct intel_iommu *iommu;
2224         int i, ret;
2225
2226         /*
2227          * for each drhd
2228          *    allocate root
2229          *    initialize and program root entry to not present
2230          * endfor
2231          */
2232         for_each_drhd_unit(drhd) {
2233                 g_num_of_iommus++;
2234                 /*
2235                  * lock not needed as this is only incremented in the
2236                  * single-threaded kernel __init code path; all other
2237                  * accesses are read only
2238                  */
2239         }
2240
2241         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2242                         GFP_KERNEL);
2243         if (!g_iommus) {
2244                 printk(KERN_ERR "Allocating global iommu array failed\n");
2245                 ret = -ENOMEM;
2246                 goto error;
2247         }
2248
2249         deferred_flush = kzalloc(g_num_of_iommus *
2250                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2251         if (!deferred_flush) {
2252                 ret = -ENOMEM;
2253                 goto error;
2254         }
2255
2256         for_each_drhd_unit(drhd) {
2257                 if (drhd->ignored)
2258                         continue;
2259
2260                 iommu = drhd->iommu;
2261                 g_iommus[iommu->seq_id] = iommu;
2262
2263                 ret = iommu_init_domains(iommu);
2264                 if (ret)
2265                         goto error;
2266
2267                 /*
2268                  * TBD:
2269                  * we could share the same root & context tables
2270                  * among all IOMMUs. Need to split it later.
2271                  */
2272                 ret = iommu_alloc_root_entry(iommu);
2273                 if (ret) {
2274                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2275                         goto error;
2276                 }
2277                 if (!ecap_pass_through(iommu->ecap))
2278                         hw_pass_through = 0;
2279         }
2280
2281         /*
2282          * Start from a sane IOMMU hardware state.
2283          */
2284         for_each_drhd_unit(drhd) {
2285                 if (drhd->ignored)
2286                         continue;
2287
2288                 iommu = drhd->iommu;
2289
2290                 /*
2291                  * If the queued invalidation is already initialized by us
2292                  * (for example, while enabling interrupt-remapping) then
2293                  * things are already rolling from a sane state.
2294                  */
2295                 if (iommu->qi)
2296                         continue;
2297
2298                 /*
2299                  * Clear any previous faults.
2300                  */
2301                 dmar_fault(-1, iommu);
2302                 /*
2303                  * Disable queued invalidation if supported and already enabled
2304                  * before OS handover.
2305                  */
2306                 dmar_disable_qi(iommu);
2307         }
2308
2309         for_each_drhd_unit(drhd) {
2310                 if (drhd->ignored)
2311                         continue;
2312
2313                 iommu = drhd->iommu;
2314
2315                 if (dmar_enable_qi(iommu)) {
2316                         /*
2317                          * Queued Invalidate not enabled, use Register Based
2318                          * Invalidate
2319                          */
2320                         iommu->flush.flush_context = __iommu_flush_context;
2321                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2322                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2323                                "invalidation\n",
2324                                (unsigned long long)drhd->reg_base_addr);
2325                 } else {
2326                         iommu->flush.flush_context = qi_flush_context;
2327                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2328                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2329                                "invalidation\n",
2330                                (unsigned long long)drhd->reg_base_addr);
2331                 }
2332         }
2333
2334         if (iommu_pass_through)
2335                 iommu_identity_mapping |= IDENTMAP_ALL;
2336
2337 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2338         iommu_identity_mapping |= IDENTMAP_GFX;
2339 #endif
2340
2341         check_tylersburg_isoch();
2342
2343         /*
2344          * If identity mapping was requested (e.g. pass-through or the
2345          * broken-gfx work-around), set up the static identity (si_domain)
2346          * mappings now; RMRR and ISA ranges are mapped further below.
2347          */
2348         if (iommu_identity_mapping) {
2349                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2350                 if (ret) {
2351                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2352                         goto error;
2353                 }
2354         }
2355         /*
2356          * For each rmrr
2357          *   for each dev attached to rmrr
2358          *   do
2359          *     locate drhd for dev, alloc domain for dev
2360          *     allocate free domain
2361          *     allocate page table entries for rmrr
2362          *     if context not allocated for bus
2363          *           allocate and init context
2364          *           set present in root table for this bus
2365          *     init context with domain, translation etc
2366          *    endfor
2367          * endfor
2368          */
2369         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2370         for_each_rmrr_units(rmrr) {
2371                 for (i = 0; i < rmrr->devices_cnt; i++) {
2372                         pdev = rmrr->devices[i];
2373                         /*
2374                          * some BIOSes list non-existent devices in the
2375                          * DMAR table.
2376                          */
2377                         if (!pdev)
2378                                 continue;
2379                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2380                         if (ret)
2381                                 printk(KERN_ERR
2382                                        "IOMMU: mapping reserved region failed\n");
2383                 }
2384         }
2385
2386         iommu_prepare_isa();
2387
2388         /*
2389          * for each drhd
2390          *   enable fault log
2391          *   global invalidate context cache
2392          *   global invalidate iotlb
2393          *   enable translation
2394          */
2395         for_each_drhd_unit(drhd) {
2396                 if (drhd->ignored)
2397                         continue;
2398                 iommu = drhd->iommu;
2399
2400                 iommu_flush_write_buffer(iommu);
2401
2402                 ret = dmar_set_interrupt(iommu);
2403                 if (ret)
2404                         goto error;
2405
2406                 iommu_set_root_entry(iommu);
2407
2408                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2409                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2410
2411                 ret = iommu_enable_translation(iommu);
2412                 if (ret)
2413                         goto error;
2414
2415                 iommu_disable_protect_mem_regions(iommu);
2416         }
2417
2418         return 0;
2419 error:
2420         for_each_drhd_unit(drhd) {
2421                 if (drhd->ignored)
2422                         continue;
2423                 iommu = drhd->iommu;
2424                 free_iommu(iommu);
2425         }
2426         kfree(g_iommus);
2427         return ret;
2428 }
2429
2430 /* This takes a number of _MM_ pages, not VTD pages */
2431 static struct iova *intel_alloc_iova(struct device *dev,
2432                                      struct dmar_domain *domain,
2433                                      unsigned long nrpages, uint64_t dma_mask)
2434 {
2435         struct pci_dev *pdev = to_pci_dev(dev);
2436         struct iova *iova = NULL;
2437
2438         /* Restrict dma_mask to the width that the iommu can handle */
2439         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2440
2441         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2442                 /*
2443                  * First try to allocate an I/O virtual address below
2444                  * DMA_BIT_MASK(32); if that fails, fall back to allocating
2445                  * from the higher range.
2446                  */
2447                 iova = alloc_iova(&domain->iovad, nrpages,
2448                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2449                 if (iova)
2450                         return iova;
2451         }
2452         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2453         if (unlikely(!iova)) {
2454                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2455                        nrpages, pci_name(pdev));
2456                 return NULL;
2457         }
2458
2459         return iova;
2460 }
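
/*
 * Example (illustrative): with a 64-bit dma_mask and dmar_forcedac clear,
 * the allocator first tries the range below 4GiB (IOVA_PFN(DMA_BIT_MASK(32)))
 * and only falls back to the full mask-limited range if that fails; a
 * 32-bit mask, or forced DAC, goes straight to the single allocation
 * against the (possibly domain-clamped) mask.
 */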
2461
2462 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2463 {
2464         struct dmar_domain *domain;
2465         int ret;
2466
2467         domain = get_domain_for_dev(pdev,
2468                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2469         if (!domain) {
2470                 printk(KERN_ERR
2471                         "Allocating domain for %s failed\n", pci_name(pdev));
2472                 return NULL;
2473         }
2474
2475         /* make sure context mapping is ok */
2476         if (unlikely(!domain_context_mapped(pdev))) {
2477                 ret = domain_context_mapping(domain, pdev,
2478                                              CONTEXT_TT_MULTI_LEVEL);
2479                 if (ret) {
2480                         printk(KERN_ERR
2481                                 "Domain context map for %s failed\n",
2482                                 pci_name(pdev));
2483                         return NULL;
2484                 }
2485         }
2486
2487         return domain;
2488 }
2489
2490 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2491 {
2492         struct device_domain_info *info;
2493
2494         /* No lock here, assumes no domain exit in normal case */
2495         info = dev->dev.archdata.iommu;
2496         if (likely(info))
2497                 return info->domain;
2498
2499         return __get_valid_domain_for_dev(dev);
2500 }
2501
2502 static int iommu_dummy(struct pci_dev *pdev)
2503 {
2504         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2505 }
2506
2507 /* Check if the pdev needs to go through non-identity map and unmap process. */
2508 static int iommu_no_mapping(struct device *dev)
2509 {
2510         struct pci_dev *pdev;
2511         int found;
2512
2513         if (unlikely(dev->bus != &pci_bus_type))
2514                 return 1;
2515
2516         pdev = to_pci_dev(dev);
2517         if (iommu_dummy(pdev))
2518                 return 1;
2519
2520         if (!iommu_identity_mapping)
2521                 return 0;
2522
2523         found = identity_mapping(pdev);
2524         if (found) {
2525                 if (iommu_should_identity_map(pdev, 0))
2526                         return 1;
2527                 else {
2528                         /*
2529                          * The 32-bit-only device is removed from si_domain
2530                          * and falls back to non-identity mapping.
2531                          */
2532                         domain_remove_one_dev_info(si_domain, pdev);
2533                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2534                                pci_name(pdev));
2535                         return 0;
2536                 }
2537         } else {
2538                 /*
2539                  * When a 64-bit DMA device is detached from a VM, the device
2540                  * is put into si_domain for identity mapping.
2541                  */
2542                 if (iommu_should_identity_map(pdev, 0)) {
2543                         int ret;
2544                         ret = domain_add_dev_info(si_domain, pdev,
2545                                                   hw_pass_through ?
2546                                                   CONTEXT_TT_PASS_THROUGH :
2547                                                   CONTEXT_TT_MULTI_LEVEL);
2548                         if (!ret) {
2549                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2550                                        pci_name(pdev));
2551                                 return 1;
2552                         }
2553                 }
2554         }
2555
2556         return 0;
2557 }
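
/*
 * Restating iommu_no_mapping() above: non-PCI and "dummy" devices always
 * bypass translation; with identity mapping disabled everything is
 * translated; otherwise a device already in si_domain stays there only
 * while iommu_should_identity_map() still agrees (a 32-bit mask demotes
 * it to a real domain), and a device outside si_domain that now qualifies
 * (e.g. a 64-bit device handed back from a VM) is moved back in on the
 * fly.
 */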
2558
2559 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2560                                      size_t size, int dir, u64 dma_mask)
2561 {
2562         struct pci_dev *pdev = to_pci_dev(hwdev);
2563         struct dmar_domain *domain;
2564         phys_addr_t start_paddr;
2565         struct iova *iova;
2566         int prot = 0;
2567         int ret;
2568         struct intel_iommu *iommu;
2569         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2570
2571         BUG_ON(dir == DMA_NONE);
2572
2573         if (iommu_no_mapping(hwdev))
2574                 return paddr;
2575
2576         domain = get_valid_domain_for_dev(pdev);
2577         if (!domain)
2578                 return 0;
2579
2580         iommu = domain_get_iommu(domain);
2581         size = aligned_nrpages(paddr, size);
2582
2583         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2584                                 pdev->dma_mask);
2585         if (!iova)
2586                 goto error;
2587
2588         /*
2589          * Check if DMAR supports zero-length reads on write only
2590          * mappings..
2591          */
2592         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2593                         !cap_zlr(iommu->cap))
2594                 prot |= DMA_PTE_READ;
2595         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2596                 prot |= DMA_PTE_WRITE;
2597         /*
2598          * paddr - (paddr + size) might span a partial page, so we should
2599          * map the whole page.  Note: if two parts of one page are separately
2600          * mapped, we might have two guest addresses mapping to the same
2601          * host paddr, but this is not a big problem
2602          */
2603         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2604                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2605         if (ret)
2606                 goto error;
2607
2608         /* it's a non-present to present mapping. Only flush if caching mode */
2609         if (cap_caching_mode(iommu->cap))
2610                 iommu_flush_iotlb_psi(iommu, 0, mm_to_dma_pfn(iova->pfn_lo), size);
2611         else
2612                 iommu_flush_write_buffer(iommu);
2613
2614         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2615         start_paddr += paddr & ~PAGE_MASK;
2616         return start_paddr;
2617
2618 error:
2619         if (iova)
2620                 __free_iova(&domain->iovad, iova);
2621         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2622                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2623         return 0;
2624 }
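
/*
 * Worked example for __intel_map_single() (assumed numbers, 4KiB pages):
 * mapping paddr 0x12340800 with size 0x200 needs one page, so the whole
 * frame at 0x12340000 is mapped at the allocated IOVA page and the
 * returned handle is (iova->pfn_lo << PAGE_SHIFT) + 0x800, preserving the
 * caller's in-page offset.
 */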
2625
2626 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2627                                  unsigned long offset, size_t size,
2628                                  enum dma_data_direction dir,
2629                                  struct dma_attrs *attrs)
2630 {
2631         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2632                                   dir, to_pci_dev(dev)->dma_mask);
2633 }
2634
2635 static void flush_unmaps(void)
2636 {
2637         int i, j;
2638
2639         timer_on = 0;
2640
2641         /* just flush them all */
2642         for (i = 0; i < g_num_of_iommus; i++) {
2643                 struct intel_iommu *iommu = g_iommus[i];
2644                 if (!iommu)
2645                         continue;
2646
2647                 if (!deferred_flush[i].next)
2648                         continue;
2649
2650                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2651                                          DMA_TLB_GLOBAL_FLUSH);
2652                 for (j = 0; j < deferred_flush[i].next; j++) {
2653                         unsigned long mask;
2654                         struct iova *iova = deferred_flush[i].iova[j];
2655
2656                         mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2657                         iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2658                                         (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2659                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2660                 }
2661                 deferred_flush[i].next = 0;
2662         }
2663
2664         list_size = 0;
2665 }
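
/*
 * Example of the mask computation above (4KiB MM and VT-d pages, so
 * mm_to_dma_pfn() is the identity): an IOVA spanning 8 pages gives
 * mask = ilog2(8) = 3, and iommu_flush_dev_iotlb() invalidates 2^3 pages
 * starting at pfn_lo's address, assuming a power-of-two sized IOVA as the
 * size-aligned allocations used here produce.
 */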
2666
2667 static void flush_unmaps_timeout(unsigned long data)
2668 {
2669         unsigned long flags;
2670
2671         spin_lock_irqsave(&async_umap_flush_lock, flags);
2672         flush_unmaps();
2673         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2674 }
2675
2676 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2677 {
2678         unsigned long flags;
2679         int next, iommu_id;
2680         struct intel_iommu *iommu;
2681
2682         spin_lock_irqsave(&async_umap_flush_lock, flags);
2683         if (list_size == HIGH_WATER_MARK)
2684                 flush_unmaps();
2685
2686         iommu = domain_get_iommu(dom);
2687         iommu_id = iommu->seq_id;
2688
2689         next = deferred_flush[iommu_id].next;
2690         deferred_flush[iommu_id].domain[next] = dom;
2691         deferred_flush[iommu_id].iova[next] = iova;
2692         deferred_flush[iommu_id].next++;
2693
2694         if (!timer_on) {
2695                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2696                 timer_on = 1;
2697         }
2698         list_size++;
2699         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2700 }
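
/*
 * Design note on the deferred-unmap path above: rather than flushing the
 * IOTLB on every unmap, add_unmap() parks the IOVA in the per-IOMMU
 * deferred_flush table and arms a 10ms timer; flush_unmaps() then issues
 * one global IOTLB flush per IOMMU and frees all parked IOVAs in a batch,
 * triggered either by the timer or by hitting HIGH_WATER_MARK pending
 * entries.  The intel_iommu_strict path in the callers below skips this
 * and flushes synchronously.
 */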
2701
2702 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2703                              size_t size, enum dma_data_direction dir,
2704                              struct dma_attrs *attrs)
2705 {
2706         struct pci_dev *pdev = to_pci_dev(dev);
2707         struct dmar_domain *domain;
2708         unsigned long start_pfn, last_pfn;
2709         struct iova *iova;
2710         struct intel_iommu *iommu;
2711
2712         if (iommu_no_mapping(dev))
2713                 return;
2714
2715         domain = find_domain(pdev);
2716         BUG_ON(!domain);
2717
2718         iommu = domain_get_iommu(domain);
2719
2720         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2721         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2722                       (unsigned long long)dev_addr))
2723                 return;
2724
2725         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2726         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2727
2728         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2729                  pci_name(pdev), start_pfn, last_pfn);
2730
2731         /*  clear the whole page */
2732         dma_pte_clear_range(domain, start_pfn, last_pfn);
2733
2734         /* free page tables */
2735         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2736
2737         if (intel_iommu_strict) {
2738                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2739                                       last_pfn - start_pfn + 1);
2740                 /* free iova */
2741                 __free_iova(&domain->iovad, iova);
2742         } else {
2743                 add_unmap(domain, iova);
2744                 /*
2745                  * queue up the release of the unmap to save the roughly 1/6th
2746                  * of the cpu used up by the iotlb flush operation...
2747                  */
2748         }
2749 }
2750
2751 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2752                                   dma_addr_t *dma_handle, gfp_t flags)
2753 {
2754         void *vaddr;
2755         int order;
2756
2757         size = PAGE_ALIGN(size);
2758         order = get_order(size);
2759
2760         if (!iommu_no_mapping(hwdev))
2761                 flags &= ~(GFP_DMA | GFP_DMA32);
2762         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2763                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2764                         flags |= GFP_DMA;
2765                 else
2766                         flags |= GFP_DMA32;
2767         }
2768
2769         vaddr = (void *)__get_free_pages(flags, order);
2770         if (!vaddr)
2771                 return NULL;
2772         memset(vaddr, 0, size);
2773
2774         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2775                                          DMA_BIDIRECTIONAL,
2776                                          hwdev->coherent_dma_mask);
2777         if (*dma_handle)
2778                 return vaddr;
2779         free_pages((unsigned long)vaddr, order);
2780         return NULL;
2781 }
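
/*
 * Example of the GFP-zone logic above (hypothetical masks): when the
 * device is actually translated, GFP_DMA/GFP_DMA32 are stripped because
 * the IOMMU can present any physical page at an IOVA that fits the
 * device's mask.  In the no-translation case, a 24-bit coherent_dma_mask
 * yields GFP_DMA, while a 32-bit mask on a machine whose required mask is
 * larger (more than 4GiB of RAM) yields GFP_DMA32.
 */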
2782
2783 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2784                                 dma_addr_t dma_handle)
2785 {
2786         int order;
2787
2788         size = PAGE_ALIGN(size);
2789         order = get_order(size);
2790
2791         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2792         free_pages((unsigned long)vaddr, order);
2793 }
2794
2795 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2796                            int nelems, enum dma_data_direction dir,
2797                            struct dma_attrs *attrs)
2798 {
2799         struct pci_dev *pdev = to_pci_dev(hwdev);
2800         struct dmar_domain *domain;
2801         unsigned long start_pfn, last_pfn;
2802         struct iova *iova;
2803         struct intel_iommu *iommu;
2804
2805         if (iommu_no_mapping(hwdev))
2806                 return;
2807
2808         domain = find_domain(pdev);
2809         BUG_ON(!domain);
2810
2811         iommu = domain_get_iommu(domain);
2812
2813         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2814         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2815                       (unsigned long long)sglist[0].dma_address))
2816                 return;
2817
2818         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2819         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2820
2821         /*  clear the whole page */
2822         dma_pte_clear_range(domain, start_pfn, last_pfn);
2823
2824         /* free page tables */
2825         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2826
2827         if (intel_iommu_strict) {
2828                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2829                                       last_pfn - start_pfn + 1);
2830                 /* free iova */
2831                 __free_iova(&domain->iovad, iova);
2832         } else {
2833                 add_unmap(domain, iova);
2834                 /*
2835                  * queue up the release of the unmap to save the roughly 1/6th
2836                  * of the cpu used up by the iotlb flush operation...
2837                  */
2838         }
2839 }
2840
2841 static int intel_nontranslate_map_sg(struct device *hwdev,
2842         struct scatterlist *sglist, int nelems, int dir)
2843 {
2844         int i;
2845         struct scatterlist *sg;
2846
2847         for_each_sg(sglist, sg, nelems, i) {
2848                 BUG_ON(!sg_page(sg));
2849                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2850                 sg->dma_length = sg->length;
2851         }
2852         return nelems;
2853 }
2854
2855 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2856                         enum dma_data_direction dir, struct dma_attrs *attrs)
2857 {
2858         int i;
2859         struct pci_dev *pdev = to_pci_dev(hwdev);
2860         struct dmar_domain *domain;
2861         size_t size = 0;
2862         int prot = 0;
2864         struct iova *iova = NULL;
2865         int ret;
2866         struct scatterlist *sg;
2867         unsigned long start_vpfn;
2868         struct intel_iommu *iommu;
2869
2870         BUG_ON(dir == DMA_NONE);
2871         if (iommu_no_mapping(hwdev))
2872                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2873
2874         domain = get_valid_domain_for_dev(pdev);
2875         if (!domain)
2876                 return 0;
2877
2878         iommu = domain_get_iommu(domain);
2879
2880         for_each_sg(sglist, sg, nelems, i)
2881                 size += aligned_nrpages(sg->offset, sg->length);
2882
2883         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2884                                 pdev->dma_mask);
2885         if (!iova) {
2886                 sglist->dma_length = 0;
2887                 return 0;
2888         }
2889
2890         /*
2891          * Check if DMAR supports zero-length reads on write only
2892          * mappings..
2893          */
2894         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2895                         !cap_zlr(iommu->cap))
2896                 prot |= DMA_PTE_READ;
2897         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2898                 prot |= DMA_PTE_WRITE;
2899
2900         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2901
2902         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2903         if (unlikely(ret)) {
2904                 /*  clear the page */
2905                 dma_pte_clear_range(domain, start_vpfn,
2906                                     start_vpfn + size - 1);
2907                 /* free page tables */
2908                 dma_pte_free_pagetable(domain, start_vpfn,
2909                                        start_vpfn + size - 1);
2910                 /* free iova */
2911                 __free_iova(&domain->iovad, iova);
2912                 return 0;
2913         }
2914
2915         /* it's a non-present to present mapping. Only flush if caching mode */
2916         if (cap_caching_mode(iommu->cap))
2917                 iommu_flush_iotlb_psi(iommu, 0, start_vpfn, size);
2918         else
2919                 iommu_flush_write_buffer(iommu);
2920
2921         return nelems;
2922 }
2923
2924 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2925 {
2926         return !dma_addr;
2927 }
2928
2929 struct dma_map_ops intel_dma_ops = {
2930         .alloc_coherent = intel_alloc_coherent,
2931         .free_coherent = intel_free_coherent,
2932         .map_sg = intel_map_sg,
2933         .unmap_sg = intel_unmap_sg,
2934         .map_page = intel_map_page,
2935         .unmap_page = intel_unmap_page,
2936         .mapping_error = intel_mapping_error,
2937 };
2938
2939 static inline int iommu_domain_cache_init(void)
2940 {
2941         int ret = 0;
2942
2943         iommu_domain_cache = kmem_cache_create("iommu_domain",
2944                                          sizeof(struct dmar_domain),
2945                                          0,
2946                                          SLAB_HWCACHE_ALIGN,
2948                                          NULL);
2949         if (!iommu_domain_cache) {
2950                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2951                 ret = -ENOMEM;
2952         }
2953
2954         return ret;
2955 }
2956
2957 static inline int iommu_devinfo_cache_init(void)
2958 {
2959         int ret = 0;
2960
2961         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2962                                          sizeof(struct device_domain_info),
2963                                          0,
2964                                          SLAB_HWCACHE_ALIGN,
2965                                          NULL);
2966         if (!iommu_devinfo_cache) {
2967                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2968                 ret = -ENOMEM;
2969         }
2970
2971         return ret;
2972 }
2973
2974 static inline int iommu_iova_cache_init(void)
2975 {
2976         int ret = 0;
2977
2978         iommu_iova_cache = kmem_cache_create("iommu_iova",
2979                                          sizeof(struct iova),
2980                                          0,
2981                                          SLAB_HWCACHE_ALIGN,
2982                                          NULL);
2983         if (!iommu_iova_cache) {
2984                 printk(KERN_ERR "Couldn't create iova cache\n");
2985                 ret = -ENOMEM;
2986         }
2987
2988         return ret;
2989 }
2990
2991 static int __init iommu_init_mempool(void)
2992 {
2993         int ret;
2994         ret = iommu_iova_cache_init();
2995         if (ret)
2996                 return ret;
2997
2998         ret = iommu_domain_cache_init();
2999         if (ret)
3000                 goto domain_error;
3001
3002         ret = iommu_devinfo_cache_init();
3003         if (!ret)
3004                 return ret;
3005
3006         kmem_cache_destroy(iommu_domain_cache);
3007 domain_error:
3008         kmem_cache_destroy(iommu_iova_cache);
3009
3010         return -ENOMEM;
3011 }
3012
3013 static void __init iommu_exit_mempool(void)
3014 {
3015         kmem_cache_destroy(iommu_devinfo_cache);
3016         kmem_cache_destroy(iommu_domain_cache);
3017         kmem_cache_destroy(iommu_iova_cache);
3018
3019 }
3020
3021 static void __init init_no_remapping_devices(void)
3022 {
3023         struct dmar_drhd_unit *drhd;
3024
3025         for_each_drhd_unit(drhd) {
3026                 if (!drhd->include_all) {
3027                         int i;
3028                         for (i = 0; i < drhd->devices_cnt; i++)
3029                                 if (drhd->devices[i] != NULL)
3030                                         break;
3031                         /* ignore DMAR unit if no pci devices exist */
3032                         if (i == drhd->devices_cnt)
3033                                 drhd->ignored = 1;
3034                 }
3035         }
3036
3037         if (dmar_map_gfx)
3038                 return;
3039
3040         for_each_drhd_unit(drhd) {
3041                 int i;
3042                 if (drhd->ignored || drhd->include_all)
3043                         continue;
3044
3045                 for (i = 0; i < drhd->devices_cnt; i++)
3046                         if (drhd->devices[i] &&
3047                                 !IS_GFX_DEVICE(drhd->devices[i]))
3048                                 break;
3049
3050                 if (i < drhd->devices_cnt)
3051                         continue;
3052
3053                 /* bypass IOMMU if it is just for gfx devices */
3054                 drhd->ignored = 1;
3055                 for (i = 0; i < drhd->devices_cnt; i++) {
3056                         if (!drhd->devices[i])
3057                                 continue;
3058                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3059                 }
3060         }
3061 }
3062
3063 #ifdef CONFIG_SUSPEND
3064 static int init_iommu_hw(void)
3065 {
3066         struct dmar_drhd_unit *drhd;
3067         struct intel_iommu *iommu = NULL;
3068
3069         for_each_active_iommu(iommu, drhd)
3070                 if (iommu->qi)
3071                         dmar_reenable_qi(iommu);
3072
3073         for_each_active_iommu(iommu, drhd) {
3074                 iommu_flush_write_buffer(iommu);
3075
3076                 iommu_set_root_entry(iommu);
3077
3078                 iommu->flush.flush_context(iommu, 0, 0, 0,
3079                                            DMA_CCMD_GLOBAL_INVL);
3080                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3081                                          DMA_TLB_GLOBAL_FLUSH);
3082                 iommu_enable_translation(iommu);
3083                 iommu_disable_protect_mem_regions(iommu);
3084         }
3085
3086         return 0;
3087 }
3088
3089 static void iommu_flush_all(void)
3090 {
3091         struct dmar_drhd_unit *drhd;
3092         struct intel_iommu *iommu;
3093
3094         for_each_active_iommu(iommu, drhd) {
3095                 iommu->flush.flush_context(iommu, 0, 0, 0,
3096                                            DMA_CCMD_GLOBAL_INVL);
3097                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3098                                          DMA_TLB_GLOBAL_FLUSH);
3099         }
3100 }
3101
3102 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
3103 {
3104         struct dmar_drhd_unit *drhd;
3105         struct intel_iommu *iommu = NULL;
3106         unsigned long flag;
3107
3108         for_each_active_iommu(iommu, drhd) {
3109                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3110                                                  GFP_ATOMIC);
3111                 if (!iommu->iommu_state)
3112                         goto nomem;
3113         }
3114
3115         iommu_flush_all();
3116
3117         for_each_active_iommu(iommu, drhd) {
3118                 iommu_disable_translation(iommu);
3119
3120                 spin_lock_irqsave(&iommu->register_lock, flag);
3121
3122                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3123                         readl(iommu->reg + DMAR_FECTL_REG);
3124                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3125                         readl(iommu->reg + DMAR_FEDATA_REG);
3126                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3127                         readl(iommu->reg + DMAR_FEADDR_REG);
3128                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3129                         readl(iommu->reg + DMAR_FEUADDR_REG);
3130
3131                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3132         }
3133         return 0;
3134
3135 nomem:
3136         for_each_active_iommu(iommu, drhd)
3137                 kfree(iommu->iommu_state);
3138
3139         return -ENOMEM;
3140 }
3141
3142 static int iommu_resume(struct sys_device *dev)
3143 {
3144         struct dmar_drhd_unit *drhd;
3145         struct intel_iommu *iommu = NULL;
3146         unsigned long flag;
3147
3148         if (init_iommu_hw()) {
3149                 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3150                 return -EIO;
3151         }
3152
3153         for_each_active_iommu(iommu, drhd) {
3154
3155                 spin_lock_irqsave(&iommu->register_lock, flag);
3156
3157                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3158                         iommu->reg + DMAR_FECTL_REG);
3159                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3160                         iommu->reg + DMAR_FEDATA_REG);
3161                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3162                         iommu->reg + DMAR_FEADDR_REG);
3163                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3164                         iommu->reg + DMAR_FEUADDR_REG);
3165
3166                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3167         }
3168
3169         for_each_active_iommu(iommu, drhd)
3170                 kfree(iommu->iommu_state);
3171
3172         return 0;
3173 }
3174
3175 static struct sysdev_class iommu_sysclass = {
3176         .name           = "iommu",
3177         .resume         = iommu_resume,
3178         .suspend        = iommu_suspend,
3179 };
3180
3181 static struct sys_device device_iommu = {
3182         .cls    = &iommu_sysclass,
3183 };
3184
3185 static int __init init_iommu_sysfs(void)
3186 {
3187         int error;
3188
3189         error = sysdev_class_register(&iommu_sysclass);
3190         if (error)
3191                 return error;
3192
3193         error = sysdev_register(&device_iommu);
3194         if (error)
3195                 sysdev_class_unregister(&iommu_sysclass);
3196
3197         return error;
3198 }
3199
3200 #else
3201 static int __init init_iommu_sysfs(void)
3202 {
3203         return 0;
3204 }
3205 #endif  /* CONFIG_SUSPEND */
3206
3207 /*
3208  * Here we only respond to a device being unbound from its driver.
3209  *
3210  * A newly added device is not attached to its DMAR domain here yet; that
3211  * happens when the device is first mapped to an iova.
3212  */
3213 static int device_notifier(struct notifier_block *nb,
3214                                   unsigned long action, void *data)
3215 {
3216         struct device *dev = data;
3217         struct pci_dev *pdev = to_pci_dev(dev);
3218         struct dmar_domain *domain;
3219
3220         if (iommu_no_mapping(dev))
3221                 return 0;
3222
3223         domain = find_domain(pdev);
3224         if (!domain)
3225                 return 0;
3226
3227         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through)
3228                 domain_remove_one_dev_info(domain, pdev);
3229
3230         return 0;
3231 }
3232
3233 static struct notifier_block device_nb = {
3234         .notifier_call = device_notifier,
3235 };
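
/*
 * Illustrative call chain (an assumption about the typical unbind path, not
 * something this file enforces): unbinding a driver, e.g. via
 * pci_unregister_driver() or the sysfs "unbind" attribute, reaches the
 * notifier above roughly as
 *
 *	__device_release_driver(dev)
 *	  -> blocking_notifier_call_chain(..., BUS_NOTIFY_UNBOUND_DRIVER, dev)
 *	    -> device_notifier()
 *	      -> domain_remove_one_dev_info(domain, pdev)
 */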
3236
3237 int __init intel_iommu_init(void)
3238 {
3239         int ret = 0;
3240         int force_on = 0;
3241
3242         /* VT-d is required for a TXT/tboot launch, so enforce that */
3243         force_on = tboot_force_iommu();
3244
3245         if (dmar_table_init()) {
3246                 if (force_on)
3247                         panic("tboot: Failed to initialize DMAR table\n");
3248                 return  -ENODEV;
3249         }
3250
3251         if (dmar_dev_scope_init()) {
3252                 if (force_on)
3253                         panic("tboot: Failed to initialize DMAR device scope\n");
3254                 return  -ENODEV;
3255         }
3256
3257         /*
3258          * Check the need for DMA-remapping initialization now.
3259          * Above initialization will also be used by Interrupt-remapping.
3260          */
3261         if (no_iommu || dmar_disabled)
3262                 return -ENODEV;
3263
3264         iommu_init_mempool();
3265         dmar_init_reserved_ranges();
3266
3267         init_no_remapping_devices();
3268
3269         ret = init_dmars();
3270         if (ret) {
3271                 if (force_on)
3272                         panic("tboot: Failed to initialize DMARs\n");
3273                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3274                 put_iova_domain(&reserved_iova_list);
3275                 iommu_exit_mempool();
3276                 return ret;
3277         }
3278         printk(KERN_INFO
3279         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3280
3281         init_timer(&unmap_timer);
3282 #ifdef CONFIG_SWIOTLB
3283         swiotlb = 0;
3284 #endif
3285         dma_ops = &intel_dma_ops;
3286
3287         init_iommu_sysfs();
3288
3289         register_iommu(&intel_iommu_ops);
3290
3291         bus_register_notifier(&pci_bus_type, &device_nb);
3292
3293         return 0;
3294 }
3295
3296 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3297                                            struct pci_dev *pdev)
3298 {
3299         struct pci_dev *tmp, *parent;
3300
3301         if (!iommu || !pdev)
3302                 return;
3303
3304         /* Detach the bridges ("dependent" devices) the device sits behind. */
3305         tmp = pci_find_upstream_pcie_bridge(pdev);
3306         /* For a PCIe-to-PCI bridge, its secondary bus number and devfn 0 are used. */
3307         if (tmp) {
3308                 parent = pdev->bus->self;
3309                 while (parent != tmp) {
3310                         iommu_detach_dev(iommu, parent->bus->number,
3311                                          parent->devfn);
3312                         parent = parent->bus->self;
3313                 }
3314                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3315                         iommu_detach_dev(iommu,
3316                                 tmp->subordinate->number, 0);
3317                 else /* this is a legacy PCI bridge */
3318                         iommu_detach_dev(iommu, tmp->bus->number,
3319                                          tmp->devfn);
3320         }
3321 }
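
/*
 * Worked example (hypothetical topology):
 *
 *	00:1c.0 PCIe root port
 *	 └─ 01:00.0 PCIe-to-PCI bridge          (secondary bus 02)
 *	     └─ 02:03.0 conventional P2P bridge (secondary bus 03)
 *	         └─ 03:00.0 endpoint being detached
 *
 * The loop above detaches the context entry for 02:03.0, and the final call
 * detaches (bus 02, devfn 0) for the PCIe-to-PCI bridge, since DMA it
 * forwards onto PCIe may carry those requester IDs rather than the
 * endpoint's own.
 */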
3322
3323 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3324                                           struct pci_dev *pdev)
3325 {
3326         struct device_domain_info *info;
3327         struct intel_iommu *iommu;
3328         unsigned long flags;
3329         int found = 0;
3330         struct list_head *entry, *tmp;
3331
3332         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3333                                 pdev->devfn);
3334         if (!iommu)
3335                 return;
3336
3337         spin_lock_irqsave(&device_domain_lock, flags);
3338         list_for_each_safe(entry, tmp, &domain->devices) {
3339                 info = list_entry(entry, struct device_domain_info, link);
3340                 /* No need to compare PCI domain; it has to be the same */
3341                 if (info->bus == pdev->bus->number &&
3342                     info->devfn == pdev->devfn) {
3343                         list_del(&info->link);
3344                         list_del(&info->global);
3345                         if (info->dev)
3346                                 info->dev->dev.archdata.iommu = NULL;
3347                         spin_unlock_irqrestore(&device_domain_lock, flags);
3348
3349                         iommu_disable_dev_iotlb(info);
3350                         iommu_detach_dev(iommu, info->bus, info->devfn);
3351                         iommu_detach_dependent_devices(iommu, pdev);
3352                         free_devinfo_mem(info);
3353
3354                         spin_lock_irqsave(&device_domain_lock, flags);
3355
3356                         if (found)
3357                                 break;
3358                         else
3359                                 continue;
3360                 }
3361
3362                 /* If no other device in this domain sits under the
3363                  * same iommu, clear this iommu from iommu_bmp below
3364                  * and update the iommu count and coherency.
3365                  */
3366                 if (iommu == device_to_iommu(info->segment, info->bus,
3367                                             info->devfn))
3368                         found = 1;
3369         }
3370
3371         if (found == 0) {
3372                 unsigned long tmp_flags;
3373                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3374                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3375                 domain->iommu_count--;
3376                 domain_update_iommu_cap(domain);
3377                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3378         }
3379
3380         spin_unlock_irqrestore(&device_domain_lock, flags);
3381 }
3382
3383 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3384 {
3385         struct device_domain_info *info;
3386         struct intel_iommu *iommu;
3387         unsigned long flags1, flags2;
3388
3389         spin_lock_irqsave(&device_domain_lock, flags1);
3390         while (!list_empty(&domain->devices)) {
3391                 info = list_entry(domain->devices.next,
3392                         struct device_domain_info, link);
3393                 list_del(&info->link);
3394                 list_del(&info->global);
3395                 if (info->dev)
3396                         info->dev->dev.archdata.iommu = NULL;
3397
3398                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3399
3400                 iommu_disable_dev_iotlb(info);
3401                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3402                 iommu_detach_dev(iommu, info->bus, info->devfn);
3403                 iommu_detach_dependent_devices(iommu, info->dev);
3404
3405                 /* clear this iommu in iommu_bmp, update iommu count
3406                  * and capabilities
3407                  */
3408                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3409                 if (test_and_clear_bit(iommu->seq_id,
3410                                        &domain->iommu_bmp)) {
3411                         domain->iommu_count--;
3412                         domain_update_iommu_cap(domain);
3413                 }
3414                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3415
3416                 free_devinfo_mem(info);
3417                 spin_lock_irqsave(&device_domain_lock, flags1);
3418         }
3419         spin_unlock_irqrestore(&device_domain_lock, flags1);
3420 }
3421
3422 /* domain ids for virtual machine domains; never written into context entries */
3423 static unsigned long vm_domid;
3424
3425 static int vm_domain_min_agaw(struct dmar_domain *domain)
3426 {
3427         int i;
3428         int min_agaw = domain->agaw;
3429
3430         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
3431                 if (min_agaw > g_iommus[i]->agaw)
3432                         min_agaw = g_iommus[i]->agaw;
3433         }
3434
3435         return min_agaw;
3436 }
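
/*
 * Note on the iterator above: for_each_set_bit(bit, addr, size) from
 * <linux/bitops.h> visits only the set bits and expands to roughly
 *
 *	for (bit = find_first_bit(addr, size);
 *	     bit < size;
 *	     bit = find_next_bit(addr, size, bit + 1))
 *
 * i.e. the open-coded find_first_bit()/find_next_bit() loop it replaces.
 */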
3437
3438 static struct dmar_domain *iommu_alloc_vm_domain(void)
3439 {
3440         struct dmar_domain *domain;
3441
3442         domain = alloc_domain_mem();
3443         if (!domain)
3444                 return NULL;
3445
3446         domain->id = vm_domid++;
3447         domain->nid = -1;
3448         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3449         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3450
3451         return domain;
3452 }
3453
3454 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3455 {
3456         int adjust_width;
3457
3458         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3459         spin_lock_init(&domain->iommu_lock);
3460
3461         domain_reserve_special_ranges(domain);
3462
3463         /* calculate AGAW */
3464         domain->gaw = guest_width;
3465         adjust_width = guestwidth_to_adjustwidth(guest_width);
3466         domain->agaw = width_to_agaw(adjust_width);
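        /* Worked example (illustrative values): the default guest_width of
         * 48 bits needs no rounding and selects a 4-level page table, while
         * a 36-bit guest width would be rounded up to the next supported
         * width, 39 bits (3 levels). */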
3467
3468         INIT_LIST_HEAD(&domain->devices);
3469
3470         domain->iommu_count = 0;
3471         domain->iommu_coherency = 0;
3472         domain->iommu_snooping = 0;
3473         domain->max_addr = 0;
3474         domain->nid = -1;
3475
3476         /* always allocate the top pgd */
3477         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3478         if (!domain->pgd)
3479                 return -ENOMEM;
3480         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3481         return 0;
3482 }
3483
3484 static void iommu_free_vm_domain(struct dmar_domain *domain)
3485 {
3486         unsigned long flags;
3487         struct dmar_drhd_unit *drhd;
3488         struct intel_iommu *iommu;
3489         unsigned long i;
3490         unsigned long ndomains;
3491
3492         for_each_drhd_unit(drhd) {
3493                 if (drhd->ignored)
3494                         continue;
3495                 iommu = drhd->iommu;
3496
3497                 ndomains = cap_ndoms(iommu->cap);
3498                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3499                         if (iommu->domains[i] == domain) {
3500                                 spin_lock_irqsave(&iommu->lock, flags);
3501                                 clear_bit(i, iommu->domain_ids);
3502                                 iommu->domains[i] = NULL;
3503                                 spin_unlock_irqrestore(&iommu->lock, flags);
3504                                 break;
3505                         }
3506                 }
3507         }
3508 }
3509
3510 static void vm_domain_exit(struct dmar_domain *domain)
3511 {
3512         /* Domain 0 is reserved, so don't process it */
3513         if (!domain)
3514                 return;
3515
3516         vm_domain_remove_all_dev_info(domain);
3517         /* destroy iovas */
3518         put_iova_domain(&domain->iovad);
3519
3520         /* clear ptes */
3521         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3522
3523         /* free page tables */
3524         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3525
3526         iommu_free_vm_domain(domain);
3527         free_domain_mem(domain);
3528 }
3529
3530 static int intel_iommu_domain_init(struct iommu_domain *domain)
3531 {
3532         struct dmar_domain *dmar_domain;
3533
3534         dmar_domain = iommu_alloc_vm_domain();
3535         if (!dmar_domain) {
3536                 printk(KERN_ERR
3537                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3538                 return -ENOMEM;
3539         }
3540         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3541                 printk(KERN_ERR
3542                         "intel_iommu_domain_init() failed\n");
3543                 vm_domain_exit(dmar_domain);
3544                 return -ENOMEM;
3545         }
3546         domain->priv = dmar_domain;
3547
3548         return 0;
3549 }
3550
3551 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3552 {
3553         struct dmar_domain *dmar_domain = domain->priv;
3554
3555         domain->priv = NULL;
3556         vm_domain_exit(dmar_domain);
3557 }
3558
3559 static int intel_iommu_attach_device(struct iommu_domain *domain,
3560                                      struct device *dev)
3561 {
3562         struct dmar_domain *dmar_domain = domain->priv;
3563         struct pci_dev *pdev = to_pci_dev(dev);
3564         struct intel_iommu *iommu;
3565         int addr_width;
3566         u64 end;
3567
3568         /* normally pdev is not mapped */
3569         if (unlikely(domain_context_mapped(pdev))) {
3570                 struct dmar_domain *old_domain;
3571
3572                 old_domain = find_domain(pdev);
3573                 if (old_domain) {
3574                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3575                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3576                                 domain_remove_one_dev_info(old_domain, pdev);
3577                         else
3578                                 domain_remove_dev_info(old_domain);
3579                 }
3580         }
3581
3582         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3583                                 pdev->devfn);
3584         if (!iommu)
3585                 return -ENODEV;
3586
3587         /* check if this iommu agaw is sufficient for max mapped address */
3588         addr_width = agaw_to_width(iommu->agaw);
3589         end = DOMAIN_MAX_ADDR(addr_width);
3590         end = end & VTD_PAGE_MASK;
3591         if (end < dmar_domain->max_addr) {
3592                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3593                        "sufficient for the mapped address (%llx)\n",
3594                        __func__, iommu->agaw, dmar_domain->max_addr);
3595                 return -EFAULT;
3596         }
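        /* Illustrative values: an iommu whose agaw corresponds to 39 address
         * bits covers at most 512GiB, so attaching it to a domain whose
         * max_addr already exceeds that fails here with -EFAULT. */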
3597
3598         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3599 }
3600
3601 static void intel_iommu_detach_device(struct iommu_domain *domain,
3602                                       struct device *dev)
3603 {
3604         struct dmar_domain *dmar_domain = domain->priv;
3605         struct pci_dev *pdev = to_pci_dev(dev);
3606
3607         domain_remove_one_dev_info(dmar_domain, pdev);
3608 }
3609
3610 static int intel_iommu_map_range(struct iommu_domain *domain,
3611                                  unsigned long iova, phys_addr_t hpa,
3612                                  size_t size, int iommu_prot)
3613 {
3614         struct dmar_domain *dmar_domain = domain->priv;
3615         u64 max_addr;
3616         int addr_width;
3617         int prot = 0;
3618         int ret;
3619
3620         if (iommu_prot & IOMMU_READ)
3621                 prot |= DMA_PTE_READ;
3622         if (iommu_prot & IOMMU_WRITE)
3623                 prot |= DMA_PTE_WRITE;
3624         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3625                 prot |= DMA_PTE_SNP;
3626
3627         max_addr = iova + size;
3628         if (dmar_domain->max_addr < max_addr) {
3629                 int min_agaw;
3630                 u64 end;
3631
3632                 /* check if minimum agaw is sufficient for mapped address */
3633                 min_agaw = vm_domain_min_agaw(dmar_domain);
3634                 addr_width = agaw_to_width(min_agaw);
3635                 end = DOMAIN_MAX_ADDR(addr_width);
3636                 end = end & VTD_PAGE_MASK;
3637                 if (end < max_addr) {
3638                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3639                                "sufficient for the mapped address (%llx)\n",
3640                                __func__, min_agaw, max_addr);
3641                         return -EFAULT;
3642                 }
3643                 dmar_domain->max_addr = max_addr;
3644         }
3645         /* Round up size to next multiple of PAGE_SIZE, if it and
3646            the low bits of hpa would take us onto the next page */
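        /* Worked example: an hpa page offset of 0xffc with size 0x8 crosses
           into the next 4KiB page, so aligned_nrpages() here returns two
           VT-d pages rather than one. */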
3647         size = aligned_nrpages(hpa, size);
3648         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3649                                  hpa >> VTD_PAGE_SHIFT, size, prot);
3650         return ret;
3651 }
3652
3653 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3654                                     unsigned long iova, size_t size)
3655 {
3656         struct dmar_domain *dmar_domain = domain->priv;
3657
3658         if (!size)
3659                 return;
3660
3661         dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3662                             (iova + size - 1) >> VTD_PAGE_SHIFT);
3663
3664         if (dmar_domain->max_addr == iova + size)
3665                 dmar_domain->max_addr = iova;
3666 }
3667
3668 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3669                                             unsigned long iova)
3670 {
3671         struct dmar_domain *dmar_domain = domain->priv;
3672         struct dma_pte *pte;
3673         u64 phys = 0;
3674
3675         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3676         if (pte)
3677                 phys = dma_pte_addr(pte);
3678
3679         return phys;
3680 }
3681
3682 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3683                                       unsigned long cap)
3684 {
3685         struct dmar_domain *dmar_domain = domain->priv;
3686
3687         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3688                 return dmar_domain->iommu_snooping;
3689
3690         return 0;
3691 }
3692
3693 static struct iommu_ops intel_iommu_ops = {
3694         .domain_init    = intel_iommu_domain_init,
3695         .domain_destroy = intel_iommu_domain_destroy,
3696         .attach_dev     = intel_iommu_attach_device,
3697         .detach_dev     = intel_iommu_detach_device,
3698         .map            = intel_iommu_map_range,
3699         .unmap          = intel_iommu_unmap_range,
3700         .iova_to_phys   = intel_iommu_iova_to_phys,
3701         .domain_has_cap = intel_iommu_domain_has_cap,
3702 };
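
/*
 * Illustrative sketch only (not part of this driver): how an external user
 * such as KVM device assignment reaches the ops above through the generic
 * IOMMU API of this era.  The wrapper names (iommu_domain_alloc(),
 * iommu_attach_device(), iommu_map_range(), ...) are assumptions about the
 * surrounding kernel, not definitions made in this file.
 *
 *	struct iommu_domain *dom;
 *	int ret;
 *
 *	dom = iommu_domain_alloc();			(-> intel_iommu_domain_init)
 *	if (!dom)
 *		return -ENOMEM;
 *	ret = iommu_attach_device(dom, &pdev->dev);	(-> intel_iommu_attach_device)
 *	if (ret)
 *		goto out_free;
 *	ret = iommu_map_range(dom, iova, hpa, size,	(-> intel_iommu_map_range)
 *			      IOMMU_READ | IOMMU_WRITE);
 *	if (ret)
 *		goto out_detach;
 *
 *	... guest or device DMA uses the mapping ...
 *
 *	iommu_unmap_range(dom, iova, size);		(-> intel_iommu_unmap_range)
 * out_detach:
 *	iommu_detach_device(dom, &pdev->dev);		(-> intel_iommu_detach_device)
 * out_free:
 *	iommu_domain_free(dom);				(-> intel_iommu_domain_destroy)
 */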
3703
3704 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3705 {
3706         /*
3707          * Mobile 4 Series Chipset neglects to set RWBF capability,
3708          * but needs it:
3709          */
3710         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3711         rwbf_quirk = 1;
3712 }
3713
3714 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3715
3716 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3717    ISOCH DMAR unit for the Azalia sound device, but not give it any
3718    TLB entries, which causes it to deadlock. Check for that.  We do
3719    this in a function called from init_dmars(), instead of in a PCI
3720    quirk, because we don't want to print the obnoxious "BIOS broken"
3721    message if VT-d is actually disabled.
3722 */
3723 static void __init check_tylersburg_isoch(void)
3724 {
3725         struct pci_dev *pdev;
3726         uint32_t vtisochctrl;
3727
3728         /* If there's no Azalia in the system anyway, forget it. */
3729         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3730         if (!pdev)
3731                 return;
3732         pci_dev_put(pdev);
3733
3734         /* System Management Registers. Might be hidden, in which case
3735            we can't do the sanity check. But that's OK, because the
3736            known-broken BIOSes _don't_ actually hide it, so far. */
3737         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3738         if (!pdev)
3739                 return;
3740
3741         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3742                 pci_dev_put(pdev);
3743                 return;
3744         }
3745
3746         pci_dev_put(pdev);
3747
3748         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3749         if (vtisochctrl & 1)
3750                 return;
3751
3752         /* Drop all bits other than the number of TLB entries */
3753         vtisochctrl &= 0x1c;
3754
3755         /* If we have the recommended number of TLB entries (16), fine. */
3756         if (vtisochctrl == 0x10)
3757                 return;
3758
3759         /* Zero TLB entries? That can't work; force identity mapping for Azalia. */
3760         if (!vtisochctrl) {
3761                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3762                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3763                      dmi_get_system_info(DMI_BIOS_VENDOR),
3764                      dmi_get_system_info(DMI_BIOS_VERSION),
3765                      dmi_get_system_info(DMI_PRODUCT_VERSION));
3766                 iommu_identity_mapping |= IDENTMAP_AZALIA;
3767                 return;
3768         }
3769
3770         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3771                vtisochctrl);
3772 }