#include <asm/e820.h>
#include <asm/linkage.h>
#include <asm/page.h>
++ #include <asm/init.h>
++ #include <asm/pat.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
/*
 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 * redzone above it, so round it up to a PGD boundary.
 */
#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
- #define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
- #define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
++ /*
++ * Xen leaves the responsibility for maintaining p2m mappings to the
++ * guests themselves, but it must also access and update the p2m array
++ * during suspend/resume when all the pages are reallocated.
++ *
++ * The p2m table is logically a flat array, but we implement it as a
++ * three-level tree to allow the address space to be sparse.
++ *
++ * Xen
++ * |
++ * p2m_top p2m_top_mfn
++ * / \ / \
++ * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
++ * / \ / \ / /
++ * p2m p2m p2m p2m p2m p2m p2m ...
++ *
++ * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
++ *
++ * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
++ * maximum representable pseudo-physical address space is:
++ * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
++ *
++ * P2M_PER_PAGE depends on the architecture, as a mfn is always
++ * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
++ * 512 and 1024 entries respectively.
++ */
+
++ unsigned long xen_max_p2m_pfn __read_mostly;
- /* Placeholder for holes in the address space */
- static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
- { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
++ #define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
++ #define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
++ #define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
- /* Array of pointers to pages containing p2m entries */
- static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
- { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
++ #define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
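To make the limit concrete (illustrative arithmetic, not part of the patch): on 64-bit, unsigned long and both pointer types are all 8 bytes, so P2M_PER_PAGE = P2M_MID_PER_PAGE = P2M_TOP_PER_PAGE = 4096 / 8 = 512, and MAX_P2M_PFN = 512^3 = 2^27 pfns, i.e. 512GB of pseudo-physical space with 4K pages. On 32-bit all three are 1024, giving 2^30 pfns (4TB).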
- /* Arrays of p2m arrays expressed in mfns used for save/restore */
- static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
++ /* Placeholders for holes in the address space */
++ static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
++ static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
++ static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
- static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
- __page_aligned_bss;
++ static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
++ static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
++ static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
+
++ RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
++ RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
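The RESERVE_BRK() lines pre-reserve brk space for the worst-case number of mid-level pages: one page per P2M_PER_PAGE * P2M_MID_PER_PAGE pfns covered. Illustrative arithmetic, assuming 64-bit and MAX_DOMAIN_PAGES corresponding to CONFIG_XEN_MAX_DOMAIN_MEMORY=8 (8GB): 8GB / 4KB = 2^21 pfns, divided by 512 * 512 = 2^18 pfns per mid page, reserves 8 pages (32KB) each for p2m_mid and p2m_mid_mfn.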
static inline unsigned p2m_top_index(unsigned long pfn)
{
-- BUG_ON(pfn >= MAX_DOMAIN_PAGES);
-- return pfn / P2M_ENTRIES_PER_PAGE;
++ BUG_ON(pfn >= MAX_P2M_PFN);
++ return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
++ }
++
++ static inline unsigned p2m_mid_index(unsigned long pfn)
++ {
++ return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
}

static inline unsigned p2m_index(unsigned long pfn)
{
-- return pfn % P2M_ENTRIES_PER_PAGE;
++ return pfn % P2M_PER_PAGE;
}
+
- /* Build the parallel p2m_top_mfn structures */
++ static void p2m_top_init(unsigned long ***top)
++ {
++ unsigned i;
++
++ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
++ top[i] = p2m_mid_missing;
++ }
++
++ static void p2m_top_mfn_init(unsigned long *top)
++ {
++ unsigned i;
++
++ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
++ top[i] = virt_to_mfn(p2m_mid_missing_mfn);
++ }
++
++ static void p2m_top_mfn_p_init(unsigned long **top)
++ {
++ unsigned i;
++
++ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
++ top[i] = p2m_mid_missing_mfn;
++ }
++
++ static void p2m_mid_init(unsigned long **mid)
++ {
++ unsigned i;
++
++ for (i = 0; i < P2M_MID_PER_PAGE; i++)
++ mid[i] = p2m_missing;
++ }
++
++ static void p2m_mid_mfn_init(unsigned long *mid)
++ {
++ unsigned i;
++
++ for (i = 0; i < P2M_MID_PER_PAGE; i++)
++ mid[i] = virt_to_mfn(p2m_missing);
+ }
+
++ static void p2m_init(unsigned long *p2m)
++ {
++ unsigned i;
++
++ for (i = 0; i < P2M_PER_PAGE; i++)
++ p2m[i] = INVALID_P2M_ENTRY;
++ }
++
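The helpers above put the tree into its resting state: every slot of an untouched level points at the corresponding "missing" page. A minimal sketch (illustrative, not part of the patch) of what a lookup of an unpopulated pfn then walks through:

	unsigned long **mid  = p2m_top[p2m_top_index(pfn)];   /* == p2m_mid_missing */
	unsigned long *leaf  = mid[p2m_mid_index(pfn)];       /* == p2m_missing */
	unsigned long mfn    = leaf[p2m_index(pfn)];          /* == INVALID_P2M_ENTRY (~0UL) */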
++ /*
++ * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
++ *
++ * This is called both at boot time, and after resuming from suspend:
++ * - At boot time we're called very early, and must use extend_brk()
++ * to allocate memory.
++ *
++ * - After resume we're called from within stop_machine, but the mfn
++ * tree should already be completely allocated.
++ */
void xen_build_mfn_list_list(void)
{
-- unsigned pfn, idx;
++ unsigned long pfn;
-- for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
-- unsigned topidx = p2m_top_index(pfn);
++ /* Pre-initialize p2m_top_mfn to be completely missing */
++ if (p2m_top_mfn == NULL) {
++ p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_mid_mfn_init(p2m_mid_missing_mfn);
++
++ p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_top_mfn_p_init(p2m_top_mfn_p);
-- p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
++ p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_top_mfn_init(p2m_top_mfn);
++ } else {
++ /* Re-initialise: mfns all change after migration */
++ p2m_mid_mfn_init(p2m_mid_missing_mfn);
}
-- for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
-- unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
-- p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
++ for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
++ unsigned topidx = p2m_top_index(pfn);
++ unsigned mididx = p2m_mid_index(pfn);
++ unsigned long **mid;
++ unsigned long *mid_mfn_p;
++
++ mid = p2m_top[topidx];
++ mid_mfn_p = p2m_top_mfn_p[topidx];
++
++ /* Don't bother allocating any mfn mid levels if
++ * they're just missing, just update the stored mfn,
++ * since all could have changed over a migrate.
++ */
++ if (mid == p2m_mid_missing) {
++ BUG_ON(mididx);
++ BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
++ p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
++ pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
++ continue;
++ }
++
++ if (mid_mfn_p == p2m_mid_missing_mfn) {
++ /*
++ * XXX boot-time only! We should never find
++ * missing parts of the mfn tree after
++ * runtime. extend_brk() will BUG if we call
++ * it too late.
++ */
++ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_mid_mfn_init(mid_mfn_p);
++
++ p2m_top_mfn_p[topidx] = mid_mfn_p;
++ }
++
++ p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
++ mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
}
}
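For context (this call site is outside the hunks shown, so treat the exact shape as an assumption): the root page built above is what gets advertised to the hypervisor, along the lines of xen_setup_mfn_list_list():

	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
		virt_to_mfn(p2m_top_mfn);
	HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;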
void __init xen_build_dynamic_phys_to_machine(void)
{
unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
-- unsigned pfn;
++ unsigned long pfn;
++
++ xen_max_p2m_pfn = max_pfn;
+
- for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
++ p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_init(p2m_missing);
+
++ p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_mid_init(p2m_mid_missing);
++
++ p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_top_init(p2m_top);
++
++ /*
++ * The domain builder gives us a pre-constructed p2m array in
++ * mfn_list for all the pages initially given to us, so we just
++ * need to graft that into our tree structure.
++ */
++ for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
unsigned topidx = p2m_top_index(pfn);
++ unsigned mididx = p2m_mid_index(pfn);
-- p2m_top[topidx] = &mfn_list[pfn];
-- }
++ if (p2m_top[topidx] == p2m_mid_missing) {
++ unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_mid_init(mid);
++
++ p2m_top[topidx] = mid;
++ }
-- xen_build_mfn_list_list();
++ p2m_top[topidx][mididx] = &mfn_list[pfn];
++ }
}
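Note that the graft reuses the domain builder's flat mfn_list pages directly as leaf pages; only mid-level pages are allocated here. Illustrative arithmetic (64-bit): for a 4GB initial allocation, max_pfn = 2^20, so the loop installs 2^20 / 512 = 2048 leaf pointers and allocates 2^20 / 2^18 = 4 mid pages via extend_brk().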
unsigned long get_phys_to_machine(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);
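The body of get_phys_to_machine() is elided in this excerpt; under the new layout it presumably reduces to a bounds check plus three index steps, sketched as:

	unsigned topidx, mididx, idx;

	if (unlikely(pfn >= MAX_P2M_PFN))
		return INVALID_P2M_ENTRY;

	topidx = p2m_top_index(pfn);
	mididx = p2m_mid_index(pfn);
	idx = p2m_index(pfn);

	return p2m_top[topidx][mididx][idx];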
-- /* install a new p2m_top page */
-- bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
++ static void *alloc_p2m_page(void)
{
-- unsigned topidx = p2m_top_index(pfn);
-- unsigned long **pfnp, *mfnp;
-- unsigned i;
++ return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
++ }
-- pfnp = &p2m_top[topidx];
-- mfnp = &p2m_top_mfn[topidx];
++ static void free_p2m_page(void *p)
++ {
++ free_page((unsigned long)p);
++ }
-- for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
-- p[i] = INVALID_P2M_ENTRY;
++ /*
++ * Fully allocate the p2m structure for a given pfn. We need to check
++ * that both the top and mid levels are allocated, and make sure the
++ * parallel mfn tree is kept in sync. We may race with other cpus, so
++ * the new pages are installed with cmpxchg; if we lose the race then
++ * simply free the page we allocated and use the one that's there.
++ */
++ static bool alloc_p2m(unsigned long pfn)
++ {
++ unsigned topidx, mididx;
++ unsigned long ***top_p, **mid;
++ unsigned long *top_mfn_p, *mid_mfn;
-- if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
-- *mfnp = virt_to_mfn(p);
-- return true;
++ topidx = p2m_top_index(pfn);
++ mididx = p2m_mid_index(pfn);
++
++ top_p = &p2m_top[topidx];
++ mid = *top_p;
++
++ if (mid == p2m_mid_missing) {
++ /* Mid level is missing, allocate a new one */
++ mid = alloc_p2m_page();
++ if (!mid)
++ return false;
++
++ p2m_mid_init(mid);
++
++ if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
++ free_p2m_page(mid);
}
-- return false;
-- }
++ top_mfn_p = &p2m_top_mfn[topidx];
++ mid_mfn = p2m_top_mfn_p[topidx];
-- static void alloc_p2m(unsigned long pfn)
-- {
-- unsigned long *p;
++ BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
++
++ if (mid_mfn == p2m_mid_missing_mfn) {
++ /* Separately check the mid mfn level */
++ unsigned long missing_mfn;
++ unsigned long mid_mfn_mfn;
+
- p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
- BUG_ON(p == NULL);
++ mid_mfn = alloc_p2m_page();
++ if (!mid_mfn)
++ return false;
+
- if (!install_p2mtop_page(pfn, p))
- free_page((unsigned long)p);
++ p2m_mid_mfn_init(mid_mfn);
++
++ missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
++ mid_mfn_mfn = virt_to_mfn(mid_mfn);
++ if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
++ free_p2m_page(mid_mfn);
++ else
++ p2m_top_mfn_p[topidx] = mid_mfn;
++ }
++
++ if (p2m_top[topidx][mididx] == p2m_missing) {
++ /* p2m leaf page is missing */
++ unsigned long *p2m;
++
++ p2m = alloc_p2m_page();
++ if (!p2m)
++ return false;
+
++ p2m_init(p2m);
++
++ if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
++ free_p2m_page(p2m);
++ else
++ mid_mfn[mididx] = virt_to_mfn(p2m);
++ }
+
++ return true;
}
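The intended caller pattern, sketched under the assumption that set_phys_to_machine() retries via alloc_p2m() as the comment below suggests (illustrative, not part of the hunks shown):

	if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
		if (!alloc_p2m(pfn))
			BUG();	/* cannot allocate p2m tree pages */

		if (!__set_phys_to_machine(pfn, mfn))
			BUG();
	}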
/* Try to install p2m mapping; fail if intermediate bits missing */