Merge branch 'akpm' (fixes from Andrew Morton)
author     Linus Torvalds <torvalds@linux-foundation.org>
Thu, 17 Oct 2013 04:36:03 +0000 (21:36 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
Thu, 17 Oct 2013 04:36:03 +0000 (21:36 -0700)
Merge misc fixes from Andrew Morton.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (21 commits)
  mm: revert mremap pud_free anti-fix
  mm: fix BUG in __split_huge_page_pmd
  swap: fix set_blocksize race during swapon/swapoff
  procfs: call default get_unmapped_area on MMU-present architectures
  procfs: fix unintended truncation of returned mapped address
  writeback: fix negative bdi max pause
  percpu_refcount: export symbols
  fs: buffer: move allocation failure loop into the allocator
  mm: memcg: handle non-error OOM situations more gracefully
  tools/testing/selftests: fix uninitialized variable
  block/partitions/efi.c: treat size mismatch as a warning, not an error
  mm: hugetlb: initialize PG_reserved for tail pages of gigantic compound pages
  mm/zswap: bugfix: memory leak when re-swapon
  mm: /proc/pid/pagemap: inspect _PAGE_SOFT_DIRTY only on present pages
  mm: migration: do not lose soft dirty bit if page is in migration state
  gcov: MAINTAINERS: Add an entry for gcov
  mm/hugetlb.c: correct missing private flag clearing
  mm/vmscan.c: don't forget to free shrinker->nr_deferred
  ipc/sem.c: synchronize semop and semctl with IPC_RMID
  ipc: update locking scheme comments
  ...

24 files changed:
MAINTAINERS
block/partitions/efi.c
fs/buffer.c
fs/proc/inode.c
fs/proc/task_mmu.c
include/linux/memcontrol.h
include/linux/sched.h
ipc/sem.c
ipc/util.c
lib/percpu-refcount.c
mm/filemap.c
mm/huge_memory.c
mm/hugetlb.c
mm/memcontrol.c
mm/memory.c
mm/migrate.c
mm/mprotect.c
mm/mremap.c
mm/oom_kill.c
mm/page-writeback.c
mm/swapfile.c
mm/vmscan.c
mm/zswap.c
tools/testing/selftests/timers/posix_timers.c

index 72b1e5c..a7c34ef 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3624,6 +3624,12 @@ L:       linux-scsi@vger.kernel.org
 S:     Odd Fixes (e.g., new signatures)
 F:     drivers/scsi/fdomain.*
 
+GCOV BASED KERNEL PROFILING
+M:     Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+S:     Maintained
+F:     kernel/gcov/
+F:     Documentation/gcov.txt
+
 GDT SCSI DISK ARRAY CONTROLLER DRIVER
 M:     Achim Leubner <achim_leubner@adaptec.com>
 L:     linux-scsi@vger.kernel.org
index 1eb09ee..a8287b4 100644
--- a/block/partitions/efi.c
+++ b/block/partitions/efi.c
@@ -222,11 +222,16 @@ check_hybrid:
         * the disk size.
         *
         * Hybrid MBRs do not necessarily comply with this.
+        *
+        * Consider a bad value here to be a warning to support dd'ing
+        * an image from a smaller disk to a larger disk.
         */
        if (ret == GPT_MBR_PROTECTIVE) {
                sz = le32_to_cpu(mbr->partition_record[part].size_in_lba);
                if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF)
-                       ret = 0;
+                       pr_debug("GPT: mbr size in lba (%u) different than whole disk (%u).\n",
+                                sz, min_t(uint32_t,
+                                          total_sectors - 1, 0xFFFFFFFF));
        }
 done:
        return ret;
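
For reference, the rule relaxed above can be restated on its own: the protective MBR's single partition record should either cover the whole disk (total_sectors - 1 LBAs) or carry the 32-bit "as large as possible" value 0xFFFFFFFF; an image dd'd onto a larger disk fails the first test, which is now only a debug warning. A stand-alone sketch with a hypothetical helper name, not kernel code:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper restating the protective-MBR size rule above. */
static int protective_mbr_size_ok(uint64_t total_sectors, uint32_t size_in_lba)
{
        /* covers the whole disk, or carries the 32-bit "as large as possible" sentinel */
        return size_in_lba == (uint32_t)(total_sectors - 1) ||
               size_in_lba == 0xFFFFFFFFU;
}

int main(void)
{
        /* image dd'd from a 1 GiB disk (2097152 sectors) onto a 4 GiB disk */
        if (!protective_mbr_size_ok(8388608, 2097151))
                fprintf(stderr, "size mismatch: warn, but keep treating the MBR as protective\n");
        return 0;
}
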
index 4d74335..6024877 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1005,9 +1005,19 @@ grow_dev_page(struct block_device *bdev, sector_t block,
        struct buffer_head *bh;
        sector_t end_block;
        int ret = 0;            /* Will call free_more_memory() */
+       gfp_t gfp_mask;
 
-       page = find_or_create_page(inode->i_mapping, index,
-               (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
+       gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
+       gfp_mask |= __GFP_MOVABLE;
+       /*
+        * XXX: __getblk_slow() can not really deal with failure and
+        * will endlessly loop on improvised global reclaim.  Prefer
+        * looping in the allocator rather than here, at least that
+        * code knows what it's doing.
+        */
+       gfp_mask |= __GFP_NOFAIL;
+
+       page = find_or_create_page(inode->i_mapping, index, gfp_mask);
        if (!page)
                return ret;
 
index 9f8ef9b..8eaa1ba 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -288,10 +288,14 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
 static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags)
 {
        struct proc_dir_entry *pde = PDE(file_inode(file));
-       int rv = -EIO;
-       unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+       unsigned long rv = -EIO;
+       unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) = NULL;
        if (use_pde(pde)) {
-               get_unmapped_area = pde->proc_fops->get_unmapped_area;
+#ifdef CONFIG_MMU
+               get_unmapped_area = current->mm->get_unmapped_area;
+#endif
+               if (pde->proc_fops->get_unmapped_area)
+                       get_unmapped_area = pde->proc_fops->get_unmapped_area;
                if (get_unmapped_area)
                        rv = get_unmapped_area(file, orig_addr, len, pgoff, flags);
                unuse_pde(pde);
index 7366e9d..390bdab 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -941,6 +941,8 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
                frame = pte_pfn(pte);
                flags = PM_PRESENT;
                page = vm_normal_page(vma, addr, pte);
+               if (pte_soft_dirty(pte))
+                       flags2 |= __PM_SOFT_DIRTY;
        } else if (is_swap_pte(pte)) {
                swp_entry_t entry;
                if (pte_swp_soft_dirty(pte))
@@ -960,7 +962,7 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
 
        if (page && !PageAnon(page))
                flags |= PM_FILE;
-       if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte))
+       if ((vma->vm_flags & VM_SOFTDIRTY))
                flags2 |= __PM_SOFT_DIRTY;
 
        *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
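
The bit this hunk stops reporting for non-present PTEs is visible from user space through /proc/pid/pagemap. A minimal reader, shown only as an illustration (not part of the patch), assuming the documented 64-bit entry layout -- bit 63 = present, bit 55 = soft-dirty -- on a kernel built with soft-dirty tracking:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        long psize = sysconf(_SC_PAGESIZE);
        unsigned char *buf = malloc(psize);
        if (!buf)
                return 1;
        buf[0] = 1;                               /* fault the page in */

        int fd = open("/proc/self/pagemap", O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        uint64_t entry;
        off_t off = ((uintptr_t)buf / psize) * sizeof(entry);
        if (pread(fd, &entry, sizeof(entry), off) != (ssize_t)sizeof(entry)) {
                perror("pread");
                return 1;
        }

        printf("present=%d soft-dirty=%d\n",
               (int)((entry >> 63) & 1),          /* present */
               (int)((entry >> 55) & 1));         /* soft-dirty */

        close(fd);
        return 0;
}
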
index ecc82b3..b3e7a66 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -137,47 +137,24 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 extern void mem_cgroup_replace_page_cache(struct page *oldpage,
                                        struct page *newpage);
 
-/**
- * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task
- * @new: true to enable, false to disable
- *
- * Toggle whether a failed memcg charge should invoke the OOM killer
- * or just return -ENOMEM.  Returns the previous toggle state.
- *
- * NOTE: Any path that enables the OOM killer before charging must
- *       call mem_cgroup_oom_synchronize() afterward to finalize the
- *       OOM handling and clean up.
- */
-static inline bool mem_cgroup_toggle_oom(bool new)
+static inline void mem_cgroup_oom_enable(void)
 {
-       bool old;
-
-       old = current->memcg_oom.may_oom;
-       current->memcg_oom.may_oom = new;
-
-       return old;
+       WARN_ON(current->memcg_oom.may_oom);
+       current->memcg_oom.may_oom = 1;
 }
 
-static inline void mem_cgroup_enable_oom(void)
+static inline void mem_cgroup_oom_disable(void)
 {
-       bool old = mem_cgroup_toggle_oom(true);
-
-       WARN_ON(old == true);
-}
-
-static inline void mem_cgroup_disable_oom(void)
-{
-       bool old = mem_cgroup_toggle_oom(false);
-
-       WARN_ON(old == false);
+       WARN_ON(!current->memcg_oom.may_oom);
+       current->memcg_oom.may_oom = 0;
 }
 
 static inline bool task_in_memcg_oom(struct task_struct *p)
 {
-       return p->memcg_oom.in_memcg_oom;
+       return p->memcg_oom.memcg;
 }
 
-bool mem_cgroup_oom_synchronize(void);
+bool mem_cgroup_oom_synchronize(bool wait);
 
 #ifdef CONFIG_MEMCG_SWAP
 extern int do_swap_account;
@@ -402,16 +379,11 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page,
 {
 }
 
-static inline bool mem_cgroup_toggle_oom(bool new)
-{
-       return false;
-}
-
-static inline void mem_cgroup_enable_oom(void)
+static inline void mem_cgroup_oom_enable(void)
 {
 }
 
-static inline void mem_cgroup_disable_oom(void)
+static inline void mem_cgroup_oom_disable(void)
 {
 }
 
@@ -420,7 +392,7 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
        return false;
 }
 
-static inline bool mem_cgroup_oom_synchronize(void)
+static inline bool mem_cgroup_oom_synchronize(bool wait)
 {
        return false;
 }
index 6682da3..e27baee 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1394,11 +1394,10 @@ struct task_struct {
        } memcg_batch;
        unsigned int memcg_kmem_skip_account;
        struct memcg_oom_info {
+               struct mem_cgroup *memcg;
+               gfp_t gfp_mask;
+               int order;
                unsigned int may_oom:1;
-               unsigned int in_memcg_oom:1;
-               unsigned int oom_locked:1;
-               int wakeups;
-               struct mem_cgroup *wait_on_memcg;
        } memcg_oom;
 #endif
 #ifdef CONFIG_UPROBES
index 8c4f59b..db9d241 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1282,6 +1282,12 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
 
        sem_lock(sma, NULL, -1);
 
+       if (sma->sem_perm.deleted) {
+               sem_unlock(sma, -1);
+               rcu_read_unlock();
+               return -EIDRM;
+       }
+
        curr = &sma->sem_base[semnum];
 
        ipc_assert_locked_object(&sma->sem_perm);
@@ -1336,12 +1342,14 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
                int i;
 
                sem_lock(sma, NULL, -1);
+               if (sma->sem_perm.deleted) {
+                       err = -EIDRM;
+                       goto out_unlock;
+               }
                if(nsems > SEMMSL_FAST) {
                        if (!ipc_rcu_getref(sma)) {
-                               sem_unlock(sma, -1);
-                               rcu_read_unlock();
                                err = -EIDRM;
-                               goto out_free;
+                               goto out_unlock;
                        }
                        sem_unlock(sma, -1);
                        rcu_read_unlock();
@@ -1354,10 +1362,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
                        rcu_read_lock();
                        sem_lock_and_putref(sma);
                        if (sma->sem_perm.deleted) {
-                               sem_unlock(sma, -1);
-                               rcu_read_unlock();
                                err = -EIDRM;
-                               goto out_free;
+                               goto out_unlock;
                        }
                }
                for (i = 0; i < sma->sem_nsems; i++)
@@ -1375,8 +1381,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
                struct sem_undo *un;
 
                if (!ipc_rcu_getref(sma)) {
-                       rcu_read_unlock();
-                       return -EIDRM;
+                       err = -EIDRM;
+                       goto out_rcu_wakeup;
                }
                rcu_read_unlock();
 
@@ -1404,10 +1410,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
                rcu_read_lock();
                sem_lock_and_putref(sma);
                if (sma->sem_perm.deleted) {
-                       sem_unlock(sma, -1);
-                       rcu_read_unlock();
                        err = -EIDRM;
-                       goto out_free;
+                       goto out_unlock;
                }
 
                for (i = 0; i < nsems; i++)
@@ -1431,6 +1435,10 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
                goto out_rcu_wakeup;
 
        sem_lock(sma, NULL, -1);
+       if (sma->sem_perm.deleted) {
+               err = -EIDRM;
+               goto out_unlock;
+       }
        curr = &sma->sem_base[semnum];
 
        switch (cmd) {
@@ -1836,6 +1844,10 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
        if (error)
                goto out_rcu_wakeup;
 
+       error = -EIDRM;
+       locknum = sem_lock(sma, sops, nsops);
+       if (sma->sem_perm.deleted)
+               goto out_unlock_free;
        /*
         * semid identifiers are not unique - find_alloc_undo may have
         * allocated an undo structure, it was invalidated by an RMID
@@ -1843,8 +1855,6 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
         * This case can be detected checking un->semid. The existence of
         * "un" itself is guaranteed by rcu.
         */
-       error = -EIDRM;
-       locknum = sem_lock(sma, sops, nsops);
        if (un && un->semid == -1)
                goto out_unlock_free;
 
@@ -2057,6 +2067,12 @@ void exit_sem(struct task_struct *tsk)
                }
 
                sem_lock(sma, NULL, -1);
+               /* exit_sem raced with IPC_RMID, nothing to do */
+               if (sma->sem_perm.deleted) {
+                       sem_unlock(sma, -1);
+                       rcu_read_unlock();
+                       continue;
+               }
                un = __lookup_undo(ulp, semid);
                if (un == NULL) {
                        /* exit_sem raced with IPC_RMID+semget() that created
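
Seen from user space, the sem_perm.deleted checks added above mean that an operation racing with IPC_RMID fails cleanly with EIDRM instead of running against a destroyed set. A small stand-alone illustration (not from the patch): the parent blocks in semop() while the child removes the set.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
        int semid = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
        if (semid < 0) {
                perror("semget");
                return 1;
        }

        if (fork() == 0) {
                sleep(1);
                semctl(semid, 0, IPC_RMID);     /* remove the set while the parent is blocked */
                return 0;
        }

        struct sembuf op = { .sem_num = 0, .sem_op = -1, .sem_flg = 0 };
        if (semop(semid, &op, 1) < 0)           /* blocks: the semaphore value is 0 */
                printf("semop: %s\n", strerror(errno));  /* typically "Identifier removed" (EIDRM) */
        return 0;
}
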
index fdb8ae7..7684f41 100644
--- a/ipc/util.c
+++ b/ipc/util.c
  *            Pavel Emelianov <xemul@openvz.org>
  *
  * General sysv ipc locking scheme:
- *  when doing ipc id lookups, take the ids->rwsem
- *      rcu_read_lock()
- *          obtain the ipc object (kern_ipc_perm)
- *          perform security, capabilities, auditing and permission checks, etc.
- *          acquire the ipc lock (kern_ipc_perm.lock) throught ipc_lock_object()
- *             perform data updates (ie: SET, RMID, LOCK/UNLOCK commands)
+ *     rcu_read_lock()
+ *          obtain the ipc object (kern_ipc_perm) by looking up the id in an idr
+ *         tree.
+ *         - perform initial checks (capabilities, auditing and permission,
+ *           etc).
+ *         - perform read-only operations, such as STAT, INFO commands.
+ *           acquire the ipc lock (kern_ipc_perm.lock) through
+ *           ipc_lock_object()
+ *             - perform data updates, such as SET, RMID commands and
+ *               mechanism-specific operations (semop/semtimedop,
+ *               msgsnd/msgrcv, shmat/shmdt).
+ *         drop the ipc lock, through ipc_unlock_object().
+ *     rcu_read_unlock()
+ *
+ *  The ids->rwsem must be taken when:
+ *     - creating, removing and iterating the existing entries in ipc
+ *       identifier sets.
+ *     - iterating through files under /proc/sysvipc/
+ *
+ *  Note that sems have a special fast path that avoids kern_ipc_perm.lock -
+ *  see sem_lock().
  */
 
 #include <linux/mm.h>
index 7deeb62..1a53d49 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -53,6 +53,7 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release)
        ref->release = release;
        return 0;
 }
+EXPORT_SYMBOL_GPL(percpu_ref_init);
 
 /**
  * percpu_ref_cancel_init - cancel percpu_ref_init()
@@ -84,6 +85,7 @@ void percpu_ref_cancel_init(struct percpu_ref *ref)
                free_percpu(ref->pcpu_count);
        }
 }
+EXPORT_SYMBOL_GPL(percpu_ref_cancel_init);
 
 static void percpu_ref_kill_rcu(struct rcu_head *rcu)
 {
@@ -156,3 +158,4 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
 
        call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu);
 }
+EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
index 1e6aec4..ae4846f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1616,7 +1616,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        struct inode *inode = mapping->host;
        pgoff_t offset = vmf->pgoff;
        struct page *page;
-       bool memcg_oom;
        pgoff_t size;
        int ret = 0;
 
@@ -1625,11 +1624,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                return VM_FAULT_SIGBUS;
 
        /*
-        * Do we have something in the page cache already?  Either
-        * way, try readahead, but disable the memcg OOM killer for it
-        * as readahead is optional and no errors are propagated up
-        * the fault stack.  The OOM killer is enabled while trying to
-        * instantiate the faulting page individually below.
+        * Do we have something in the page cache already?
         */
        page = find_get_page(mapping, offset);
        if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
@@ -1637,14 +1632,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                 * We found the page, so try async readahead before
                 * waiting for the lock.
                 */
-               memcg_oom = mem_cgroup_toggle_oom(false);
                do_async_mmap_readahead(vma, ra, file, page, offset);
-               mem_cgroup_toggle_oom(memcg_oom);
        } else if (!page) {
                /* No page in the page cache at all */
-               memcg_oom = mem_cgroup_toggle_oom(false);
                do_sync_mmap_readahead(vma, ra, file, offset);
-               mem_cgroup_toggle_oom(memcg_oom);
                count_vm_event(PGMAJFAULT);
                mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
                ret = VM_FAULT_MAJOR;
index 7489884..610e3df 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2697,6 +2697,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
 
        mmun_start = haddr;
        mmun_end   = haddr + HPAGE_PMD_SIZE;
+again:
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_trans_huge(*pmd))) {
@@ -2719,7 +2720,14 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
        split_huge_page(page);
 
        put_page(page);
-       BUG_ON(pmd_trans_huge(*pmd));
+
+       /*
+        * We don't always have down_write of mmap_sem here: a racing
+        * do_huge_pmd_wp_page() might have copied-on-write to another
+        * huge page before our split_huge_page() got the anon_vma lock.
+        */
+       if (unlikely(pmd_trans_huge(*pmd)))
+               goto again;
 }
 
 void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
index b49579c..0b7656e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -653,6 +653,7 @@ static void free_huge_page(struct page *page)
        BUG_ON(page_count(page));
        BUG_ON(page_mapcount(page));
        restore_reserve = PagePrivate(page);
+       ClearPagePrivate(page);
 
        spin_lock(&hugetlb_lock);
        hugetlb_cgroup_uncharge_page(hstate_index(h),
@@ -695,8 +696,22 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
        /* we rely on prep_new_huge_page to set the destructor */
        set_compound_order(page, order);
        __SetPageHead(page);
+       __ClearPageReserved(page);
        for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
                __SetPageTail(p);
+               /*
+                * For gigantic hugepages allocated through bootmem at
+                * boot, it's safer to be consistent with the not-gigantic
+                * hugepages and clear the PG_reserved bit from all tail pages
+                * too.  Otherwise drivers using get_user_pages() to access tail
+                * pages may get the reference counting wrong if they see
+                * PG_reserved set on a tail page (despite the head page not
+                * having PG_reserved set).  Enforcing this consistency between
+                * head and tail pages allows drivers to optimize away a check
+                * on the head page when they need to know if put_page() is needed
+                * after get_user_pages().
+                */
+               __ClearPageReserved(p);
                set_page_count(p, 0);
                p->first_page = page;
        }
@@ -1329,9 +1344,9 @@ static void __init gather_bootmem_prealloc(void)
 #else
                page = virt_to_page(m);
 #endif
-               __ClearPageReserved(page);
                WARN_ON(page_count(page) != 1);
                prep_compound_huge_page(page, h->order);
+               WARN_ON(PageReserved(page));
                prep_new_huge_page(h, page, page_to_nid(page));
                /*
                 * If we had gigantic hugepages allocated at boot time, we need
index 1c52ddb..34d3ca9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -866,6 +866,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
        unsigned long val = 0;
        int cpu;
 
+       get_online_cpus();
        for_each_online_cpu(cpu)
                val += per_cpu(memcg->stat->events[idx], cpu);
 #ifdef CONFIG_HOTPLUG_CPU
@@ -873,6 +874,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
        val += memcg->nocpu_base.events[idx];
        spin_unlock(&memcg->pcp_counter_lock);
 #endif
+       put_online_cpus();
        return val;
 }
 
@@ -2159,110 +2161,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
                memcg_wakeup_oom(memcg);
 }
 
-/*
- * try to call OOM killer
- */
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-       bool locked;
-       int wakeups;
-
        if (!current->memcg_oom.may_oom)
                return;
-
-       current->memcg_oom.in_memcg_oom = 1;
-
        /*
-        * As with any blocking lock, a contender needs to start
-        * listening for wakeups before attempting the trylock,
-        * otherwise it can miss the wakeup from the unlock and sleep
-        * indefinitely.  This is just open-coded because our locking
-        * is so particular to memcg hierarchies.
+        * We are in the middle of the charge context here, so we
+        * don't want to block when potentially sitting on a callstack
+        * that holds all kinds of filesystem and mm locks.
+        *
+        * Also, the caller may handle a failed allocation gracefully
+        * (like optional page cache readahead) and so an OOM killer
+        * invocation might not even be necessary.
+        *
+        * That's why we don't do anything here except remember the
+        * OOM context and then deal with it at the end of the page
+        * fault when the stack is unwound, the locks are released,
+        * and when we know whether the fault was overall successful.
         */
-       wakeups = atomic_read(&memcg->oom_wakeups);
-       mem_cgroup_mark_under_oom(memcg);
-
-       locked = mem_cgroup_oom_trylock(memcg);
-
-       if (locked)
-               mem_cgroup_oom_notify(memcg);
-
-       if (locked && !memcg->oom_kill_disable) {
-               mem_cgroup_unmark_under_oom(memcg);
-               mem_cgroup_out_of_memory(memcg, mask, order);
-               mem_cgroup_oom_unlock(memcg);
-               /*
-                * There is no guarantee that an OOM-lock contender
-                * sees the wakeups triggered by the OOM kill
-                * uncharges.  Wake any sleepers explicitely.
-                */
-               memcg_oom_recover(memcg);
-       } else {
-               /*
-                * A system call can just return -ENOMEM, but if this
-                * is a page fault and somebody else is handling the
-                * OOM already, we need to sleep on the OOM waitqueue
-                * for this memcg until the situation is resolved.
-                * Which can take some time because it might be
-                * handled by a userspace task.
-                *
-                * However, this is the charge context, which means
-                * that we may sit on a large call stack and hold
-                * various filesystem locks, the mmap_sem etc. and we
-                * don't want the OOM handler to deadlock on them
-                * while we sit here and wait.  Store the current OOM
-                * context in the task_struct, then return -ENOMEM.
-                * At the end of the page fault handler, with the
-                * stack unwound, pagefault_out_of_memory() will check
-                * back with us by calling
-                * mem_cgroup_oom_synchronize(), possibly putting the
-                * task to sleep.
-                */
-               current->memcg_oom.oom_locked = locked;
-               current->memcg_oom.wakeups = wakeups;
-               css_get(&memcg->css);
-               current->memcg_oom.wait_on_memcg = memcg;
-       }
+       css_get(&memcg->css);
+       current->memcg_oom.memcg = memcg;
+       current->memcg_oom.gfp_mask = mask;
+       current->memcg_oom.order = order;
 }
 
 /**
  * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
  *
- * This has to be called at the end of a page fault if the the memcg
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
  *
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
  * sleep on a waitqueue until the userspace task resolves the
  * situation.  Sleeping directly in the charge context with all kinds
  * of locks held is not a good idea, instead we remember an OOM state
  * in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
- * OOM state.
+ * the end of the page fault to complete the OOM handling.
  *
  * Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
  */
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
 {
+       struct mem_cgroup *memcg = current->memcg_oom.memcg;
        struct oom_wait_info owait;
-       struct mem_cgroup *memcg;
+       bool locked;
 
        /* OOM is global, do not handle */
-       if (!current->memcg_oom.in_memcg_oom)
-               return false;
-
-       /*
-        * We invoked the OOM killer but there is a chance that a kill
-        * did not free up any charges.  Everybody else might already
-        * be sleeping, so restart the fault and keep the rampage
-        * going until some charges are released.
-        */
-       memcg = current->memcg_oom.wait_on_memcg;
        if (!memcg)
-               goto out;
+               return false;
 
-       if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-               goto out_memcg;
+       if (!handle)
+               goto cleanup;
 
        owait.memcg = memcg;
        owait.wait.flags = 0;
@@ -2271,13 +2222,25 @@ bool mem_cgroup_oom_synchronize(void)
        INIT_LIST_HEAD(&owait.wait.task_list);
 
        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-       /* Only sleep if we didn't miss any wakeups since OOM */
-       if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+       mem_cgroup_mark_under_oom(memcg);
+
+       locked = mem_cgroup_oom_trylock(memcg);
+
+       if (locked)
+               mem_cgroup_oom_notify(memcg);
+
+       if (locked && !memcg->oom_kill_disable) {
+               mem_cgroup_unmark_under_oom(memcg);
+               finish_wait(&memcg_oom_waitq, &owait.wait);
+               mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+                                        current->memcg_oom.order);
+       } else {
                schedule();
-       finish_wait(&memcg_oom_waitq, &owait.wait);
-out_memcg:
-       mem_cgroup_unmark_under_oom(memcg);
-       if (current->memcg_oom.oom_locked) {
+               mem_cgroup_unmark_under_oom(memcg);
+               finish_wait(&memcg_oom_waitq, &owait.wait);
+       }
+
+       if (locked) {
                mem_cgroup_oom_unlock(memcg);
                /*
                 * There is no guarantee that an OOM-lock contender
@@ -2286,10 +2249,9 @@ out_memcg:
                 */
                memcg_oom_recover(memcg);
        }
+cleanup:
+       current->memcg_oom.memcg = NULL;
        css_put(&memcg->css);
-       current->memcg_oom.wait_on_memcg = NULL;
-out:
-       current->memcg_oom.in_memcg_oom = 0;
        return true;
 }
 
@@ -2703,6 +2665,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                     || fatal_signal_pending(current)))
                goto bypass;
 
+       if (unlikely(task_in_memcg_oom(current)))
+               goto bypass;
+
        /*
         * We always charge the cgroup the mm_struct belongs to.
         * The mm_struct's mem_cgroup changes on task migration if the
@@ -2801,6 +2766,8 @@ done:
        return 0;
 nomem:
        *ptr = NULL;
+       if (gfp_mask & __GFP_NOFAIL)
+               return 0;
        return -ENOMEM;
 bypass:
        *ptr = root_mem_cgroup;
index ca00039..1311f26 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -837,6 +837,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                                         */
                                        make_migration_entry_read(&entry);
                                        pte = swp_entry_to_pte(entry);
+                                       if (pte_swp_soft_dirty(*src_pte))
+                                               pte = pte_swp_mksoft_dirty(pte);
                                        set_pte_at(src_mm, addr, src_pte, pte);
                                }
                        }
@@ -3863,15 +3865,21 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * space.  Kernel faults are handled more gracefully.
         */
        if (flags & FAULT_FLAG_USER)
-               mem_cgroup_enable_oom();
+               mem_cgroup_oom_enable();
 
        ret = __handle_mm_fault(mm, vma, address, flags);
 
-       if (flags & FAULT_FLAG_USER)
-               mem_cgroup_disable_oom();
-
-       if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
-               mem_cgroup_oom_synchronize();
+       if (flags & FAULT_FLAG_USER) {
+               mem_cgroup_oom_disable();
+               /*
+                * The task may have entered a memcg OOM situation but
+                * if the allocation error was handled gracefully (no
+                * VM_FAULT_OOM), there is no need to kill anything.
+                * Just clean up the OOM state peacefully.
+                */
+               if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+                       mem_cgroup_oom_synchronize(false);
+       }
 
        return ret;
 }
index a26bccd..7a7325e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -161,6 +161,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 
        get_page(new);
        pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
+       if (pte_swp_soft_dirty(*ptep))
+               pte = pte_mksoft_dirty(pte);
        if (is_write_migration_entry(entry))
                pte = pte_mkwrite(pte);
 #ifdef CONFIG_HUGETLB_PAGE
index 94722a4..a3af058 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -94,13 +94,16 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                        swp_entry_t entry = pte_to_swp_entry(oldpte);
 
                        if (is_write_migration_entry(entry)) {
+                               pte_t newpte;
                                /*
                                 * A protection check is difficult so
                                 * just be safe and disable write
                                 */
                                make_migration_entry_read(&entry);
-                               set_pte_at(mm, addr, pte,
-                                       swp_entry_to_pte(entry));
+                               newpte = swp_entry_to_pte(entry);
+                               if (pte_swp_soft_dirty(oldpte))
+                                       newpte = pte_swp_mksoft_dirty(newpte);
+                               set_pte_at(mm, addr, pte, newpte);
                        }
                        pages++;
                }
index 91b13d6..0843feb 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -25,7 +25,6 @@
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
 
 #include "internal.h"
 
@@ -63,10 +62,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                return NULL;
 
        pmd = pmd_alloc(mm, pud, addr);
-       if (!pmd) {
-               pud_free(mm, pud);
+       if (!pmd)
                return NULL;
-       }
 
        VM_BUG_ON(pmd_trans_huge(*pmd));
 
index 314e9d2..6738c47 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -680,7 +680,7 @@ void pagefault_out_of_memory(void)
 {
        struct zonelist *zonelist;
 
-       if (mem_cgroup_oom_synchronize())
+       if (mem_cgroup_oom_synchronize(true))
                return;
 
        zonelist = node_zonelist(first_online_node, GFP_KERNEL);
index f5236f8..6380758 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1210,11 +1210,11 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
        return 1;
 }
 
-static long bdi_max_pause(struct backing_dev_info *bdi,
-                         unsigned long bdi_dirty)
+static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
+                                  unsigned long bdi_dirty)
 {
-       long bw = bdi->avg_write_bandwidth;
-       long t;
+       unsigned long bw = bdi->avg_write_bandwidth;
+       unsigned long t;
 
        /*
         * Limit pause time for small memory systems. If sleeping for too long
@@ -1226,7 +1226,7 @@ static long bdi_max_pause(struct backing_dev_info *bdi,
        t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
        t++;
 
-       return min_t(long, t, MAX_PAUSE);
+       return min_t(unsigned long, t, MAX_PAUSE);
 }
 
 static long bdi_min_pause(struct backing_dev_info *bdi,
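
Why the long to unsigned long switch matters: with signed types, a very large dirty count goes negative in the intermediate arithmetic and min_t() then returns that negative value as the maximum pause. A hypothetical stand-alone illustration with made-up numbers; MAX_PAUSE here merely stands in for the kernel constant:

#include <limits.h>
#include <stdio.h>

#define MAX_PAUSE 200L          /* stand-in for the kernel's MAX_PAUSE */

int main(void)
{
        unsigned long bdi_dirty = ULONG_MAX - 100;      /* pathologically large dirty count */
        long bw = 100;

        long t_signed = (long)bdi_dirty / (1 + bw);                     /* wraps negative */
        unsigned long t_unsigned = bdi_dirty / (1 + (unsigned long)bw);

        printf("signed   max pause: %ld\n",
               t_signed < MAX_PAUSE ? t_signed : MAX_PAUSE);            /* negative */
        printf("unsigned max pause: %lu\n",
               t_unsigned < (unsigned long)MAX_PAUSE ?
                        t_unsigned : (unsigned long)MAX_PAUSE);         /* clamped to 200 */
        return 0;
}
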
index 3963fc2..de7c904 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1824,6 +1824,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        struct filename *pathname;
        int i, type, prev;
        int err;
+       unsigned int old_block_size;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
@@ -1914,6 +1915,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        }
 
        swap_file = p->swap_file;
+       old_block_size = p->old_block_size;
        p->swap_file = NULL;
        p->max = 0;
        swap_map = p->swap_map;
@@ -1938,7 +1940,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        inode = mapping->host;
        if (S_ISBLK(inode->i_mode)) {
                struct block_device *bdev = I_BDEV(inode);
-               set_blocksize(bdev, p->old_block_size);
+               set_blocksize(bdev, old_block_size);
                blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
        } else {
                mutex_lock(&inode->i_mutex);
index 53f2f82..eea668d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -211,6 +211,7 @@ void unregister_shrinker(struct shrinker *shrinker)
        down_write(&shrinker_rwsem);
        list_del(&shrinker->list);
        up_write(&shrinker_rwsem);
+       kfree(shrinker->nr_deferred);
 }
 EXPORT_SYMBOL(unregister_shrinker);
 
index 841e35f..d93510c 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -804,6 +804,10 @@ static void zswap_frontswap_invalidate_area(unsigned type)
        }
        tree->rbroot = RB_ROOT;
        spin_unlock(&tree->lock);
+
+       zbud_destroy_pool(tree->pool);
+       kfree(tree);
+       zswap_trees[type] = NULL;
 }
 
 static struct zbud_ops zswap_zbud_ops = {
index 4fa655d..41bd855 100644
--- a/tools/testing/selftests/timers/posix_timers.c
+++ b/tools/testing/selftests/timers/posix_timers.c
@@ -151,7 +151,7 @@ static int check_timer_create(int which)
        fflush(stdout);
 
        done = 0;
-       timer_create(which, NULL, &id);
+       err = timer_create(which, NULL, &id);
        if (err < 0) {
                perror("Can't create timer\n");
                return -1;
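
For completeness, the convention the selftest now respects: timer_create() returns 0 on success and -1 on error, so the return value has to be captured before it is tested. A minimal checked usage outside the selftest (link with -lrt on older glibc):

#include <stdio.h>
#include <time.h>

int main(void)
{
        timer_t id;
        int err;

        /* NULL sigevent: default notification, SIGALRM on expiry */
        err = timer_create(CLOCK_MONOTONIC, NULL, &id);
        if (err < 0) {
                perror("timer_create");
                return 1;
        }

        timer_delete(id);
        return 0;
}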