Merge branch 'common/mmcif' into rmobile-latest
author Paul Mundt <lethal@linux-sh.org>
Fri, 14 Jan 2011 07:06:31 +0000 (16:06 +0900)
committer Paul Mundt <lethal@linux-sh.org>
Fri, 14 Jan 2011 07:06:31 +0000 (16:06 +0900)
181 files changed:
Documentation/cgroups/memory.txt
Documentation/device-mapper/dm-crypt.txt
Documentation/device-mapper/dm-raid.txt [new file with mode: 0644]
Documentation/filesystems/proc.txt
Documentation/gpio.txt
Documentation/vm/transhuge.txt [new file with mode: 0644]
MAINTAINERS
arch/alpha/include/asm/mman.h
arch/arm/kernel/module.c
arch/arm/mm/pgd.c
arch/avr32/boards/atngw100/setup.c
arch/avr32/boards/atstk1000/atstk1002.c
arch/avr32/boards/favr-32/setup.c
arch/avr32/boards/hammerhead/setup.c
arch/avr32/boards/merisc/setup.c
arch/avr32/boards/mimc200/setup.c
arch/avr32/configs/atngw100_defconfig
arch/avr32/configs/atngw100_evklcd100_defconfig
arch/avr32/configs/atngw100_evklcd101_defconfig
arch/avr32/configs/atngw100mkii_defconfig
arch/avr32/configs/atngw100mkii_evklcd100_defconfig
arch/avr32/configs/atngw100mkii_evklcd101_defconfig
arch/avr32/configs/atstk1002_defconfig
arch/avr32/configs/atstk1003_defconfig
arch/avr32/configs/atstk1004_defconfig
arch/avr32/configs/atstk1006_defconfig
arch/avr32/configs/favr-32_defconfig
arch/avr32/configs/hammerhead_defconfig
arch/avr32/include/asm/syscalls.h
arch/avr32/kernel/process.c
arch/avr32/kernel/time.c
arch/ia64/kernel/perfmon.c
arch/ia64/mm/hugetlbpage.c
arch/mips/include/asm/mman.h
arch/mips/kernel/module.c
arch/parisc/include/asm/mman.h
arch/powerpc/mm/gup.c
arch/sh/mm/hugetlbpage.c
arch/sparc/kernel/module.c
arch/sparc/mm/generic_32.c
arch/sparc/mm/generic_64.c
arch/sparc/mm/hugetlbpage.c
arch/um/kernel/skas/mmu.c
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/paravirt.h
arch/x86/include/asm/paravirt_types.h
arch/x86/include/asm/pgtable-2level.h
arch/x86/include/asm/pgtable-3level.h
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_64.h
arch/x86/include/asm/pgtable_types.h
arch/x86/include/asm/xen/page.h
arch/x86/kernel/module.c
arch/x86/kernel/paravirt.c
arch/x86/kernel/tboot.c
arch/x86/kernel/vm86_32.c
arch/x86/kvm/mmu.c
arch/x86/kvm/paging_tmpl.h
arch/x86/mm/gup.c
arch/x86/mm/pgtable.c
arch/x86/xen/Makefile
arch/x86/xen/mmu.c
arch/x86/xen/p2m.c [new file with mode: 0644]
arch/xtensa/include/asm/mman.h
drivers/base/node.c
drivers/md/Kconfig
drivers/md/Makefile
drivers/md/bitmap.c
drivers/md/dm-crypt.c
drivers/md/dm-delay.c
drivers/md/dm-ioctl.c
drivers/md/dm-kcopyd.c
drivers/md/dm-log-userspace-base.c
drivers/md/dm-log-userspace-transfer.c
drivers/md/dm-log.c
drivers/md/dm-mpath.c
drivers/md/dm-raid.c [new file with mode: 0644]
drivers/md/dm-raid1.c
drivers/md/dm-snap-persistent.c
drivers/md/dm-snap.c
drivers/md/dm-stripe.c
drivers/md/dm-table.c
drivers/md/dm.c
drivers/md/md.c
drivers/md/md.h
drivers/md/raid1.c
drivers/md/raid10.c
drivers/md/raid5.c
drivers/serial/atmel_serial.c
drivers/xen/Kconfig
drivers/xen/Makefile
drivers/xen/gntdev.c [new file with mode: 0644]
drivers/xen/grant-table.c
drivers/xen/platform-pci.c
fs/ecryptfs/main.c
fs/fs-writeback.c
fs/mpage.c
fs/nfs/dir.c
fs/proc/base.c
fs/proc/meminfo.c
fs/proc/page.c
fs/proc/task_mmu.c
include/asm-generic/gpio.h
include/asm-generic/mman-common.h
include/asm-generic/pgtable.h
include/linux/compaction.h
include/linux/device-mapper.h
include/linux/dm-ioctl.h
include/linux/dm-log-userspace.h
include/linux/gfp.h
include/linux/gpio.h
include/linux/huge_mm.h [new file with mode: 0644]
include/linux/irqdesc.h
include/linux/kernel.h
include/linux/kernel_stat.h
include/linux/khugepaged.h [new file with mode: 0644]
include/linux/memcontrol.h
include/linux/memory_hotplug.h
include/linux/migrate.h
include/linux/mm.h
include/linux/mm_inline.h
include/linux/mm_types.h
include/linux/mmc/sh_mmcif.h
include/linux/mmu_notifier.h
include/linux/mmzone.h
include/linux/page-flags.h
include/linux/page_cgroup.h
include/linux/pagemap.h
include/linux/radix-tree.h
include/linux/rmap.h
include/linux/sched.h
include/linux/swap.h
include/linux/vmalloc.h
include/linux/vmstat.h
include/trace/events/compaction.h [new file with mode: 0644]
include/trace/events/vmscan.h
include/trace/events/writeback.h
include/xen/gntdev.h [new file with mode: 0644]
include/xen/grant_table.h
kernel/fork.c
kernel/futex.c
kernel/irq/irqdesc.c
mm/Kconfig
mm/Makefile
mm/compaction.c
mm/dmapool.c
mm/filemap.c
mm/huge_memory.c [new file with mode: 0644]
mm/hugetlb.c
mm/internal.h
mm/ksm.c
mm/madvise.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/migrate.c
mm/mincore.c
mm/mlock.c
mm/mmap.c
mm/mmu_notifier.c
mm/mmzone.c
mm/mprotect.c
mm/mremap.c
mm/nommu.c
mm/page-writeback.c
mm/page_alloc.c
mm/pagewalk.c
mm/percpu-vm.c
mm/pgtable-generic.c [new file with mode: 0644]
mm/rmap.c
mm/slub.c
mm/sparse.c
mm/swap.c
mm/swap_state.c
mm/swapfile.c
mm/vmalloc.c
mm/vmscan.c
mm/vmstat.c
virt/kvm/kvm_main.c

index 7781857..bac328c 100644 (file)
@@ -385,6 +385,10 @@ mapped_file        - # of bytes of mapped file (includes tmpfs/shmem)
 pgpgin         - # of pages paged in (equivalent to # of charging events).
 pgpgout                - # of pages paged out (equivalent to # of uncharging events).
 swap           - # of bytes of swap usage
+dirty          - # of bytes that are waiting to get written back to the disk.
+writeback      - # of bytes that are actively being written back to the disk.
+nfs_unstable   - # of bytes sent to the NFS server, but not yet committed to
+               the actual storage.
 inactive_anon  - # of bytes of anonymous memory and swap cache memory on
                LRU list.
 active_anon    - # of bytes of anonymous and swap cache memory on active
@@ -406,6 +410,9 @@ total_mapped_file   - sum of all children's "cache"
 total_pgpgin           - sum of all children's "pgpgin"
 total_pgpgout          - sum of all children's "pgpgout"
 total_swap             - sum of all children's "swap"
+total_dirty            - sum of all children's "dirty"
+total_writeback                - sum of all children's "writeback"
+total_nfs_unstable     - sum of all children's "nfs_unstable"
 total_inactive_anon    - sum of all children's "inactive_anon"
 total_active_anon      - sum of all children's "active_anon"
 total_inactive_file    - sum of all children's "inactive_file"
@@ -453,6 +460,73 @@ memory under it will be reclaimed.
 You can reset failcnt by writing 0 to failcnt file.
 # echo 0 > .../memory.failcnt
 
+5.5 dirty memory
+
+Control the maximum amount of dirty pages a cgroup can have at any given time.
+
+Limiting dirty memory caps the amount of dirty (hard to reclaim) page cache
+used by a cgroup.  So, in the case of multiple cgroup writers, no cgroup will
+be able to consume more than its designated share of dirty pages, and each
+will be forced to perform write-out if it crosses that limit.
+
+The interface is equivalent to the procfs interface: /proc/sys/vm/dirty_*.  It
+is possible to configure a limit to trigger either a direct write-out or a
+background writeback performed by per-bdi flusher threads.  The root cgroup
+memory.dirty_* control files are read-only and match the contents of
+the /proc/sys/vm/dirty_* files.
+
+Per-cgroup dirty limits can be set using the following files in the cgroupfs:
+
+- memory.dirty_ratio: the amount of dirty memory (expressed as a percentage of
+  cgroup memory) at which a process generating dirty pages will itself start
+  writing out dirty data.
+
+- memory.dirty_limit_in_bytes: the amount of dirty memory (expressed in bytes)
+  in the cgroup at which a process generating dirty pages will itself start
+  writing out dirty data.  A suffix (k, K, m, M, g, or G) can be used to
+  indicate that the value is in kilobytes, megabytes or gigabytes.
+
+  Note: memory.dirty_limit_in_bytes is the counterpart of memory.dirty_ratio.
+  Only one of them may be specified at a time.  When one is written it is
+  immediately taken into account to evaluate the dirty memory limits and the
+  other appears as 0 when read.
+
+- memory.dirty_background_ratio: the amount of dirty memory of the cgroup
+  (expressed as a percentage of cgroup memory) at which background writeback
+  kernel threads will start writing out dirty data.
+
+- memory.dirty_background_limit_in_bytes: the amount of dirty memory (expressed
+  in bytes) in the cgroup at which background writeback kernel threads will
+  start writing out dirty data.  A suffix (k, K, m, M, g, or G) can be used
+  to indicate that the value is in kilobytes, megabytes or gigabytes.
+
+  Note: memory.dirty_background_limit_in_bytes is the counterpart of
+  memory.dirty_background_ratio.  Only one of them may be specified at a time.
+  When one is written it is immediately taken into account to evaluate the dirty
+  memory limits and the other appears as 0 when read.
+
+A cgroup may contain more dirty memory than its dirty limit.  This is possible
+because of the principle that the first cgroup to touch a page is charged for
+it.  Subsequent page counting events (dirty, writeback, nfs_unstable) are also
+attributed to the originally charged cgroup.
+
+Example: If a page is allocated by a cgroup A task, then the page is charged to
+cgroup A.  If the page is later dirtied by a task in cgroup B, then the cgroup A
+dirty count will be incremented.  If cgroup A is over its dirty limit but cgroup
+B is not, then dirtying a cgroup A page from a cgroup B task may push cgroup A
+over its dirty limit without throttling the dirtying cgroup B task.
+
+When use_hierarchy=0, each cgroup has its own dirty memory usage and limits.
+System-wide dirty limits are also consulted.  Dirty memory consumption is
+checked against both system-wide and per-cgroup dirty limits.
+
+The current implementation does not enforce per-cgroup dirty limits when
+use_hierarchy=1.  System-wide dirty limits are used for processes in such
+cgroups.  Attempts to read memory.dirty_* files return the system-wide
+values.  Writes to the memory.dirty_* files return an error.  An enhanced
+implementation is needed to check the chain of parents to ensure that no
+dirty limit is exceeded.
+
 6. Hierarchy support
 
 The memory controller supports a deep hierarchy and hierarchical accounting.
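
A minimal usage sketch of the per-cgroup dirty limits added above; the
/cgroups/memory mount point and the "web" cgroup name are assumptions, not
part of the patch:

# cap the cgroup's dirty page cache at 100 MiB and start background
# writeback once 10% of the cgroup's memory is dirty
mkdir /cgroups/memory/web
echo 100M > /cgroups/memory/web/memory.dirty_limit_in_bytes
echo 10 > /cgroups/memory/web/memory.dirty_background_ratio
# dirty_ratio is the counterpart of dirty_limit_in_bytes, so it now reads 0
cat /cgroups/memory/web/memory.dirty_ratio
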
index 524de92..59293ac 100644 (file)
@@ -8,7 +8,7 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset>
 
 <cipher>
     Encryption cipher and an optional IV generation mode.
-    (In format cipher-chainmode-ivopts:ivmode).
+    (In format cipher[:keycount]-chainmode-ivopts:ivmode).
     Examples:
        des
        aes-cbc-essiv:sha256
@@ -20,6 +20,11 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset>
     Key used for encryption. It is encoded as a hexadecimal number.
     You can only use key sizes that are valid for the selected cipher.
 
+<keycount>
+    Multi-key compatibility mode. You can define <keycount> keys and
+    then sectors are encrypted according to their offsets (sector 0 uses key0;
+    sector 1 uses key1 etc.).  <keycount> must be a power of two.
+
 <iv_offset>
     The IV offset is a sector count that is added to the sector number
     before creating the IV.
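
A hedged example of the multi-key mode described above; the device, sizes and
key placeholders are made up, and the layout of the two keys concatenated in
the single <key> field is an assumption based on the compatibility-mode text:

# two-key aes-cbc-essiv mapping over the first 409600 sectors of /dev/sdb;
# sector 0 uses key0, sector 1 uses key1, sector 2 uses key0 again, and so on
dmsetup create multikey_crypt --table \
    "0 409600 crypt aes:2-cbc-essiv:sha256 <key0hex><key1hex> 0 /dev/sdb 0"
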
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
new file mode 100644 (file)
index 0000000..33b6b70
--- /dev/null
@@ -0,0 +1,70 @@
+Device-mapper RAID (dm-raid) is a bridge from DM to MD.  It
+provides a way to use device-mapper interfaces to access the MD RAID
+drivers.
+
+As with all device-mapper targets, the nominal public interfaces are the
+constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO
+and STATUSTYPE_TABLE).  The CTR table looks like the following:
+
+1: <s> <l> raid \
+2:      <raid_type> <#raid_params> <raid_params> \
+3:      <#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>
+
+Line 1 contains the standard first three arguments to any device-mapper
+target - the start, length, and target type fields.  The target type in
+this case is "raid".
+
+Line 2 contains the arguments that define the particular raid
+type/personality/level, the required arguments for that raid type, and
+any optional arguments.  Possible raid types include: raid4, raid5_la,
+raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc.  (raid1 is
+planned for the future.)  The list of required and optional parameters
+is the same for all the current raid types.  The required parameters are
+positional, while the optional parameters are given as key/value pairs.
+The possible parameters are as follows:
+ <chunk_size>           Chunk size in sectors.
+ [[no]sync]             Force/Prevent RAID initialization
+ [rebuild <idx>]        Rebuild the drive indicated by the index
+ [daemon_sleep <ms>]    Time between bitmap daemon work to clear bits
+ [min_recovery_rate <kB/sec/disk>]      Throttle RAID initialization
+ [max_recovery_rate <kB/sec/disk>]      Throttle RAID initialization
+ [max_write_behind <sectors>]           See '-write-behind=' (man mdadm)
+ [stripe_cache <sectors>]               Stripe cache size for higher RAIDs
+
+Line 3 contains the list of devices that compose the array in
+metadata/data device pairs.  If the metadata is stored separately, a '-'
+is given for the metadata device position.  If a drive has failed or is
+missing at creation time, a '-' can be given for both the metadata and
+data drives for a given position.
+
+NB. Currently all metadata devices must be specified as '-'.
+
+Examples:
+# RAID4 - 4 data drives, 1 parity
+# No metadata devices specified to hold superblock/bitmap info
+# Chunk size of 1MiB
+# (Lines separated for easy reading)
+0 1960893648 raid \
+        raid4 1 2048 \
+        5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
+
+# RAID4 - 4 data drives, 1 parity (no metadata devices)
+# Chunk size of 1MiB, force RAID initialization,
+#       min recovery rate at 20 kiB/sec/disk
+0 1960893648 raid \
+        raid4 4 2048 min_recovery_rate 20 sync\
+        5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
+
+Performing a 'dmsetup table' should display the CTR table used to
+construct the mapping (with possible reordering of optional
+parameters).
+
+Performing a 'dmsetup status' will yield information on the state and
+health of the array.  The output is as follows:
+1: <s> <l> raid \
+2:      <raid_type> <#devices> <1 health char for each dev> <resync_ratio>
+
+Line 1 is standard DM output.  Line 2 is best shown by example:
+        0 1960893648 raid raid4 5 AAAAA 2/490221568
+Here we can see the RAID type is raid4, there are 5 devices - all of
+which are 'A'live, and the array's recovery is 2/490221568 complete.
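
A hedged usage sketch, loading the first example table above with dmsetup and
then querying the array (the "my_raid4" mapping name is arbitrary):

echo "0 1960893648 raid raid4 1 2048 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81" | \
        dmsetup create my_raid4
dmsetup table my_raid4    # echoes back the CTR table
dmsetup status my_raid4   # e.g. "0 1960893648 raid raid4 5 AAAAA 2/490221568"
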
index 9471225..23cae65 100644 (file)
@@ -375,6 +375,7 @@ Anonymous:             0 kB
 Swap:                  0 kB
 KernelPageSize:        4 kB
 MMUPageSize:           4 kB
+Locked:              374 kB
 
 The first of these lines shows the same information as is displayed for the
 mapping in /proc/PID/maps.  The remaining lines show the size of the mapping
@@ -670,6 +671,8 @@ varies by architecture and compile options.  The following is from a
 
 > cat /proc/meminfo
 
+The "Locked" indicates whether the mapping is locked in memory or not.
+
 
 MemTotal:     16344972 kB
 MemFree:      13634064 kB
@@ -1320,6 +1323,10 @@ scaled linearly with /proc/<pid>/oom_score_adj.
 Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the
 other with its scaled value.
 
+The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last
+value set by a CAP_SYS_RESOURCE process. To reduce the value any lower
+requires CAP_SYS_RESOURCE.
+
 NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see
 Documentation/feature-removal-schedule.txt.
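
A small sketch of the behaviour described above; the pid 1234 and the value
-500 are arbitrary:

# make this task less attractive to the OOM killer
echo -500 > /proc/1234/oom_score_adj
cat /proc/1234/oom_score_adj
# pushing the value back below the last value written by a CAP_SYS_RESOURCE
# process fails unless the writer also has CAP_SYS_RESOURCE
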
 
index a492d92..792faa3 100644 (file)
@@ -135,7 +135,7 @@ setting up a platform_device using the GPIO, is mark its direction:
        int gpio_direction_input(unsigned gpio);
        int gpio_direction_output(unsigned gpio, int value);
 
-The return value is zero for success, else a negative errno.  It must
+The return value is zero for success, else a negative errno.  It should
 be checked, since the get/set calls don't have error returns and since
 misconfiguration is possible.  You should normally issue these calls from
 a task context.  However, for spinlock-safe GPIOs it's OK to use them
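
A hedged kernel-code sketch of checking the direction calls as recommended
above; the GPIO number, label and function name are made up for illustration:

#include <linux/gpio.h>

#define EXAMPLE_LED_GPIO 42	/* hypothetical GPIO number */

static int example_led_init(void)
{
	int err;

	err = gpio_request(EXAMPLE_LED_GPIO, "example-led");
	if (err)
		return err;

	/* direction setup can fail (e.g. misconfiguration), so check it */
	err = gpio_direction_output(EXAMPLE_LED_GPIO, 0);
	if (err)
		gpio_free(EXAMPLE_LED_GPIO);
	return err;
}
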
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
new file mode 100644 (file)
index 0000000..0924aac
--- /dev/null
@@ -0,0 +1,298 @@
+= Transparent Hugepage Support =
+
+== Objective ==
+
+Performance critical computing applications dealing with large memory
+working sets are already running on top of libhugetlbfs and in turn
+hugetlbfs. Transparent Hugepage Support is an alternative means of
+backing virtual memory with huge pages, one that supports the
+automatic promotion and demotion of page sizes and avoids the
+shortcomings of hugetlbfs.
+
+Currently it only works for anonymous memory mappings but in the
+future it can expand over the pagecache layer starting with tmpfs.
+
+The reason applications are running faster is because of two
+factors. The first factor is almost completely irrelevant and it's not
+of significant interest because it'll also have the downside of
+requiring larger clear-page and copy-page operations in page faults,
+which is a potentially negative effect. The first factor consists of
+taking a single page fault for each 2M virtual region touched by
+userland (reducing the kernel enter/exit frequency by a factor of 512). This
+only matters the first time the memory is accessed for the lifetime of
+a memory mapping. The second long lasting and much more important
+factor will affect all subsequent accesses to the memory for the whole
+runtime of the application. The second factor consists of two
+components: 1) the TLB miss will run faster (especially with
+virtualization using nested pagetables, but almost always also on bare
+metal without virtualization) and 2) a single TLB entry will be
+mapping a much larger amount of virtual memory, in turn reducing the
+number of TLB misses. With virtualization and nested pagetables,
+larger TLB mappings are possible only if both KVM and the Linux guest
+are using hugepages, but a significant speedup already happens if only
+one of the two is using hugepages, simply because the TLB miss is
+going to run faster.
+
+== Design ==
+
+- "graceful fallback": mm components which don't have transparent
+  hugepage knowledge fall back to breaking a transparent hugepage and
+  working on the regular pages and their respective regular pmd/pte
+  mappings
+
+- if a hugepage allocation fails because of memory fragmentation,
+  regular pages should be gracefully allocated instead and mixed in
+  the same vma without any failure or significant delay and without
+  userland noticing
+
+- if some task quits and more hugepages become available (either
+  immediately in the buddy or through the VM), guest physical memory
+  backed by regular pages should be relocated on hugepages
+  automatically (with khugepaged)
+
+- it doesn't require memory reservation and in turn it uses hugepages
+  whenever possible (the only possible reservation here is kernelcore=
+  to keep unmovable pages from fragmenting all the memory, but such a tweak
+  is not specific to transparent hugepage support and it's a generic
+  feature that applies to all dynamic high order allocations in the
+  kernel)
+
+- this initial support only offers the feature in the anonymous memory
+  regions but it'd be ideal to move it to tmpfs and the pagecache
+  later
+
+Transparent Hugepage Support maximizes the usefulness of free memory
+compared to the reservation approach of hugetlbfs by allowing all
+unused memory to be used as cache or other movable (or even unmovable)
+entities. It doesn't require reservation to keep hugepage
+allocation failures from being noticeable from userland. It allows paging
+and all other advanced VM features to be available on the
+hugepages. It requires no modifications for applications to take
+advantage of it.
+
+Applications can however be further optimized to take advantage of
+this feature, just as they have been optimized before to avoid
+a flood of mmap system calls for every malloc(4k). Optimizing userland
+is by far not mandatory, and khugepaged can already take care of long
+lived page allocations even for hugepage unaware applications that
+deal with large amounts of memory.
+
+In certain cases, when hugepages are enabled system wide, an application
+may end up allocating more memory resources. An application may mmap a
+large region but only touch 1 byte of it; in that case a 2M page might
+be allocated instead of a 4k page for no good reason. This is why it's
+possible to disable hugepages system-wide and to only have them inside
+MADV_HUGEPAGE madvise regions.
+
+Embedded systems should enable hugepages only inside madvise regions
+to eliminate any risk of wasting any precious byte of memory and to
+only run faster.
+
+Applications that get a lot of benefit from hugepages and that don't
+risk losing memory by using hugepages should use
+madvise(MADV_HUGEPAGE) on their critical mmapped regions.
+
+== sysfs ==
+
+Transparent Hugepage Support can be entirely disabled (mostly for
+debugging purposes) or only enabled inside MADV_HUGEPAGE regions (to
+avoid the risk of consuming more memory resources) or enabled system
+wide. This can be achieved with one of:
+
+echo always >/sys/kernel/mm/transparent_hugepage/enabled
+echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
+echo never >/sys/kernel/mm/transparent_hugepage/enabled
+
+It's also possible to limit the VM's defrag efforts for generating
+hugepages (when they're not immediately free) to madvise regions only,
+or to never try to defrag memory and simply fall back to regular pages
+unless hugepages are immediately available. Clearly, if we spend CPU
+time to defrag memory, we would expect to gain even more from the fact
+that we use hugepages later instead of regular pages. This isn't always
+guaranteed, but it may be more likely when the allocation is for a
+MADV_HUGEPAGE region.
+
+echo always >/sys/kernel/mm/transparent_hugepage/defrag
+echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
+echo never >/sys/kernel/mm/transparent_hugepage/defrag
+
+khugepaged will be automatically started when
+transparent_hugepage/enabled is set to "always" or "madvise", and it'll
+be automatically shut down if it's set to "never".
+
+khugepaged usually runs at low frequency, so while one may not want to
+invoke defrag algorithms synchronously during page faults, it
+should be worth invoking defrag at least in khugepaged. However, it's
+also possible to disable defrag in khugepaged:
+
+echo yes >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+echo no >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+
+You can also control how many pages khugepaged should scan at each
+pass:
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
+
+and how many milliseconds to wait in khugepaged between each pass (you
+can set this to 0 to run khugepaged at 100% utilization of one core):
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
+
+and how many milliseconds to wait in khugepaged if there's a hugepage
+allocation failure, to throttle the next allocation attempt:
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
+
+The khugepaged progress can be seen in the number of pages collapsed:
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
+
+and in the number of full scans performed (one per pass):
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
+
+== Boot parameter ==
+
+You can change the sysfs boot time defaults of Transparent Hugepage
+Support by passing the parameter "transparent_hugepage=always" or
+"transparent_hugepage=madvise" or "transparent_hugepage=never"
+(without "") to the kernel command line.
+
+== Need for application restart ==
+
+The transparent_hugepage/enabled values only affect future
+behavior. So to make them effective you need to restart any
+application that could have been using hugepages. This also applies to
+the regions registered in khugepaged.
+
+== get_user_pages and follow_page ==
+
+get_user_pages and follow_page, if run on a hugepage, will return the
+head or tail pages as usual (exactly as they would do on
+hugetlbfs). Most gup users will only care about the actual physical
+address of the page and its temporary pinning to release after the I/O
+is complete, so they won't ever notice the fact that the page is huge. But
+if any driver is going to mangle the page structure of a tail
+page (for example, checking page->mapping or other bits that are relevant
+for the head page and not the tail page), it should be updated to
+check the head page instead (while serializing properly against
+split_huge_page() to keep the head and tail pages from disappearing from
+under it; see the futex code for an example of that, and hugetlbfs also
+needed special handling in futex code for similar reasons).
+
+NOTE: these aren't new constraints to the GUP API, and they match the
+same constraints that apply to hugetlbfs too, so any driver capable
+of handling GUP on hugetlbfs will also work fine on transparent
+hugepage backed mappings.
+
+In case you can't handle compound pages if they're returned by
+follow_page, the FOLL_SPLIT bit can be specified as a parameter to
+follow_page, so that it will split the hugepages before returning
+them. Migration, for example, passes FOLL_SPLIT as a parameter to
+follow_page because it's not hugepage aware and in fact it can't work
+at all on hugetlbfs (but it instead works fine on transparent
+hugepages thanks to FOLL_SPLIT). Migration simply can't deal with
+hugepages being returned (as it's not only checking the pfn of the
+page and pinning it during the copy, but it expects to migrate the
+memory in regular page sizes and with regular pte/pmd mappings).
+
+== Optimizing the applications ==
+
+For the kernel to be guaranteed to map a 2M page immediately in any
+memory region, the mmap region has to be naturally hugepage
+aligned. posix_memalign() can provide that guarantee.
+
+== Hugetlbfs ==
+
+You can use hugetlbfs on a kernel that has transparent hugepage
+support enabled just fine as always. No difference can be noted in
+hugetlbfs other than there will be less overall fragmentation. All
+usual features belonging to hugetlbfs are preserved and
+unaffected. libhugetlbfs will also work fine as usual.
+
+== Graceful fallback ==
+
+Code walking pagetables but unaware of huge pmds can simply call
+split_huge_page_pmd(mm, pmd) where the pmd is the one returned by
+pmd_offset. It's trivial to make the code transparent hugepage aware
+by just grepping for "pmd_offset" and adding split_huge_page_pmd where
+missing after pmd_offset returns the pmd. Thanks to the graceful
+fallback design, with a one liner change, you can avoid writing
+hundreds if not thousands of lines of complex code to make your code
+hugepage aware.
+
+If you're not walking pagetables but you run into a physical hugepage
+that you can't handle natively in your code, you can split it by
+calling split_huge_page(page). This is what the Linux VM does before
+it tries to swap out the hugepage, for example.
+
+Example to make mremap.c transparent hugepage aware with a one liner
+change:
+
+diff --git a/mm/mremap.c b/mm/mremap.c
+--- a/mm/mremap.c
++++ b/mm/mremap.c
+@@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
+               return NULL;
+
+       pmd = pmd_offset(pud, addr);
++      split_huge_page_pmd(mm, pmd);
+       if (pmd_none_or_clear_bad(pmd))
+               return NULL;
+
+== Locking in hugepage aware code ==
+
+We want as much code as possible hugepage aware, as calling
+split_huge_page() or split_huge_page_pmd() has a cost.
+
+To make pagetable walks huge pmd aware, all you need to do is call
+pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
+mmap_sem in read (or write) mode to be sure a huge pmd cannot be
+created from under you by khugepaged (khugepaged collapse_huge_page
+takes the mmap_sem in write mode in addition to the anon_vma lock). If
+pmd_trans_huge returns false, you just fall back to the old code
+paths. If instead pmd_trans_huge returns true, you have to take the
+mm->page_table_lock and re-run pmd_trans_huge. Taking the
+page_table_lock will prevent the huge pmd from being converted into a
+regular pmd from under you (split_huge_page can run in parallel to the
+pagetable walk). If the second pmd_trans_huge returns false, you
+should just drop the page_table_lock and fall back to the old code as
+before. Otherwise you should run pmd_trans_splitting on the pmd. If
+pmd_trans_splitting returns true, it means split_huge_page is
+already in the middle of splitting the page, so it's enough to drop
+the page_table_lock, call wait_split_huge_page, and then fall back to
+the old code paths. You are guaranteed that, by the time
+wait_split_huge_page returns, the pmd isn't huge anymore. If
+pmd_trans_splitting returns false, you can proceed to
+process the huge pmd and the hugepage natively. Once finished you can
+drop the page_table_lock.
+
+== compound_lock, get_user_pages and put_page ==
+
+split_huge_page internally has to distribute the refcounts in the head
+page to the tail pages before clearing all PG_head/tail bits from the
+page structures. It can do that easily for refcounts taken by huge pmd
+mappings. But the GUP API as created by hugetlbfs (which returns head
+and tail pages if running get_user_pages on an address backed by any
+hugepage) requires the refcount to be accounted on the tail pages and
+not only on the head pages, if we want to be able to run
+split_huge_page while there are gup pins established on any tail
+page. Not being able to run split_huge_page if there's any gup pin
+on any tail page would mean having to split all hugepages upfront in
+get_user_pages, which is unacceptable, as too many gup users are
+performance critical and they must work natively on hugepages like
+they work natively on hugetlbfs already (hugetlbfs is simpler because
+hugetlbfs pages cannot be split, so there is no requirement to
+account the pins on the tail pages for hugetlbfs). If we didn't
+account the gup refcounts on the tail pages during gup, we wouldn't know
+anymore which tail page is pinned by gup and which is not while we run
+split_huge_page. But we still have to add the gup pin to the head page
+too, to know when we can free the compound page in case it's never
+split during its lifetime. That requires changing not just
+get_page, but put_page as well, so that when put_page runs on a tail
+page (and only on a tail page) it will find its respective head page,
+and then it will decrease the head page refcount in addition to the
+tail page refcount. To obtain a head page reliably and to decrease its
+refcount without race conditions, put_page has to serialize against
+__split_huge_page_refcount using a special per-page lock called
+compound_lock.
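
A hedged C sketch of the locking recipe from "Locking in hugepage aware code"
above; example_walk_pmd(), do_huge_walk() and do_regular_walk() are
hypothetical names, and the caller is assumed to hold mmap_sem:

#include <linux/mm.h>
#include <linux/huge_mm.h>

/* hypothetical helpers, not part of the patch */
static void do_huge_walk(struct mm_struct *mm, pmd_t *pmd, unsigned long addr);
static void do_regular_walk(struct mm_struct *mm, pmd_t *pmd, unsigned long addr);

static void example_walk_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
			     pmd_t *pmd, unsigned long addr)
{
	if (pmd_trans_huge(*pmd)) {
		spin_lock(&mm->page_table_lock);
		if (pmd_trans_huge(*pmd)) {
			if (unlikely(pmd_trans_splitting(*pmd))) {
				/* split_huge_page is splitting it: wait, then
				 * use the regular pte paths */
				spin_unlock(&mm->page_table_lock);
				wait_split_huge_page(vma->anon_vma, pmd);
			} else {
				/* stable huge pmd: handle it natively */
				do_huge_walk(mm, pmd, addr);
				spin_unlock(&mm->page_table_lock);
				return;
			}
		} else {
			/* it became a regular pmd under us */
			spin_unlock(&mm->page_table_lock);
		}
	}
	do_regular_walk(mm, pmd, addr);
}
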
index 3dd5c6f..af656de 100644 (file)
@@ -6592,13 +6592,12 @@ F:      Documentation/i2c/busses/i2c-viapro
 F:     drivers/i2c/busses/i2c-viapro.c
 
 VIA SD/MMC CARD CONTROLLER DRIVER
-M:     Joseph Chan <JosephChan@via.com.tw>
+M:     Bruce Chang <brucechang@via.com.tw>
 M:     Harald Welte <HaraldWelte@viatech.com>
 S:     Maintained
 F:     drivers/mmc/host/via-sdmmc.c
 
 VIA UNICHROME(PRO)/CHROME9 FRAMEBUFFER DRIVER
-M:     Joseph Chan <JosephChan@via.com.tw>
 M:     Florian Tobias Schandinat <FlorianSchandinat@gmx.de>
 L:     linux-fbdev@vger.kernel.org
 S:     Maintained
index 99c56d4..72db984 100644 (file)
@@ -53,6 +53,9 @@
 #define MADV_MERGEABLE   12            /* KSM may merge identical pages */
 #define MADV_UNMERGEABLE 13            /* KSM may not merge identical pages */
 
+#define MADV_HUGEPAGE  14              /* Worth backing with hugepages */
+#define MADV_NOHUGEPAGE        15              /* Not worth backing with hugepages */
+
 /* compatibility flags */
 #define MAP_FILE       0
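
A hedged userspace sketch of the MADV_HUGEPAGE hint added above; the 32 MiB
size is arbitrary, and the fallback define covers libcs that don't export
MADV_HUGEPAGE yet:

#include <stdlib.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14		/* value added by this patch */
#endif

int main(void)
{
	size_t len = 32UL << 20;	/* 32 MiB working set */
	void *buf;

	/* 2M alignment lets the kernel back the region with huge pmds */
	if (posix_memalign(&buf, 2UL << 20, len))
		return 1;
	madvise(buf, len, MADV_HUGEPAGE);	/* worth backing with hugepages */
	/* ... use buf ... */
	free(buf);
	return 0;
}
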
 
index 0c1bb68..2cfe816 100644 (file)
 #ifdef CONFIG_MMU
 void *module_alloc(unsigned long size)
 {
-       struct vm_struct *area;
-
-       size = PAGE_ALIGN(size);
-       if (!size)
-               return NULL;
-
-       area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
-       if (!area)
-               return NULL;
-
-       return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC);
+       return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+                               GFP_KERNEL, PAGE_KERNEL_EXEC, -1,
+                               __builtin_return_address(0));
 }
 #else /* CONFIG_MMU */
 void *module_alloc(unsigned long size)
index 93292a1..709244c 100644 (file)
@@ -50,7 +50,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
                if (!new_pmd)
                        goto no_pmd;
 
-               new_pte = pte_alloc_map(mm, new_pmd, 0);
+               new_pte = pte_alloc_map(mm, NULL, new_pmd, 0);
                if (!new_pte)
                        goto no_pte;
 
index 8c6a244..659d119 100644 (file)
@@ -188,7 +188,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
         */
        regs = (void __iomem __force *)res->start;
        pclk = clk_get(&pdev->dev, "pclk");
-       if (!pclk)
+       if (IS_ERR(pclk))
                return;
 
        clk_enable(pclk);
index 2adc261..6ce30fb 100644 (file)
@@ -203,7 +203,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
         */
        regs = (void __iomem __force *)res->start;
        pclk = clk_get(&pdev->dev, "pclk");
-       if (!pclk)
+       if (IS_ERR(pclk))
                return;
 
        clk_enable(pclk);
index 75f19f4..86fab77 100644 (file)
@@ -206,7 +206,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
         */
        regs = (void __iomem __force *)res->start;
        pclk = clk_get(&pdev->dev, "pclk");
-       if (!pclk)
+       if (IS_ERR(pclk))
                return;
 
        clk_enable(pclk);
index dd00987..da14fbd 100644 (file)
@@ -150,7 +150,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
        regs = (void __iomem __force *)res->start;
        pclk = clk_get(&pdev->dev, "pclk");
 
-       if (!pclk)
+       if (IS_ERR(pclk))
                return;
 
        clk_enable(pclk);
index 623b077..e61bc94 100644 (file)
@@ -134,7 +134,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
 
        regs = (void __iomem __force *)res->start;
        pclk = clk_get(&pdev->dev, "pclk");
-       if (!pclk)
+       if (IS_ERR(pclk))
                return;
 
        clk_enable(pclk);
index 523d8e1..c4da5cb 100644 (file)
@@ -162,7 +162,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
         */
        regs = (void __iomem __force *)res->start;
        pclk = clk_get(&pdev->dev, "pclk");
-       if (!pclk)
+       if (IS_ERR(pclk))
                return;
 
        clk_enable(pclk);
index 9854013..6f9ca56 100644 (file)
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
 CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
+CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_BASE_FULL is not set
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 CONFIG_NO_HZ=y
@@ -29,6 +26,7 @@ CONFIG_CPU_FREQ=y
 CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
 CONFIG_CPU_FREQ_GOV_USERSPACE=y
 CONFIG_CPU_FREQ_AT32AP=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
@@ -72,8 +70,8 @@ CONFIG_MTD_UBI=y
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
 CONFIG_ATMEL_TCLIB=y
-CONFIG_EEPROM_AT24=m
 CONFIG_NETDEVICES=y
 CONFIG_TUN=m
 CONFIG_NET_ETHERNET=y
@@ -106,6 +104,7 @@ CONFIG_GPIO_SYSFS=y
 CONFIG_WATCHDOG=y
 CONFIG_AT32AP700X_WDT=y
 CONFIG_USB_GADGET=y
+CONFIG_USB_GADGET_VBUS_DRAW=350
 CONFIG_USB_ZERO=m
 CONFIG_USB_ETH=m
 CONFIG_USB_GADGETFS=m
@@ -115,14 +114,12 @@ CONFIG_USB_CDC_COMPOSITE=m
 CONFIG_MMC=y
 CONFIG_MMC_TEST=m
 CONFIG_MMC_ATMELMCI=y
-CONFIG_MMC_SPI=m
 CONFIG_NEW_LEDS=y
 CONFIG_LEDS_CLASS=y
 CONFIG_LEDS_GPIO=y
 CONFIG_LEDS_TRIGGERS=y
 CONFIG_LEDS_TRIGGER_TIMER=y
 CONFIG_LEDS_TRIGGER_HEARTBEAT=y
-CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_AT32AP700X=y
 CONFIG_DMADEVICES=y
@@ -130,21 +127,23 @@ CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
 # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 # CONFIG_EXT3_FS_XATTR is not set
+CONFIG_EXT4_FS=y
+# CONFIG_EXT4_FS_XATTR is not set
 # CONFIG_DNOTIFY is not set
 CONFIG_FUSE_FS=m
 CONFIG_MSDOS_FS=m
 CONFIG_VFAT_FS=m
 CONFIG_FAT_DEFAULT_CODEPAGE=850
+CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=m
+CONFIG_CONFIGFS_FS=y
 CONFIG_JFFS2_FS=y
-CONFIG_UFS_FS=y
+CONFIG_UBIFS_FS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
 CONFIG_ROOT_NFS=y
 CONFIG_NFSD=m
 CONFIG_NFSD_V3=y
-CONFIG_SMB_FS=m
 CONFIG_CIFS=m
 CONFIG_NLS_CODEPAGE_437=m
 CONFIG_NLS_CODEPAGE_850=m
@@ -155,5 +154,3 @@ CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_HUNG_TASK=y
 CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_CRYPTO_PCBC=m
index 7ceda35..7eece0a 100644 (file)
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
 CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
+CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_BASE_FULL is not set
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 CONFIG_NO_HZ=y
@@ -31,6 +28,7 @@ CONFIG_CPU_FREQ=y
 CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
 CONFIG_CPU_FREQ_GOV_USERSPACE=y
 CONFIG_CPU_FREQ_AT32AP=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
@@ -74,8 +72,10 @@ CONFIG_MTD_UBI=y
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
 CONFIG_ATMEL_TCLIB=y
 CONFIG_NETDEVICES=y
+CONFIG_TUN=m
 CONFIG_NET_ETHERNET=y
 CONFIG_MACB=y
 # CONFIG_NETDEV_1000 is not set
@@ -104,6 +104,7 @@ CONFIG_I2C_GPIO=m
 CONFIG_SPI=y
 CONFIG_SPI_ATMEL=y
 CONFIG_SPI_SPIDEV=m
+CONFIG_GPIO_SYSFS=y
 # CONFIG_HWMON is not set
 CONFIG_WATCHDOG=y
 CONFIG_AT32AP700X_WDT=y
@@ -127,6 +128,7 @@ CONFIG_USB_FILE_STORAGE=m
 CONFIG_USB_G_SERIAL=m
 CONFIG_USB_CDC_COMPOSITE=m
 CONFIG_MMC=y
+CONFIG_MMC_TEST=m
 CONFIG_MMC_ATMELMCI=y
 CONFIG_NEW_LEDS=y
 CONFIG_LEDS_CLASS=y
@@ -141,11 +143,14 @@ CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
 # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 # CONFIG_EXT3_FS_XATTR is not set
+CONFIG_EXT4_FS=y
+# CONFIG_EXT4_FS_XATTR is not set
 # CONFIG_DNOTIFY is not set
 CONFIG_FUSE_FS=m
 CONFIG_MSDOS_FS=m
 CONFIG_VFAT_FS=m
 CONFIG_FAT_DEFAULT_CODEPAGE=850
+CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 CONFIG_CONFIGFS_FS=y
 CONFIG_JFFS2_FS=y
@@ -155,7 +160,6 @@ CONFIG_NFS_V3=y
 CONFIG_ROOT_NFS=y
 CONFIG_NFSD=m
 CONFIG_NFSD_V3=y
-CONFIG_SMB_FS=m
 CONFIG_CIFS=m
 CONFIG_NLS_CODEPAGE_437=m
 CONFIG_NLS_CODEPAGE_850=m
@@ -166,4 +170,3 @@ CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_HUNG_TASK=y
 CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
index 7bc5b2c..387eb9d 100644 (file)
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
 CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
+CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_BASE_FULL is not set
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 CONFIG_NO_HZ=y
@@ -30,6 +27,7 @@ CONFIG_CPU_FREQ=y
 CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
 CONFIG_CPU_FREQ_GOV_USERSPACE=y
 CONFIG_CPU_FREQ_AT32AP=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
@@ -73,8 +71,10 @@ CONFIG_MTD_UBI=y
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
 CONFIG_ATMEL_TCLIB=y
 CONFIG_NETDEVICES=y
+CONFIG_TUN=m
 CONFIG_NET_ETHERNET=y
 CONFIG_MACB=y
 # CONFIG_NETDEV_1000 is not set
@@ -103,6 +103,7 @@ CONFIG_I2C_GPIO=m
 CONFIG_SPI=y
 CONFIG_SPI_ATMEL=y
 CONFIG_SPI_SPIDEV=m
+CONFIG_GPIO_SYSFS=y
 # CONFIG_HWMON is not set
 CONFIG_WATCHDOG=y
 CONFIG_AT32AP700X_WDT=y
@@ -126,6 +127,7 @@ CONFIG_USB_FILE_STORAGE=m
 CONFIG_USB_G_SERIAL=m
 CONFIG_USB_CDC_COMPOSITE=m
 CONFIG_MMC=y
+CONFIG_MMC_TEST=m
 CONFIG_MMC_ATMELMCI=y
 CONFIG_NEW_LEDS=y
 CONFIG_LEDS_CLASS=y
@@ -140,11 +142,14 @@ CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
 # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 # CONFIG_EXT3_FS_XATTR is not set
+CONFIG_EXT4_FS=y
+# CONFIG_EXT4_FS_XATTR is not set
 # CONFIG_DNOTIFY is not set
 CONFIG_FUSE_FS=m
 CONFIG_MSDOS_FS=m
 CONFIG_VFAT_FS=m
 CONFIG_FAT_DEFAULT_CODEPAGE=850
+CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 CONFIG_CONFIGFS_FS=y
 CONFIG_JFFS2_FS=y
@@ -154,7 +159,6 @@ CONFIG_NFS_V3=y
 CONFIG_ROOT_NFS=y
 CONFIG_NFSD=m
 CONFIG_NFSD_V3=y
-CONFIG_SMB_FS=m
 CONFIG_CIFS=m
 CONFIG_NLS_CODEPAGE_437=m
 CONFIG_NLS_CODEPAGE_850=m
@@ -165,4 +169,3 @@ CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_HUNG_TASK=y
 CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
index 4bd3682..f0fe237 100644 (file)
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
 CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
+CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_BASE_FULL is not set
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 CONFIG_NO_HZ=y
@@ -29,6 +26,7 @@ CONFIG_CPU_FREQ=y
 CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
 CONFIG_CPU_FREQ_GOV_USERSPACE=y
 CONFIG_CPU_FREQ_AT32AP=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
@@ -74,6 +72,7 @@ CONFIG_MTD_UBI=y
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
 CONFIG_ATMEL_TCLIB=y
 CONFIG_NETDEVICES=y
 CONFIG_TUN=m
@@ -107,6 +106,7 @@ CONFIG_GPIO_SYSFS=y
 CONFIG_WATCHDOG=y
 CONFIG_AT32AP700X_WDT=y
 CONFIG_USB_GADGET=y
+CONFIG_USB_GADGET_VBUS_DRAW=350
 CONFIG_USB_ZERO=m
 CONFIG_USB_ETH=m
 CONFIG_USB_GADGETFS=m
@@ -116,14 +116,12 @@ CONFIG_USB_CDC_COMPOSITE=m
 CONFIG_MMC=y
 CONFIG_MMC_TEST=m
 CONFIG_MMC_ATMELMCI=y
-CONFIG_MMC_SPI=m
 CONFIG_NEW_LEDS=y
 CONFIG_LEDS_CLASS=y
 CONFIG_LEDS_GPIO=y
 CONFIG_LEDS_TRIGGERS=y
 CONFIG_LEDS_TRIGGER_TIMER=y
 CONFIG_LEDS_TRIGGER_HEARTBEAT=y
-CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_AT32AP700X=y
 CONFIG_DMADEVICES=y
@@ -131,21 +129,23 @@ CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
 # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 # CONFIG_EXT3_FS_XATTR is not set
+CONFIG_EXT4_FS=y
+# CONFIG_EXT4_FS_XATTR is not set
 # CONFIG_DNOTIFY is not set
 CONFIG_FUSE_FS=m
 CONFIG_MSDOS_FS=m
 CONFIG_VFAT_FS=m
 CONFIG_FAT_DEFAULT_CODEPAGE=850
+CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=m
+CONFIG_CONFIGFS_FS=y
 CONFIG_JFFS2_FS=y
-CONFIG_UFS_FS=y
+CONFIG_UBIFS_FS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
 CONFIG_ROOT_NFS=y
 CONFIG_NFSD=m
 CONFIG_NFSD_V3=y
-CONFIG_SMB_FS=m
 CONFIG_CIFS=m
 CONFIG_NLS_CODEPAGE_437=m
 CONFIG_NLS_CODEPAGE_850=m
@@ -156,5 +156,3 @@ CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_HUNG_TASK=y
 CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_CRYPTO_PCBC=m
index f8437ef..e4a7c1d 100644 (file)
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
 CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
+CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_BASE_FULL is not set
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 CONFIG_NO_HZ=y
@@ -32,6 +29,7 @@ CONFIG_CPU_FREQ=y
 CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
 CONFIG_CPU_FREQ_GOV_USERSPACE=y
 CONFIG_CPU_FREQ_AT32AP=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
@@ -77,8 +75,10 @@ CONFIG_MTD_UBI=y
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
 CONFIG_ATMEL_TCLIB=y
 CONFIG_NETDEVICES=y
+CONFIG_TUN=m
 CONFIG_NET_ETHERNET=y
 CONFIG_MACB=y
 # CONFIG_NETDEV_1000 is not set
@@ -107,6 +107,7 @@ CONFIG_I2C_GPIO=m
 CONFIG_SPI=y
 CONFIG_SPI_ATMEL=y
 CONFIG_SPI_SPIDEV=m
+CONFIG_GPIO_SYSFS=y
 # CONFIG_HWMON is not set
 CONFIG_WATCHDOG=y
 CONFIG_AT32AP700X_WDT=y
@@ -130,6 +131,7 @@ CONFIG_USB_FILE_STORAGE=m
 CONFIG_USB_G_SERIAL=m
 CONFIG_USB_CDC_COMPOSITE=m
 CONFIG_MMC=y
+CONFIG_MMC_TEST=m
 CONFIG_MMC_ATMELMCI=y
 CONFIG_NEW_LEDS=y
 CONFIG_LEDS_CLASS=y
@@ -144,11 +146,14 @@ CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
 # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 # CONFIG_EXT3_FS_XATTR is not set
+CONFIG_EXT4_FS=y
+# CONFIG_EXT4_FS_XATTR is not set
 # CONFIG_DNOTIFY is not set
 CONFIG_FUSE_FS=m
 CONFIG_MSDOS_FS=m
 CONFIG_VFAT_FS=m
 CONFIG_FAT_DEFAULT_CODEPAGE=850
+CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 CONFIG_CONFIGFS_FS=y
 CONFIG_JFFS2_FS=y
@@ -158,7 +163,6 @@ CONFIG_NFS_V3=y
 CONFIG_ROOT_NFS=y
 CONFIG_NFSD=m
 CONFIG_NFSD_V3=y
-CONFIG_SMB_FS=m
 CONFIG_CIFS=m
 CONFIG_NLS_CODEPAGE_437=m
 CONFIG_NLS_CODEPAGE_850=m
@@ -169,4 +173,3 @@ CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_HUNG_TASK=y
 CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
index 7f58f99..6f37f70 100644 (file)
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
 CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
+CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_BASE_FULL is not set
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 CONFIG_NO_HZ=y
@@ -31,6 +28,7 @@ CONFIG_CPU_FREQ=y
 CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
 CONFIG_CPU_FREQ_GOV_USERSPACE=y
 CONFIG_CPU_FREQ_AT32AP=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
@@ -76,8 +74,10 @@ CONFIG_MTD_UBI=y
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
 CONFIG_ATMEL_TCLIB=y
 CONFIG_NETDEVICES=y
+CONFIG_TUN=m
 CONFIG_NET_ETHERNET=y
 CONFIG_MACB=y
 # CONFIG_NETDEV_1000 is not set
@@ -106,6 +106,7 @@ CONFIG_I2C_GPIO=m
 CONFIG_SPI=y
 CONFIG_SPI_ATMEL=y
 CONFIG_SPI_SPIDEV=m
+CONFIG_GPIO_SYSFS=y
 # CONFIG_HWMON is not set
 CONFIG_WATCHDOG=y
 CONFIG_AT32AP700X_WDT=y
@@ -129,6 +130,7 @@ CONFIG_USB_FILE_STORAGE=m
 CONFIG_USB_G_SERIAL=m
 CONFIG_USB_CDC_COMPOSITE=m
 CONFIG_MMC=y
+CONFIG_MMC_TEST=m
 CONFIG_MMC_ATMELMCI=y
 CONFIG_NEW_LEDS=y
 CONFIG_LEDS_CLASS=y
@@ -143,11 +145,14 @@ CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
 # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 # CONFIG_EXT3_FS_XATTR is not set
+CONFIG_EXT4_FS=y
+# CONFIG_EXT4_FS_XATTR is not set
 # CONFIG_DNOTIFY is not set
 CONFIG_FUSE_FS=m
 CONFIG_MSDOS_FS=m
 CONFIG_VFAT_FS=m
 CONFIG_FAT_DEFAULT_CODEPAGE=850
+CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 CONFIG_CONFIGFS_FS=y
 CONFIG_JFFS2_FS=y
@@ -157,7 +162,6 @@ CONFIG_NFS_V3=y
 CONFIG_ROOT_NFS=y
 CONFIG_NFSD=m
 CONFIG_NFSD_V3=y
-CONFIG_SMB_FS=m
 CONFIG_CIFS=m
 CONFIG_NLS_CODEPAGE_437=m
 CONFIG_NLS_CODEPAGE_850=m
@@ -168,4 +172,3 @@ CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_HUNG_TASK=y
 CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
index aec4c43..4fb01f5 100644 (file)
@@ -3,7 +3,6 @@ CONFIG_EXPERIMENTAL=y
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
 CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
 CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_SYSCTL_SYSCALL is not set
@@ -11,7 +10,7 @@ CONFIG_BLK_DEV_INITRD=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
@@ -26,6 +25,7 @@ CONFIG_CPU_FREQ=y
 CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
 CONFIG_CPU_FREQ_GOV_USERSPACE=y
 CONFIG_CPU_FREQ_AT32AP=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
@@ -35,6 +35,7 @@ CONFIG_INET=y
 CONFIG_IP_PNP=y
 CONFIG_IP_PNP_DHCP=y
 CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE_DEMUX=m
 CONFIG_NET_IPGRE=m
 CONFIG_INET_AH=m
 CONFIG_INET_ESP=m
@@ -58,16 +59,14 @@ CONFIG_MTD_BLOCK=y
 CONFIG_MTD_CFI=y
 CONFIG_MTD_CFI_AMDSTD=y
 CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_DATAFLASH=m
-CONFIG_MTD_M25P80=m
 CONFIG_MTD_UBI=y
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
 CONFIG_ATMEL_PWM=m
 CONFIG_ATMEL_TCLIB=y
 CONFIG_ATMEL_SSC=m
-CONFIG_EEPROM_AT24=m
 # CONFIG_SCSI_PROC_FS is not set
 CONFIG_BLK_DEV_SD=m
 CONFIG_BLK_DEV_SR=m
@@ -120,7 +119,6 @@ CONFIG_SND_MIXER_OSS=m
 CONFIG_SND_PCM_OSS=m
 # CONFIG_SND_SUPPORT_OLD_API is not set
 # CONFIG_SND_VERBOSE_PROCFS is not set
-# CONFIG_SND_DRIVERS is not set
 CONFIG_SND_AT73C213=m
 # CONFIG_HID_SUPPORT is not set
 CONFIG_USB_GADGET=y
@@ -131,16 +129,15 @@ CONFIG_USB_FILE_STORAGE=m
 CONFIG_USB_G_SERIAL=m
 CONFIG_USB_CDC_COMPOSITE=m
 CONFIG_MMC=y
+CONFIG_MMC_TEST=m
 CONFIG_MMC_ATMELMCI=y
-CONFIG_MMC_SPI=m
 CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=m
+CONFIG_LEDS_CLASS=y
 CONFIG_LEDS_ATMEL_PWM=m
 CONFIG_LEDS_GPIO=m
 CONFIG_LEDS_TRIGGERS=y
 CONFIG_LEDS_TRIGGER_TIMER=m
 CONFIG_LEDS_TRIGGER_HEARTBEAT=m
-CONFIG_LEDS_TRIGGER_DEFAULT_ON=m
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_AT32AP700X=y
 CONFIG_DMADEVICES=y
@@ -149,20 +146,23 @@ CONFIG_EXT3_FS=y
 # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 # CONFIG_EXT3_FS_XATTR is not set
 CONFIG_EXT4_FS=y
+# CONFIG_EXT4_FS_XATTR is not set
 # CONFIG_DNOTIFY is not set
 CONFIG_FUSE_FS=m
 CONFIG_MSDOS_FS=m
 CONFIG_VFAT_FS=m
+CONFIG_FAT_DEFAULT_CODEPAGE=850
 CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
+CONFIG_CONFIGFS_FS=y
 CONFIG_JFFS2_FS=y
-# CONFIG_JFFS2_FS_WRITEBUFFER is not set
 CONFIG_UBIFS_FS=y
-CONFIG_MINIX_FS=m
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
 CONFIG_ROOT_NFS=y
+CONFIG_CIFS=m
 CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_850=m
 CONFIG_NLS_ISO8859_1=m
 CONFIG_NLS_UTF8=m
 CONFIG_MAGIC_SYSRQ=y
@@ -170,6 +170,3 @@ CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_HUNG_TASK=y
 CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-# CONFIG_CRYPTO_HW is not set
-CONFIG_CRC_T10DIF=m
index 50ba3db..9faaf9b 100644 (file)
@@ -2,22 +2,15 @@ CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
-CONFIG_TASKSTATS=y
-CONFIG_TASK_DELAY_ACCT=y
-CONFIG_AUDIT=y
 CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
 CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_BASE_FULL is not set
-# CONFIG_SLUB_DEBUG is not set
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
@@ -33,6 +26,7 @@ CONFIG_CPU_FREQ=y
 CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
 CONFIG_CPU_FREQ_GOV_USERSPACE=y
 CONFIG_CPU_FREQ_AT32AP=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
@@ -54,18 +48,18 @@ CONFIG_MTD_BLOCK=y
 CONFIG_MTD_CFI=y
 CONFIG_MTD_CFI_AMDSTD=y
 CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_DATAFLASH=m
-CONFIG_MTD_M25P80=m
+CONFIG_MTD_UBI=y
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
 CONFIG_ATMEL_PWM=m
 CONFIG_ATMEL_TCLIB=y
 CONFIG_ATMEL_SSC=m
-CONFIG_EEPROM_AT24=m
 # CONFIG_SCSI_PROC_FS is not set
 CONFIG_BLK_DEV_SD=m
 CONFIG_BLK_DEV_SR=m
+# CONFIG_SCSI_LOWLEVEL is not set
 CONFIG_ATA=m
 # CONFIG_SATA_PMP is not set
 CONFIG_PATA_AT32=m
@@ -77,6 +71,7 @@ CONFIG_PPP_ASYNC=m
 CONFIG_PPP_DEFLATE=m
 CONFIG_PPP_BSDCOMP=m
 CONFIG_INPUT=m
+CONFIG_INPUT_EVDEV=m
 # CONFIG_KEYBOARD_ATKBD is not set
 CONFIG_KEYBOARD_GPIO=m
 # CONFIG_MOUSE_PS2 is not set
@@ -106,7 +101,6 @@ CONFIG_SND_PCM_OSS=m
 CONFIG_SND_AT73C213=m
 # CONFIG_HID_SUPPORT is not set
 CONFIG_USB_GADGET=y
-CONFIG_USB_GADGET_DEBUG_FS=y
 CONFIG_USB_ZERO=m
 CONFIG_USB_ETH=m
 CONFIG_USB_GADGETFS=m
@@ -116,36 +110,39 @@ CONFIG_USB_CDC_COMPOSITE=m
 CONFIG_MMC=y
 CONFIG_MMC_TEST=m
 CONFIG_MMC_ATMELMCI=y
-CONFIG_MMC_SPI=m
 CONFIG_NEW_LEDS=y
 CONFIG_LEDS_CLASS=y
 CONFIG_LEDS_ATMEL_PWM=m
-CONFIG_LEDS_GPIO=y
+CONFIG_LEDS_GPIO=m
 CONFIG_LEDS_TRIGGERS=y
-CONFIG_LEDS_TRIGGER_TIMER=y
-CONFIG_LEDS_TRIGGER_HEARTBEAT=y
-CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
+CONFIG_LEDS_TRIGGER_TIMER=m
+CONFIG_LEDS_TRIGGER_HEARTBEAT=m
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_AT32AP700X=y
 CONFIG_DMADEVICES=y
-CONFIG_DW_DMAC=y
-CONFIG_EXT2_FS=m
-CONFIG_EXT3_FS=m
+CONFIG_EXT2_FS=y
+CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 # CONFIG_EXT3_FS_XATTR is not set
+CONFIG_EXT4_FS=y
+# CONFIG_EXT4_FS_XATTR is not set
 # CONFIG_DNOTIFY is not set
 CONFIG_FUSE_FS=m
 CONFIG_MSDOS_FS=m
 CONFIG_VFAT_FS=m
+CONFIG_FAT_DEFAULT_CODEPAGE=850
 CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=m
+CONFIG_CONFIGFS_FS=y
 CONFIG_JFFS2_FS=y
+CONFIG_UBIFS_FS=y
 # CONFIG_NETWORK_FILESYSTEMS is not set
 CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_850=m
 CONFIG_NLS_ISO8859_1=m
 CONFIG_NLS_UTF8=m
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
+CONFIG_DETECT_HUNG_TASK=y
 CONFIG_FRAME_POINTER=y
-CONFIG_CRC_T10DIF=m
index 329e10b..3d2a5d8 100644 (file)
@@ -1,19 +1,32 @@
 CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
 CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
+CONFIG_RELAY=y
+CONFIG_BLK_DEV_INITRD=y
 # CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_BASE_FULL is not set
-# CONFIG_FUTEX is not set
-# CONFIG_EPOLL is not set
-# CONFIG_SIGNALFD is not set
-# CONFIG_TIMERFD is not set
-# CONFIG_EVENTFD is not set
 # CONFIG_COMPAT_BRK is not set
-CONFIG_SLOB=y
-# CONFIG_BLOCK is not set
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=m
+# CONFIG_KPROBES is not set
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_BLK_DEV_BSG is not set
+# CONFIG_IOSCHED_DEADLINE is not set
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
 CONFIG_BOARD_ATSTK1004=y
 # CONFIG_OWNERSHIP_TRACE is not set
+CONFIG_NMI_DEBUGGING=y
+CONFIG_PM=y
+CONFIG_CPU_FREQ=y
+# CONFIG_CPU_FREQ_STAT is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_CPU_FREQ_AT32AP=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
@@ -31,40 +44,104 @@ CONFIG_MTD=y
 CONFIG_MTD_PARTITIONS=y
 CONFIG_MTD_CMDLINE_PARTS=y
 CONFIG_MTD_CHAR=y
+CONFIG_MTD_BLOCK=y
 CONFIG_MTD_CFI=y
 CONFIG_MTD_CFI_AMDSTD=y
 CONFIG_MTD_PHYSMAP=y
-# CONFIG_MISC_DEVICES is not set
-# CONFIG_INPUT is not set
+CONFIG_MTD_UBI=y
+CONFIG_BLK_DEV_LOOP=m
+CONFIG_BLK_DEV_NBD=m
+CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
+CONFIG_ATMEL_PWM=m
+CONFIG_ATMEL_TCLIB=y
+CONFIG_ATMEL_SSC=m
+# CONFIG_SCSI_PROC_FS is not set
+CONFIG_BLK_DEV_SD=m
+CONFIG_BLK_DEV_SR=m
+# CONFIG_SCSI_LOWLEVEL is not set
+CONFIG_ATA=m
+# CONFIG_SATA_PMP is not set
+CONFIG_PATA_AT32=m
+CONFIG_NETDEVICES=y
+# CONFIG_NETDEV_1000 is not set
+# CONFIG_NETDEV_10000 is not set
+CONFIG_PPP=m
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_DEFLATE=m
+CONFIG_PPP_BSDCOMP=m
+CONFIG_INPUT=m
+CONFIG_INPUT_EVDEV=m
+# CONFIG_KEYBOARD_ATKBD is not set
+CONFIG_KEYBOARD_GPIO=m
+# CONFIG_MOUSE_PS2 is not set
+CONFIG_MOUSE_GPIO=m
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
 # CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_ATMEL=y
 CONFIG_SERIAL_ATMEL_CONSOLE=y
-# CONFIG_SERIAL_ATMEL_PDC is not set
 # CONFIG_LEGACY_PTYS is not set
 # CONFIG_HW_RANDOM is not set
+CONFIG_I2C=m
+CONFIG_I2C_CHARDEV=m
+CONFIG_I2C_GPIO=m
 CONFIG_SPI=y
 CONFIG_SPI_ATMEL=y
+CONFIG_SPI_SPIDEV=m
+CONFIG_GPIO_SYSFS=y
 # CONFIG_HWMON is not set
 CONFIG_WATCHDOG=y
 CONFIG_AT32AP700X_WDT=y
 CONFIG_FB=y
 CONFIG_FB_ATMEL=y
 CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_LCD_CLASS_DEVICE=y
 CONFIG_LCD_LTV350QV=y
 # CONFIG_BACKLIGHT_CLASS_DEVICE is not set
 CONFIG_USB_GADGET=y
-CONFIG_USB_ETH=y
-# CONFIG_USB_ETH_RNDIS is not set
+CONFIG_USB_ZERO=m
+CONFIG_USB_ETH=m
+CONFIG_USB_GADGETFS=m
+CONFIG_USB_FILE_STORAGE=m
+CONFIG_USB_G_SERIAL=m
+CONFIG_USB_CDC_COMPOSITE=m
+CONFIG_MMC=y
+CONFIG_MMC_TEST=m
+CONFIG_MMC_ATMELMCI=y
+CONFIG_NEW_LEDS=y
+CONFIG_LEDS_CLASS=y
+CONFIG_LEDS_ATMEL_PWM=m
+CONFIG_LEDS_GPIO=m
+CONFIG_LEDS_TRIGGERS=y
+CONFIG_LEDS_TRIGGER_TIMER=m
+CONFIG_LEDS_TRIGGER_HEARTBEAT=m
 CONFIG_RTC_CLASS=y
-# CONFIG_RTC_INTF_PROC is not set
 CONFIG_RTC_DRV_AT32AP700X=y
+CONFIG_DMADEVICES=y
+CONFIG_EXT2_FS=y
+CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
+# CONFIG_EXT3_FS_XATTR is not set
+CONFIG_EXT4_FS=y
+# CONFIG_EXT4_FS_XATTR is not set
 # CONFIG_DNOTIFY is not set
+CONFIG_FUSE_FS=m
+CONFIG_MSDOS_FS=m
+CONFIG_VFAT_FS=m
+CONFIG_FAT_DEFAULT_CODEPAGE=850
 CONFIG_PROC_KCORE=y
-# CONFIG_PROC_PAGE_MONITOR is not set
 CONFIG_TMPFS=y
+CONFIG_CONFIGFS_FS=y
 CONFIG_JFFS2_FS=y
-# CONFIG_JFFS2_FS_WRITEBUFFER is not set
+CONFIG_UBIFS_FS=y
 # CONFIG_NETWORK_FILESYSTEMS is not set
+CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_850=m
+CONFIG_NLS_ISO8859_1=m
+CONFIG_NLS_UTF8=m
 CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_FS=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_DETECT_HUNG_TASK=y
+CONFIG_FRAME_POINTER=y
index dbcc1b5..1ed8f22 100644 (file)
@@ -3,7 +3,6 @@ CONFIG_EXPERIMENTAL=y
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
 CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
 CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_SYSCTL_SYSCALL is not set
@@ -11,7 +10,7 @@ CONFIG_BLK_DEV_INITRD=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
@@ -37,6 +36,7 @@ CONFIG_INET=y
 CONFIG_IP_PNP=y
 CONFIG_IP_PNP_DHCP=y
 CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE_DEMUX=m
 CONFIG_NET_IPGRE=m
 CONFIG_INET_AH=m
 CONFIG_INET_ESP=m
@@ -60,15 +60,13 @@ CONFIG_MTD_BLOCK=y
 CONFIG_MTD_CFI=y
 CONFIG_MTD_CFI_AMDSTD=y
 CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_DATAFLASH=m
-CONFIG_MTD_DATAFLASH_OTP=y
-CONFIG_MTD_M25P80=m
 CONFIG_MTD_NAND=y
 CONFIG_MTD_NAND_ATMEL=y
 CONFIG_MTD_UBI=y
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
 CONFIG_ATMEL_PWM=m
 CONFIG_ATMEL_TCLIB=y
 CONFIG_ATMEL_SSC=m
@@ -132,17 +130,17 @@ CONFIG_USB_ETH=m
 CONFIG_USB_GADGETFS=m
 CONFIG_USB_FILE_STORAGE=m
 CONFIG_USB_G_SERIAL=m
+CONFIG_USB_CDC_COMPOSITE=m
 CONFIG_MMC=y
+CONFIG_MMC_TEST=m
 CONFIG_MMC_ATMELMCI=y
-CONFIG_MMC_SPI=m
 CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=m
+CONFIG_LEDS_CLASS=y
 CONFIG_LEDS_ATMEL_PWM=m
 CONFIG_LEDS_GPIO=m
 CONFIG_LEDS_TRIGGERS=y
 CONFIG_LEDS_TRIGGER_TIMER=m
 CONFIG_LEDS_TRIGGER_HEARTBEAT=m
-CONFIG_LEDS_TRIGGER_DEFAULT_ON=m
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_AT32AP700X=y
 CONFIG_DMADEVICES=y
@@ -156,15 +154,18 @@ CONFIG_EXT4_FS=y
 CONFIG_FUSE_FS=m
 CONFIG_MSDOS_FS=m
 CONFIG_VFAT_FS=m
+CONFIG_FAT_DEFAULT_CODEPAGE=850
 CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
+CONFIG_CONFIGFS_FS=y
 CONFIG_JFFS2_FS=y
 CONFIG_UBIFS_FS=y
-CONFIG_MINIX_FS=m
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
 CONFIG_ROOT_NFS=y
+CONFIG_CIFS=m
 CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_850=m
 CONFIG_NLS_ISO8859_1=m
 CONFIG_NLS_UTF8=m
 CONFIG_MAGIC_SYSRQ=y
@@ -172,7 +173,3 @@ CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_HUNG_TASK=y
 CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_CRYPTO_FIPS=y
-# CONFIG_CRYPTO_HW is not set
-CONFIG_CRC_T10DIF=m
index 0c813b6..aeadc95 100644 (file)
@@ -11,7 +11,7 @@ CONFIG_BLK_DEV_INITRD=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
index dcc01f0..1692bee 100644 (file)
@@ -12,7 +12,7 @@ CONFIG_BLK_DEV_INITRD=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
index ab608b7..244f2ac 100644 (file)
 #include <linux/types.h>
 #include <linux/signal.h>
 
-/* kernel/process.c */
-asmlinkage int sys_fork(struct pt_regs *);
-asmlinkage int sys_clone(unsigned long, unsigned long,
-                        unsigned long, unsigned long,
-                        struct pt_regs *);
-asmlinkage int sys_vfork(struct pt_regs *);
-asmlinkage int sys_execve(const char __user *, char __user *__user *,
-                         char __user *__user *, struct pt_regs *);
-
-/* kernel/signal.c */
-asmlinkage int sys_sigaltstack(const stack_t __user *, stack_t __user *,
-                              struct pt_regs *);
-asmlinkage int sys_rt_sigreturn(struct pt_regs *);
-
 /* mm/cache.c */
 asmlinkage int sys_cacheflush(int, void __user *, size_t);
 
index 9c46aaa..ef5a2a0 100644 (file)
@@ -367,14 +367,13 @@ asmlinkage int sys_fork(struct pt_regs *regs)
 }
 
 asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp,
-                        unsigned long parent_tidptr,
-                        unsigned long child_tidptr, struct pt_regs *regs)
+               void __user *parent_tidptr, void __user *child_tidptr,
+               struct pt_regs *regs)
 {
        if (!newsp)
                newsp = regs->sp;
-       return do_fork(clone_flags, newsp, regs, 0,
-                      (int __user *)parent_tidptr,
-                      (int __user *)child_tidptr);
+       return do_fork(clone_flags, newsp, regs, 0, parent_tidptr,
+                       child_tidptr);
 }
 
 asmlinkage int sys_vfork(struct pt_regs *regs)
index 668ed28..05ad291 100644 (file)
@@ -35,7 +35,6 @@ static struct clocksource counter = {
        .rating         = 50,
        .read           = read_cycle_count,
        .mask           = CLOCKSOURCE_MASK(32),
-       .shift          = 16,
        .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
@@ -123,9 +122,7 @@ void __init time_init(void)
 
        /* figure rate for counter */
        counter_hz = clk_get_rate(boot_cpu_data.clk);
-       counter.mult = clocksource_hz2mult(counter_hz, counter.shift);
-
-       ret = clocksource_register(&counter);
+       ret = clocksource_register_hz(&counter, counter_hz);
        if (ret)
                pr_debug("timer: could not register clocksource: %d\n", ret);
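
The time.c hunk above drops the driver-chosen shift and the clocksource_hz2mult() call in favor of clocksource_register_hz(), which derives mult/shift from the clock rate itself. A minimal sketch of the two registration styles, where cs and counter_hz stand in for the driver's clocksource and its rate:

        /* Old style: the driver picks a shift and derives mult by hand. */
        cs->shift = 16;
        cs->mult  = clocksource_hz2mult(counter_hz, cs->shift);
        ret = clocksource_register(cs);

        /* New style: pass the rate and let the core pick mult/shift. */
        ret = clocksource_register_hz(cs, counter_hz);
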
 
index ac76da0..89accc6 100644 (file)
@@ -618,7 +618,7 @@ pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 }
 
 /* forward declaration */
-static static const struct dentry_operations pfmfs_dentry_operations;
+static const struct dentry_operations pfmfs_dentry_operations;
 
 static struct dentry *
 pfmfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data)
index 1841ee7..5ca674b 100644 (file)
@@ -38,7 +38,7 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
        if (pud) {
                pmd = pmd_alloc(mm, pud, taddr);
                if (pmd)
-                       pte = pte_alloc_map(mm, pmd, taddr);
+                       pte = pte_alloc_map(mm, NULL, pmd, taddr);
        }
        return pte;
 }
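
This hunk, and the matching sh, sparc, um and x86 tboot hunks below, pass an extra vm_area_struct argument (NULL here) to pte_alloc_map(). A rough sketch of what the updated generic helper is assumed to look like after this series (the real definition lives in include/linux/mm.h and is not part of this diff):

        /* Assumed shape: the vma is only needed so that __pte_alloc() can
         * handle a transparent huge pmd (e.g. wait for a split in progress);
         * page-table-only callers such as the hunks here pass NULL. */
        #define pte_alloc_map(mm, vma, pmd, address)                        \
                ((unlikely(pmd_none(*(pmd))) &&                              \
                  __pte_alloc(mm, vma, pmd, address)) ?                      \
                        NULL : pte_offset_map(pmd, address))
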
index c892bfb..785b4ea 100644 (file)
@@ -77,6 +77,9 @@
 #define MADV_UNMERGEABLE 13            /* KSM may not merge identical pages */
 #define MADV_HWPOISON    100           /* poison a page for testing */
 
+#define MADV_HUGEPAGE  14              /* Worth backing with hugepages */
+#define MADV_NOHUGEPAGE        15              /* Not worth backing with hugepages */
+
 /* compatibility flags */
 #define MAP_FILE       0
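
MADV_HUGEPAGE and MADV_NOHUGEPAGE are consumed through madvise(2). A minimal userspace illustration, assuming the new constants have reached the exported headers; the 8 MiB anonymous mapping is arbitrary, and the numeric values differ per architecture (compare the parisc header below):

        #include <sys/mman.h>

        void *p = mmap(NULL, 8 << 20, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        madvise(p, 8 << 20, MADV_HUGEPAGE);     /* worth backing with hugepages */
        /* or, to opt the range out again: */
        madvise(p, 8 << 20, MADV_NOHUGEPAGE);
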
 
index 6f51dda..d87a72e 100644 (file)
@@ -46,17 +46,9 @@ static DEFINE_SPINLOCK(dbe_lock);
 void *module_alloc(unsigned long size)
 {
 #ifdef MODULE_START
-       struct vm_struct *area;
-
-       size = PAGE_ALIGN(size);
-       if (!size)
-               return NULL;
-
-       area = __get_vm_area(size, VM_ALLOC, MODULE_START, MODULE_END);
-       if (!area)
-               return NULL;
-
-       return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL);
+       return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
+                               GFP_KERNEL, PAGE_KERNEL, -1,
+                               __builtin_return_address(0));
 #else
        if (size == 0)
                return NULL;
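
The same consolidation is repeated for sparc and x86 below: the open-coded __get_vm_area() plus __vmalloc_area() pair becomes a single __vmalloc_node_range() call. Its arguments, in the order used by these hunks (the names here are descriptive placeholders, not the kernel's):

        /* size, alignment, range start, range end, gfp flags, page
         * protection, NUMA node (-1 = no preference), and the caller
         * recorded for allocation tracking. */
        p = __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
                                 GFP_KERNEL, PAGE_KERNEL, -1,
                                 __builtin_return_address(0));
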
index 9749c8a..f5b7bf5 100644 (file)
@@ -59,6 +59,9 @@
 #define MADV_MERGEABLE   65            /* KSM may merge identical pages */
 #define MADV_UNMERGEABLE 66            /* KSM may not merge identical pages */
 
+#define MADV_HUGEPAGE  67              /* Worth backing with hugepages */
+#define MADV_NOHUGEPAGE        68              /* Not worth backing with hugepages */
+
 /* compatibility flags */
 #define MAP_FILE       0
 #define MAP_VARIABLE   0
index d7efdbf..fec1320 100644 (file)
 
 #ifdef __HAVE_ARCH_PTE_SPECIAL
 
+static inline void get_huge_page_tail(struct page *page)
+{
+       /*
+        * __split_huge_page_refcount() cannot run
+        * from under us.
+        */
+       VM_BUG_ON(atomic_read(&page->_count) < 0);
+       atomic_inc(&page->_count);
+}
+
 /*
  * The performance critical leaf functions are made noinline otherwise gcc
  * inlines everything into a single function which results in too much
@@ -47,6 +57,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
                        put_page(page);
                        return 0;
                }
+               if (PageTail(page))
+                       get_huge_page_tail(page);
                pages[*nr] = page;
                (*nr)++;
 
index 9163db3..d776234 100644 (file)
@@ -35,7 +35,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
                if (pud) {
                        pmd = pmd_alloc(mm, pud, addr);
                        if (pmd)
-                               pte = pte_alloc_map(mm, pmd, addr);
+                               pte = pte_alloc_map(mm, NULL, pmd, addr);
                }
        }
 
index ee3c7dd..8d348c4 100644 (file)
 
 static void *module_map(unsigned long size)
 {
-       struct vm_struct *area;
-
-       size = PAGE_ALIGN(size);
-       if (!size || size > MODULES_LEN)
-               return NULL;
-
-       area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
-       if (!area)
+       if (PAGE_ALIGN(size) > MODULES_LEN)
                return NULL;
-
-       return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL);
+       return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+                               GFP_KERNEL, PAGE_KERNEL, -1,
+                               __builtin_return_address(0));
 }
 
 static char *dot2underscore(char *name)
index 5edcac1..e6067b7 100644 (file)
@@ -50,7 +50,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned
                end = PGDIR_SIZE;
        offset -= address;
        do {
-               pte_t * pte = pte_alloc_map(mm, pmd, address);
+               pte_t *pte = pte_alloc_map(mm, NULL, pmd, address);
                if (!pte)
                        return -ENOMEM;
                io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space);
index 04f2bf4..3cb00df 100644 (file)
@@ -92,7 +92,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned
                end = PGDIR_SIZE;
        offset -= address;
        do {
-               pte_t * pte = pte_alloc_map(mm, pmd, address);
+               pte_t *pte = pte_alloc_map(mm, NULL, pmd, address);
                if (!pte)
                        return -ENOMEM;
                io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space);
index 5fdddf1..f4e9764 100644 (file)
@@ -214,7 +214,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
        if (pud) {
                pmd = pmd_alloc(mm, pud, addr);
                if (pmd)
-                       pte = pte_alloc_map(mm, pmd, addr);
+                       pte = pte_alloc_map(mm, NULL, pmd, addr);
        }
        return pte;
 }
index 3d099f9..1aee587 100644 (file)
@@ -31,7 +31,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
        if (!pmd)
                goto out_pmd;
 
-       pte = pte_alloc_map(mm, pmd, proc);
+       pte = pte_alloc_map(mm, NULL, pmd, proc);
        if (!pte)
                goto out_pte;
 
index aa75f21..ffd7f8d 100644 (file)
@@ -822,6 +822,7 @@ extern bool kvm_rebooting;
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
index 7709c12..2071a8b 100644 (file)
@@ -435,6 +435,11 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr,
 {
        PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
 }
+static inline void pmd_update(struct mm_struct *mm, unsigned long addr,
+                             pmd_t *pmdp)
+{
+       PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp);
+}
 
 static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
                                    pte_t *ptep)
@@ -442,6 +447,12 @@ static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
        PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
 }
 
+static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr,
+                                   pmd_t *pmdp)
+{
+       PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp);
+}
+
 static inline pte_t __pte(pteval_t val)
 {
        pteval_t ret;
@@ -543,6 +554,20 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
                PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+                             pmd_t *pmdp, pmd_t pmd)
+{
+#if PAGETABLE_LEVELS >= 3
+       if (sizeof(pmdval_t) > sizeof(long))
+               /* 5 arg words */
+               pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd);
+       else
+               PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, pmd.pmd);
+#endif
+}
+#endif
+
 static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 {
        pmdval_t val = native_pmd_val(pmd);
index b82bac9..8288509 100644 (file)
@@ -265,10 +265,16 @@ struct pv_mmu_ops {
        void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
                           pte_t *ptep, pte_t pteval);
        void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
+       void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr,
+                          pmd_t *pmdp, pmd_t pmdval);
        void (*pte_update)(struct mm_struct *mm, unsigned long addr,
                           pte_t *ptep);
        void (*pte_update_defer)(struct mm_struct *mm,
                                 unsigned long addr, pte_t *ptep);
+       void (*pmd_update)(struct mm_struct *mm, unsigned long addr,
+                          pmd_t *pmdp);
+       void (*pmd_update_defer)(struct mm_struct *mm,
+                                unsigned long addr, pmd_t *pmdp);
 
        pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
                                        pte_t *ptep);
index 2334982..98391db 100644 (file)
@@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
 #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
 #endif
 
+#ifdef CONFIG_SMP
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
+{
+       return __pmd(xchg((pmdval_t *)xp, 0));
+}
+#else
+#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
+#endif
+
 /*
  * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
  * split up the 29 bits of offset into this range:
index 177b016..94b979d 100644 (file)
@@ -104,6 +104,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
 #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
 #endif
 
+#ifdef CONFIG_SMP
+union split_pmd {
+       struct {
+               u32 pmd_low;
+               u32 pmd_high;
+       };
+       pmd_t pmd;
+};
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
+{
+       union split_pmd res, *orig = (union split_pmd *)pmdp;
+
+       /* xchg acts as a barrier before setting of the high bits */
+       res.pmd_low = xchg(&orig->pmd_low, 0);
+       res.pmd_high = orig->pmd_high;
+       orig->pmd_high = 0;
+
+       return res.pmd;
+}
+#else
+#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
+#endif
+
 /*
  * Bits 0, 6 and 7 are taken in the low part of the pte,
  * put the 32 bits of offset into the high part.
index ada823a..18601c8 100644 (file)
@@ -35,6 +35,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
 #else  /* !CONFIG_PARAVIRT */
 #define set_pte(ptep, pte)             native_set_pte(ptep, pte)
 #define set_pte_at(mm, addr, ptep, pte)        native_set_pte_at(mm, addr, ptep, pte)
+#define set_pmd_at(mm, addr, pmdp, pmd)        native_set_pmd_at(mm, addr, pmdp, pmd)
 
 #define set_pte_atomic(ptep, pte)                                      \
        native_set_pte_atomic(ptep, pte)
@@ -59,6 +60,8 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
 
 #define pte_update(mm, addr, ptep)              do { } while (0)
 #define pte_update_defer(mm, addr, ptep)        do { } while (0)
+#define pmd_update(mm, addr, ptep)              do { } while (0)
+#define pmd_update_defer(mm, addr, ptep)        do { } while (0)
 
 #define pgd_val(x)     native_pgd_val(x)
 #define __pgd(x)       native_make_pgd(x)
@@ -94,6 +97,11 @@ static inline int pte_young(pte_t pte)
        return pte_flags(pte) & _PAGE_ACCESSED;
 }
 
+static inline int pmd_young(pmd_t pmd)
+{
+       return pmd_flags(pmd) & _PAGE_ACCESSED;
+}
+
 static inline int pte_write(pte_t pte)
 {
        return pte_flags(pte) & _PAGE_RW;
@@ -142,6 +150,23 @@ static inline int pmd_large(pmd_t pte)
                (_PAGE_PSE | _PAGE_PRESENT);
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+       return pmd_val(pmd) & _PAGE_SPLITTING;
+}
+
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+       return pmd_val(pmd) & _PAGE_PSE;
+}
+
+static inline int has_transparent_hugepage(void)
+{
+       return cpu_has_pse;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
 {
        pteval_t v = native_pte_val(pte);
@@ -216,6 +241,55 @@ static inline pte_t pte_mkspecial(pte_t pte)
        return pte_set_flags(pte, _PAGE_SPECIAL);
 }
 
+static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
+{
+       pmdval_t v = native_pmd_val(pmd);
+
+       return __pmd(v | set);
+}
+
+static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
+{
+       pmdval_t v = native_pmd_val(pmd);
+
+       return __pmd(v & ~clear);
+}
+
+static inline pmd_t pmd_mkold(pmd_t pmd)
+{
+       return pmd_clear_flags(pmd, _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_wrprotect(pmd_t pmd)
+{
+       return pmd_clear_flags(pmd, _PAGE_RW);
+}
+
+static inline pmd_t pmd_mkdirty(pmd_t pmd)
+{
+       return pmd_set_flags(pmd, _PAGE_DIRTY);
+}
+
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+       return pmd_set_flags(pmd, _PAGE_PSE);
+}
+
+static inline pmd_t pmd_mkyoung(pmd_t pmd)
+{
+       return pmd_set_flags(pmd, _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_mkwrite(pmd_t pmd)
+{
+       return pmd_set_flags(pmd, _PAGE_RW);
+}
+
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+       return pmd_clear_flags(pmd, _PAGE_PRESENT);
+}
+
 /*
  * Mask out unsupported bits in a present pgprot.  Non-present pgprots
  * can use those bits for other purposes, so leave them be.
@@ -256,6 +330,16 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
        return __pte(val);
 }
 
+static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+       pmdval_t val = pmd_val(pmd);
+
+       val &= _HPAGE_CHG_MASK;
+       val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK;
+
+       return __pmd(val);
+}
+
 /* mprotect needs to preserve PAT bits when updating vm_page_prot */
 #define pgprot_modify pgprot_modify
 static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
@@ -350,7 +434,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
  * Currently stuck as a macro due to indirect forward reference to
  * linux/mmzone.h's __section_mem_map_addr() definition:
  */
-#define pmd_page(pmd)  pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)
+#define pmd_page(pmd)  pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT)
 
 /*
  * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
@@ -524,12 +608,26 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
        return res;
 }
 
+static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
+{
+       pmd_t res = *pmdp;
+
+       native_pmd_clear(pmdp);
+       return res;
+}
+
 static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
                                     pte_t *ptep , pte_t pte)
 {
        native_set_pte(ptep, pte);
 }
 
+static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
+                                    pmd_t *pmdp , pmd_t pmd)
+{
+       native_set_pmd(pmdp, pmd);
+}
+
 #ifndef CONFIG_PARAVIRT
 /*
  * Rules for using pte_update - it must be called after any PTE update which
@@ -607,6 +705,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
 
 #define flush_tlb_fix_spurious_fault(vma, address)
 
+#define mk_pmd(page, pgprot)   pfn_pmd(page_to_pfn(page), (pgprot))
+
+#define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+extern int pmdp_set_access_flags(struct vm_area_struct *vma,
+                                unsigned long address, pmd_t *pmdp,
+                                pmd_t entry, int dirty);
+
+#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+                                    unsigned long addr, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
+                                 unsigned long address, pmd_t *pmdp);
+
+
+#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+extern void pmdp_splitting_flush(struct vm_area_struct *vma,
+                                unsigned long addr, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMD_WRITE
+static inline int pmd_write(pmd_t pmd)
+{
+       return pmd_flags(pmd) & _PAGE_RW;
+}
+
+#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
+static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
+                                      pmd_t *pmdp)
+{
+       pmd_t pmd = native_pmdp_get_and_clear(pmdp);
+       pmd_update(mm, addr, pmdp);
+       return pmd;
+}
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+static inline void pmdp_set_wrprotect(struct mm_struct *mm,
+                                     unsigned long addr, pmd_t *pmdp)
+{
+       clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
+       pmd_update(mm, addr, pmdp);
+}
+
 /*
  * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
  *
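
The new pmd_*() helpers above mirror the long-standing pte_*() ones. As an illustration of how they compose (this function is invented for the example and is not part of the diff; only the helpers it calls are):

        /* Illustrative only: install a writable, dirty huge pmd for 'page'. */
        static void example_install_huge_pmd(struct mm_struct *mm,
                                             struct vm_area_struct *vma,
                                             unsigned long haddr, pmd_t *pmd,
                                             struct page *page)
        {
                pmd_t entry = mk_pmd(page, vma->vm_page_prot);

                entry = pmd_mkhuge(pmd_mkdirty(pmd_mkwrite(entry)));
                set_pmd_at(mm, haddr, pmd, entry);
        }

A walker that finds such an entry can test it with pmd_trans_huge() and, if the splitting bit is set, back off via pmd_trans_splitting(), as the gup-fast hunk below does.
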
index f86da20..975f709 100644 (file)
@@ -59,6 +59,16 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
        native_set_pte(ptep, pte);
 }
 
+static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+       *pmdp = pmd;
+}
+
+static inline void native_pmd_clear(pmd_t *pmd)
+{
+       native_set_pmd(pmd, native_make_pmd(0));
+}
+
 static inline pte_t native_ptep_get_and_clear(pte_t *xp)
 {
 #ifdef CONFIG_SMP
@@ -72,14 +82,17 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
 #endif
 }
 
-static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
 {
-       *pmdp = pmd;
-}
-
-static inline void native_pmd_clear(pmd_t *pmd)
-{
-       native_set_pmd(pmd, native_make_pmd(0));
+#ifdef CONFIG_SMP
+       return native_make_pmd(xchg(&xp->pmd, 0));
+#else
+       /* native_local_pmdp_get_and_clear,
+          but duplicated because of cyclic dependency */
+       pmd_t ret = *xp;
+       native_pmd_clear(xp);
+       return ret;
+#endif
 }
 
 static inline void native_set_pud(pud_t *pudp, pud_t pud)
@@ -168,6 +181,7 @@ extern void cleanup_highmap(void);
 #define        kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
 
 #define __HAVE_ARCH_PTE_SAME
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_X86_PGTABLE_64_H */
index d1f4a76..7db7723 100644 (file)
@@ -22,6 +22,7 @@
 #define _PAGE_BIT_PAT_LARGE    12      /* On 2MB or 1GB pages */
 #define _PAGE_BIT_SPECIAL      _PAGE_BIT_UNUSED1
 #define _PAGE_BIT_CPA_TEST     _PAGE_BIT_UNUSED1
+#define _PAGE_BIT_SPLITTING    _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
 #define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
 
 /* If _PAGE_BIT_PRESENT is clear, we use these: */
@@ -45,6 +46,7 @@
 #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
 #define _PAGE_SPECIAL  (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
 #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#define _PAGE_SPLITTING        (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
 #define __HAVE_ARCH_PTE_SPECIAL
 
 #ifdef CONFIG_KMEMCHECK
@@ -70,6 +72,7 @@
 /* Set of bits not changed in pte_modify */
 #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |         \
                         _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
 
 #define _PAGE_CACHE_MASK       (_PAGE_PCD | _PAGE_PWT)
 #define _PAGE_CACHE_WB         (0)
index 8760cc6..f25bdf2 100644 (file)
@@ -42,6 +42,11 @@ extern unsigned int   machine_to_phys_order;
 extern unsigned long get_phys_to_machine(unsigned long pfn);
 extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
 
+extern int m2p_add_override(unsigned long mfn, struct page *page);
+extern int m2p_remove_override(struct page *page);
+extern struct page *m2p_find_override(unsigned long mfn);
+extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
+
 static inline unsigned long pfn_to_mfn(unsigned long pfn)
 {
        unsigned long mfn;
@@ -72,9 +77,6 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
        if (xen_feature(XENFEAT_auto_translated_physmap))
                return mfn;
 
-       if (unlikely((mfn >> machine_to_phys_order) != 0))
-               return ~0;
-
        pfn = 0;
        /*
         * The array access can fail (e.g., device space beyond end of RAM).
@@ -83,6 +85,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
         */
        __get_user(pfn, &machine_to_phys_mapping[mfn]);
 
+       /*
+        * If this appears to be a foreign mfn (because the pfn
+        * doesn't map back to the mfn), then check the local override
+        * table to see if there's a better pfn to use.
+        */
+       if (get_phys_to_machine(pfn) != mfn)
+               pfn = m2p_find_override_pfn(mfn, pfn);
+
        return pfn;
 }
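
The check added above boils down to a failed round trip: for a page the domain really owns, get_phys_to_machine(mfn_to_pfn(mfn)) returns the original mfn; when it does not, the mfn is foreign (for example a granted page from another domain) and the new m2p override table supplies the pfn of the local page it has been mapped over. Condensed, with the steps annotated (the real code reads the m2p array with __get_user() because the access may fault):

        pfn = machine_to_phys_mapping[mfn];     /* m2p lookup, may be stale */
        if (get_phys_to_machine(pfn) != mfn)    /* p2m disagrees: foreign mfn */
                pfn = m2p_find_override_pfn(mfn, pfn);
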
 
index 8f29560..ab23f1a 100644 (file)
 
 void *module_alloc(unsigned long size)
 {
-       struct vm_struct *area;
-
-       if (!size)
-               return NULL;
-       size = PAGE_ALIGN(size);
-       if (size > MODULES_LEN)
+       if (PAGE_ALIGN(size) > MODULES_LEN)
                return NULL;
-
-       area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
-       if (!area)
-               return NULL;
-
-       return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
-                                       PAGE_KERNEL_EXEC);
+       return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+                               GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
+                               -1, __builtin_return_address(0));
 }
 
 /* Free memory returned from module_alloc */
index c5b2500..869e1ae 100644 (file)
@@ -421,8 +421,11 @@ struct pv_mmu_ops pv_mmu_ops = {
        .set_pte = native_set_pte,
        .set_pte_at = native_set_pte_at,
        .set_pmd = native_set_pmd,
+       .set_pmd_at = native_set_pmd_at,
        .pte_update = paravirt_nop,
        .pte_update_defer = paravirt_nop,
+       .pmd_update = paravirt_nop,
+       .pmd_update_defer = paravirt_nop,
 
        .ptep_modify_prot_start = __ptep_modify_prot_start,
        .ptep_modify_prot_commit = __ptep_modify_prot_commit,
index c2f1b26..998e972 100644 (file)
@@ -133,7 +133,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
        pmd = pmd_alloc(&tboot_mm, pud, vaddr);
        if (!pmd)
                return -1;
-       pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
+       pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr);
        if (!pte)
                return -1;
        set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
index 61fb985..863f875 100644 (file)
@@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
        if (pud_none_or_clear_bad(pud))
                goto out;
        pmd = pmd_offset(pud, 0xA0000);
+       split_huge_page_pmd(mm, pmd);
        if (pmd_none_or_clear_bad(pmd))
                goto out;
        pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
index 9cafbb4..f02b8ed 100644 (file)
@@ -554,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
        return ret;
 }
 
-static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 {
        struct kvm_memory_slot *slot;
-       int host_level, level, max_level;
-
        slot = gfn_to_memslot(vcpu->kvm, large_gfn);
        if (slot && slot->dirty_bitmap)
-               return PT_PAGE_TABLE_LEVEL;
+               return true;
+       return false;
+}
+
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+       int host_level, level, max_level;
 
        host_level = host_mapping_level(vcpu->kvm, large_gfn);
 
@@ -941,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
        return young;
 }
 
+static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+                             unsigned long data)
+{
+       u64 *spte;
+       int young = 0;
+
+       /*
+        * If there's no access bit in the secondary pte set by the
+        * hardware it's up to gup-fast/gup to set the access bit in
+        * the primary pte or in the page structure.
+        */
+       if (!shadow_accessed_mask)
+               goto out;
+
+       spte = rmap_next(kvm, rmapp, NULL);
+       while (spte) {
+               u64 _spte = *spte;
+               BUG_ON(!(_spte & PT_PRESENT_MASK));
+               young = _spte & PT_ACCESSED_MASK;
+               if (young) {
+                       young = 1;
+                       break;
+               }
+               spte = rmap_next(kvm, rmapp, spte);
+       }
+out:
+       return young;
+}
+
 #define RMAP_RECYCLE_THRESHOLD 1000
 
 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -961,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva)
        return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
 }
 
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+       return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
@@ -2281,6 +2319,48 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
        return 1;
 }
 
+static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
+                                       gfn_t *gfnp, pfn_t *pfnp, int *levelp)
+{
+       pfn_t pfn = *pfnp;
+       gfn_t gfn = *gfnp;
+       int level = *levelp;
+
+       /*
+        * Check if it's a transparent hugepage. If this would be a

+        * hugetlbfs page, level wouldn't be set to
+        * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
+        * here.
+        */
+       if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+           level == PT_PAGE_TABLE_LEVEL &&
+           PageTransCompound(pfn_to_page(pfn)) &&
+           !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
+               unsigned long mask;
+               /*
+                * mmu_notifier_retry was successful and we hold the
+                * mmu_lock here, so the pmd can't become splitting
+                * from under us, and in turn
+                * __split_huge_page_refcount() can't run from under
+                * us and we can safely transfer the refcount from
+                * PG_tail to PG_head as we switch the pfn to tail to
+                * head.
+                */
+               *levelp = level = PT_DIRECTORY_LEVEL;
+               mask = KVM_PAGES_PER_HPAGE(level) - 1;
+               VM_BUG_ON((gfn & mask) != (pfn & mask));
+               if (pfn & mask) {
+                       gfn &= ~mask;
+                       *gfnp = gfn;
+                       kvm_release_pfn_clean(pfn);
+                       pfn &= ~mask;
+                       if (!get_page_unless_zero(pfn_to_page(pfn)))
+                               BUG();
+                       *pfnp = pfn;
+               }
+       }
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                         gva_t gva, pfn_t *pfn, bool write, bool *writable);
 
@@ -2289,20 +2369,25 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 {
        int r;
        int level;
+       int force_pt_level;
        pfn_t pfn;
        unsigned long mmu_seq;
        bool map_writable;
 
-       level = mapping_level(vcpu, gfn);
-
-       /*
-        * This path builds a PAE pagetable - so we can map 2mb pages at
-        * maximum. Therefore check if the level is larger than that.
-        */
-       if (level > PT_DIRECTORY_LEVEL)
-               level = PT_DIRECTORY_LEVEL;
+       force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+       if (likely(!force_pt_level)) {
+               level = mapping_level(vcpu, gfn);
+               /*
+                * This path builds a PAE pagetable - so we can map
+                * 2mb pages at maximum. Therefore check if the level
+                * is larger than that.
+                */
+               if (level > PT_DIRECTORY_LEVEL)
+                       level = PT_DIRECTORY_LEVEL;
 
-       gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+               gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+       } else
+               level = PT_PAGE_TABLE_LEVEL;
 
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
@@ -2318,6 +2403,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
        if (mmu_notifier_retry(vcpu, mmu_seq))
                goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
+       if (likely(!force_pt_level))
+               transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
        r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
                         prefault);
        spin_unlock(&vcpu->kvm->mmu_lock);
@@ -2655,6 +2742,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
        pfn_t pfn;
        int r;
        int level;
+       int force_pt_level;
        gfn_t gfn = gpa >> PAGE_SHIFT;
        unsigned long mmu_seq;
        int write = error_code & PFERR_WRITE_MASK;
@@ -2667,9 +2755,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
        if (r)
                return r;
 
-       level = mapping_level(vcpu, gfn);
-
-       gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+       force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+       if (likely(!force_pt_level)) {
+               level = mapping_level(vcpu, gfn);
+               gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+       } else
+               level = PT_PAGE_TABLE_LEVEL;
 
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
@@ -2684,6 +2775,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
        if (mmu_notifier_retry(vcpu, mmu_seq))
                goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
+       if (likely(!force_pt_level))
+               transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
        r = __direct_map(vcpu, gpa, write, map_writable,
                         level, gfn, pfn, prefault);
        spin_unlock(&vcpu->kvm->mmu_lock);
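
A concrete run through transparent_hugepage_adjust(), assuming 4 KiB pages and 2 MiB huge pages so that KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) is 512 and mask is 0x1ff: if the guest faults on gfn 0x1234 and the backing THP starts at host pfn 0x8a00, the resolved pfn is 0x8a34. Both values share their low nine bits (0x034), which is exactly what the VM_BUG_ON asserts; the code then rounds gfn down to 0x1200 and pfn down to 0x8a00, releases the reference it held on the tail page and takes one on the head page instead, and lets __direct_map() install a single 2 MiB mapping.
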
index 53210f1..6bccc24 100644 (file)
@@ -550,6 +550,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
        int r;
        pfn_t pfn;
        int level = PT_PAGE_TABLE_LEVEL;
+       int force_pt_level;
        unsigned long mmu_seq;
        bool map_writable;
 
@@ -577,7 +578,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
                return 0;
        }
 
-       if (walker.level >= PT_DIRECTORY_LEVEL) {
+       if (walker.level >= PT_DIRECTORY_LEVEL)
+               force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
+       else
+               force_pt_level = 1;
+       if (!force_pt_level) {
                level = min(walker.level, mapping_level(vcpu, walker.gfn));
                walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
        }
@@ -599,6 +604,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 
        trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
        kvm_mmu_free_some_pages(vcpu);
+       if (!force_pt_level)
+               transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
        sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
                             level, &write_pt, pfn, map_writable, prefault);
        (void)sptep;
index 738e659..dbe34b9 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/mm.h>
 #include <linux/vmstat.h>
 #include <linux/highmem.h>
+#include <linux/swap.h>
 
 #include <asm/pgtable.h>
 
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);
                get_page(page);
+               SetPageReferenced(page);
                pages[*nr] = page;
                (*nr)++;
 
@@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr)
        VM_BUG_ON(page != compound_head(page));
        VM_BUG_ON(page_count(page) == 0);
        atomic_add(nr, &page->_count);
+       SetPageReferenced(page);
+}
+
+static inline void get_huge_page_tail(struct page *page)
+{
+       /*
+        * __split_huge_page_refcount() cannot run
+        * from under us.
+        */
+       VM_BUG_ON(atomic_read(&page->_count) < 0);
+       atomic_inc(&page->_count);
 }
 
 static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
@@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
+               if (PageTail(page))
+                       get_huge_page_tail(page);
                (*nr)++;
                page++;
                refs++;
@@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                pmd_t pmd = *pmdp;
 
                next = pmd_addr_end(addr, end);
-               if (pmd_none(pmd))
+               /*
+                * The pmd_trans_splitting() check below explains why
+                * pmdp_splitting_flush has to flush the tlb, to stop
+                * this gup-fast code from running while we set the
+                * splitting bit in the pmd. Returning zero will take
+                * the slow path that will call wait_split_huge_page()
+                * if the pmd is still in splitting state. gup-fast
+                * can't because it has irq disabled and
+                * wait_split_huge_page() would never return as the
+                * tlb flush IPI wouldn't run.
+                */
+               if (pmd_none(pmd) || pmd_trans_splitting(pmd))
                        return 0;
                if (unlikely(pmd_large(pmd))) {
                        if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
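
The comment added to gup_pmd_range() above is the serialization argument for the whole splitting protocol. For context, both fast-path entry points in this file perform the entire walk with interrupts disabled, which is what makes the TLB-flush IPI issued by pmdp_splitting_flush() wait for them; a rough sketch of that structure (not a hunk from this diff, declarations elided):

        local_irq_save(flags);
        /* lockless walk: gup_pud_range() -> gup_pmd_range() -> gup_pte_range() */
        gup_pud_range(pgd, addr, next, write, pages, &nr);
        local_irq_restore(flags);

While interrupts are off, the CPU cannot acknowledge the flush IPI, so once the flush in pmdp_splitting_flush() returns, no gup-fast walk that started before the splitting bit was set can still be in progress.
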
index 8be8c7d..500242d 100644 (file)
@@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
        return changed;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+                         unsigned long address, pmd_t *pmdp,
+                         pmd_t entry, int dirty)
+{
+       int changed = !pmd_same(*pmdp, entry);
+
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+       if (changed && dirty) {
+               *pmdp = entry;
+               pmd_update_defer(vma->vm_mm, address, pmdp);
+               flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+       }
+
+       return changed;
+}
+#endif
+
 int ptep_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *ptep)
 {
@@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
        return ret;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+                             unsigned long addr, pmd_t *pmdp)
+{
+       int ret = 0;
+
+       if (pmd_young(*pmdp))
+               ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+                                        (unsigned long *)pmdp);
+
+       if (ret)
+               pmd_update(vma->vm_mm, addr, pmdp);
+
+       return ret;
+}
+#endif
+
 int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep)
 {
@@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
        return young;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+                          unsigned long address, pmd_t *pmdp)
+{
+       int young;
+
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+       young = pmdp_test_and_clear_young(vma, address, pmdp);
+       if (young)
+               flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+       return young;
+}
+
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+                         unsigned long address, pmd_t *pmdp)
+{
+       int set;
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+       set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
+                               (unsigned long *)pmdp);
+       if (set) {
+               pmd_update(vma->vm_mm, address, pmdp);
+               /* need tlb flush only to serialize against gup-fast */
+               flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+       }
+}
+#endif
+
 /**
  * reserve_top_address - reserves a hole in the top of kernel address space
  * @reserve - size of hole to reserve
index 7793851..17c565d 100644 (file)
@@ -12,7 +12,8 @@ CFLAGS_mmu.o                  := $(nostackp)
 
 obj-y          := enlighten.o setup.o multicalls.o mmu.o irq.o \
                        time.o xen-asm.o xen-asm_$(BITS).o \
-                       grant-table.o suspend.o platform-pci-unplug.o
+                       grant-table.o suspend.o platform-pci-unplug.o \
+                       p2m.o
 
 obj-$(CONFIG_SMP)              += smp.o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
index 44924e5..7575e55 100644 (file)
@@ -173,371 +173,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3);    /* actual vcpu cr3 */
  */
 #define USER_LIMIT     ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
 
-/*
- * Xen leaves the responsibility for maintaining p2m mappings to the
- * guests themselves, but it must also access and update the p2m array
- * during suspend/resume when all the pages are reallocated.
- *
- * The p2m table is logically a flat array, but we implement it as a
- * three-level tree to allow the address space to be sparse.
- *
- *                               Xen
- *                                |
- *     p2m_top              p2m_top_mfn
- *       /  \                   /   \
- * p2m_mid p2m_mid     p2m_mid_mfn p2m_mid_mfn
- *    / \      / \         /           /
- *  p2m p2m p2m p2m p2m p2m p2m ...
- *
- * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
- *
- * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
- * maximum representable pseudo-physical address space is:
- *  P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
- *
- * P2M_PER_PAGE depends on the architecture, as a mfn is always
- * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
- * 512 and 1024 entries respectively. 
- */
-
-unsigned long xen_max_p2m_pfn __read_mostly;
-
-#define P2M_PER_PAGE           (PAGE_SIZE / sizeof(unsigned long))
-#define P2M_MID_PER_PAGE       (PAGE_SIZE / sizeof(unsigned long *))
-#define P2M_TOP_PER_PAGE       (PAGE_SIZE / sizeof(unsigned long **))
-
-#define MAX_P2M_PFN            (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
-
-/* Placeholders for holes in the address space */
-static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
-
-static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
-
-RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
-RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
-
-static inline unsigned p2m_top_index(unsigned long pfn)
-{
-       BUG_ON(pfn >= MAX_P2M_PFN);
-       return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
-}
-
-static inline unsigned p2m_mid_index(unsigned long pfn)
-{
-       return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
-}
-
-static inline unsigned p2m_index(unsigned long pfn)
-{
-       return pfn % P2M_PER_PAGE;
-}
-
-static void p2m_top_init(unsigned long ***top)
-{
-       unsigned i;
-
-       for (i = 0; i < P2M_TOP_PER_PAGE; i++)
-               top[i] = p2m_mid_missing;
-}
-
-static void p2m_top_mfn_init(unsigned long *top)
-{
-       unsigned i;
-
-       for (i = 0; i < P2M_TOP_PER_PAGE; i++)
-               top[i] = virt_to_mfn(p2m_mid_missing_mfn);
-}
-
-static void p2m_top_mfn_p_init(unsigned long **top)
-{
-       unsigned i;
-
-       for (i = 0; i < P2M_TOP_PER_PAGE; i++)
-               top[i] = p2m_mid_missing_mfn;
-}
-
-static void p2m_mid_init(unsigned long **mid)
-{
-       unsigned i;
-
-       for (i = 0; i < P2M_MID_PER_PAGE; i++)
-               mid[i] = p2m_missing;
-}
-
-static void p2m_mid_mfn_init(unsigned long *mid)
-{
-       unsigned i;
-
-       for (i = 0; i < P2M_MID_PER_PAGE; i++)
-               mid[i] = virt_to_mfn(p2m_missing);
-}
-
-static void p2m_init(unsigned long *p2m)
-{
-       unsigned i;
-
-       for (i = 0; i < P2M_MID_PER_PAGE; i++)
-               p2m[i] = INVALID_P2M_ENTRY;
-}
-
-/*
- * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
- *
- * This is called both at boot time, and after resuming from suspend:
- * - At boot time we're called very early, and must use extend_brk()
- *   to allocate memory.
- *
- * - After resume we're called from within stop_machine, but the mfn
- *   tree should alreay be completely allocated.
- */
-void xen_build_mfn_list_list(void)
-{
-       unsigned long pfn;
-
-       /* Pre-initialize p2m_top_mfn to be completely missing */
-       if (p2m_top_mfn == NULL) {
-               p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
-               p2m_mid_mfn_init(p2m_mid_missing_mfn);
-
-               p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
-               p2m_top_mfn_p_init(p2m_top_mfn_p);
-
-               p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
-               p2m_top_mfn_init(p2m_top_mfn);
-       } else {
-               /* Reinitialise, mfn's all change after migration */
-               p2m_mid_mfn_init(p2m_mid_missing_mfn);
-       }
-
-       for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
-               unsigned topidx = p2m_top_index(pfn);
-               unsigned mididx = p2m_mid_index(pfn);
-               unsigned long **mid;
-               unsigned long *mid_mfn_p;
-
-               mid = p2m_top[topidx];
-               mid_mfn_p = p2m_top_mfn_p[topidx];
-
-               /* Don't bother allocating any mfn mid levels if
-                * they're just missing, just update the stored mfn,
-                * since all could have changed over a migrate.
-                */
-               if (mid == p2m_mid_missing) {
-                       BUG_ON(mididx);
-                       BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
-                       p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
-                       pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
-                       continue;
-               }
-
-               if (mid_mfn_p == p2m_mid_missing_mfn) {
-                       /*
-                        * XXX boot-time only!  We should never find
-                        * missing parts of the mfn tree after
-                        * runtime.  extend_brk() will BUG if we call
-                        * it too late.
-                        */
-                       mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
-                       p2m_mid_mfn_init(mid_mfn_p);
-
-                       p2m_top_mfn_p[topidx] = mid_mfn_p;
-               }
-
-               p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
-               mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
-       }
-}
-
-void xen_setup_mfn_list_list(void)
-{
-       BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
-
-       HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
-               virt_to_mfn(p2m_top_mfn);
-       HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
-}
-
-/* Set up p2m_top to point to the domain-builder provided p2m pages */
-void __init xen_build_dynamic_phys_to_machine(void)
-{
-       unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
-       unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
-       unsigned long pfn;
-
-       xen_max_p2m_pfn = max_pfn;
-
-       p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
-       p2m_init(p2m_missing);
-
-       p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
-       p2m_mid_init(p2m_mid_missing);
-
-       p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
-       p2m_top_init(p2m_top);
-
-       /*
-        * The domain builder gives us a pre-constructed p2m array in
-        * mfn_list for all the pages initially given to us, so we just
-        * need to graft that into our tree structure.
-        */
-       for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
-               unsigned topidx = p2m_top_index(pfn);
-               unsigned mididx = p2m_mid_index(pfn);
-
-               if (p2m_top[topidx] == p2m_mid_missing) {
-                       unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
-                       p2m_mid_init(mid);
-
-                       p2m_top[topidx] = mid;
-               }
-
-               p2m_top[topidx][mididx] = &mfn_list[pfn];
-       }
-}
-
-unsigned long get_phys_to_machine(unsigned long pfn)
-{
-       unsigned topidx, mididx, idx;
-
-       if (unlikely(pfn >= MAX_P2M_PFN))
-               return INVALID_P2M_ENTRY;
-
-       topidx = p2m_top_index(pfn);
-       mididx = p2m_mid_index(pfn);
-       idx = p2m_index(pfn);
-
-       return p2m_top[topidx][mididx][idx];
-}
-EXPORT_SYMBOL_GPL(get_phys_to_machine);
-
-static void *alloc_p2m_page(void)
-{
-       return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
-}
-
-static void free_p2m_page(void *p)
-{
-       free_page((unsigned long)p);
-}
-
-/* 
- * Fully allocate the p2m structure for a given pfn.  We need to check
- * that both the top and mid levels are allocated, and make sure the
- * parallel mfn tree is kept in sync.  We may race with other cpus, so
- * the new pages are installed with cmpxchg; if we lose the race then
- * simply free the page we allocated and use the one that's there.
- */
-static bool alloc_p2m(unsigned long pfn)
-{
-       unsigned topidx, mididx;
-       unsigned long ***top_p, **mid;
-       unsigned long *top_mfn_p, *mid_mfn;
-
-       topidx = p2m_top_index(pfn);
-       mididx = p2m_mid_index(pfn);
-
-       top_p = &p2m_top[topidx];
-       mid = *top_p;
-
-       if (mid == p2m_mid_missing) {
-               /* Mid level is missing, allocate a new one */
-               mid = alloc_p2m_page();
-               if (!mid)
-                       return false;
-
-               p2m_mid_init(mid);
-
-               if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
-                       free_p2m_page(mid);
-       }
-
-       top_mfn_p = &p2m_top_mfn[topidx];
-       mid_mfn = p2m_top_mfn_p[topidx];
-
-       BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
-
-       if (mid_mfn == p2m_mid_missing_mfn) {
-               /* Separately check the mid mfn level */
-               unsigned long missing_mfn;
-               unsigned long mid_mfn_mfn;
-
-               mid_mfn = alloc_p2m_page();
-               if (!mid_mfn)
-                       return false;
-
-               p2m_mid_mfn_init(mid_mfn);
-
-               missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
-               mid_mfn_mfn = virt_to_mfn(mid_mfn);
-               if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
-                       free_p2m_page(mid_mfn);
-               else
-                       p2m_top_mfn_p[topidx] = mid_mfn;
-       }
-
-       if (p2m_top[topidx][mididx] == p2m_missing) {
-               /* p2m leaf page is missing */
-               unsigned long *p2m;
-
-               p2m = alloc_p2m_page();
-               if (!p2m)
-                       return false;
-
-               p2m_init(p2m);
-
-               if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
-                       free_p2m_page(p2m);
-               else
-                       mid_mfn[mididx] = virt_to_mfn(p2m);
-       }
-
-       return true;
-}
-
-/* Try to install p2m mapping; fail if intermediate bits missing */
-bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
-{
-       unsigned topidx, mididx, idx;
-
-       if (unlikely(pfn >= MAX_P2M_PFN)) {
-               BUG_ON(mfn != INVALID_P2M_ENTRY);
-               return true;
-       }
-
-       topidx = p2m_top_index(pfn);
-       mididx = p2m_mid_index(pfn);
-       idx = p2m_index(pfn);
-
-       if (p2m_top[topidx][mididx] == p2m_missing)
-               return mfn == INVALID_P2M_ENTRY;
-
-       p2m_top[topidx][mididx][idx] = mfn;
-
-       return true;
-}
-
-bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
-{
-       if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
-               BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
-               return true;
-       }
-
-       if (unlikely(!__set_phys_to_machine(pfn, mfn)))  {
-               if (!alloc_p2m(pfn))
-                       return false;
-
-               if (!__set_phys_to_machine(pfn, mfn))
-                       return false;
-       }
-
-       return true;
-}
-
 unsigned long arbitrary_virt_to_mfn(void *vaddr)
 {
        xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
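
To make the sizing comment in the block removed above (and moved into the new p2m.c below) concrete: with 4 KiB pages, each level holds PAGE_SIZE / sizeof(unsigned long) entries, so

        64-bit:  512 * 512 * 512   = 134,217,728 pfns, i.e. 512 GiB of pseudo-physical space
        32-bit: 1024 * 1024 * 1024 = 1,073,741,824 pfns, i.e. 4 TiB

which is the MAX_P2M_PFN bound that p2m_top_index() enforces with its BUG_ON().
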
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
new file mode 100644 (file)
index 0000000..8f2251d
--- /dev/null
@@ -0,0 +1,510 @@
+/*
+ * Xen leaves the responsibility for maintaining p2m mappings to the
+ * guests themselves, but it must also access and update the p2m array
+ * during suspend/resume when all the pages are reallocated.
+ *
+ * The p2m table is logically a flat array, but we implement it as a
+ * three-level tree to allow the address space to be sparse.
+ *
+ *                               Xen
+ *                                |
+ *     p2m_top              p2m_top_mfn
+ *       /  \                   /   \
+ * p2m_mid p2m_mid     p2m_mid_mfn p2m_mid_mfn
+ *    / \      / \         /           /
+ *  p2m p2m p2m p2m p2m p2m p2m ...
+ *
+ * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
+ *
+ * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
+ * maximum representable pseudo-physical address space is:
+ *  P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
+ *
+ * P2M_PER_PAGE depends on the architecture, as an mfn is always
+ * unsigned long (8 bytes on 64-bit, 4 bytes on 32-bit), leading to
+ * 512 and 1024 entries respectively.
+ */
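As a worked example of the limit described in the comment above (illustrative arithmetic only, assuming 4 KiB pages):

    P2M_PER_PAGE   = 4096 / 8             = 512 entries        (64-bit)
    MAX_P2M_PFN    = 512 * 512 * 512      = 134,217,728 pfns
    address space  = 134,217,728 * 4 KiB  = 512 GiB

On 32-bit, each level holds 4096 / 4 = 1024 entries, so the tree can describe 1024^3 pfns, i.e. 4 TiB of pseudo-physical address space.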
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
+
+#include <asm/cache.h>
+#include <asm/setup.h>
+
+#include <asm/xen/page.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include "xen-ops.h"
+
+static void __init m2p_override_init(void);
+
+unsigned long xen_max_p2m_pfn __read_mostly;
+
+#define P2M_PER_PAGE           (PAGE_SIZE / sizeof(unsigned long))
+#define P2M_MID_PER_PAGE       (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE       (PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN            (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
+/* Placeholders for holes in the address space */
+static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
+
+static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
+
+RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+
+static inline unsigned p2m_top_index(unsigned long pfn)
+{
+       BUG_ON(pfn >= MAX_P2M_PFN);
+       return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
+}
+
+static inline unsigned p2m_mid_index(unsigned long pfn)
+{
+       return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
+}
+
+static inline unsigned p2m_index(unsigned long pfn)
+{
+       return pfn % P2M_PER_PAGE;
+}
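To see how the three index helpers above slice a pfn (illustrative numbers, using the 64-bit constants where each level holds 512 entries):

    pfn    = 0x12345 (74565)
    topidx = 74565 / (512 * 512)  = 0
    mididx = (74565 / 512) % 512  = 145
    idx    = 74565 % 512          = 325

so the entry for that pfn lives at p2m_top[0][145][325].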
+
+static void p2m_top_init(unsigned long ***top)
+{
+       unsigned i;
+
+       for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+               top[i] = p2m_mid_missing;
+}
+
+static void p2m_top_mfn_init(unsigned long *top)
+{
+       unsigned i;
+
+       for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+               top[i] = virt_to_mfn(p2m_mid_missing_mfn);
+}
+
+static void p2m_top_mfn_p_init(unsigned long **top)
+{
+       unsigned i;
+
+       for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+               top[i] = p2m_mid_missing_mfn;
+}
+
+static void p2m_mid_init(unsigned long **mid)
+{
+       unsigned i;
+
+       for (i = 0; i < P2M_MID_PER_PAGE; i++)
+               mid[i] = p2m_missing;
+}
+
+static void p2m_mid_mfn_init(unsigned long *mid)
+{
+       unsigned i;
+
+       for (i = 0; i < P2M_MID_PER_PAGE; i++)
+               mid[i] = virt_to_mfn(p2m_missing);
+}
+
+static void p2m_init(unsigned long *p2m)
+{
+       unsigned i;
+
+       for (i = 0; i < P2M_PER_PAGE; i++)
+               p2m[i] = INVALID_P2M_ENTRY;
+}
+
+/*
+ * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
+ *
+ * This is called both at boot time, and after resuming from suspend:
+ * - At boot time we're called very early, and must use extend_brk()
+ *   to allocate memory.
+ *
+ * - After resume we're called from within stop_machine, but the mfn
+ *   tree should already be completely allocated.
+ */
+void xen_build_mfn_list_list(void)
+{
+       unsigned long pfn;
+
+       /* Pre-initialize p2m_top_mfn to be completely missing */
+       if (p2m_top_mfn == NULL) {
+               p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+               p2m_mid_mfn_init(p2m_mid_missing_mfn);
+
+               p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+               p2m_top_mfn_p_init(p2m_top_mfn_p);
+
+               p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+               p2m_top_mfn_init(p2m_top_mfn);
+       } else {
+               /* Reinitialise; MFNs all change after migration */
+               p2m_mid_mfn_init(p2m_mid_missing_mfn);
+       }
+
+       for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
+               unsigned topidx = p2m_top_index(pfn);
+               unsigned mididx = p2m_mid_index(pfn);
+               unsigned long **mid;
+               unsigned long *mid_mfn_p;
+
+               mid = p2m_top[topidx];
+               mid_mfn_p = p2m_top_mfn_p[topidx];
+
+               /* Don't bother allocating any mfn mid levels if
+                * they're just missing; simply update the stored mfn,
+                * since all of them could have changed over a migration.
+                */
+               if (mid == p2m_mid_missing) {
+                       BUG_ON(mididx);
+                       BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
+                       p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
+                       pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
+                       continue;
+               }
+
+               if (mid_mfn_p == p2m_mid_missing_mfn) {
+                       /*
+                        * XXX boot-time only!  We should never find
+                        * missing parts of the mfn tree after
+                        * runtime.  extend_brk() will BUG if we call
+                        * it too late.
+                        */
+                       mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+                       p2m_mid_mfn_init(mid_mfn_p);
+
+                       p2m_top_mfn_p[topidx] = mid_mfn_p;
+               }
+
+               p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
+               mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
+       }
+}
+
+void xen_setup_mfn_list_list(void)
+{
+       BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
+
+       HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+               virt_to_mfn(p2m_top_mfn);
+       HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
+}
+
+/* Set up p2m_top to point to the domain-builder provided p2m pages */
+void __init xen_build_dynamic_phys_to_machine(void)
+{
+       unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
+       unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
+       unsigned long pfn;
+
+       xen_max_p2m_pfn = max_pfn;
+
+       p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+       p2m_init(p2m_missing);
+
+       p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+       p2m_mid_init(p2m_mid_missing);
+
+       p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
+       p2m_top_init(p2m_top);
+
+       /*
+        * The domain builder gives us a pre-constructed p2m array in
+        * mfn_list for all the pages initially given to us, so we just
+        * need to graft that into our tree structure.
+        */
+       for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
+               unsigned topidx = p2m_top_index(pfn);
+               unsigned mididx = p2m_mid_index(pfn);
+
+               if (p2m_top[topidx] == p2m_mid_missing) {
+                       unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+                       p2m_mid_init(mid);
+
+                       p2m_top[topidx] = mid;
+               }
+
+               p2m_top[topidx][mididx] = &mfn_list[pfn];
+       }
+
+       m2p_override_init();
+}
+
+unsigned long get_phys_to_machine(unsigned long pfn)
+{
+       unsigned topidx, mididx, idx;
+
+       if (unlikely(pfn >= MAX_P2M_PFN))
+               return INVALID_P2M_ENTRY;
+
+       topidx = p2m_top_index(pfn);
+       mididx = p2m_mid_index(pfn);
+       idx = p2m_index(pfn);
+
+       return p2m_top[topidx][mididx][idx];
+}
+EXPORT_SYMBOL_GPL(get_phys_to_machine);
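A minimal caller-side sketch of the lookup exported above (the helper name is hypothetical; only get_phys_to_machine(), INVALID_P2M_ENTRY and FOREIGN_FRAME_BIT come from this patch):

	/* Translate a guest pfn before handing a frame number to the hypervisor. */
	static int frame_for_hypercall(unsigned long pfn, unsigned long *mfn_out)
	{
		unsigned long mfn = get_phys_to_machine(pfn);

		if (mfn == INVALID_P2M_ENTRY)
			return -ENOENT;			/* no machine frame backs this pfn */

		*mfn_out = mfn & ~FOREIGN_FRAME_BIT;	/* strip the foreign marker, if any */
		return 0;
	}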
+
+static void *alloc_p2m_page(void)
+{
+       return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
+}
+
+static void free_p2m_page(void *p)
+{
+       free_page((unsigned long)p);
+}
+
+/* 
+ * Fully allocate the p2m structure for a given pfn.  We need to check
+ * that both the top and mid levels are allocated, and make sure the
+ * parallel mfn tree is kept in sync.  We may race with other cpus, so
+ * the new pages are installed with cmpxchg; if we lose the race then
+ * simply free the page we allocated and use the one that's there.
+ */
+static bool alloc_p2m(unsigned long pfn)
+{
+       unsigned topidx, mididx;
+       unsigned long ***top_p, **mid;
+       unsigned long *top_mfn_p, *mid_mfn;
+
+       topidx = p2m_top_index(pfn);
+       mididx = p2m_mid_index(pfn);
+
+       top_p = &p2m_top[topidx];
+       mid = *top_p;
+
+       if (mid == p2m_mid_missing) {
+               /* Mid level is missing, allocate a new one */
+               mid = alloc_p2m_page();
+               if (!mid)
+                       return false;
+
+               p2m_mid_init(mid);
+
+               if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
+                       free_p2m_page(mid);
+       }
+
+       top_mfn_p = &p2m_top_mfn[topidx];
+       mid_mfn = p2m_top_mfn_p[topidx];
+
+       BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
+
+       if (mid_mfn == p2m_mid_missing_mfn) {
+               /* Separately check the mid mfn level */
+               unsigned long missing_mfn;
+               unsigned long mid_mfn_mfn;
+
+               mid_mfn = alloc_p2m_page();
+               if (!mid_mfn)
+                       return false;
+
+               p2m_mid_mfn_init(mid_mfn);
+
+               missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
+               mid_mfn_mfn = virt_to_mfn(mid_mfn);
+               if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
+                       free_p2m_page(mid_mfn);
+               else
+                       p2m_top_mfn_p[topidx] = mid_mfn;
+       }
+
+       if (p2m_top[topidx][mididx] == p2m_missing) {
+               /* p2m leaf page is missing */
+               unsigned long *p2m;
+
+               p2m = alloc_p2m_page();
+               if (!p2m)
+                       return false;
+
+               p2m_init(p2m);
+
+               if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
+                       free_p2m_page(p2m);
+               else
+                       mid_mfn[mididx] = virt_to_mfn(p2m);
+       }
+
+       return true;
+}
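The race handling in alloc_p2m() follows a general install-or-free pattern: allocate a candidate page, try to publish it with cmpxchg, and free it again if another CPU won. A self-contained userspace sketch of just that pattern, using C11 atomics (the names and the allocator are stand-ins, not kernel code):

	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	static int placeholder;                   /* plays the role of p2m_mid_missing */
	static _Atomic(int *) slot = &placeholder;

	static int *install_level(void)
	{
		int *cur = atomic_load(&slot);
		int *new;

		if (cur != &placeholder)          /* someone already populated the slot */
			return cur;

		new = calloc(1, sizeof(*new));    /* candidate "page" */
		if (!new)
			return NULL;

		/* Publish atomically; on losing the race, free ours and use the winner's. */
		if (!atomic_compare_exchange_strong(&slot, &cur, new)) {
			free(new);
			return cur;               /* cur was updated to the installed pointer */
		}
		return new;
	}

	int main(void)
	{
		printf("installed %p\n", (void *)install_level());
		return 0;
	}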
+
+/* Try to install p2m mapping; fail if intermediate bits missing */
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+       unsigned topidx, mididx, idx;
+
+       if (unlikely(pfn >= MAX_P2M_PFN)) {
+               BUG_ON(mfn != INVALID_P2M_ENTRY);
+               return true;
+       }
+
+       topidx = p2m_top_index(pfn);
+       mididx = p2m_mid_index(pfn);
+       idx = p2m_index(pfn);
+
+       if (p2m_top[topidx][mididx] == p2m_missing)
+               return mfn == INVALID_P2M_ENTRY;
+
+       p2m_top[topidx][mididx][idx] = mfn;
+
+       return true;
+}
+
+bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+       if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
+               BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+               return true;
+       }
+
+       if (unlikely(!__set_phys_to_machine(pfn, mfn)))  {
+               if (!alloc_p2m(pfn))
+                       return false;
+
+               if (!__set_phys_to_machine(pfn, mfn))
+                       return false;
+       }
+
+       return true;
+}
+
+#define M2P_OVERRIDE_HASH_SHIFT        10
+#define M2P_OVERRIDE_HASH      (1 << M2P_OVERRIDE_HASH_SHIFT)
+
+static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
+static DEFINE_SPINLOCK(m2p_override_lock);
+
+static void __init m2p_override_init(void)
+{
+       unsigned i;
+
+       m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
+                                  sizeof(unsigned long));
+
+       for (i = 0; i < M2P_OVERRIDE_HASH; i++)
+               INIT_LIST_HEAD(&m2p_overrides[i]);
+}
+
+static unsigned long mfn_hash(unsigned long mfn)
+{
+       return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
+}
+
+/* Add an MFN override for a particular page */
+int m2p_add_override(unsigned long mfn, struct page *page)
+{
+       unsigned long flags;
+       unsigned long pfn;
+       unsigned long address;
+       unsigned level;
+       pte_t *ptep = NULL;
+
+       pfn = page_to_pfn(page);
+       if (!PageHighMem(page)) {
+               address = (unsigned long)__va(pfn << PAGE_SHIFT);
+               ptep = lookup_address(address, &level);
+
+               if (WARN(ptep == NULL || level != PG_LEVEL_4K,
+                                       "m2p_add_override: pfn %lx not mapped", pfn))
+                       return -EINVAL;
+       }
+
+       page->private = mfn;
+       page->index = pfn_to_mfn(pfn);
+
+       __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
+       if (!PageHighMem(page))
+               /* Just zap old mapping for now */
+               pte_clear(&init_mm, address, ptep);
+
+       spin_lock_irqsave(&m2p_override_lock, flags);
+       list_add(&page->lru,  &m2p_overrides[mfn_hash(mfn)]);
+       spin_unlock_irqrestore(&m2p_override_lock, flags);
+
+       return 0;
+}
+
+int m2p_remove_override(struct page *page)
+{
+       unsigned long flags;
+       unsigned long mfn;
+       unsigned long pfn;
+       unsigned long address;
+       unsigned level;
+       pte_t *ptep = NULL;
+
+       pfn = page_to_pfn(page);
+       mfn = get_phys_to_machine(pfn);
+       if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
+               return -EINVAL;
+
+       if (!PageHighMem(page)) {
+               address = (unsigned long)__va(pfn << PAGE_SHIFT);
+               ptep = lookup_address(address, &level);
+
+               if (WARN(ptep == NULL || level != PG_LEVEL_4K,
+                                       "m2p_remove_override: pfn %lx not mapped", pfn))
+                       return -EINVAL;
+       }
+
+       spin_lock_irqsave(&m2p_override_lock, flags);
+       list_del(&page->lru);
+       spin_unlock_irqrestore(&m2p_override_lock, flags);
+       __set_phys_to_machine(pfn, page->index);
+
+       if (!PageHighMem(page))
+               set_pte_at(&init_mm, address, ptep,
+                               pfn_pte(pfn, PAGE_KERNEL));
+               /* No tlb flush necessary because the caller already
+                * left the pte unmapped. */
+
+       return 0;
+}
+
+struct page *m2p_find_override(unsigned long mfn)
+{
+       unsigned long flags;
+       struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
+       struct page *p, *ret;
+
+       ret = NULL;
+
+       spin_lock_irqsave(&m2p_override_lock, flags);
+
+       list_for_each_entry(p, bucket, lru) {
+               if (p->private == mfn) {
+                       ret = p;
+                       break;
+               }
+       }
+
+       spin_unlock_irqrestore(&m2p_override_lock, flags);
+
+       return ret;
+}
+
+unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
+{
+       struct page *p = m2p_find_override(mfn);
+       unsigned long ret = pfn;
+
+       if (p)
+               ret = page_to_pfn(p);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
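A rough sketch of how a grant-mapping backend might use the override API above (the two wrapper functions are hypothetical and the grant-table hypercalls and error handling are elided; only m2p_add_override(), m2p_remove_override() and m2p_find_override_pfn() are taken from this file):

	static int track_foreign_frame(unsigned long foreign_mfn, struct page *page)
	{
		/* ... GNTTABOP_map_grant_ref has already mapped the foreign frame
		 *     into the frame backing 'page' ... */
		int ret = m2p_add_override(foreign_mfn, page);

		if (ret)
			return ret;

		/* From now on m2p_find_override_pfn(foreign_mfn, pfn) returns
		 * page_to_pfn(page) instead of the stale machine-to-phys entry. */
		return 0;
	}

	static void untrack_foreign_frame(struct page *page)
	{
		WARN_ON(m2p_remove_override(page));
		/* ... GNTTABOP_unmap_grant_ref follows ... */
	}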
index fca4db4..3078901 100644 (file)
@@ -83,6 +83,9 @@
 #define MADV_MERGEABLE   12            /* KSM may merge identical pages */
 #define MADV_UNMERGEABLE 13            /* KSM may not merge identical pages */
 
+#define MADV_HUGEPAGE  14              /* Worth backing with hugepages */
+#define MADV_NOHUGEPAGE        15              /* Not worth backing with hugepages */
+
 /* compatibility flags */
 #define MAP_FILE       0
 
index ce012a9..36b4305 100644 (file)
@@ -117,12 +117,21 @@ static ssize_t node_read_meminfo(struct sys_device * dev,
                       "Node %d WritebackTmp:   %8lu kB\n"
                       "Node %d Slab:           %8lu kB\n"
                       "Node %d SReclaimable:   %8lu kB\n"
-                      "Node %d SUnreclaim:     %8lu kB\n",
+                      "Node %d SUnreclaim:     %8lu kB\n"
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+                      "Node %d AnonHugePages:  %8lu kB\n"
+#endif
+                       ,
                       nid, K(node_page_state(nid, NR_FILE_DIRTY)),
                       nid, K(node_page_state(nid, NR_WRITEBACK)),
                       nid, K(node_page_state(nid, NR_FILE_PAGES)),
                       nid, K(node_page_state(nid, NR_FILE_MAPPED)),
-                      nid, K(node_page_state(nid, NR_ANON_PAGES)),
+                      nid, K(node_page_state(nid, NR_ANON_PAGES)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+                       + node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) *
+                       HPAGE_PMD_NR
+#endif
+                      ),
                       nid, K(node_page_state(nid, NR_SHMEM)),
                       nid, node_page_state(nid, NR_KERNEL_STACK) *
                                THREAD_SIZE / 1024,
@@ -133,7 +142,13 @@ static ssize_t node_read_meminfo(struct sys_device * dev,
                       nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) +
                                node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
                       nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)),
-                      nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)));
+                      nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE))
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+                       , nid,
+                       K(node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) *
+                       HPAGE_PMD_NR)
+#endif
+                      );
        n += hugetlb_report_node_meminfo(nid, buf + n);
        return n;
 }
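For scale (illustrative arithmetic, assuming x86-64 with 4 KiB base pages, where HPAGE_PMD_NR is 512 and K() converts pages to kB): a node holding three anonymous transparent hugepages would report

    pages   = 3 * HPAGE_PMD_NR = 3 * 512 = 1536
    K(1536) = 1536 * 4                   = 6144 kB

so the new line reads "Node N AnonHugePages:  6144 kB", and those same 1536 pages are also folded into the AnonPages figure above it.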
index bf1a95e..98d9ec8 100644 (file)
@@ -240,6 +240,30 @@ config DM_MIRROR
          Allow volume managers to mirror logical volumes, also
          needed for live data migration tools such as 'pvmove'.
 
+config DM_RAID
+       tristate "RAID 4/5/6 target (EXPERIMENTAL)"
+       depends on BLK_DEV_DM && EXPERIMENTAL
+       select MD_RAID456
+       select BLK_DEV_MD
+       ---help---
+        A dm target that supports RAID4, RAID5 and RAID6 mappings
+
+        A RAID-5 set of N drives with a capacity of C MB per drive provides
+        the capacity of C * (N - 1) MB, and protects against a failure
+        of a single drive. For a given sector (row) number, (N - 1) drives
+        contain data sectors, and one drive contains the parity protection.
+        For a RAID-4 set, the parity blocks are present on a single drive,
+        while a RAID-5 set distributes the parity across the drives in one
+        of the available parity distribution methods.
+
+        A RAID-6 set of N drives with a capacity of C MB per drive
+        provides the capacity of C * (N - 2) MB, and protects
+        against a failure of any two drives. For a given sector
+        (row) number, (N - 2) drives contain data sectors, and two
+        drives contain two independent redundancy syndromes.  Like
+        RAID-5, RAID-6 distributes the syndromes across the drives
+        in one of the available parity distribution methods.
+
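Worked example of the capacity formulas in the help text above (illustrative figures only):

    N = 5 drives, C = 1000 MB each
    RAID-5: C * (N - 1) = 1000 * 4 = 4000 MB, survives any single drive failure
    RAID-6: C * (N - 2) = 1000 * 3 = 3000 MB, survives any two drive failures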
 config DM_LOG_USERSPACE
        tristate "Mirror userspace logging (EXPERIMENTAL)"
        depends on DM_MIRROR && EXPERIMENTAL && NET
index 5e3aac4..d013860 100644 (file)
@@ -36,6 +36,7 @@ obj-$(CONFIG_DM_SNAPSHOT)     += dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)                += dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO)          += dm-zero.o
+obj-$(CONFIG_DM_RAID)  += dm-raid.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs                    += dm-uevent.o
index 5a1ffe3..9a35320 100644 (file)
@@ -210,11 +210,11 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset,
                    || test_bit(Faulty, &rdev->flags))
                        continue;
 
-               target = rdev->sb_start + offset + index * (PAGE_SIZE/512);
+               target = offset + index * (PAGE_SIZE/512);
 
                if (sync_page_io(rdev, target,
                                 roundup(size, bdev_logical_block_size(rdev->bdev)),
-                                page, READ)) {
+                                page, READ, true)) {
                        page->index = index;
                        attach_page_buffers(page, NULL); /* so that free_buffer will
                                                          * quietly no-op */
@@ -264,14 +264,18 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
 static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
        mdk_rdev_t *rdev = NULL;
+       struct block_device *bdev;
        mddev_t *mddev = bitmap->mddev;
 
        while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
                int size = PAGE_SIZE;
                loff_t offset = mddev->bitmap_info.offset;
+
+               bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
+
                if (page->index == bitmap->file_pages-1)
                        size = roundup(bitmap->last_page_size,
-                                      bdev_logical_block_size(rdev->bdev));
+                                      bdev_logical_block_size(bdev));
                /* Just make sure we aren't corrupting data or
                 * metadata
                 */
@@ -1542,7 +1546,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
        wait_event(bitmap->mddev->recovery_wait,
                   atomic_read(&bitmap->mddev->recovery_active) == 0);
 
-       bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync;
+       bitmap->mddev->curr_resync_completed = sector;
        set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
        sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
        s = 0;
index d5b0e4c..4e054bd 100644 (file)
 #include <linux/crypto.h>
 #include <linux/workqueue.h>
 #include <linux/backing-dev.h>
+#include <linux/percpu.h>
 #include <asm/atomic.h>
 #include <linux/scatterlist.h>
 #include <asm/page.h>
 #include <asm/unaligned.h>
+#include <crypto/hash.h>
+#include <crypto/md5.h>
+#include <crypto/algapi.h>
 
 #include <linux/device-mapper.h>
 
@@ -63,6 +67,7 @@ struct dm_crypt_request {
        struct convert_context *ctx;
        struct scatterlist sg_in;
        struct scatterlist sg_out;
+       sector_t iv_sector;
 };
 
 struct crypt_config;
@@ -73,11 +78,13 @@ struct crypt_iv_operations {
        void (*dtr)(struct crypt_config *cc);
        int (*init)(struct crypt_config *cc);
        int (*wipe)(struct crypt_config *cc);
-       int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
+       int (*generator)(struct crypt_config *cc, u8 *iv,
+                        struct dm_crypt_request *dmreq);
+       int (*post)(struct crypt_config *cc, u8 *iv,
+                   struct dm_crypt_request *dmreq);
 };
 
 struct iv_essiv_private {
-       struct crypto_cipher *tfm;
        struct crypto_hash *hash_tfm;
        u8 *salt;
 };
@@ -86,11 +93,32 @@ struct iv_benbi_private {
        int shift;
 };
 
+#define LMK_SEED_SIZE 64 /* hash + 0 */
+struct iv_lmk_private {
+       struct crypto_shash *hash_tfm;
+       u8 *seed;
+};
+
 /*
  * Crypt: maps a linear range of a block device
  * and encrypts / decrypts at the same time.
  */
 enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
+
+/*
+ * Duplicated per-CPU state for cipher.
+ */
+struct crypt_cpu {
+       struct ablkcipher_request *req;
+       /* ESSIV: struct crypto_cipher *essiv_tfm */
+       void *iv_private;
+       struct crypto_ablkcipher *tfms[0];
+};
+
+/*
+ * The fields in here must be read only after initialization;
+ * any state that changes belongs in struct crypt_cpu.
+ */
 struct crypt_config {
        struct dm_dev *dev;
        sector_t start;
@@ -108,17 +136,25 @@ struct crypt_config {
        struct workqueue_struct *crypt_queue;
 
        char *cipher;
-       char *cipher_mode;
+       char *cipher_string;
 
        struct crypt_iv_operations *iv_gen_ops;
        union {
                struct iv_essiv_private essiv;
                struct iv_benbi_private benbi;
+               struct iv_lmk_private lmk;
        } iv_gen_private;
        sector_t iv_offset;
        unsigned int iv_size;
 
        /*
+        * Duplicated per cpu state. Access through
+        * per_cpu_ptr() only.
+        */
+       struct crypt_cpu __percpu *cpu;
+       unsigned tfms_count;
+
+       /*
         * Layout of each crypto request:
         *
         *   struct ablkcipher_request
@@ -132,11 +168,10 @@ struct crypt_config {
         * correctly aligned.
         */
        unsigned int dmreq_start;
-       struct ablkcipher_request *req;
 
-       struct crypto_ablkcipher *tfm;
        unsigned long flags;
        unsigned int key_size;
+       unsigned int key_parts;
        u8 key[0];
 };
 
@@ -148,6 +183,20 @@ static struct kmem_cache *_crypt_io_pool;
 
 static void clone_init(struct dm_crypt_io *, struct bio *);
 static void kcryptd_queue_crypt(struct dm_crypt_io *io);
+static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
+
+static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
+{
+       return this_cpu_ptr(cc->cpu);
+}
+
+/*
+ * Use this to access cipher attributes that are the same for each CPU.
+ */
+static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
+{
+       return __this_cpu_ptr(cc->cpu)->tfms[0];
+}
 
 /*
  * Different IV generation algorithms:
@@ -168,23 +217,38 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
  * null: the initial vector is always zero.  Provides compatibility with
  *       obsolete loop_fish2 devices.  Do not use for new devices.
  *
+ * lmk:  Compatible implementation of the block chaining mode used
+ *       by the Loop-AES block device encryption system
+ *       designed by Jari Ruusu. See http://loop-aes.sourceforge.net/
+ *       It operates on full 512 byte sectors and uses CBC
+ *       with an IV derived from the sector number, the data and
+ *       optionally an extra IV seed.
+ *       This means that after decryption the first block
+ *       of the sector must be tweaked according to the decrypted data.
+ *       Loop-AES can use three encryption schemes:
+ *         version 1: plain aes-cbc mode
+ *         version 2: uses a 64-key (multikey) scheme with the lmk IV generator
+ *         version 3: the same as version 2 with an additional IV seed
+ *                   (it uses 65 keys; the last key is used as the IV seed)
+ *
  * plumb: unimplemented, see:
  * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
  */
 
-static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
+                             struct dm_crypt_request *dmreq)
 {
        memset(iv, 0, cc->iv_size);
-       *(u32 *)iv = cpu_to_le32(sector & 0xffffffff);
+       *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
 
        return 0;
 }
 
 static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
-                               sector_t sector)
+                               struct dm_crypt_request *dmreq)
 {
        memset(iv, 0, cc->iv_size);
-       *(u64 *)iv = cpu_to_le64(sector);
+       *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
 
        return 0;
 }
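To make the lmk/multikey description above concrete: with the keycount extension added later in this patch, a compatible dm-crypt table line could look like the following (hypothetical device and sizes; the key field is the 64 per-sector keys concatenated into one hex string, passed to dmsetup in the usual <start> <length> crypt <cipher> <key> <iv_offset> <device> <offset> form):

    0 2097152 crypt aes:64-cbc-lmk <64 concatenated hex keys> 0 /dev/sdb1 0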
@@ -195,7 +259,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
        struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
        struct hash_desc desc;
        struct scatterlist sg;
-       int err;
+       struct crypto_cipher *essiv_tfm;
+       int err, cpu;
 
        sg_init_one(&sg, cc->key, cc->key_size);
        desc.tfm = essiv->hash_tfm;
@@ -205,8 +270,16 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
        if (err)
                return err;
 
-       return crypto_cipher_setkey(essiv->tfm, essiv->salt,
+       for_each_possible_cpu(cpu) {
+               essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private;
+
+               err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
                                    crypto_hash_digestsize(essiv->hash_tfm));
+               if (err)
+                       return err;
+       }
+
+       return 0;
 }
 
 /* Wipe salt and reset key derived from volume key */
@@ -214,24 +287,76 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
 {
        struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
        unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
+       struct crypto_cipher *essiv_tfm;
+       int cpu, r, err = 0;
 
        memset(essiv->salt, 0, salt_size);
 
-       return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
+       for_each_possible_cpu(cpu) {
+               essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private;
+               r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
+               if (r)
+                       err = r;
+       }
+
+       return err;
+}
+
+/* Set up per cpu cipher state */
+static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
+                                            struct dm_target *ti,
+                                            u8 *salt, unsigned saltsize)
+{
+       struct crypto_cipher *essiv_tfm;
+       int err;
+
+       /* Setup the essiv_tfm with the given salt */
+       essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
+       if (IS_ERR(essiv_tfm)) {
+               ti->error = "Error allocating crypto tfm for ESSIV";
+               return essiv_tfm;
+       }
+
+       if (crypto_cipher_blocksize(essiv_tfm) !=
+           crypto_ablkcipher_ivsize(any_tfm(cc))) {
+               ti->error = "Block size of ESSIV cipher does "
+                           "not match IV size of block cipher";
+               crypto_free_cipher(essiv_tfm);
+               return ERR_PTR(-EINVAL);
+       }
+
+       err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
+       if (err) {
+               ti->error = "Failed to set key for ESSIV cipher";
+               crypto_free_cipher(essiv_tfm);
+               return ERR_PTR(err);
+       }
+
+       return essiv_tfm;
 }
 
 static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 {
+       int cpu;
+       struct crypt_cpu *cpu_cc;
+       struct crypto_cipher *essiv_tfm;
        struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
 
-       crypto_free_cipher(essiv->tfm);
-       essiv->tfm = NULL;
-
        crypto_free_hash(essiv->hash_tfm);
        essiv->hash_tfm = NULL;
 
        kzfree(essiv->salt);
        essiv->salt = NULL;
+
+       for_each_possible_cpu(cpu) {
+               cpu_cc = per_cpu_ptr(cc->cpu, cpu);
+               essiv_tfm = cpu_cc->iv_private;
+
+               if (essiv_tfm)
+                       crypto_free_cipher(essiv_tfm);
+
+               cpu_cc->iv_private = NULL;
+       }
 }
 
 static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -240,7 +365,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
        struct crypto_cipher *essiv_tfm = NULL;
        struct crypto_hash *hash_tfm = NULL;
        u8 *salt = NULL;
-       int err;
+       int err, cpu;
 
        if (!opts) {
                ti->error = "Digest algorithm missing for ESSIV mode";
@@ -262,48 +387,44 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
                goto bad;
        }
 
-       /* Allocate essiv_tfm */
-       essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
-       if (IS_ERR(essiv_tfm)) {
-               ti->error = "Error allocating crypto tfm for ESSIV";
-               err = PTR_ERR(essiv_tfm);
-               goto bad;
-       }
-       if (crypto_cipher_blocksize(essiv_tfm) !=
-           crypto_ablkcipher_ivsize(cc->tfm)) {
-               ti->error = "Block size of ESSIV cipher does "
-                           "not match IV size of block cipher";
-               err = -EINVAL;
-               goto bad;
-       }
-
        cc->iv_gen_private.essiv.salt = salt;
-       cc->iv_gen_private.essiv.tfm = essiv_tfm;
        cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
 
+       for_each_possible_cpu(cpu) {
+               essiv_tfm = setup_essiv_cpu(cc, ti, salt,
+                                       crypto_hash_digestsize(hash_tfm));
+               if (IS_ERR(essiv_tfm)) {
+                       crypt_iv_essiv_dtr(cc);
+                       return PTR_ERR(essiv_tfm);
+               }
+               per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm;
+       }
+
        return 0;
 
 bad:
-       if (essiv_tfm && !IS_ERR(essiv_tfm))
-               crypto_free_cipher(essiv_tfm);
        if (hash_tfm && !IS_ERR(hash_tfm))
                crypto_free_hash(hash_tfm);
        kfree(salt);
        return err;
 }
 
-static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
+                             struct dm_crypt_request *dmreq)
 {
+       struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;
+
        memset(iv, 0, cc->iv_size);
-       *(u64 *)iv = cpu_to_le64(sector);
-       crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
+       *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
+       crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
+
        return 0;
 }
 
 static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
                              const char *opts)
 {
-       unsigned bs = crypto_ablkcipher_blocksize(cc->tfm);
+       unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc));
        int log = ilog2(bs);
 
        /* we need to calculate how far we must shift the sector count
@@ -328,25 +449,177 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc)
 {
 }
 
-static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv,
+                             struct dm_crypt_request *dmreq)
 {
        __be64 val;
 
        memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
 
-       val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1);
+       val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1);
        put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
 
        return 0;
 }
 
-static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv,
+                            struct dm_crypt_request *dmreq)
 {
        memset(iv, 0, cc->iv_size);
 
        return 0;
 }
 
+static void crypt_iv_lmk_dtr(struct crypt_config *cc)
+{
+       struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+
+       if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm))
+               crypto_free_shash(lmk->hash_tfm);
+       lmk->hash_tfm = NULL;
+
+       kzfree(lmk->seed);
+       lmk->seed = NULL;
+}
+
+static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti,
+                           const char *opts)
+{
+       struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+
+       lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0);
+       if (IS_ERR(lmk->hash_tfm)) {
+               ti->error = "Error initializing LMK hash";
+               return PTR_ERR(lmk->hash_tfm);
+       }
+
+       /* No seed in LMK version 2 */
+       if (cc->key_parts == cc->tfms_count) {
+               lmk->seed = NULL;
+               return 0;
+       }
+
+       lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL);
+       if (!lmk->seed) {
+               crypt_iv_lmk_dtr(cc);
+               ti->error = "Error kmallocing seed storage in LMK";
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static int crypt_iv_lmk_init(struct crypt_config *cc)
+{
+       struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+       int subkey_size = cc->key_size / cc->key_parts;
+
+       /* The LMK seed is stored at the position of the (LMK_KEYS + 1)th key */
+       if (lmk->seed)
+               memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size),
+                      crypto_shash_digestsize(lmk->hash_tfm));
+
+       return 0;
+}
+
+static int crypt_iv_lmk_wipe(struct crypt_config *cc)
+{
+       struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+
+       if (lmk->seed)
+               memset(lmk->seed, 0, LMK_SEED_SIZE);
+
+       return 0;
+}
+
+static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
+                           struct dm_crypt_request *dmreq,
+                           u8 *data)
+{
+       struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+       struct {
+               struct shash_desc desc;
+               char ctx[crypto_shash_descsize(lmk->hash_tfm)];
+       } sdesc;
+       struct md5_state md5state;
+       u32 buf[4];
+       int i, r;
+
+       sdesc.desc.tfm = lmk->hash_tfm;
+       sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+       r = crypto_shash_init(&sdesc.desc);
+       if (r)
+               return r;
+
+       if (lmk->seed) {
+               r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE);
+               if (r)
+                       return r;
+       }
+
+       /* Sector is always 512B, block size 16, add data of blocks 1-31 */
+       r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31);
+       if (r)
+               return r;
+
+       /* Sector is cropped to 56 bits here */
+       buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF);
+       buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000);
+       buf[2] = cpu_to_le32(4024);
+       buf[3] = 0;
+       r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf));
+       if (r)
+               return r;
+
+       /* No MD5 padding here */
+       r = crypto_shash_export(&sdesc.desc, &md5state);
+       if (r)
+               return r;
+
+       for (i = 0; i < MD5_HASH_WORDS; i++)
+               __cpu_to_le32s(&md5state.hash[i]);
+       memcpy(iv, &md5state.hash, cc->iv_size);
+
+       return 0;
+}
+
+static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
+                           struct dm_crypt_request *dmreq)
+{
+       u8 *src;
+       int r = 0;
+
+       if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
+               src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0);
+               r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
+               kunmap_atomic(src, KM_USER0);
+       } else
+               memset(iv, 0, cc->iv_size);
+
+       return r;
+}
+
+static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
+                            struct dm_crypt_request *dmreq)
+{
+       u8 *dst;
+       int r;
+
+       if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
+               return 0;
+
+       dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0);
+       r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
+
+       /* Tweak the first block of plaintext sector */
+       if (!r)
+               crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
+
+       kunmap_atomic(dst, KM_USER0);
+       return r;
+}
+
 static struct crypt_iv_operations crypt_iv_plain_ops = {
        .generator = crypt_iv_plain_gen
 };
@@ -373,6 +646,15 @@ static struct crypt_iv_operations crypt_iv_null_ops = {
        .generator = crypt_iv_null_gen
 };
 
+static struct crypt_iv_operations crypt_iv_lmk_ops = {
+       .ctr       = crypt_iv_lmk_ctr,
+       .dtr       = crypt_iv_lmk_dtr,
+       .init      = crypt_iv_lmk_init,
+       .wipe      = crypt_iv_lmk_wipe,
+       .generator = crypt_iv_lmk_gen,
+       .post      = crypt_iv_lmk_post
+};
+
 static void crypt_convert_init(struct crypt_config *cc,
                               struct convert_context *ctx,
                               struct bio *bio_out, struct bio *bio_in,
@@ -400,6 +682,13 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc,
        return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start);
 }
 
+static u8 *iv_of_dmreq(struct crypt_config *cc,
+                      struct dm_crypt_request *dmreq)
+{
+       return (u8 *)ALIGN((unsigned long)(dmreq + 1),
+               crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
+}
+
 static int crypt_convert_block(struct crypt_config *cc,
                               struct convert_context *ctx,
                               struct ablkcipher_request *req)
@@ -411,9 +700,9 @@ static int crypt_convert_block(struct crypt_config *cc,
        int r = 0;
 
        dmreq = dmreq_of_req(cc, req);
-       iv = (u8 *)ALIGN((unsigned long)(dmreq + 1),
-                        crypto_ablkcipher_alignmask(cc->tfm) + 1);
+       iv = iv_of_dmreq(cc, dmreq);
 
+       dmreq->iv_sector = ctx->sector;
        dmreq->ctx = ctx;
        sg_init_table(&dmreq->sg_in, 1);
        sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
@@ -436,7 +725,7 @@ static int crypt_convert_block(struct crypt_config *cc,
        }
 
        if (cc->iv_gen_ops) {
-               r = cc->iv_gen_ops->generator(cc, iv, ctx->sector);
+               r = cc->iv_gen_ops->generator(cc, iv, dmreq);
                if (r < 0)
                        return r;
        }
@@ -449,21 +738,28 @@ static int crypt_convert_block(struct crypt_config *cc,
        else
                r = crypto_ablkcipher_decrypt(req);
 
+       if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
+               r = cc->iv_gen_ops->post(cc, iv, dmreq);
+
        return r;
 }
 
 static void kcryptd_async_done(struct crypto_async_request *async_req,
                               int error);
+
 static void crypt_alloc_req(struct crypt_config *cc,
                            struct convert_context *ctx)
 {
-       if (!cc->req)
-               cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
-       ablkcipher_request_set_tfm(cc->req, cc->tfm);
-       ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG |
-                                       CRYPTO_TFM_REQ_MAY_SLEEP,
-                                       kcryptd_async_done,
-                                       dmreq_of_req(cc, cc->req));
+       struct crypt_cpu *this_cc = this_crypt_config(cc);
+       unsigned key_index = ctx->sector & (cc->tfms_count - 1);
+
+       if (!this_cc->req)
+               this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+
+       ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]);
+       ablkcipher_request_set_callback(this_cc->req,
+           CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+           kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
 }
 
 /*
@@ -472,6 +768,7 @@ static void crypt_alloc_req(struct crypt_config *cc,
 static int crypt_convert(struct crypt_config *cc,
                         struct convert_context *ctx)
 {
+       struct crypt_cpu *this_cc = this_crypt_config(cc);
        int r;
 
        atomic_set(&ctx->pending, 1);
@@ -483,7 +780,7 @@ static int crypt_convert(struct crypt_config *cc,
 
                atomic_inc(&ctx->pending);
 
-               r = crypt_convert_block(cc, ctx, cc->req);
+               r = crypt_convert_block(cc, ctx, this_cc->req);
 
                switch (r) {
                /* async */
@@ -492,7 +789,7 @@ static int crypt_convert(struct crypt_config *cc,
                        INIT_COMPLETION(ctx->restart);
                        /* fall through*/
                case -EINPROGRESS:
-                       cc->req = NULL;
+                       this_cc->req = NULL;
                        ctx->sector++;
                        continue;
 
@@ -651,6 +948,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
  * They must be separated as otherwise the final stages could be
  * starved by new requests which can block in the first stages due
  * to memory allocation.
+ *
+ * The work is done per CPU, globally for all dm-crypt instances.
+ * They should not depend on each other and do not block.
  */
 static void crypt_endio(struct bio *clone, int error)
 {
@@ -691,26 +991,30 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
        clone->bi_destructor = dm_crypt_bio_destructor;
 }
 
-static void kcryptd_io_read(struct dm_crypt_io *io)
+static void kcryptd_unplug(struct crypt_config *cc)
+{
+       blk_unplug(bdev_get_queue(cc->dev->bdev));
+}
+
+static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 {
        struct crypt_config *cc = io->target->private;
        struct bio *base_bio = io->base_bio;
        struct bio *clone;
 
-       crypt_inc_pending(io);
-
        /*
         * The block layer might modify the bvec array, so always
         * copy the required bvecs because we need the original
         * one in order to decrypt the whole bio data *afterwards*.
         */
-       clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs);
-       if (unlikely(!clone)) {
-               io->error = -ENOMEM;
-               crypt_dec_pending(io);
-               return;
+       clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs);
+       if (!clone) {
+               kcryptd_unplug(cc);
+               return 1;
        }
 
+       crypt_inc_pending(io);
+
        clone_init(io, clone);
        clone->bi_idx = 0;
        clone->bi_vcnt = bio_segments(base_bio);
@@ -720,6 +1024,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io)
               sizeof(struct bio_vec) * clone->bi_vcnt);
 
        generic_make_request(clone);
+       return 0;
 }
 
 static void kcryptd_io_write(struct dm_crypt_io *io)
@@ -732,9 +1037,12 @@ static void kcryptd_io(struct work_struct *work)
 {
        struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
 
-       if (bio_data_dir(io->base_bio) == READ)
-               kcryptd_io_read(io);
-       else
+       if (bio_data_dir(io->base_bio) == READ) {
+               crypt_inc_pending(io);
+               if (kcryptd_io_read(io, GFP_NOIO))
+                       io->error = -ENOMEM;
+               crypt_dec_pending(io);
+       } else
                kcryptd_io_write(io);
 }
 
@@ -901,6 +1209,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
                return;
        }
 
+       if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
+               error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
+
        mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
 
        if (!atomic_dec_and_test(&ctx->pending))
@@ -971,34 +1282,84 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
        }
 }
 
-static int crypt_set_key(struct crypt_config *cc, char *key)
+static void crypt_free_tfms(struct crypt_config *cc, int cpu)
 {
-       unsigned key_size = strlen(key) >> 1;
+       struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
+       unsigned i;
 
-       if (cc->key_size && cc->key_size != key_size)
+       for (i = 0; i < cc->tfms_count; i++)
+               if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) {
+                       crypto_free_ablkcipher(cpu_cc->tfms[i]);
+                       cpu_cc->tfms[i] = NULL;
+               }
+}
+
+static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode)
+{
+       struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
+       unsigned i;
+       int err;
+
+       for (i = 0; i < cc->tfms_count; i++) {
+               cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
+               if (IS_ERR(cpu_cc->tfms[i])) {
+                       err = PTR_ERR(cpu_cc->tfms[i]);
+                       crypt_free_tfms(cc, cpu);
+                       return err;
+               }
+       }
+
+       return 0;
+}
+
+static int crypt_setkey_allcpus(struct crypt_config *cc)
+{
+       unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count);
+       int cpu, err = 0, i, r;
+
+       for_each_possible_cpu(cpu) {
+               for (i = 0; i < cc->tfms_count; i++) {
+                       r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i],
+                                                    cc->key + (i * subkey_size), subkey_size);
+                       if (r)
+                               err = r;
+               }
+       }
+
+       return err;
+}
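Worked example of the split performed by crypt_setkey_allcpus() above (illustrative numbers, assuming the hypothetical aes:64-cbc-lmk mapping with 128-bit AES keys):

    tfms_count  = 64                     (from the ":64" keycount)
    key string  = 64 * 32 hex chars      = 2048 hex characters
    key_size    = 2048 / 2               = 1024 bytes
    subkey_size = 1024 >> ilog2(64)      = 1024 >> 6 = 16 bytes
    tfm i key   = cc->key[i * 16 .. i * 16 + 15]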
+
+static int crypt_set_key(struct crypt_config *cc, char *key)
+{
+       /* The key size may not be changed. */
+       if (cc->key_size != (strlen(key) >> 1))
                return -EINVAL;
 
-       cc->key_size = key_size; /* initial settings */
+       /* Hyphen (which gives a key_size of zero) means there is no key. */
+       if (!cc->key_size && strcmp(key, "-"))
+               return -EINVAL;
 
-       if ((!key_size && strcmp(key, "-")) ||
-          (key_size && crypt_decode_key(cc->key, key, key_size) < 0))
+       if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
                return -EINVAL;
 
        set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
 
-       return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
+       return crypt_setkey_allcpus(cc);
 }
 
 static int crypt_wipe_key(struct crypt_config *cc)
 {
        clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
        memset(&cc->key, 0, cc->key_size * sizeof(u8));
-       return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
+
+       return crypt_setkey_allcpus(cc);
 }
 
 static void crypt_dtr(struct dm_target *ti)
 {
        struct crypt_config *cc = ti->private;
+       struct crypt_cpu *cpu_cc;
+       int cpu;
 
        ti->private = NULL;
 
@@ -1010,6 +1371,14 @@ static void crypt_dtr(struct dm_target *ti)
        if (cc->crypt_queue)
                destroy_workqueue(cc->crypt_queue);
 
+       if (cc->cpu)
+               for_each_possible_cpu(cpu) {
+                       cpu_cc = per_cpu_ptr(cc->cpu, cpu);
+                       if (cpu_cc->req)
+                               mempool_free(cpu_cc->req, cc->req_pool);
+                       crypt_free_tfms(cc, cpu);
+               }
+
        if (cc->bs)
                bioset_free(cc->bs);
 
@@ -1023,14 +1392,14 @@ static void crypt_dtr(struct dm_target *ti)
        if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
                cc->iv_gen_ops->dtr(cc);
 
-       if (cc->tfm && !IS_ERR(cc->tfm))
-               crypto_free_ablkcipher(cc->tfm);
-
        if (cc->dev)
                dm_put_device(ti, cc->dev);
 
+       if (cc->cpu)
+               free_percpu(cc->cpu);
+
        kzfree(cc->cipher);
-       kzfree(cc->cipher_mode);
+       kzfree(cc->cipher_string);
 
        /* Must zero key material before freeing */
        kzfree(cc);
@@ -1040,9 +1409,9 @@ static int crypt_ctr_cipher(struct dm_target *ti,
                            char *cipher_in, char *key)
 {
        struct crypt_config *cc = ti->private;
-       char *tmp, *cipher, *chainmode, *ivmode, *ivopts;
+       char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
        char *cipher_api = NULL;
-       int ret = -EINVAL;
+       int cpu, ret = -EINVAL;
 
        /* Convert to crypto api definition? */
        if (strchr(cipher_in, '(')) {
@@ -1050,23 +1419,31 @@ static int crypt_ctr_cipher(struct dm_target *ti,
                return -EINVAL;
        }
 
+       cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL);
+       if (!cc->cipher_string)
+               goto bad_mem;
+
        /*
         * Legacy dm-crypt cipher specification
-        * cipher-mode-iv:ivopts
+        * cipher[:keycount]-mode-iv:ivopts
         */
        tmp = cipher_in;
-       cipher = strsep(&tmp, "-");
+       keycount = strsep(&tmp, "-");
+       cipher = strsep(&keycount, ":");
+
+       if (!keycount)
+               cc->tfms_count = 1;
+       else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 ||
+                !is_power_of_2(cc->tfms_count)) {
+               ti->error = "Bad cipher key count specification";
+               return -EINVAL;
+       }
+       cc->key_parts = cc->tfms_count;
 
        cc->cipher = kstrdup(cipher, GFP_KERNEL);
        if (!cc->cipher)
                goto bad_mem;
 
-       if (tmp) {
-               cc->cipher_mode = kstrdup(tmp, GFP_KERNEL);
-               if (!cc->cipher_mode)
-                       goto bad_mem;
-       }
-
        chainmode = strsep(&tmp, "-");
        ivopts = strsep(&tmp, "-");
        ivmode = strsep(&ivopts, ":");
@@ -1074,10 +1451,19 @@ static int crypt_ctr_cipher(struct dm_target *ti,
        if (tmp)
                DMWARN("Ignoring unexpected additional cipher options");
 
-       /* Compatibility mode for old dm-crypt mappings */
+       cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) +
+                                cc->tfms_count * sizeof(*(cc->cpu->tfms)),
+                                __alignof__(struct crypt_cpu));
+       if (!cc->cpu) {
+               ti->error = "Cannot allocate per cpu state";
+               goto bad_mem;
+       }
+
+       /*
+        * For compatibility with the original dm-crypt mapping format, if
+        * only the cipher name is supplied, use cbc-plain.
+        */
        if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) {
-               kfree(cc->cipher_mode);
-               cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL);
                chainmode = "cbc";
                ivmode = "plain";
        }
@@ -1099,11 +1485,12 @@ static int crypt_ctr_cipher(struct dm_target *ti,
        }
 
        /* Allocate cipher */
-       cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0);
-       if (IS_ERR(cc->tfm)) {
-               ret = PTR_ERR(cc->tfm);
-               ti->error = "Error allocating crypto tfm";
-               goto bad;
+       for_each_possible_cpu(cpu) {
+               ret = crypt_alloc_tfms(cc, cpu, cipher_api);
+               if (ret < 0) {
+                       ti->error = "Error allocating crypto tfm";
+                       goto bad;
+               }
        }
 
        /* Initialize and set key */
@@ -1114,7 +1501,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
        }
 
        /* Initialize IV */
-       cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm);
+       cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc));
        if (cc->iv_size)
                /* at least a 64 bit sector number should fit in our buffer */
                cc->iv_size = max(cc->iv_size,
@@ -1137,7 +1524,15 @@ static int crypt_ctr_cipher(struct dm_target *ti,
                cc->iv_gen_ops = &crypt_iv_benbi_ops;
        else if (strcmp(ivmode, "null") == 0)
                cc->iv_gen_ops = &crypt_iv_null_ops;
-       else {
+       else if (strcmp(ivmode, "lmk") == 0) {
+               cc->iv_gen_ops = &crypt_iv_lmk_ops;
+               /* Versions 2 and 3 are recognised according
+                * to the length of the provided multi-key string.
+                * If present (version 3), the last key is used as the IV seed.
+                */
+               if (cc->key_size % cc->key_parts)
+                       cc->key_parts++;
+       } else {
                ret = -EINVAL;
                ti->error = "Invalid IV mode";
                goto bad;
@@ -1194,6 +1589,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
                ti->error = "Cannot allocate encryption context";
                return -ENOMEM;
        }
+       cc->key_size = key_size;
 
        ti->private = cc;
        ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
@@ -1208,9 +1604,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        }
 
        cc->dmreq_start = sizeof(struct ablkcipher_request);
-       cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm);
+       cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
        cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
-       cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) &
+       cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
                           ~(crypto_tfm_ctx_alignment() - 1);
 
        cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
@@ -1219,7 +1615,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
                ti->error = "Cannot allocate crypt request mempool";
                goto bad;
        }
-       cc->req = NULL;
 
        cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
        if (!cc->page_pool) {
@@ -1252,13 +1647,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        cc->start = tmpll;
 
        ret = -ENOMEM;
-       cc->io_queue = create_singlethread_workqueue("kcryptd_io");
+       cc->io_queue = alloc_workqueue("kcryptd_io",
+                                      WQ_NON_REENTRANT|
+                                      WQ_MEM_RECLAIM,
+                                      1);
        if (!cc->io_queue) {
                ti->error = "Couldn't create kcryptd io queue";
                goto bad;
        }
 
-       cc->crypt_queue = create_singlethread_workqueue("kcryptd");
+       cc->crypt_queue = alloc_workqueue("kcryptd",
+                                         WQ_NON_REENTRANT|
+                                         WQ_CPU_INTENSIVE|
+                                         WQ_MEM_RECLAIM,
+                                         1);
        if (!cc->crypt_queue) {
                ti->error = "Couldn't create kcryptd queue";
                goto bad;
@@ -1286,9 +1688,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 
        io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector));
 
-       if (bio_data_dir(io->base_bio) == READ)
-               kcryptd_queue_io(io);
-       else
+       if (bio_data_dir(io->base_bio) == READ) {
+               if (kcryptd_io_read(io, GFP_NOWAIT))
+                       kcryptd_queue_io(io);
+       } else
                kcryptd_queue_crypt(io);
 
        return DM_MAPIO_SUBMITTED;
@@ -1306,10 +1709,7 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
                break;
 
        case STATUSTYPE_TABLE:
-               if (cc->cipher_mode)
-                       DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode);
-               else
-                       DMEMIT("%s ", cc->cipher);
+               DMEMIT("%s ", cc->cipher_string);
 
                if (cc->key_size > 0) {
                        if ((maxlen - sz) < ((cc->key_size << 1) + 1))
@@ -1421,7 +1821,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
 
 static struct target_type crypt_target = {
        .name   = "crypt",
-       .version = {1, 7, 0},
+       .version = {1, 10, 0},
        .module = THIS_MODULE,
        .ctr    = crypt_ctr,
        .dtr    = crypt_dtr,
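
The hunks above replace dm-crypt's single shared cipher handle with one transform per possible CPU (crypt_alloc_tfms() in a for_each_possible_cpu() loop, any_tfm() wherever a representative handle is needed), so concurrent bios no longer serialise on one tfm. A rough, self-contained sketch of that allocation pattern, using hypothetical names rather than the actual dm-crypt structures:

    #include <linux/crypto.h>
    #include <linux/err.h>
    #include <linux/percpu.h>

    /* Sketch only: one crypto transform per possible CPU, unwound on failure. */
    struct pcpu_crypt {
            struct crypto_ablkcipher *tfm;
    };

    static struct pcpu_crypt __percpu *pcpu_crypt_alloc(const char *cipher_api)
    {
            struct pcpu_crypt __percpu *pc;
            int cpu;

            pc = alloc_percpu(struct pcpu_crypt);
            if (!pc)
                    return NULL;

            for_each_possible_cpu(cpu) {
                    struct pcpu_crypt *c = per_cpu_ptr(pc, cpu);

                    c->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0);
                    if (IS_ERR(c->tfm)) {
                            c->tfm = NULL;
                            goto err;
                    }
            }
            return pc;

    err:
            for_each_possible_cpu(cpu) {
                    struct pcpu_crypt *c = per_cpu_ptr(pc, cpu);

                    if (c->tfm)
                            crypto_free_ablkcipher(c->tfm);
            }
            free_percpu(pc);
            return NULL;
    }

crypt_ctr_cipher() does the equivalent through crypt_alloc_tfms(), keeping the transforms in its own per-CPU state and reporting failure via ti->error as shown above.
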
index baa1191..f18375d 100644 (file)
@@ -352,7 +352,7 @@ static int __init dm_delay_init(void)
 {
        int r = -ENOMEM;
 
-       kdelayd_wq = create_workqueue("kdelayd");
+       kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
        if (!kdelayd_wq) {
                DMERR("Couldn't start kdelayd");
                goto bad_queue;
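
This is one instance of a conversion repeated across the series: create_workqueue()/create_singlethread_workqueue() calls become alloc_workqueue() with explicit flags, WQ_MEM_RECLAIM being the important one for block drivers that must keep making progress under memory pressure (the dm-crypt queues above additionally pass WQ_NON_REENTRANT, WQ_CPU_INTENSIVE and max_active = 1). A minimal, hypothetical module showing the idiom:

    #include <linux/module.h>
    #include <linux/workqueue.h>

    static struct workqueue_struct *example_wq;

    static void example_fn(struct work_struct *work)
    {
            /* deferred work goes here */
    }
    static DECLARE_WORK(example_work, example_fn);

    static int __init example_init(void)
    {
            /* max_active == 0 selects the default concurrency limit */
            example_wq = alloc_workqueue("example", WQ_MEM_RECLAIM, 0);
            if (!example_wq)
                    return -ENOMEM;

            queue_work(example_wq, &example_work);
            return 0;
    }

    static void __exit example_exit(void)
    {
            flush_workqueue(example_wq);
            destroy_workqueue(example_wq);
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");
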
index 4b54618..6d12775 100644 (file)
@@ -295,19 +295,55 @@ retry:
                DMWARN("remove_all left %d open device(s)", dev_skipped);
 }
 
+/*
+ * Set the uuid of a hash_cell that isn't already set.
+ */
+static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid)
+{
+       mutex_lock(&dm_hash_cells_mutex);
+       hc->uuid = new_uuid;
+       mutex_unlock(&dm_hash_cells_mutex);
+
+       list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid));
+}
+
+/*
+ * Changes the name of a hash_cell and returns the old name for
+ * the caller to free.
+ */
+static char *__change_cell_name(struct hash_cell *hc, char *new_name)
+{
+       char *old_name;
+
+       /*
+        * Rename and move the name cell.
+        */
+       list_del(&hc->name_list);
+       old_name = hc->name;
+
+       mutex_lock(&dm_hash_cells_mutex);
+       hc->name = new_name;
+       mutex_unlock(&dm_hash_cells_mutex);
+
+       list_add(&hc->name_list, _name_buckets + hash_str(new_name));
+
+       return old_name;
+}
+
 static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
                                            const char *new)
 {
-       char *new_name, *old_name;
+       char *new_data, *old_name = NULL;
        struct hash_cell *hc;
        struct dm_table *table;
        struct mapped_device *md;
+       unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
 
        /*
         * duplicate new.
         */
-       new_name = kstrdup(new, GFP_KERNEL);
-       if (!new_name)
+       new_data = kstrdup(new, GFP_KERNEL);
+       if (!new_data)
                return ERR_PTR(-ENOMEM);
 
        down_write(&_hash_lock);
@@ -315,13 +351,19 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
        /*
         * Is new free ?
         */
-       hc = __get_name_cell(new);
+       if (change_uuid)
+               hc = __get_uuid_cell(new);
+       else
+               hc = __get_name_cell(new);
+
        if (hc) {
-               DMWARN("asked to rename to an already-existing name %s -> %s",
+               DMWARN("Unable to change %s on mapped device %s to one that "
+                      "already exists: %s",
+                      change_uuid ? "uuid" : "name",
                       param->name, new);
                dm_put(hc->md);
                up_write(&_hash_lock);
-               kfree(new_name);
+               kfree(new_data);
                return ERR_PTR(-EBUSY);
        }
 
@@ -330,22 +372,30 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
         */
        hc = __get_name_cell(param->name);
        if (!hc) {
-               DMWARN("asked to rename a non-existent device %s -> %s",
-                      param->name, new);
+               DMWARN("Unable to rename non-existent device, %s to %s%s",
+                      param->name, change_uuid ? "uuid " : "", new);
                up_write(&_hash_lock);
-               kfree(new_name);
+               kfree(new_data);
                return ERR_PTR(-ENXIO);
        }
 
        /*
-        * rename and move the name cell.
+        * Does this device already have a uuid?
         */
-       list_del(&hc->name_list);
-       old_name = hc->name;
-       mutex_lock(&dm_hash_cells_mutex);
-       hc->name = new_name;
-       mutex_unlock(&dm_hash_cells_mutex);
-       list_add(&hc->name_list, _name_buckets + hash_str(new_name));
+       if (change_uuid && hc->uuid) {
+               DMWARN("Unable to change uuid of mapped device %s to %s "
+                      "because uuid is already set to %s",
+                      param->name, new, hc->uuid);
+               dm_put(hc->md);
+               up_write(&_hash_lock);
+               kfree(new_data);
+               return ERR_PTR(-EINVAL);
+       }
+
+       if (change_uuid)
+               __set_cell_uuid(hc, new_data);
+       else
+               old_name = __change_cell_name(hc, new_data);
 
        /*
         * Wake up any dm event waiters.
@@ -729,7 +779,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
        hc = __find_device_hash_cell(param);
 
        if (!hc) {
-               DMWARN("device doesn't appear to be in the dev hash table.");
+               DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
                up_write(&_hash_lock);
                return -ENXIO;
        }
@@ -741,7 +791,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
         */
        r = dm_lock_for_deletion(md);
        if (r) {
-               DMWARN("unable to remove open device %s", hc->name);
+               DMDEBUG_LIMIT("unable to remove open device %s", hc->name);
                up_write(&_hash_lock);
                dm_put(md);
                return r;
@@ -774,21 +824,24 @@ static int invalid_str(char *str, void *end)
 static int dev_rename(struct dm_ioctl *param, size_t param_size)
 {
        int r;
-       char *new_name = (char *) param + param->data_start;
+       char *new_data = (char *) param + param->data_start;
        struct mapped_device *md;
+       unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
 
-       if (new_name < param->data ||
-           invalid_str(new_name, (void *) param + param_size) ||
-           strlen(new_name) > DM_NAME_LEN - 1) {
-               DMWARN("Invalid new logical volume name supplied.");
+       if (new_data < param->data ||
+           invalid_str(new_data, (void *) param + param_size) ||
+           strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) {
+               DMWARN("Invalid new mapped device name or uuid string supplied.");
                return -EINVAL;
        }
 
-       r = check_name(new_name);
-       if (r)
-               return r;
+       if (!change_uuid) {
+               r = check_name(new_data);
+               if (r)
+                       return r;
+       }
 
-       md = dm_hash_rename(param, new_name);
+       md = dm_hash_rename(param, new_data);
        if (IS_ERR(md))
                return PTR_ERR(md);
 
@@ -885,7 +938,7 @@ static int do_resume(struct dm_ioctl *param)
 
        hc = __find_device_hash_cell(param);
        if (!hc) {
-               DMWARN("device doesn't appear to be in the dev hash table.");
+               DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
                up_write(&_hash_lock);
                return -ENXIO;
        }
@@ -1212,7 +1265,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
 
        hc = __find_device_hash_cell(param);
        if (!hc) {
-               DMWARN("device doesn't appear to be in the dev hash table.");
+               DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
                up_write(&_hash_lock);
                return -ENXIO;
        }
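
The uuid support above reuses the existing rename path: userspace issues DM_DEV_RENAME with DM_UUID_FLAG set and the new uuid, rather than a new name, placed at data_start. A rough userspace sketch, assuming a linux/dm-ioctl.h that already defines DM_UUID_FLAG (the helper name and error handling are illustrative only):

    #include <fcntl.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/dm-ioctl.h>

    /* Illustrative only: set the uuid of an existing, uuid-less mapped device. */
    static int dm_set_uuid(const char *dev_name, const char *new_uuid)
    {
            union {
                    struct dm_ioctl dmi;
                    char buf[sizeof(struct dm_ioctl) + DM_UUID_LEN];
            } u;
            int fd, r;

            memset(&u, 0, sizeof(u));
            u.dmi.version[0] = DM_VERSION_MAJOR;
            u.dmi.version[1] = DM_VERSION_MINOR;
            u.dmi.version[2] = DM_VERSION_PATCHLEVEL;
            u.dmi.data_size = sizeof(u);
            u.dmi.data_start = sizeof(struct dm_ioctl);
            u.dmi.flags = DM_UUID_FLAG;
            strncpy(u.dmi.name, dev_name, DM_NAME_LEN - 1);
            strncpy(u.buf + u.dmi.data_start, new_uuid, DM_UUID_LEN - 1);

            fd = open("/dev/mapper/control", O_RDWR);
            if (fd < 0)
                    return -1;
            r = ioctl(fd, DM_DEV_RENAME, &u.dmi);
            close(fd);
            return r;
    }

As the kernel side above shows, the request is refused if the device already has a uuid, and the string is validated against DM_UUID_LEN rather than DM_NAME_LEN when the flag is set.
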
index d8587ba..924f5f0 100644 (file)
@@ -37,6 +37,13 @@ struct dm_kcopyd_client {
        unsigned int nr_pages;
        unsigned int nr_free_pages;
 
+       /*
+        * Block devices to unplug.
+        * Non-NULL pointer means that a block device has some pending requests
+        * and needs to be unplugged.
+        */
+       struct block_device *unplug[2];
+
        struct dm_io_client *io_client;
 
        wait_queue_head_t destroyq;
@@ -308,6 +315,31 @@ static int run_complete_job(struct kcopyd_job *job)
        return 0;
 }
 
+/*
+ * Unplug the block device at the specified index.
+ */
+static void unplug(struct dm_kcopyd_client *kc, int rw)
+{
+       if (kc->unplug[rw] != NULL) {
+               blk_unplug(bdev_get_queue(kc->unplug[rw]));
+               kc->unplug[rw] = NULL;
+       }
+}
+
+/*
+ * Prepare block device unplug. If there's another device
+ * to be unplugged at the same array index, we unplug that
+ * device first.
+ */
+static void prepare_unplug(struct dm_kcopyd_client *kc, int rw,
+                          struct block_device *bdev)
+{
+       if (likely(kc->unplug[rw] == bdev))
+               return;
+       unplug(kc, rw);
+       kc->unplug[rw] = bdev;
+}
+
 static void complete_io(unsigned long error, void *context)
 {
        struct kcopyd_job *job = (struct kcopyd_job *) context;
@@ -345,7 +377,7 @@ static int run_io_job(struct kcopyd_job *job)
 {
        int r;
        struct dm_io_request io_req = {
-               .bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG,
+               .bi_rw = job->rw,
                .mem.type = DM_IO_PAGE_LIST,
                .mem.ptr.pl = job->pages,
                .mem.offset = job->offset,
@@ -354,10 +386,16 @@ static int run_io_job(struct kcopyd_job *job)
                .client = job->kc->io_client,
        };
 
-       if (job->rw == READ)
+       if (job->rw == READ) {
                r = dm_io(&io_req, 1, &job->source, NULL);
-       else
+               prepare_unplug(job->kc, READ, job->source.bdev);
+       } else {
+               if (job->num_dests > 1)
+                       io_req.bi_rw |= REQ_UNPLUG;
                r = dm_io(&io_req, job->num_dests, job->dests, NULL);
+               if (!(io_req.bi_rw & REQ_UNPLUG))
+                       prepare_unplug(job->kc, WRITE, job->dests[0].bdev);
+       }
 
        return r;
 }
@@ -435,10 +473,18 @@ static void do_work(struct work_struct *work)
         * Pages jobs when successful will jump onto the io jobs
         * list.  io jobs call wake when they complete and it all
         * starts again.
+        *
+        * Note that io_jobs add block devices to the unplug array,
+        * this array is cleared with "unplug" calls. It is thus
+        * forbidden to run complete_jobs after io_jobs and before
+        * unplug because the block device could be destroyed in
+        * job completion callback.
         */
        process_jobs(&kc->complete_jobs, kc, run_complete_job);
        process_jobs(&kc->pages_jobs, kc, run_pages_job);
        process_jobs(&kc->io_jobs, kc, run_io_job);
+       unplug(kc, READ);
+       unplug(kc, WRITE);
 }
 
 /*
@@ -619,12 +665,15 @@ int dm_kcopyd_client_create(unsigned int nr_pages,
        INIT_LIST_HEAD(&kc->io_jobs);
        INIT_LIST_HEAD(&kc->pages_jobs);
 
+       memset(kc->unplug, 0, sizeof(kc->unplug));
+
        kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
        if (!kc->job_pool)
                goto bad_slab;
 
        INIT_WORK(&kc->kcopyd_work, do_work);
-       kc->kcopyd_wq = create_singlethread_workqueue("kcopyd");
+       kc->kcopyd_wq = alloc_workqueue("kcopyd",
+                                       WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
        if (!kc->kcopyd_wq)
                goto bad_workqueue;
 
index 1ed0094..aa2e0c3 100644 (file)
 
 #include "dm-log-userspace-transfer.h"
 
+#define DM_LOG_USERSPACE_VSN "1.1.0"
+
 struct flush_entry {
        int type;
        region_t region;
        struct list_head list;
 };
 
+/*
+ * This limit on the number of mark and clear requests is, to a degree,
+ * arbitrary.  However, there is some basis for the choice in the limits
+ * imposed on the size of data payload by dm-log-userspace-transfer.c:
+ * dm_consult_userspace().
+ */
+#define MAX_FLUSH_GROUP_COUNT 32
+
 struct log_c {
        struct dm_target *ti;
        uint32_t region_size;
@@ -37,8 +47,15 @@ struct log_c {
         */
        uint64_t in_sync_hint;
 
+       /*
+        * Mark and clear requests are held until a flush is issued
+        * so that we can group, and thereby limit, the amount of
+        * network traffic between kernel and userspace.  The 'flush_lock'
+        * is used to protect these lists.
+        */
        spinlock_t flush_lock;
-       struct list_head flush_list;  /* only for clear and mark requests */
+       struct list_head mark_list;
+       struct list_head clear_list;
 };
 
 static mempool_t *flush_entry_pool;
@@ -169,7 +186,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 
        strncpy(lc->uuid, argv[0], DM_UUID_LEN);
        spin_lock_init(&lc->flush_lock);
-       INIT_LIST_HEAD(&lc->flush_list);
+       INIT_LIST_HEAD(&lc->mark_list);
+       INIT_LIST_HEAD(&lc->clear_list);
 
        str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
        if (str_size < 0) {
@@ -181,8 +199,11 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
        r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
                                 ctr_str, str_size, NULL, NULL);
 
-       if (r == -ESRCH) {
-               DMERR("Userspace log server not found");
+       if (r < 0) {
+               if (r == -ESRCH)
+                       DMERR("Userspace log server not found");
+               else
+                       DMERR("Userspace log server failed to create log");
                goto out;
        }
 
@@ -214,10 +235,9 @@ out:
 
 static void userspace_dtr(struct dm_dirty_log *log)
 {
-       int r;
        struct log_c *lc = log->context;
 
-       r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
+       (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
                                 NULL, 0,
                                 NULL, NULL);
 
@@ -338,6 +358,71 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
        return (r) ? 0 : (int)in_sync;
 }
 
+static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
+{
+       int r = 0;
+       struct flush_entry *fe;
+
+       list_for_each_entry(fe, flush_list, list) {
+               r = userspace_do_request(lc, lc->uuid, fe->type,
+                                        (char *)&fe->region,
+                                        sizeof(fe->region),
+                                        NULL, NULL);
+               if (r)
+                       break;
+       }
+
+       return r;
+}
+
+static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
+{
+       int r = 0;
+       int count;
+       uint32_t type = 0;
+       struct flush_entry *fe, *tmp_fe;
+       LIST_HEAD(tmp_list);
+       uint64_t group[MAX_FLUSH_GROUP_COUNT];
+
+       /*
+        * Process the requests in groups.
+        */
+       while (!list_empty(flush_list)) {
+               count = 0;
+
+               list_for_each_entry_safe(fe, tmp_fe, flush_list, list) {
+                       group[count] = fe->region;
+                       count++;
+
+                       list_del(&fe->list);
+                       list_add(&fe->list, &tmp_list);
+
+                       type = fe->type;
+                       if (count >= MAX_FLUSH_GROUP_COUNT)
+                               break;
+               }
+
+               r = userspace_do_request(lc, lc->uuid, type,
+                                        (char *)(group),
+                                        count * sizeof(uint64_t),
+                                        NULL, NULL);
+               if (r) {
+                       /* Group send failed.  Attempt one-by-one. */
+                       list_splice_init(&tmp_list, flush_list);
+                       r = flush_one_by_one(lc, flush_list);
+                       break;
+               }
+       }
+
+       /*
+        * Must collect the flush_entry structures that were successfully
+        * processed as a group so that they will be freed by the caller.
+        */
+       list_splice_init(&tmp_list, flush_list);
+
+       return r;
+}
+
 /*
  * userspace_flush
  *
@@ -360,31 +445,25 @@ static int userspace_flush(struct dm_dirty_log *log)
        int r = 0;
        unsigned long flags;
        struct log_c *lc = log->context;
-       LIST_HEAD(flush_list);
+       LIST_HEAD(mark_list);
+       LIST_HEAD(clear_list);
        struct flush_entry *fe, *tmp_fe;
 
        spin_lock_irqsave(&lc->flush_lock, flags);
-       list_splice_init(&lc->flush_list, &flush_list);
+       list_splice_init(&lc->mark_list, &mark_list);
+       list_splice_init(&lc->clear_list, &clear_list);
        spin_unlock_irqrestore(&lc->flush_lock, flags);
 
-       if (list_empty(&flush_list))
+       if (list_empty(&mark_list) && list_empty(&clear_list))
                return 0;
 
-       /*
-        * FIXME: Count up requests, group request types,
-        * allocate memory to stick all requests in and
-        * send to server in one go.  Failing the allocation,
-        * do it one by one.
-        */
+       r = flush_by_group(lc, &mark_list);
+       if (r)
+               goto fail;
 
-       list_for_each_entry(fe, &flush_list, list) {
-               r = userspace_do_request(lc, lc->uuid, fe->type,
-                                        (char *)&fe->region,
-                                        sizeof(fe->region),
-                                        NULL, NULL);
-               if (r)
-                       goto fail;
-       }
+       r = flush_by_group(lc, &clear_list);
+       if (r)
+               goto fail;
 
        r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
                                 NULL, 0, NULL, NULL);
@@ -395,7 +474,11 @@ fail:
         * Calling code will receive an error and will know that
         * the log facility has failed.
         */
-       list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) {
+       list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) {
+               list_del(&fe->list);
+               mempool_free(fe, flush_entry_pool);
+       }
+       list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) {
                list_del(&fe->list);
                mempool_free(fe, flush_entry_pool);
        }
@@ -425,7 +508,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
        spin_lock_irqsave(&lc->flush_lock, flags);
        fe->type = DM_ULOG_MARK_REGION;
        fe->region = region;
-       list_add(&fe->list, &lc->flush_list);
+       list_add(&fe->list, &lc->mark_list);
        spin_unlock_irqrestore(&lc->flush_lock, flags);
 
        return;
@@ -462,7 +545,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
        spin_lock_irqsave(&lc->flush_lock, flags);
        fe->type = DM_ULOG_CLEAR_REGION;
        fe->region = region;
-       list_add(&fe->list, &lc->flush_list);
+       list_add(&fe->list, &lc->clear_list);
        spin_unlock_irqrestore(&lc->flush_lock, flags);
 
        return;
@@ -684,7 +767,7 @@ static int __init userspace_dirty_log_init(void)
                return r;
        }
 
-       DMINFO("version 1.0.0 loaded");
+       DMINFO("version " DM_LOG_USERSPACE_VSN " loaded");
        return 0;
 }
 
@@ -694,7 +777,7 @@ static void __exit userspace_dirty_log_exit(void)
        dm_ulog_tfr_exit();
        mempool_destroy(flush_entry_pool);
 
-       DMINFO("version 1.0.0 unloaded");
+       DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");
        return;
 }
 
index 075cbcf..049eaf1 100644 (file)
@@ -198,6 +198,7 @@ resend:
 
        memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg));
        memcpy(tfr->uuid, uuid, DM_UUID_LEN);
+       tfr->version = DM_ULOG_REQUEST_VERSION;
        tfr->luid = luid;
        tfr->seq = dm_ulog_seq++;
 
index 33420e6..6951536 100644 (file)
@@ -455,7 +455,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
                        r = PTR_ERR(lc->io_req.client);
                        DMWARN("couldn't allocate disk io client");
                        kfree(lc);
-                       return -ENOMEM;
+                       return r;
                }
 
                lc->disk_header = vmalloc(buf_size);
index 487ecda..b82d288 100644 (file)
@@ -23,6 +23,8 @@
 
 #define DM_MSG_PREFIX "multipath"
 #define MESG_STR(x) x, sizeof(x)
+#define DM_PG_INIT_DELAY_MSECS 2000
+#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
 
 /* Path properties */
 struct pgpath {
@@ -33,8 +35,7 @@ struct pgpath {
        unsigned fail_count;            /* Cumulative failure count */
 
        struct dm_path path;
-       struct work_struct deactivate_path;
-       struct work_struct activate_path;
+       struct delayed_work activate_path;
 };
 
 #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
@@ -65,11 +66,15 @@ struct multipath {
 
        const char *hw_handler_name;
        char *hw_handler_params;
+
        unsigned nr_priority_groups;
        struct list_head priority_groups;
+
+       wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
+
        unsigned pg_init_required;      /* pg_init needs calling? */
        unsigned pg_init_in_progress;   /* Only one pg_init allowed at once */
-       wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
+       unsigned pg_init_delay_retry;   /* Delay pg_init retry? */
 
        unsigned nr_valid_paths;        /* Total number of usable paths */
        struct pgpath *current_pgpath;
@@ -82,6 +87,7 @@ struct multipath {
        unsigned saved_queue_if_no_path;/* Saved state during suspension */
        unsigned pg_init_retries;       /* Number of times to retry pg_init */
        unsigned pg_init_count;         /* Number of times pg_init called */
+       unsigned pg_init_delay_msecs;   /* Number of msecs before pg_init retry */
 
        struct work_struct process_queued_ios;
        struct list_head queued_ios;
@@ -116,7 +122,6 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
 static void process_queued_ios(struct work_struct *work);
 static void trigger_event(struct work_struct *work);
 static void activate_path(struct work_struct *work);
-static void deactivate_path(struct work_struct *work);
 
 
 /*-----------------------------------------------
@@ -129,8 +134,7 @@ static struct pgpath *alloc_pgpath(void)
 
        if (pgpath) {
                pgpath->is_active = 1;
-               INIT_WORK(&pgpath->deactivate_path, deactivate_path);
-               INIT_WORK(&pgpath->activate_path, activate_path);
+               INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
        }
 
        return pgpath;
@@ -141,14 +145,6 @@ static void free_pgpath(struct pgpath *pgpath)
        kfree(pgpath);
 }
 
-static void deactivate_path(struct work_struct *work)
-{
-       struct pgpath *pgpath =
-               container_of(work, struct pgpath, deactivate_path);
-
-       blk_abort_queue(pgpath->path.dev->bdev->bd_disk->queue);
-}
-
 static struct priority_group *alloc_priority_group(void)
 {
        struct priority_group *pg;
@@ -199,6 +195,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
                INIT_LIST_HEAD(&m->queued_ios);
                spin_lock_init(&m->lock);
                m->queue_io = 1;
+               m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
                INIT_WORK(&m->process_queued_ios, process_queued_ios);
                INIT_WORK(&m->trigger_event, trigger_event);
                init_waitqueue_head(&m->pg_init_wait);
@@ -238,14 +235,19 @@ static void free_multipath(struct multipath *m)
 static void __pg_init_all_paths(struct multipath *m)
 {
        struct pgpath *pgpath;
+       unsigned long pg_init_delay = 0;
 
        m->pg_init_count++;
        m->pg_init_required = 0;
+       if (m->pg_init_delay_retry)
+               pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
+                                                m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
        list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
                /* Skip failed paths */
                if (!pgpath->is_active)
                        continue;
-               if (queue_work(kmpath_handlerd, &pgpath->activate_path))
+               if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
+                                      pg_init_delay))
                        m->pg_init_in_progress++;
        }
 }
@@ -793,8 +795,9 @@ static int parse_features(struct arg_set *as, struct multipath *m)
        const char *param_name;
 
        static struct param _params[] = {
-               {0, 3, "invalid number of feature args"},
+               {0, 5, "invalid number of feature args"},
                {1, 50, "pg_init_retries must be between 1 and 50"},
+               {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
        };
 
        r = read_param(_params, shift(as), &argc, &ti->error);
@@ -821,6 +824,14 @@ static int parse_features(struct arg_set *as, struct multipath *m)
                        continue;
                }
 
+               if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) &&
+                   (argc >= 1)) {
+                       r = read_param(_params + 2, shift(as),
+                                      &m->pg_init_delay_msecs, &ti->error);
+                       argc--;
+                       continue;
+               }
+
                ti->error = "Unrecognised multipath feature request";
                r = -EINVAL;
        } while (argc && !r);
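
With the new optional feature parsed above, the features section of a multipath table can carry both a retry count and a retry delay. A hypothetical example (only the feature portion of the table line is shown):

    ... 5 queue_if_no_path pg_init_retries 3 pg_init_delay_msecs 2000 ...

The leading 5 is the total number of feature words. When the hardware handler returns SCSI_DH_RETRY, the next pg_init is queued with the configured delay; if pg_init_delay_msecs is not given, the 2000 ms DM_PG_INIT_DELAY_MSECS default is used, and a value of 0 restores the old immediate retry.
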
@@ -931,7 +942,7 @@ static void flush_multipath_work(struct multipath *m)
        flush_workqueue(kmpath_handlerd);
        multipath_wait_for_pg_init_completion(m);
        flush_workqueue(kmultipathd);
-       flush_scheduled_work();
+       flush_work_sync(&m->trigger_event);
 }
 
 static void multipath_dtr(struct dm_target *ti)
@@ -995,7 +1006,6 @@ static int fail_path(struct pgpath *pgpath)
                      pgpath->path.dev->name, m->nr_valid_paths);
 
        schedule_work(&m->trigger_event);
-       queue_work(kmultipathd, &pgpath->deactivate_path);
 
 out:
        spin_unlock_irqrestore(&m->lock, flags);
@@ -1034,7 +1044,7 @@ static int reinstate_path(struct pgpath *pgpath)
                m->current_pgpath = NULL;
                queue_work(kmultipathd, &m->process_queued_ios);
        } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
-               if (queue_work(kmpath_handlerd, &pgpath->activate_path))
+               if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
                        m->pg_init_in_progress++;
        }
 
@@ -1169,6 +1179,7 @@ static void pg_init_done(void *data, int errors)
        struct priority_group *pg = pgpath->pg;
        struct multipath *m = pg->m;
        unsigned long flags;
+       unsigned delay_retry = 0;
 
        /* device or driver problems */
        switch (errors) {
@@ -1193,8 +1204,9 @@ static void pg_init_done(void *data, int errors)
                 */
                bypass_pg(m, pg, 1);
                break;
-       /* TODO: For SCSI_DH_RETRY we should wait a couple seconds */
        case SCSI_DH_RETRY:
+               /* Wait before retrying. */
+               delay_retry = 1;
        case SCSI_DH_IMM_RETRY:
        case SCSI_DH_RES_TEMP_UNAVAIL:
                if (pg_init_limit_reached(m, pgpath))
@@ -1227,6 +1239,7 @@ static void pg_init_done(void *data, int errors)
        if (!m->pg_init_required)
                m->queue_io = 0;
 
+       m->pg_init_delay_retry = delay_retry;
        queue_work(kmultipathd, &m->process_queued_ios);
 
        /*
@@ -1241,7 +1254,7 @@ out:
 static void activate_path(struct work_struct *work)
 {
        struct pgpath *pgpath =
-               container_of(work, struct pgpath, activate_path);
+               container_of(work, struct pgpath, activate_path.work);
 
        scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
                                pg_init_done, pgpath);
@@ -1382,11 +1395,14 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
                DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count);
        else {
                DMEMIT("%u ", m->queue_if_no_path +
-                             (m->pg_init_retries > 0) * 2);
+                             (m->pg_init_retries > 0) * 2 +
+                             (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2);
                if (m->queue_if_no_path)
                        DMEMIT("queue_if_no_path ");
                if (m->pg_init_retries)
                        DMEMIT("pg_init_retries %u ", m->pg_init_retries);
+               if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
+                       DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
        }
 
        if (!m->hw_handler_name || type == STATUSTYPE_INFO)
@@ -1655,7 +1671,7 @@ out:
  *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
        .name = "multipath",
-       .version = {1, 1, 1},
+       .version = {1, 2, 0},
        .module = THIS_MODULE,
        .ctr = multipath_ctr,
        .dtr = multipath_dtr,
@@ -1687,7 +1703,7 @@ static int __init dm_multipath_init(void)
                return -EINVAL;
        }
 
-       kmultipathd = create_workqueue("kmpathd");
+       kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
        if (!kmultipathd) {
                DMERR("failed to create workqueue kmpathd");
                dm_unregister_target(&multipath_target);
@@ -1701,7 +1717,8 @@ static int __init dm_multipath_init(void)
         * old workqueue would also create a bottleneck in the
         * path of the storage hardware device activation.
         */
-       kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd");
+       kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
+                                                 WQ_MEM_RECLAIM);
        if (!kmpath_handlerd) {
                DMERR("failed to create workqueue kmpath_handlerd");
                destroy_workqueue(kmultipathd);
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
new file mode 100644 (file)
index 0000000..b9e1e15
--- /dev/null
@@ -0,0 +1,697 @@
+/*
+ * Copyright (C) 2010-2011 Neil Brown
+ * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/slab.h>
+
+#include "md.h"
+#include "raid5.h"
+#include "dm.h"
+#include "bitmap.h"
+
+#define DM_MSG_PREFIX "raid"
+
+/*
+ * If MD doesn't support MD_SYNC_STATE_FORCED yet, define it
+ * to 0 so that setting the flag is a no-op.
+ */
+#ifndef MD_SYNC_STATE_FORCED
+#define MD_SYNC_STATE_FORCED 0
+#endif
+
+struct raid_dev {
+       /*
+        * Two DM devices, one to hold metadata and one to hold the
+        * actual data/parity.  The reason for this is to not confuse
+        * ti->len and give more flexibility in altering size and
+        * characteristics.
+        *
+        * While it is possible for this device to be associated
+        * with a different physical device than the data_dev, it
+        * is intended for it to be the same.
+        *    |--------- Physical Device ---------|
+        *    |- meta_dev -|------ data_dev ------|
+        */
+       struct dm_dev *meta_dev;
+       struct dm_dev *data_dev;
+       struct mdk_rdev_s rdev;
+};
+
+/*
+ * Flags for rs->print_flags field.
+ */
+#define DMPF_DAEMON_SLEEP      0x1
+#define DMPF_MAX_WRITE_BEHIND  0x2
+#define DMPF_SYNC              0x4
+#define DMPF_NOSYNC            0x8
+#define DMPF_STRIPE_CACHE      0x10
+#define DMPF_MIN_RECOVERY_RATE 0x20
+#define DMPF_MAX_RECOVERY_RATE 0x40
+
+struct raid_set {
+       struct dm_target *ti;
+
+       uint64_t print_flags;
+
+       struct mddev_s md;
+       struct raid_type *raid_type;
+       struct dm_target_callbacks callbacks;
+
+       struct raid_dev dev[0];
+};
+
+/* Supported raid types and properties. */
+static struct raid_type {
+       const char *name;               /* RAID type name. */
+       const char *descr;              /* Descriptor text for logging. */
+       const unsigned parity_devs;     /* # of parity devices. */
+       const unsigned minimal_devs;    /* minimal # of devices in set. */
+       const unsigned level;           /* RAID level. */
+       const unsigned algorithm;       /* RAID algorithm. */
+} raid_types[] = {
+       {"raid4",    "RAID4 (dedicated parity disk)",   1, 2, 5, ALGORITHM_PARITY_0},
+       {"raid5_la", "RAID5 (left asymmetric)",         1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
+       {"raid5_ra", "RAID5 (right asymmetric)",        1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
+       {"raid5_ls", "RAID5 (left symmetric)",          1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
+       {"raid5_rs", "RAID5 (right symmetric)",         1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
+       {"raid6_zr", "RAID6 (zero restart)",            2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
+       {"raid6_nr", "RAID6 (N restart)",               2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
+       {"raid6_nc", "RAID6 (N continue)",              2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
+};
+
+static struct raid_type *get_raid_type(char *name)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(raid_types); i++)
+               if (!strcmp(raid_types[i].name, name))
+                       return &raid_types[i];
+
+       return NULL;
+}
+
+static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
+{
+       unsigned i;
+       struct raid_set *rs;
+       sector_t sectors_per_dev;
+
+       if (raid_devs <= raid_type->parity_devs) {
+               ti->error = "Insufficient number of devices";
+               return ERR_PTR(-EINVAL);
+       }
+
+       sectors_per_dev = ti->len;
+       if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
+               ti->error = "Target length not divisible by number of data devices";
+               return ERR_PTR(-EINVAL);
+       }
+
+       rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
+       if (!rs) {
+               ti->error = "Cannot allocate raid context";
+               return ERR_PTR(-ENOMEM);
+       }
+
+       mddev_init(&rs->md);
+
+       rs->ti = ti;
+       rs->raid_type = raid_type;
+       rs->md.raid_disks = raid_devs;
+       rs->md.level = raid_type->level;
+       rs->md.new_level = rs->md.level;
+       rs->md.dev_sectors = sectors_per_dev;
+       rs->md.layout = raid_type->algorithm;
+       rs->md.new_layout = rs->md.layout;
+       rs->md.delta_disks = 0;
+       rs->md.recovery_cp = 0;
+
+       for (i = 0; i < raid_devs; i++)
+               md_rdev_init(&rs->dev[i].rdev);
+
+       /*
+        * Remaining items to be initialized by further RAID params:
+        *  rs->md.persistent
+        *  rs->md.external
+        *  rs->md.chunk_sectors
+        *  rs->md.new_chunk_sectors
+        */
+
+       return rs;
+}
+
+static void context_free(struct raid_set *rs)
+{
+       int i;
+
+       for (i = 0; i < rs->md.raid_disks; i++)
+               if (rs->dev[i].data_dev)
+                       dm_put_device(rs->ti, rs->dev[i].data_dev);
+
+       kfree(rs);
+}
+
+/*
+ * For every device we have two words
+ *  <meta_dev>: meta device name or '-' if missing
+ *  <data_dev>: data device name or '-' if missing
+ *
+ * This code parses those words.
+ */
+static int dev_parms(struct raid_set *rs, char **argv)
+{
+       int i;
+       int rebuild = 0;
+       int metadata_available = 0;
+       int ret = 0;
+
+       for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
+               rs->dev[i].rdev.raid_disk = i;
+
+               rs->dev[i].meta_dev = NULL;
+               rs->dev[i].data_dev = NULL;
+
+               /*
+                * There are no offsets, since there is a separate device
+                * for data and metadata.
+                */
+               rs->dev[i].rdev.data_offset = 0;
+               rs->dev[i].rdev.mddev = &rs->md;
+
+               if (strcmp(argv[0], "-")) {
+                       rs->ti->error = "Metadata devices not supported";
+                       return -EINVAL;
+               }
+
+               if (!strcmp(argv[1], "-")) {
+                       if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
+                           (!rs->dev[i].rdev.recovery_offset)) {
+                               rs->ti->error = "Drive designated for rebuild not specified";
+                               return -EINVAL;
+                       }
+
+                       continue;
+               }
+
+               ret = dm_get_device(rs->ti, argv[1],
+                                   dm_table_get_mode(rs->ti->table),
+                                   &rs->dev[i].data_dev);
+               if (ret) {
+                       rs->ti->error = "RAID device lookup failure";
+                       return ret;
+               }
+
+               rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
+               list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
+               if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+                       rebuild++;
+       }
+
+       if (metadata_available) {
+               rs->md.external = 0;
+               rs->md.persistent = 1;
+               rs->md.major_version = 2;
+       } else if (rebuild && !rs->md.recovery_cp) {
+               /*
+                * Without metadata, we will not be able to tell if the array
+                * is in-sync or not - we must assume it is not.  Therefore,
+                * it is impossible to rebuild a drive.
+                *
+                * Even if there is metadata, the on-disk information may
+                * indicate that the array is not in-sync and it will then
+                * fail at that time.
+                *
+                * User could specify 'nosync' option if desperate.
+                */
+               DMERR("Unable to rebuild drive while array is not in-sync");
+               rs->ti->error = "Unable to rebuild drive while array is not in-sync";
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * Possible arguments are...
+ * RAID456:
+ *     <chunk_size> [optional_args]
+ *
+ * Optional args:
+ *    [[no]sync]                       Force or prevent recovery of the entire array
+ *    [rebuild <idx>]                  Rebuild the drive indicated by the index
+ *    [daemon_sleep <ms>]              Time between bitmap daemon work to clear bits
+ *    [min_recovery_rate <kB/sec/disk>]        Throttle RAID initialization
+ *    [max_recovery_rate <kB/sec/disk>]        Throttle RAID initialization
+ *    [max_write_behind <sectors>]     See '-write-behind=' (man mdadm)
+ *    [stripe_cache <sectors>]         Stripe cache size for higher RAIDs
+ */
+static int parse_raid_params(struct raid_set *rs, char **argv,
+                            unsigned num_raid_params)
+{
+       unsigned i, rebuild_cnt = 0;
+       unsigned long value;
+       char *key;
+
+       /*
+        * First, parse the in-order required arguments
+        */
+       if ((strict_strtoul(argv[0], 10, &value) < 0) ||
+           !is_power_of_2(value) || (value < 8)) {
+               rs->ti->error = "Bad chunk size";
+               return -EINVAL;
+       }
+
+       rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
+       argv++;
+       num_raid_params--;
+
+       /*
+        * Second, parse the unordered optional arguments
+        */
+       for (i = 0; i < rs->md.raid_disks; i++)
+               set_bit(In_sync, &rs->dev[i].rdev.flags);
+
+       for (i = 0; i < num_raid_params; i++) {
+               if (!strcmp(argv[i], "nosync")) {
+                       rs->md.recovery_cp = MaxSector;
+                       rs->print_flags |= DMPF_NOSYNC;
+                       rs->md.flags |= MD_SYNC_STATE_FORCED;
+                       continue;
+               }
+               if (!strcmp(argv[i], "sync")) {
+                       rs->md.recovery_cp = 0;
+                       rs->print_flags |= DMPF_SYNC;
+                       rs->md.flags |= MD_SYNC_STATE_FORCED;
+                       continue;
+               }
+
+               /* The rest of the optional arguments come in key/value pairs */
+               if ((i + 1) >= num_raid_params) {
+                       rs->ti->error = "Wrong number of raid parameters given";
+                       return -EINVAL;
+               }
+
+               key = argv[i++];
+               if (strict_strtoul(argv[i], 10, &value) < 0) {
+                       rs->ti->error = "Bad numerical argument given in raid params";
+                       return -EINVAL;
+               }
+
+               if (!strcmp(key, "rebuild")) {
+                       if (++rebuild_cnt > rs->raid_type->parity_devs) {
+                               rs->ti->error = "Too many rebuild drives given";
+                               return -EINVAL;
+                       }
+                       if (value >= rs->md.raid_disks) {
+                               rs->ti->error = "Invalid rebuild index given";
+                               return -EINVAL;
+                       }
+                       clear_bit(In_sync, &rs->dev[value].rdev.flags);
+                       rs->dev[value].rdev.recovery_offset = 0;
+               } else if (!strcmp(key, "max_write_behind")) {
+                       rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
+
+                       /*
+                        * In device-mapper, we specify things in sectors, but
+                        * MD records this value in kB
+                        */
+                       value /= 2;
+                       if (value > COUNTER_MAX) {
+                               rs->ti->error = "Max write-behind limit out of range";
+                               return -EINVAL;
+                       }
+                       rs->md.bitmap_info.max_write_behind = value;
+               } else if (!strcmp(key, "daemon_sleep")) {
+                       rs->print_flags |= DMPF_DAEMON_SLEEP;
+                       if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
+                               rs->ti->error = "daemon sleep period out of range";
+                               return -EINVAL;
+                       }
+                       rs->md.bitmap_info.daemon_sleep = value;
+               } else if (!strcmp(key, "stripe_cache")) {
+                       rs->print_flags |= DMPF_STRIPE_CACHE;
+
+                       /*
+                        * In device-mapper, we specify things in sectors, but
+                        * MD records this value in kB
+                        */
+                       value /= 2;
+
+                       if (rs->raid_type->level < 5) {
+                               rs->ti->error = "Inappropriate argument: stripe_cache";
+                               return -EINVAL;
+                       }
+                       if (raid5_set_cache_size(&rs->md, (int)value)) {
+                               rs->ti->error = "Bad stripe_cache size";
+                               return -EINVAL;
+                       }
+               } else if (!strcmp(key, "min_recovery_rate")) {
+                       rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
+                       if (value > INT_MAX) {
+                               rs->ti->error = "min_recovery_rate out of range";
+                               return -EINVAL;
+                       }
+                       rs->md.sync_speed_min = (int)value;
+               } else if (!strcmp(key, "max_recovery_rate")) {
+                       rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
+                       if (value > INT_MAX) {
+                               rs->ti->error = "max_recovery_rate out of range";
+                               return -EINVAL;
+                       }
+                       rs->md.sync_speed_max = (int)value;
+               } else {
+                       DMERR("Unable to parse RAID parameter: %s", key);
+                       rs->ti->error = "Unable to parse RAID parameters";
+                       return -EINVAL;
+               }
+       }
+
+       /* Assume there are no metadata devices until the drives are parsed */
+       rs->md.persistent = 0;
+       rs->md.external = 1;
+
+       return 0;
+}
+
+static void do_table_event(struct work_struct *ws)
+{
+       struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
+
+       dm_table_event(rs->ti->table);
+}
+
+static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
+{
+       struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
+
+       return md_raid5_congested(&rs->md, bits);
+}
+
+static void raid_unplug(struct dm_target_callbacks *cb)
+{
+       struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
+
+       md_raid5_unplug_device(rs->md.private);
+}
+
+/*
+ * Construct a RAID4/5/6 mapping:
+ * Args:
+ *     <raid_type> <#raid_params> <raid_params>                \
+ *     <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
+ *
+ * ** metadata devices are not supported yet, use '-' instead **
+ *
+ * <raid_params> varies by <raid_type>.  See 'parse_raid_params' for
+ * details on possible <raid_params>.
+ */
+static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+       int ret;
+       struct raid_type *rt;
+       unsigned long num_raid_params, num_raid_devs;
+       struct raid_set *rs = NULL;
+
+       /* Must have at least <raid_type> <#raid_params> */
+       if (argc < 2) {
+               ti->error = "Too few arguments";
+               return -EINVAL;
+       }
+
+       /* raid type */
+       rt = get_raid_type(argv[0]);
+       if (!rt) {
+               ti->error = "Unrecognised raid_type";
+               return -EINVAL;
+       }
+       argc--;
+       argv++;
+
+       /* number of RAID parameters */
+       if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
+               ti->error = "Cannot understand number of RAID parameters";
+               return -EINVAL;
+       }
+       argc--;
+       argv++;
+
+       /* Skip over RAID params for now and find out # of devices */
+       if (num_raid_params + 1 > argc) {
+               ti->error = "Arguments do not agree with counts given";
+               return -EINVAL;
+       }
+
+       if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
+           (num_raid_devs >= INT_MAX)) {
+               ti->error = "Cannot understand number of raid devices";
+               return -EINVAL;
+       }
+
+       rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
+       if (IS_ERR(rs))
+               return PTR_ERR(rs);
+
+       ret = parse_raid_params(rs, argv, (unsigned)num_raid_params);
+       if (ret)
+               goto bad;
+
+       ret = -EINVAL;
+
+       argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
+       argv += num_raid_params + 1;
+
+       if (argc != (num_raid_devs * 2)) {
+               ti->error = "Number of supplied RAID devices does not match the count given";
+               goto bad;
+       }
+
+       ret = dev_parms(rs, argv);
+       if (ret)
+               goto bad;
+
+       INIT_WORK(&rs->md.event_work, do_table_event);
+       ti->split_io = rs->md.chunk_sectors;
+       ti->private = rs;
+
+       mutex_lock(&rs->md.reconfig_mutex);
+       ret = md_run(&rs->md);
+       rs->md.in_sync = 0; /* Assume already marked dirty */
+       mutex_unlock(&rs->md.reconfig_mutex);
+
+       if (ret) {
+               ti->error = "Failed to run raid array";
+               goto bad;
+       }
+
+       rs->callbacks.congested_fn = raid_is_congested;
+       rs->callbacks.unplug_fn = raid_unplug;
+       dm_table_add_target_callbacks(ti->table, &rs->callbacks);
+
+       return 0;
+
+bad:
+       context_free(rs);
+
+       return ret;
+}
+
+static void raid_dtr(struct dm_target *ti)
+{
+       struct raid_set *rs = ti->private;
+
+       list_del_init(&rs->callbacks.list);
+       md_stop(&rs->md);
+       context_free(rs);
+}
+
+static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
+{
+       struct raid_set *rs = ti->private;
+       mddev_t *mddev = &rs->md;
+
+       mddev->pers->make_request(mddev, bio);
+
+       return DM_MAPIO_SUBMITTED;
+}
+
+static int raid_status(struct dm_target *ti, status_type_t type,
+                      char *result, unsigned maxlen)
+{
+       struct raid_set *rs = ti->private;
+       unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
+       unsigned sz = 0;
+       int i;
+       sector_t sync;
+
+       switch (type) {
+       case STATUSTYPE_INFO:
+               DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
+
+               for (i = 0; i < rs->md.raid_disks; i++) {
+                       if (test_bit(Faulty, &rs->dev[i].rdev.flags))
+                               DMEMIT("D");
+                       else if (test_bit(In_sync, &rs->dev[i].rdev.flags))
+                               DMEMIT("A");
+                       else
+                               DMEMIT("a");
+               }
+
+               if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
+                       sync = rs->md.curr_resync_completed;
+               else
+                       sync = rs->md.recovery_cp;
+
+               if (sync > rs->md.resync_max_sectors)
+                       sync = rs->md.resync_max_sectors;
+
+               DMEMIT(" %llu/%llu",
+                      (unsigned long long) sync,
+                      (unsigned long long) rs->md.resync_max_sectors);
+
+               break;
+       case STATUSTYPE_TABLE:
+               /* The string you would use to construct this array */
+               for (i = 0; i < rs->md.raid_disks; i++)
+                       if (rs->dev[i].data_dev &&
+                           !test_bit(In_sync, &rs->dev[i].rdev.flags))
+                               raid_param_cnt++; /* for rebuilds */
+
+               raid_param_cnt += (hweight64(rs->print_flags) * 2);
+               if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
+                       raid_param_cnt--;
+
+               DMEMIT("%s %u %u", rs->raid_type->name,
+                      raid_param_cnt, rs->md.chunk_sectors);
+
+               if ((rs->print_flags & DMPF_SYNC) &&
+                   (rs->md.recovery_cp == MaxSector))
+                       DMEMIT(" sync");
+               if (rs->print_flags & DMPF_NOSYNC)
+                       DMEMIT(" nosync");
+
+               for (i = 0; i < rs->md.raid_disks; i++)
+                       if (rs->dev[i].data_dev &&
+                           !test_bit(In_sync, &rs->dev[i].rdev.flags))
+                               DMEMIT(" rebuild %u", i);
+
+               if (rs->print_flags & DMPF_DAEMON_SLEEP)
+                       DMEMIT(" daemon_sleep %lu",
+                              rs->md.bitmap_info.daemon_sleep);
+
+               if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
+                       DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
+
+               if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
+                       DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
+
+               if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
+                       DMEMIT(" max_write_behind %lu",
+                              rs->md.bitmap_info.max_write_behind);
+
+               if (rs->print_flags & DMPF_STRIPE_CACHE) {
+                       raid5_conf_t *conf = rs->md.private;
+
+                       /* convert from kiB to sectors */
+                       DMEMIT(" stripe_cache %d",
+                              conf ? conf->max_nr_stripes * 2 : 0);
+               }
+
+               DMEMIT(" %d", rs->md.raid_disks);
+               for (i = 0; i < rs->md.raid_disks; i++) {
+                       DMEMIT(" -"); /* metadata device */
+
+                       if (rs->dev[i].data_dev)
+                               DMEMIT(" %s", rs->dev[i].data_dev->name);
+                       else
+                               DMEMIT(" -");
+               }
+       }
+
+       return 0;
+}
+
+static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
+{
+       struct raid_set *rs = ti->private;
+       unsigned i;
+       int ret = 0;
+
+       for (i = 0; !ret && i < rs->md.raid_disks; i++)
+               if (rs->dev[i].data_dev)
+                       ret = fn(ti,
+                                rs->dev[i].data_dev,
+                                0, /* No offset on data devs */
+                                rs->md.dev_sectors,
+                                data);
+
+       return ret;
+}
+
+static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+       struct raid_set *rs = ti->private;
+       unsigned chunk_size = rs->md.chunk_sectors << 9;
+       raid5_conf_t *conf = rs->md.private;
+
+       blk_limits_io_min(limits, chunk_size);
+       blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
+}
+
+static void raid_presuspend(struct dm_target *ti)
+{
+       struct raid_set *rs = ti->private;
+
+       md_stop_writes(&rs->md);
+}
+
+static void raid_postsuspend(struct dm_target *ti)
+{
+       struct raid_set *rs = ti->private;
+
+       mddev_suspend(&rs->md);
+}
+
+static void raid_resume(struct dm_target *ti)
+{
+       struct raid_set *rs = ti->private;
+
+       mddev_resume(&rs->md);
+}
+
+static struct target_type raid_target = {
+       .name = "raid",
+       .version = {1, 0, 0},
+       .module = THIS_MODULE,
+       .ctr = raid_ctr,
+       .dtr = raid_dtr,
+       .map = raid_map,
+       .status = raid_status,
+       .iterate_devices = raid_iterate_devices,
+       .io_hints = raid_io_hints,
+       .presuspend = raid_presuspend,
+       .postsuspend = raid_postsuspend,
+       .resume = raid_resume,
+};
+
+static int __init dm_raid_init(void)
+{
+       return dm_register_target(&raid_target);
+}
+
+static void __exit dm_raid_exit(void)
+{
+       dm_unregister_target(&raid_target);
+}
+
+module_init(dm_raid_init);
+module_exit(dm_raid_exit);
+
+MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
+MODULE_ALIAS("dm-raid4");
+MODULE_ALIAS("dm-raid5");
+MODULE_ALIAS("dm-raid6");
+MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
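
For reference, the constructor arguments documented in parse_raid_params() and raid_ctr() above compose into a table line along these lines (geometry and device names are hypothetical; every metadata slot must be '-' in this version):

    0 3145728 raid raid5_ls 2 128 nosync 4 - /dev/sdb - /dev/sdc - /dev/sdd - /dev/sde

Here 2 is the number of raid parameters, 128 is the chunk size in sectors (64 KiB), nosync skips the initial resynchronisation, 4 is the device count, and each data device is preceded by a '-' placeholder for its unsupported metadata device; the 3145728-sector target length divides evenly across the three data members of the four-disk raid5 set.
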
index 19a59b0..dee3267 100644 (file)
@@ -261,7 +261,7 @@ static int mirror_flush(struct dm_target *ti)
        struct dm_io_request io_req = {
                .bi_rw = WRITE_FLUSH,
                .mem.type = DM_IO_KMEM,
-               .mem.ptr.bvec = NULL,
+               .mem.ptr.addr = NULL,
                .client = ms->io_client,
        };
 
@@ -637,6 +637,12 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
                .client = ms->io_client,
        };
 
+       if (bio->bi_rw & REQ_DISCARD) {
+               io_req.bi_rw |= REQ_DISCARD;
+               io_req.mem.type = DM_IO_KMEM;
+               io_req.mem.ptr.addr = NULL;
+       }
+
        for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
                map_region(dest++, m, bio);
 
@@ -670,7 +676,8 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
        bio_list_init(&requeue);
 
        while ((bio = bio_list_pop(writes))) {
-               if (bio->bi_rw & REQ_FLUSH) {
+               if ((bio->bi_rw & REQ_FLUSH) ||
+                   (bio->bi_rw & REQ_DISCARD)) {
                        bio_list_add(&sync, bio);
                        continue;
                }
@@ -1076,8 +1083,10 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        ti->private = ms;
        ti->split_io = dm_rh_get_region_size(ms->rh);
        ti->num_flush_requests = 1;
+       ti->num_discard_requests = 1;
 
-       ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
+       ms->kmirrord_wq = alloc_workqueue("kmirrord",
+                                         WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
        if (!ms->kmirrord_wq) {
                DMERR("couldn't start kmirrord");
                r = -ENOMEM;
@@ -1130,7 +1139,7 @@ static void mirror_dtr(struct dm_target *ti)
 
        del_timer_sync(&ms->timer);
        flush_workqueue(ms->kmirrord_wq);
-       flush_scheduled_work();
+       flush_work_sync(&ms->trigger_event);
        dm_kcopyd_client_destroy(ms->kcopyd_client);
        destroy_workqueue(ms->kmirrord_wq);
        free_context(ms, ti, ms->nr_mirrors);
@@ -1406,7 +1415,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
 
 static struct target_type mirror_target = {
        .name    = "mirror",
-       .version = {1, 12, 0},
+       .version = {1, 12, 1},
        .module  = THIS_MODULE,
        .ctr     = mirror_ctr,
        .dtr     = mirror_dtr,
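
The workqueue changes in this hunk follow a pattern repeated in the dm-snap-persistent, dm-stripe and dm core hunks below: create_singlethread_workqueue() is replaced by alloc_workqueue() with WQ_MEM_RECLAIM (plus WQ_NON_REENTRANT where a dedicated queue is kept), and the specific work item is flushed instead of a whole queue or the global one. A minimal, self-contained sketch of the resulting pattern; the names are illustrative, not from the patch:

#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;
static struct work_struct example_work;

static void example_fn(struct work_struct *work)
{
        /* ... handle the deferred event ... */
}

static int example_setup(void)
{
        example_wq = alloc_workqueue("example",
                                     WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
        if (!example_wq)
                return -ENOMEM;
        INIT_WORK(&example_work, example_fn);
        queue_work(example_wq, &example_work);
        return 0;
}

static void example_teardown(void)
{
        flush_work_sync(&example_work); /* wait for this item only */
        destroy_workqueue(example_wq);
}
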
index 2129cdb..95891df 100644 (file)
@@ -256,7 +256,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
         */
        INIT_WORK_ONSTACK(&req.work, do_metadata);
        queue_work(ps->metadata_wq, &req.work);
-       flush_workqueue(ps->metadata_wq);
+       flush_work(&req.work);
 
        return req.result;
 }
@@ -818,7 +818,7 @@ static int persistent_ctr(struct dm_exception_store *store,
        atomic_set(&ps->pending_count, 0);
        ps->callbacks = NULL;
 
-       ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
+       ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0);
        if (!ps->metadata_wq) {
                kfree(ps);
                DMERR("couldn't start header metadata update thread");
index 53cf79d..fdde53c 100644 (file)
@@ -19,7 +19,6 @@
 #include <linux/vmalloc.h>
 #include <linux/log2.h>
 #include <linux/dm-kcopyd.h>
-#include <linux/workqueue.h>
 
 #include "dm-exception-store.h"
 
@@ -80,9 +79,6 @@ struct dm_snapshot {
        /* Origin writes don't trigger exceptions until this is set */
        int active;
 
-       /* Whether or not owning mapped_device is suspended */
-       int suspended;
-
        atomic_t pending_exceptions_count;
 
        mempool_t *pending_pool;
@@ -106,10 +102,6 @@ struct dm_snapshot {
 
        struct dm_kcopyd_client *kcopyd_client;
 
-       /* Queue of snapshot writes for ksnapd to flush */
-       struct bio_list queued_bios;
-       struct work_struct queued_bios_work;
-
        /* Wait for events based on state_bits */
        unsigned long state_bits;
 
@@ -160,9 +152,6 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
 }
 EXPORT_SYMBOL(dm_snap_cow);
 
-static struct workqueue_struct *ksnapd;
-static void flush_queued_bios(struct work_struct *work);
-
 static sector_t chunk_to_sector(struct dm_exception_store *store,
                                chunk_t chunk)
 {
@@ -1110,7 +1099,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        s->ti = ti;
        s->valid = 1;
        s->active = 0;
-       s->suspended = 0;
        atomic_set(&s->pending_exceptions_count, 0);
        init_rwsem(&s->lock);
        INIT_LIST_HEAD(&s->list);
@@ -1153,9 +1141,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
        spin_lock_init(&s->tracked_chunk_lock);
 
-       bio_list_init(&s->queued_bios);
-       INIT_WORK(&s->queued_bios_work, flush_queued_bios);
-
        ti->private = s;
        ti->num_flush_requests = num_flush_requests;
 
@@ -1279,8 +1264,6 @@ static void snapshot_dtr(struct dm_target *ti)
        struct dm_snapshot *s = ti->private;
        struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
 
-       flush_workqueue(ksnapd);
-
        down_read(&_origins_lock);
        /* Check whether exception handover must be cancelled */
        (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
@@ -1342,20 +1325,6 @@ static void flush_bios(struct bio *bio)
        }
 }
 
-static void flush_queued_bios(struct work_struct *work)
-{
-       struct dm_snapshot *s =
-               container_of(work, struct dm_snapshot, queued_bios_work);
-       struct bio *queued_bios;
-       unsigned long flags;
-
-       spin_lock_irqsave(&s->pe_lock, flags);
-       queued_bios = bio_list_get(&s->queued_bios);
-       spin_unlock_irqrestore(&s->pe_lock, flags);
-
-       flush_bios(queued_bios);
-}
-
 static int do_origin(struct dm_dev *origin, struct bio *bio);
 
 /*
@@ -1760,15 +1729,6 @@ static void snapshot_merge_presuspend(struct dm_target *ti)
        stop_merge(s);
 }
 
-static void snapshot_postsuspend(struct dm_target *ti)
-{
-       struct dm_snapshot *s = ti->private;
-
-       down_write(&s->lock);
-       s->suspended = 1;
-       up_write(&s->lock);
-}
-
 static int snapshot_preresume(struct dm_target *ti)
 {
        int r = 0;
@@ -1783,7 +1743,7 @@ static int snapshot_preresume(struct dm_target *ti)
                        DMERR("Unable to resume snapshot source until "
                              "handover completes.");
                        r = -EINVAL;
-               } else if (!snap_src->suspended) {
+               } else if (!dm_suspended(snap_src->ti)) {
                        DMERR("Unable to perform snapshot handover until "
                              "source is suspended.");
                        r = -EINVAL;
@@ -1816,7 +1776,6 @@ static void snapshot_resume(struct dm_target *ti)
 
        down_write(&s->lock);
        s->active = 1;
-       s->suspended = 0;
        up_write(&s->lock);
 }
 
@@ -2194,7 +2153,7 @@ static int origin_iterate_devices(struct dm_target *ti,
 
 static struct target_type origin_target = {
        .name    = "snapshot-origin",
-       .version = {1, 7, 0},
+       .version = {1, 7, 1},
        .module  = THIS_MODULE,
        .ctr     = origin_ctr,
        .dtr     = origin_dtr,
@@ -2207,13 +2166,12 @@ static struct target_type origin_target = {
 
 static struct target_type snapshot_target = {
        .name    = "snapshot",
-       .version = {1, 9, 0},
+       .version = {1, 10, 0},
        .module  = THIS_MODULE,
        .ctr     = snapshot_ctr,
        .dtr     = snapshot_dtr,
        .map     = snapshot_map,
        .end_io  = snapshot_end_io,
-       .postsuspend = snapshot_postsuspend,
        .preresume  = snapshot_preresume,
        .resume  = snapshot_resume,
        .status  = snapshot_status,
@@ -2222,14 +2180,13 @@ static struct target_type snapshot_target = {
 
 static struct target_type merge_target = {
        .name    = dm_snapshot_merge_target_name,
-       .version = {1, 0, 0},
+       .version = {1, 1, 0},
        .module  = THIS_MODULE,
        .ctr     = snapshot_ctr,
        .dtr     = snapshot_dtr,
        .map     = snapshot_merge_map,
        .end_io  = snapshot_end_io,
        .presuspend = snapshot_merge_presuspend,
-       .postsuspend = snapshot_postsuspend,
        .preresume  = snapshot_preresume,
        .resume  = snapshot_merge_resume,
        .status  = snapshot_status,
@@ -2291,17 +2248,8 @@ static int __init dm_snapshot_init(void)
                goto bad_tracked_chunk_cache;
        }
 
-       ksnapd = create_singlethread_workqueue("ksnapd");
-       if (!ksnapd) {
-               DMERR("Failed to create ksnapd workqueue.");
-               r = -ENOMEM;
-               goto bad_pending_pool;
-       }
-
        return 0;
 
-bad_pending_pool:
-       kmem_cache_destroy(tracked_chunk_cache);
 bad_tracked_chunk_cache:
        kmem_cache_destroy(pending_cache);
 bad_pending_cache:
@@ -2322,8 +2270,6 @@ bad_register_snapshot_target:
 
 static void __exit dm_snapshot_exit(void)
 {
-       destroy_workqueue(ksnapd);
-
        dm_unregister_target(&snapshot_target);
        dm_unregister_target(&origin_target);
        dm_unregister_target(&merge_target);
index f0371b4..dddfa14 100644 (file)
@@ -39,23 +39,20 @@ struct stripe_c {
        struct dm_target *ti;
 
        /* Work struct used for triggering events*/
-       struct work_struct kstriped_ws;
+       struct work_struct trigger_event;
 
        struct stripe stripe[0];
 };
 
-static struct workqueue_struct *kstriped;
-
 /*
  * An event is triggered whenever a drive
  * drops out of a stripe volume.
  */
 static void trigger_event(struct work_struct *work)
 {
-       struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws);
-
+       struct stripe_c *sc = container_of(work, struct stripe_c,
+                                          trigger_event);
        dm_table_event(sc->ti->table);
-
 }
 
 static inline struct stripe_c *alloc_context(unsigned int stripes)
@@ -160,7 +157,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
                return -ENOMEM;
        }
 
-       INIT_WORK(&sc->kstriped_ws, trigger_event);
+       INIT_WORK(&sc->trigger_event, trigger_event);
 
        /* Set pointer to dm target; used in trigger_event */
        sc->ti = ti;
@@ -211,7 +208,7 @@ static void stripe_dtr(struct dm_target *ti)
        for (i = 0; i < sc->stripes; i++)
                dm_put_device(ti, sc->stripe[i].dev);
 
-       flush_workqueue(kstriped);
+       flush_work_sync(&sc->trigger_event);
        kfree(sc);
 }
 
@@ -367,7 +364,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
                        atomic_inc(&(sc->stripe[i].error_count));
                        if (atomic_read(&(sc->stripe[i].error_count)) <
                            DM_IO_ERROR_THRESHOLD)
-                               queue_work(kstriped, &sc->kstriped_ws);
+                               schedule_work(&sc->trigger_event);
                }
 
        return error;
@@ -401,7 +398,7 @@ static void stripe_io_hints(struct dm_target *ti,
 
 static struct target_type stripe_target = {
        .name   = "striped",
-       .version = {1, 3, 0},
+       .version = {1, 3, 1},
        .module = THIS_MODULE,
        .ctr    = stripe_ctr,
        .dtr    = stripe_dtr,
@@ -422,20 +419,10 @@ int __init dm_stripe_init(void)
                return r;
        }
 
-       kstriped = create_singlethread_workqueue("kstriped");
-       if (!kstriped) {
-               DMERR("failed to create workqueue kstriped");
-               dm_unregister_target(&stripe_target);
-               return -ENOMEM;
-       }
-
        return r;
 }
 
 void dm_stripe_exit(void)
 {
        dm_unregister_target(&stripe_target);
-       destroy_workqueue(kstriped);
-
-       return;
 }
index 985c20a..dffa0ac 100644 (file)
@@ -71,6 +71,8 @@ struct dm_table {
        void *event_context;
 
        struct dm_md_mempools *mempools;
+
+       struct list_head target_callbacks;
 };
 
 /*
@@ -204,6 +206,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
                return -ENOMEM;
 
        INIT_LIST_HEAD(&t->devices);
+       INIT_LIST_HEAD(&t->target_callbacks);
        atomic_set(&t->holders, 0);
        t->discards_supported = 1;
 
@@ -1225,10 +1228,17 @@ int dm_table_resume_targets(struct dm_table *t)
        return 0;
 }
 
+void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb)
+{
+       list_add(&cb->list, &t->target_callbacks);
+}
+EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks);
+
 int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 {
        struct dm_dev_internal *dd;
        struct list_head *devices = dm_table_get_devices(t);
+       struct dm_target_callbacks *cb;
        int r = 0;
 
        list_for_each_entry(dd, devices, list) {
@@ -1243,6 +1253,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
                                     bdevname(dd->dm_dev.bdev, b));
        }
 
+       list_for_each_entry(cb, &t->target_callbacks, list)
+               if (cb->congested_fn)
+                       r |= cb->congested_fn(cb, bdi_bits);
+
        return r;
 }
 
@@ -1264,6 +1278,7 @@ void dm_table_unplug_all(struct dm_table *t)
 {
        struct dm_dev_internal *dd;
        struct list_head *devices = dm_table_get_devices(t);
+       struct dm_target_callbacks *cb;
 
        list_for_each_entry(dd, devices, list) {
                struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
@@ -1276,6 +1291,10 @@ void dm_table_unplug_all(struct dm_table *t)
                                     dm_device_name(t->md),
                                     bdevname(dd->dm_dev.bdev, b));
        }
+
+       list_for_each_entry(cb, &t->target_callbacks, list)
+               if (cb->unplug_fn)
+                       cb->unplug_fn(cb);
 }
 
 struct mapped_device *dm_table_get_md(struct dm_table *t)
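
dm_table_add_target_callbacks() above lets a target hook into dm_table_any_congested() and dm_table_unplug_all() for queues that dm core cannot reach by iterating the table's devices; it is added alongside the new dm-raid target. A minimal sketch of a caller, assuming only the dm_target_callbacks fields visible in these hunks (list, congested_fn, unplug_fn); the target and helper names are hypothetical:

struct example_target {
        struct dm_target_callbacks callbacks;
        /* ... target-specific state ... */
};

static int example_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
        /* struct example_target *et = container_of(cb, ...); */
        return 0;       /* report congestion dm core cannot see itself */
}

static void example_register_callbacks(struct dm_target *ti,
                                       struct example_target *et)
{
        et->callbacks.congested_fn = example_congested;
        et->callbacks.unplug_fn = NULL;
        dm_table_add_target_callbacks(ti->table, &et->callbacks);
}
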
index f48a2f3..eaa3af0 100644 (file)
@@ -32,7 +32,6 @@
 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
 #define DM_COOKIE_LENGTH 24
 
-static DEFINE_MUTEX(dm_mutex);
 static const char *_name = DM_NAME;
 
 static unsigned int major = 0;
@@ -328,7 +327,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 {
        struct mapped_device *md;
 
-       mutex_lock(&dm_mutex);
        spin_lock(&_minor_lock);
 
        md = bdev->bd_disk->private_data;
@@ -346,7 +344,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 
 out:
        spin_unlock(&_minor_lock);
-       mutex_unlock(&dm_mutex);
 
        return md ? 0 : -ENXIO;
 }
@@ -355,10 +352,12 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode)
 {
        struct mapped_device *md = disk->private_data;
 
-       mutex_lock(&dm_mutex);
+       spin_lock(&_minor_lock);
+
        atomic_dec(&md->open_count);
        dm_put(md);
-       mutex_unlock(&dm_mutex);
+
+       spin_unlock(&_minor_lock);
 
        return 0;
 }
@@ -1638,13 +1637,15 @@ static void dm_request_fn(struct request_queue *q)
                if (map_request(ti, clone, md))
                        goto requeued;
 
-               spin_lock_irq(q->queue_lock);
+               BUG_ON(!irqs_disabled());
+               spin_lock(q->queue_lock);
        }
 
        goto out;
 
 requeued:
-       spin_lock_irq(q->queue_lock);
+       BUG_ON(!irqs_disabled());
+       spin_lock(q->queue_lock);
 
 plug_and_out:
        if (!elv_queue_empty(q))
@@ -1884,7 +1885,8 @@ static struct mapped_device *alloc_dev(int minor)
        add_disk(md->disk);
        format_dev_t(md->name, MKDEV(_major, minor));
 
-       md->wq = create_singlethread_workqueue("kdmflush");
+       md->wq = alloc_workqueue("kdmflush",
+                                WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
        if (!md->wq)
                goto bad_thread;
 
@@ -1992,13 +1994,14 @@ static void event_callback(void *context)
        wake_up(&md->eventq);
 }
 
+/*
+ * Protected by md->suspend_lock obtained by dm_swap_table().
+ */
 static void __set_size(struct mapped_device *md, sector_t size)
 {
        set_capacity(md->disk, size);
 
-       mutex_lock(&md->bdev->bd_inode->i_mutex);
        i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
-       mutex_unlock(&md->bdev->bd_inode->i_mutex);
 }
 
 /*
index 7fc090a..cf8594c 100644 (file)
@@ -288,10 +288,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
        int rv;
        int cpu;
 
-       if (mddev == NULL || mddev->pers == NULL) {
+       if (mddev == NULL || mddev->pers == NULL
+           || !mddev->ready) {
                bio_io_error(bio);
                return 0;
        }
+       smp_rmb(); /* Ensure implications of 'ready' are visible */
        rcu_read_lock();
        if (mddev->suspended) {
                DEFINE_WAIT(__wait);
@@ -703,9 +705,9 @@ static struct mdk_personality *find_pers(int level, char *clevel)
 }
 
 /* return the offset of the super block in 512byte sectors */
-static inline sector_t calc_dev_sboffset(struct block_device *bdev)
+static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev)
 {
-       sector_t num_sectors = i_size_read(bdev->bd_inode) / 512;
+       sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
        return MD_NEW_SIZE_SECTORS(num_sectors);
 }
 
@@ -763,7 +765,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
         */
        struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
 
-       bio->bi_bdev = rdev->bdev;
+       bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        bio->bi_private = rdev;
@@ -793,7 +795,7 @@ static void bi_complete(struct bio *bio, int error)
 }
 
 int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
-                struct page *page, int rw)
+                struct page *page, int rw, bool metadata_op)
 {
        struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
        struct completion event;
@@ -801,8 +803,12 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
 
        rw |= REQ_SYNC | REQ_UNPLUG;
 
-       bio->bi_bdev = rdev->bdev;
-       bio->bi_sector = sector;
+       bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
+               rdev->meta_bdev : rdev->bdev;
+       if (metadata_op)
+               bio->bi_sector = sector + rdev->sb_start;
+       else
+               bio->bi_sector = sector + rdev->data_offset;
        bio_add_page(bio, page, size, 0);
        init_completion(&event);
        bio->bi_private = &event;
@@ -827,7 +833,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
                return 0;
 
 
-       if (!sync_page_io(rdev, rdev->sb_start, size, rdev->sb_page, READ))
+       if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
                goto fail;
        rdev->sb_loaded = 1;
        return 0;
@@ -989,7 +995,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
         *
         * It also happens to be a multiple of 4Kb.
         */
-       rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+       rdev->sb_start = calc_dev_sboffset(rdev);
 
        ret = read_disk_sb(rdev, MD_SB_BYTES);
        if (ret) return ret;
@@ -1330,7 +1336,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
                return 0; /* component must fit device */
        if (rdev->mddev->bitmap_info.offset)
                return 0; /* can't move bitmap */
-       rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+       rdev->sb_start = calc_dev_sboffset(rdev);
        if (!num_sectors || num_sectors > rdev->sb_start)
                num_sectors = rdev->sb_start;
        md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
@@ -2465,6 +2471,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
                        if (rdev2->raid_disk == slot)
                                return -EEXIST;
 
+               if (slot >= rdev->mddev->raid_disks &&
+                   slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
+                       return -ENOSPC;
+
                rdev->raid_disk = slot;
                if (test_bit(In_sync, &rdev->flags))
                        rdev->saved_raid_disk = slot;
@@ -2482,7 +2492,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
                        /* failure here is OK */;
                /* don't wakeup anyone, leave that to userspace. */
        } else {
-               if (slot >= rdev->mddev->raid_disks)
+               if (slot >= rdev->mddev->raid_disks &&
+                   slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
                        return -ENOSPC;
                rdev->raid_disk = slot;
                /* assume it is working */
@@ -3107,7 +3118,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
                char nm[20];
                if (rdev->raid_disk < 0)
                        continue;
-               if (rdev->new_raid_disk > mddev->raid_disks)
+               if (rdev->new_raid_disk >= mddev->raid_disks)
                        rdev->new_raid_disk = -1;
                if (rdev->new_raid_disk == rdev->raid_disk)
                        continue;
@@ -3736,6 +3747,8 @@ action_show(mddev_t *mddev, char *page)
        return sprintf(page, "%s\n", type);
 }
 
+static void reap_sync_thread(mddev_t *mddev);
+
 static ssize_t
 action_store(mddev_t *mddev, const char *page, size_t len)
 {
@@ -3750,9 +3763,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
        if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
                if (mddev->sync_thread) {
                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-                       md_unregister_thread(mddev->sync_thread);
-                       mddev->sync_thread = NULL;
-                       mddev->recovery = 0;
+                       reap_sync_thread(mddev);
                }
        } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
                   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@ -3904,7 +3915,7 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
 static ssize_t
 sync_completed_show(mddev_t *mddev, char *page)
 {
-       unsigned long max_sectors, resync;
+       unsigned long long max_sectors, resync;
 
        if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
                return sprintf(page, "none\n");
@@ -3915,7 +3926,7 @@ sync_completed_show(mddev_t *mddev, char *page)
                max_sectors = mddev->dev_sectors;
 
        resync = mddev->curr_resync_completed;
-       return sprintf(page, "%lu / %lu\n", resync, max_sectors);
+       return sprintf(page, "%llu / %llu\n", resync, max_sectors);
 }
 
 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
@@ -4002,19 +4013,24 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
 {
        char *e;
        unsigned long long new = simple_strtoull(buf, &e, 10);
+       unsigned long long old = mddev->suspend_lo;
 
        if (mddev->pers == NULL || 
            mddev->pers->quiesce == NULL)
                return -EINVAL;
        if (buf == e || (*e && *e != '\n'))
                return -EINVAL;
-       if (new >= mddev->suspend_hi ||
-           (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
-               mddev->suspend_lo = new;
+
+       mddev->suspend_lo = new;
+       if (new >= old)
+               /* Shrinking suspended region */
                mddev->pers->quiesce(mddev, 2);
-               return len;
-       } else
-               return -EINVAL;
+       else {
+               /* Expanding suspended region - need to wait */
+               mddev->pers->quiesce(mddev, 1);
+               mddev->pers->quiesce(mddev, 0);
+       }
+       return len;
 }
 static struct md_sysfs_entry md_suspend_lo =
 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
@@ -4031,20 +4047,24 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
 {
        char *e;
        unsigned long long new = simple_strtoull(buf, &e, 10);
+       unsigned long long old = mddev->suspend_hi;
 
        if (mddev->pers == NULL ||
            mddev->pers->quiesce == NULL)
                return -EINVAL;
        if (buf == e || (*e && *e != '\n'))
                return -EINVAL;
-       if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
-           (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
-               mddev->suspend_hi = new;
+
+       mddev->suspend_hi = new;
+       if (new <= old)
+               /* Shrinking suspended region */
+               mddev->pers->quiesce(mddev, 2);
+       else {
+               /* Expanding suspended region - need to wait */
                mddev->pers->quiesce(mddev, 1);
                mddev->pers->quiesce(mddev, 0);
-               return len;
-       } else
-               return -EINVAL;
+       }
+       return len;
 }
 static struct md_sysfs_entry md_suspend_hi =
 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
@@ -4422,7 +4442,9 @@ int md_run(mddev_t *mddev)
                 * We don't want the data to overlap the metadata,
                 * Internal Bitmap issues have been handled elsewhere.
                 */
-               if (rdev->data_offset < rdev->sb_start) {
+               if (rdev->meta_bdev) {
+                       /* Nothing to check */;
+               } else if (rdev->data_offset < rdev->sb_start) {
                        if (mddev->dev_sectors &&
                            rdev->data_offset + mddev->dev_sectors
                            > rdev->sb_start) {
@@ -4556,7 +4578,8 @@ int md_run(mddev_t *mddev)
        mddev->safemode_timer.data = (unsigned long) mddev;
        mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
        mddev->in_sync = 1;
-
+       smp_wmb();
+       mddev->ready = 1;
        list_for_each_entry(rdev, &mddev->disks, same_set)
                if (rdev->raid_disk >= 0) {
                        char nm[20];
@@ -4693,13 +4716,12 @@ static void md_clean(mddev_t *mddev)
        mddev->plug = NULL;
 }
 
-void md_stop_writes(mddev_t *mddev)
+static void __md_stop_writes(mddev_t *mddev)
 {
        if (mddev->sync_thread) {
                set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-               md_unregister_thread(mddev->sync_thread);
-               mddev->sync_thread = NULL;
+               reap_sync_thread(mddev);
        }
 
        del_timer_sync(&mddev->safemode_timer);
@@ -4713,10 +4735,18 @@ void md_stop_writes(mddev_t *mddev)
                md_update_sb(mddev, 1);
        }
 }
+
+void md_stop_writes(mddev_t *mddev)
+{
+       mddev_lock(mddev);
+       __md_stop_writes(mddev);
+       mddev_unlock(mddev);
+}
 EXPORT_SYMBOL_GPL(md_stop_writes);
 
 void md_stop(mddev_t *mddev)
 {
+       mddev->ready = 0;
        mddev->pers->stop(mddev);
        if (mddev->pers->sync_request && mddev->to_remove == NULL)
                mddev->to_remove = &md_redundancy_group;
@@ -4736,7 +4766,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open)
                goto out;
        }
        if (mddev->pers) {
-               md_stop_writes(mddev);
+               __md_stop_writes(mddev);
 
                err  = -ENXIO;
                if (mddev->ro==1)
@@ -4773,7 +4803,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
                if (mddev->ro)
                        set_disk_ro(disk, 0);
 
-               md_stop_writes(mddev);
+               __md_stop_writes(mddev);
                md_stop(mddev);
                mddev->queue->merge_bvec_fn = NULL;
                mddev->queue->unplug_fn = NULL;
@@ -5151,9 +5181,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
                /* set saved_raid_disk if appropriate */
                if (!mddev->persistent) {
                        if (info->state & (1<<MD_DISK_SYNC)  &&
-                           info->raid_disk < mddev->raid_disks)
+                           info->raid_disk < mddev->raid_disks) {
                                rdev->raid_disk = info->raid_disk;
-                       else
+                               set_bit(In_sync, &rdev->flags);
+                       } else
                                rdev->raid_disk = -1;
                } else
                        super_types[mddev->major_version].
@@ -5230,7 +5261,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
                        printk(KERN_INFO "md: nonpersistent superblock ...\n");
                        rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
                } else
-                       rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+                       rdev->sb_start = calc_dev_sboffset(rdev);
                rdev->sectors = rdev->sb_start;
 
                err = bind_rdev_to_array(rdev, mddev);
@@ -5297,7 +5328,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
        }
 
        if (mddev->persistent)
-               rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+               rdev->sb_start = calc_dev_sboffset(rdev);
        else
                rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
 
@@ -5510,7 +5541,6 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
         * sb_start or, if that is <data_offset, it must fit before the size
         * of each device.  If num_sectors is zero, we find the largest size
         * that fits.
-
         */
        if (mddev->sync_thread)
                return -EBUSY;
@@ -6033,7 +6063,8 @@ static int md_thread(void * arg)
                         || kthread_should_stop(),
                         thread->timeout);
 
-               if (test_and_clear_bit(THREAD_WAKEUP, &thread->flags))
+               clear_bit(THREAD_WAKEUP, &thread->flags);
+               if (!kthread_should_stop())
                        thread->run(thread->mddev);
        }
 
@@ -6799,7 +6830,7 @@ void md_do_sync(mddev_t *mddev)
                       desc, mdname(mddev));
                mddev->curr_resync = j;
        }
-       mddev->curr_resync_completed = mddev->curr_resync;
+       mddev->curr_resync_completed = j;
 
        while (j < max_sectors) {
                sector_t sectors;
@@ -6817,8 +6848,7 @@ void md_do_sync(mddev_t *mddev)
                        md_unplug(mddev);
                        wait_event(mddev->recovery_wait,
                                   atomic_read(&mddev->recovery_active) == 0);
-                       mddev->curr_resync_completed =
-                               mddev->curr_resync;
+                       mddev->curr_resync_completed = j;
                        set_bit(MD_CHANGE_CLEAN, &mddev->flags);
                        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
                }
@@ -7023,6 +7053,45 @@ static int remove_and_add_spares(mddev_t *mddev)
        }
        return spares;
 }
+
+static void reap_sync_thread(mddev_t *mddev)
+{
+       mdk_rdev_t *rdev;
+
+       /* resync has finished, collect result */
+       md_unregister_thread(mddev->sync_thread);
+       mddev->sync_thread = NULL;
+       if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+           !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+               /* success...*/
+               /* activate any spares */
+               if (mddev->pers->spare_active(mddev))
+                       sysfs_notify(&mddev->kobj, NULL,
+                                    "degraded");
+       }
+       if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+           mddev->pers->finish_reshape)
+               mddev->pers->finish_reshape(mddev);
+       md_update_sb(mddev, 1);
+
+       /* if array is no longer degraded, then any saved_raid_disk
+        * information must be scrapped
+        */
+       if (!mddev->degraded)
+               list_for_each_entry(rdev, &mddev->disks, same_set)
+                       rdev->saved_raid_disk = -1;
+
+       clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+       clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+       clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+       clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+       clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+       /* flag recovery needed just to double check */
+       set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+       sysfs_notify_dirent_safe(mddev->sysfs_action);
+       md_new_event(mddev);
+}
+
 /*
  * This routine is regularly called by all per-raid-array threads to
  * deal with generic issues like resync and super-block update.
@@ -7047,9 +7116,6 @@ static int remove_and_add_spares(mddev_t *mddev)
  */
 void md_check_recovery(mddev_t *mddev)
 {
-       mdk_rdev_t *rdev;
-
-
        if (mddev->bitmap)
                bitmap_daemon_work(mddev);
 
@@ -7117,34 +7183,7 @@ void md_check_recovery(mddev_t *mddev)
                        goto unlock;
                }
                if (mddev->sync_thread) {
-                       /* resync has finished, collect result */
-                       md_unregister_thread(mddev->sync_thread);
-                       mddev->sync_thread = NULL;
-                       if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
-                           !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
-                               /* success...*/
-                               /* activate any spares */
-                               if (mddev->pers->spare_active(mddev))
-                                       sysfs_notify(&mddev->kobj, NULL,
-                                                    "degraded");
-                       }
-                       if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-                           mddev->pers->finish_reshape)
-                               mddev->pers->finish_reshape(mddev);
-                       md_update_sb(mddev, 1);
-
-                       /* if array is no-longer degraded, then any saved_raid_disk
-                        * information must be scrapped
-                        */
-                       if (!mddev->degraded)
-                               list_for_each_entry(rdev, &mddev->disks, same_set)
-                                       rdev->saved_raid_disk = -1;
-
-                       mddev->recovery = 0;
-                       /* flag recovery needed just to double check */
-                       set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-                       sysfs_notify_dirent_safe(mddev->sysfs_action);
-                       md_new_event(mddev);
+                       reap_sync_thread(mddev);
                        goto unlock;
                }
                /* Set RUNNING before clearing NEEDED to avoid
@@ -7202,7 +7241,11 @@ void md_check_recovery(mddev_t *mddev)
                                        " thread...\n", 
                                        mdname(mddev));
                                /* leave the spares where they are, it shouldn't hurt */
-                               mddev->recovery = 0;
+                               clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+                               clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+                               clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+                               clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+                               clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
                        } else
                                md_wakeup_thread(mddev->sync_thread);
                        sysfs_notify_dirent_safe(mddev->sysfs_action);
index d05bab5..eec517c 100644 (file)
@@ -60,6 +60,12 @@ struct mdk_rdev_s
        mddev_t *mddev;                 /* RAID array if running */
        int last_events;                /* IO event timestamp */
 
+       /*
+        * If meta_bdev is non-NULL, it means that a separate device is
+        * being used to store the metadata (superblock/bitmap) which
+        * would otherwise be contained on the same device as the data (bdev).
+        */
+       struct block_device *meta_bdev;
        struct block_device *bdev;      /* block device handle */
 
        struct page     *sb_page;
@@ -148,7 +154,8 @@ struct mddev_s
                                                       * are happening, so run/
                                                       * takeover/stop are not safe
                                                       */
-
+       int                             ready; /* See when safe to pass 
+                                               * IO requests down */
        struct gendisk                  *gendisk;
 
        struct kobject                  kobj;
@@ -497,8 +504,8 @@ extern void md_flush_request(mddev_t *mddev, struct bio *bio);
 extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
                           sector_t sector, int size, struct page *page);
 extern void md_super_wait(mddev_t *mddev);
-extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
-                       struct page *page, int rw);
+extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, 
+                       struct page *page, int rw, bool metadata_op);
 extern void md_do_sync(mddev_t *mddev);
 extern void md_new_event(mddev_t *mddev);
 extern int md_allow_write(mddev_t *mddev);
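
The sync_page_io() change above (used by the raid1 and raid10 hunks below) moves the offset arithmetic inside the helper: callers now pass a sector relative to the data area, or relative to the superblock when metadata_op is true. A minimal sketch of the new calling convention; the wrapper names are illustrative:

/*
 * Sketch only: with the new signature the sector argument is relative to
 * rdev->data_offset (metadata_op == false) or rdev->sb_start
 * (metadata_op == true); sync_page_io() applies the offset itself.
 */
static int example_read_sb(mdk_rdev_t *rdev, int size)
{
        return sync_page_io(rdev, 0, size, rdev->sb_page, READ, true);
}

static int example_read_data(mdk_rdev_t *rdev, sector_t sect, int size,
                             struct page *page)
{
        return sync_page_io(rdev, sect, size, page, READ, false);
}
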
index 845cf95..a23ffa3 100644 (file)
@@ -1027,8 +1027,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
        } else
                set_bit(Faulty, &rdev->flags);
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
-       printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n"
-              KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n",
+       printk(KERN_ALERT
+              "md/raid1:%s: Disk failure on %s, disabling device.\n"
+              "md/raid1:%s: Operation continuing on %d devices.\n",
               mdname(mddev), bdevname(rdev->bdev, b),
               mdname(mddev), conf->raid_disks - mddev->degraded);
 }
@@ -1364,10 +1365,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
                                         */
                                        rdev = conf->mirrors[d].rdev;
                                        if (sync_page_io(rdev,
-                                                        sect + rdev->data_offset,
+                                                        sect,
                                                         s<<9,
                                                         bio->bi_io_vec[idx].bv_page,
-                                                        READ)) {
+                                                        READ, false)) {
                                                success = 1;
                                                break;
                                        }
@@ -1390,10 +1391,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
                                        rdev = conf->mirrors[d].rdev;
                                        atomic_add(s, &rdev->corrected_errors);
                                        if (sync_page_io(rdev,
-                                                        sect + rdev->data_offset,
+                                                        sect,
                                                         s<<9,
                                                         bio->bi_io_vec[idx].bv_page,
-                                                        WRITE) == 0)
+                                                        WRITE, false) == 0)
                                                md_error(mddev, rdev);
                                }
                                d = start;
@@ -1405,10 +1406,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
                                                continue;
                                        rdev = conf->mirrors[d].rdev;
                                        if (sync_page_io(rdev,
-                                                        sect + rdev->data_offset,
+                                                        sect,
                                                         s<<9,
                                                         bio->bi_io_vec[idx].bv_page,
-                                                        READ) == 0)
+                                                        READ, false) == 0)
                                                md_error(mddev, rdev);
                                }
                        } else {
@@ -1488,10 +1489,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
                        rdev = conf->mirrors[d].rdev;
                        if (rdev &&
                            test_bit(In_sync, &rdev->flags) &&
-                           sync_page_io(rdev,
-                                        sect + rdev->data_offset,
-                                        s<<9,
-                                        conf->tmppage, READ))
+                           sync_page_io(rdev, sect, s<<9,
+                                        conf->tmppage, READ, false))
                                success = 1;
                        else {
                                d++;
@@ -1514,9 +1513,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
                        rdev = conf->mirrors[d].rdev;
                        if (rdev &&
                            test_bit(In_sync, &rdev->flags)) {
-                               if (sync_page_io(rdev,
-                                                sect + rdev->data_offset,
-                                                s<<9, conf->tmppage, WRITE)
+                               if (sync_page_io(rdev, sect, s<<9,
+                                                conf->tmppage, WRITE, false)
                                    == 0)
                                        /* Well, this device is dead */
                                        md_error(mddev, rdev);
@@ -1531,9 +1529,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
                        rdev = conf->mirrors[d].rdev;
                        if (rdev &&
                            test_bit(In_sync, &rdev->flags)) {
-                               if (sync_page_io(rdev,
-                                                sect + rdev->data_offset,
-                                                s<<9, conf->tmppage, READ)
+                               if (sync_page_io(rdev, sect, s<<9,
+                                                conf->tmppage, READ, false)
                                    == 0)
                                        /* Well, this device is dead */
                                        md_error(mddev, rdev);
index 0641674..69b6595 100644 (file)
@@ -1051,8 +1051,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
        }
        set_bit(Faulty, &rdev->flags);
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
-       printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n"
-              KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n",
+       printk(KERN_ALERT
+              "md/raid10:%s: Disk failure on %s, disabling device.\n"
+              "md/raid10:%s: Operation continuing on %d devices.\n",
               mdname(mddev), bdevname(rdev->bdev, b),
               mdname(mddev), conf->raid_disks - mddev->degraded);
 }
@@ -1559,9 +1560,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                                rcu_read_unlock();
                                success = sync_page_io(rdev,
                                                       r10_bio->devs[sl].addr +
-                                                      sect + rdev->data_offset,
+                                                      sect,
                                                       s<<9,
-                                                      conf->tmppage, READ);
+                                                      conf->tmppage, READ, false);
                                rdev_dec_pending(rdev, mddev);
                                rcu_read_lock();
                                if (success)
@@ -1598,8 +1599,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                                atomic_add(s, &rdev->corrected_errors);
                                if (sync_page_io(rdev,
                                                 r10_bio->devs[sl].addr +
-                                                sect + rdev->data_offset,
-                                                s<<9, conf->tmppage, WRITE)
+                                                sect,
+                                                s<<9, conf->tmppage, WRITE, false)
                                    == 0) {
                                        /* Well, this device is dead */
                                        printk(KERN_NOTICE
@@ -1635,9 +1636,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                                rcu_read_unlock();
                                if (sync_page_io(rdev,
                                                 r10_bio->devs[sl].addr +
-                                                sect + rdev->data_offset,
+                                                sect,
                                                 s<<9, conf->tmppage,
-                                                READ) == 0) {
+                                                READ, false) == 0) {
                                        /* Well, this device is dead */
                                        printk(KERN_NOTICE
                                               "md/raid10:%s: unable to read back "
index dc574f3..5044bab 100644 (file)
@@ -1721,7 +1721,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
                set_bit(Faulty, &rdev->flags);
                printk(KERN_ALERT
                       "md/raid:%s: Disk failure on %s, disabling device.\n"
-                      KERN_ALERT
                       "md/raid:%s: Operation continuing on %d devices.\n",
                       mdname(mddev),
                       bdevname(rdev->bdev, b),
@@ -4237,7 +4236,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                wait_event(conf->wait_for_overlap,
                           atomic_read(&conf->reshape_stripes)==0);
                mddev->reshape_position = conf->reshape_progress;
-               mddev->curr_resync_completed = mddev->curr_resync;
+               mddev->curr_resync_completed = sector_nr;
                conf->reshape_checkpoint = jiffies;
                set_bit(MD_CHANGE_DEVS, &mddev->flags);
                md_wakeup_thread(mddev->thread);
@@ -4338,7 +4337,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                wait_event(conf->wait_for_overlap,
                           atomic_read(&conf->reshape_stripes) == 0);
                mddev->reshape_position = conf->reshape_progress;
-               mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors;
+               mddev->curr_resync_completed = sector_nr;
                conf->reshape_checkpoint = jiffies;
                set_bit(MD_CHANGE_DEVS, &mddev->flags);
                md_wakeup_thread(mddev->thread);
@@ -5339,7 +5338,7 @@ static int raid5_spare_active(mddev_t *mddev)
                    && !test_bit(Faulty, &tmp->rdev->flags)
                    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
                        count++;
-                       sysfs_notify_dirent(tmp->rdev->sysfs_state);
+                       sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
                }
        }
        spin_lock_irqsave(&conf->device_lock, flags);
@@ -5528,8 +5527,8 @@ static int raid5_start_reshape(mddev_t *mddev)
                return -ENOSPC;
 
        list_for_each_entry(rdev, &mddev->disks, same_set)
-               if (rdev->raid_disk < 0 &&
-                   !test_bit(Faulty, &rdev->flags))
+               if ((rdev->raid_disk < 0 || rdev->raid_disk >= conf->raid_disks)
+                    && !test_bit(Faulty, &rdev->flags))
                        spares++;
 
        if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
@@ -5589,6 +5588,11 @@ static int raid5_start_reshape(mddev_t *mddev)
                                        /* Failure here is OK */;
                        } else
                                break;
+               } else if (rdev->raid_disk >= conf->previous_raid_disks
+                          && !test_bit(Faulty, &rdev->flags)) {
+                       /* This is a spare that was manually added */
+                       set_bit(In_sync, &rdev->flags);
+                       added_devices++;
                }
 
        /* When a reshape changes the number of devices, ->degraded
index 3892666..2a1d52f 100644 (file)
@@ -1732,6 +1732,11 @@ static int __devinit atmel_serial_probe(struct platform_device *pdev)
        device_init_wakeup(&pdev->dev, 1);
        platform_set_drvdata(pdev, port);
 
+       if (port->rs485.flags & SER_RS485_ENABLED) {
+               UART_PUT_MR(&port->uart, ATMEL_US_USMODE_NORMAL);
+               UART_PUT_CR(&port->uart, ATMEL_US_RTSEN);
+       }
+
        return 0;
 
 err_add_port:
index 5a48ce9..07bec09 100644 (file)
@@ -71,11 +71,18 @@ config XEN_SYS_HYPERVISOR
         but will have no xen contents.
 
 config XEN_XENBUS_FRONTEND
-       tristate
+       tristate
+
+config XEN_GNTDEV
+       tristate "userspace grant access device driver"
+       depends on XEN
+       select MMU_NOTIFIER
+       help
+         Allows userspace processes to use grants.
 
 config XEN_PLATFORM_PCI
        tristate "xen platform pci device driver"
-       depends on XEN_PVHVM
+       depends on XEN_PVHVM && PCI
        default m
        help
          Driver for the Xen PCI Platform device: it is responsible for
index 533a199..5088cc2 100644 (file)
@@ -9,11 +9,14 @@ obj-$(CONFIG_HOTPLUG_CPU)     += cpu_hotplug.o
 obj-$(CONFIG_XEN_XENCOMM)      += xencomm.o
 obj-$(CONFIG_XEN_BALLOON)      += balloon.o
 obj-$(CONFIG_XEN_DEV_EVTCHN)   += xen-evtchn.o
+obj-$(CONFIG_XEN_GNTDEV)       += xen-gntdev.o
 obj-$(CONFIG_XENFS)            += xenfs/
 obj-$(CONFIG_XEN_SYS_HYPERVISOR)       += sys-hypervisor.o
-obj-$(CONFIG_XEN_PLATFORM_PCI) += platform-pci.o
+obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o
 obj-$(CONFIG_SWIOTLB_XEN)      += swiotlb-xen.o
 obj-$(CONFIG_XEN_DOM0)         += pci.o
 
 xen-evtchn-y                   := evtchn.o
+xen-gntdev-y                           := gntdev.o
 
+xen-platform-pci-y             := platform-pci.o
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
new file mode 100644 (file)
index 0000000..1e31cdc
--- /dev/null
@@ -0,0 +1,665 @@
+/******************************************************************************
+ * gntdev.c
+ *
+ * Device for accessing (in user-space) pages that have been granted by other
+ * domains.
+ *
+ * Copyright (c) 2006-2007, D G Murray.
+ *           (c) 2009 Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#undef DEBUG
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mmu_notifier.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+
+#include <xen/xen.h>
+#include <xen/grant_table.h>
+#include <xen/gntdev.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, "
+             "Gerd Hoffmann <kraxel@redhat.com>");
+MODULE_DESCRIPTION("User-space granted page access driver");
+
+static int limit = 1024;
+module_param(limit, int, 0644);
+MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped at "
+               "once by a gntdev instance");
+
+struct gntdev_priv {
+       struct list_head maps;
+       uint32_t used;
+       uint32_t limit;
+       /* lock protects maps from concurrent changes */
+       spinlock_t lock;
+       struct mm_struct *mm;
+       struct mmu_notifier mn;
+};
+
+struct grant_map {
+       struct list_head next;
+       struct gntdev_priv *priv;
+       struct vm_area_struct *vma;
+       int index;
+       int count;
+       int flags;
+       int is_mapped;
+       struct ioctl_gntdev_grant_ref *grants;
+       struct gnttab_map_grant_ref   *map_ops;
+       struct gnttab_unmap_grant_ref *unmap_ops;
+       struct page **pages;
+};
+
+/* ------------------------------------------------------------------ */
+
+static void gntdev_print_maps(struct gntdev_priv *priv,
+                             char *text, int text_index)
+{
+#ifdef DEBUG
+       struct grant_map *map;
+
+       pr_debug("maps list (priv %p, usage %d/%d)\n",
+              priv, priv->used, priv->limit);
+
+       list_for_each_entry(map, &priv->maps, next)
+               pr_debug("  index %2d, count %2d %s\n",
+                      map->index, map->count,
+                      map->index == text_index && text ? text : "");
+#endif
+}
+
+static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count)
+{
+       struct grant_map *add;
+       int i;
+
+       add = kzalloc(sizeof(struct grant_map), GFP_KERNEL);
+       if (NULL == add)
+               return NULL;
+
+       add->grants    = kzalloc(sizeof(add->grants[0])    * count, GFP_KERNEL);
+       add->map_ops   = kzalloc(sizeof(add->map_ops[0])   * count, GFP_KERNEL);
+       add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL);
+       add->pages     = kzalloc(sizeof(add->pages[0])     * count, GFP_KERNEL);
+       if (NULL == add->grants    ||
+           NULL == add->map_ops   ||
+           NULL == add->unmap_ops ||
+           NULL == add->pages)
+               goto err;
+
+       for (i = 0; i < count; i++) {
+               add->pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+               if (add->pages[i] == NULL)
+                       goto err;
+       }
+
+       add->index = 0;
+       add->count = count;
+       add->priv  = priv;
+
+       if (add->count + priv->used > priv->limit)
+               goto err;
+
+       return add;
+
+err:
+       if (add->pages)
+               for (i = 0; i < count; i++) {
+                       if (add->pages[i])
+                               __free_page(add->pages[i]);
+               }
+       kfree(add->pages);
+       kfree(add->grants);
+       kfree(add->map_ops);
+       kfree(add->unmap_ops);
+       kfree(add);
+       return NULL;
+}
+
+static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add)
+{
+       struct grant_map *map;
+
+       list_for_each_entry(map, &priv->maps, next) {
+               if (add->index + add->count < map->index) {
+                       list_add_tail(&add->next, &map->next);
+                       goto done;
+               }
+               add->index = map->index + map->count;
+       }
+       list_add_tail(&add->next, &priv->maps);
+
+done:
+       priv->used += add->count;
+       gntdev_print_maps(priv, "[new]", add->index);
+}
+
+static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv,
+               int index, int count)
+{
+       struct grant_map *map;
+
+       list_for_each_entry(map, &priv->maps, next) {
+               if (map->index != index)
+                       continue;
+               if (map->count != count)
+                       continue;
+               return map;
+       }
+       return NULL;
+}
+
+static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv,
+                                              unsigned long vaddr)
+{
+       struct grant_map *map;
+
+       list_for_each_entry(map, &priv->maps, next) {
+               if (!map->vma)
+                       continue;
+               if (vaddr < map->vma->vm_start)
+                       continue;
+               if (vaddr >= map->vma->vm_end)
+                       continue;
+               return map;
+       }
+       return NULL;
+}
+
+static int gntdev_del_map(struct grant_map *map)
+{
+       int i;
+
+       if (map->vma)
+               return -EBUSY;
+       for (i = 0; i < map->count; i++)
+               if (map->unmap_ops[i].handle)
+                       return -EBUSY;
+
+       map->priv->used -= map->count;
+       list_del(&map->next);
+       return 0;
+}
+
+static void gntdev_free_map(struct grant_map *map)
+{
+       int i;
+
+       if (!map)
+               return;
+
+       if (map->pages)
+               for (i = 0; i < map->count; i++) {
+                       if (map->pages[i])
+                               __free_page(map->pages[i]);
+               }
+       kfree(map->pages);
+       kfree(map->grants);
+       kfree(map->map_ops);
+       kfree(map->unmap_ops);
+       kfree(map);
+}
+
+/* ------------------------------------------------------------------ */
+
+static int find_grant_ptes(pte_t *pte, pgtable_t token,
+               unsigned long addr, void *data)
+{
+       struct grant_map *map = data;
+       unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
+       u64 pte_maddr;
+
+       BUG_ON(pgnr >= map->count);
+       pte_maddr = arbitrary_virt_to_machine(pte).maddr;
+
+       gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr,
+                         GNTMAP_contains_pte | map->flags,
+                         map->grants[pgnr].ref,
+                         map->grants[pgnr].domid);
+       gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr,
+                           GNTMAP_contains_pte | map->flags,
+                           0 /* handle */);
+       return 0;
+}
+
+static int map_grant_pages(struct grant_map *map)
+{
+       int i, err = 0;
+
+       pr_debug("map %d+%d\n", map->index, map->count);
+       err = gnttab_map_refs(map->map_ops, map->pages, map->count);
+       if (err)
+               return err;
+
+       for (i = 0; i < map->count; i++) {
+               if (map->map_ops[i].status)
+                       err = -EINVAL;
+               map->unmap_ops[i].handle = map->map_ops[i].handle;
+       }
+       return err;
+}
+
+static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
+{
+       int i, err = 0;
+
+       pr_debug("map %d+%d [%d+%d]\n", map->index, map->count, offset, pages);
+       err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages, pages);
+       if (err)
+               return err;
+
+       for (i = 0; i < pages; i++) {
+               if (map->unmap_ops[offset+i].status)
+                       err = -EINVAL;
+               map->unmap_ops[offset+i].handle = 0;
+       }
+       return err;
+}
+
+/* ------------------------------------------------------------------ */
+
+static void gntdev_vma_close(struct vm_area_struct *vma)
+{
+       struct grant_map *map = vma->vm_private_data;
+
+       pr_debug("close %p\n", vma);
+       map->is_mapped = 0;
+       map->vma = NULL;
+       vma->vm_private_data = NULL;
+}
+
+static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       pr_debug("vaddr %p, pgoff %ld (shouldn't happen)\n",
+                       vmf->virtual_address, vmf->pgoff);
+       vmf->flags = VM_FAULT_ERROR;
+       return 0;
+}
+
+static struct vm_operations_struct gntdev_vmops = {
+       .close = gntdev_vma_close,
+       .fault = gntdev_vma_fault,
+};
+
+/* ------------------------------------------------------------------ */
+
+static void mn_invl_range_start(struct mmu_notifier *mn,
+                               struct mm_struct *mm,
+                               unsigned long start, unsigned long end)
+{
+       struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
+       struct grant_map *map;
+       unsigned long mstart, mend;
+       int err;
+
+       spin_lock(&priv->lock);
+       list_for_each_entry(map, &priv->maps, next) {
+               if (!map->vma)
+                       continue;
+               if (!map->is_mapped)
+                       continue;
+               if (map->vma->vm_start >= end)
+                       continue;
+               if (map->vma->vm_end <= start)
+                       continue;
+               mstart = max(start, map->vma->vm_start);
+               mend   = min(end,   map->vma->vm_end);
+               pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
+                               map->index, map->count,
+                               map->vma->vm_start, map->vma->vm_end,
+                               start, end, mstart, mend);
+               err = unmap_grant_pages(map,
+                                       (mstart - map->vma->vm_start) >> PAGE_SHIFT,
+                                       (mend - mstart) >> PAGE_SHIFT);
+               WARN_ON(err);
+       }
+       spin_unlock(&priv->lock);
+}
+
+static void mn_invl_page(struct mmu_notifier *mn,
+                        struct mm_struct *mm,
+                        unsigned long address)
+{
+       mn_invl_range_start(mn, mm, address, address + PAGE_SIZE);
+}
+
+static void mn_release(struct mmu_notifier *mn,
+                      struct mm_struct *mm)
+{
+       struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
+       struct grant_map *map;
+       int err;
+
+       spin_lock(&priv->lock);
+       list_for_each_entry(map, &priv->maps, next) {
+               if (!map->vma)
+                       continue;
+               pr_debug("map %d+%d (%lx %lx)\n",
+                               map->index, map->count,
+                               map->vma->vm_start, map->vma->vm_end);
+               err = unmap_grant_pages(map, /* offset */ 0, map->count);
+               WARN_ON(err);
+       }
+       spin_unlock(&priv->lock);
+}
+
+struct mmu_notifier_ops gntdev_mmu_ops = {
+       .release                = mn_release,
+       .invalidate_page        = mn_invl_page,
+       .invalidate_range_start = mn_invl_range_start,
+};
+
+/* ------------------------------------------------------------------ */
+
+static int gntdev_open(struct inode *inode, struct file *flip)
+{
+       struct gntdev_priv *priv;
+       int ret = 0;
+
+       priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       INIT_LIST_HEAD(&priv->maps);
+       spin_lock_init(&priv->lock);
+       priv->limit = limit;
+
+       priv->mm = get_task_mm(current);
+       if (!priv->mm) {
+               kfree(priv);
+               return -ENOMEM;
+       }
+       priv->mn.ops = &gntdev_mmu_ops;
+       ret = mmu_notifier_register(&priv->mn, priv->mm);
+       mmput(priv->mm);
+
+       if (ret) {
+               kfree(priv);
+               return ret;
+       }
+
+       flip->private_data = priv;
+       pr_debug("priv %p\n", priv);
+
+       return 0;
+}
+
+static int gntdev_release(struct inode *inode, struct file *flip)
+{
+       struct gntdev_priv *priv = flip->private_data;
+       struct grant_map *map;
+       int err;
+
+       pr_debug("priv %p\n", priv);
+
+       spin_lock(&priv->lock);
+       while (!list_empty(&priv->maps)) {
+               map = list_entry(priv->maps.next, struct grant_map, next);
+               err = gntdev_del_map(map);
+               if (WARN_ON(err))
+                       gntdev_free_map(map);
+
+       }
+       spin_unlock(&priv->lock);
+
+       mmu_notifier_unregister(&priv->mn, priv->mm);
+       kfree(priv);
+       return 0;
+}
+
+static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
+                                      struct ioctl_gntdev_map_grant_ref __user *u)
+{
+       struct ioctl_gntdev_map_grant_ref op;
+       struct grant_map *map;
+       int err;
+
+       if (copy_from_user(&op, u, sizeof(op)) != 0)
+               return -EFAULT;
+       pr_debug("priv %p, add %d\n", priv, op.count);
+       if (unlikely(op.count <= 0))
+               return -EINVAL;
+       if (unlikely(op.count > priv->limit))
+               return -EINVAL;
+
+       err = -ENOMEM;
+       map = gntdev_alloc_map(priv, op.count);
+       if (!map)
+               return err;
+       if (copy_from_user(map->grants, &u->refs,
+                          sizeof(map->grants[0]) * op.count) != 0) {
+               gntdev_free_map(map);
+               return err;
+       }
+
+       spin_lock(&priv->lock);
+       gntdev_add_map(priv, map);
+       op.index = map->index << PAGE_SHIFT;
+       spin_unlock(&priv->lock);
+
+       if (copy_to_user(u, &op, sizeof(op)) != 0) {
+               spin_lock(&priv->lock);
+               gntdev_del_map(map);
+               spin_unlock(&priv->lock);
+               gntdev_free_map(map);
+               return err;
+       }
+       return 0;
+}
+
+static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
+                                        struct ioctl_gntdev_unmap_grant_ref __user *u)
+{
+       struct ioctl_gntdev_unmap_grant_ref op;
+       struct grant_map *map;
+       int err = -ENOENT;
+
+       if (copy_from_user(&op, u, sizeof(op)) != 0)
+               return -EFAULT;
+       pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count);
+
+       spin_lock(&priv->lock);
+       map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
+       if (map)
+               err = gntdev_del_map(map);
+       spin_unlock(&priv->lock);
+       if (!err)
+               gntdev_free_map(map);
+       return err;
+}
+
+static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
+                                             struct ioctl_gntdev_get_offset_for_vaddr __user *u)
+{
+       struct ioctl_gntdev_get_offset_for_vaddr op;
+       struct grant_map *map;
+
+       if (copy_from_user(&op, u, sizeof(op)) != 0)
+               return -EFAULT;
+       pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr);
+
+       spin_lock(&priv->lock);
+       map = gntdev_find_map_vaddr(priv, op.vaddr);
+       if (map == NULL ||
+           map->vma->vm_start != op.vaddr) {
+               spin_unlock(&priv->lock);
+               return -EINVAL;
+       }
+       op.offset = map->index << PAGE_SHIFT;
+       op.count = map->count;
+       spin_unlock(&priv->lock);
+
+       if (copy_to_user(u, &op, sizeof(op)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv,
+                                       struct ioctl_gntdev_set_max_grants __user *u)
+{
+       struct ioctl_gntdev_set_max_grants op;
+
+       if (copy_from_user(&op, u, sizeof(op)) != 0)
+               return -EFAULT;
+       pr_debug("priv %p, limit %d\n", priv, op.count);
+       if (op.count > limit)
+               return -E2BIG;
+
+       spin_lock(&priv->lock);
+       priv->limit = op.count;
+       spin_unlock(&priv->lock);
+       return 0;
+}
+
+static long gntdev_ioctl(struct file *flip,
+                        unsigned int cmd, unsigned long arg)
+{
+       struct gntdev_priv *priv = flip->private_data;
+       void __user *ptr = (void __user *)arg;
+
+       switch (cmd) {
+       case IOCTL_GNTDEV_MAP_GRANT_REF:
+               return gntdev_ioctl_map_grant_ref(priv, ptr);
+
+       case IOCTL_GNTDEV_UNMAP_GRANT_REF:
+               return gntdev_ioctl_unmap_grant_ref(priv, ptr);
+
+       case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
+               return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);
+
+       case IOCTL_GNTDEV_SET_MAX_GRANTS:
+               return gntdev_ioctl_set_max_grants(priv, ptr);
+
+       default:
+               pr_debug("priv %p, unknown cmd %x\n", priv, cmd);
+               return -ENOIOCTLCMD;
+       }
+
+       return 0;
+}
+
+static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
+{
+       struct gntdev_priv *priv = flip->private_data;
+       int index = vma->vm_pgoff;
+       int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+       struct grant_map *map;
+       int err = -EINVAL;
+
+       if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
+               return -EINVAL;
+
+       pr_debug("map %d+%d at %lx (pgoff %lx)\n",
+                       index, count, vma->vm_start, vma->vm_pgoff);
+
+       spin_lock(&priv->lock);
+       map = gntdev_find_map_index(priv, index, count);
+       if (!map)
+               goto unlock_out;
+       if (map->vma)
+               goto unlock_out;
+       if (priv->mm != vma->vm_mm) {
+               printk(KERN_WARNING "Huh? Other mm?\n");
+               goto unlock_out;
+       }
+
+       vma->vm_ops = &gntdev_vmops;
+
+       vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND|VM_PFNMAP;
+
+       vma->vm_private_data = map;
+       map->vma = vma;
+
+       map->flags = GNTMAP_host_map | GNTMAP_application_map;
+       if (!(vma->vm_flags & VM_WRITE))
+               map->flags |= GNTMAP_readonly;
+
+       spin_unlock(&priv->lock);
+
+       err = apply_to_page_range(vma->vm_mm, vma->vm_start,
+                                 vma->vm_end - vma->vm_start,
+                                 find_grant_ptes, map);
+       if (err) {
+               printk(KERN_WARNING "find_grant_ptes() failure.\n");
+               return err;
+       }
+
+       err = map_grant_pages(map);
+       if (err) {
+               printk(KERN_WARNING "map_grant_pages() failure.\n");
+               return err;
+       }
+
+       map->is_mapped = 1;
+
+       return 0;
+
+unlock_out:
+       spin_unlock(&priv->lock);
+       return err;
+}
+
+static const struct file_operations gntdev_fops = {
+       .owner = THIS_MODULE,
+       .open = gntdev_open,
+       .release = gntdev_release,
+       .mmap = gntdev_mmap,
+       .unlocked_ioctl = gntdev_ioctl
+};
+
+static struct miscdevice gntdev_miscdev = {
+       .minor        = MISC_DYNAMIC_MINOR,
+       .name         = "xen/gntdev",
+       .fops         = &gntdev_fops,
+};
+
+/* ------------------------------------------------------------------ */
+
+static int __init gntdev_init(void)
+{
+       int err;
+
+       if (!xen_domain())
+               return -ENODEV;
+
+       err = misc_register(&gntdev_miscdev);
+       if (err != 0) {
+               printk(KERN_ERR "Could not register gntdev device\n");
+               return err;
+       }
+       return 0;
+}
+
+static void __exit gntdev_exit(void)
+{
+       misc_deregister(&gntdev_miscdev);
+}
+
+module_init(gntdev_init);
+module_exit(gntdev_exit);
+
+/* ------------------------------------------------------------------ */
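
For reference, a rough userspace sketch of the interface this new driver exposes, assuming the ioctl_gntdev_* structures from the driver's gntdev.h header are visible to userspace and that a grant reference plus granting domid have been obtained by other means; the device node name comes from the miscdevice registration above.

/* Hypothetical userspace sketch: map one foreign grant through /dev/xen/gntdev.
 * Assumes the exported ioctl_gntdev_* definitions; error paths are trimmed. */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <xen/gntdev.h>         /* header path is an assumption */

static void *map_one_grant(uint32_t domid, uint32_t ref)
{
        struct ioctl_gntdev_map_grant_ref op;
        int fd = open("/dev/xen/gntdev", O_RDWR);
        void *addr;

        if (fd < 0)
                return NULL;

        memset(&op, 0, sizeof(op));
        op.count = 1;                   /* one grant, hence one page */
        op.refs[0].domid = domid;       /* granting domain */
        op.refs[0].ref   = ref;         /* its grant reference */
        if (ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &op) < 0)
                return NULL;

        /* op.index is the pseudo-offset gntdev_mmap() looks the range up by;
         * the mapping must be MAP_SHARED or the driver rejects VM_WRITE. */
        addr = mmap(NULL, (size_t)getpagesize(), PROT_READ | PROT_WRITE,
                    MAP_SHARED, fd, (off_t)op.index);
        return addr == MAP_FAILED ? NULL : addr;  /* fd stays open on purpose */
}
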
index 6c45318..9ef54eb 100644 (file)
@@ -447,6 +447,52 @@ unsigned int gnttab_max_grant_frames(void)
 }
 EXPORT_SYMBOL_GPL(gnttab_max_grant_frames);
 
+int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
+                   struct page **pages, unsigned int count)
+{
+       int i, ret;
+       pte_t *pte;
+       unsigned long mfn;
+
+       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count);
+       if (ret)
+               return ret;
+
+       for (i = 0; i < count; i++) {
+               /* m2p override only supported for GNTMAP_contains_pte mappings */
+               if (!(map_ops[i].flags & GNTMAP_contains_pte))
+                       continue;
+               pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
+                               (map_ops[i].host_addr & ~PAGE_MASK));
+               mfn = pte_mfn(*pte);
+               ret = m2p_add_override(mfn, pages[i]);
+               if (ret)
+                       return ret;
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(gnttab_map_refs);
+
+int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
+               struct page **pages, unsigned int count)
+{
+       int i, ret;
+
+       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count);
+       if (ret)
+               return ret;
+
+       for (i = 0; i < count; i++) {
+               ret = m2p_remove_override(pages[i]);
+               if (ret)
+                       return ret;
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(gnttab_unmap_refs);
+
 static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
 {
        struct gnttab_setup_table setup;
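
Note that the new helpers only turn hypercall or m2p-override failures into a return value; per-entry results still come back in each op's status field, which callers have to inspect themselves, exactly as map_grant_pages() does in gntdev above. A minimal sketch of that contract, assuming GNTST_okay from the grant table interface headers:

/* Sketch of the caller-side contract: a zero return from gnttab_map_refs()
 * does not mean every entry mapped, so walk the per-op status codes too. */
#include <linux/mm.h>
#include <xen/grant_table.h>

static int map_and_check(struct gnttab_map_grant_ref *ops,
                         struct page **pages, unsigned int count)
{
        unsigned int i;
        int err = gnttab_map_refs(ops, pages, count);

        if (err)
                return err;             /* hypercall or m2p override failed */

        for (i = 0; i < count; i++)
                if (ops[i].status != GNTST_okay)
                        return -EINVAL; /* this grant was not mapped */
        return 0;
}
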
index c01b5dd..afbe041 100644 (file)
@@ -105,7 +105,7 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
                                       const struct pci_device_id *ent)
 {
        int i, ret;
-       long ioaddr, iolen;
+       long ioaddr;
        long mmio_addr, mmio_len;
        unsigned int max_nr_gframes;
 
@@ -114,7 +114,6 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
                return i;
 
        ioaddr = pci_resource_start(pdev, 0);
-       iolen = pci_resource_len(pdev, 0);
 
        mmio_addr = pci_resource_start(pdev, 1);
        mmio_len = pci_resource_len(pdev, 1);
@@ -125,19 +124,13 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
                goto pci_out;
        }
 
-       if (request_mem_region(mmio_addr, mmio_len, DRV_NAME) == NULL) {
-               dev_err(&pdev->dev, "MEM I/O resource 0x%lx @ 0x%lx busy\n",
-                      mmio_addr, mmio_len);
-               ret = -EBUSY;
+       ret = pci_request_region(pdev, 1, DRV_NAME);
+       if (ret < 0)
                goto pci_out;
-       }
 
-       if (request_region(ioaddr, iolen, DRV_NAME) == NULL) {
-               dev_err(&pdev->dev, "I/O resource 0x%lx @ 0x%lx busy\n",
-                      iolen, ioaddr);
-               ret = -EBUSY;
+       ret = pci_request_region(pdev, 0, DRV_NAME);
+       if (ret < 0)
                goto mem_out;
-       }
 
        platform_mmio = mmio_addr;
        platform_mmiolen = mmio_len;
@@ -169,9 +162,9 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
        return 0;
 
 out:
-       release_region(ioaddr, iolen);
+       pci_release_region(pdev, 0);
 mem_out:
-       release_mem_region(mmio_addr, mmio_len);
+       pci_release_region(pdev, 1);
 pci_out:
        pci_disable_device(pdev);
        return ret;
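
The conversion above lets the PCI core look the addresses and lengths up from the BARs instead of the driver carrying them around just for request/release. A stripped-down sketch of the same acquire/unwind ordering in a hypothetical probe routine (driver name and BAR roles are illustrative):

/* Hypothetical probe skeleton: claim BAR 1 (MMIO) then BAR 0 (ports),
 * releasing in reverse order on failure, as the patched code does. */
#include <linux/pci.h>

static int example_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
        int ret;

        ret = pci_enable_device(pdev);
        if (ret)
                return ret;

        ret = pci_request_region(pdev, 1, "example-drv");       /* MMIO BAR */
        if (ret)
                goto disable;

        ret = pci_request_region(pdev, 0, "example-drv");       /* I/O port BAR */
        if (ret)
                goto release_mmio;

        return 0;

release_mmio:
        pci_release_region(pdev, 1);
disable:
        pci_disable_device(pdev);
        return ret;
}
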
index 9ed4769..d3b28ab 100644 (file)
@@ -141,13 +141,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
        return rc;
 }
 
-static inode *ecryptfs_get_inode(struct inode *lower_inode,
+static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
                       struct super_block *sb)
 {
        struct inode *inode;
        int rc = 0;
 
-       lower_inode = lower_dentry->d_inode;
        if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
                rc = -EXDEV;
                goto out;
@@ -202,7 +201,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
 {
        struct inode *lower_inode = lower_dentry->d_inode;
        struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
-       if (IS_ERR(inode)
+       if (IS_ERR(inode))
                return PTR_ERR(inode);
        if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
                d_add(dentry, inode);
index 3d06ccc..59c6e49 100644 (file)
@@ -84,13 +84,9 @@ static inline struct inode *wb_inode(struct list_head *head)
        return list_entry(head, struct inode, i_wb_list);
 }
 
-static void bdi_queue_work(struct backing_dev_info *bdi,
-               struct wb_writeback_work *work)
+/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
+static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
 {
-       trace_writeback_queue(bdi, work);
-
-       spin_lock_bh(&bdi->wb_lock);
-       list_add_tail(&work->list, &bdi->work_list);
        if (bdi->wb.task) {
                wake_up_process(bdi->wb.task);
        } else {
@@ -98,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
                 * The bdi thread isn't there, wake up the forker thread which
                 * will create and run it.
                 */
-               trace_writeback_nothread(bdi, work);
                wake_up_process(default_backing_dev_info.wb.task);
        }
+}
+
+static void bdi_queue_work(struct backing_dev_info *bdi,
+                          struct wb_writeback_work *work)
+{
+       trace_writeback_queue(bdi, work);
+
+       spin_lock_bh(&bdi->wb_lock);
+       list_add_tail(&work->list, &bdi->work_list);
+       if (!bdi->wb.task)
+               trace_writeback_nothread(bdi, work);
+       bdi_wakeup_flusher(bdi);
        spin_unlock_bh(&bdi->wb_lock);
 }
 
 static void
 __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-               bool range_cyclic, bool for_background)
+                     bool range_cyclic)
 {
        struct wb_writeback_work *work;
 
@@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
        work->sync_mode = WB_SYNC_NONE;
        work->nr_pages  = nr_pages;
        work->range_cyclic = range_cyclic;
-       work->for_background = for_background;
 
        bdi_queue_work(bdi, work);
 }
@@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
 {
-       __bdi_start_writeback(bdi, nr_pages, true, false);
+       __bdi_start_writeback(bdi, nr_pages, true);
 }
 
 /**
@@ -152,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
  * @bdi: the backing device to write from
  *
  * Description:
- *   This does WB_SYNC_NONE background writeback. The IO is only
- *   started when this function returns, we make no guarentees on
- *   completion. Caller need not hold sb s_umount semaphore.
+ *   This makes sure WB_SYNC_NONE background writeback happens. When
+ *   this function returns, it is only guaranteed that for given BDI
+ *   some IO is happening if we are over background dirty threshold.
+ *   Caller need not hold sb s_umount semaphore.
  */
 void bdi_start_background_writeback(struct backing_dev_info *bdi)
 {
-       __bdi_start_writeback(bdi, LONG_MAX, true, true);
+       /*
+        * We just wake up the flusher thread. It will perform background
+        * writeback as soon as there is no other work to do.
+        */
+       trace_writeback_wake_background(bdi);
+       spin_lock_bh(&bdi->wb_lock);
+       bdi_wakeup_flusher(bdi);
+       spin_unlock_bh(&bdi->wb_lock);
 }
 
 /*
@@ -616,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb,
        };
        unsigned long oldest_jif;
        long wrote = 0;
+       long write_chunk;
        struct inode *inode;
 
        if (wbc.for_kupdate) {
@@ -628,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb,
                wbc.range_end = LLONG_MAX;
        }
 
+       /*
+        * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
+        * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
+        * here avoids calling into writeback_inodes_wb() more than once.
+        *
+        * The intended call sequence for WB_SYNC_ALL writeback is:
+        *
+        *      wb_writeback()
+        *          __writeback_inodes_sb()     <== called only once
+        *              write_cache_pages()     <== called once for each inode
+        *                   (quickly) tag currently dirty pages
+        *                   (maybe slowly) sync all tagged pages
+        */
+       if (wbc.sync_mode == WB_SYNC_NONE)
+               write_chunk = MAX_WRITEBACK_PAGES;
+       else
+               write_chunk = LONG_MAX;
+
        wbc.wb_start = jiffies; /* livelock avoidance */
        for (;;) {
                /*
@@ -637,6 +670,16 @@ static long wb_writeback(struct bdi_writeback *wb,
                        break;
 
                /*
+                * Background writeout and kupdate-style writeback may
+                * run forever. Stop them if there is other work to do
+                * so that e.g. sync can proceed. They'll be restarted
+                * after the other works are all done.
+                */
+               if ((work->for_background || work->for_kupdate) &&
+                   !list_empty(&wb->bdi->work_list))
+                       break;
+
+               /*
                 * For background writeout, stop when we are below the
                 * background dirty threshold
                 */
@@ -644,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                        break;
 
                wbc.more_io = 0;
-               wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+               wbc.nr_to_write = write_chunk;
                wbc.pages_skipped = 0;
 
                trace_wbc_writeback_start(&wbc, wb->bdi);
@@ -654,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb,
                        writeback_inodes_wb(wb, &wbc);
                trace_wbc_writeback_written(&wbc, wb->bdi);
 
-               work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-               wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+               work->nr_pages -= write_chunk - wbc.nr_to_write;
+               wrote += write_chunk - wbc.nr_to_write;
 
                /*
                 * If we consumed everything, see if we have more
@@ -670,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                /*
                 * Did we write something? Try for more
                 */
-               if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+               if (wbc.nr_to_write < write_chunk)
                        continue;
                /*
                 * Nothing written. Wait for some inode to
@@ -718,6 +761,23 @@ static unsigned long get_nr_dirty_pages(void)
                get_nr_dirty_inodes();
 }
 
+static long wb_check_background_flush(struct bdi_writeback *wb)
+{
+       if (over_bground_thresh()) {
+
+               struct wb_writeback_work work = {
+                       .nr_pages       = LONG_MAX,
+                       .sync_mode      = WB_SYNC_NONE,
+                       .for_background = 1,
+                       .range_cyclic   = 1,
+               };
+
+               return wb_writeback(wb, &work);
+       }
+
+       return 0;
+}
+
 static long wb_check_old_data_flush(struct bdi_writeback *wb)
 {
        unsigned long expired;
@@ -787,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
         * Check for periodic writeback, kupdated() style
         */
        wrote += wb_check_old_data_flush(wb);
+       wrote += wb_check_background_flush(wb);
        clear_bit(BDI_writeback_running, &wb->bdi->state);
 
        return wrote;
@@ -873,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages)
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                if (!bdi_has_dirty_io(bdi))
                        continue;
-               __bdi_start_writeback(bdi, nr_pages, false, false);
+               __bdi_start_writeback(bdi, nr_pages, false);
        }
        rcu_read_unlock();
 }
@@ -1164,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
  * @sb: the superblock
  *
  * This function writes and waits on any dirty inode belonging to this
- * super_block. The number of pages synced is returned.
+ * super_block.
  */
 void sync_inodes_sb(struct super_block *sb)
 {
@@ -1242,11 +1303,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
 EXPORT_SYMBOL(sync_inode);
 
 /**
- * sync_inode - write an inode to disk
+ * sync_inode_metadata - write an inode to disk
  * @inode: the inode to sync
  * @wait: wait for I/O to complete.
  *
- * Write an inode to disk and adjust it's dirty state after completion.
+ * Write an inode to disk and adjust its dirty state after completion.
  *
  * Note: only writes the actual inode, no associated data or other metadata.
  */
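
Background writeback is now generated inside the flusher thread itself (wb_check_background_flush() above), so a caller that has just dirtied a pile of pages only needs to wake the thread rather than queue a work item. A hedged sketch of such a caller; dirty_many_pages() is a made-up helper, and the pattern only loosely mirrors what balance_dirty_pages() does:

/* Hypothetical dirtier: kick background writeback if it is not running. */
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/writeback.h>

static void dirty_many_pages(struct address_space *mapping)
{
        struct backing_dev_info *bdi = mapping->backing_dev_info;

        /* ... mark a batch of pages dirty ... */

        if (!writeback_in_progress(bdi))
                bdi_start_background_writeback(bdi);  /* just wakes the flusher */
}
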
index fd56ca2..d78455a 100644 (file)
@@ -40,7 +40,7 @@
  * status of that page is hard.  See end_buffer_async_read() for the details.
  * There is no point in duplicating all that complexity.
  */
-static void mpage_end_io_read(struct bio *bio, int err)
+static void mpage_end_io(struct bio *bio, int err)
 {
        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -50,44 +50,29 @@ static void mpage_end_io_read(struct bio *bio, int err)
 
                if (--bvec >= bio->bi_io_vec)
                        prefetchw(&bvec->bv_page->flags);
-
-               if (uptodate) {
-                       SetPageUptodate(page);
-               } else {
-                       ClearPageUptodate(page);
-                       SetPageError(page);
-               }
-               unlock_page(page);
-       } while (bvec >= bio->bi_io_vec);
-       bio_put(bio);
-}
-
-static void mpage_end_io_write(struct bio *bio, int err)
-{
-       const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-       struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-
-       do {
-               struct page *page = bvec->bv_page;
-
-               if (--bvec >= bio->bi_io_vec)
-                       prefetchw(&bvec->bv_page->flags);
-
-               if (!uptodate){
-                       SetPageError(page);
-                       if (page->mapping)
-                               set_bit(AS_EIO, &page->mapping->flags);
+               if (bio_data_dir(bio) == READ) {
+                       if (uptodate) {
+                               SetPageUptodate(page);
+                       } else {
+                               ClearPageUptodate(page);
+                               SetPageError(page);
+                       }
+                       unlock_page(page);
+               } else { /* bio_data_dir(bio) == WRITE */
+                       if (!uptodate) {
+                               SetPageError(page);
+                               if (page->mapping)
+                                       set_bit(AS_EIO, &page->mapping->flags);
+                       }
+                       end_page_writeback(page);
                }
-               end_page_writeback(page);
        } while (bvec >= bio->bi_io_vec);
        bio_put(bio);
 }
 
 static struct bio *mpage_bio_submit(int rw, struct bio *bio)
 {
-       bio->bi_end_io = mpage_end_io_read;
-       if (rw == WRITE)
-               bio->bi_end_io = mpage_end_io_write;
+       bio->bi_end_io = mpage_end_io;
        submit_bio(rw, bio);
        return NULL;
 }
index 95b081b..64ee240 100644 (file)
@@ -1579,6 +1579,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
 {
        struct iattr attr;
        int error;
+       int open_flags = 0;
 
        dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
                        dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1586,7 +1587,10 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
        attr.ia_mode = mode;
        attr.ia_valid = ATTR_MODE;
 
-       error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL);
+       if ((nd->flags & LOOKUP_CREATE) != 0)
+               open_flags = nd->intent.open.flags;
+
+       error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL);
        if (error != 0)
                goto out_err;
        return 0;
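
Passing nd->intent.open.flags through means the protocol-level create sees the flags the application actually used, O_EXCL in particular. A trivial userspace illustration; the mount path is made up:

/* Userspace view: the O_EXCL intent below now reaches the NFS CREATE call. */
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
        int fd = open("/mnt/nfs/newfile", O_CREAT | O_EXCL | O_WRONLY, 0644);

        if (fd < 0)
                perror("exclusive create");     /* EEXIST if it already exists */
        return fd < 0;
}
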
index 93f1cdd..9d096e8 100644 (file)
@@ -1151,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
                goto err_task_lock;
        }
 
-       if (oom_score_adj < task->signal->oom_score_adj &&
+       if (oom_score_adj < task->signal->oom_score_adj_min &&
                        !capable(CAP_SYS_RESOURCE)) {
                err = -EACCES;
                goto err_sighand;
@@ -1164,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
                        atomic_dec(&task->mm->oom_disable_count);
        }
        task->signal->oom_score_adj = oom_score_adj;
+       if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
+               task->signal->oom_score_adj_min = oom_score_adj;
        /*
         * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
         * always attainable.
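
With the extra check, an unprivileged task can no longer push its score back below the minimum recorded for it, while a CAP_SYS_RESOURCE writer lowers that minimum as well. For illustration, a small userspace snippet raising its own score (valid values run from -1000 to 1000):

/* Make ourselves a more attractive OOM-killer victim; going the other way,
 * below oom_score_adj_min, would need CAP_SYS_RESOURCE after this change. */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/self/oom_score_adj", "w");

        if (!f)
                return 1;
        fprintf(f, "500\n");
        return fclose(f) != 0;
}
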
index a65239c..ed257d1 100644 (file)
@@ -101,6 +101,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #ifdef CONFIG_MEMORY_FAILURE
                "HardwareCorrupted: %5lu kB\n"
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+               "AnonHugePages:  %8lu kB\n"
+#endif
                ,
                K(i.totalram),
                K(i.freeram),
@@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
                K(i.freeswap),
                K(global_page_state(NR_FILE_DIRTY)),
                K(global_page_state(NR_WRITEBACK)),
-               K(global_page_state(NR_ANON_PAGES)),
+               K(global_page_state(NR_ANON_PAGES)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+                 + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+                 HPAGE_PMD_NR
+#endif
+                 ),
                K(global_page_state(NR_FILE_MAPPED)),
                K(global_page_state(NR_SHMEM)),
                K(global_page_state(NR_SLAB_RECLAIMABLE) +
@@ -151,6 +159,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #ifdef CONFIG_MEMORY_FAILURE
                ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+               ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+                  HPAGE_PMD_NR)
+#endif
                );
 
        hugetlb_report_meminfo(m);
index b06c674..6d8e6a9 100644 (file)
@@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page)
        if (PageHuge(page))
                u |= 1 << KPF_HUGE;
 
-       u |= kpf_copy_bit(k, KPF_LOCKED,        PG_locked);
-
        /*
-        * Caveats on high order pages:
-        * PG_buddy will only be set on the head page; SLUB/SLQB do the same
-        * for PG_slab; SLOB won't set PG_slab at all on compound pages.
+        * Caveats on high order pages: page->_count will only be set
+        * -1 on the head page; SLUB/SLQB do the same for PG_slab;
+        * SLOB won't set PG_slab at all on compound pages.
         */
+       if (PageBuddy(page))
+               u |= 1 << KPF_BUDDY;
+
+       u |= kpf_copy_bit(k, KPF_LOCKED,        PG_locked);
+
        u |= kpf_copy_bit(k, KPF_SLAB,          PG_slab);
-       u |= kpf_copy_bit(k, KPF_BUDDY,         PG_buddy);
 
        u |= kpf_copy_bit(k, KPF_ERROR,         PG_error);
        u |= kpf_copy_bit(k, KPF_DIRTY,         PG_dirty);
index c3755bd..60b9148 100644 (file)
@@ -418,7 +418,8 @@ static int show_smap(struct seq_file *m, void *v)
                   "Anonymous:      %8lu kB\n"
                   "Swap:           %8lu kB\n"
                   "KernelPageSize: %8lu kB\n"
-                  "MMUPageSize:    %8lu kB\n",
+                  "MMUPageSize:    %8lu kB\n"
+                  "Locked:         %8lu kB\n",
                   (vma->vm_end - vma->vm_start) >> 10,
                   mss.resident >> 10,
                   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -430,7 +431,9 @@ static int show_smap(struct seq_file *m, void *v)
                   mss.anonymous >> 10,
                   mss.swap >> 10,
                   vma_kernel_pagesize(vma) >> 10,
-                  vma_mmu_pagesize(vma) >> 10);
+                  vma_mmu_pagesize(vma) >> 10,
+                  (vma->vm_flags & VM_LOCKED) ?
+                       (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
 
        if (m->count < m->size)  /* vma is copied successfully */
                m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
index 6098cae..ff5c660 100644 (file)
@@ -147,11 +147,11 @@ extern struct gpio_chip *gpiochip_find(void *data,
 /* Always use the library code for GPIO management calls,
  * or when sleeping may be involved.
  */
-extern int __must_check gpio_request(unsigned gpio, const char *label);
+extern int gpio_request(unsigned gpio, const char *label);
 extern void gpio_free(unsigned gpio);
 
-extern int __must_check gpio_direction_input(unsigned gpio);
-extern int __must_check gpio_direction_output(unsigned gpio, int value);
+extern int gpio_direction_input(unsigned gpio);
+extern int gpio_direction_output(unsigned gpio, int value);
 
 extern int gpio_set_debounce(unsigned gpio, unsigned debounce);
 
@@ -192,8 +192,8 @@ struct gpio {
        const char      *label;
 };
 
-extern int __must_check gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
-extern int __must_check gpio_request_array(struct gpio *array, size_t num);
+extern int gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
+extern int gpio_request_array(struct gpio *array, size_t num);
 extern void gpio_free_array(struct gpio *array, size_t num);
 
 #ifdef CONFIG_GPIO_SYSFS
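
These are the bulk request helpers whose __must_check markings are dropped above. A short sketch of how a board file might use them, assuming the GPIOF_* setup flags introduced together with gpio_request_one()/gpio_request_array(); the GPIO numbers and labels are invented:

/* Hypothetical board code: claim a group of GPIOs with one call. */
#include <linux/gpio.h>
#include <linux/kernel.h>

static struct gpio board_gpios[] = {
        { 42, GPIOF_OUT_INIT_LOW, "status-led" },       /* output, driven low */
        { 43, GPIOF_IN,           "user-button" },      /* input */
};

static int board_claim_gpios(void)
{
        int err = gpio_request_array(board_gpios, ARRAY_SIZE(board_gpios));

        if (err)
                return err;
        /* ... gpio_set_value(42, 1), gpio_get_value(43), ... */
        gpio_free_array(board_gpios, ARRAY_SIZE(board_gpios));
        return 0;
}
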
index 3da9e27..787abbb 100644 (file)
@@ -45,6 +45,9 @@
 #define MADV_MERGEABLE   12            /* KSM may merge identical pages */
 #define MADV_UNMERGEABLE 13            /* KSM may not merge identical pages */
 
+#define MADV_HUGEPAGE  14              /* Worth backing with hugepages */
+#define MADV_NOHUGEPAGE        15              /* Not worth backing with hugepages */
+
 /* compatibility flags */
 #define MAP_FILE       0
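
MADV_HUGEPAGE and MADV_NOHUGEPAGE let an application opt a range in or out of transparent hugepage backing, which matters when THP is built in madvise-only mode. A minimal userspace sketch; the fallback define simply mirrors the value added above in case the libc headers predate it:

/* Hint that this anonymous region is worth backing with huge pages. */
#include <stdlib.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14        /* value from asm-generic/mman-common.h above */
#endif

#define REGION_SIZE (16UL * 1024 * 1024)        /* room for several 2 MB pages */

int main(void)
{
        void *buf = mmap(NULL, REGION_SIZE, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;
        if (madvise(buf, REGION_SIZE, MADV_HUGEPAGE) != 0) {
                /* older kernel: EINVAL, the region still works as 4k pages */
        }
        /* ... touch the memory; the fault path or khugepaged may use THP ... */
        return 0;
}
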
 
index 6f3c6ae..f1eddf7 100644 (file)
 #ifdef CONFIG_MMU
 
 #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-/*
- * Largely same as above, but only sets the access flags (dirty,
- * accessed, and writable). Furthermore, we know it always gets set
- * to a "more permissive" setting, which allows most architectures
- * to optimize this. We return whether the PTE actually changed, which
- * in turn instructs the caller to do things like update__mmu_cache.
- * This used to be done in the caller, but sparc needs minor faults to
- * force that call on sun4c so we changed this macro slightly
- */
-#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
-({                                                                       \
-       int __changed = !pte_same(*(__ptep), __entry);                    \
-       if (__changed) {                                                  \
-               set_pte_at((__vma)->vm_mm, (__address), __ptep, __entry); \
-               flush_tlb_page(__vma, __address);                         \
-       }                                                                 \
-       __changed;                                                        \
-})
+extern int ptep_set_access_flags(struct vm_area_struct *vma,
+                                unsigned long address, pte_t *ptep,
+                                pte_t entry, int dirty);
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+extern int pmdp_set_access_flags(struct vm_area_struct *vma,
+                                unsigned long address, pmd_t *pmdp,
+                                pmd_t entry, int dirty);
 #endif
 
 #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define ptep_test_and_clear_young(__vma, __address, __ptep)            \
-({                                                                     \
-       pte_t __pte = *(__ptep);                                        \
-       int r = 1;                                                      \
-       if (!pte_young(__pte))                                          \
-               r = 0;                                                  \
-       else                                                            \
-               set_pte_at((__vma)->vm_mm, (__address),                 \
-                          (__ptep), pte_mkold(__pte));                 \
-       r;                                                              \
-})
+static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
+                                           unsigned long address,
+                                           pte_t *ptep)
+{
+       pte_t pte = *ptep;
+       int r = 1;
+       if (!pte_young(pte))
+               r = 0;
+       else
+               set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte));
+       return r;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+                                           unsigned long address,
+                                           pmd_t *pmdp)
+{
+       pmd_t pmd = *pmdp;
+       int r = 1;
+       if (!pmd_young(pmd))
+               r = 0;
+       else
+               set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd));
+       return r;
+}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+                                           unsigned long address,
+                                           pmd_t *pmdp)
+{
+       BUG();
+       return 0;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
 #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-#define ptep_clear_flush_young(__vma, __address, __ptep)               \
-({                                                                     \
-       int __young;                                                    \
-       __young = ptep_test_and_clear_young(__vma, __address, __ptep);  \
-       if (__young)                                                    \
-               flush_tlb_page(__vma, __address);                       \
-       __young;                                                        \
-})
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+                          unsigned long address, pte_t *ptep);
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+                          unsigned long address, pmd_t *pmdp);
 #endif
 
 #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
-#define ptep_get_and_clear(__mm, __address, __ptep)                    \
-({                                                                     \
-       pte_t __pte = *(__ptep);                                        \
-       pte_clear((__mm), (__address), (__ptep));                       \
-       __pte;                                                          \
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
+                                      unsigned long address,
+                                      pte_t *ptep)
+{
+       pte_t pte = *ptep;
+       pte_clear(mm, address, ptep);
+       return pte;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+                                      unsigned long address,
+                                      pmd_t *pmdp)
+{
+       pmd_t pmd = *pmdp;
+       pmd_clear(mm, address, pmdp);
+       return pmd;
+}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+                                      unsigned long address,
+                                      pmd_t *pmdp)
+{
+       BUG();
+       return __pmd(0);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
 #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
-#define ptep_get_and_clear_full(__mm, __address, __ptep, __full)       \
-({                                                                     \
-       pte_t __pte;                                                    \
-       __pte = ptep_get_and_clear((__mm), (__address), (__ptep));      \
-       __pte;                                                          \
-})
+static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
+                                           unsigned long address, pte_t *ptep,
+                                           int full)
+{
+       pte_t pte;
+       pte = ptep_get_and_clear(mm, address, ptep);
+       return pte;
+}
 #endif
 
 /*
  * not present, or in the process of an address space destruction.
  */
 #ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
-#define pte_clear_not_present_full(__mm, __address, __ptep, __full)    \
-do {                                                                   \
-       pte_clear((__mm), (__address), (__ptep));                       \
-} while (0)
+static inline void pte_clear_not_present_full(struct mm_struct *mm,
+                                             unsigned long address,
+                                             pte_t *ptep,
+                                             int full)
+{
+       pte_clear(mm, address, ptep);
+}
 #endif
 
 #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
-#define ptep_clear_flush(__vma, __address, __ptep)                     \
-({                                                                     \
-       pte_t __pte;                                                    \
-       __pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep);  \
-       flush_tlb_page(__vma, __address);                               \
-       __pte;                                                          \
-})
+extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
+                             unsigned long address,
+                             pte_t *ptep);
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
+extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
+                             unsigned long address,
+                             pmd_t *pmdp);
 #endif
 
 #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
@@ -99,8 +145,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
 }
 #endif
 
+#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline void pmdp_set_wrprotect(struct mm_struct *mm,
+                                     unsigned long address, pmd_t *pmdp)
+{
+       pmd_t old_pmd = *pmdp;
+       set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd));
+}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+static inline void pmdp_set_wrprotect(struct mm_struct *mm,
+                                     unsigned long address, pmd_t *pmdp)
+{
+       BUG();
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+extern pmd_t pmdp_splitting_flush(struct vm_area_struct *vma,
+                                 unsigned long address,
+                                 pmd_t *pmdp);
+#endif
+
 #ifndef __HAVE_ARCH_PTE_SAME
-#define pte_same(A,B)  (pte_val(A) == pte_val(B))
+static inline int pte_same(pte_t pte_a, pte_t pte_b)
+{
+       return pte_val(pte_a) == pte_val(pte_b);
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMD_SAME
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
+{
+       return pmd_val(pmd_a) == pmd_val(pmd_b);
+}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
+{
+       BUG();
+       return 0;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
 #ifndef __HAVE_ARCH_PAGE_TEST_DIRTY
@@ -348,6 +435,24 @@ extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
                                unsigned long size);
 #endif
 
+#ifndef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+       return 0;
+}
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+       return 0;
+}
+#ifndef __HAVE_ARCH_PMD_WRITE
+static inline int pmd_write(pmd_t pmd)
+{
+       BUG();
+       return 0;
+}
+#endif /* __HAVE_ARCH_PMD_WRITE */
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_GENERIC_PGTABLE_H */
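
Every helper above follows the same convention: an architecture that can do better defines the matching __HAVE_ARCH_* symbol in its own asm/pgtable.h and supplies its version, and only the leftovers fall through to this file. A hedged sketch of such an override for an invented architecture that can afford to skip the TLB flush when clearing the accessed bit:

/* Fragment of a hypothetical arch/.../include/asm/pgtable.h */
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
                                         unsigned long address, pte_t *ptep)
{
        /* Accessed-bit updates may be lazy on this imaginary architecture,
         * so reuse the generic test-and-clear and skip flush_tlb_page(). */
        return ptep_test_and_clear_young(vma, address, ptep);
}

/* ... and at the end of the arch header, as usual: */
#include <asm-generic/pgtable.h>
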
index 5ac5155..dfa2ed4 100644 (file)
@@ -11,6 +11,9 @@
 /* The full zone was compacted */
 #define COMPACT_COMPLETE       3
 
+#define COMPACT_MODE_DIRECT_RECLAIM    0
+#define COMPACT_MODE_KSWAPD            1
+
 #ifdef CONFIG_COMPACTION
 extern int sysctl_compact_memory;
 extern int sysctl_compaction_handler(struct ctl_table *table, int write,
@@ -21,7 +24,12 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
-                       int order, gfp_t gfp_mask, nodemask_t *mask);
+                       int order, gfp_t gfp_mask, nodemask_t *mask,
+                       bool sync);
+extern unsigned long compaction_suitable(struct zone *zone, int order);
+extern unsigned long compact_zone_order(struct zone *zone, int order,
+                                       gfp_t gfp_mask, bool sync,
+                                       int compact_mode);
 
 /* Do not skip compaction more than 64 times */
 #define COMPACT_MAX_DEFER_SHIFT 6
@@ -54,7 +62,20 @@ static inline bool compaction_deferred(struct zone *zone)
 
 #else
 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
-                       int order, gfp_t gfp_mask, nodemask_t *nodemask)
+                       int order, gfp_t gfp_mask, nodemask_t *nodemask,
+                       bool sync)
+{
+       return COMPACT_CONTINUE;
+}
+
+static inline unsigned long compaction_suitable(struct zone *zone, int order)
+{
+       return COMPACT_SKIPPED;
+}
+
+static inline unsigned long compact_zone_order(struct zone *zone, int order,
+                                              gfp_t gfp_mask, bool sync,
+                                              int compact_mode)
 {
        return COMPACT_CONTINUE;
 }
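
compaction_suitable() and compact_zone_order() are exported so that callers such as kswapd or the transparent hugepage allocation path can first check whether a zone has enough free base pages for migration to succeed, and then compact just that zone with the wanted sync mode. A hedged in-kernel sketch of that sequence:

/* Sketch: compact a single zone for a high-order allocation, kswapd-style.
 * Relies on the COMPACT_* codes and COMPACT_MODE_KSWAPD defined above. */
#include <linux/compaction.h>
#include <linux/gfp.h>
#include <linux/mmzone.h>

static void try_zone_compaction(struct zone *zone, int order, gfp_t gfp_mask)
{
        if (compaction_suitable(zone, order) == COMPACT_SKIPPED)
                return;         /* too few free pages to migrate into yet */

        /* sync=false: do not wait on page writeback while migrating */
        compact_zone_order(zone, order, gfp_mask, false, COMPACT_MODE_KSWAPD);
}
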
index 2970022..272496d 100644 (file)
@@ -193,6 +193,13 @@ struct dm_target {
        char *error;
 };
 
+/* Each target can link one of these into the table */
+struct dm_target_callbacks {
+       struct list_head list;
+       int (*congested_fn) (struct dm_target_callbacks *, int);
+       void (*unplug_fn)(struct dm_target_callbacks *);
+};
+
 int dm_register_target(struct target_type *t);
 void dm_unregister_target(struct target_type *t);
 
@@ -269,6 +276,11 @@ int dm_table_add_target(struct dm_table *t, const char *type,
                        sector_t start, sector_t len, char *params);
 
 /*
+ * Target_ctr should call this if it needs to add any callbacks.
+ */
+void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb);
+
+/*
  * Finally call this to make the table ready for use.
  */
 int dm_table_complete(struct dm_table *t);
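
The callbacks hook exists so that a target which owns its own backing devices (the new dm-raid target appears to be the intended user) can report congestion back through the device-mapper core. A hedged sketch of a target constructor wiring one up; all my_raid_* names are invented:

/* Hypothetical target: publish a congested_fn via dm_target_callbacks. */
#include <linux/device-mapper.h>
#include <linux/slab.h>

struct my_raid_set {
        struct dm_target_callbacks callbacks;
        int congested;                  /* updated elsewhere in the target */
};

static int my_raid_is_congested(struct dm_target_callbacks *cb, int bits)
{
        struct my_raid_set *rs = container_of(cb, struct my_raid_set, callbacks);

        return rs->congested;           /* non-zero means back off */
}

static int my_raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
        struct my_raid_set *rs = kzalloc(sizeof(*rs), GFP_KERNEL);

        if (!rs) {
                ti->error = "cannot allocate context";
                return -ENOMEM;
        }

        rs->callbacks.congested_fn = my_raid_is_congested;
        dm_table_add_target_callbacks(ti->table, &rs->callbacks);

        ti->private = rs;
        return 0;
}
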
index 49eab36..78bbf47 100644 (file)
@@ -44,7 +44,7 @@
  * Remove a device, destroy any tables.
  *
  * DM_DEV_RENAME:
- * Rename a device.
+ * Rename a device or set its uuid if none was previously supplied.
  *
  * DM_SUSPEND:
  * This performs both suspend and resume, depending which flag is
@@ -267,9 +267,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY    _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR       4
-#define DM_VERSION_MINOR       18
-#define DM_VERSION_PATCHLEVEL  0
-#define DM_VERSION_EXTRA       "-ioctl (2010-06-29)"
+#define DM_VERSION_MINOR       19
+#define DM_VERSION_PATCHLEVEL  1
+#define DM_VERSION_EXTRA       "-ioctl (2011-01-07)"
 
 /* Status bits */
 #define DM_READONLY_FLAG       (1 << 0) /* In/Out */
@@ -322,4 +322,10 @@ enum {
  */
 #define DM_UEVENT_GENERATED_FLAG       (1 << 13) /* Out */
 
+/*
+ * If set, rename changes the uuid not the name.  Only permitted
+ * if no uuid was previously supplied: an existing uuid cannot be changed.
+ */
+#define DM_UUID_FLAG                   (1 << 14) /* In */
+
 #endif                         /* _LINUX_DM_IOCTL_H */
index 0c3c3a2..eeace7d 100644 (file)
 #define DM_ULOG_REQUEST_TYPE(request_type) \
        (DM_ULOG_REQUEST_MASK & (request_type))
 
+/*
+ * DM_ULOG_REQUEST_VERSION is incremented when there is a
+ * change to the way information is passed between kernel
+ * and userspace.  This could be a structure change of
+ * dm_ulog_request or a change in the way requests are
+ * issued/handled.  Changes are outlined here:
+ *     version 1:  Initial implementation
+ */
+#define DM_ULOG_REQUEST_VERSION 1
+
 struct dm_ulog_request {
        /*
         * The local unique identifier (luid) and the universally unique
@@ -383,8 +393,9 @@ struct dm_ulog_request {
         */
        uint64_t luid;
        char uuid[DM_UUID_LEN];
-       char padding[7];        /* Padding because DM_UUID_LEN = 129 */
+       char padding[3];        /* Padding because DM_UUID_LEN = 129 */
 
+       uint32_t version;       /* See DM_ULOG_REQUEST_VERSION */
        int32_t error;          /* Used to report back processing errors */
 
        uint32_t seq;           /* Sequence number for request */
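
The new version field gives the userspace log server a way to notice a kernel speaking a different revision of this request format. A hedged sketch of the check a daemon might perform on each request it pulls off the connector channel, assuming this header is usable from userspace as-is:

/* Userspace log-server side: reject requests from an unknown protocol rev. */
#include <errno.h>
#include <linux/dm-log-userspace.h>

static int check_request(struct dm_ulog_request *rq)
{
        if (rq->version != DM_ULOG_REQUEST_VERSION) {
                rq->error = -EINVAL;    /* reported back to the kernel */
                rq->data_size = 0;
                return -1;
        }
        return 0;
}
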
index f54adfc..a3b148a 100644 (file)
@@ -34,6 +34,7 @@ struct vm_area_struct;
 #else
 #define ___GFP_NOTRACK         0
 #endif
+#define ___GFP_NO_KSWAPD       0x400000u
 
 /*
  * GFP bitmasks..
@@ -81,13 +82,15 @@ struct vm_area_struct;
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
 #define __GFP_NOTRACK  ((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */
 
+#define __GFP_NO_KSWAPD        ((__force gfp_t)___GFP_NO_KSWAPD)
+
 /*
  * This may seem redundant, but it's a way of annotating false positives vs.
  * allocations that simply cannot be supported (e.g. page tables).
  */
 #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
 
-#define __GFP_BITS_SHIFT 22    /* Room for 22 __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 23    /* Room for 23 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /* This equals 0, but use constants in case they ever change */
@@ -106,6 +109,9 @@ struct vm_area_struct;
                                 __GFP_HARDWALL | __GFP_HIGHMEM | \
                                 __GFP_MOVABLE)
 #define GFP_IOFS       (__GFP_IO | __GFP_FS)
+#define GFP_TRANSHUGE  (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
+                        __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
+                        __GFP_NO_KSWAPD)
 
 #ifdef CONFIG_NUMA
 #define GFP_THISNODE   (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
@@ -325,14 +331,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
        return alloc_pages_current(gfp_mask, order);
 }
-extern struct page *alloc_page_vma(gfp_t gfp_mask,
+extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
                        struct vm_area_struct *vma, unsigned long addr);
 #else
 #define alloc_pages(gfp_mask, order) \
                alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0)
+#define alloc_pages_vma(gfp_mask, order, vma, addr)    \
+       alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
+#define alloc_page_vma(gfp_mask, vma, addr)    \
+       alloc_pages_vma(gfp_mask, 0, vma, addr)
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
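
GFP_TRANSHUGE bundles what a huge page fault wants: a movable, compound, no-warn allocation that must not wake kswapd, since compaction rather than reclaim is the preferred way to produce the contiguous block. A hedged sketch of how the THP code is expected to use the order-taking alloc_pages_vma(); HPAGE_PMD_ORDER is derived here, as an assumption, from HPAGE_PMD_SHIFT:

/* Sketch of a pmd-sized allocation for a fault at haddr, roughly what the
 * THP fault path does; falling back to 4k pages is the caller's job. */
#include <linux/gfp.h>
#include <linux/mm.h>

#ifndef HPAGE_PMD_ORDER
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT - PAGE_SHIFT)  /* assumption */
#endif

static struct page *alloc_huge_candidate(struct vm_area_struct *vma,
                                         unsigned long haddr)
{
        /* __GFP_NO_KSWAPD inside GFP_TRANSHUGE keeps a failure here from
         * leaving kswapd spinning; compaction is tried instead. */
        return alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, haddr);
}
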
index f79d67f..4b47ed9 100644 (file)
@@ -30,7 +30,7 @@ static inline int gpio_is_valid(int number)
        return 0;
 }
 
-static inline int __must_check gpio_request(unsigned gpio, const char *label)
+static inline int gpio_request(unsigned gpio, const char *label)
 {
        return -ENOSYS;
 }
@@ -62,12 +62,12 @@ static inline void gpio_free_array(struct gpio *array, size_t num)
        WARN_ON(1);
 }
 
-static inline int __must_check gpio_direction_input(unsigned gpio)
+static inline int gpio_direction_input(unsigned gpio)
 {
        return -ENOSYS;
 }
 
-static inline int __must_check gpio_direction_output(unsigned gpio, int value)
+static inline int gpio_direction_output(unsigned gpio, int value)
 {
        return -ENOSYS;
 }
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
new file mode 100644 (file)
index 0000000..8e6c8c4
--- /dev/null
@@ -0,0 +1,179 @@
+#ifndef _LINUX_HUGE_MM_H
+#define _LINUX_HUGE_MM_H
+
+extern int do_huge_pmd_anonymous_page(struct mm_struct *mm,
+                                     struct vm_area_struct *vma,
+                                     unsigned long address, pmd_t *pmd,
+                                     unsigned int flags);
+extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+                        pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+                        struct vm_area_struct *vma);
+extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+                              unsigned long address, pmd_t *pmd,
+                              pmd_t orig_pmd);
+extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm);
+extern struct page *follow_trans_huge_pmd(struct mm_struct *mm,
+                                         unsigned long addr,
+                                         pmd_t *pmd,
+                                         unsigned int flags);
+extern int zap_huge_pmd(struct mmu_gather *tlb,
+                       struct vm_area_struct *vma,
+                       pmd_t *pmd);
+extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+                       unsigned long addr, unsigned long end,
+                       unsigned char *vec);
+extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+                       unsigned long addr, pgprot_t newprot);
+
+enum transparent_hugepage_flag {
+       TRANSPARENT_HUGEPAGE_FLAG,
+       TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+       TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
+       TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
+       TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
+#ifdef CONFIG_DEBUG_VM
+       TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
+#endif
+};
+
+enum page_check_address_pmd_flag {
+       PAGE_CHECK_ADDRESS_PMD_FLAG,
+       PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG,
+       PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG,
+};
+extern pmd_t *page_check_address_pmd(struct page *page,
+                                    struct mm_struct *mm,
+                                    unsigned long address,
+                                    enum page_check_address_pmd_flag flag);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define HPAGE_PMD_SHIFT HPAGE_SHIFT
+#define HPAGE_PMD_MASK HPAGE_MASK
+#define HPAGE_PMD_SIZE HPAGE_SIZE
+
+#define transparent_hugepage_enabled(__vma)                            \
+       ((transparent_hugepage_flags &                                  \
+         (1<<TRANSPARENT_HUGEPAGE_FLAG) ||                             \
+         (transparent_hugepage_flags &                                 \
+          (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG) &&                   \
+          ((__vma)->vm_flags & VM_HUGEPAGE))) &&                       \
+        !((__vma)->vm_flags & VM_NOHUGEPAGE))
+#define transparent_hugepage_defrag(__vma)                             \
+       ((transparent_hugepage_flags &                                  \
+         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)) ||                     \
+        (transparent_hugepage_flags &                                  \
+         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG) &&             \
+         (__vma)->vm_flags & VM_HUGEPAGE))
+#ifdef CONFIG_DEBUG_VM
+#define transparent_hugepage_debug_cow()                               \
+       (transparent_hugepage_flags &                                   \
+        (1<<TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG))
+#else /* CONFIG_DEBUG_VM */
+#define transparent_hugepage_debug_cow() 0
+#endif /* CONFIG_DEBUG_VM */
+
+extern unsigned long transparent_hugepage_flags;
+extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+                         pmd_t *dst_pmd, pmd_t *src_pmd,
+                         struct vm_area_struct *vma,
+                         unsigned long addr, unsigned long end);
+extern int handle_pte_fault(struct mm_struct *mm,
+                           struct vm_area_struct *vma, unsigned long address,
+                           pte_t *pte, pmd_t *pmd, unsigned int flags);
+extern int split_huge_page(struct page *page);
+extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
+#define split_huge_page_pmd(__mm, __pmd)                               \
+       do {                                                            \
+               pmd_t *____pmd = (__pmd);                               \
+               if (unlikely(pmd_trans_huge(*____pmd)))                 \
+                       __split_huge_page_pmd(__mm, ____pmd);           \
+       }  while (0)
+#define wait_split_huge_page(__anon_vma, __pmd)                                \
+       do {                                                            \
+               pmd_t *____pmd = (__pmd);                               \
+               spin_unlock_wait(&(__anon_vma)->root->lock);            \
+               /*                                                      \
+                * spin_unlock_wait() is just a loop in C and so the    \
+                * CPU can reorder anything around it.                  \
+                */                                                     \
+               smp_mb();                                               \
+               BUG_ON(pmd_trans_splitting(*____pmd) ||                 \
+                      pmd_trans_huge(*____pmd));                       \
+       } while (0)
+#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
+#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
+#if HPAGE_PMD_ORDER > MAX_ORDER
+#error "hugepages can't be allocated by the buddy allocator"
+#endif
+extern int hugepage_madvise(struct vm_area_struct *vma,
+                           unsigned long *vm_flags, int advice);
+extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+                                   unsigned long start,
+                                   unsigned long end,
+                                   long adjust_next);
+static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
+                                        unsigned long start,
+                                        unsigned long end,
+                                        long adjust_next)
+{
+       if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+               return;
+       __vma_adjust_trans_huge(vma, start, end, adjust_next);
+}
+static inline int hpage_nr_pages(struct page *page)
+{
+       if (unlikely(PageTransHuge(page)))
+               return HPAGE_PMD_NR;
+       return 1;
+}
+static inline struct page *compound_trans_head(struct page *page)
+{
+       if (PageTail(page)) {
+               struct page *head;
+               head = page->first_page;
+               smp_rmb();
+               /*
+                * head may be a dangling pointer.
+                * __split_huge_page_refcount clears PageTail before
+                * overwriting first_page, so if PageTail is still
+                * there it means the head pointer isn't dangling.
+                */
+               if (PageTail(page))
+                       return head;
+       }
+       return page;
+}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+#define HPAGE_PMD_SHIFT ({ BUG(); 0; })
+#define HPAGE_PMD_MASK ({ BUG(); 0; })
+#define HPAGE_PMD_SIZE ({ BUG(); 0; })
+
+#define hpage_nr_pages(x) 1
+
+#define transparent_hugepage_enabled(__vma) 0
+
+#define transparent_hugepage_flags 0UL
+static inline int split_huge_page(struct page *page)
+{
+       return 0;
+}
+#define split_huge_page_pmd(__mm, __pmd)       \
+       do { } while (0)
+#define wait_split_huge_page(__anon_vma, __pmd)        \
+       do { } while (0)
+#define compound_trans_head(page) compound_head(page)
+static inline int hugepage_madvise(struct vm_area_struct *vma,
+                                  unsigned long *vm_flags, int advice)
+{
+       BUG();
+       return 0;
+}
+static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
+                                        unsigned long start,
+                                        unsigned long end,
+                                        long adjust_next)
+{
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#endif /* _LINUX_HUGE_MM_H */
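
split_huge_page_pmd() above is a no-op unless the pmd is transparent-huge, so generic page-table code that only understands ptes can call it unconditionally before descending. A minimal sketch of that pattern, assuming a hypothetical walker function that is not part of this patch:

        /* Hypothetical walker: split any trans-huge pmd, then scan ptes. */
        static void example_walk_pmd(struct mm_struct *mm, pmd_t *pmd,
                                     unsigned long addr, unsigned long end)
        {
                pte_t *start, *pte;

                split_huge_page_pmd(mm, pmd);   /* no-op for regular pmds */
                if (pmd_none_or_clear_bad(pmd))
                        return;

                start = pte = pte_offset_map(pmd, addr);
                for (; addr != end; addr += PAGE_SIZE, pte++) {
                        /* ... inspect *pte here ... */
                }
                pte_unmap(start);
        }
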
index 979c68c..6a64c6f 100644 (file)
@@ -57,7 +57,7 @@ struct irq_desc {
 #endif
 
        struct timer_rand_state *timer_rand_state;
-       unsigned int            *kstat_irqs;
+       unsigned int __percpu   *kstat_irqs;
        irq_flow_handler_t      handle_irq;
        struct irqaction        *action;        /* IRQ action list */
        unsigned int            status;         /* IRQ status */
index 57dac70..5a9d905 100644 (file)
@@ -600,6 +600,13 @@ struct sysinfo {
 #define NUMA_BUILD 0
 #endif
 
+/* This helps us avoid #ifdef CONFIG_COMPACTION */
+#ifdef CONFIG_COMPACTION
+#define COMPACTION_BUILD 1
+#else
+#define COMPACTION_BUILD 0
+#endif
+
 /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
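
As with NUMA_BUILD above, COMPACTION_BUILD lets C code branch on a compile-time constant instead of wrapping call sites in #ifdef; the compiler discards the dead branch when compaction is not configured. A hedged illustration (the function name is made up):

        static inline bool example_should_try_compaction(int order)
        {
                /* compiles away entirely when CONFIG_COMPACTION is off */
                if (!COMPACTION_BUILD)
                        return false;
                return order > 0;
        }
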
index 44e83ba..0cce2db 100644 (file)
@@ -46,16 +46,14 @@ DECLARE_PER_CPU(struct kernel_stat, kstat);
 extern unsigned long long nr_context_switches(void);
 
 #ifndef CONFIG_GENERIC_HARDIRQS
-#define kstat_irqs_this_cpu(irq) \
-       (this_cpu_read(kstat.irqs[irq])
 
 struct irq_desc;
 
 static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
                                            struct irq_desc *desc)
 {
-       kstat_this_cpu.irqs[irq]++;
-       kstat_this_cpu.irqs_sum++;
+       __this_cpu_inc(kstat.irqs[irq]);
+       __this_cpu_inc(kstat.irqs_sum);
 }
 
 static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
@@ -65,17 +63,18 @@ static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 #else
 #include <linux/irq.h>
 extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
-#define kstat_irqs_this_cpu(DESC) \
-       ((DESC)->kstat_irqs[smp_processor_id()])
-#define kstat_incr_irqs_this_cpu(irqno, DESC) do {\
-       ((DESC)->kstat_irqs[smp_processor_id()]++);\
-       kstat_this_cpu.irqs_sum++; } while (0)
+
+#define kstat_incr_irqs_this_cpu(irqno, DESC)          \
+do {                                                   \
+       __this_cpu_inc(*(DESC)->kstat_irqs);            \
+       __this_cpu_inc(kstat.irqs_sum);                 \
+} while (0)
 
 #endif
 
 static inline void kstat_incr_softirqs_this_cpu(unsigned int irq)
 {
-       kstat_this_cpu.softirqs[irq]++;
+       __this_cpu_inc(kstat.softirqs[irq]);
 }
 
 static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu)
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
new file mode 100644 (file)
index 0000000..6b394f0
--- /dev/null
@@ -0,0 +1,67 @@
+#ifndef _LINUX_KHUGEPAGED_H
+#define _LINUX_KHUGEPAGED_H
+
+#include <linux/sched.h> /* MMF_VM_HUGEPAGE */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern int __khugepaged_enter(struct mm_struct *mm);
+extern void __khugepaged_exit(struct mm_struct *mm);
+extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma);
+
+#define khugepaged_enabled()                                          \
+       (transparent_hugepage_flags &                                  \
+        ((1<<TRANSPARENT_HUGEPAGE_FLAG) |                     \
+         (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)))
+#define khugepaged_always()                            \
+       (transparent_hugepage_flags &                   \
+        (1<<TRANSPARENT_HUGEPAGE_FLAG))
+#define khugepaged_req_madv()                                  \
+       (transparent_hugepage_flags &                           \
+        (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))
+#define khugepaged_defrag()                                    \
+       (transparent_hugepage_flags &                           \
+        (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
+
+static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+       if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags))
+               return __khugepaged_enter(mm);
+       return 0;
+}
+
+static inline void khugepaged_exit(struct mm_struct *mm)
+{
+       if (test_bit(MMF_VM_HUGEPAGE, &mm->flags))
+               __khugepaged_exit(mm);
+}
+
+static inline int khugepaged_enter(struct vm_area_struct *vma)
+{
+       if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
+               if ((khugepaged_always() ||
+                    (khugepaged_req_madv() &&
+                     vma->vm_flags & VM_HUGEPAGE)) &&
+                   !(vma->vm_flags & VM_NOHUGEPAGE))
+                       if (__khugepaged_enter(vma->vm_mm))
+                               return -ENOMEM;
+       return 0;
+}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+       return 0;
+}
+static inline void khugepaged_exit(struct mm_struct *mm)
+{
+}
+static inline int khugepaged_enter(struct vm_area_struct *vma)
+{
+       return 0;
+}
+static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+{
+       return 0;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#endif /* _LINUX_KHUGEPAGED_H */
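
The inline wrappers above are meant to be dropped into the existing mm lifecycle hooks. Roughly, and with illustrative call-site names only (the real hooks live in mm/mmap.c and kernel/fork.c, not here):

        /* Illustrative call sites; error handling trimmed. */
        static int example_on_mmap(struct vm_area_struct *vma)
        {
                /* register the mm with khugepaged if this vma qualifies */
                return khugepaged_enter(vma);
        }

        static int example_on_fork(struct mm_struct *mm, struct mm_struct *oldmm)
        {
                /* child inherits the parent's registration */
                return khugepaged_fork(mm, oldmm);
        }

        static void example_on_mm_teardown(struct mm_struct *mm)
        {
                khugepaged_exit(mm);
        }
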
index 159a076..6a576f9 100644 (file)
@@ -25,6 +25,11 @@ struct page_cgroup;
 struct page;
 struct mm_struct;
 
+/* Stats that can be updated by kernel. */
+enum mem_cgroup_page_stat_item {
+       MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
+};
+
 extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
                                        struct list_head *dst,
                                        unsigned long *scanned, int order,
@@ -93,7 +98,7 @@ extern int
 mem_cgroup_prepare_migration(struct page *page,
        struct page *newpage, struct mem_cgroup **ptr);
 extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
-       struct page *oldpage, struct page *newpage);
+       struct page *oldpage, struct page *newpage, bool migration_ok);
 
 /*
  * For memory reclaim.
@@ -121,7 +126,22 @@ static inline bool mem_cgroup_disabled(void)
        return false;
 }
 
-void mem_cgroup_update_file_mapped(struct page *page, int val);
+void mem_cgroup_update_page_stat(struct page *page,
+                                enum mem_cgroup_page_stat_item idx,
+                                int val);
+
+static inline void mem_cgroup_inc_page_stat(struct page *page,
+                                           enum mem_cgroup_page_stat_item idx)
+{
+       mem_cgroup_update_page_stat(page, idx, 1);
+}
+
+static inline void mem_cgroup_dec_page_stat(struct page *page,
+                                           enum mem_cgroup_page_stat_item idx)
+{
+       mem_cgroup_update_page_stat(page, idx, -1);
+}
+
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
                                                gfp_t gfp_mask);
 u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
@@ -231,8 +251,7 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
 }
 
 static inline void mem_cgroup_end_migration(struct mem_cgroup *mem,
-                                       struct page *oldpage,
-                                       struct page *newpage)
+               struct page *oldpage, struct page *newpage, bool migration_ok)
 {
 }
 
@@ -293,8 +312,13 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
 }
 
-static inline void mem_cgroup_update_file_mapped(struct page *page,
-                                                       int val)
+static inline void mem_cgroup_inc_page_stat(struct page *page,
+                                           enum mem_cgroup_page_stat_item idx)
+{
+}
+
+static inline void mem_cgroup_dec_page_stat(struct page *page,
+                                           enum mem_cgroup_page_stat_item idx)
 {
 }
 
index 31c237a..24376fe 100644 (file)
@@ -13,12 +13,16 @@ struct mem_section;
 #ifdef CONFIG_MEMORY_HOTPLUG
 
 /*
- * Types for free bootmem.
- * The normal smallest mapcount is -1. Here is smaller value than it.
+ * Types for free bootmem stored in page->lru.next. These have to be in
+ * some random range in unsigned long space for debugging purposes.
  */
-#define SECTION_INFO           (-1 - 1)
-#define MIX_SECTION_INFO       (-1 - 2)
-#define NODE_INFO              (-1 - 3)
+enum {
+       MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12,
+       SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE,
+       MIX_SECTION_INFO,
+       NODE_INFO,
+       MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO,
+};
 
 /*
  * pgdat resizing functions
index 085527f..e39aeec 100644 (file)
@@ -13,9 +13,11 @@ extern void putback_lru_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
                        struct page *, struct page *);
 extern int migrate_pages(struct list_head *l, new_page_t x,
-                       unsigned long private, int offlining);
+                       unsigned long private, bool offlining,
+                       bool sync);
 extern int migrate_huge_pages(struct list_head *l, new_page_t x,
-                       unsigned long private, int offlining);
+                       unsigned long private, bool offlining,
+                       bool sync);
 
 extern int fail_migrate_page(struct address_space *,
                        struct page *, struct page *);
@@ -33,9 +35,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 
 static inline void putback_lru_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_page_t x,
-               unsigned long private, int offlining) { return -ENOSYS; }
+               unsigned long private, bool offlining,
+               bool sync) { return -ENOSYS; }
 static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
-               unsigned long private, int offlining) { return -ENOSYS; }
+               unsigned long private, bool offlining,
+               bool sync) { return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
 static inline int migrate_prep_local(void) { return -ENOSYS; }
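
migrate_pages() and migrate_huge_pages() now take two booleans: offlining (previously an int) and the new sync flag. A hedged example of a caller passing them explicitly; the callback name is made up, new_page_t is the existing allocation-callback type:

        /* Hypothetical caller: not offlining memory, migrate synchronously. */
        static int example_migrate_list(struct list_head *pages,
                                        new_page_t get_new_page)
        {
                return migrate_pages(pages, get_new_page, 0, false, true);
        }
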
index 721f451..956a355 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/mm_types.h>
 #include <linux/range.h>
 #include <linux/pfn.h>
+#include <linux/bit_spinlock.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -82,6 +83,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_GROWSUP     0x00000200
 #else
 #define VM_GROWSUP     0x00000000
+#define VM_NOHUGEPAGE  0x00000200      /* MADV_NOHUGEPAGE marked this vma */
 #endif
 #define VM_PFNMAP      0x00000400      /* Page-ranges managed without "struct page", just pure PFN */
 #define VM_DENYWRITE   0x00000800      /* ETXTBSY on write attempts.. */
@@ -101,7 +103,11 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_NORESERVE   0x00200000      /* should the VM suppress accounting */
 #define VM_HUGETLB     0x00400000      /* Huge TLB Page VM */
 #define VM_NONLINEAR   0x00800000      /* Is non-linear (remap_file_pages) */
+#ifndef CONFIG_TRANSPARENT_HUGEPAGE
 #define VM_MAPPED_COPY 0x01000000      /* T if mapped copy of data (nommu mmap) */
+#else
+#define VM_HUGEPAGE    0x01000000      /* MADV_HUGEPAGE marked this vma */
+#endif
 #define VM_INSERTPAGE  0x02000000      /* The vma has had "vm_insert_page()" done on it */
 #define VM_ALWAYSDUMP  0x04000000      /* Always include in core dumps */
 
@@ -242,6 +248,7 @@ struct inode;
  * files which need it (119 of them)
  */
 #include <linux/page-flags.h>
+#include <linux/huge_mm.h>
 
 /*
  * Methods to modify the page usage count.
@@ -305,6 +312,39 @@ static inline int is_vmalloc_or_module_addr(const void *x)
 }
 #endif
 
+static inline void compound_lock(struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       bit_spin_lock(PG_compound_lock, &page->flags);
+#endif
+}
+
+static inline void compound_unlock(struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       bit_spin_unlock(PG_compound_lock, &page->flags);
+#endif
+}
+
+static inline unsigned long compound_lock_irqsave(struct page *page)
+{
+       unsigned long uninitialized_var(flags);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       local_irq_save(flags);
+       compound_lock(page);
+#endif
+       return flags;
+}
+
+static inline void compound_unlock_irqrestore(struct page *page,
+                                             unsigned long flags)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       compound_unlock(page);
+       local_irq_restore(flags);
+#endif
+}
+
 static inline struct page *compound_head(struct page *page)
 {
        if (unlikely(PageTail(page)))
@@ -319,9 +359,29 @@ static inline int page_count(struct page *page)
 
 static inline void get_page(struct page *page)
 {
-       page = compound_head(page);
-       VM_BUG_ON(atomic_read(&page->_count) == 0);
+       /*
+        * Getting a normal page or the head of a compound page
+        * requires the caller to already hold an elevated page->_count.
+        * Only when we're getting a tail page is the elevated count
+        * required solely on the head page, so for tail pages the
+        * bugcheck only verifies that the page->_count isn't
+        * negative.
+        */
+       VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page));
        atomic_inc(&page->_count);
+       /*
+        * Getting a tail page will elevate both the head and tail
+        * page->_count(s).
+        */
+       if (unlikely(PageTail(page))) {
+               /*
+                * This is safe only because
+                * __split_huge_page_refcount can't run under
+                * get_page().
+                */
+               VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
+               atomic_inc(&page->first_page->_count);
+       }
 }
 
 static inline struct page *virt_to_head_page(const void *x)
@@ -339,6 +399,27 @@ static inline void init_page_count(struct page *page)
        atomic_set(&page->_count, 1);
 }
 
+/*
+ * PageBuddy() indicates that the page is free and in the buddy system
+ * (see mm/page_alloc.c).
+ */
+static inline int PageBuddy(struct page *page)
+{
+       return atomic_read(&page->_mapcount) == -2;
+}
+
+static inline void __SetPageBuddy(struct page *page)
+{
+       VM_BUG_ON(atomic_read(&page->_mapcount) != -1);
+       atomic_set(&page->_mapcount, -2);
+}
+
+static inline void __ClearPageBuddy(struct page *page)
+{
+       VM_BUG_ON(!PageBuddy(page));
+       atomic_set(&page->_mapcount, -1);
+}
+
 void put_page(struct page *page);
 void put_pages_list(struct list_head *pages);
 
@@ -370,12 +451,39 @@ static inline int compound_order(struct page *page)
        return (unsigned long)page[1].lru.prev;
 }
 
+static inline int compound_trans_order(struct page *page)
+{
+       int order;
+       unsigned long flags;
+
+       if (!PageHead(page))
+               return 0;
+
+       flags = compound_lock_irqsave(page);
+       order = compound_order(page);
+       compound_unlock_irqrestore(page, flags);
+       return order;
+}
+
 static inline void set_compound_order(struct page *page, unsigned long order)
 {
        page[1].lru.prev = (void *)order;
 }
 
 /*
+ * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
+ * servicing faults for write access.  In the normal case we always want
+ * pte_mkwrite.  But get_user_pages can cause write faults for mappings
+ * that do not have writing enabled, when used by access_process_vm.
+ */
+static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+       if (likely(vma->vm_flags & VM_WRITE))
+               pte = pte_mkwrite(pte);
+       return pte;
+}
+
+/*
  * Multiple processes may "see" the same page. E.g. for untouched
  * mappings of /dev/null, all processes see the same page full of
  * zeroes, and text pages of executables and shared libraries have
@@ -657,7 +765,7 @@ static inline struct address_space *page_mapping(struct page *page)
        VM_BUG_ON(PageSlab(page));
        if (unlikely(PageSwapCache(page)))
                mapping = &swapper_space;
-       else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON))
+       else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
                mapping = NULL;
        return mapping;
 }
@@ -1064,7 +1172,8 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
 #endif
 
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
+int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+               pmd_t *pmd, unsigned long address);
 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
 
 /*
@@ -1133,16 +1242,18 @@ static inline void pgtable_page_dtor(struct page *page)
        pte_unmap(pte);                                 \
 } while (0)
 
-#define pte_alloc_map(mm, pmd, address)                        \
-       ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
-               NULL: pte_offset_map(pmd, address))
+#define pte_alloc_map(mm, vma, pmd, address)                           \
+       ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma,    \
+                                                       pmd, address))? \
+        NULL: pte_offset_map(pmd, address))
 
 #define pte_alloc_map_lock(mm, pmd, address, ptlp)     \
-       ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
+       ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL,   \
+                                                       pmd, address))? \
                NULL: pte_offset_map_lock(mm, pmd, address, ptlp))
 
 #define pte_alloc_kernel(pmd, address)                 \
-       ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
+       ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
                NULL: pte_offset_kernel(pmd, address))
 
 extern void free_area_init(unsigned long * zones_size);
@@ -1415,6 +1526,8 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
 #define FOLL_GET       0x04    /* do get_page on page */
 #define FOLL_DUMP      0x08    /* give error on hole if it would be zero */
 #define FOLL_FORCE     0x10    /* get_user_pages read/write w/o permission */
+#define FOLL_MLOCK     0x40    /* mark page as mlocked */
+#define FOLL_SPLIT     0x80    /* don't return transhuge pages, split them */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
                        void *data);
@@ -1518,5 +1631,14 @@ static inline int is_hwpoison_address(unsigned long addr)
 
 extern void dump_page(struct page *page);
 
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+extern void clear_huge_page(struct page *page,
+                           unsigned long addr,
+                           unsigned int pages_per_huge_page);
+extern void copy_user_huge_page(struct page *dst, struct page *src,
+                               unsigned long addr, struct vm_area_struct *vma,
+                               unsigned int pages_per_huge_page);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
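
maybe_mkwrite(), moved into mm.h above, is the helper fault handlers use when building the pte for a page they have just installed. A minimal sketch of that usage, with a hypothetical wrapper name:

        /* Hypothetical wrapper: build the pte for a freshly faulted-in page,
         * marking it writable only when the vma itself allows writing. */
        static pte_t example_mk_fault_pte(struct page *page,
                                          struct vm_area_struct *vma, bool write)
        {
                pte_t entry = mk_pte(page, vma->vm_page_prot);

                if (write)
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                return entry;
        }
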
index 8835b87..8f7d247 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef LINUX_MM_INLINE_H
 #define LINUX_MM_INLINE_H
 
+#include <linux/huge_mm.h>
+
 /**
  * page_is_file_cache - should the page be on a file LRU or anon LRU?
  * @page: the page to test
@@ -20,18 +22,25 @@ static inline int page_is_file_cache(struct page *page)
 }
 
 static inline void
-add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
+__add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l,
+                      struct list_head *head)
 {
-       list_add(&page->lru, &zone->lru[l].list);
-       __inc_zone_state(zone, NR_LRU_BASE + l);
+       list_add(&page->lru, head);
+       __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page));
        mem_cgroup_add_lru_list(page, l);
 }
 
 static inline void
+add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
+{
+       __add_page_to_lru_list(zone, page, l, &zone->lru[l].list);
+}
+
+static inline void
 del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l)
 {
        list_del(&page->lru);
-       __dec_zone_state(zone, NR_LRU_BASE + l);
+       __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page));
        mem_cgroup_del_lru_list(page, l);
 }
 
@@ -66,7 +75,7 @@ del_page_from_lru(struct zone *zone, struct page *page)
                        l += LRU_ACTIVE;
                }
        }
-       __dec_zone_state(zone, NR_LRU_BASE + l);
+       __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page));
        mem_cgroup_del_lru_list(page, l);
 }
 
index bb7288a..26bc4e2 100644 (file)
@@ -310,6 +310,9 @@ struct mm_struct {
 #ifdef CONFIG_MMU_NOTIFIER
        struct mmu_notifier_mm *mmu_notifier_mm;
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       pgtable_t pmd_huge_pte; /* protected by page_table_lock */
+#endif
        /* How many tasks sharing this mm are OOM_DISABLE */
        atomic_t oom_disable_count;
 };
index bf17350..38d3930 100644 (file)
@@ -94,12 +94,12 @@ struct sh_mmcif_plat_data {
 
 static inline u32 sh_mmcif_readl(void __iomem *addr, int reg)
 {
-       return readl(addr + reg);
+       return __raw_readl(addr + reg);
 }
 
 static inline void sh_mmcif_writel(void __iomem *addr, int reg, u32 val)
 {
-       writel(val, addr + reg);
+       __raw_writel(val, addr + reg);
 }
 
 #define SH_MMCIF_BBS 512 /* boot block size */
index 43dcfbd..cc2e7df 100644 (file)
@@ -62,6 +62,16 @@ struct mmu_notifier_ops {
                                 unsigned long address);
 
        /*
+        * test_young is called to check the young/accessed bitflag in
+        * the secondary pte. This is used to know if the page is
+        * frequently used without actually clearing the flag or tearing
+        * down the secondary mapping on the page.
+        */
+       int (*test_young)(struct mmu_notifier *mn,
+                         struct mm_struct *mm,
+                         unsigned long address);
+
+       /*
         * change_pte is called in cases that pte mapping to page is changed:
         * for example, when ksm remaps pte to point to a new shared page.
         */
@@ -163,6 +173,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
 extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long address);
+extern int __mmu_notifier_test_young(struct mm_struct *mm,
+                                    unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
                                      unsigned long address, pte_t pte);
 extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -186,6 +198,14 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
        return 0;
 }
 
+static inline int mmu_notifier_test_young(struct mm_struct *mm,
+                                         unsigned long address)
+{
+       if (mm_has_notifiers(mm))
+               return __mmu_notifier_test_young(mm, address);
+       return 0;
+}
+
 static inline void mmu_notifier_change_pte(struct mm_struct *mm,
                                           unsigned long address, pte_t pte)
 {
@@ -243,6 +263,32 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
        __pte;                                                          \
 })
 
+#define pmdp_clear_flush_notify(__vma, __address, __pmdp)              \
+({                                                                     \
+       pmd_t __pmd;                                                    \
+       struct vm_area_struct *___vma = __vma;                          \
+       unsigned long ___address = __address;                           \
+       VM_BUG_ON(__address & ~HPAGE_PMD_MASK);                         \
+       mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address,  \
+                                           (__address)+HPAGE_PMD_SIZE);\
+       __pmd = pmdp_clear_flush(___vma, ___address, __pmdp);           \
+       mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address,    \
+                                         (__address)+HPAGE_PMD_SIZE);  \
+       __pmd;                                                          \
+})
+
+#define pmdp_splitting_flush_notify(__vma, __address, __pmdp)          \
+({                                                                     \
+       struct vm_area_struct *___vma = __vma;                          \
+       unsigned long ___address = __address;                           \
+       VM_BUG_ON(__address & ~HPAGE_PMD_MASK);                         \
+       mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address,  \
+                                           (__address)+HPAGE_PMD_SIZE);\
+       pmdp_splitting_flush(___vma, ___address, __pmdp);               \
+       mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address,    \
+                                         (__address)+HPAGE_PMD_SIZE);  \
+})
+
 #define ptep_clear_flush_young_notify(__vma, __address, __ptep)                \
 ({                                                                     \
        int __young;                                                    \
@@ -254,6 +300,17 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
        __young;                                                        \
 })
 
+#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp)                \
+({                                                                     \
+       int __young;                                                    \
+       struct vm_area_struct *___vma = __vma;                          \
+       unsigned long ___address = __address;                           \
+       __young = pmdp_clear_flush_young(___vma, ___address, __pmdp);   \
+       __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,        \
+                                                 ___address);          \
+       __young;                                                        \
+})
+
 #define set_pte_at_notify(__mm, __address, __ptep, __pte)              \
 ({                                                                     \
        struct mm_struct *___mm = __mm;                                 \
@@ -276,6 +333,12 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
        return 0;
 }
 
+static inline int mmu_notifier_test_young(struct mm_struct *mm,
+                                         unsigned long address)
+{
+       return 0;
+}
+
 static inline void mmu_notifier_change_pte(struct mm_struct *mm,
                                           unsigned long address, pte_t pte)
 {
@@ -305,7 +368,10 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 }
 
 #define ptep_clear_flush_young_notify ptep_clear_flush_young
+#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
 #define ptep_clear_flush_notify ptep_clear_flush
+#define pmdp_clear_flush_notify pmdp_clear_flush
+#define pmdp_splitting_flush_notify pmdp_splitting_flush
 #define set_pte_at_notify set_pte_at
 
 #endif /* CONFIG_MMU_NOTIFIER */
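
mmu_notifier_test_young() lets page-aging code ask secondary MMUs (e.g. KVM) whether a page was recently accessed without clearing the accessed bit or tearing down the secondary mapping. A sketch of the intended check; the helper name is illustrative:

        /* Illustrative: combine the primary pte's accessed bit with the
         * secondary-MMU query, without clearing either. */
        static bool example_page_recently_used(struct vm_area_struct *vma,
                                               unsigned long address, pte_t *pte)
        {
                if (pte_young(*pte))
                        return true;
                return mmu_notifier_test_young(vma->vm_mm, address) != 0;
        }
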
index 39c24eb..02ecb01 100644 (file)
@@ -114,6 +114,7 @@ enum zone_stat_item {
        NUMA_LOCAL,             /* allocation from local node */
        NUMA_OTHER,             /* allocation from other node */
 #endif
+       NR_ANON_TRANSPARENT_HUGEPAGES,
        NR_VM_ZONE_STAT_ITEMS };
 
 /*
@@ -458,12 +459,6 @@ static inline int zone_is_oom_locked(const struct zone *zone)
        return test_bit(ZONE_OOM_LOCKED, &zone->flags);
 }
 
-#ifdef CONFIG_SMP
-unsigned long zone_nr_free_pages(struct zone *zone);
-#else
-#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
-#endif /* CONFIG_SMP */
-
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
  * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
@@ -645,6 +640,7 @@ typedef struct pglist_data {
        wait_queue_head_t kswapd_wait;
        struct task_struct *kswapd;
        int kswapd_max_order;
+       enum zone_type classzone_idx;
 } pg_data_t;
 
 #define node_present_pages(nid)        (NODE_DATA(nid)->node_present_pages)
@@ -660,8 +656,10 @@ typedef struct pglist_data {
 
 extern struct mutex zonelists_mutex;
 void build_all_zonelists(void *data);
-void wakeup_kswapd(struct zone *zone, int order);
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+               int classzone_idx, int alloc_flags);
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
                int classzone_idx, int alloc_flags);
 enum memmap_context {
        MEMMAP_EARLY,
index 5f38c46..0db8037 100644 (file)
@@ -48,9 +48,6 @@
  * struct page (these bits with information) are always mapped into kernel
  * address space...
  *
- * PG_buddy is set to indicate that the page is free and in the buddy system
- * (see mm/page_alloc.c).
- *
  * PG_hwpoison indicates that a page got corrupted in hardware and contains
  * data with incorrect ECC bits that triggered a machine check. Accessing is
  * not safe since it may cause another machine check. Don't touch!
@@ -96,7 +93,6 @@ enum pageflags {
        PG_swapcache,           /* Swap page: swp_entry_t in private */
        PG_mappedtodisk,        /* Has blocks allocated on-disk */
        PG_reclaim,             /* To be reclaimed asap */
-       PG_buddy,               /* Page is free, on buddy lists */
        PG_swapbacked,          /* Page is backed by RAM/swap */
        PG_unevictable,         /* Page is "unevictable"  */
 #ifdef CONFIG_MMU
@@ -108,6 +104,9 @@ enum pageflags {
 #ifdef CONFIG_MEMORY_FAILURE
        PG_hwpoison,            /* hardware poisoned page. Don't touch */
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       PG_compound_lock,
+#endif
        __NR_PAGEFLAGS,
 
        /* Filesystems */
@@ -198,7 +197,7 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; }
 struct page;   /* forward declaration */
 
 TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked)
-PAGEFLAG(Error, error)
+PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
 PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
 PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
 PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
@@ -230,7 +229,6 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
  * risky: they bypass page accounting.
  */
 TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
-__PAGEFLAG(Buddy, buddy)
 PAGEFLAG(MappedToDisk, mappedtodisk)
 
 /* PG_readahead is only used for file reads; PG_reclaim is only for writes */
@@ -344,7 +342,7 @@ static inline void set_page_writeback(struct page *page)
  * tests can be used in performance sensitive paths. PageCompound is
  * generally not used in hot code paths.
  */
-__PAGEFLAG(Head, head)
+__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head)
 __PAGEFLAG(Tail, tail)
 
 static inline int PageCompound(struct page *page)
@@ -352,6 +350,13 @@ static inline int PageCompound(struct page *page)
        return page->flags & ((1L << PG_head) | (1L << PG_tail));
 
 }
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline void ClearPageCompound(struct page *page)
+{
+       BUG_ON(!PageHead(page));
+       ClearPageHead(page);
+}
+#endif
 #else
 /*
  * Reduce page flag use as much as possible by overlapping
@@ -389,14 +394,61 @@ static inline void __ClearPageTail(struct page *page)
        page->flags &= ~PG_head_tail_mask;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline void ClearPageCompound(struct page *page)
+{
+       BUG_ON((page->flags & PG_head_tail_mask) != (1 << PG_compound));
+       clear_bit(PG_compound, &page->flags);
+}
+#endif
+
 #endif /* !PAGEFLAGS_EXTENDED */
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * PageHuge() only returns true for hugetlbfs pages, but not for
+ * normal or transparent huge pages.
+ *
+ * PageTransHuge() returns true for both transparent huge and
+ * hugetlbfs pages, but not normal pages. PageTransHuge() can only be
+ * called in the core VM paths where hugetlbfs pages can't exist.
+ */
+static inline int PageTransHuge(struct page *page)
+{
+       VM_BUG_ON(PageTail(page));
+       return PageHead(page);
+}
+
+static inline int PageTransCompound(struct page *page)
+{
+       return PageCompound(page);
+}
+
+#else
+
+static inline int PageTransHuge(struct page *page)
+{
+       return 0;
+}
+
+static inline int PageTransCompound(struct page *page)
+{
+       return 0;
+}
+#endif
+
 #ifdef CONFIG_MMU
 #define __PG_MLOCKED           (1 << PG_mlocked)
 #else
 #define __PG_MLOCKED           0
 #endif
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define __PG_COMPOUND_LOCK             (1 << PG_compound_lock)
+#else
+#define __PG_COMPOUND_LOCK             0
+#endif
+
 /*
  * Flags checked when a page is freed.  Pages being freed should not have
  * these flags set.  It they are, there is a problem.
@@ -404,9 +456,10 @@ static inline void __ClearPageTail(struct page *page)
 #define PAGE_FLAGS_CHECK_AT_FREE \
        (1 << PG_lru     | 1 << PG_locked    | \
         1 << PG_private | 1 << PG_private_2 | \
-        1 << PG_buddy   | 1 << PG_writeback | 1 << PG_reserved | \
+        1 << PG_writeback | 1 << PG_reserved | \
         1 << PG_slab    | 1 << PG_swapcache | 1 << PG_active | \
-        1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON)
+        1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \
+        __PG_COMPOUND_LOCK)
 
 /*
  * Flags checked when a page is prepped for return by the page allocator.
index b02195d..5b0c971 100644 (file)
@@ -35,12 +35,18 @@ struct page_cgroup *lookup_page_cgroup(struct page *page);
 
 enum {
        /* flags for mem_cgroup */
-       PCG_LOCK,  /* page cgroup is locked */
+       PCG_LOCK,  /* Lock for pc->mem_cgroup and following bits. */
        PCG_CACHE, /* charged as cache */
        PCG_USED, /* this object is in use. */
-       PCG_ACCT_LRU, /* page has been accounted for */
-       PCG_FILE_MAPPED, /* page is accounted as "mapped" */
        PCG_MIGRATION, /* under page migration */
+       /* flags for mem_cgroup and file and I/O status */
+       PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */
+       PCG_FILE_MAPPED, /* page is accounted as "mapped" */
+       PCG_FILE_DIRTY, /* page is dirty */
+       PCG_FILE_WRITEBACK, /* page is under writeback */
+       PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */
+       /* No lock in page_cgroup */
+       PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */
 };
 
 #define TESTPCGFLAG(uname, lname)                      \
@@ -59,6 +65,10 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc)    \
 static inline int TestClearPageCgroup##uname(struct page_cgroup *pc)   \
        { return test_and_clear_bit(PCG_##lname, &pc->flags);  }
 
+#define TESTSETPCGFLAG(uname, lname)                   \
+static inline int TestSetPageCgroup##uname(struct page_cgroup *pc)     \
+       { return test_and_set_bit(PCG_##lname, &pc->flags);  }
+
 /* Cache flag is set only once (at allocation) */
 TESTPCGFLAG(Cache, CACHE)
 CLEARPCGFLAG(Cache, CACHE)
@@ -78,6 +88,22 @@ SETPCGFLAG(FileMapped, FILE_MAPPED)
 CLEARPCGFLAG(FileMapped, FILE_MAPPED)
 TESTPCGFLAG(FileMapped, FILE_MAPPED)
 
+SETPCGFLAG(FileDirty, FILE_DIRTY)
+CLEARPCGFLAG(FileDirty, FILE_DIRTY)
+TESTPCGFLAG(FileDirty, FILE_DIRTY)
+TESTCLEARPCGFLAG(FileDirty, FILE_DIRTY)
+TESTSETPCGFLAG(FileDirty, FILE_DIRTY)
+
+SETPCGFLAG(FileWriteback, FILE_WRITEBACK)
+CLEARPCGFLAG(FileWriteback, FILE_WRITEBACK)
+TESTPCGFLAG(FileWriteback, FILE_WRITEBACK)
+
+SETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
+CLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
+TESTPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
+TESTCLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
+TESTSETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
+
 SETPCGFLAG(Migration, MIGRATION)
 CLEARPCGFLAG(Migration, MIGRATION)
 TESTPCGFLAG(Migration, MIGRATION)
@@ -94,6 +120,10 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
 
 static inline void lock_page_cgroup(struct page_cgroup *pc)
 {
+       /*
+        * Don't take this lock in IRQ context.
+        * This lock is for pc->mem_cgroup, USED, CACHE, MIGRATION
+        */
        bit_spin_lock(PCG_LOCK, &pc->flags);
 }
 
@@ -107,6 +137,24 @@ static inline int page_is_cgroup_locked(struct page_cgroup *pc)
        return bit_spin_is_locked(PCG_LOCK, &pc->flags);
 }
 
+static inline void move_lock_page_cgroup(struct page_cgroup *pc,
+       unsigned long *flags)
+{
+       /*
+        * Updates to the page cache statistics bits in pc->flags can come
+        * from both process context and IRQ context, so disable IRQs here
+        * to avoid deadlock.
+        */
+       local_irq_save(*flags);
+       bit_spin_lock(PCG_MOVE_LOCK, &pc->flags);
+}
+
+static inline void move_unlock_page_cgroup(struct page_cgroup *pc,
+       unsigned long *flags)
+{
+       bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags);
+       local_irq_restore(*flags);
+}
+
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct page_cgroup;
 
index 2d1ffe3..9c66e99 100644 (file)
@@ -48,7 +48,7 @@ static inline void mapping_clear_unevictable(struct address_space *mapping)
 
 static inline int mapping_unevictable(struct address_space *mapping)
 {
-       if (likely(mapping))
+       if (mapping)
                return test_bit(AS_UNEVICTABLE, &mapping->flags);
        return !!mapping;
 }
index ab2baa5..23241c2 100644 (file)
@@ -146,6 +146,22 @@ static inline void *radix_tree_deref_slot(void **pslot)
 }
 
 /**
+ * radix_tree_deref_slot_protected     - dereference a slot without RCU lock but with tree lock held
+ * @pslot:     pointer to slot, returned by radix_tree_lookup_slot
+ * Returns:    item that was stored in that slot with any direct pointer flag
+ *             removed.
+ *
+ * Similar to radix_tree_deref_slot but only used during migration when a page's
+ * mapping is being moved. The caller does not hold the RCU read lock but it
+ * must hold the tree lock to prevent parallel updates.
+ */
+static inline void *radix_tree_deref_slot_protected(void **pslot,
+                                                       spinlock_t *treelock)
+{
+       return rcu_dereference_protected(*pslot, lockdep_is_held(treelock));
+}
+
+/**
  * radix_tree_deref_retry      - check radix_tree_deref_slot
  * @arg:       pointer returned by radix_tree_deref_slot
  * Returns:    0 if retry is not required, otherwise retry is required
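
radix_tree_deref_slot_protected() exists for the migration path, where the mapping's tree_lock is already held and no RCU read lock is taken. A hedged one-liner of that use; the wrapper name is made up:

        /* Hypothetical wrapper: caller holds mapping->tree_lock. */
        static struct page *example_peek_slot_locked(struct address_space *mapping,
                                                     void **pslot)
        {
                return radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
        }
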
index bb83c0d..e9fd04c 100644 (file)
@@ -198,6 +198,8 @@ enum ttu_flags {
 };
 #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
 
+bool is_vma_temporary_stack(struct vm_area_struct *vma);
+
 int try_to_unmap(struct page *, enum ttu_flags flags);
 int try_to_unmap_one(struct page *, struct vm_area_struct *,
                        unsigned long address, enum ttu_flags flags);
index 96e2321..d747f94 100644 (file)
@@ -21,7 +21,8 @@
 #define CLONE_DETACHED         0x00400000      /* Unused, ignored */
 #define CLONE_UNTRACED         0x00800000      /* set if the tracing process can't force CLONE_PTRACE on this clone */
 #define CLONE_CHILD_SETTID     0x01000000      /* set the TID in the child */
-#define CLONE_STOPPED          0x02000000      /* Start in stopped state */
+/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
+   and is now available for re-use. */
 #define CLONE_NEWUTS           0x04000000      /* New utsname group? */
 #define CLONE_NEWIPC           0x08000000      /* New ipcs */
 #define CLONE_NEWUSER          0x10000000      /* New user namespace */
@@ -433,6 +434,7 @@ extern int get_dumpable(struct mm_struct *mm);
 #endif
                                        /* leave room for more dump flags */
 #define MMF_VM_MERGEABLE       16      /* KSM may merge identical pages */
+#define MMF_VM_HUGEPAGE                17      /* set when VM_HUGEPAGE is set on vma */
 
 #define MMF_INIT_MASK          (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
 
@@ -633,6 +635,8 @@ struct signal_struct {
 
        int oom_adj;            /* OOM kill score adjustment (bit shift) */
        int oom_score_adj;      /* OOM kill score adjustment */
+       int oom_score_adj_min;  /* OOM kill score adjustment minimum value.
+                                * Only settable by CAP_SYS_RESOURCE. */
 
        struct mutex cred_guard_mutex;  /* guard against foreign influences on
                                         * credential calculations
index eba53e7..4d55932 100644 (file)
@@ -208,6 +208,8 @@ extern unsigned int nr_free_pagecache_pages(void);
 /* linux/mm/swap.c */
 extern void __lru_cache_add(struct page *, enum lru_list lru);
 extern void lru_cache_add_lru(struct page *, enum lru_list lru);
+extern void lru_add_page_tail(struct zone* zone,
+                             struct page *page, struct page *page_tail);
 extern void activate_page(struct page *);
 extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
index 44b54f6..4ed6fcd 100644 (file)
@@ -59,8 +59,9 @@ extern void *vmalloc_exec(unsigned long size);
 extern void *vmalloc_32(unsigned long size);
 extern void *vmalloc_32_user(unsigned long size);
 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
-extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask,
-                               pgprot_t prot);
+extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
+                       unsigned long start, unsigned long end, gfp_t gfp_mask,
+                       pgprot_t prot, int node, void *caller);
 extern void vfree(const void *addr);
 
 extern void *vmap(struct page **pages, unsigned int count,
@@ -90,9 +91,6 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size,
                                        unsigned long flags,
                                        unsigned long start, unsigned long end,
                                        void *caller);
-extern struct vm_struct *get_vm_area_node(unsigned long size,
-                                         unsigned long flags, int node,
-                                         gfp_t gfp_mask);
 extern struct vm_struct *remove_vm_area(const void *addr);
 
 extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
@@ -120,7 +118,7 @@ extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
 #ifdef CONFIG_SMP
 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
                                     const size_t *sizes, int nr_vms,
-                                    size_t align, gfp_t gfp_mask);
+                                    size_t align);
 
 void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms);
 #endif
index eaaea37..833e676 100644 (file)
@@ -254,6 +254,11 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
 extern void __dec_zone_state(struct zone *, enum zone_stat_item);
 
 void refresh_cpu_vm_stats(int);
+
+int calculate_pressure_threshold(struct zone *zone);
+int calculate_normal_threshold(struct zone *zone);
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+                               int (*calculate_pressure)(struct zone *));
 #else /* CONFIG_SMP */
 
 /*
@@ -298,6 +303,8 @@ static inline void __dec_zone_page_state(struct page *page,
 #define dec_zone_page_state __dec_zone_page_state
 #define mod_zone_page_state __mod_zone_page_state
 
+#define set_pgdat_percpu_threshold(pgdat, callback) { }
+
 static inline void refresh_cpu_vm_stats(int cpu) { }
 #endif
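
set_pgdat_percpu_threshold() with the two calculate_*_threshold() callbacks is aimed at kswapd: tighter (pressure) thresholds while it is awake so watermark checks stay accurate, normal thresholds restored while it sleeps. A simplified sketch of that pairing, assuming a bare-bones sleep path (real code sets the task state and checks for wakeups):

        static void example_kswapd_sleep(pg_data_t *pgdat)
        {
                /* cheap, larger thresholds are fine while kswapd sleeps */
                set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
                schedule();
                /* back under pressure: use the tighter thresholds again */
                set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
        }
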
 
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
new file mode 100644 (file)
index 0000000..388bcdd
--- /dev/null
@@ -0,0 +1,74 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM compaction
+
+#if !defined(_TRACE_COMPACTION_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_COMPACTION_H
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+#include "gfpflags.h"
+
+DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
+
+       TP_PROTO(unsigned long nr_scanned,
+               unsigned long nr_taken),
+
+       TP_ARGS(nr_scanned, nr_taken),
+
+       TP_STRUCT__entry(
+               __field(unsigned long, nr_scanned)
+               __field(unsigned long, nr_taken)
+       ),
+
+       TP_fast_assign(
+               __entry->nr_scanned = nr_scanned;
+               __entry->nr_taken = nr_taken;
+       ),
+
+       TP_printk("nr_scanned=%lu nr_taken=%lu",
+               __entry->nr_scanned,
+               __entry->nr_taken)
+);
+
+DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages,
+
+       TP_PROTO(unsigned long nr_scanned,
+               unsigned long nr_taken),
+
+       TP_ARGS(nr_scanned, nr_taken)
+);
+
+DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
+       TP_PROTO(unsigned long nr_scanned,
+               unsigned long nr_taken),
+
+       TP_ARGS(nr_scanned, nr_taken)
+);
+
+TRACE_EVENT(mm_compaction_migratepages,
+
+       TP_PROTO(unsigned long nr_migrated,
+               unsigned long nr_failed),
+
+       TP_ARGS(nr_migrated, nr_failed),
+
+       TP_STRUCT__entry(
+               __field(unsigned long, nr_migrated)
+               __field(unsigned long, nr_failed)
+       ),
+
+       TP_fast_assign(
+               __entry->nr_migrated = nr_migrated;
+               __entry->nr_failed = nr_failed;
+       ),
+
+       TP_printk("nr_migrated=%lu nr_failed=%lu",
+               __entry->nr_migrated,
+               __entry->nr_failed)
+);
+
+
+#endif /* _TRACE_COMPACTION_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
index c255fcc..ea422aa 100644 (file)
 
 #define trace_reclaim_flags(page, sync) ( \
        (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
-       (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC)   \
+       (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC)   \
        )
 
 #define trace_shrink_flags(file, sync) ( \
-       (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \
+       (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_MIXED : \
                        (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) |  \
-       (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \
+       (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \
        )
 
 TRACE_EVENT(mm_vmscan_kswapd_sleep,
index 89a2b2d..4e249b9 100644 (file)
@@ -81,6 +81,7 @@ DEFINE_EVENT(writeback_class, name, \
        TP_ARGS(bdi))
 
 DEFINE_WRITEBACK_EVENT(writeback_nowork);
+DEFINE_WRITEBACK_EVENT(writeback_wake_background);
 DEFINE_WRITEBACK_EVENT(writeback_wake_thread);
 DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread);
 DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h
new file mode 100644 (file)
index 0000000..eb23f41
--- /dev/null
@@ -0,0 +1,119 @@
+/******************************************************************************
+ * gntdev.h
+ * 
+ * Interface to /dev/xen/gntdev.
+ * 
+ * Copyright (c) 2007, D G Murray
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_PUBLIC_GNTDEV_H__
+#define __LINUX_PUBLIC_GNTDEV_H__
+
+struct ioctl_gntdev_grant_ref {
+       /* The domain ID of the grant to be mapped. */
+       uint32_t domid;
+       /* The grant reference of the grant to be mapped. */
+       uint32_t ref;
+};
+
+/*
+ * Inserts the grant references into the mapping table of an instance
+ * of gntdev. N.B. This does not perform the mapping, which is deferred
+ * until mmap() is called with @index as the offset.
+ */
+#define IOCTL_GNTDEV_MAP_GRANT_REF \
+_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref))
+struct ioctl_gntdev_map_grant_ref {
+       /* IN parameters */
+       /* The number of grants to be mapped. */
+       uint32_t count;
+       uint32_t pad;
+       /* OUT parameters */
+       /* The offset to be used on a subsequent call to mmap(). */
+       uint64_t index;
+       /* Variable IN parameter. */
+       /* Array of grant references, of size @count. */
+       struct ioctl_gntdev_grant_ref refs[1];
+};
+
+/*
+ * Removes the grant references from the mapping table of an instance
+ * of gntdev. N.B. munmap() must be called on the relevant virtual address(es)
+ * before this ioctl is called, or an error will result.
+ */
+#define IOCTL_GNTDEV_UNMAP_GRANT_REF \
+_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref))
+struct ioctl_gntdev_unmap_grant_ref {
+       /* IN parameters */
+       /* The offset returned by the corresponding map operation. */
+       uint64_t index;
+       /* The number of pages to be unmapped. */
+       uint32_t count;
+       uint32_t pad;
+};
+
+/*
+ * Returns the offset in the driver's address space that corresponds
+ * to @vaddr. This can be used to perform a munmap(), followed by an
+ * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by
+ * the caller. The number of pages that were allocated at the same time as
+ * @vaddr is returned in @count.
+ *
+ * N.B. Where more than one page has been mapped into a contiguous range, the
+ *      supplied @vaddr must correspond to the start of the range; otherwise
+ *      an error will result. It is only possible to munmap() the entire
+ *      contiguously-allocated range at once, and not any subrange thereof.
+ */
+#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \
+_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr))
+struct ioctl_gntdev_get_offset_for_vaddr {
+       /* IN parameters */
+       /* The virtual address of the first mapped page in a range. */
+       uint64_t vaddr;
+       /* OUT parameters */
+       /* The offset that was used in the initial mmap() operation. */
+       uint64_t offset;
+       /* The number of pages mapped in the VM area that begins at @vaddr. */
+       uint32_t count;
+       uint32_t pad;
+};
+
+/*
+ * Sets the maximum number of grants that may be mapped at once by this gntdev
+ * instance.
+ *
+ * N.B. This must be called before any other ioctl is performed on the device.
+ */
+#define IOCTL_GNTDEV_SET_MAX_GRANTS \
+_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants))
+struct ioctl_gntdev_set_max_grants {
+       /* IN parameter */
+       /* The maximum number of grants that may be mapped at once. */
+       uint32_t count;
+};
+
+#endif /* __LINUX_PUBLIC_GNTDEV_H__ */
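
The ioctl flow described in the header above (map, mmap() with the returned index, munmap(), then unmap) can be illustrated with a short userspace sketch. This is a hypothetical example only: the device node path, the granting domain id and the grant reference below are placeholder assumptions, the header install path is assumed, and error handling is reduced to early returns.

/* Hypothetical userspace sketch of the gntdev ioctl flow; placeholders noted inline. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include "gntdev.h"            /* the header introduced above; install path is an assumption */

int main(void)
{
	struct ioctl_gntdev_map_grant_ref map;
	struct ioctl_gntdev_unmap_grant_ref unmap;
	void *addr;
	int fd = open("/dev/xen/gntdev", O_RDWR);      /* assumed device node */

	if (fd < 0)
		return 1;

	memset(&map, 0, sizeof(map));
	map.count = 1;
	map.refs[0].domid = 1;          /* placeholder: domain that granted the page */
	map.refs[0].ref = 42;           /* placeholder: grant reference from that domain */
	if (ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &map) < 0)
		return 1;

	/* The mapping itself happens here, using map.index as the mmap() offset. */
	addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, map.index);
	if (addr == MAP_FAILED)
		return 1;

	printf("first byte of granted page: %d\n", *(char *)addr);

	/* munmap() must precede the UNMAP ioctl, as the comment above requires. */
	munmap(addr, 4096);
	memset(&unmap, 0, sizeof(unmap));
	unmap.index = map.index;
	unmap.count = 1;
	ioctl(fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &unmap);
	close(fd);
	return 0;
}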
index 9a73170..b1fab6b 100644 (file)
 #ifndef __ASM_GNTTAB_H__
 #define __ASM_GNTTAB_H__
 
-#include <asm/xen/hypervisor.h>
+#include <asm/page.h>
+
+#include <xen/interface/xen.h>
 #include <xen/interface/grant_table.h>
+
+#include <asm/xen/hypervisor.h>
 #include <asm/xen/grant_table.h>
 
+#include <xen/features.h>
+
 /* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
 #define NR_GRANT_FRAMES 4
 
@@ -107,6 +113,37 @@ void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
 void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
                                       unsigned long pfn);
 
+static inline void
+gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr,
+                 uint32_t flags, grant_ref_t ref, domid_t domid)
+{
+       if (flags & GNTMAP_contains_pte)
+               map->host_addr = addr;
+       else if (xen_feature(XENFEAT_auto_translated_physmap))
+               map->host_addr = __pa(addr);
+       else
+               map->host_addr = addr;
+
+       map->flags = flags;
+       map->ref = ref;
+       map->dom = domid;
+}
+
+static inline void
+gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr,
+                   uint32_t flags, grant_handle_t handle)
+{
+       if (flags & GNTMAP_contains_pte)
+               unmap->host_addr = addr;
+       else if (xen_feature(XENFEAT_auto_translated_physmap))
+               unmap->host_addr = __pa(addr);
+       else
+               unmap->host_addr = addr;
+
+       unmap->handle = handle;
+       unmap->dev_bus_addr = 0;
+}
+
 int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
                           unsigned long max_nr_gframes,
                           struct grant_entry **__shared);
@@ -118,4 +155,9 @@ unsigned int gnttab_max_grant_frames(void);
 
 #define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
 
+int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
+                   struct page **pages, unsigned int count);
+int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
+                     struct page **pages, unsigned int count);
+
 #endif /* __ASM_GNTTAB_H__ */
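
As a rough, hedged sketch of how a kernel driver might use the helpers added above (this is not the exact sequence the new gntdev driver follows), a single foreign grant could be mapped into an already allocated page and unmapped again as below. GNTMAP_host_map and GNTST_okay come from the Xen interface headers; everything else is a placeholder and error handling is minimal.

/* Hedged sketch only: map one foreign grant into @page, then unmap it. */
#include <linux/errno.h>
#include <linux/mm.h>
#include <xen/grant_table.h>

static int map_and_unmap_one_grant(grant_ref_t ref, domid_t domid,
				   struct page *page)
{
	struct gnttab_map_grant_ref map_op;
	struct gnttab_unmap_grant_ref unmap_op;
	unsigned long addr = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
	int err;

	/* GNTMAP_host_map asks for a mapping in this domain's address space. */
	gnttab_set_map_op(&map_op, addr, GNTMAP_host_map, ref, domid);
	err = gnttab_map_refs(&map_op, &page, 1);
	if (err || map_op.status != GNTST_okay)
		return -EFAULT;

	/* ... use the mapped page here ... */

	gnttab_set_unmap_op(&unmap_op, addr, GNTMAP_host_map, map_op.handle);
	return gnttab_unmap_refs(&unmap_op, &page, 1);
}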
index d9b44f2..25e4291 100644 (file)
@@ -66,6 +66,7 @@
 #include <linux/posix-timers.h>
 #include <linux/user-return-notifier.h>
 #include <linux/oom.h>
+#include <linux/khugepaged.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -330,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
        retval = ksm_fork(mm, oldmm);
        if (retval)
                goto out;
+       retval = khugepaged_fork(mm, oldmm);
+       if (retval)
+               goto out;
 
        prev = NULL;
        for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -529,6 +533,9 @@ void __mmdrop(struct mm_struct *mm)
        mm_free_pgd(mm);
        destroy_context(mm);
        mmu_notifier_mm_destroy(mm);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       VM_BUG_ON(mm->pmd_huge_pte);
+#endif
        free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -543,6 +550,7 @@ void mmput(struct mm_struct *mm)
        if (atomic_dec_and_test(&mm->mm_users)) {
                exit_aio(mm);
                ksm_exit(mm);
+               khugepaged_exit(mm); /* must run before exit_mmap */
                exit_mmap(mm);
                set_mm_exe_file(mm, NULL);
                if (!list_empty(&mm->mmlist)) {
@@ -669,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
        mm->token_priority = 0;
        mm->last_interval = 0;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       mm->pmd_huge_pte = NULL;
+#endif
+
        if (!mm_init(mm, tsk))
                goto fail_nomem;
 
@@ -910,6 +922,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 
        sig->oom_adj = current->signal->oom_adj;
        sig->oom_score_adj = current->signal->oom_score_adj;
+       sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
        mutex_init(&sig->cred_guard_mutex);
 
@@ -1410,23 +1423,6 @@ long do_fork(unsigned long clone_flags,
        }
 
        /*
-        * We hope to recycle these flags after 2.6.26
-        */
-       if (unlikely(clone_flags & CLONE_STOPPED)) {
-               static int __read_mostly count = 100;
-
-               if (count > 0 && printk_ratelimit()) {
-                       char comm[TASK_COMM_LEN];
-
-                       count--;
-                       printk(KERN_INFO "fork(): process `%s' used deprecated "
-                                       "clone flags 0x%lx\n",
-                               get_task_comm(comm, current),
-                               clone_flags & CLONE_STOPPED);
-               }
-       }
-
-       /*
         * When called from kernel_thread, don't do user tracing stuff.
         */
        if (likely(user_mode(regs)))
@@ -1464,16 +1460,7 @@ long do_fork(unsigned long clone_flags,
                 */
                p->flags &= ~PF_STARTING;
 
-               if (unlikely(clone_flags & CLONE_STOPPED)) {
-                       /*
-                        * We'll start up with an immediate SIGSTOP.
-                        */
-                       sigaddset(&p->pending.signal, SIGSTOP);
-                       set_tsk_thread_flag(p, TIF_SIGPENDING);
-                       __set_task_state(p, TASK_STOPPED);
-               } else {
-                       wake_up_new_task(p, clone_flags);
-               }
+               wake_up_new_task(p, clone_flags);
 
                tracehook_report_clone_complete(trace, regs,
                                                clone_flags, nr, p);
index 3019b92..5207563 100644 (file)
@@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
 {
        unsigned long address = (unsigned long)uaddr;
        struct mm_struct *mm = current->mm;
-       struct page *page;
+       struct page *page, *page_head;
        int err;
 
        /*
@@ -265,11 +265,46 @@ again:
        if (err < 0)
                return err;
 
-       page = compound_head(page);
-       lock_page(page);
-       if (!page->mapping) {
-               unlock_page(page);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       page_head = page;
+       if (unlikely(PageTail(page))) {
                put_page(page);
+               /* serialize against __split_huge_page_splitting() */
+               local_irq_disable();
+               if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+                       page_head = compound_head(page);
+                       /*
+                        * page_head is valid pointer but we must pin
+                        * it before taking the PG_lock and/or
+                        * PG_compound_lock. The moment we re-enable
+                        * irqs __split_huge_page_splitting() can
+                        * return and the head page can be freed from
+                        * under us. We can't take the PG_lock and/or
+                        * PG_compound_lock on a page that could be
+                        * freed from under us.
+                        */
+                       if (page != page_head) {
+                               get_page(page_head);
+                               put_page(page);
+                       }
+                       local_irq_enable();
+               } else {
+                       local_irq_enable();
+                       goto again;
+               }
+       }
+#else
+       page_head = compound_head(page);
+       if (page != page_head) {
+               get_page(page_head);
+               put_page(page);
+       }
+#endif
+
+       lock_page(page_head);
+       if (!page_head->mapping) {
+               unlock_page(page_head);
+               put_page(page_head);
                goto again;
        }
 
@@ -280,20 +315,20 @@ again:
         * it's a read-only handle, it's expected that futexes attach to
         * the object not the particular process.
         */
-       if (PageAnon(page)) {
+       if (PageAnon(page_head)) {
                key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
                key->private.mm = mm;
                key->private.address = address;
        } else {
                key->both.offset |= FUT_OFF_INODE; /* inode-based key */
-               key->shared.inode = page->mapping->host;
-               key->shared.pgoff = page->index;
+               key->shared.inode = page_head->mapping->host;
+               key->shared.pgoff = page_head->index;
        }
 
        get_futex_key_refs(key);
 
-       unlock_page(page);
-       put_page(page);
+       unlock_page(page_head);
+       put_page(page_head);
        return 0;
 }
 
index 9988d03..282f202 100644 (file)
@@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; }
 
 static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
 {
+       int cpu;
+
        desc->irq_data.irq = irq;
        desc->irq_data.chip = &no_irq_chip;
        desc->irq_data.chip_data = NULL;
@@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
        desc->irq_count = 0;
        desc->irqs_unhandled = 0;
        desc->name = NULL;
-       memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
+       for_each_possible_cpu(cpu)
+               *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
        desc_smp_init(desc, node);
 }
 
@@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
        if (!desc)
                return NULL;
        /* allocate based on nr_cpu_ids */
-       desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs),
-                                        gfp, node);
+       desc->kstat_irqs = alloc_percpu(unsigned int);
        if (!desc->kstat_irqs)
                goto err_desc;
 
@@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
        return desc;
 
 err_kstat:
-       kfree(desc->kstat_irqs);
+       free_percpu(desc->kstat_irqs);
 err_desc:
        kfree(desc);
        return NULL;
@@ -166,7 +168,7 @@ static void free_desc(unsigned int irq)
        mutex_unlock(&sparse_irq_lock);
 
        free_masks(desc);
-       kfree(desc->kstat_irqs);
+       free_percpu(desc->kstat_irqs);
        kfree(desc);
 }
 
@@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
        }
 };
 
-static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
 int __init early_irq_init(void)
 {
        int count, i, node = first_online_node;
@@ -250,7 +251,8 @@ int __init early_irq_init(void)
        for (i = 0; i < count; i++) {
                desc[i].irq_data.irq = i;
                desc[i].irq_data.chip = &no_irq_chip;
-               desc[i].kstat_irqs = kstat_irqs_all[i];
+               /* TODO : do this allocation on-demand ... */
+               desc[i].kstat_irqs = alloc_percpu(unsigned int);
                alloc_masks(desc + i, GFP_KERNEL, node);
                desc_smp_init(desc + i, node);
                lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -275,6 +277,22 @@ static void free_desc(unsigned int irq)
 
 static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
 {
+#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
+       struct irq_desc *desc;
+       unsigned int i;
+
+       for (i = 0; i < cnt; i++) {
+               desc = irq_to_desc(start + i);
+               if (desc && !desc->kstat_irqs) {
+                       unsigned int __percpu *stats = alloc_percpu(unsigned int);
+
+                       if (!stats)
+                               return -1;
+                       if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
+                               free_percpu(stats);
+               }
+       }
+#endif
        return start;
 }
 #endif /* !CONFIG_SPARSE_IRQ */
@@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq)
 unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
        struct irq_desc *desc = irq_to_desc(irq);
-       return desc ? desc->kstat_irqs[cpu] : 0;
+
+       return desc && desc->kstat_irqs ?
+                       *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
 }
 
 #ifdef CONFIG_GENERIC_HARDIRQS
@@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq)
        int cpu;
        int sum = 0;
 
-       if (!desc)
+       if (!desc || !desc->kstat_irqs)
                return 0;
        for_each_possible_cpu(cpu)
-               sum += desc->kstat_irqs[cpu];
+               sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
        return sum;
 }
 #endif /* CONFIG_GENERIC_HARDIRQS */
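
The conversion above replaces a static [NR_IRQS][NR_CPUS] array with per-CPU allocations. A minimal, self-contained sketch of the same idiom (made-up names, not kernel/irq code) looks like this; the summation mirrors the new kstat_irqs() loop.

/* Minimal sketch of the per-CPU counter idiom used above; names are made up. */
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/percpu.h>

static unsigned int __percpu *hits;

static int hits_init(void)
{
	hits = alloc_percpu(unsigned int);	/* one counter per possible CPU */
	return hits ? 0 : -ENOMEM;
}

static void hits_record(void)
{
	this_cpu_inc(*hits);			/* cheap increment on the local CPU */
}

static unsigned int hits_sum(void)
{
	unsigned int sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += *per_cpu_ptr(hits, cpu);	/* same read pattern as kstat_irqs() */
	return sum;
}

static void hits_exit(void)
{
	free_percpu(hits);
}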
index c2c8a4a..3ad483b 100644 (file)
@@ -302,6 +302,44 @@ config NOMMU_INITIAL_TRIM_EXCESS
 
          See Documentation/nommu-mmap.txt for more information.
 
+config TRANSPARENT_HUGEPAGE
+       bool "Transparent Hugepage Support"
+       depends on X86 && MMU
+       select COMPACTION
+       help
+         Transparent Hugepages allows the kernel to use huge pages and
+         huge TLB entries transparently for applications whenever possible.
+         This feature can improve performance for certain applications
+         by speeding up page faults during memory allocation, by reducing
+         the number of TLB misses and by speeding up page table walks.
+
+         If memory is constrained on an embedded system, you may want to
+         say N.
+
+choice
+       prompt "Transparent Hugepage Support sysfs defaults"
+       depends on TRANSPARENT_HUGEPAGE
+       default TRANSPARENT_HUGEPAGE_ALWAYS
+       help
+         Selects the sysfs defaults for Transparent Hugepage Support.
+
+       config TRANSPARENT_HUGEPAGE_ALWAYS
+               bool "always"
+       help
+         Enabling Transparent Hugepage always can increase the
+         memory footprint of applications without a guaranteed
+         benefit, but it will work automatically for all applications.
+
+       config TRANSPARENT_HUGEPAGE_MADVISE
+               bool "madvise"
+       help
+         Enabling Transparent Hugepage madvise will only provide a
+         performance benefit to applications that use
+         madvise(MADV_HUGEPAGE), and it won't risk increasing the
+         memory footprint of applications without a guaranteed
+         benefit.
+endchoice
+
 #
 # UP and nommu archs use km based percpu allocator
 #
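
For the "madvise" default above, an application has to opt in per mapping. A hedged userspace sketch follows; MADV_HUGEPAGE is assumed to be exported by the installed headers (the fallback value matches asm-generic/mman-common.h in this series), and the 2 MB alignment matches x86 HPAGE_PMD_SIZE and is an assumption for other configurations.

/* Hypothetical example: request transparent hugepages for one region. */
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* assumed value; check the installed headers */
#endif

int main(void)
{
	size_t len = 16 * 1024 * 1024;
	void *buf;

	/* Align to 2 MB so the region can be backed by pmd-sized pages. */
	if (posix_memalign(&buf, 2 * 1024 * 1024, len))
		return 1;

	/* With the "madvise" sysfs default, only advised regions get hugepages. */
	if (madvise(buf, len, MADV_HUGEPAGE))
		return 1;

	memset(buf, 0, len);	/* faults here may now be satisfied with hugepages */
	free(buf);
	return 0;
}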
index f73f75a..2b1b575 100644 (file)
@@ -5,7 +5,7 @@
 mmu-y                  := nommu.o
 mmu-$(CONFIG_MMU)      := fremap.o highmem.o madvise.o memory.o mincore.o \
                           mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
-                          vmalloc.o pagewalk.o
+                          vmalloc.o pagewalk.o pgtable-generic.o
 
 obj-y                  := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
                           maccess.o page_alloc.o page-writeback.o \
@@ -37,6 +37,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
index 1a8894e..6d592a0 100644 (file)
@@ -16,6 +16,9 @@
 #include <linux/sysfs.h>
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/compaction.h>
+
 /*
  * compact_control is used to track pages being migrated and the free pages
  * they are being migrated to during memory compaction. The free_pfn starts
@@ -30,6 +33,7 @@ struct compact_control {
        unsigned long nr_migratepages;  /* Number of pages to migrate */
        unsigned long free_pfn;         /* isolate_freepages search base */
        unsigned long migrate_pfn;      /* isolate_migratepages search base */
+       bool sync;                      /* Synchronous migration */
 
        /* Account for isolated anon and file pages */
        unsigned long nr_anon;
@@ -38,6 +42,8 @@ struct compact_control {
        unsigned int order;             /* order a direct compactor needs */
        int migratetype;                /* MOVABLE, RECLAIMABLE etc */
        struct zone *zone;
+
+       int compact_mode;
 };
 
 static unsigned long release_freepages(struct list_head *freelist)
@@ -60,7 +66,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
                                struct list_head *freelist)
 {
        unsigned long zone_end_pfn, end_pfn;
-       int total_isolated = 0;
+       int nr_scanned = 0, total_isolated = 0;
        struct page *cursor;
 
        /* Get the last PFN we should scan for free pages at */
@@ -81,6 +87,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
 
                if (!pfn_valid_within(blockpfn))
                        continue;
+               nr_scanned++;
 
                if (!PageBuddy(page))
                        continue;
@@ -100,6 +107,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
                }
        }
 
+       trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
        return total_isolated;
 }
 
@@ -234,6 +242,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
                                        struct compact_control *cc)
 {
        unsigned long low_pfn, end_pfn;
+       unsigned long last_pageblock_nr = 0, pageblock_nr;
+       unsigned long nr_scanned = 0, nr_isolated = 0;
        struct list_head *migratelist = &cc->migratepages;
 
        /* Do not scan outside zone boundaries */
@@ -266,20 +276,51 @@ static unsigned long isolate_migratepages(struct zone *zone,
                struct page *page;
                if (!pfn_valid_within(low_pfn))
                        continue;
+               nr_scanned++;
 
                /* Get the page and skip if free */
                page = pfn_to_page(low_pfn);
                if (PageBuddy(page))
                        continue;
 
+               /*
+                * For async migration, also only scan in MOVABLE blocks. Async
+                * migration is optimistic to see if the minimum amount of work
+                * satisfies the allocation
+                */
+               pageblock_nr = low_pfn >> pageblock_order;
+               if (!cc->sync && last_pageblock_nr != pageblock_nr &&
+                               get_pageblock_migratetype(page) != MIGRATE_MOVABLE) {
+                       low_pfn += pageblock_nr_pages;
+                       low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
+                       last_pageblock_nr = pageblock_nr;
+                       continue;
+               }
+
+               if (!PageLRU(page))
+                       continue;
+
+               /*
+                * PageLRU is set, and lru_lock excludes isolation,
+                * splitting and collapsing (collapsing has already
+                * happened if PageLRU is set).
+                */
+               if (PageTransHuge(page)) {
+                       low_pfn += (1 << compound_order(page)) - 1;
+                       continue;
+               }
+
                /* Try isolate the page */
                if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
                        continue;
 
+               VM_BUG_ON(PageTransCompound(page));
+
                /* Successfully isolated */
                del_page_from_lru_list(zone, page, page_lru(page));
                list_add(&page->lru, migratelist);
                cc->nr_migratepages++;
+               nr_isolated++;
 
                /* Avoid isolating too much */
                if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
@@ -291,6 +332,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
        spin_unlock_irq(&zone->lru_lock);
        cc->migrate_pfn = low_pfn;
 
+       trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
+
        return cc->nr_migratepages;
 }
 
@@ -341,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc)
 }
 
 static int compact_finished(struct zone *zone,
-                                               struct compact_control *cc)
+                           struct compact_control *cc)
 {
        unsigned int order;
-       unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order);
+       unsigned long watermark;
 
        if (fatal_signal_pending(current))
                return COMPACT_PARTIAL;
@@ -354,12 +397,27 @@ static int compact_finished(struct zone *zone,
                return COMPACT_COMPLETE;
 
        /* Compaction run is not finished if the watermark is not met */
+       if (cc->compact_mode != COMPACT_MODE_KSWAPD)
+               watermark = low_wmark_pages(zone);
+       else
+               watermark = high_wmark_pages(zone);
+       watermark += (1 << cc->order);
+
        if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
                return COMPACT_CONTINUE;
 
        if (cc->order == -1)
                return COMPACT_CONTINUE;
 
+       /*
+        * Generating only one page of the right order is not enough
+        * for kswapd, we must continue until we're above the high
+        * watermark as a pool for high order GFP_ATOMIC allocations
+        * too.
+        */
+       if (cc->compact_mode == COMPACT_MODE_KSWAPD)
+               return COMPACT_CONTINUE;
+
        /* Direct compactor: Is a suitable page free? */
        for (order = cc->order; order < MAX_ORDER; order++) {
                /* Job done if page is free of the right migratetype */
@@ -374,10 +432,62 @@ static int compact_finished(struct zone *zone,
        return COMPACT_CONTINUE;
 }
 
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ * Returns
+ *   COMPACT_SKIPPED  - If there are too few free pages for compaction
+ *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
+ *   COMPACT_CONTINUE - If compaction should run now
+ */
+unsigned long compaction_suitable(struct zone *zone, int order)
+{
+       int fragindex;
+       unsigned long watermark;
+
+       /*
+        * Watermarks for order-0 must be met for compaction. Note the 2UL.
+        * This is because during migration, copies of pages need to be
+        * allocated and for a short time, the footprint is higher
+        */
+       watermark = low_wmark_pages(zone) + (2UL << order);
+       if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+               return COMPACT_SKIPPED;
+
+       /*
+        * fragmentation index determines if allocation failures are due to
+        * low memory or external fragmentation
+        *
+        * index of -1 implies allocations might succeed depending on watermarks
+        * index towards 0 implies failure is due to lack of memory
+        * index towards 1000 implies failure is due to fragmentation
+        *
+        * Only compact if a failure would be due to fragmentation.
+        */
+       fragindex = fragmentation_index(zone, order);
+       if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+               return COMPACT_SKIPPED;
+
+       if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+               return COMPACT_PARTIAL;
+
+       return COMPACT_CONTINUE;
+}
+
 static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
        int ret;
 
+       ret = compaction_suitable(zone, cc->order);
+       switch (ret) {
+       case COMPACT_PARTIAL:
+       case COMPACT_SKIPPED:
+               /* Compaction is likely to fail */
+               return ret;
+       case COMPACT_CONTINUE:
+               /* Fall through to compaction */
+               ;
+       }
+
        /* Setup to move all movable pages to the end of the zone */
        cc->migrate_pfn = zone->zone_start_pfn;
        cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
@@ -393,7 +503,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
                nr_migrate = cc->nr_migratepages;
                migrate_pages(&cc->migratepages, compaction_alloc,
-                                               (unsigned long)cc, 0);
+                               (unsigned long)cc, false,
+                               cc->sync);
                update_nr_listpages(cc);
                nr_remaining = cc->nr_migratepages;
 
@@ -401,6 +512,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
                count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
                if (nr_remaining)
                        count_vm_events(COMPACTPAGEFAILED, nr_remaining);
+               trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
+                                               nr_remaining);
 
                /* Release LRU pages not migrated */
                if (!list_empty(&cc->migratepages)) {
@@ -417,8 +530,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
        return ret;
 }
 
-static unsigned long compact_zone_order(struct zone *zone,
-                                               int order, gfp_t gfp_mask)
+unsigned long compact_zone_order(struct zone *zone,
+                                int order, gfp_t gfp_mask,
+                                bool sync,
+                                int compact_mode)
 {
        struct compact_control cc = {
                .nr_freepages = 0,
@@ -426,6 +541,8 @@ static unsigned long compact_zone_order(struct zone *zone,
                .order = order,
                .migratetype = allocflags_to_migratetype(gfp_mask),
                .zone = zone,
+               .sync = sync,
+               .compact_mode = compact_mode,
        };
        INIT_LIST_HEAD(&cc.freepages);
        INIT_LIST_HEAD(&cc.migratepages);
@@ -441,16 +558,17 @@ int sysctl_extfrag_threshold = 500;
  * @order: The order of the current allocation
  * @gfp_mask: The GFP mask of the current allocation
  * @nodemask: The allowed nodes to allocate from
+ * @sync: Whether migration is synchronous or not
  *
  * This is the main entry point for direct page compaction.
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
-                       int order, gfp_t gfp_mask, nodemask_t *nodemask)
+                       int order, gfp_t gfp_mask, nodemask_t *nodemask,
+                       bool sync)
 {
        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
        int may_enter_fs = gfp_mask & __GFP_FS;
        int may_perform_io = gfp_mask & __GFP_IO;
-       unsigned long watermark;
        struct zoneref *z;
        struct zone *zone;
        int rc = COMPACT_SKIPPED;
@@ -460,7 +578,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
         * made because an assumption is made that the page allocator can satisfy
         * the "cheaper" orders without taking special steps
         */
-       if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io)
+       if (!order || !may_enter_fs || !may_perform_io)
                return rc;
 
        count_vm_event(COMPACTSTALL);
@@ -468,43 +586,14 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
        /* Compact each zone in the list */
        for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
                                                                nodemask) {
-               int fragindex;
                int status;
 
-               /*
-                * Watermarks for order-0 must be met for compaction. Note
-                * the 2UL. This is because during migration, copies of
-                * pages need to be allocated and for a short time, the
-                * footprint is higher
-                */
-               watermark = low_wmark_pages(zone) + (2UL << order);
-               if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
-                       continue;
-
-               /*
-                * fragmentation index determines if allocation failures are
-                * due to low memory or external fragmentation
-                *
-                * index of -1 implies allocations might succeed depending
-                *      on watermarks
-                * index towards 0 implies failure is due to lack of memory
-                * index towards 1000 implies failure is due to fragmentation
-                *
-                * Only compact if a failure would be due to fragmentation.
-                */
-               fragindex = fragmentation_index(zone, order);
-               if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
-                       continue;
-
-               if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
-                       rc = COMPACT_PARTIAL;
-                       break;
-               }
-
-               status = compact_zone_order(zone, order, gfp_mask);
+               status = compact_zone_order(zone, order, gfp_mask, sync,
+                                           COMPACT_MODE_DIRECT_RECLAIM);
                rc = max(status, rc);
 
-               if (zone_watermark_ok(zone, order, watermark, 0, 0))
+               /* If a normal allocation would succeed, stop compacting */
+               if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
                        break;
        }
 
@@ -531,6 +620,7 @@ static int compact_node(int nid)
                        .nr_freepages = 0,
                        .nr_migratepages = 0,
                        .order = -1,
+                       .compact_mode = COMPACT_MODE_DIRECT_RECLAIM,
                };
 
                zone = &pgdat->node_zones[zoneid];
index 4df2de7..03bf3bb 100644 (file)
@@ -324,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
                if (mem_flags & __GFP_WAIT) {
                        DECLARE_WAITQUEUE(wait, current);
 
-                       __set_current_state(TASK_INTERRUPTIBLE);
+                       __set_current_state(TASK_UNINTERRUPTIBLE);
                        __add_wait_queue(&pool->waitq, &wait);
                        spin_unlock_irqrestore(&pool->lock, flags);
 
@@ -355,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc);
 
 static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
 {
-       unsigned long flags;
        struct dma_page *page;
 
-       spin_lock_irqsave(&pool->lock, flags);
        list_for_each_entry(page, &pool->page_list, page_list) {
                if (dma < page->dma)
                        continue;
                if (dma < (page->dma + pool->allocation))
-                       goto done;
+                       return page;
        }
-       page = NULL;
- done:
-       spin_unlock_irqrestore(&pool->lock, flags);
-       return page;
+       return NULL;
 }
 
 /**
@@ -386,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
        unsigned long flags;
        unsigned int offset;
 
+       spin_lock_irqsave(&pool->lock, flags);
        page = pool_find_page(pool, dma);
        if (!page) {
+               spin_unlock_irqrestore(&pool->lock, flags);
                if (pool->dev)
                        dev_err(pool->dev,
                                "dma_pool_free %s, %p/%lx (bad dma)\n",
@@ -401,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
        offset = vaddr - page->vaddr;
 #ifdef DMAPOOL_DEBUG
        if ((dma - page->dma) != offset) {
+               spin_unlock_irqrestore(&pool->lock, flags);
                if (pool->dev)
                        dev_err(pool->dev,
                                "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
@@ -418,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
                                chain = *(int *)(page->vaddr + chain);
                                continue;
                        }
+                       spin_unlock_irqrestore(&pool->lock, flags);
                        if (pool->dev)
                                dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
                                        "already free\n", pool->name,
@@ -432,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
        memset(vaddr, POOL_POISON_FREED, pool->size);
 #endif
 
-       spin_lock_irqsave(&pool->lock, flags);
        page->in_use--;
        *(int *)vaddr = page->offset;
        page->offset = offset;
index ca38939..83a45d3 100644 (file)
@@ -298,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
                                continue;
 
                        wait_on_page_writeback(page);
-                       if (PageError(page))
+                       if (TestClearPageError(page))
                                ret = -EIO;
                }
                pagevec_release(&pvec);
@@ -837,9 +837,6 @@ repeat:
                if (radix_tree_deref_retry(page))
                        goto restart;
 
-               if (page->mapping == NULL || page->index != index)
-                       break;
-
                if (!page_cache_get_speculative(page))
                        goto repeat;
 
@@ -849,6 +846,16 @@ repeat:
                        goto repeat;
                }
 
+               /*
+                * must check mapping and index after taking the ref.
+                * otherwise we can get both false positives and false
+                * negatives, which is just confusing to the caller.
+                */
+               if (page->mapping == NULL || page->index != index) {
+                       page_cache_release(page);
+                       break;
+               }
+
                pages[ret] = page;
                ret++;
                index++;
@@ -2220,7 +2227,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
                gfp_notmask = __GFP_FS;
 repeat:
        page = find_lock_page(mapping, index);
-       if (likely(page))
+       if (page)
                return page;
 
        page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644 (file)
index 0000000..004c9c2
--- /dev/null
@@ -0,0 +1,2346 @@
+/*
+ *  Copyright (C) 2009  Red Hat, Inc.
+ *
+ *  This work is licensed under the terms of the GNU GPL, version 2. See
+ *  the COPYING file in the top-level directory.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/mmu_notifier.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/mm_inline.h>
+#include <linux/kthread.h>
+#include <linux/khugepaged.h>
+#include <linux/freezer.h>
+#include <linux/mman.h>
+#include <asm/tlb.h>
+#include <asm/pgalloc.h>
+#include "internal.h"
+
+/*
+ * By default transparent hugepage support is enabled for all mappings
+ * and khugepaged scans all mappings. Defrag is only invoked by
+ * khugepaged hugepage allocations and by page faults inside
+ * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
+ * allocations.
+ */
+unsigned long transparent_hugepage_flags __read_mostly =
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
+       (1<<TRANSPARENT_HUGEPAGE_FLAG)|
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
+       (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
+#endif
+       (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
+       (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+
+/* default scan 8*512 pte (or vmas) every 30 second */
+static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
+static unsigned int khugepaged_pages_collapsed;
+static unsigned int khugepaged_full_scans;
+static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
+/* during fragmentation poll the hugepage allocator once every minute */
+static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
+static struct task_struct *khugepaged_thread __read_mostly;
+static DEFINE_MUTEX(khugepaged_mutex);
+static DEFINE_SPINLOCK(khugepaged_mm_lock);
+static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
+/*
+ * By default, collapse hugepages if there is at least one pte mapped,
+ * just as would have happened if the vma had been large enough during
+ * the page fault.
+ */
+static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+
+static int khugepaged(void *none);
+static int mm_slots_hash_init(void);
+static int khugepaged_slab_init(void);
+static void khugepaged_slab_free(void);
+
+#define MM_SLOTS_HASH_HEADS 1024
+static struct hlist_head *mm_slots_hash __read_mostly;
+static struct kmem_cache *mm_slot_cache __read_mostly;
+
+/**
+ * struct mm_slot - hash lookup from mm to mm_slot
+ * @hash: hash collision list
+ * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+       struct hlist_node hash;
+       struct list_head mm_node;
+       struct mm_struct *mm;
+};
+
+/**
+ * struct khugepaged_scan - cursor for scanning
+ * @mm_head: the head of the mm list to scan
+ * @mm_slot: the current mm_slot we are scanning
+ * @address: the next address inside that to be scanned
+ *
+ * There is only the one khugepaged_scan instance of this cursor structure.
+ */
+struct khugepaged_scan {
+       struct list_head mm_head;
+       struct mm_slot *mm_slot;
+       unsigned long address;
+} khugepaged_scan = {
+       .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
+};
+
+
+static int set_recommended_min_free_kbytes(void)
+{
+       struct zone *zone;
+       int nr_zones = 0;
+       unsigned long recommended_min;
+       extern int min_free_kbytes;
+
+       if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
+                     &transparent_hugepage_flags) &&
+           !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+                     &transparent_hugepage_flags))
+               return 0;
+
+       for_each_populated_zone(zone)
+               nr_zones++;
+
+       /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
+       recommended_min = pageblock_nr_pages * nr_zones * 2;
+
+       /*
+        * Make sure that on average at least two pageblocks are almost free
+        * of another type, one for a migratetype to fall back to and a
+        * second to avoid subsequent fallbacks of other types. There are 3
+        * MIGRATE_TYPES we care about.
+        */
+       recommended_min += pageblock_nr_pages * nr_zones *
+                          MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
+
+       /* never allow reserving more than 5% of the lowmem */
+       recommended_min = min(recommended_min,
+                             (unsigned long) nr_free_buffer_pages() / 20);
+       recommended_min <<= (PAGE_SHIFT-10);
+
+       if (recommended_min > min_free_kbytes)
+               min_free_kbytes = recommended_min;
+       setup_per_zone_wmarks();
+       return 0;
+}
+late_initcall(set_recommended_min_free_kbytes);
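
To make the sizing above concrete, here is a worked example with assumed values (pageblock_nr_pages = 512, i.e. 2 MB pageblocks with 4 KB pages, three populated zones, MIGRATE_PCPTYPES = 3):

	recommended_min  = 512 * 3 * 2     =  3072 pages   (MIGRATE_RESERVE part)
	recommended_min += 512 * 3 * 3 * 3 = 13824 pages   (fallback pageblocks)
	                           total   = 16896 pages
	capped at nr_free_buffer_pages() / 20, then converted to kilobytes:
	16896 << (PAGE_SHIFT - 10) = 16896 * 4 = 67584 kB (~66 MB) proposed min_free_kbytes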
+
+static int start_khugepaged(void)
+{
+       int err = 0;
+       if (khugepaged_enabled()) {
+               int wakeup;
+               if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+               mutex_lock(&khugepaged_mutex);
+               if (!khugepaged_thread)
+                       khugepaged_thread = kthread_run(khugepaged, NULL,
+                                                       "khugepaged");
+               if (unlikely(IS_ERR(khugepaged_thread))) {
+                       printk(KERN_ERR
+                              "khugepaged: kthread_run(khugepaged) failed\n");
+                       err = PTR_ERR(khugepaged_thread);
+                       khugepaged_thread = NULL;
+               }
+               wakeup = !list_empty(&khugepaged_scan.mm_head);
+               mutex_unlock(&khugepaged_mutex);
+               if (wakeup)
+                       wake_up_interruptible(&khugepaged_wait);
+
+               set_recommended_min_free_kbytes();
+       } else
+               /* wakeup to exit */
+               wake_up_interruptible(&khugepaged_wait);
+out:
+       return err;
+}
+
+#ifdef CONFIG_SYSFS
+
+static ssize_t double_flag_show(struct kobject *kobj,
+                               struct kobj_attribute *attr, char *buf,
+                               enum transparent_hugepage_flag enabled,
+                               enum transparent_hugepage_flag req_madv)
+{
+       if (test_bit(enabled, &transparent_hugepage_flags)) {
+               VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
+               return sprintf(buf, "[always] madvise never\n");
+       } else if (test_bit(req_madv, &transparent_hugepage_flags))
+               return sprintf(buf, "always [madvise] never\n");
+       else
+               return sprintf(buf, "always madvise [never]\n");
+}
+static ssize_t double_flag_store(struct kobject *kobj,
+                                struct kobj_attribute *attr,
+                                const char *buf, size_t count,
+                                enum transparent_hugepage_flag enabled,
+                                enum transparent_hugepage_flag req_madv)
+{
+       if (!memcmp("always", buf,
+                   min(sizeof("always")-1, count))) {
+               set_bit(enabled, &transparent_hugepage_flags);
+               clear_bit(req_madv, &transparent_hugepage_flags);
+       } else if (!memcmp("madvise", buf,
+                          min(sizeof("madvise")-1, count))) {
+               clear_bit(enabled, &transparent_hugepage_flags);
+               set_bit(req_madv, &transparent_hugepage_flags);
+       } else if (!memcmp("never", buf,
+                          min(sizeof("never")-1, count))) {
+               clear_bit(enabled, &transparent_hugepage_flags);
+               clear_bit(req_madv, &transparent_hugepage_flags);
+       } else
+               return -EINVAL;
+
+       return count;
+}
+
+static ssize_t enabled_show(struct kobject *kobj,
+                           struct kobj_attribute *attr, char *buf)
+{
+       return double_flag_show(kobj, attr, buf,
+                               TRANSPARENT_HUGEPAGE_FLAG,
+                               TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+}
+static ssize_t enabled_store(struct kobject *kobj,
+                            struct kobj_attribute *attr,
+                            const char *buf, size_t count)
+{
+       ssize_t ret;
+
+       ret = double_flag_store(kobj, attr, buf, count,
+                               TRANSPARENT_HUGEPAGE_FLAG,
+                               TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+
+       if (ret > 0) {
+               int err = start_khugepaged();
+               if (err)
+                       ret = err;
+       }
+
+       if (ret > 0 &&
+           (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
+                     &transparent_hugepage_flags) ||
+            test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+                     &transparent_hugepage_flags)))
+               set_recommended_min_free_kbytes();
+
+       return ret;
+}
+static struct kobj_attribute enabled_attr =
+       __ATTR(enabled, 0644, enabled_show, enabled_store);
+
+static ssize_t single_flag_show(struct kobject *kobj,
+                               struct kobj_attribute *attr, char *buf,
+                               enum transparent_hugepage_flag flag)
+{
+       if (test_bit(flag, &transparent_hugepage_flags))
+               return sprintf(buf, "[yes] no\n");
+       else
+               return sprintf(buf, "yes [no]\n");
+}
+static ssize_t single_flag_store(struct kobject *kobj,
+                                struct kobj_attribute *attr,
+                                const char *buf, size_t count,
+                                enum transparent_hugepage_flag flag)
+{
+       if (!memcmp("yes", buf,
+                   min(sizeof("yes")-1, count))) {
+               set_bit(flag, &transparent_hugepage_flags);
+       } else if (!memcmp("no", buf,
+                          min(sizeof("no")-1, count))) {
+               clear_bit(flag, &transparent_hugepage_flags);
+       } else
+               return -EINVAL;
+
+       return count;
+}
+
+/*
+ * Currently defrag only controls whether the allocation may wait
+ * (__GFP_WAIT). A blind __GFP_REPEAT would be too aggressive: it's
+ * never worth swapping tons of memory just to allocate one more
+ * hugepage.
+ */
+static ssize_t defrag_show(struct kobject *kobj,
+                          struct kobj_attribute *attr, char *buf)
+{
+       return double_flag_show(kobj, attr, buf,
+                               TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
+                               TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
+}
+static ssize_t defrag_store(struct kobject *kobj,
+                           struct kobj_attribute *attr,
+                           const char *buf, size_t count)
+{
+       return double_flag_store(kobj, attr, buf, count,
+                                TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
+                                TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
+}
+static struct kobj_attribute defrag_attr =
+       __ATTR(defrag, 0644, defrag_show, defrag_store);
+
+#ifdef CONFIG_DEBUG_VM
+static ssize_t debug_cow_show(struct kobject *kobj,
+                               struct kobj_attribute *attr, char *buf)
+{
+       return single_flag_show(kobj, attr, buf,
+                               TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
+}
+static ssize_t debug_cow_store(struct kobject *kobj,
+                              struct kobj_attribute *attr,
+                              const char *buf, size_t count)
+{
+       return single_flag_store(kobj, attr, buf, count,
+                                TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
+}
+static struct kobj_attribute debug_cow_attr =
+       __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
+#endif /* CONFIG_DEBUG_VM */
+
+static struct attribute *hugepage_attr[] = {
+       &enabled_attr.attr,
+       &defrag_attr.attr,
+#ifdef CONFIG_DEBUG_VM
+       &debug_cow_attr.attr,
+#endif
+       NULL,
+};
+
+static struct attribute_group hugepage_attr_group = {
+       .attrs = hugepage_attr,
+};
+
+static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
+                                        struct kobj_attribute *attr,
+                                        char *buf)
+{
+       return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
+}
+
+static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
+                                         struct kobj_attribute *attr,
+                                         const char *buf, size_t count)
+{
+       unsigned long msecs;
+       int err;
+
+       err = strict_strtoul(buf, 10, &msecs);
+       if (err || msecs > UINT_MAX)
+               return -EINVAL;
+
+       khugepaged_scan_sleep_millisecs = msecs;
+       wake_up_interruptible(&khugepaged_wait);
+
+       return count;
+}
+static struct kobj_attribute scan_sleep_millisecs_attr =
+       __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
+              scan_sleep_millisecs_store);
+
+static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
+                                         struct kobj_attribute *attr,
+                                         char *buf)
+{
+       return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
+}
+
+static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
+                                          struct kobj_attribute *attr,
+                                          const char *buf, size_t count)
+{
+       unsigned long msecs;
+       int err;
+
+       err = strict_strtoul(buf, 10, &msecs);
+       if (err || msecs > UINT_MAX)
+               return -EINVAL;
+
+       khugepaged_alloc_sleep_millisecs = msecs;
+       wake_up_interruptible(&khugepaged_wait);
+
+       return count;
+}
+static struct kobj_attribute alloc_sleep_millisecs_attr =
+       __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
+              alloc_sleep_millisecs_store);
+
+static ssize_t pages_to_scan_show(struct kobject *kobj,
+                                 struct kobj_attribute *attr,
+                                 char *buf)
+{
+       return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
+}
+static ssize_t pages_to_scan_store(struct kobject *kobj,
+                                  struct kobj_attribute *attr,
+                                  const char *buf, size_t count)
+{
+       int err;
+       unsigned long pages;
+
+       err = strict_strtoul(buf, 10, &pages);
+       if (err || !pages || pages > UINT_MAX)
+               return -EINVAL;
+
+       khugepaged_pages_to_scan = pages;
+
+       return count;
+}
+static struct kobj_attribute pages_to_scan_attr =
+       __ATTR(pages_to_scan, 0644, pages_to_scan_show,
+              pages_to_scan_store);
+
+static ssize_t pages_collapsed_show(struct kobject *kobj,
+                                   struct kobj_attribute *attr,
+                                   char *buf)
+{
+       return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
+}
+static struct kobj_attribute pages_collapsed_attr =
+       __ATTR_RO(pages_collapsed);
+
+static ssize_t full_scans_show(struct kobject *kobj,
+                              struct kobj_attribute *attr,
+                              char *buf)
+{
+       return sprintf(buf, "%u\n", khugepaged_full_scans);
+}
+static struct kobj_attribute full_scans_attr =
+       __ATTR_RO(full_scans);
+
+static ssize_t khugepaged_defrag_show(struct kobject *kobj,
+                                     struct kobj_attribute *attr, char *buf)
+{
+       return single_flag_show(kobj, attr, buf,
+                               TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+}
+static ssize_t khugepaged_defrag_store(struct kobject *kobj,
+                                      struct kobj_attribute *attr,
+                                      const char *buf, size_t count)
+{
+       return single_flag_store(kobj, attr, buf, count,
+                                TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+}
+static struct kobj_attribute khugepaged_defrag_attr =
+       __ATTR(defrag, 0644, khugepaged_defrag_show,
+              khugepaged_defrag_store);
+
+/*
+ * max_ptes_none controls whether khugepaged should collapse hugepages
+ * over any unmapped ptes, in turn potentially increasing the memory
+ * footprint of the vmas. When max_ptes_none is 0, khugepaged will not
+ * reduce the available free memory in the system as it runs.
+ * Increasing max_ptes_none will instead potentially reduce the free
+ * memory in the system during the khugepaged scan.
+ */
+static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
+                                            struct kobj_attribute *attr,
+                                            char *buf)
+{
+       return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
+}
+static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
+                                             struct kobj_attribute *attr,
+                                             const char *buf, size_t count)
+{
+       int err;
+       unsigned long max_ptes_none;
+
+       err = strict_strtoul(buf, 10, &max_ptes_none);
+       if (err || max_ptes_none > HPAGE_PMD_NR-1)
+               return -EINVAL;
+
+       khugepaged_max_ptes_none = max_ptes_none;
+
+       return count;
+}
+static struct kobj_attribute khugepaged_max_ptes_none_attr =
+       __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
+              khugepaged_max_ptes_none_store);
+
+static struct attribute *khugepaged_attr[] = {
+       &khugepaged_defrag_attr.attr,
+       &khugepaged_max_ptes_none_attr.attr,
+       &pages_to_scan_attr.attr,
+       &pages_collapsed_attr.attr,
+       &full_scans_attr.attr,
+       &scan_sleep_millisecs_attr.attr,
+       &alloc_sleep_millisecs_attr.attr,
+       NULL,
+};
+
+static struct attribute_group khugepaged_attr_group = {
+       .attrs = khugepaged_attr,
+       .name = "khugepaged",
+};
+#endif /* CONFIG_SYSFS */
+
+static int __init hugepage_init(void)
+{
+       int err;
+#ifdef CONFIG_SYSFS
+       static struct kobject *hugepage_kobj;
+#endif
+
+       err = -EINVAL;
+       if (!has_transparent_hugepage()) {
+               transparent_hugepage_flags = 0;
+               goto out;
+       }
+
+#ifdef CONFIG_SYSFS
+       err = -ENOMEM;
+       hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
+       if (unlikely(!hugepage_kobj)) {
+               printk(KERN_ERR "hugepage: failed kobject create\n");
+               goto out;
+       }
+
+       err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
+       if (err) {
+               printk(KERN_ERR "hugepage: failed to register hugepage group\n");
+               goto out;
+       }
+
+       err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
+       if (err) {
+               printk(KERN_ERR "hugepage: failed to register khugepaged group\n");
+               goto out;
+       }
+#endif
+
+       err = khugepaged_slab_init();
+       if (err)
+               goto out;
+
+       err = mm_slots_hash_init();
+       if (err) {
+               khugepaged_slab_free();
+               goto out;
+       }
+
+       /*
+        * By default disable transparent hugepages on smaller systems,
+        * where the extra memory used could hurt more than the reduced
+        * TLB overhead is likely to save.  The admin can still enable it
+        * through /sys.
+        */
+       if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
+               transparent_hugepage_flags = 0;
+
+       start_khugepaged();
+
+       set_recommended_min_free_kbytes();
+
+out:
+       return err;
+}
+module_init(hugepage_init)
+
+static int __init setup_transparent_hugepage(char *str)
+{
+       int ret = 0;
+       if (!str)
+               goto out;
+       if (!strcmp(str, "always")) {
+               set_bit(TRANSPARENT_HUGEPAGE_FLAG,
+                       &transparent_hugepage_flags);
+               clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+                         &transparent_hugepage_flags);
+               ret = 1;
+       } else if (!strcmp(str, "madvise")) {
+               clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+                         &transparent_hugepage_flags);
+               set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+                       &transparent_hugepage_flags);
+               ret = 1;
+       } else if (!strcmp(str, "never")) {
+               clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+                         &transparent_hugepage_flags);
+               clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+                         &transparent_hugepage_flags);
+               ret = 1;
+       }
+out:
+       if (!ret)
+               printk(KERN_WARNING
+                      "transparent_hugepage= cannot be parsed, ignored\n");
+       return ret;
+}
+__setup("transparent_hugepage=", setup_transparent_hugepage);
+
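+/*
+ * Queue a preallocated pte pagetable on the per-mm list so it can be
+ * consumed later by get_pmd_huge_pte() if the huge pmd has to be split
+ * back into regular ptes. Caller must hold mm->page_table_lock.
+ */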
+static void prepare_pmd_huge_pte(pgtable_t pgtable,
+                                struct mm_struct *mm)
+{
+       assert_spin_locked(&mm->page_table_lock);
+
+       /* FIFO */
+       if (!mm->pmd_huge_pte)
+               INIT_LIST_HEAD(&pgtable->lru);
+       else
+               list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
+       mm->pmd_huge_pte = pgtable;
+}
+
+static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+{
+       if (likely(vma->vm_flags & VM_WRITE))
+               pmd = pmd_mkwrite(pmd);
+       return pmd;
+}
+
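+/*
+ * Second half of the anonymous huge page fault: preallocate a pte
+ * pagetable for a possible later split, clear and map the huge page,
+ * and install the huge pmd if the pmd is still empty by the time the
+ * page_table_lock is taken.
+ */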
+static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
+                                       struct vm_area_struct *vma,
+                                       unsigned long haddr, pmd_t *pmd,
+                                       struct page *page)
+{
+       int ret = 0;
+       pgtable_t pgtable;
+
+       VM_BUG_ON(!PageCompound(page));
+       pgtable = pte_alloc_one(mm, haddr);
+       if (unlikely(!pgtable)) {
+               mem_cgroup_uncharge_page(page);
+               put_page(page);
+               return VM_FAULT_OOM;
+       }
+
+       clear_huge_page(page, haddr, HPAGE_PMD_NR);
+       __SetPageUptodate(page);
+
+       spin_lock(&mm->page_table_lock);
+       if (unlikely(!pmd_none(*pmd))) {
+               spin_unlock(&mm->page_table_lock);
+               mem_cgroup_uncharge_page(page);
+               put_page(page);
+               pte_free(mm, pgtable);
+       } else {
+               pmd_t entry;
+               entry = mk_pmd(page, vma->vm_page_prot);
+               entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+               entry = pmd_mkhuge(entry);
+               /*
+                * The spinlocking to take the lru_lock inside
+                * page_add_new_anon_rmap() acts as a full memory
+                * barrier to be sure clear_huge_page writes become
+                * visible after the set_pmd_at() write.
+                */
+               page_add_new_anon_rmap(page, vma, haddr);
+               set_pmd_at(mm, haddr, pmd, entry);
+               prepare_pmd_huge_pte(pgtable, mm);
+               add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+               spin_unlock(&mm->page_table_lock);
+       }
+
+       return ret;
+}
+
+static inline gfp_t alloc_hugepage_gfpmask(int defrag)
+{
+       return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
+}
+
+static inline struct page *alloc_hugepage_vma(int defrag,
+                                             struct vm_area_struct *vma,
+                                             unsigned long haddr)
+{
+       return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
+                              HPAGE_PMD_ORDER, vma, haddr);
+}
+
+#ifndef CONFIG_NUMA
+static inline struct page *alloc_hugepage(int defrag)
+{
+       return alloc_pages(alloc_hugepage_gfpmask(defrag),
+                          HPAGE_PMD_ORDER);
+}
+#endif
+
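+/*
+ * Anonymous page fault entry point for a pmd with no page table yet.
+ * Install a huge page when the aligned HPAGE_PMD_SIZE range fits in
+ * the vma and a hugepage can be allocated and charged; otherwise fall
+ * back to the regular pte fault path.
+ */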
+int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+                              unsigned long address, pmd_t *pmd,
+                              unsigned int flags)
+{
+       struct page *page;
+       unsigned long haddr = address & HPAGE_PMD_MASK;
+       pte_t *pte;
+
+       if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
+               if (unlikely(anon_vma_prepare(vma)))
+                       return VM_FAULT_OOM;
+               if (unlikely(khugepaged_enter(vma)))
+                       return VM_FAULT_OOM;
+               page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+                                         vma, haddr);
+               if (unlikely(!page))
+                       goto out;
+               if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+                       put_page(page);
+                       goto out;
+               }
+
+               return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
+       }
+out:
+       /*
+        * Use __pte_alloc instead of pte_alloc_map, because we can't
+        * run pte_offset_map on the pmd, if a huge pmd could
+        * materialize from under us from a different thread.
+        */
+       if (unlikely(__pte_alloc(mm, vma, pmd, address)))
+               return VM_FAULT_OOM;
+       /* if a huge pmd materialized from under us just retry later */
+       if (unlikely(pmd_trans_huge(*pmd)))
+               return 0;
+       /*
+        * A regular pmd is established and it can't morph into a huge pmd
+        * from under us anymore at this point because we hold the mmap_sem
+        * read mode and khugepaged takes it in write mode. So now it's
+        * safe to run pte_offset_map().
+        */
+       pte = pte_offset_map(pmd, address);
+       return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+}
+
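+/*
+ * fork() support: copy a huge pmd from src_mm into dst_mm. The
+ * hugepage is shared and both pmds end up write protected. Returns
+ * -EAGAIN (so the caller retries with the regular pte path) if the
+ * source pmd is no longer huge or was in the middle of a split.
+ */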
+int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+                 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+                 struct vm_area_struct *vma)
+{
+       struct page *src_page;
+       pmd_t pmd;
+       pgtable_t pgtable;
+       int ret;
+
+       ret = -ENOMEM;
+       pgtable = pte_alloc_one(dst_mm, addr);
+       if (unlikely(!pgtable))
+               goto out;
+
+       spin_lock(&dst_mm->page_table_lock);
+       spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
+
+       ret = -EAGAIN;
+       pmd = *src_pmd;
+       if (unlikely(!pmd_trans_huge(pmd))) {
+               pte_free(dst_mm, pgtable);
+               goto out_unlock;
+       }
+       if (unlikely(pmd_trans_splitting(pmd))) {
+               /* split huge page running from under us */
+               spin_unlock(&src_mm->page_table_lock);
+               spin_unlock(&dst_mm->page_table_lock);
+               pte_free(dst_mm, pgtable);
+
+               wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
+               goto out;
+       }
+       src_page = pmd_page(pmd);
+       VM_BUG_ON(!PageHead(src_page));
+       get_page(src_page);
+       page_dup_rmap(src_page);
+       add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+
+       pmdp_set_wrprotect(src_mm, addr, src_pmd);
+       pmd = pmd_mkold(pmd_wrprotect(pmd));
+       set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+       prepare_pmd_huge_pte(pgtable, dst_mm);
+
+       ret = 0;
+out_unlock:
+       spin_unlock(&src_mm->page_table_lock);
+       spin_unlock(&dst_mm->page_table_lock);
+out:
+       return ret;
+}
+
+/* no "address" argument so destroys page coloring of some arch */
+pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
+{
+       pgtable_t pgtable;
+
+       assert_spin_locked(&mm->page_table_lock);
+
+       /* FIFO */
+       pgtable = mm->pmd_huge_pte;
+       if (list_empty(&pgtable->lru))
+               mm->pmd_huge_pte = NULL;
+       else {
+               mm->pmd_huge_pte = list_entry(pgtable->lru.next,
+                                             struct page, lru);
+               list_del(&pgtable->lru);
+       }
+       return pgtable;
+}
+
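+/*
+ * COW fallback used when a new hugepage cannot be allocated: copy the
+ * data into HPAGE_PMD_NR small pages and replace the huge pmd with a
+ * regular pagetable mapping them.
+ */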
+static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
+                                       struct vm_area_struct *vma,
+                                       unsigned long address,
+                                       pmd_t *pmd, pmd_t orig_pmd,
+                                       struct page *page,
+                                       unsigned long haddr)
+{
+       pgtable_t pgtable;
+       pmd_t _pmd;
+       int ret = 0, i;
+       struct page **pages;
+
+       pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
+                       GFP_KERNEL);
+       if (unlikely(!pages)) {
+               ret |= VM_FAULT_OOM;
+               goto out;
+       }
+
+       for (i = 0; i < HPAGE_PMD_NR; i++) {
+               pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
+                                         vma, address);
+               if (unlikely(!pages[i] ||
+                            mem_cgroup_newpage_charge(pages[i], mm,
+                                                      GFP_KERNEL))) {
+                       if (pages[i])
+                               put_page(pages[i]);
+                       mem_cgroup_uncharge_start();
+                       while (--i >= 0) {
+                               mem_cgroup_uncharge_page(pages[i]);
+                               put_page(pages[i]);
+                       }
+                       mem_cgroup_uncharge_end();
+                       kfree(pages);
+                       ret |= VM_FAULT_OOM;
+                       goto out;
+               }
+       }
+
+       for (i = 0; i < HPAGE_PMD_NR; i++) {
+               copy_user_highpage(pages[i], page + i,
+                                  haddr + PAGE_SIZE*i, vma);
+               __SetPageUptodate(pages[i]);
+               cond_resched();
+       }
+
+       spin_lock(&mm->page_table_lock);
+       if (unlikely(!pmd_same(*pmd, orig_pmd)))
+               goto out_free_pages;
+       VM_BUG_ON(!PageHead(page));
+
+       pmdp_clear_flush_notify(vma, haddr, pmd);
+       /* leave pmd empty until pte is filled */
+
+       pgtable = get_pmd_huge_pte(mm);
+       pmd_populate(mm, &_pmd, pgtable);
+
+       for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+               pte_t *pte, entry;
+               entry = mk_pte(pages[i], vma->vm_page_prot);
+               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+               page_add_new_anon_rmap(pages[i], vma, haddr);
+               pte = pte_offset_map(&_pmd, haddr);
+               VM_BUG_ON(!pte_none(*pte));
+               set_pte_at(mm, haddr, pte, entry);
+               pte_unmap(pte);
+       }
+       kfree(pages);
+
+       mm->nr_ptes++;
+       smp_wmb(); /* make pte visible before pmd */
+       pmd_populate(mm, pmd, pgtable);
+       page_remove_rmap(page);
+       spin_unlock(&mm->page_table_lock);
+
+       ret |= VM_FAULT_WRITE;
+       put_page(page);
+
+out:
+       return ret;
+
+out_free_pages:
+       spin_unlock(&mm->page_table_lock);
+       mem_cgroup_uncharge_start();
+       for (i = 0; i < HPAGE_PMD_NR; i++) {
+               mem_cgroup_uncharge_page(pages[i]);
+               put_page(pages[i]);
+       }
+       mem_cgroup_uncharge_end();
+       kfree(pages);
+       goto out;
+}
+
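+/*
+ * Write-protection fault on a huge pmd. If the hugepage is mapped only
+ * once it is simply made writable; otherwise the data is copied into a
+ * new hugepage, or into small pages through the fallback above when
+ * the huge allocation fails.
+ */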
+int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+                       unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
+{
+       int ret = 0;
+       struct page *page, *new_page;
+       unsigned long haddr;
+
+       VM_BUG_ON(!vma->anon_vma);
+       spin_lock(&mm->page_table_lock);
+       if (unlikely(!pmd_same(*pmd, orig_pmd)))
+               goto out_unlock;
+
+       page = pmd_page(orig_pmd);
+       VM_BUG_ON(!PageCompound(page) || !PageHead(page));
+       haddr = address & HPAGE_PMD_MASK;
+       if (page_mapcount(page) == 1) {
+               pmd_t entry;
+               entry = pmd_mkyoung(orig_pmd);
+               entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+               if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
+                       update_mmu_cache(vma, address, entry);
+               ret |= VM_FAULT_WRITE;
+               goto out_unlock;
+       }
+       get_page(page);
+       spin_unlock(&mm->page_table_lock);
+
+       if (transparent_hugepage_enabled(vma) &&
+           !transparent_hugepage_debug_cow())
+               new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+                                             vma, haddr);
+       else
+               new_page = NULL;
+
+       if (unlikely(!new_page)) {
+               ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
+                                                  pmd, orig_pmd, page, haddr);
+               put_page(page);
+               goto out;
+       }
+
+       if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+               put_page(new_page);
+               put_page(page);
+               ret |= VM_FAULT_OOM;
+               goto out;
+       }
+
+       copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
+       __SetPageUptodate(new_page);
+
+       spin_lock(&mm->page_table_lock);
+       put_page(page);
+       if (unlikely(!pmd_same(*pmd, orig_pmd))) {
+               mem_cgroup_uncharge_page(new_page);
+               put_page(new_page);
+       } else {
+               pmd_t entry;
+               VM_BUG_ON(!PageHead(page));
+               entry = mk_pmd(new_page, vma->vm_page_prot);
+               entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+               entry = pmd_mkhuge(entry);
+               pmdp_clear_flush_notify(vma, haddr, pmd);
+               page_add_new_anon_rmap(new_page, vma, haddr);
+               set_pmd_at(mm, haddr, pmd, entry);
+               update_mmu_cache(vma, address, entry);
+               page_remove_rmap(page);
+               put_page(page);
+               ret |= VM_FAULT_WRITE;
+       }
+out_unlock:
+       spin_unlock(&mm->page_table_lock);
+out:
+       return ret;
+}
+
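+/*
+ * follow_page() helper for huge pmds: return the subpage of the
+ * hugepage that maps addr, honouring FOLL_WRITE, FOLL_TOUCH and
+ * FOLL_GET. Caller must hold mm->page_table_lock.
+ */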
+struct page *follow_trans_huge_pmd(struct mm_struct *mm,
+                                  unsigned long addr,
+                                  pmd_t *pmd,
+                                  unsigned int flags)
+{
+       struct page *page = NULL;
+
+       assert_spin_locked(&mm->page_table_lock);
+
+       if (flags & FOLL_WRITE && !pmd_write(*pmd))
+               goto out;
+
+       page = pmd_page(*pmd);
+       VM_BUG_ON(!PageHead(page));
+       if (flags & FOLL_TOUCH) {
+               pmd_t _pmd;
+               /*
+                * We should set the dirty bit only for FOLL_WRITE, but
+                * for now the dirty bit in the pmd is meaningless.
+                * If the dirty bit ever becomes meaningful and we only
+                * set it for FOLL_WRITE, an atomic set_bit will be
+                * required on the pmd to set the young bit, instead of
+                * the current set_pmd_at.
+                */
+               _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+               set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
+       }
+       page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
+       VM_BUG_ON(!PageCompound(page));
+       if (flags & FOLL_GET)
+               get_page(page);
+
+out:
+       return page;
+}
+
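+/*
+ * Tear down a huge pmd at munmap/exit time: clear the pmd, drop the
+ * rmap and the MM_ANONPAGES counter, free the preallocated pagetable
+ * and hand the page to the mmu_gather. Returns 1 if a huge pmd was
+ * zapped, 0 if the pmd wasn't (or is no longer) huge.
+ */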
+int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+                pmd_t *pmd)
+{
+       int ret = 0;
+
+       spin_lock(&tlb->mm->page_table_lock);
+       if (likely(pmd_trans_huge(*pmd))) {
+               if (unlikely(pmd_trans_splitting(*pmd))) {
+                       spin_unlock(&tlb->mm->page_table_lock);
+                       wait_split_huge_page(vma->anon_vma,
+                                            pmd);
+               } else {
+                       struct page *page;
+                       pgtable_t pgtable;
+                       pgtable = get_pmd_huge_pte(tlb->mm);
+                       page = pmd_page(*pmd);
+                       pmd_clear(pmd);
+                       page_remove_rmap(page);
+                       VM_BUG_ON(page_mapcount(page) < 0);
+                       add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+                       VM_BUG_ON(!PageHead(page));
+                       spin_unlock(&tlb->mm->page_table_lock);
+                       tlb_remove_page(tlb, page);
+                       pte_free(tlb->mm, pgtable);
+                       ret = 1;
+               }
+       } else
+               spin_unlock(&tlb->mm->page_table_lock);
+
+       return ret;
+}
+
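+/*
+ * mincore() helper: if the pmd is stably huge, every page in the range
+ * is resident, so fill the whole vector with 1 and return 1. Return 0
+ * (so the caller falls back to the pte walk) otherwise.
+ */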
+int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+               unsigned long addr, unsigned long end,
+               unsigned char *vec)
+{
+       int ret = 0;
+
+       spin_lock(&vma->vm_mm->page_table_lock);
+       if (likely(pmd_trans_huge(*pmd))) {
+               ret = !pmd_trans_splitting(*pmd);
+               spin_unlock(&vma->vm_mm->page_table_lock);
+               if (unlikely(!ret))
+                       wait_split_huge_page(vma->anon_vma, pmd);
+               else {
+                       /*
+                        * All logical pages in the range are present
+                        * if backed by a huge page.
+                        */
+                       memset(vec, 1, (end - addr) >> PAGE_SHIFT);
+               }
+       } else
+               spin_unlock(&vma->vm_mm->page_table_lock);
+
+       return ret;
+}
+
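+/*
+ * mprotect() helper: change the protection of a huge pmd in place.
+ * Returns 1 if the pmd was huge and updated, 0 if the caller has to
+ * fall back to the pte walk.
+ */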
+int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+               unsigned long addr, pgprot_t newprot)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       int ret = 0;
+
+       spin_lock(&mm->page_table_lock);
+       if (likely(pmd_trans_huge(*pmd))) {
+               if (unlikely(pmd_trans_splitting(*pmd))) {
+                       spin_unlock(&mm->page_table_lock);
+                       wait_split_huge_page(vma->anon_vma, pmd);
+               } else {
+                       pmd_t entry;
+
+                       entry = pmdp_get_and_clear(mm, addr, pmd);
+                       entry = pmd_modify(entry, newprot);
+                       set_pmd_at(mm, addr, pmd, entry);
+                       spin_unlock(&vma->vm_mm->page_table_lock);
+                       flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
+                       ret = 1;
+               }
+       } else
+               spin_unlock(&vma->vm_mm->page_table_lock);
+
+       return ret;
+}
+
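+/*
+ * Return the pmd that maps "page" at "address" in "mm", or NULL if
+ * there is none. "flag" selects whether a pmd with the splitting bit
+ * set (or clear) is acceptable. Callers are expected to hold
+ * mm->page_table_lock.
+ */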
+pmd_t *page_check_address_pmd(struct page *page,
+                             struct mm_struct *mm,
+                             unsigned long address,
+                             enum page_check_address_pmd_flag flag)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd, *ret = NULL;
+
+       if (address & ~HPAGE_PMD_MASK)
+               goto out;
+
+       pgd = pgd_offset(mm, address);
+       if (!pgd_present(*pgd))
+               goto out;
+
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               goto out;
+
+       pmd = pmd_offset(pud, address);
+       if (pmd_none(*pmd))
+               goto out;
+       if (pmd_page(*pmd) != page)
+               goto out;
+       /*
+        * split_vma() may create temporary aliased mappings. There is
+        * no risk as long as all huge pmds are found and have their
+        * splitting bit set before __split_huge_page_refcount
+        * runs. Finding the same huge pmd more than once during the
+        * same rmap walk is not a problem.
+        */
+       if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
+           pmd_trans_splitting(*pmd))
+               goto out;
+       if (pmd_trans_huge(*pmd)) {
+               VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
+                         !pmd_trans_splitting(*pmd));
+               ret = pmd;
+       }
+out:
+       return ret;
+}
+
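+/*
+ * First step of the split for one mapping: mark the pmd as splitting
+ * (with pmdp_splitting_flush_notify) so the mapcount of the head page
+ * can no longer change while the refcounts are transferred to the
+ * tail pages. Returns 1 if a huge pmd was marked, 0 otherwise.
+ */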
+static int __split_huge_page_splitting(struct page *page,
+                                      struct vm_area_struct *vma,
+                                      unsigned long address)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       pmd_t *pmd;
+       int ret = 0;
+
+       spin_lock(&mm->page_table_lock);
+       pmd = page_check_address_pmd(page, mm, address,
+                                    PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
+       if (pmd) {
+               /*
+                * We can't temporarily set the pmd to null in order
+                * to split it; the pmd must remain marked huge at all
+                * times or the VM won't take the pmd_trans_huge paths
+                * and it won't wait on the anon_vma->root->lock to
+                * serialize against split_huge_page*.
+                */
+               pmdp_splitting_flush_notify(vma, address, pmd);
+               ret = 1;
+       }
+       spin_unlock(&mm->page_table_lock);
+
+       return ret;
+}
+
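+/*
+ * Second step of the split: transfer refcounts, mapcounts, page flags
+ * and LRU state from the head page to the tail pages and clear the
+ * compound state, turning the hugepage into HPAGE_PMD_NR independent
+ * pages. Runs under zone->lru_lock and the compound lock.
+ */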
+static void __split_huge_page_refcount(struct page *page)
+{
+       int i;
+       unsigned long head_index = page->index;
+       struct zone *zone = page_zone(page);
+       int zonestat;
+
+       /* prevent PageLRU to go away from under us, and freeze lru stats */
+       spin_lock_irq(&zone->lru_lock);
+       compound_lock(page);
+
+       for (i = 1; i < HPAGE_PMD_NR; i++) {
+               struct page *page_tail = page + i;
+
+               /* tail_page->_count cannot change */
+               atomic_sub(atomic_read(&page_tail->_count), &page->_count);
+               BUG_ON(page_count(page) <= 0);
+               atomic_add(page_mapcount(page) + 1, &page_tail->_count);
+               BUG_ON(atomic_read(&page_tail->_count) <= 0);
+
+               /* after clearing PageTail the gup refcount can be released */
+               smp_mb();
+
+               page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+               page_tail->flags |= (page->flags &
+                                    ((1L << PG_referenced) |
+                                     (1L << PG_swapbacked) |
+                                     (1L << PG_mlocked) |
+                                     (1L << PG_uptodate)));
+               page_tail->flags |= (1L << PG_dirty);
+
+               /*
+                * 1) clear PageTail before overwriting first_page
+                * 2) clear PageTail before clearing PageHead for VM_BUG_ON
+                */
+               smp_wmb();
+
+               /*
+                * __split_huge_page_splitting() already set the
+                * splitting bit in all pmd that could map this
+                * hugepage, that will ensure no CPU can alter the
+                * mapcount on the head page. The mapcount is only
+                * accounted in the head page and it has to be
+                * transferred to all tail pages in the below code. So
+                * for this code to be safe, the split the mapcount
+                * can't change. But that doesn't mean userland can't
+                * keep changing and reading the page contents while
+                * we transfer the mapcount, so the pmd splitting
+                * status is achieved setting a reserved bit in the
+                * pmd, not by clearing the present bit.
+               */
+               BUG_ON(page_mapcount(page_tail));
+               page_tail->_mapcount = page->_mapcount;
+
+               BUG_ON(page_tail->mapping);
+               page_tail->mapping = page->mapping;
+
+               page_tail->index = ++head_index;
+
+               BUG_ON(!PageAnon(page_tail));
+               BUG_ON(!PageUptodate(page_tail));
+               BUG_ON(!PageDirty(page_tail));
+               BUG_ON(!PageSwapBacked(page_tail));
+
+               lru_add_page_tail(zone, page, page_tail);
+       }
+
+       __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+       __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
+
+       /*
+        * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
+        * so adjust those appropriately if this page is on the LRU.
+        */
+       if (PageLRU(page)) {
+               zonestat = NR_LRU_BASE + page_lru(page);
+               __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
+       }
+
+       ClearPageCompound(page);
+       compound_unlock(page);
+       spin_unlock_irq(&zone->lru_lock);
+
+       for (i = 1; i < HPAGE_PMD_NR; i++) {
+               struct page *page_tail = page + i;
+               BUG_ON(page_count(page_tail) <= 0);
+               /*
+                * Tail pages may be freed if there wasn't any mapping,
+                * e.g. if add_to_swap() is running on an lru page that
+                * had its mapping zapped. And freeing these pages
+                * requires taking the lru_lock so we do the put_page
+                * of the tail pages after the split is complete.
+                */
+               put_page(page_tail);
+       }
+
+       /*
+        * Only the head page (now become a regular page) is required
+        * to be pinned by the caller.
+        */
+       BUG_ON(page_count(page) <= 0);
+}
+
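+/*
+ * Third step of the split for one mapping: replace the (still
+ * splitting) huge pmd with a regular pagetable mapping the now
+ * independent small pages, taking care never to have huge and small
+ * TLB entries loaded for the same virtual range at the same time.
+ */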
+static int __split_huge_page_map(struct page *page,
+                                struct vm_area_struct *vma,
+                                unsigned long address)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       pmd_t *pmd, _pmd;
+       int ret = 0, i;
+       pgtable_t pgtable;
+       unsigned long haddr;
+
+       spin_lock(&mm->page_table_lock);
+       pmd = page_check_address_pmd(page, mm, address,
+                                    PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
+       if (pmd) {
+               pgtable = get_pmd_huge_pte(mm);
+               pmd_populate(mm, &_pmd, pgtable);
+
+               for (i = 0, haddr = address; i < HPAGE_PMD_NR;
+                    i++, haddr += PAGE_SIZE) {
+                       pte_t *pte, entry;
+                       BUG_ON(PageCompound(page+i));
+                       entry = mk_pte(page + i, vma->vm_page_prot);
+                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+                       if (!pmd_write(*pmd))
+                               entry = pte_wrprotect(entry);
+                       else
+                               BUG_ON(page_mapcount(page) != 1);
+                       if (!pmd_young(*pmd))
+                               entry = pte_mkold(entry);
+                       pte = pte_offset_map(&_pmd, haddr);
+                       BUG_ON(!pte_none(*pte));
+                       set_pte_at(mm, haddr, pte, entry);
+                       pte_unmap(pte);
+               }
+
+               mm->nr_ptes++;
+               smp_wmb(); /* make pte visible before pmd */
+               /*
+                * Up to this point the pmd is present and huge and
+                * userland has the whole access to the hugepage
+                * during the split (which happens in place). If we
+                * overwrite the pmd with the not-huge version
+                * pointing to the pte here (which of course we could
+                * if all CPUs were bug free), userland could trigger
+                * a small page size TLB miss on the small sized TLB
+                * while the hugepage TLB entry is still established
+                * in the huge TLB. Some CPUs don't like that. See
+                * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
+                * Erratum 383 on page 93. Intel should be safe but
+                * also warns that it's only safe if the permission
+                * and cache attributes of the two entries loaded in
+                * the two TLBs are identical (which should be the case
+                * here). But it is generally safer to never allow
+                * small and huge TLB entries for the same virtual
+                * address to be loaded simultaneously. So instead of
+                * doing "pmd_populate(); flush_tlb_range();" we first
+                * mark the current pmd notpresent (atomically because
+                * here the pmd_trans_huge and pmd_trans_splitting
+                * must remain set at all times on the pmd until the
+                * split is complete for this pmd), then we flush the
+                * SMP TLB and finally we write the non-huge version
+                * of the pmd entry with pmd_populate.
+                */
+               set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
+               flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+               pmd_populate(mm, pmd, pgtable);
+               ret = 1;
+       }
+       spin_unlock(&mm->page_table_lock);
+
+       return ret;
+}
+
+/* must be called with anon_vma->root->lock held */
+static void __split_huge_page(struct page *page,
+                             struct anon_vma *anon_vma)
+{
+       int mapcount, mapcount2;
+       struct anon_vma_chain *avc;
+
+       BUG_ON(!PageHead(page));
+       BUG_ON(PageTail(page));
+
+       mapcount = 0;
+       list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+               struct vm_area_struct *vma = avc->vma;
+               unsigned long addr = vma_address(page, vma);
+               BUG_ON(is_vma_temporary_stack(vma));
+               if (addr == -EFAULT)
+                       continue;
+               mapcount += __split_huge_page_splitting(page, vma, addr);
+       }
+       /*
+        * It is critical that new vmas are added to the tail of the
+        * anon_vma list. This guarantees that if copy_huge_pmd() runs
+        * and establishes a child pmd before
+        * __split_huge_page_splitting() freezes the parent pmd (so if
+        * we fail to prevent copy_huge_pmd() from running until the
+        * whole __split_huge_page() is complete), we will still see
+        * the newly established pmd of the child later during the
+        * walk, to be able to set it as pmd_trans_splitting too.
+        */
+       if (mapcount != page_mapcount(page))
+               printk(KERN_ERR "mapcount %d page_mapcount %d\n",
+                      mapcount, page_mapcount(page));
+       BUG_ON(mapcount != page_mapcount(page));
+
+       __split_huge_page_refcount(page);
+
+       mapcount2 = 0;
+       list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+               struct vm_area_struct *vma = avc->vma;
+               unsigned long addr = vma_address(page, vma);
+               BUG_ON(is_vma_temporary_stack(vma));
+               if (addr == -EFAULT)
+                       continue;
+               mapcount2 += __split_huge_page_map(page, vma, addr);
+       }
+       if (mapcount != mapcount2)
+               printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
+                      mapcount, mapcount2, page_mapcount(page));
+       BUG_ON(mapcount != mapcount2);
+}
+
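+/*
+ * Split a hugepage into normal pages. Returns 0 if the page was split
+ * or was already no longer compound, non-zero if its anon_vma could
+ * not be looked up. The caller must hold a reference on the page.
+ */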
+int split_huge_page(struct page *page)
+{
+       struct anon_vma *anon_vma;
+       int ret = 1;
+
+       BUG_ON(!PageAnon(page));
+       anon_vma = page_lock_anon_vma(page);
+       if (!anon_vma)
+               goto out;
+       ret = 0;
+       if (!PageCompound(page))
+               goto out_unlock;
+
+       BUG_ON(!PageSwapBacked(page));
+       __split_huge_page(page, anon_vma);
+
+       BUG_ON(PageCompound(page));
+out_unlock:
+       page_unlock_anon_vma(anon_vma);
+out:
+       return ret;
+}
+
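+/*
+ * madvise(MADV_HUGEPAGE/MADV_NOHUGEPAGE) backend: update VM_HUGEPAGE
+ * and VM_NOHUGEPAGE in *vm_flags and, for MADV_HUGEPAGE, register the
+ * mm with khugepaged right away. Special mappings are rejected with
+ * -EINVAL.
+ */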
+int hugepage_madvise(struct vm_area_struct *vma,
+                    unsigned long *vm_flags, int advice)
+{
+       switch (advice) {
+       case MADV_HUGEPAGE:
+               /*
+                * Be somewhat over-protective like KSM for now!
+                */
+               if (*vm_flags & (VM_HUGEPAGE |
+                                VM_SHARED   | VM_MAYSHARE   |
+                                VM_PFNMAP   | VM_IO      | VM_DONTEXPAND |
+                                VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
+                                VM_MIXEDMAP | VM_SAO))
+                       return -EINVAL;
+               *vm_flags &= ~VM_NOHUGEPAGE;
+               *vm_flags |= VM_HUGEPAGE;
+               /*
+                * If the vma becomes suitable for khugepaged to scan,
+                * register it here without waiting for a page fault
+                * that may not happen any time soon.
+                */
+               if (unlikely(khugepaged_enter_vma_merge(vma)))
+                       return -ENOMEM;
+               break;
+       case MADV_NOHUGEPAGE:
+               /*
+                * Be somewhat over-protective like KSM for now!
+                */
+               if (*vm_flags & (VM_NOHUGEPAGE |
+                                VM_SHARED   | VM_MAYSHARE   |
+                                VM_PFNMAP   | VM_IO      | VM_DONTEXPAND |
+                                VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
+                                VM_MIXEDMAP | VM_SAO))
+                       return -EINVAL;
+               *vm_flags &= ~VM_HUGEPAGE;
+               *vm_flags |= VM_NOHUGEPAGE;
+               /*
+                * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
+                * this vma even if we leave the mm registered in khugepaged if
+                * it got registered before VM_NOHUGEPAGE was set.
+                */
+               break;
+       }
+
+       return 0;
+}
+
+static int __init khugepaged_slab_init(void)
+{
+       mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
+                                         sizeof(struct mm_slot),
+                                         __alignof__(struct mm_slot), 0, NULL);
+       if (!mm_slot_cache)
+               return -ENOMEM;
+
+       return 0;
+}
+
+static void __init khugepaged_slab_free(void)
+{
+       kmem_cache_destroy(mm_slot_cache);
+       mm_slot_cache = NULL;
+}
+
+static inline struct mm_slot *alloc_mm_slot(void)
+{
+       if (!mm_slot_cache)     /* initialization failed */
+               return NULL;
+       return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
+}
+
+static inline void free_mm_slot(struct mm_slot *mm_slot)
+{
+       kmem_cache_free(mm_slot_cache, mm_slot);
+}
+
+static int __init mm_slots_hash_init(void)
+{
+       mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
+                               GFP_KERNEL);
+       if (!mm_slots_hash)
+               return -ENOMEM;
+       return 0;
+}
+
+#if 0
+static void __init mm_slots_hash_free(void)
+{
+       kfree(mm_slots_hash);
+       mm_slots_hash = NULL;
+}
+#endif
+
+static struct mm_slot *get_mm_slot(struct mm_struct *mm)
+{
+       struct mm_slot *mm_slot;
+       struct hlist_head *bucket;
+       struct hlist_node *node;
+
+       bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
+                               % MM_SLOTS_HASH_HEADS];
+       hlist_for_each_entry(mm_slot, node, bucket, hash) {
+               if (mm == mm_slot->mm)
+                       return mm_slot;
+       }
+       return NULL;
+}
+
+static void insert_to_mm_slots_hash(struct mm_struct *mm,
+                                   struct mm_slot *mm_slot)
+{
+       struct hlist_head *bucket;
+
+       bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
+                               % MM_SLOTS_HASH_HEADS];
+       mm_slot->mm = mm;
+       hlist_add_head(&mm_slot->hash, bucket);
+}
+
+static inline int khugepaged_test_exit(struct mm_struct *mm)
+{
+       return atomic_read(&mm->mm_users) == 0;
+}
+
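+/*
+ * Register an mm with khugepaged: allocate an mm_slot, add it to the
+ * hash and to the tail of the scan list, and pin the mm_struct with an
+ * extra mm_count reference. Wakes khugepaged if the scan list was
+ * empty.
+ */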
+int __khugepaged_enter(struct mm_struct *mm)
+{
+       struct mm_slot *mm_slot;
+       int wakeup;
+
+       mm_slot = alloc_mm_slot();
+       if (!mm_slot)
+               return -ENOMEM;
+
+       /* __khugepaged_exit() must not run from under us */
+       VM_BUG_ON(khugepaged_test_exit(mm));
+       if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
+               free_mm_slot(mm_slot);
+               return 0;
+       }
+
+       spin_lock(&khugepaged_mm_lock);
+       insert_to_mm_slots_hash(mm, mm_slot);
+       /*
+        * Insert just behind the scanning cursor, to let the area settle
+        * down a little.
+        */
+       wakeup = list_empty(&khugepaged_scan.mm_head);
+       list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
+       spin_unlock(&khugepaged_mm_lock);
+
+       atomic_inc(&mm->mm_count);
+       if (wakeup)
+               wake_up_interruptible(&khugepaged_wait);
+
+       return 0;
+}
+
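+/*
+ * Called when vma flags change or vmas are merged: register the mm
+ * with khugepaged if the vma is already faulted in, is a regular
+ * anonymous mapping and spans at least one aligned HPAGE_PMD_SIZE
+ * range.
+ */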
+int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+{
+       unsigned long hstart, hend;
+       if (!vma->anon_vma)
+               /*
+                * Not yet faulted in so we will register later in the
+                * page fault if needed.
+                */
+               return 0;
+       if (vma->vm_file || vma->vm_ops)
+               /* khugepaged not yet working on file or special mappings */
+               return 0;
+       VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+       hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+       hend = vma->vm_end & HPAGE_PMD_MASK;
+       if (hstart < hend)
+               return khugepaged_enter(vma);
+       return 0;
+}
+
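+/*
+ * Unregister an exiting mm. If khugepaged is not currently scanning
+ * this mm the slot is freed immediately; otherwise we only synchronize
+ * with khugepaged by taking mmap_sem for writing and let
+ * collect_mm_slot() free the slot later.
+ */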
+void __khugepaged_exit(struct mm_struct *mm)
+{
+       struct mm_slot *mm_slot;
+       int free = 0;
+
+       spin_lock(&khugepaged_mm_lock);
+       mm_slot = get_mm_slot(mm);
+       if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+               hlist_del(&mm_slot->hash);
+               list_del(&mm_slot->mm_node);
+               free = 1;
+       }
+
+       if (free) {
+               spin_unlock(&khugepaged_mm_lock);
+               clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+               free_mm_slot(mm_slot);
+               mmdrop(mm);
+       } else if (mm_slot) {
+               spin_unlock(&khugepaged_mm_lock);
+               /*
+                * This is required to serialize against
+                * khugepaged_test_exit() (which is guaranteed to run
+                * under mmap_sem read mode). Stop here (after we
+                * return, all pagetables will be destroyed) until
+                * khugepaged has finished working on the pagetables
+                * under the mmap_sem.
+                */
+               down_write(&mm->mmap_sem);
+               up_write(&mm->mmap_sem);
+       } else
+               spin_unlock(&khugepaged_mm_lock);
+}
+
+static void release_pte_page(struct page *page)
+{
+       /* 0 stands for page_is_file_cache(page) == false */
+       dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
+       unlock_page(page);
+       putback_lru_page(page);
+}
+
+static void release_pte_pages(pte_t *pte, pte_t *_pte)
+{
+       while (--_pte >= pte) {
+               pte_t pteval = *_pte;
+               if (!pte_none(pteval))
+                       release_pte_page(pte_page(pteval));
+       }
+}
+
+static void release_all_pte_pages(pte_t *pte)
+{
+       release_pte_pages(pte, pte + HPAGE_PMD_NR);
+}
+
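+/*
+ * Check the HPAGE_PMD_NR ptes about to be collapsed and isolate their
+ * pages from the LRU. Returns 1 if every present page could be locked
+ * and isolated and at least one pte was recently referenced, 0
+ * otherwise, in which case all pages are released again.
+ */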
+static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
+                                       unsigned long address,
+                                       pte_t *pte)
+{
+       struct page *page;
+       pte_t *_pte;
+       int referenced = 0, isolated = 0, none = 0;
+       for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+            _pte++, address += PAGE_SIZE) {
+               pte_t pteval = *_pte;
+               if (pte_none(pteval)) {
+                       if (++none <= khugepaged_max_ptes_none)
+                               continue;
+                       else {
+                               release_pte_pages(pte, _pte);
+                               goto out;
+                       }
+               }
+               if (!pte_present(pteval) || !pte_write(pteval)) {
+                       release_pte_pages(pte, _pte);
+                       goto out;
+               }
+               page = vm_normal_page(vma, address, pteval);
+               if (unlikely(!page)) {
+                       release_pte_pages(pte, _pte);
+                       goto out;
+               }
+               VM_BUG_ON(PageCompound(page));
+               BUG_ON(!PageAnon(page));
+               VM_BUG_ON(!PageSwapBacked(page));
+
+               /* cannot use mapcount: can't collapse if there's a gup pin */
+               if (page_count(page) != 1) {
+                       release_pte_pages(pte, _pte);
+                       goto out;
+               }
+               /*
+                * We can do it before isolate_lru_page because the
+                * page can't be freed from under us. NOTE: PG_lock
+                * is needed to serialize against split_huge_page
+                * when invoked from the VM.
+                */
+               if (!trylock_page(page)) {
+                       release_pte_pages(pte, _pte);
+                       goto out;
+               }
+               /*
+                * Isolate the page to avoid collapsing an hugepage
+                * currently in use by the VM.
+                */
+               if (isolate_lru_page(page)) {
+                       unlock_page(page);
+                       release_pte_pages(pte, _pte);
+                       goto out;
+               }
+               /* 0 stands for page_is_file_cache(page) == false */
+               inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
+               VM_BUG_ON(!PageLocked(page));
+               VM_BUG_ON(PageLRU(page));
+
+               /* If no mapped pte is young, don't collapse the page */
+               if (pte_young(pteval) || PageReferenced(page) ||
+                   mmu_notifier_test_young(vma->vm_mm, address))
+                       referenced = 1;
+       }
+       if (unlikely(!referenced))
+               release_all_pte_pages(pte);
+       else
+               isolated = 1;
+out:
+       return isolated;
+}
+
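+/*
+ * Copy the contents of the isolated small pages into the new hugepage
+ * and unmap and free the old pages. pte_none() entries simply become
+ * zero-filled portions of the hugepage.
+ */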
+static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
+                                     struct vm_area_struct *vma,
+                                     unsigned long address,
+                                     spinlock_t *ptl)
+{
+       pte_t *_pte;
+       for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
+               pte_t pteval = *_pte;
+               struct page *src_page;
+
+               if (pte_none(pteval)) {
+                       clear_user_highpage(page, address);
+                       add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
+               } else {
+                       src_page = pte_page(pteval);
+                       copy_user_highpage(page, src_page, address, vma);
+                       VM_BUG_ON(page_mapcount(src_page) != 1);
+                       VM_BUG_ON(page_count(src_page) != 2);
+                       release_pte_page(src_page);
+                       /*
+                        * ptl mostly unnecessary, but preempt has to
+                        * be disabled to update the per-cpu stats
+                        * inside page_remove_rmap().
+                        */
+                       spin_lock(ptl);
+                       /*
+                        * paravirt calls inside pte_clear here are
+                        * superfluous.
+                        */
+                       pte_clear(vma->vm_mm, address, _pte);
+                       page_remove_rmap(src_page);
+                       spin_unlock(ptl);
+                       free_page_and_swap_cache(src_page);
+               }
+
+               address += PAGE_SIZE;
+               page++;
+       }
+}
+
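+/*
+ * Try to collapse the HPAGE_PMD_SIZE range around "address" into a
+ * single hugepage: take the preallocated *hpage (!NUMA) or allocate a
+ * new hugepage (NUMA), charge it, upgrade to mmap_sem write mode,
+ * revalidate the vma, isolate and copy the small pages and finally
+ * install the huge pmd. Called with mmap_sem held for reading and
+ * always returns with mmap_sem released.
+ */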
+static void collapse_huge_page(struct mm_struct *mm,
+                              unsigned long address,
+                              struct page **hpage,
+                              struct vm_area_struct *vma)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd, _pmd;
+       pte_t *pte;
+       pgtable_t pgtable;
+       struct page *new_page;
+       spinlock_t *ptl;
+       int isolated;
+       unsigned long hstart, hend;
+
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#ifndef CONFIG_NUMA
+       VM_BUG_ON(!*hpage);
+       new_page = *hpage;
+#else
+       VM_BUG_ON(*hpage);
+       /*
+        * Allocate the page while the vma is still valid and under
+        * the mmap_sem read mode so there is no memory allocation
+        * later when we take the mmap_sem in write mode. This is
+        * friendlier behavior (OTOH it may actually hide bugs) towards
+        * filesystems in userland with daemons allocating memory in
+        * the userland I/O paths.  Allocating memory with the
+        * mmap_sem in read mode is also a good idea to allow greater
+        * scalability.
+        */
+       new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+       if (unlikely(!new_page)) {
+               up_read(&mm->mmap_sem);
+               *hpage = ERR_PTR(-ENOMEM);
+               return;
+       }
+#endif
+       if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+               up_read(&mm->mmap_sem);
+               put_page(new_page);
+               return;
+       }
+
+       /* after allocating the hugepage upgrade to mmap_sem write mode */
+       up_read(&mm->mmap_sem);
+
+       /*
+        * Prevent all access to pagetables with the exception of
+        * gup_fast later handled by the ptep_clear_flush and the VM
+        * handled by the anon_vma lock + PG_lock.
+        */
+       down_write(&mm->mmap_sem);
+       if (unlikely(khugepaged_test_exit(mm)))
+               goto out;
+
+       vma = find_vma(mm, address);
+       hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+       hend = vma->vm_end & HPAGE_PMD_MASK;
+       if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+               goto out;
+
+       if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+           (vma->vm_flags & VM_NOHUGEPAGE))
+               goto out;
+
+       /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
+       if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+               goto out;
+       VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+
+       pgd = pgd_offset(mm, address);
+       if (!pgd_present(*pgd))
+               goto out;
+
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               goto out;
+
+       pmd = pmd_offset(pud, address);
+       /* pmd can't go away or become huge under us */
+       if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+               goto out;
+
+       anon_vma_lock(vma->anon_vma);
+
+       pte = pte_offset_map(pmd, address);
+       ptl = pte_lockptr(mm, pmd);
+
+       spin_lock(&mm->page_table_lock); /* probably unnecessary */
+       /*
+        * After this gup_fast can't run anymore. This also removes
+        * any huge TLB entry from the CPU so we won't allow
+        * huge and small TLB entries for the same virtual address
+        * to avoid the risk of CPU bugs in that area.
+        */
+       _pmd = pmdp_clear_flush_notify(vma, address, pmd);
+       spin_unlock(&mm->page_table_lock);
+
+       spin_lock(ptl);
+       isolated = __collapse_huge_page_isolate(vma, address, pte);
+       spin_unlock(ptl);
+       pte_unmap(pte);
+
+       if (unlikely(!isolated)) {
+               spin_lock(&mm->page_table_lock);
+               BUG_ON(!pmd_none(*pmd));
+               set_pmd_at(mm, address, pmd, _pmd);
+               spin_unlock(&mm->page_table_lock);
+               anon_vma_unlock(vma->anon_vma);
+               mem_cgroup_uncharge_page(new_page);
+               goto out;
+       }
+
+       /*
+        * All pages are isolated and locked so anon_vma rmap
+        * can't run anymore.
+        */
+       anon_vma_unlock(vma->anon_vma);
+
+       __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
+       __SetPageUptodate(new_page);
+       pgtable = pmd_pgtable(_pmd);
+       VM_BUG_ON(page_count(pgtable) != 1);
+       VM_BUG_ON(page_mapcount(pgtable) != 0);
+
+       _pmd = mk_pmd(new_page, vma->vm_page_prot);
+       _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+       _pmd = pmd_mkhuge(_pmd);
+
+       /*
+        * spin_lock() below is not the equivalent of smp_wmb(), so
+        * this is needed to prevent the copy_huge_page writes from
+        * becoming visible after the set_pmd_at() write.
+        */
+       smp_wmb();
+
+       spin_lock(&mm->page_table_lock);
+       BUG_ON(!pmd_none(*pmd));
+       page_add_new_anon_rmap(new_page, vma, address);
+       set_pmd_at(mm, address, pmd, _pmd);
+       update_mmu_cache(vma, address, _pmd);
+       prepare_pmd_huge_pte(pgtable, mm);
+       mm->nr_ptes--;
+       spin_unlock(&mm->page_table_lock);
+
+#ifndef CONFIG_NUMA
+       *hpage = NULL;
+#endif
+       khugepaged_pages_collapsed++;
+out_up_write:
+       up_write(&mm->mmap_sem);
+       return;
+
+out:
+#ifdef CONFIG_NUMA
+       put_page(new_page);
+#endif
+       goto out_up_write;
+}
+
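+/*
+ * Scan one pmd-sized range: walk its ptes and, if it looks like a good
+ * collapse candidate (few enough empty ptes, no shared or pinned
+ * pages, at least one recently referenced pte), call
+ * collapse_huge_page(). Returns 1 when a collapse was attempted (the
+ * mmap_sem is then already released), 0 otherwise.
+ */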
+static int khugepaged_scan_pmd(struct mm_struct *mm,
+                              struct vm_area_struct *vma,
+                              unsigned long address,
+                              struct page **hpage)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte, *_pte;
+       int ret = 0, referenced = 0, none = 0;
+       struct page *page;
+       unsigned long _address;
+       spinlock_t *ptl;
+
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+       pgd = pgd_offset(mm, address);
+       if (!pgd_present(*pgd))
+               goto out;
+
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               goto out;
+
+       pmd = pmd_offset(pud, address);
+       if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+               goto out;
+
+       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+       for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+            _pte++, _address += PAGE_SIZE) {
+               pte_t pteval = *_pte;
+               if (pte_none(pteval)) {
+                       if (++none <= khugepaged_max_ptes_none)
+                               continue;
+                       else
+                               goto out_unmap;
+               }
+               if (!pte_present(pteval) || !pte_write(pteval))
+                       goto out_unmap;
+               page = vm_normal_page(vma, _address, pteval);
+               if (unlikely(!page))
+                       goto out_unmap;
+               VM_BUG_ON(PageCompound(page));
+               if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+                       goto out_unmap;
+               /* cannot use mapcount: can't collapse if there's a gup pin */
+               if (page_count(page) != 1)
+                       goto out_unmap;
+               if (pte_young(pteval) || PageReferenced(page) ||
+                   mmu_notifier_test_young(vma->vm_mm, address))
+                       referenced = 1;
+       }
+       if (referenced)
+               ret = 1;
+out_unmap:
+       pte_unmap_unlock(pte, ptl);
+       if (ret)
+               /* collapse_huge_page will return with the mmap_sem released */
+               collapse_huge_page(mm, address, hpage, vma);
+out:
+       return ret;
+}
+
+static void collect_mm_slot(struct mm_slot *mm_slot)
+{
+       struct mm_struct *mm = mm_slot->mm;
+
+       VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
+
+       if (khugepaged_test_exit(mm)) {
+               /* free mm_slot */
+               hlist_del(&mm_slot->hash);
+               list_del(&mm_slot->mm_node);
+
+               /*
+                * Not strictly needed because the mm exited already.
+                *
+                * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+                */
+
+               /* khugepaged_mm_lock actually not necessary for the below */
+               free_mm_slot(mm_slot);
+               mmdrop(mm);
+       }
+}
+
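+/*
+ * Scan up to "pages" pages worth of vmas starting from the current
+ * scan cursor, collapsing where possible, and advance the cursor to
+ * the next mm when this one is exhausted or exiting. Called and
+ * returns with khugepaged_mm_lock held (it is dropped while scanning).
+ */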
+static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
+                                           struct page **hpage)
+{
+       struct mm_slot *mm_slot;
+       struct mm_struct *mm;
+       struct vm_area_struct *vma;
+       int progress = 0;
+
+       VM_BUG_ON(!pages);
+       VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
+
+       if (khugepaged_scan.mm_slot)
+               mm_slot = khugepaged_scan.mm_slot;
+       else {
+               mm_slot = list_entry(khugepaged_scan.mm_head.next,
+                                    struct mm_slot, mm_node);
+               khugepaged_scan.address = 0;
+               khugepaged_scan.mm_slot = mm_slot;
+       }
+       spin_unlock(&khugepaged_mm_lock);
+
+       mm = mm_slot->mm;
+       down_read(&mm->mmap_sem);
+       if (unlikely(khugepaged_test_exit(mm)))
+               vma = NULL;
+       else
+               vma = find_vma(mm, khugepaged_scan.address);
+
+       progress++;
+       for (; vma; vma = vma->vm_next) {
+               unsigned long hstart, hend;
+
+               cond_resched();
+               if (unlikely(khugepaged_test_exit(mm))) {
+                       progress++;
+                       break;
+               }
+
+               if ((!(vma->vm_flags & VM_HUGEPAGE) &&
+                    !khugepaged_always()) ||
+                   (vma->vm_flags & VM_NOHUGEPAGE)) {
+                       progress++;
+                       continue;
+               }
+
+               /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
+               if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
+                       khugepaged_scan.address = vma->vm_end;
+                       progress++;
+                       continue;
+               }
+               VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+
+               hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+               hend = vma->vm_end & HPAGE_PMD_MASK;
+               if (hstart >= hend) {
+                       progress++;
+                       continue;
+               }
+               if (khugepaged_scan.address < hstart)
+                       khugepaged_scan.address = hstart;
+               if (khugepaged_scan.address > hend) {
+                       khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
+                       progress++;
+                       continue;
+               }
+               BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+
+               while (khugepaged_scan.address < hend) {
+                       int ret;
+                       cond_resched();
+                       if (unlikely(khugepaged_test_exit(mm)))
+                               goto breakouterloop;
+
+                       VM_BUG_ON(khugepaged_scan.address < hstart ||
+                                 khugepaged_scan.address + HPAGE_PMD_SIZE >
+                                 hend);
+                       ret = khugepaged_scan_pmd(mm, vma,
+                                                 khugepaged_scan.address,
+                                                 hpage);
+                       /* move to next address */
+                       khugepaged_scan.address += HPAGE_PMD_SIZE;
+                       progress += HPAGE_PMD_NR;
+                       if (ret)
+                               /* we released mmap_sem so break loop */
+                               goto breakouterloop_mmap_sem;
+                       if (progress >= pages)
+                               goto breakouterloop;
+               }
+       }
+breakouterloop:
+       up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
+breakouterloop_mmap_sem:
+
+       spin_lock(&khugepaged_mm_lock);
+       BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+       /*
+        * Release the current mm_slot if this mm is about to die, or
+        * if we scanned all vmas of this mm.
+        */
+       if (khugepaged_test_exit(mm) || !vma) {
+               /*
+                * Make sure that if mm_users is reaching zero while
+                * khugepaged runs here, khugepaged_exit will find
+                * mm_slot not pointing to the exiting mm.
+                */
+               if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
+                       khugepaged_scan.mm_slot = list_entry(
+                               mm_slot->mm_node.next,
+                               struct mm_slot, mm_node);
+                       khugepaged_scan.address = 0;
+               } else {
+                       khugepaged_scan.mm_slot = NULL;
+                       khugepaged_full_scans++;
+               }
+
+               collect_mm_slot(mm_slot);
+       }
+
+       return progress;
+}
+
+static int khugepaged_has_work(void)
+{
+       return !list_empty(&khugepaged_scan.mm_head) &&
+               khugepaged_enabled();
+}
+
+static int khugepaged_wait_event(void)
+{
+       return !list_empty(&khugepaged_scan.mm_head) ||
+               !khugepaged_enabled();
+}
+
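+/*
+ * One khugepaged work batch: scan khugepaged_pages_to_scan pages,
+ * allocating the hugepage up front on !NUMA kernels, and bail out
+ * early on allocation failure, freezer activity or a kthread stop
+ * request.
+ */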
+static void khugepaged_do_scan(struct page **hpage)
+{
+       unsigned int progress = 0, pass_through_head = 0;
+       unsigned int pages = khugepaged_pages_to_scan;
+
+       barrier(); /* read khugepaged_pages_to_scan once into the local copy */
+
+       while (progress < pages) {
+               cond_resched();
+
+#ifndef CONFIG_NUMA
+               if (!*hpage) {
+                       *hpage = alloc_hugepage(khugepaged_defrag());
+                       if (unlikely(!*hpage))
+                               break;
+               }
+#else
+               if (IS_ERR(*hpage))
+                       break;
+#endif
+
+               if (unlikely(kthread_should_stop() || freezing(current)))
+                       break;
+
+               spin_lock(&khugepaged_mm_lock);
+               if (!khugepaged_scan.mm_slot)
+                       pass_through_head++;
+               if (khugepaged_has_work() &&
+                   pass_through_head < 2)
+                       progress += khugepaged_scan_mm_slot(pages - progress,
+                                                           hpage);
+               else
+                       progress = pages;
+               spin_unlock(&khugepaged_mm_lock);
+       }
+}
+
+static void khugepaged_alloc_sleep(void)
+{
+       DEFINE_WAIT(wait);
+       add_wait_queue(&khugepaged_wait, &wait);
+       schedule_timeout_interruptible(
+               msecs_to_jiffies(
+                       khugepaged_alloc_sleep_millisecs));
+       remove_wait_queue(&khugepaged_wait, &wait);
+}
+
+#ifndef CONFIG_NUMA
+static struct page *khugepaged_alloc_hugepage(void)
+{
+       struct page *hpage;
+
+       do {
+               hpage = alloc_hugepage(khugepaged_defrag());
+               if (!hpage)
+                       khugepaged_alloc_sleep();
+       } while (unlikely(!hpage) &&
+                likely(khugepaged_enabled()));
+       return hpage;
+}
+#endif
+
+static void khugepaged_loop(void)
+{
+       struct page *hpage;
+
+#ifdef CONFIG_NUMA
+       hpage = NULL;
+#endif
+       while (likely(khugepaged_enabled())) {
+#ifndef CONFIG_NUMA
+               hpage = khugepaged_alloc_hugepage();
+               if (unlikely(!hpage))
+                       break;
+#else
+               if (IS_ERR(hpage)) {
+                       khugepaged_alloc_sleep();
+                       hpage = NULL;
+               }
+#endif
+
+               khugepaged_do_scan(&hpage);
+#ifndef CONFIG_NUMA
+               if (hpage)
+                       put_page(hpage);
+#endif
+               try_to_freeze();
+               if (unlikely(kthread_should_stop()))
+                       break;
+               if (khugepaged_has_work()) {
+                       DEFINE_WAIT(wait);
+                       if (!khugepaged_scan_sleep_millisecs)
+                               continue;
+                       add_wait_queue(&khugepaged_wait, &wait);
+                       schedule_timeout_interruptible(
+                               msecs_to_jiffies(
+                                       khugepaged_scan_sleep_millisecs));
+                       remove_wait_queue(&khugepaged_wait, &wait);
+               } else if (khugepaged_enabled())
+                       wait_event_freezable(khugepaged_wait,
+                                            khugepaged_wait_event());
+       }
+}
+
+static int khugepaged(void *none)
+{
+       struct mm_slot *mm_slot;
+
+       set_freezable();
+       set_user_nice(current, 19);
+
+       /* serialize with start_khugepaged() */
+       mutex_lock(&khugepaged_mutex);
+
+       for (;;) {
+               mutex_unlock(&khugepaged_mutex);
+               BUG_ON(khugepaged_thread != current);
+               khugepaged_loop();
+               BUG_ON(khugepaged_thread != current);
+
+               mutex_lock(&khugepaged_mutex);
+               if (!khugepaged_enabled())
+                       break;
+               if (unlikely(kthread_should_stop()))
+                       break;
+       }
+
+       spin_lock(&khugepaged_mm_lock);
+       mm_slot = khugepaged_scan.mm_slot;
+       khugepaged_scan.mm_slot = NULL;
+       if (mm_slot)
+               collect_mm_slot(mm_slot);
+       spin_unlock(&khugepaged_mm_lock);
+
+       khugepaged_thread = NULL;
+       mutex_unlock(&khugepaged_mutex);
+
+       return 0;
+}
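
The loop above is paced entirely by three module-level tunables: khugepaged_pages_to_scan bounds the work done per pass, and the two *_sleep_millisecs values drive khugepaged_do_scan()/khugepaged_alloc_sleep(). They are exported through sysfs, so the daemon can be tuned without rebuilding. A minimal userspace sketch (not part of the patch; it assumes the /sys/kernel/mm/transparent_hugepage/khugepaged/ layout added by this series) that dumps the current values:

/* thp_khugepaged_tunables.c - read-only peek at the khugepaged knobs */
#include <stdio.h>

#define KHPD "/sys/kernel/mm/transparent_hugepage/khugepaged/"

static long read_tunable(const char *name)
{
        char path[256];
        long val = -1;
        FILE *f;

        snprintf(path, sizeof(path), KHPD "%s", name);
        f = fopen(path, "r");
        if (!f)
                return -1;      /* THP not built in, or an older kernel */
        if (fscanf(f, "%ld", &val) != 1)
                val = -1;
        fclose(f);
        return val;
}

int main(void)
{
        printf("pages_to_scan:         %ld\n", read_tunable("pages_to_scan"));
        printf("scan_sleep_millisecs:  %ld\n", read_tunable("scan_sleep_millisecs"));
        printf("alloc_sleep_millisecs: %ld\n", read_tunable("alloc_sleep_millisecs"));
        printf("full_scans:            %ld\n", read_tunable("full_scans"));
        return 0;
}

Writing the same files (as root) adjusts the daemon on the fly; full_scans is the read-only counter bumped by khugepaged_full_scans++ above when a whole pass over the mm list completes.
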
+
+void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
+{
+       struct page *page;
+
+       spin_lock(&mm->page_table_lock);
+       if (unlikely(!pmd_trans_huge(*pmd))) {
+               spin_unlock(&mm->page_table_lock);
+               return;
+       }
+       page = pmd_page(*pmd);
+       VM_BUG_ON(!page_count(page));
+       get_page(page);
+       spin_unlock(&mm->page_table_lock);
+
+       split_huge_page(page);
+
+       put_page(page);
+       BUG_ON(pmd_trans_huge(*pmd));
+}
+
+static void split_huge_page_address(struct mm_struct *mm,
+                                   unsigned long address)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+
+       VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
+
+       pgd = pgd_offset(mm, address);
+       if (!pgd_present(*pgd))
+               return;
+
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return;
+
+       pmd = pmd_offset(pud, address);
+       if (!pmd_present(*pmd))
+               return;
+       /*
+        * The caller holds mmap_sem in write mode, so a huge pmd cannot
+        * materialize from under us.
+        */
+       split_huge_page_pmd(mm, pmd);
+}
+
+void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+                            unsigned long start,
+                            unsigned long end,
+                            long adjust_next)
+{
+       /*
+        * If the new start address isn't hpage aligned and it could
+        * previously contain a hugepage: check if we need to split
+        * a huge pmd.
+        */
+       if (start & ~HPAGE_PMD_MASK &&
+           (start & HPAGE_PMD_MASK) >= vma->vm_start &&
+           (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+               split_huge_page_address(vma->vm_mm, start);
+
+       /*
+        * If the new end address isn't hpage aligned and it could
+        * previously contain a hugepage: check if we need to split
+        * a huge pmd.
+        */
+       if (end & ~HPAGE_PMD_MASK &&
+           (end & HPAGE_PMD_MASK) >= vma->vm_start &&
+           (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+               split_huge_page_address(vma->vm_mm, end);
+
+       /*
+        * If we're also updating vma->vm_next->vm_start, and the new
+        * vm_next->vm_start isn't hpage aligned and it could previously
+        * contain a hugepage: check if we need to split a huge pmd.
+        */
+       if (adjust_next > 0) {
+               struct vm_area_struct *next = vma->vm_next;
+               unsigned long nstart = next->vm_start;
+               nstart += adjust_next << PAGE_SHIFT;
+               if (nstart & ~HPAGE_PMD_MASK &&
+                   (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
+                   (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
+                       split_huge_page_address(next->vm_mm, nstart);
+       }
+}
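
All three checks in __vma_adjust_trans_huge() are the same alignment predicate: split when the boundary is not huge-pmd aligned but the huge-pmd-sized region containing it still lies entirely inside the vma. A standalone illustration of that arithmetic (not from the patch; it hard-codes a hypothetical 2 MiB HPAGE_PMD_SIZE, the x86-64 default, instead of the kernel macros):

/* vma_adjust_split_check.c - the split predicate, written out by hand */
#include <stdbool.h>
#include <stdio.h>

#define HPMD_SIZE (2UL << 20)           /* assumed 2 MiB huge pmd */
#define HPMD_MASK (~(HPMD_SIZE - 1))

static bool needs_split(unsigned long addr, unsigned long vm_start,
                        unsigned long vm_end)
{
        return (addr & ~HPMD_MASK) &&                   /* boundary unaligned */
               (addr & HPMD_MASK) >= vm_start &&        /* region start in vma */
               (addr & HPMD_MASK) + HPMD_SIZE <= vm_end;/* region end in vma */
}

int main(void)
{
        /* vma covering [4 MiB, 16 MiB) */
        printf("%d\n", needs_split(0x500000, 0x400000, 0x1000000)); /* 1: 5 MiB cut */
        printf("%d\n", needs_split(0x600000, 0x400000, 0x1000000)); /* 0: aligned */
        return 0;
}
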
index 8585524..bb0b7c1 100644 (file)
@@ -394,71 +394,6 @@ static int vma_has_reserves(struct vm_area_struct *vma)
        return 0;
 }
 
-static void clear_gigantic_page(struct page *page,
-                       unsigned long addr, unsigned long sz)
-{
-       int i;
-       struct page *p = page;
-
-       might_sleep();
-       for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
-               cond_resched();
-               clear_user_highpage(p, addr + i * PAGE_SIZE);
-       }
-}
-static void clear_huge_page(struct page *page,
-                       unsigned long addr, unsigned long sz)
-{
-       int i;
-
-       if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
-               clear_gigantic_page(page, addr, sz);
-               return;
-       }
-
-       might_sleep();
-       for (i = 0; i < sz/PAGE_SIZE; i++) {
-               cond_resched();
-               clear_user_highpage(page + i, addr + i * PAGE_SIZE);
-       }
-}
-
-static void copy_user_gigantic_page(struct page *dst, struct page *src,
-                          unsigned long addr, struct vm_area_struct *vma)
-{
-       int i;
-       struct hstate *h = hstate_vma(vma);
-       struct page *dst_base = dst;
-       struct page *src_base = src;
-
-       for (i = 0; i < pages_per_huge_page(h); ) {
-               cond_resched();
-               copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
-
-               i++;
-               dst = mem_map_next(dst, dst_base, i);
-               src = mem_map_next(src, src_base, i);
-       }
-}
-
-static void copy_user_huge_page(struct page *dst, struct page *src,
-                          unsigned long addr, struct vm_area_struct *vma)
-{
-       int i;
-       struct hstate *h = hstate_vma(vma);
-
-       if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
-               copy_user_gigantic_page(dst, src, addr, vma);
-               return;
-       }
-
-       might_sleep();
-       for (i = 0; i < pages_per_huge_page(h); i++) {
-               cond_resched();
-               copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
-       }
-}
-
 static void copy_gigantic_page(struct page *dst, struct page *src)
 {
        int i;
@@ -1428,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
 
        return sprintf(buf, "%lu\n", nr_huge_pages);
 }
+
 static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
                        struct kobject *kobj, struct kobj_attribute *attr,
                        const char *buf, size_t len)
@@ -1440,9 +1376,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
 
        err = strict_strtoul(buf, 10, &count);
        if (err)
-               return 0;
+               goto out;
 
        h = kobj_to_hstate(kobj, &nid);
+       if (h->order >= MAX_ORDER) {
+               err = -EINVAL;
+               goto out;
+       }
+
        if (nid == NUMA_NO_NODE) {
                /*
                 * global hstate attribute
@@ -1468,6 +1409,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
                NODEMASK_FREE(nodes_allowed);
 
        return len;
+out:
+       NODEMASK_FREE(nodes_allowed);
+       return err;
 }
 
 static ssize_t nr_hugepages_show(struct kobject *kobj,
@@ -1510,6 +1454,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
        struct hstate *h = kobj_to_hstate(kobj, NULL);
        return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
 }
+
 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
                struct kobj_attribute *attr, const char *buf, size_t count)
 {
@@ -1517,9 +1462,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
        unsigned long input;
        struct hstate *h = kobj_to_hstate(kobj, NULL);
 
+       if (h->order >= MAX_ORDER)
+               return -EINVAL;
+
        err = strict_strtoul(buf, 10, &input);
        if (err)
-               return 0;
+               return err;
 
        spin_lock(&hugetlb_lock);
        h->nr_overcommit_huge_pages = input;
@@ -1922,13 +1870,19 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
 {
        struct hstate *h = &default_hstate;
        unsigned long tmp;
+       int ret;
 
        if (!write)
                tmp = h->max_huge_pages;
 
+       if (write && h->order >= MAX_ORDER)
+               return -EINVAL;
+
        table->data = &tmp;
        table->maxlen = sizeof(unsigned long);
-       proc_doulongvec_minmax(table, write, buffer, length, ppos);
+       ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+       if (ret)
+               goto out;
 
        if (write) {
                NODEMASK_ALLOC(nodemask_t, nodes_allowed,
@@ -1943,8 +1897,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
                if (nodes_allowed != &node_states[N_HIGH_MEMORY])
                        NODEMASK_FREE(nodes_allowed);
        }
-
-       return 0;
+out:
+       return ret;
 }
 
 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -1982,21 +1936,27 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 {
        struct hstate *h = &default_hstate;
        unsigned long tmp;
+       int ret;
 
        if (!write)
                tmp = h->nr_overcommit_huge_pages;
 
+       if (write && h->order >= MAX_ORDER)
+               return -EINVAL;
+
        table->data = &tmp;
        table->maxlen = sizeof(unsigned long);
-       proc_doulongvec_minmax(table, write, buffer, length, ppos);
+       ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+       if (ret)
+               goto out;
 
        if (write) {
                spin_lock(&hugetlb_lock);
                h->nr_overcommit_huge_pages = tmp;
                spin_unlock(&hugetlb_lock);
        }
-
-       return 0;
+out:
+       return ret;
 }
 
 #endif /* CONFIG_SYSCTL */
@@ -2454,7 +2414,8 @@ retry_avoidcopy:
                return VM_FAULT_OOM;
        }
 
-       copy_user_huge_page(new_page, old_page, address, vma);
+       copy_user_huge_page(new_page, old_page, address, vma,
+                           pages_per_huge_page(h));
        __SetPageUptodate(new_page);
 
        /*
@@ -2558,7 +2519,7 @@ retry:
                        ret = -PTR_ERR(page);
                        goto out;
                }
-               clear_huge_page(page, address, huge_page_size(h));
+               clear_huge_page(page, address, pages_per_huge_page(h));
                __SetPageUptodate(page);
 
                if (vma->vm_flags & VM_MAYSHARE) {
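
The hugetlb.c hunks do two things: the clear_huge_page()/copy_user_huge_page() helpers move out to common code (now taking pages_per_huge_page), and the nr_hugepages / nr_overcommit_hugepages writers finally return real errors, rejecting resize attempts on gigantic hstates (order >= MAX_ORDER) with -EINVAL and propagating strict_strtoul/proc_doulongvec_minmax failures instead of returning 0. A hedged userspace sketch of the visible effect, using the standard procfs knob (run as root):

/* set_nr_hugepages.c - request a hugetlb pool size and report the errno */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *path = "/proc/sys/vm/nr_hugepages";
        const char *req = "16\n";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, req, strlen(req)) < 0)
                /* e.g. EINVAL when the default huge page size is gigantic */
                perror("write nr_hugepages");
        close(fd);
        return 0;
}
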
index dedb0af..4c98630 100644 (file)
@@ -39,6 +39,15 @@ static inline void __put_page(struct page *page)
 
 extern unsigned long highest_memmap_pfn;
 
+#ifdef CONFIG_SMP
+extern int putback_active_lru_page(struct zone *zone, struct page *page);
+#else
+static inline int putback_active_lru_page(struct zone *zone, struct page *page)
+{
+       return 0;
+}
+#endif
+
 /*
  * in mm/vmscan.c:
  */
@@ -134,6 +143,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
        }
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern unsigned long vma_address(struct page *page,
+                                struct vm_area_struct *vma);
+#endif
 #else /* !CONFIG_MMU */
 static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
 {
@@ -243,7 +256,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                     unsigned long start, int len, unsigned int foll_flags,
-                    struct page **pages, struct vm_area_struct **vmas);
+                    struct page **pages, struct vm_area_struct **vmas,
+                    int *nonblocking);
 
 #define ZONE_RECLAIM_NOSCAN    -2
 #define ZONE_RECLAIM_FULL      -1
index 43bc893..c2b2a94 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -34,6 +34,7 @@
 #include <linux/swap.h>
 #include <linux/ksm.h>
 #include <linux/hash.h>
+#include <linux/freezer.h>
 
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -411,6 +412,20 @@ out:
        up_read(&mm->mmap_sem);
 }
 
+static struct page *page_trans_compound_anon(struct page *page)
+{
+       if (PageTransCompound(page)) {
+               struct page *head = compound_trans_head(page);
+               /*
+                * head may actually be split and freed from under
+                * us, but that's ok here.
+                */
+               if (PageAnon(head))
+                       return head;
+       }
+       return NULL;
+}
+
 static struct page *get_mergeable_page(struct rmap_item *rmap_item)
 {
        struct mm_struct *mm = rmap_item->mm;
@@ -430,7 +445,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
        page = follow_page(vma, addr, FOLL_GET);
        if (IS_ERR_OR_NULL(page))
                goto out;
-       if (PageAnon(page)) {
+       if (PageAnon(page) || page_trans_compound_anon(page)) {
                flush_anon_page(vma, page, addr);
                flush_dcache_page(page);
        } else {
@@ -708,6 +723,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
        if (addr == -EFAULT)
                goto out;
 
+       BUG_ON(PageTransCompound(page));
        ptep = page_check_address(page, mm, addr, &ptl, 0);
        if (!ptep)
                goto out;
@@ -783,6 +799,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
                goto out;
 
        pmd = pmd_offset(pud, addr);
+       BUG_ON(pmd_trans_huge(*pmd));
        if (!pmd_present(*pmd))
                goto out;
 
@@ -800,6 +817,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
        set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
 
        page_remove_rmap(page);
+       if (!page_mapped(page))
+               try_to_free_swap(page);
        put_page(page);
 
        pte_unmap_unlock(ptep, ptl);
@@ -808,6 +827,33 @@ out:
        return err;
 }
 
+static int page_trans_compound_anon_split(struct page *page)
+{
+       int ret = 0;
+       struct page *transhuge_head = page_trans_compound_anon(page);
+       if (transhuge_head) {
+               /* Get the reference on the head to split it. */
+               if (get_page_unless_zero(transhuge_head)) {
+                       /*
+                        * Recheck we got the reference while the head
+                        * was still anonymous.
+                        */
+                       if (PageAnon(transhuge_head))
+                               ret = split_huge_page(transhuge_head);
+                       else
+                               /*
+                                * Retry later if split_huge_page ran
+                                * from under us.
+                                */
+                               ret = 1;
+                       put_page(transhuge_head);
+               } else
+                       /* Retry later if split_huge_page ran from under us. */
+                       ret = 1;
+       }
+       return ret;
+}
+
 /*
  * try_to_merge_one_page - take two pages and merge them into one
  * @vma: the vma that holds the pte pointing to page
@@ -828,6 +874,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
 
        if (!(vma->vm_flags & VM_MERGEABLE))
                goto out;
+       if (PageTransCompound(page) && page_trans_compound_anon_split(page))
+               goto out;
+       BUG_ON(PageTransCompound(page));
        if (!PageAnon(page))
                goto out;
 
@@ -1247,6 +1296,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
 
        slot = ksm_scan.mm_slot;
        if (slot == &ksm_mm_head) {
+               /*
+                * A number of pages can hang around indefinitely on per-cpu
+                * pagevecs, raised page count preventing write_protect_page
+                * from merging them.  Though it doesn't really matter much,
+                * it is puzzling to see some stuck in pages_volatile until
+                * other activity jostles them out, and they also prevented
+                * LTP's KSM test from succeeding deterministically; so drain
+                * them here (here rather than on entry to ksm_do_scan(),
+                * so we don't IPI too often when pages_to_scan is set low).
+                */
+               lru_add_drain_all();
+
                root_unstable_tree = RB_ROOT;
 
                spin_lock(&ksm_mmlist_lock);
@@ -1277,7 +1338,13 @@ next_mm:
                        if (ksm_test_exit(mm))
                                break;
                        *page = follow_page(vma, ksm_scan.address, FOLL_GET);
-                       if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) {
+                       if (IS_ERR_OR_NULL(*page)) {
+                               ksm_scan.address += PAGE_SIZE;
+                               cond_resched();
+                               continue;
+                       }
+                       if (PageAnon(*page) ||
+                           page_trans_compound_anon(*page)) {
                                flush_anon_page(vma, *page, ksm_scan.address);
                                flush_dcache_page(*page);
                                rmap_item = get_next_rmap_item(slot,
@@ -1291,8 +1358,7 @@ next_mm:
                                up_read(&mm->mmap_sem);
                                return rmap_item;
                        }
-                       if (!IS_ERR_OR_NULL(*page))
-                               put_page(*page);
+                       put_page(*page);
                        ksm_scan.address += PAGE_SIZE;
                        cond_resched();
                }
@@ -1352,7 +1418,7 @@ static void ksm_do_scan(unsigned int scan_npages)
        struct rmap_item *rmap_item;
        struct page *uninitialized_var(page);
 
-       while (scan_npages--) {
+       while (scan_npages-- && likely(!freezing(current))) {
                cond_resched();
                rmap_item = scan_get_next_rmap_item(&page);
                if (!rmap_item)
@@ -1370,6 +1436,7 @@ static int ksmd_should_run(void)
 
 static int ksm_scan_thread(void *nothing)
 {
+       set_freezable();
        set_user_nice(current, 5);
 
        while (!kthread_should_stop()) {
@@ -1378,11 +1445,13 @@ static int ksm_scan_thread(void *nothing)
                        ksm_do_scan(ksm_thread_pages_to_scan);
                mutex_unlock(&ksm_thread_mutex);
 
+               try_to_freeze();
+
                if (ksmd_should_run()) {
                        schedule_timeout_interruptible(
                                msecs_to_jiffies(ksm_thread_sleep_millisecs));
                } else {
-                       wait_event_interruptible(ksm_thread_wait,
+                       wait_event_freezable(ksm_thread_wait,
                                ksmd_should_run() || kthread_should_stop());
                }
        }
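
With the ksm.c hunks applied, ksmd no longer skips pages that happen to be mapped by a transparent huge pmd: page_trans_compound_anon() recognises the compound head and page_trans_compound_anon_split() splits it before write_protect_page() runs (hence the new BUG_ON(PageTransCompound(page)) assertions). A sketch of the userspace side that feeds such pages to ksmd (not from the patch; the numeric fallback is the asm-generic MADV_MERGEABLE value):

/* ksm_register.c - hand an anonymous, possibly THP-backed, region to ksmd */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_MERGEABLE
#define MADV_MERGEABLE 12
#endif

int main(void)
{
        size_t len = 16UL << 20;        /* 16 MiB, THP-eligible when aligned */
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        memset(p, 0x5a, len);           /* identical contents -> mergeable */
        if (madvise(p, len, MADV_MERGEABLE))
                perror("madvise(MADV_MERGEABLE)");
        pause();                        /* keep the mapping alive for ksmd */
        return 0;
}
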
index 319528b..2221491 100644 (file)
@@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma,
                if (error)
                        goto out;
                break;
+       case MADV_HUGEPAGE:
+       case MADV_NOHUGEPAGE:
+               error = hugepage_madvise(vma, &new_flags, behavior);
+               if (error)
+                       goto out;
+               break;
        }
 
        if (new_flags == vma->vm_flags) {
@@ -283,6 +289,10 @@ madvise_behavior_valid(int behavior)
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       case MADV_HUGEPAGE:
+       case MADV_NOHUGEPAGE:
+#endif
                return 1;
 
        default:
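
The madvise() hunks simply route the two new hints to hugepage_madvise() and accept them in madvise_behavior_valid() when CONFIG_TRANSPARENT_HUGEPAGE is set. A small caller sketch (the numeric fallbacks are the asm-generic values; on kernels without THP the calls fail with EINVAL):

/* thp_advise.c - opt a single mapping in and out of transparent hugepages */
#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE   14
#define MADV_NOHUGEPAGE 15
#endif

int main(void)
{
        size_t len = 8UL << 20;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        if (madvise(p, len, MADV_HUGEPAGE))     /* ask khugepaged to collapse */
                perror("madvise(MADV_HUGEPAGE)");
        if (madvise(p, len, MADV_NOHUGEPAGE))   /* and opt back out again */
                perror("madvise(MADV_NOHUGEPAGE)");
        munmap(p, len);
        return 0;
}
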
index 00bb8a6..8ab8410 100644 (file)
@@ -292,7 +292,6 @@ static struct move_charge_struct {
        unsigned long moved_charge;
        unsigned long moved_swap;
        struct task_struct *moving_task;        /* a task moving charges */
-       struct mm_struct *mm;
        wait_queue_head_t waitq;                /* a waitq for other context */
 } mc = {
        .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -821,7 +820,6 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
                return;
        VM_BUG_ON(list_empty(&pc->lru));
        list_del_init(&pc->lru);
-       return;
 }
 
 void mem_cgroup_del_lru(struct page *page)
@@ -1087,7 +1085,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
                case 0:
                        list_move(&page->lru, dst);
                        mem_cgroup_del_lru(page);
-                       nr_taken++;
+                       nr_taken += hpage_nr_pages(page);
                        break;
                case -EBUSY:
                        /* we don't affect global LRU but rotate in our LRU */
@@ -1312,8 +1310,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
        u64 limit;
        u64 memsw;
 
-       limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
-                       total_swap_pages;
+       limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+       limit += total_swap_pages << PAGE_SHIFT;
+
        memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
        /*
         * If memsw is finite and limits the amount of swap space available
@@ -1600,11 +1599,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
  * possibility of race condition. If there is, we take a lock.
  */
 
-static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
+void mem_cgroup_update_page_stat(struct page *page,
+                                enum mem_cgroup_page_stat_item idx, int val)
 {
        struct mem_cgroup *mem;
        struct page_cgroup *pc = lookup_page_cgroup(page);
        bool need_unlock = false;
+       unsigned long uninitialized_var(flags);
 
        if (unlikely(!pc))
                return;
@@ -1616,37 +1617,34 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
        /* pc->mem_cgroup is unstable ? */
        if (unlikely(mem_cgroup_stealed(mem))) {
                /* take a lock against to access pc->mem_cgroup */
-               lock_page_cgroup(pc);
+               move_lock_page_cgroup(pc, &flags);
                need_unlock = true;
                mem = pc->mem_cgroup;
                if (!mem || !PageCgroupUsed(pc))
                        goto out;
        }
 
-       this_cpu_add(mem->stat->count[idx], val);
-
        switch (idx) {
-       case MEM_CGROUP_STAT_FILE_MAPPED:
+       case MEMCG_NR_FILE_MAPPED:
                if (val > 0)
                        SetPageCgroupFileMapped(pc);
                else if (!page_mapped(page))
                        ClearPageCgroupFileMapped(pc);
+               idx = MEM_CGROUP_STAT_FILE_MAPPED;
                break;
        default:
                BUG();
        }
 
+       this_cpu_add(mem->stat->count[idx], val);
+
 out:
        if (unlikely(need_unlock))
-               unlock_page_cgroup(pc);
+               move_unlock_page_cgroup(pc, &flags);
        rcu_read_unlock();
        return;
 }
-
-void mem_cgroup_update_file_mapped(struct page *page, int val)
-{
-       mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
-}
+EXPORT_SYMBOL(mem_cgroup_update_page_stat);
 
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -1887,12 +1885,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
  * oom-killer can be invoked.
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
-               gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+                                  gfp_t gfp_mask,
+                                  struct mem_cgroup **memcg, bool oom,
+                                  int page_size)
 {
        int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
        struct mem_cgroup *mem = NULL;
        int ret;
-       int csize = CHARGE_SIZE;
+       int csize = max(CHARGE_SIZE, (unsigned long) page_size);
 
        /*
         * Unlike global VM's OOM-kill, we're not in memory shortage
@@ -1917,7 +1917,7 @@ again:
                VM_BUG_ON(css_is_removed(&mem->css));
                if (mem_cgroup_is_root(mem))
                        goto done;
-               if (consume_stock(mem))
+               if (page_size == PAGE_SIZE && consume_stock(mem))
                        goto done;
                css_get(&mem->css);
        } else {
@@ -1940,7 +1940,7 @@ again:
                        rcu_read_unlock();
                        goto done;
                }
-               if (consume_stock(mem)) {
+               if (page_size == PAGE_SIZE && consume_stock(mem)) {
                        /*
                         * It seems dangerous to access memcg without css_get().
                         * But considering how consume_stock works, it's not
@@ -1981,7 +1981,7 @@ again:
                case CHARGE_OK:
                        break;
                case CHARGE_RETRY: /* not in OOM situation but retry */
-                       csize = PAGE_SIZE;
+                       csize = page_size;
                        css_put(&mem->css);
                        mem = NULL;
                        goto again;
@@ -2002,8 +2002,8 @@ again:
                }
        } while (ret != CHARGE_OK);
 
-       if (csize > PAGE_SIZE)
-               refill_stock(mem, csize - PAGE_SIZE);
+       if (csize > page_size)
+               refill_stock(mem, csize - page_size);
        css_put(&mem->css);
 done:
        *memcg = mem;
@@ -2031,9 +2031,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
        }
 }
 
-static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
+static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
+                                    int page_size)
 {
-       __mem_cgroup_cancel_charge(mem, 1);
+       __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
 }
 
 /*
@@ -2087,22 +2088,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
  * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
  * USED state. If already USED, uncharge and return.
  */
-
-static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
-                                    struct page_cgroup *pc,
-                                    enum charge_type ctype)
+static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem,
+                                        struct page_cgroup *pc,
+                                        enum charge_type ctype)
 {
-       /* try_charge() can return NULL to *memcg, taking care of it. */
-       if (!mem)
-               return;
-
-       lock_page_cgroup(pc);
-       if (unlikely(PageCgroupUsed(pc))) {
-               unlock_page_cgroup(pc);
-               mem_cgroup_cancel_charge(mem);
-               return;
-       }
-
        pc->mem_cgroup = mem;
        /*
         * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2127,6 +2116,33 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
        }
 
        mem_cgroup_charge_statistics(mem, pc, true);
+}
+
+static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
+                                      struct page_cgroup *pc,
+                                      enum charge_type ctype,
+                                      int page_size)
+{
+       int i;
+       int count = page_size >> PAGE_SHIFT;
+
+       /* try_charge() can return NULL to *memcg, taking care of it. */
+       if (!mem)
+               return;
+
+       lock_page_cgroup(pc);
+       if (unlikely(PageCgroupUsed(pc))) {
+               unlock_page_cgroup(pc);
+               mem_cgroup_cancel_charge(mem, page_size);
+               return;
+       }
+
+       /*
+        * We don't need lock_page_cgroup() on the tail pages, because they
+        * are not accessed by any other context at this point.
+        */
+       for (i = 0; i < count; i++)
+               ____mem_cgroup_commit_charge(mem, pc + i, ctype);
 
        unlock_page_cgroup(pc);
        /*
@@ -2173,7 +2189,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
        mem_cgroup_charge_statistics(from, pc, false);
        if (uncharge)
                /* This is not "cancel", but cancel_charge does all we need. */
-               mem_cgroup_cancel_charge(from);
+               mem_cgroup_cancel_charge(from, PAGE_SIZE);
 
        /* caller should have done css_get */
        pc->mem_cgroup = to;
@@ -2195,9 +2211,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
                struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
 {
        int ret = -EINVAL;
+       unsigned long flags;
+
        lock_page_cgroup(pc);
        if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
+               move_lock_page_cgroup(pc, &flags);
                __mem_cgroup_move_account(pc, from, to, uncharge);
+               move_unlock_page_cgroup(pc, &flags);
                ret = 0;
        }
        unlock_page_cgroup(pc);
@@ -2234,13 +2254,14 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
                goto put;
 
        parent = mem_cgroup_from_cont(pcg);
-       ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+       ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false,
+                                     PAGE_SIZE);
        if (ret || !parent)
                goto put_back;
 
        ret = mem_cgroup_move_account(pc, child, parent, true);
        if (ret)
-               mem_cgroup_cancel_charge(parent);
+               mem_cgroup_cancel_charge(parent, PAGE_SIZE);
 put_back:
        putback_lru_page(page);
 put:
@@ -2261,6 +2282,12 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
        struct mem_cgroup *mem = NULL;
        struct page_cgroup *pc;
        int ret;
+       int page_size = PAGE_SIZE;
+
+       if (PageTransHuge(page)) {
+               page_size <<= compound_order(page);
+               VM_BUG_ON(!PageTransHuge(page));
+       }
 
        pc = lookup_page_cgroup(page);
        /* can happen at boot */
@@ -2268,11 +2295,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
                return 0;
        prefetchw(pc);
 
-       ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
+       ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
        if (ret || !mem)
                return ret;
 
-       __mem_cgroup_commit_charge(mem, pc, ctype);
+       __mem_cgroup_commit_charge(mem, pc, ctype, page_size);
        return 0;
 }
 
@@ -2281,8 +2308,6 @@ int mem_cgroup_newpage_charge(struct page *page,
 {
        if (mem_cgroup_disabled())
                return 0;
-       if (PageCompound(page))
-               return 0;
        /*
         * If already mapped, we don't have to account.
         * If page cache, page->mapping has address_space.
@@ -2388,13 +2413,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
        if (!mem)
                goto charge_cur_mm;
        *ptr = mem;
-       ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+       ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE);
        css_put(&mem->css);
        return ret;
 charge_cur_mm:
        if (unlikely(!mm))
                mm = &init_mm;
-       return __mem_cgroup_try_charge(mm, mask, ptr, true);
+       return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE);
 }
 
 static void
@@ -2410,7 +2435,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
        cgroup_exclude_rmdir(&ptr->css);
        pc = lookup_page_cgroup(page);
        mem_cgroup_lru_del_before_commit_swapcache(page);
-       __mem_cgroup_commit_charge(ptr, pc, ctype);
+       __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
        mem_cgroup_lru_add_after_commit_swapcache(page);
        /*
         * Now swap is on-memory. This means this page may be
@@ -2459,11 +2484,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
                return;
        if (!mem)
                return;
-       mem_cgroup_cancel_charge(mem);
+       mem_cgroup_cancel_charge(mem, PAGE_SIZE);
 }
 
 static void
-__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
+             int page_size)
 {
        struct memcg_batch_info *batch = NULL;
        bool uncharge_memsw = true;
@@ -2490,6 +2516,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
        if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
                goto direct_uncharge;
 
+       if (page_size != PAGE_SIZE)
+               goto direct_uncharge;
+
        /*
         * In typical case, batch->memcg == mem. This means we can
         * merge a series of uncharges to an uncharge of res_counter.
@@ -2503,9 +2532,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
                batch->memsw_bytes += PAGE_SIZE;
        return;
 direct_uncharge:
-       res_counter_uncharge(&mem->res, PAGE_SIZE);
+       res_counter_uncharge(&mem->res, page_size);
        if (uncharge_memsw)
-               res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+               res_counter_uncharge(&mem->memsw, page_size);
        if (unlikely(batch->memcg != mem))
                memcg_oom_recover(mem);
        return;
@@ -2517,8 +2546,11 @@ direct_uncharge:
 static struct mem_cgroup *
 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
+       int i;
+       int count;
        struct page_cgroup *pc;
        struct mem_cgroup *mem = NULL;
+       int page_size = PAGE_SIZE;
 
        if (mem_cgroup_disabled())
                return NULL;
@@ -2526,6 +2558,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
        if (PageSwapCache(page))
                return NULL;
 
+       if (PageTransHuge(page)) {
+               page_size <<= compound_order(page);
+               VM_BUG_ON(!PageTransHuge(page));
+       }
+
+       count = page_size >> PAGE_SHIFT;
        /*
         * Check if our page_cgroup is valid
         */
@@ -2558,7 +2596,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
                break;
        }
 
-       mem_cgroup_charge_statistics(mem, pc, false);
+       for (i = 0; i < count; i++)
+               mem_cgroup_charge_statistics(mem, pc + i, false);
 
        ClearPageCgroupUsed(pc);
        /*
@@ -2579,7 +2618,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
                mem_cgroup_get(mem);
        }
        if (!mem_cgroup_is_root(mem))
-               __do_uncharge(mem, ctype);
+               __do_uncharge(mem, ctype, page_size);
 
        return mem;
 
@@ -2774,6 +2813,7 @@ int mem_cgroup_prepare_migration(struct page *page,
        enum charge_type ctype;
        int ret = 0;
 
+       VM_BUG_ON(PageTransHuge(page));
        if (mem_cgroup_disabled())
                return 0;
 
@@ -2823,7 +2863,7 @@ int mem_cgroup_prepare_migration(struct page *page,
                return 0;
 
        *ptr = mem;
-       ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
+       ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE);
        css_put(&mem->css);/* drop extra refcnt */
        if (ret || *ptr == NULL) {
                if (PageAnon(page)) {
@@ -2850,13 +2890,13 @@ int mem_cgroup_prepare_migration(struct page *page,
                ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
        else
                ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
-       __mem_cgroup_commit_charge(mem, pc, ctype);
+       __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE);
        return ret;
 }
 
 /* remove redundant charge if migration failed*/
 void mem_cgroup_end_migration(struct mem_cgroup *mem,
-       struct page *oldpage, struct page *newpage)
+       struct page *oldpage, struct page *newpage, bool migration_ok)
 {
        struct page *used, *unused;
        struct page_cgroup *pc;
@@ -2865,8 +2905,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
                return;
        /* blocks rmdir() */
        cgroup_exclude_rmdir(&mem->css);
-       /* at migration success, oldpage->mapping is NULL. */
-       if (oldpage->mapping) {
+       if (!migration_ok) {
                used = oldpage;
                unused = newpage;
        } else {
@@ -4176,13 +4215,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
         */
        if (!node_state(node, N_NORMAL_MEMORY))
                tmp = -1;
-       pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
+       pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
        if (!pn)
                return 1;
 
        mem->info.nodeinfo[node] = pn;
-       memset(pn, 0, sizeof(*pn));
-
        for (zone = 0; zone < MAX_NR_ZONES; zone++) {
                mz = &pn->zoneinfo[zone];
                for_each_lru(l)
@@ -4206,14 +4243,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 
        /* Can be very big if MAX_NUMNODES is very big */
        if (size < PAGE_SIZE)
-               mem = kmalloc(size, GFP_KERNEL);
+               mem = kzalloc(size, GFP_KERNEL);
        else
-               mem = vmalloc(size);
+               mem = vzalloc(size);
 
        if (!mem)
                return NULL;
 
-       memset(mem, 0, size);
        mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
        if (!mem->stat)
                goto out_free;
@@ -4461,7 +4497,8 @@ one_by_one:
                        batch_count = PRECHARGE_COUNT_AT_ONCE;
                        cond_resched();
                }
-               ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+               ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
+                                             PAGE_SIZE);
                if (ret || !mem)
                        /* mem_cgroup_clear_mc() will do uncharge later */
                        return -ENOMEM;
@@ -4623,6 +4660,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
        pte_t *pte;
        spinlock_t *ptl;
 
+       VM_BUG_ON(pmd_trans_huge(*pmd));
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        for (; addr != end; pte++, addr += PAGE_SIZE)
                if (is_target_pte_for_mc(vma, addr, *pte, NULL))
@@ -4638,7 +4676,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
        unsigned long precharge;
        struct vm_area_struct *vma;
 
-       /* We've already held the mmap_sem */
+       down_read(&mm->mmap_sem);
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                struct mm_walk mem_cgroup_count_precharge_walk = {
                        .pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4650,6 +4688,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
                walk_page_range(vma->vm_start, vma->vm_end,
                                        &mem_cgroup_count_precharge_walk);
        }
+       up_read(&mm->mmap_sem);
 
        precharge = mc.precharge;
        mc.precharge = 0;
@@ -4659,10 +4698,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 
 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
 {
-       return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
+       unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+       VM_BUG_ON(mc.moving_task);
+       mc.moving_task = current;
+       return mem_cgroup_do_precharge(precharge);
 }
 
-static void mem_cgroup_clear_mc(void)
+/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
+static void __mem_cgroup_clear_mc(void)
 {
        struct mem_cgroup *from = mc.from;
        struct mem_cgroup *to = mc.to;
@@ -4697,23 +4741,28 @@ static void mem_cgroup_clear_mc(void)
                                                PAGE_SIZE * mc.moved_swap);
                }
                /* we've already done mem_cgroup_get(mc.to) */
-
                mc.moved_swap = 0;
        }
-       if (mc.mm) {
-               up_read(&mc.mm->mmap_sem);
-               mmput(mc.mm);
-       }
+       memcg_oom_recover(from);
+       memcg_oom_recover(to);
+       wake_up_all(&mc.waitq);
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+       struct mem_cgroup *from = mc.from;
+
+       /*
+        * we must clear moving_task before waking up waiters at the end of
+        * task migration.
+        */
+       mc.moving_task = NULL;
+       __mem_cgroup_clear_mc();
        spin_lock(&mc.lock);
        mc.from = NULL;
        mc.to = NULL;
        spin_unlock(&mc.lock);
-       mc.moving_task = NULL;
-       mc.mm = NULL;
        mem_cgroup_end_move(from);
-       memcg_oom_recover(from);
-       memcg_oom_recover(to);
-       wake_up_all(&mc.waitq);
 }
 
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -4735,38 +4784,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
                        return 0;
                /* We move charges only when we move a owner of the mm */
                if (mm->owner == p) {
-                       /*
-                        * We do all the move charge works under one mmap_sem to
-                        * avoid deadlock with down_write(&mmap_sem)
-                        * -> try_charge() -> if (mc.moving_task) -> sleep.
-                        */
-                       down_read(&mm->mmap_sem);
-
                        VM_BUG_ON(mc.from);
                        VM_BUG_ON(mc.to);
                        VM_BUG_ON(mc.precharge);
                        VM_BUG_ON(mc.moved_charge);
                        VM_BUG_ON(mc.moved_swap);
-                       VM_BUG_ON(mc.moving_task);
-                       VM_BUG_ON(mc.mm);
-
                        mem_cgroup_start_move(from);
                        spin_lock(&mc.lock);
                        mc.from = from;
                        mc.to = mem;
-                       mc.precharge = 0;
-                       mc.moved_charge = 0;
-                       mc.moved_swap = 0;
                        spin_unlock(&mc.lock);
-                       mc.moving_task = current;
-                       mc.mm = mm;
+                       /* We set mc.moving_task later */
 
                        ret = mem_cgroup_precharge_mc(mm);
                        if (ret)
                                mem_cgroup_clear_mc();
-                       /* We call up_read() and mmput() in clear_mc(). */
-               } else
-                       mmput(mm);
+               }
+               mmput(mm);
        }
        return ret;
 }
@@ -4789,6 +4823,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
        spinlock_t *ptl;
 
 retry:
+       VM_BUG_ON(pmd_trans_huge(*pmd));
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        for (; addr != end; addr += PAGE_SIZE) {
                pte_t ptent = *(pte++);
@@ -4854,7 +4889,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
        struct vm_area_struct *vma;
 
        lru_add_drain_all();
-       /* We've already held the mmap_sem */
+retry:
+       if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+               /*
+                * Someone who is holding the mmap_sem might be waiting on
+                * the waitq. So we cancel all extra charges, wake up all waiters,
+                * and retry. Because we cancel precharges, we might not be able
+                * to move enough charges, but moving charge is a best-effort
+                * feature anyway, so it wouldn't be a big problem.
+                */
+               __mem_cgroup_clear_mc();
+               cond_resched();
+               goto retry;
+       }
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                int ret;
                struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4873,6 +4920,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
                         */
                        break;
        }
+       up_read(&mm->mmap_sem);
 }
 
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4881,11 +4929,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
                                struct task_struct *p,
                                bool threadgroup)
 {
-       if (!mc.mm)
+       struct mm_struct *mm;
+
+       if (!mc.to)
                /* no need to move charge */
                return;
 
-       mem_cgroup_move_charge(mc.mm);
+       mm = get_task_mm(p);
+       if (mm) {
+               mem_cgroup_move_charge(mm);
+               mmput(mm);
+       }
        mem_cgroup_clear_mc();
 }
 #else  /* !CONFIG_MMU */
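
The memcontrol.c changes fall into two groups: charging and uncharging transparent huge pages in page_size units (the new argument threaded through __mem_cgroup_try_charge() and friends), and reworking the move-charge path so mmap_sem is only taken inside mem_cgroup_count_precharge()/mem_cgroup_move_charge(), with __mem_cgroup_clear_mc() cancelling precharges when the trylock fails. A hedged sketch of the userspace sequence that exercises the move path; it assumes a v1 memory cgroup mounted at /sys/fs/cgroup/memory with a child group "demo" already created, and both file names are the stock cgroup-v1 memcg interface:

/* memcg_move.c - enable charge moving, then migrate this task */
#include <stdio.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return -1;
        }
        fprintf(f, "%s", val);
        return fclose(f);
}

int main(void)
{
        char pid[32];

        /* bit 0: move charges of anonymous pages along with the task */
        write_str("/sys/fs/cgroup/memory/demo/memory.move_charge_at_immigrate",
                  "1\n");
        snprintf(pid, sizeof(pid), "%d\n", getpid());
        /* migrating the task is what triggers the precharge + move above */
        return write_str("/sys/fs/cgroup/memory/demo/tasks", pid);
}
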
index 46ab2c0..548fbd7 100644 (file)
@@ -203,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
 #ifdef __ARCH_SI_TRAPNO
        si.si_trapno = trapno;
 #endif
-       si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
+       si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
        /*
         * Don't use force here, it's convenient if the signal
         * can be temporarily blocked.
@@ -386,6 +386,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
        struct task_struct *tsk;
        struct anon_vma *av;
 
+       if (!PageHuge(page) && unlikely(split_huge_page(page)))
+               return;
        read_lock(&tasklist_lock);
        av = page_lock_anon_vma(page);
        if (av == NULL) /* Not actually mapped anymore */
@@ -928,7 +930,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 static void set_page_hwpoison_huge_page(struct page *hpage)
 {
        int i;
-       int nr_pages = 1 << compound_order(hpage);
+       int nr_pages = 1 << compound_trans_order(hpage);
        for (i = 0; i < nr_pages; i++)
                SetPageHWPoison(hpage + i);
 }
@@ -936,7 +938,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
 static void clear_page_hwpoison_huge_page(struct page *hpage)
 {
        int i;
-       int nr_pages = 1 << compound_order(hpage);
+       int nr_pages = 1 << compound_trans_order(hpage);
        for (i = 0; i < nr_pages; i++)
                ClearPageHWPoison(hpage + i);
 }
@@ -966,7 +968,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
                return 0;
        }
 
-       nr_pages = 1 << compound_order(hpage);
+       nr_pages = 1 << compound_trans_order(hpage);
        atomic_long_add(nr_pages, &mce_bad_pages);
 
        /*
@@ -1164,7 +1166,7 @@ int unpoison_memory(unsigned long pfn)
                return 0;
        }
 
-       nr_pages = 1 << compound_order(page);
+       nr_pages = 1 << compound_trans_order(page);
 
        if (!get_page_unless_zero(page)) {
                /*
@@ -1290,9 +1292,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
        /* Keep page count to indicate a given hugepage is isolated. */
 
        list_add(&hpage->lru, &pagelist);
-       ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+       ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
+                               true);
        if (ret) {
-                       putback_lru_pages(&pagelist);
+               putback_lru_pages(&pagelist);
                pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
                         pfn, ret, page->flags);
                if (ret > 0)
@@ -1301,7 +1304,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
        }
 done:
        if (!PageHWPoison(hpage))
-               atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
+               atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
        set_page_hwpoison_huge_page(hpage);
        dequeue_hwpoisoned_huge_page(hpage);
        /* keep elevated page count for bad page */
@@ -1413,7 +1416,8 @@ int soft_offline_page(struct page *page, int flags)
                LIST_HEAD(pagelist);
 
                list_add(&page->lru, &pagelist);
-               ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+               ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+                                                               0, true);
                if (ret) {
                        pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
                                pfn, ret, page->flags);
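
memory-failure.c now sizes poisoned compound pages with compound_trans_order(), splits a THP before walking its mappers in collect_procs_anon(), and passes the new offlining argument to migrate_pages()/migrate_huge_pages(). A sketch of the existing userspace trigger that lands in the modified soft_offline_page()/soft_offline_huge_page() paths (root only, needs CONFIG_MEMORY_FAILURE; the physical address below is a placeholder, not a value from this patch):

/* soft_offline.c - soft-offline one page via the sysfs trigger */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/devices/system/memory/soft_offline_page", "w");

        if (!f) {
                perror("soft_offline_page");
                return 1;
        }
        /* hypothetical physical address of the page to offline */
        fprintf(f, "0x%llx\n", 0x40000000ULL);
        return fclose(f) ? 1 : 0;
}
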
index 02e48aa..31250fa 100644 (file)
@@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
        }
 }
 
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+               pmd_t *pmd, unsigned long address)
 {
        pgtable_t new = pte_alloc_one(mm, address);
+       int wait_split_huge_page;
        if (!new)
                return -ENOMEM;
 
@@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
        smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
 
        spin_lock(&mm->page_table_lock);
-       if (!pmd_present(*pmd)) {       /* Has another populated it ? */
+       wait_split_huge_page = 0;
+       if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
                mm->nr_ptes++;
                pmd_populate(mm, pmd, new);
                new = NULL;
-       }
+       } else if (unlikely(pmd_trans_splitting(*pmd)))
+               wait_split_huge_page = 1;
        spin_unlock(&mm->page_table_lock);
        if (new)
                pte_free(mm, new);
+       if (wait_split_huge_page)
+               wait_split_huge_page(vma->anon_vma, pmd);
        return 0;
 }
 
@@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
        smp_wmb(); /* See comment in __pte_alloc */
 
        spin_lock(&init_mm.page_table_lock);
-       if (!pmd_present(*pmd)) {       /* Has another populated it ? */
+       if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
                pmd_populate_kernel(&init_mm, pmd, new);
                new = NULL;
-       }
+       } else
+               VM_BUG_ON(pmd_trans_splitting(*pmd));
        spin_unlock(&init_mm.page_table_lock);
        if (new)
                pte_free_kernel(&init_mm, new);
@@ -719,9 +726,9 @@ out_set_pte:
        return 0;
 }
 
-static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-               pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
-               unsigned long addr, unsigned long end)
+int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+                  pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
+                  unsigned long addr, unsigned long end)
 {
        pte_t *orig_src_pte, *orig_dst_pte;
        pte_t *src_pte, *dst_pte;
@@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
        src_pmd = pmd_offset(src_pud, addr);
        do {
                next = pmd_addr_end(addr, end);
+               if (pmd_trans_huge(*src_pmd)) {
+                       int err;
+                       VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
+                       err = copy_huge_pmd(dst_mm, src_mm,
+                                           dst_pmd, src_pmd, addr, vma);
+                       if (err == -ENOMEM)
+                               return -ENOMEM;
+                       if (!err)
+                               continue;
+                       /* fall through */
+               }
                if (pmd_none_or_clear_bad(src_pmd))
                        continue;
                if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
@@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
+               if (pmd_trans_huge(*pmd)) {
+                       if (next-addr != HPAGE_PMD_SIZE) {
+                               VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
+                               split_huge_page_pmd(vma->vm_mm, pmd);
+                       } else if (zap_huge_pmd(tlb, vma, pmd)) {
+                               (*zap_work)--;
+                               continue;
+                       }
+                       /* fall through */
+               }
                if (pmd_none_or_clear_bad(pmd)) {
                        (*zap_work)--;
                        continue;
@@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
        pud = pud_offset(pgd, address);
        if (pud_none(*pud))
                goto no_page_table;
-       if (pud_huge(*pud)) {
+       if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
                BUG_ON(flags & FOLL_GET);
                page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
                goto out;
@@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd))
                goto no_page_table;
-       if (pmd_huge(*pmd)) {
+       if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
                BUG_ON(flags & FOLL_GET);
                page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
                goto out;
        }
+       if (pmd_trans_huge(*pmd)) {
+               if (flags & FOLL_SPLIT) {
+                       split_huge_page_pmd(mm, pmd);
+                       goto split_fallthrough;
+               }
+               spin_lock(&mm->page_table_lock);
+               if (likely(pmd_trans_huge(*pmd))) {
+                       if (unlikely(pmd_trans_splitting(*pmd))) {
+                               spin_unlock(&mm->page_table_lock);
+                               wait_split_huge_page(vma->anon_vma, pmd);
+                       } else {
+                               page = follow_trans_huge_pmd(mm, address,
+                                                            pmd, flags);
+                               spin_unlock(&mm->page_table_lock);
+                               goto out;
+                       }
+               } else
+                       spin_unlock(&mm->page_table_lock);
+               /* fall through */
+       }
+split_fallthrough:
        if (unlikely(pmd_bad(*pmd)))
                goto no_page_table;
 
@@ -1310,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                 */
                mark_page_accessed(page);
        }
+       if (flags & FOLL_MLOCK) {
+               /*
+                * The preliminary mapping check is mainly to avoid the
+                * pointless overhead of lock_page on the ZERO_PAGE
+                * which might bounce very badly if there is contention.
+                *
+                * If the page is already locked, we don't need to
+                * handle it now - vmscan will handle it later if and
+                * when it attempts to reclaim the page.
+                */
+               if (page->mapping && trylock_page(page)) {
+                       lru_add_drain();  /* push cached pages to LRU */
+                       /*
+                        * Because we lock page here and migration is
+                        * blocked by the pte's page reference, we need
+                        * only check for file-cache page truncation.
+                        */
+                       if (page->mapping)
+                               mlock_vma_page(page);
+                       unlock_page(page);
+               }
+       }
 unlock:
        pte_unmap_unlock(ptep, ptl);
 out:
@@ -1341,7 +1412,8 @@ no_page_table:
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                     unsigned long start, int nr_pages, unsigned int gup_flags,
-                    struct page **pages, struct vm_area_struct **vmas)
+                    struct page **pages, struct vm_area_struct **vmas,
+                    int *nonblocking)
 {
        int i;
        unsigned long vm_flags;
@@ -1386,6 +1458,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                        pmd = pmd_offset(pud, pg);
                        if (pmd_none(*pmd))
                                return i ? : -EFAULT;
+                       VM_BUG_ON(pmd_trans_huge(*pmd));
                        pte = pte_offset_map(pmd, pg);
                        if (pte_none(*pte)) {
                                pte_unmap(pte);
@@ -1441,10 +1514,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                        cond_resched();
                        while (!(page = follow_page(vma, start, foll_flags))) {
                                int ret;
+                               unsigned int fault_flags = 0;
+
+                               if (foll_flags & FOLL_WRITE)
+                                       fault_flags |= FAULT_FLAG_WRITE;
+                               if (nonblocking)
+                                       fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 
                                ret = handle_mm_fault(mm, vma, start,
-                                       (foll_flags & FOLL_WRITE) ?
-                                       FAULT_FLAG_WRITE : 0);
+                                                       fault_flags);
 
                                if (ret & VM_FAULT_ERROR) {
                                        if (ret & VM_FAULT_OOM)
@@ -1460,6 +1538,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                else
                                        tsk->min_flt++;
 
+                               if (ret & VM_FAULT_RETRY) {
+                                       *nonblocking = 0;
+                                       return i;
+                               }
+
                                /*
                                 * The VM_FAULT_WRITE bit tells us that
                                 * do_wp_page has broken COW when necessary,
@@ -1559,7 +1642,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
        if (force)
                flags |= FOLL_FORCE;
 
-       return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
+       return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+                               NULL);
 }
 EXPORT_SYMBOL(get_user_pages);
 
@@ -1584,7 +1668,8 @@ struct page *get_dump_page(unsigned long addr)
        struct page *page;
 
        if (__get_user_pages(current, current->mm, addr, 1,
-                       FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
+                            FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
+                            NULL) < 1)
                return NULL;
        flush_cache_page(vma, addr, page_to_pfn(page));
        return page;
@@ -1598,8 +1683,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
        pud_t * pud = pud_alloc(mm, pgd, addr);
        if (pud) {
                pmd_t * pmd = pmd_alloc(mm, pud, addr);
-               if (pmd)
+               if (pmd) {
+                       VM_BUG_ON(pmd_trans_huge(*pmd));
                        return pte_alloc_map_lock(mm, pmd, addr, ptl);
+               }
        }
        return NULL;
 }
@@ -1818,6 +1905,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
        pmd = pmd_alloc(mm, pud, addr);
        if (!pmd)
                return -ENOMEM;
+       VM_BUG_ON(pmd_trans_huge(*pmd));
        do {
                next = pmd_addr_end(addr, end);
                if (remap_pte_range(mm, pmd, addr, next,
@@ -2048,19 +2136,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
        return same;
 }
 
-/*
- * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
- * servicing faults for write access.  In the normal case, do always want
- * pte_mkwrite.  But get_user_pages can cause write faults for mappings
- * that do not have writing enabled, when used by access_process_vm.
- */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
-{
-       if (likely(vma->vm_flags & VM_WRITE))
-               pte = pte_mkwrite(pte);
-       return pte;
-}
-
 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
 {
        /*
@@ -2112,7 +2187,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
        struct page *old_page, *new_page;
        pte_t entry;
-       int reuse = 0, ret = 0;
+       int ret = 0;
        int page_mkwrite = 0;
        struct page *dirty_page = NULL;
 
@@ -2149,14 +2224,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        }
                        page_cache_release(old_page);
                }
-               reuse = reuse_swap_page(old_page);
-               if (reuse)
+               if (reuse_swap_page(old_page)) {
                        /*
                         * The page is all ours.  Move it to our anon_vma so
                         * the rmap code will not search our parent or siblings.
                         * Protected against the rmap code by the page lock.
                         */
                        page_move_anon_rmap(old_page, vma, address);
+                       unlock_page(old_page);
+                       goto reuse;
+               }
                unlock_page(old_page);
        } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                                        (VM_WRITE|VM_SHARED))) {
@@ -2220,18 +2297,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                }
                dirty_page = old_page;
                get_page(dirty_page);
-               reuse = 1;
-       }
 
-       if (reuse) {
 reuse:
                flush_cache_page(vma, address, pte_pfn(orig_pte));
                entry = pte_mkyoung(orig_pte);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                if (ptep_set_access_flags(vma, address, page_table, entry,1))
                        update_mmu_cache(vma, address, page_table);
+               pte_unmap_unlock(page_table, ptl);
                ret |= VM_FAULT_WRITE;
-               goto unlock;
+
+               if (!dirty_page)
+                       return ret;
+
+               /*
+                * Yes, Virginia, this is actually required to prevent a race
+                * with clear_page_dirty_for_io() from clearing the page dirty
+                * bit after it clears all dirty ptes, but before a racing
+                * do_wp_page installs a dirty pte.
+                *
+                * do_no_page is protected similarly.
+                */
+               if (!page_mkwrite) {
+                       wait_on_page_locked(dirty_page);
+                       set_page_dirty_balance(dirty_page, page_mkwrite);
+               }
+               put_page(dirty_page);
+               if (page_mkwrite) {
+                       struct address_space *mapping = dirty_page->mapping;
+
+                       set_page_dirty(dirty_page);
+                       unlock_page(dirty_page);
+                       page_cache_release(dirty_page);
+                       if (mapping)    {
+                               /*
+                                * Some device drivers do not set page.mapping
+                                * but still dirty their pages
+                                */
+                               balance_dirty_pages_ratelimited(mapping);
+                       }
+               }
+
+               /* file_update_time outside page_lock */
+               if (vma->vm_file)
+                       file_update_time(vma->vm_file);
+
+               return ret;
        }
 
        /*
@@ -2337,39 +2448,6 @@ gotten:
                page_cache_release(old_page);
 unlock:
        pte_unmap_unlock(page_table, ptl);
-       if (dirty_page) {
-               /*
-                * Yes, Virginia, this is actually required to prevent a race
-                * with clear_page_dirty_for_io() from clearing the page dirty
-                * bit after it clear all dirty ptes, but before a racing
-                * do_wp_page installs a dirty pte.
-                *
-                * do_no_page is protected similarly.
-                */
-               if (!page_mkwrite) {
-                       wait_on_page_locked(dirty_page);
-                       set_page_dirty_balance(dirty_page, page_mkwrite);
-               }
-               put_page(dirty_page);
-               if (page_mkwrite) {
-                       struct address_space *mapping = dirty_page->mapping;
-
-                       set_page_dirty(dirty_page);
-                       unlock_page(dirty_page);
-                       page_cache_release(dirty_page);
-                       if (mapping)    {
-                               /*
-                                * Some device drivers do not set page.mapping
-                                * but still dirty their pages
-                                */
-                               balance_dirty_pages_ratelimited(mapping);
-                       }
-               }
-
-               /* file_update_time outside page_lock */
-               if (vma->vm_file)
-                       file_update_time(vma->vm_file);
-       }
        return ret;
 oom_free_new:
        page_cache_release(new_page);
@@ -3147,9 +3225,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static inline int handle_pte_fault(struct mm_struct *mm,
-               struct vm_area_struct *vma, unsigned long address,
-               pte_t *pte, pmd_t *pmd, unsigned int flags)
+int handle_pte_fault(struct mm_struct *mm,
+                    struct vm_area_struct *vma, unsigned long address,
+                    pte_t *pte, pmd_t *pmd, unsigned int flags)
 {
        pte_t entry;
        spinlock_t *ptl;
@@ -3228,9 +3306,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        pmd = pmd_alloc(mm, pud, address);
        if (!pmd)
                return VM_FAULT_OOM;
-       pte = pte_alloc_map(mm, pmd, address);
-       if (!pte)
+       if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
+               if (!vma->vm_ops)
+                       return do_huge_pmd_anonymous_page(mm, vma, address,
+                                                         pmd, flags);
+       } else {
+               pmd_t orig_pmd = *pmd;
+               barrier();
+               if (pmd_trans_huge(orig_pmd)) {
+                       if (flags & FAULT_FLAG_WRITE &&
+                           !pmd_write(orig_pmd) &&
+                           !pmd_trans_splitting(orig_pmd))
+                               return do_huge_pmd_wp_page(mm, vma, address,
+                                                          pmd, orig_pmd);
+                       return 0;
+               }
+       }
+
+       /*
+        * Use __pte_alloc instead of pte_alloc_map, because we can't
+        * run pte_offset_map on the pmd, if a huge pmd could
+        * materialize from under us from a different thread.
+        */
+       if (unlikely(__pte_alloc(mm, vma, pmd, address)))
                return VM_FAULT_OOM;
+       /* if a huge pmd materialized from under us, just retry later */
+       if (unlikely(pmd_trans_huge(*pmd)))
+               return 0;
+       /*
+        * A regular pmd is established and it can't morph into a huge pmd
+        * from under us anymore at this point because we hold the mmap_sem in
+        * read mode and khugepaged takes it in write mode. So now it's
+        * safe to run pte_offset_map().
+        */
+       pte = pte_offset_map(pmd, address);
 
        return handle_pte_fault(mm, vma, address, pte, pmd, flags);
 }
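
In handle_mm_fault() above, an empty pmd in an anonymous VMA (no vm_ops) can now be filled by do_huge_pmd_anonymous_page() when transparent hugepages are enabled, and an existing huge pmd is either reused or COW-handled by do_huge_pmd_wp_page(). From user space the effect can be observed by touching a 2 MiB-aligned anonymous region; a minimal sketch, assuming a 2 MiB huge page size and that MADV_HUGEPAGE is exposed by <sys/mman.h> (with this series, /proc/self/smaps accounts such memory under AnonHugePages):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>

    #define HPAGE_SIZE (2UL * 1024 * 1024)    /* assumed huge page size */

    int main(void)
    {
        void *buf;

        /* 2 MiB-aligned anonymous memory so a single pmd can map it */
        if (posix_memalign(&buf, HPAGE_SIZE, HPAGE_SIZE) != 0) {
            perror("posix_memalign");
            return 1;
        }
        /* ask for transparent huge pages on this range (advisory) */
        if (madvise(buf, HPAGE_SIZE, MADV_HUGEPAGE) != 0)
            perror("madvise(MADV_HUGEPAGE)");

        /* first touch triggers the anonymous fault; with THP enabled the
         * kernel may satisfy it with one huge pmd instead of 512 ptes */
        memset(buf, 0, HPAGE_SIZE);

        printf("touched %lu bytes at %p; check AnonHugePages in /proc/self/smaps\n",
               HPAGE_SIZE, buf);
        getchar();    /* keep the mapping alive while smaps is inspected */
        free(buf);
        return 0;
    }
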
@@ -3296,7 +3405,12 @@ int make_pages_present(unsigned long addr, unsigned long end)
        vma = find_vma(current->mm, addr);
        if (!vma)
                return -ENOMEM;
-       write = (vma->vm_flags & VM_WRITE) != 0;
+       /*
+        * We want to touch writable mappings with a write fault in order
+        * to break COW, except for shared mappings because these don't COW
+        * and we would not want to dirty them for nothing.
+        */
+       write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
        BUG_ON(addr >= end);
        BUG_ON(end > vma->vm_end);
        len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
@@ -3368,6 +3482,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address,
                goto out;
 
        pmd = pmd_offset(pud, address);
+       VM_BUG_ON(pmd_trans_huge(*pmd));
        if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
                goto out;
 
@@ -3608,3 +3723,74 @@ void might_fault(void)
 }
 EXPORT_SYMBOL(might_fault);
 #endif
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+static void clear_gigantic_page(struct page *page,
+                               unsigned long addr,
+                               unsigned int pages_per_huge_page)
+{
+       int i;
+       struct page *p = page;
+
+       might_sleep();
+       for (i = 0; i < pages_per_huge_page;
+            i++, p = mem_map_next(p, page, i)) {
+               cond_resched();
+               clear_user_highpage(p, addr + i * PAGE_SIZE);
+       }
+}
+void clear_huge_page(struct page *page,
+                    unsigned long addr, unsigned int pages_per_huge_page)
+{
+       int i;
+
+       if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+               clear_gigantic_page(page, addr, pages_per_huge_page);
+               return;
+       }
+
+       might_sleep();
+       for (i = 0; i < pages_per_huge_page; i++) {
+               cond_resched();
+               clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+       }
+}
+
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
+                                   unsigned long addr,
+                                   struct vm_area_struct *vma,
+                                   unsigned int pages_per_huge_page)
+{
+       int i;
+       struct page *dst_base = dst;
+       struct page *src_base = src;
+
+       for (i = 0; i < pages_per_huge_page; ) {
+               cond_resched();
+               copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+
+               i++;
+               dst = mem_map_next(dst, dst_base, i);
+               src = mem_map_next(src, src_base, i);
+       }
+}
+
+void copy_user_huge_page(struct page *dst, struct page *src,
+                        unsigned long addr, struct vm_area_struct *vma,
+                        unsigned int pages_per_huge_page)
+{
+       int i;
+
+       if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+               copy_user_gigantic_page(dst, src, addr, vma,
+                                       pages_per_huge_page);
+               return;
+       }
+
+       might_sleep();
+       for (i = 0; i < pages_per_huge_page; i++) {
+               cond_resched();
+               copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
+       }
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
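
clear_huge_page() and copy_user_huge_page() above deliberately work one PAGE_SIZE sub-page at a time with cond_resched() between iterations, and fall back to the mem_map_next()-based walkers for gigantic pages larger than MAX_ORDER_NR_PAGES. A rough user-space analogue of the same chunking idea, with made-up sizes (not kernel code):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define PAGE_SZ  4096UL
    #define HPAGE_SZ (2UL * 1024 * 1024)    /* illustrative 2 MiB "huge page" */

    /* Clear a large buffer one page-sized chunk at a time, mirroring the
     * per-sub-page loop above; in the kernel the pause point between chunks
     * is cond_resched(). */
    static void clear_chunked(unsigned char *buf, size_t len)
    {
        for (size_t off = 0; off < len; off += PAGE_SZ) {
            size_t chunk = len - off < PAGE_SZ ? len - off : PAGE_SZ;

            memset(buf + off, 0, chunk);
        }
    }

    int main(void)
    {
        unsigned char *buf = malloc(HPAGE_SZ);

        if (!buf)
            return 1;
        clear_chunked(buf, HPAGE_SZ);
        printf("cleared %lu bytes in %lu-byte chunks\n", HPAGE_SZ, PAGE_SZ);
        free(buf);
        return 0;
    }
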
index 2c6523a..e92f047 100644
@@ -82,9 +82,10 @@ static void release_memory_resource(struct resource *res)
 
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 #ifndef CONFIG_SPARSEMEM_VMEMMAP
-static void get_page_bootmem(unsigned long info,  struct page *page, int type)
+static void get_page_bootmem(unsigned long info,  struct page *page,
+                            unsigned long type)
 {
-       atomic_set(&page->_mapcount, type);
+       page->lru.next = (struct list_head *) type;
        SetPagePrivate(page);
        set_page_private(page, info);
        atomic_inc(&page->_count);
@@ -94,15 +95,16 @@ static void get_page_bootmem(unsigned long info,  struct page *page, int type)
  * so use __ref to tell modpost not to generate a warning */
 void __ref put_page_bootmem(struct page *page)
 {
-       int type;
+       unsigned long type;
 
-       type = atomic_read(&page->_mapcount);
-       BUG_ON(type >= -1);
+       type = (unsigned long) page->lru.next;
+       BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
+              type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
 
        if (atomic_dec_return(&page->_count) == 1) {
                ClearPagePrivate(page);
                set_page_private(page, 0);
-               reset_page_mapcount(page);
+               INIT_LIST_HEAD(&page->lru);
                __free_pages_bootmem(page, 0);
        }
 
@@ -733,7 +735,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                        goto out;
                }
                /* this function returns # of failed pages */
-               ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
+               ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
+                                                               true, true);
                if (ret)
                        putback_lru_pages(&source);
        }
index 11ff260..368fc9d 100644
@@ -514,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
+               split_huge_page_pmd(vma->vm_mm, pmd);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -935,7 +936,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
                return PTR_ERR(vma);
 
        if (!list_empty(&pagelist)) {
-               err = migrate_pages(&pagelist, new_node_page, dest, 0);
+               err = migrate_pages(&pagelist, new_node_page, dest,
+                                                               false, true);
                if (err)
                        putback_lru_pages(&pagelist);
        }
@@ -1155,7 +1157,8 @@ static long do_mbind(unsigned long start, unsigned long len,
 
                if (!list_empty(&pagelist)) {
                        nr_failed = migrate_pages(&pagelist, new_vma_page,
-                                               (unsigned long)vma, 0);
+                                               (unsigned long)vma,
+                                               false, true);
                        if (nr_failed)
                                putback_lru_pages(&pagelist);
                }
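
do_mbind() above is the backend of the mbind(2) system call; when the MPOL_MF_MOVE flags are given it gathers misplaced pages and hands them to migrate_pages(), now in synchronous mode (sync = true). A minimal sketch of the user-facing call, assuming libnuma's <numaif.h> is installed (link with -lnuma) and that node 0 exists:

    #include <numaif.h>    /* mbind(), MPOL_BIND, MPOL_MF_MOVE */
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t len = 16 * 4096;
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        unsigned long nodemask = 1UL << 0;    /* node 0 only */

        if (buf == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        memset(buf, 0, len);    /* fault the pages in somewhere first */

        /* Bind the range to node 0 and ask the kernel to move pages that
         * ended up elsewhere; this enters do_mbind() and, for misplaced
         * pages, migrate_pages(). */
        if (mbind(buf, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
                  MPOL_MF_MOVE) != 0)
            perror("mbind");
        else
            printf("range bound to node 0\n");
        return 0;
    }
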
@@ -1308,16 +1311,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
 
        /* Find the mm_struct */
        rcu_read_lock();
-       read_lock(&tasklist_lock);
        task = pid ? find_task_by_vpid(pid) : current;
        if (!task) {
-               read_unlock(&tasklist_lock);
                rcu_read_unlock();
                err = -ESRCH;
                goto out;
        }
        mm = get_task_mm(task);
-       read_unlock(&tasklist_lock);
        rcu_read_unlock();
 
        err = -EINVAL;
@@ -1796,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 }
 
 /**
- *     alloc_page_vma  - Allocate a page for a VMA.
+ *     alloc_pages_vma - Allocate a page for a VMA.
  *
  *     @gfp:
  *      %GFP_USER    user allocation.
@@ -1805,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  *      %GFP_FS      allocation should not call back into a file system.
  *      %GFP_ATOMIC  don't sleep.
  *
+ *     @order: Order of the GFP allocation.
  *     @vma:  Pointer to VMA or NULL if not available.
  *     @addr: Virtual Address of the allocation. Must be inside the VMA.
  *
@@ -1818,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  *     Should be called with the mm_sem of the vma hold.
  */
 struct page *
-alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
+alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
+               unsigned long addr)
 {
        struct mempolicy *pol = get_vma_policy(current, vma, addr);
        struct zonelist *zl;
@@ -1830,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 
                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
                mpol_cond_put(pol);
-               page = alloc_page_interleave(gfp, 0, nid);
+               page = alloc_page_interleave(gfp, order, nid);
                put_mems_allowed();
                return page;
        }
@@ -1839,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
                /*
                 * slow path: ref counted shared policy
                 */
-               struct page *page =  __alloc_pages_nodemask(gfp, 0,
+               struct page *page =  __alloc_pages_nodemask(gfp, order,
                                                zl, policy_nodemask(gfp, pol));
                __mpol_put(pol);
                put_mems_allowed();
@@ -1848,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
        /*
         * fast path:  default or task policy
         */
-       page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+       page = __alloc_pages_nodemask(gfp, order, zl,
+                                     policy_nodemask(gfp, pol));
        put_mems_allowed();
        return page;
 }
index 6ae8a66..46fe8cc 100644
@@ -113,6 +113,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
                        goto out;
 
                pmd = pmd_offset(pud, addr);
+               if (pmd_trans_huge(*pmd))
+                       goto out;
                if (!pmd_present(*pmd))
                        goto out;
 
@@ -246,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 
        expected_count = 2 + page_has_private(page);
        if (page_count(page) != expected_count ||
-                       (struct page *)radix_tree_deref_slot(pslot) != page) {
+               radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
                spin_unlock_irq(&mapping->tree_lock);
                return -EAGAIN;
        }
@@ -318,7 +320,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
 
        expected_count = 2 + page_has_private(page);
        if (page_count(page) != expected_count ||
-           (struct page *)radix_tree_deref_slot(pslot) != page) {
+               radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
                spin_unlock_irq(&mapping->tree_lock);
                return -EAGAIN;
        }
@@ -614,13 +616,12 @@ static int move_to_new_page(struct page *newpage, struct page *page,
  * to the newly allocated page in newpage.
  */
 static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-                       struct page *page, int force, int offlining)
+                       struct page *page, int force, bool offlining, bool sync)
 {
        int rc = 0;
        int *result = NULL;
        struct page *newpage = get_new_page(page, private, &result);
        int remap_swapcache = 1;
-       int rcu_locked = 0;
        int charge = 0;
        struct mem_cgroup *mem = NULL;
        struct anon_vma *anon_vma = NULL;
@@ -632,6 +633,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
                /* page was freed from under us. So we are done. */
                goto move_newpage;
        }
+       if (unlikely(PageTransHuge(page)))
+               if (unlikely(split_huge_page(page)))
+                       goto move_newpage;
 
        /* prepare cgroup just returns 0 or -ENOMEM */
        rc = -EAGAIN;
@@ -639,6 +643,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
        if (!trylock_page(page)) {
                if (!force)
                        goto move_newpage;
+
+               /*
+                * It's not safe for direct compaction to call lock_page.
+                * For example, during page readahead pages are added locked
+                * to the LRU. Later, when the IO completes the pages are
+                * marked uptodate and unlocked. However, the queueing
+                * could be merging multiple pages for one bio (e.g.
+                * mpage_readpages). If an allocation happens for the
+                * second or third page, the process can end up locking
+                * the same page twice and deadlocking. Rather than
+                * trying to be clever about what pages can be locked,
+                * avoid the use of lock_page for direct compaction
+                * altogether.
+                */
+               if (current->flags & PF_MEMALLOC)
+                       goto move_newpage;
+
                lock_page(page);
        }
 
@@ -665,27 +686,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
        BUG_ON(charge);
 
        if (PageWriteback(page)) {
-               if (!force)
+               if (!force || !sync)
                        goto uncharge;
                wait_on_page_writeback(page);
        }
        /*
         * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
         * we cannot notice that anon_vma is freed while we migrates a page.
-        * This rcu_read_lock() delays freeing anon_vma pointer until the end
+        * This get_anon_vma() delays freeing anon_vma pointer until the end
         * of migration. File cache pages are no problem because of page_lock()
         * File Caches may use write_page() or lock_page() in migration, then,
         * just care Anon page here.
         */
        if (PageAnon(page)) {
-               rcu_read_lock();
-               rcu_locked = 1;
-
-               /* Determine how to safely use anon_vma */
-               if (!page_mapped(page)) {
-                       if (!PageSwapCache(page))
-                               goto rcu_unlock;
-
+               /*
+                * Only page_lock_anon_vma() understands the subtleties of
+                * getting a hold on an anon_vma from outside one of its mms.
+                */
+               anon_vma = page_lock_anon_vma(page);
+               if (anon_vma) {
+                       /*
+                        * Take a reference count on the anon_vma if the
+                        * page is mapped so that it is guaranteed to
+                        * exist when the page is remapped later
+                        */
+                       get_anon_vma(anon_vma);
+                       page_unlock_anon_vma(anon_vma);
+               } else if (PageSwapCache(page)) {
                        /*
                         * We cannot be sure that the anon_vma of an unmapped
                         * swapcache page is safe to use because we don't
@@ -700,13 +727,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
                         */
                        remap_swapcache = 0;
                } else {
-                       /*
-                        * Take a reference count on the anon_vma if the
-                        * page is mapped so that it is guaranteed to
-                        * exist when the page is remapped later
-                        */
-                       anon_vma = page_anon_vma(page);
-                       get_anon_vma(anon_vma);
+                       goto uncharge;
                }
        }
 
@@ -723,16 +744,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
         * free the metadata, so the page can be freed.
         */
        if (!page->mapping) {
-               if (!PageAnon(page) && page_has_private(page)) {
-                       /*
-                        * Go direct to try_to_free_buffers() here because
-                        * a) that's what try_to_release_page() would do anyway
-                        * b) we may be under rcu_read_lock() here, so we can't
-                        *    use GFP_KERNEL which is what try_to_release_page()
-                        *    needs to be effective.
-                        */
+               VM_BUG_ON(PageAnon(page));
+               if (page_has_private(page)) {
                        try_to_free_buffers(page);
-                       goto rcu_unlock;
+                       goto uncharge;
                }
                goto skip_unmap;
        }
@@ -746,17 +761,14 @@ skip_unmap:
 
        if (rc && remap_swapcache)
                remove_migration_ptes(page, page);
-rcu_unlock:
 
        /* Drop an anon_vma reference if we took one */
        if (anon_vma)
                drop_anon_vma(anon_vma);
 
-       if (rcu_locked)
-               rcu_read_unlock();
 uncharge:
        if (!charge)
-               mem_cgroup_end_migration(mem, page, newpage);
+               mem_cgroup_end_migration(mem, page, newpage, rc == 0);
 unlock:
        unlock_page(page);
 
@@ -810,12 +822,11 @@ move_newpage:
  */
 static int unmap_and_move_huge_page(new_page_t get_new_page,
                                unsigned long private, struct page *hpage,
-                               int force, int offlining)
+                               int force, bool offlining, bool sync)
 {
        int rc = 0;
        int *result = NULL;
        struct page *new_hpage = get_new_page(hpage, private, &result);
-       int rcu_locked = 0;
        struct anon_vma *anon_vma = NULL;
 
        if (!new_hpage)
@@ -824,18 +835,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
        rc = -EAGAIN;
 
        if (!trylock_page(hpage)) {
-               if (!force)
+               if (!force || !sync)
                        goto out;
                lock_page(hpage);
        }
 
        if (PageAnon(hpage)) {
-               rcu_read_lock();
-               rcu_locked = 1;
-
-               if (page_mapped(hpage)) {
-                       anon_vma = page_anon_vma(hpage);
-                       atomic_inc(&anon_vma->external_refcount);
+               anon_vma = page_lock_anon_vma(hpage);
+               if (anon_vma) {
+                       get_anon_vma(anon_vma);
+                       page_unlock_anon_vma(anon_vma);
                }
        }
 
@@ -847,16 +856,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
        if (rc)
                remove_migration_ptes(hpage, hpage);
 
-       if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
-                                           &anon_vma->lock)) {
-               int empty = list_empty(&anon_vma->head);
-               spin_unlock(&anon_vma->lock);
-               if (empty)
-                       anon_vma_free(anon_vma);
-       }
-
-       if (rcu_locked)
-               rcu_read_unlock();
+       if (anon_vma)
+               drop_anon_vma(anon_vma);
 out:
        unlock_page(hpage);
 
@@ -892,7 +893,8 @@ out:
  * Return: Number of pages not migrated or error code.
  */
 int migrate_pages(struct list_head *from,
-               new_page_t get_new_page, unsigned long private, int offlining)
+               new_page_t get_new_page, unsigned long private, bool offlining,
+               bool sync)
 {
        int retry = 1;
        int nr_failed = 0;
@@ -912,7 +914,8 @@ int migrate_pages(struct list_head *from,
                        cond_resched();
 
                        rc = unmap_and_move(get_new_page, private,
-                                               page, pass > 2, offlining);
+                                               page, pass > 2, offlining,
+                                               sync);
 
                        switch(rc) {
                        case -ENOMEM:
@@ -941,7 +944,8 @@ out:
 }
 
 int migrate_huge_pages(struct list_head *from,
-               new_page_t get_new_page, unsigned long private, int offlining)
+               new_page_t get_new_page, unsigned long private, bool offlining,
+               bool sync)
 {
        int retry = 1;
        int nr_failed = 0;
@@ -957,7 +961,8 @@ int migrate_huge_pages(struct list_head *from,
                        cond_resched();
 
                        rc = unmap_and_move_huge_page(get_new_page,
-                                       private, page, pass > 2, offlining);
+                                       private, page, pass > 2, offlining,
+                                       sync);
 
                        switch(rc) {
                        case -ENOMEM:
@@ -1042,7 +1047,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
                if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
                        goto set_status;
 
-               page = follow_page(vma, pp->addr, FOLL_GET);
+               page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
 
                err = PTR_ERR(page);
                if (IS_ERR(page))
@@ -1090,7 +1095,7 @@ set_status:
        err = 0;
        if (!list_empty(&pagelist)) {
                err = migrate_pages(&pagelist, new_page_node,
-                               (unsigned long)pm, 0);
+                               (unsigned long)pm, 0, true);
                if (err)
                        putback_lru_pages(&pagelist);
        }
index 9ac42dc..a4e6b9d 100644
@@ -154,6 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
+               if (pmd_trans_huge(*pmd)) {
+                       if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
+                               vec += (next - addr) >> PAGE_SHIFT;
+                               continue;
+                       }
+                       /* fall through */
+               }
                if (pmd_none_or_clear_bad(pmd))
                        mincore_unmapped_range(vma, addr, next, vec);
                else
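
mincore_pmd_range() above is reached from the mincore(2) system call; a transparent huge pmd can now fill a whole stretch of the result vector in one step (vec advances by (next - addr) >> PAGE_SHIFT) instead of forcing a split. A minimal sketch of the user-facing call:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/mman.h>

    int main(void)
    {
        long page = sysconf(_SC_PAGESIZE);
        size_t npages = 16;
        size_t len = npages * page;
        unsigned char *vec = malloc(npages);
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (!vec || buf == MAP_FAILED)
            return 1;

        memset(buf, 1, len / 2);    /* fault in the first half only */

        /* one byte per page; bit 0 is set when the page is resident */
        if (mincore(buf, len, vec) != 0) {
            perror("mincore");
            return 1;
        }
        for (size_t i = 0; i < npages; i++)
            printf("page %2zu: %s\n", i,
                   (vec[i] & 1) ? "resident" : "not resident");

        munmap(buf, len);
        free(vec);
        return 0;
    }
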
index b70919c..13e81ee 100644
@@ -155,13 +155,12 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
  * vma->vm_mm->mmap_sem must be held for at least read.
  */
 static long __mlock_vma_pages_range(struct vm_area_struct *vma,
-                                   unsigned long start, unsigned long end)
+                                   unsigned long start, unsigned long end,
+                                   int *nonblocking)
 {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long addr = start;
-       struct page *pages[16]; /* 16 gives a reasonable batch */
        int nr_pages = (end - start) / PAGE_SIZE;
-       int ret = 0;
        int gup_flags;
 
        VM_BUG_ON(start & ~PAGE_MASK);
@@ -170,73 +169,26 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
        VM_BUG_ON(end   > vma->vm_end);
        VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
 
-       gup_flags = FOLL_TOUCH | FOLL_GET;
-       if (vma->vm_flags & VM_WRITE)
+       gup_flags = FOLL_TOUCH;
+       /*
+        * We want to touch writable mappings with a write fault in order
+        * to break COW, except for shared mappings because these don't COW
+        * and we would not want to dirty them for nothing.
+        */
+       if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
                gup_flags |= FOLL_WRITE;
 
+       if (vma->vm_flags & VM_LOCKED)
+               gup_flags |= FOLL_MLOCK;
+
        /* We don't try to access the guard page of a stack vma */
        if (stack_guard_page(vma, start)) {
                addr += PAGE_SIZE;
                nr_pages--;
        }
 
-       while (nr_pages > 0) {
-               int i;
-
-               cond_resched();
-
-               /*
-                * get_user_pages makes pages present if we are
-                * setting mlock. and this extra reference count will
-                * disable migration of this page.  However, page may
-                * still be truncated out from under us.
-                */
-               ret = __get_user_pages(current, mm, addr,
-                               min_t(int, nr_pages, ARRAY_SIZE(pages)),
-                               gup_flags, pages, NULL);
-               /*
-                * This can happen for, e.g., VM_NONLINEAR regions before
-                * a page has been allocated and mapped at a given offset,
-                * or for addresses that map beyond end of a file.
-                * We'll mlock the pages if/when they get faulted in.
-                */
-               if (ret < 0)
-                       break;
-
-               lru_add_drain();        /* push cached pages to LRU */
-
-               for (i = 0; i < ret; i++) {
-                       struct page *page = pages[i];
-
-                       if (page->mapping) {
-                               /*
-                                * That preliminary check is mainly to avoid
-                                * the pointless overhead of lock_page on the
-                                * ZERO_PAGE: which might bounce very badly if
-                                * there is contention.  However, we're still
-                                * dirtying its cacheline with get/put_page:
-                                * we'll add another __get_user_pages flag to
-                                * avoid it if that case turns out to matter.
-                                */
-                               lock_page(page);
-                               /*
-                                * Because we lock page here and migration is
-                                * blocked by the elevated reference, we need
-                                * only check for file-cache page truncation.
-                                */
-                               if (page->mapping)
-                                       mlock_vma_page(page);
-                               unlock_page(page);
-                       }
-                       put_page(page); /* ref from get_user_pages() */
-               }
-
-               addr += ret * PAGE_SIZE;
-               nr_pages -= ret;
-               ret = 0;
-       }
-
-       return ret;     /* 0 or negative error code */
+       return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
+                               NULL, NULL, nonblocking);
 }
 
 /*
@@ -280,7 +232,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
                        is_vm_hugetlb_page(vma) ||
                        vma == get_gate_vma(current))) {
 
-               __mlock_vma_pages_range(vma, start, end);
+               __mlock_vma_pages_range(vma, start, end, NULL);
 
                /* Hide errors from mmap() and other callers */
                return 0;
@@ -372,18 +324,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
        int ret = 0;
        int lock = newflags & VM_LOCKED;
 
-       if (newflags == vma->vm_flags ||
-                       (vma->vm_flags & (VM_IO | VM_PFNMAP)))
+       if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
+           is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))
                goto out;       /* don't set VM_LOCKED,  don't count */
 
-       if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
-                       is_vm_hugetlb_page(vma) ||
-                       vma == get_gate_vma(current)) {
-               if (lock)
-                       make_pages_present(start, end);
-               goto out;       /* don't set VM_LOCKED,  don't count */
-       }
-
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
                          vma->vm_file, pgoff, vma_policy(vma));
@@ -419,14 +363,10 @@ success:
         * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
         */
 
-       if (lock) {
+       if (lock)
                vma->vm_flags = newflags;
-               ret = __mlock_vma_pages_range(vma, start, end);
-               if (ret < 0)
-                       ret = __mlock_posix_error_return(ret);
-       } else {
+       else
                munlock_vma_pages_range(vma, start, end);
-       }
 
 out:
        *prev = vma;
@@ -439,7 +379,8 @@ static int do_mlock(unsigned long start, size_t len, int on)
        struct vm_area_struct * vma, * prev;
        int error;
 
-       len = PAGE_ALIGN(len);
+       VM_BUG_ON(start & ~PAGE_MASK);
+       VM_BUG_ON(len != PAGE_ALIGN(len));
        end = start + len;
        if (end < start)
                return -EINVAL;
@@ -482,6 +423,62 @@ static int do_mlock(unsigned long start, size_t len, int on)
        return error;
 }
 
+static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
+{
+       struct mm_struct *mm = current->mm;
+       unsigned long end, nstart, nend;
+       struct vm_area_struct *vma = NULL;
+       int locked = 0;
+       int ret = 0;
+
+       VM_BUG_ON(start & ~PAGE_MASK);
+       VM_BUG_ON(len != PAGE_ALIGN(len));
+       end = start + len;
+
+       for (nstart = start; nstart < end; nstart = nend) {
+               /*
+                * We want to fault in pages for [nstart; end) address range.
+                * Find first corresponding VMA.
+                */
+               if (!locked) {
+                       locked = 1;
+                       down_read(&mm->mmap_sem);
+                       vma = find_vma(mm, nstart);
+               } else if (nstart >= vma->vm_end)
+                       vma = vma->vm_next;
+               if (!vma || vma->vm_start >= end)
+                       break;
+               /*
+                * Set [nstart; nend) to intersection of desired address
+                * range with the first VMA. Also, skip undesirable VMA types.
+                */
+               nend = min(end, vma->vm_end);
+               if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+                       continue;
+               if (nstart < vma->vm_start)
+                       nstart = vma->vm_start;
+               /*
+                * Now fault in a range of pages. __mlock_vma_pages_range()
+                * double checks the vma flags, so that it won't mlock pages
+                * if the vma was already munlocked.
+                */
+               ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
+               if (ret < 0) {
+                       if (ignore_errors) {
+                               ret = 0;
+                               continue;       /* continue at next VMA */
+                       }
+                       ret = __mlock_posix_error_return(ret);
+                       break;
+               }
+               nend = nstart + ret * PAGE_SIZE;
+               ret = 0;
+       }
+       if (locked)
+               up_read(&mm->mmap_sem);
+       return ret;     /* 0 or negative error code */
+}
+
 SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
 {
        unsigned long locked;
@@ -507,6 +504,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
        if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
                error = do_mlock(start, len, 1);
        up_write(&current->mm->mmap_sem);
+       if (!error)
+               error = do_mlock_pages(start, len, 0);
        return error;
 }
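
With the change above, sys_mlock() first marks the VMAs VM_LOCKED under mmap_sem held for writing and only afterwards faults the pages in through do_mlock_pages(), which retakes mmap_sem for reading and may drop it between faults. The user-visible contract is unchanged; a minimal sketch of the caller side, assuming RLIMIT_MEMLOCK permits the request:

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/resource.h>

    int main(void)
    {
        size_t len = 8 * 4096;
        struct rlimit rl;
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        getrlimit(RLIMIT_MEMLOCK, &rl);
        printf("RLIMIT_MEMLOCK cur=%llu\n", (unsigned long long)rl.rlim_cur);

        /* mlock() both sets VM_LOCKED and (after this patch, in a second
         * phase) faults every page in, so the memset below never faults */
        if (mlock(buf, len) != 0) {
            perror("mlock");
            return 1;
        }
        memset(buf, 0, len);
        munlock(buf, len);
        munmap(buf, len);
        return 0;
    }
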
 
@@ -571,6 +570,10 @@ SYSCALL_DEFINE1(mlockall, int, flags)
            capable(CAP_IPC_LOCK))
                ret = do_mlockall(flags);
        up_write(&current->mm->mmap_sem);
+       if (!ret && (flags & MCL_CURRENT)) {
+               /* Ignore errors */
+               do_mlock_pages(0, TASK_SIZE, 1);
+       }
 out:
        return ret;
 }
index 50a4aa0..2ec8eb5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/perf_event.h>
 #include <linux/audit.h>
+#include <linux/khugepaged.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -253,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
        down_write(&mm->mmap_sem);
 
 #ifdef CONFIG_COMPAT_BRK
-       min_brk = mm->end_code;
+       /*
+        * CONFIG_COMPAT_BRK can still be overridden by setting
+        * randomize_va_space to 2, which will still cause mm->start_brk
+        * to be arbitrarily shifted
+        */
+       if (mm->start_brk > PAGE_ALIGN(mm->end_data))
+               min_brk = mm->start_brk;
+       else
+               min_brk = mm->end_data;
 #else
        min_brk = mm->start_brk;
 #endif
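
The comparison added above matters because, as the comment notes, randomize_va_space = 2 can shift mm->start_brk above the end of the data segment even when CONFIG_COMPAT_BRK is set, so the minimum break is now taken from whichever bound applies. The break itself is what brk(2)/sbrk(3) manage; running the sketch below twice shows different break addresses when heap randomization is enabled:

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        void *initial = sbrk(0);    /* current program break */

        if (sbrk(4096) == (void *) -1) {    /* grow the heap by one page */
            perror("sbrk");
            return 1;
        }
        printf("initial break: %p\n", initial);
        printf("after +4096:   %p\n", sbrk(0));
        return 0;
    }
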
@@ -588,6 +597,8 @@ again:                      remove_next = 1 + (end > next->vm_end);
                }
        }
 
+       vma_adjust_trans_huge(vma, start, end, adjust_next);
+
        /*
         * When changing only vma->vm_end, we don't really need anon_vma
         * lock. This is a fairly rare case by itself, but the anon_vma
@@ -815,6 +826,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                                end, prev->vm_pgoff, NULL);
                if (err)
                        return NULL;
+               khugepaged_enter_vma_merge(prev);
                return prev;
        }
 
@@ -833,6 +845,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                                next->vm_pgoff - pglen, NULL);
                if (err)
                        return NULL;
+               khugepaged_enter_vma_merge(area);
                return area;
        }
 
@@ -1761,6 +1774,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
                }
        }
        vma_unlock_anon_vma(vma);
+       khugepaged_enter_vma_merge(vma);
        return error;
 }
 #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1808,6 +1822,7 @@ static int expand_downwards(struct vm_area_struct *vma,
                }
        }
        vma_unlock_anon_vma(vma);
+       khugepaged_enter_vma_merge(vma);
        return error;
 }
 
index 438951d..8d032de 100644
@@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
        return young;
 }
 
+int __mmu_notifier_test_young(struct mm_struct *mm,
+                             unsigned long address)
+{
+       struct mmu_notifier *mn;
+       struct hlist_node *n;
+       int young = 0;
+
+       rcu_read_lock();
+       hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+               if (mn->ops->test_young) {
+                       young = mn->ops->test_young(mn, mm, address);
+                       if (young)
+                               break;
+               }
+       }
+       rcu_read_unlock();
+
+       return young;
+}
+
 void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
                               pte_t pte)
 {
index e35bfb8..f5b7d17 100644
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
        return 1;
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-
-#ifdef CONFIG_SMP
-/* Called when a more accurate view of NR_FREE_PAGES is needed */
-unsigned long zone_nr_free_pages(struct zone *zone)
-{
-       unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
-
-       /*
-        * While kswapd is awake, it is considered the zone is under some
-        * memory pressure. Under pressure, there is a risk that
-        * per-cpu-counter-drift will allow the min watermark to be breached
-        * potentially causing a live-lock. While kswapd is awake and
-        * free pages are low, get a better estimate for free pages
-        */
-       if (nr_free_pages < zone->percpu_drift_mark &&
-                       !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
-               return zone_page_state_snapshot(zone, NR_FREE_PAGES);
-
-       return nr_free_pages;
-}
-#endif /* CONFIG_SMP */
index 4c51338..5a688a2 100644
@@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
        pte_unmap_unlock(pte - 1, ptl);
 }
 
-static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
+static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                unsigned long addr, unsigned long end, pgprot_t newprot,
                int dirty_accountable)
 {
@@ -88,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
+               if (pmd_trans_huge(*pmd)) {
+                       if (next - addr != HPAGE_PMD_SIZE)
+                               split_huge_page_pmd(vma->vm_mm, pmd);
+                       else if (change_huge_pmd(vma, pmd, addr, newprot))
+                               continue;
+                       /* fall through */
+               }
                if (pmd_none_or_clear_bad(pmd))
                        continue;
-               change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
+               change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
+                                dirty_accountable);
        } while (pmd++, addr = next, addr != end);
 }
 
-static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
+static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
                unsigned long addr, unsigned long end, pgprot_t newprot,
                int dirty_accountable)
 {
@@ -106,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
-               change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable);
+               change_pmd_range(vma, pud, addr, next, newprot,
+                                dirty_accountable);
        } while (pud++, addr = next, addr != end);
 }
 
@@ -126,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma,
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
-               change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable);
+               change_pud_range(vma, pgd, addr, next, newprot,
+                                dirty_accountable);
        } while (pgd++, addr = next, addr != end);
        flush_tlb_range(vma, start, end);
 }
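
change_pmd_range() above now handles a transparent huge pmd directly: if the mprotect range spans the whole huge mapping the protection is changed on the huge pmd by change_huge_pmd(), otherwise the pmd is split first and the code falls through to the per-pte loop. From user space this is plain mprotect(2) on part of a larger mapping; a minimal sketch:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/mman.h>

    int main(void)
    {
        long page = sysconf(_SC_PAGESIZE);
        size_t len = 4 * page;
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        memset(buf, 0, len);    /* fault the pages in as read-write */

        /* Revoke write permission on the first page only; had the region
         * been backed by a huge pmd, the kernel would split it here because
         * the range does not cover the whole huge mapping. */
        if (mprotect(buf, page, PROT_READ) != 0) {
            perror("mprotect");
            return 1;
        }
        printf("first page is now read-only, remaining %zu bytes stay read-write\n",
               len - page);
        munmap(buf, len);
        return 0;
    }
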
index 563fbdd..9925b63 100644
@@ -41,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
                return NULL;
 
        pmd = pmd_offset(pud, addr);
+       split_huge_page_pmd(mm, pmd);
        if (pmd_none_or_clear_bad(pmd))
                return NULL;
 
        return pmd;
 }
 
-static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+                           unsigned long addr)
 {
        pgd_t *pgd;
        pud_t *pud;
@@ -62,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
        if (!pmd)
                return NULL;
 
-       if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
+       VM_BUG_ON(pmd_trans_huge(*pmd));
+       if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
                return NULL;
 
        return pmd;
@@ -147,7 +150,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                old_pmd = get_old_pmd(vma->vm_mm, old_addr);
                if (!old_pmd)
                        continue;
-               new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
+               new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
                if (!new_pmd)
                        break;
                next = (new_addr + PMD_SIZE) & PMD_MASK;
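
move_page_tables() above is the engine behind mremap(2): get_old_pmd() now splits any huge pmd in the source range before the ptes are moved, and alloc_new_pmd() takes the vma and asserts the destination pmd is not a transparent huge pmd before allocating a pte page. A minimal sketch of the system call it serves, using MREMAP_MAYMOVE:

    #define _GNU_SOURCE    /* for mremap() and MREMAP_MAYMOVE */
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t old_len = 4096, new_len = 8 * 4096;
        char *buf = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        strcpy(buf, "payload that must survive the move");

        /* Grow the mapping; the kernel may relocate it, in which case the
         * page tables are carried over by move_page_tables(). */
        char *moved = mremap(buf, old_len, new_len, MREMAP_MAYMOVE);
        if (moved == MAP_FAILED) {
            perror("mremap");
            return 1;
        }
        printf("old %p -> new %p, contents: %s\n",
               (void *)buf, (void *)moved, moved);
        munmap(moved, new_len);
        return 0;
    }
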
index ef4045d..f59e142 100644
@@ -127,7 +127,8 @@ unsigned int kobjsize(const void *objp)
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                     unsigned long start, int nr_pages, unsigned int foll_flags,
-                    struct page **pages, struct vm_area_struct **vmas)
+                    struct page **pages, struct vm_area_struct **vmas,
+                    int *retry)
 {
        struct vm_area_struct *vma;
        unsigned long vm_flags;
@@ -185,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
        if (force)
                flags |= FOLL_FORCE;
 
-       return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
+       return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+                               NULL);
 }
 EXPORT_SYMBOL(get_user_pages);
 
index b5d8a1f..2cb01f6 100644
@@ -410,9 +410,12 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
 {
        unsigned long background;
        unsigned long dirty;
-       unsigned long available_memory = determine_dirtyable_memory();
+       unsigned long uninitialized_var(available_memory);
        struct task_struct *tsk;
 
+       if (!vm_dirty_bytes || !dirty_background_bytes)
+               available_memory = determine_dirtyable_memory();
+
        if (vm_dirty_bytes)
                dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
        else
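
global_dirty_limits() above only calls determine_dirtyable_memory() when at least one of the two limits is ratio-based; a byte-based limit is a plain division by the page size. A small worked sketch of that arithmetic with made-up sysctl values (vm_dirty_bytes = 256 MiB, dirty_background_ratio = 10%, 4 KiB pages, 2 GiB of dirtyable memory):

    #include <stdio.h>

    #define PAGE_SIZE_BYTES 4096UL
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
        /* illustrative sysctl settings, not read from the running system */
        unsigned long vm_dirty_bytes = 256UL << 20;    /* hard limit, bytes */
        unsigned long dirty_background_ratio = 10;     /* background limit, % */
        unsigned long available_pages = (2UL << 30) / PAGE_SIZE_BYTES;

        /* byte-based limit: a plain division, no dirtyable-memory scan */
        unsigned long dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE_BYTES);
        /* ratio-based limit: needs the amount of dirtyable memory in pages */
        unsigned long background =
            (dirty_background_ratio * available_pages) / 100;

        printf("dirty limit:      %lu pages\n", dirty);
        printf("background limit: %lu pages\n", background);
        return 0;
    }
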
@@ -1103,7 +1106,7 @@ EXPORT_SYMBOL(write_one_page);
 int __set_page_dirty_no_writeback(struct page *page)
 {
        if (!PageDirty(page))
-               SetPageDirty(page);
+               return !TestSetPageDirty(page);
        return 0;
 }
 
index 826ba69..90c1439 100644
@@ -357,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order)
        }
 }
 
+/* update __split_huge_page_refcount if you change this function */
 static int destroy_compound_page(struct page *page, unsigned long order)
 {
        int i;
@@ -426,18 +427,10 @@ static inline void rmv_page_order(struct page *page)
  *
  * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
  */
-static inline struct page *
-__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
-{
-       unsigned long buddy_idx = page_idx ^ (1 << order);
-
-       return page + (buddy_idx - page_idx);
-}
-
 static inline unsigned long
-__find_combined_index(unsigned long page_idx, unsigned int order)
+__find_buddy_index(unsigned long page_idx, unsigned int order)
 {
-       return (page_idx & ~(1 << order));
+       return page_idx ^ (1 << order);
 }
 
 /*
@@ -448,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  * (c) a page and its buddy have the same order &&
  * (d) a page and its buddy are in the same zone.
  *
- * For recording whether a page is in the buddy system, we use PG_buddy.
- * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount to -2.
+ * Setting, clearing, and testing _mapcount == -2 is serialized by zone->lock.
  *
  * For recording page's order, we use page_private(page).
  */
@@ -482,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
  * as necessary, plus some accounting needed to play nicely with other
  * parts of the VM system.
  * At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with PG_buddy. Page's
+ * free pages of length (1 << order), marked with _mapcount == -2. The page's
  * order is recorded in the page_private(page) field.
  * So when we are allocating or freeing one, we can derive the state of the
  * other.  That is, if we allocate a small block, and both were   
@@ -499,6 +492,7 @@ static inline void __free_one_page(struct page *page,
 {
        unsigned long page_idx;
        unsigned long combined_idx;
+       unsigned long uninitialized_var(buddy_idx);
        struct page *buddy;
 
        if (unlikely(PageCompound(page)))
@@ -513,7 +507,8 @@ static inline void __free_one_page(struct page *page,
        VM_BUG_ON(bad_range(zone, page));
 
        while (order < MAX_ORDER-1) {
-               buddy = __page_find_buddy(page, page_idx, order);
+               buddy_idx = __find_buddy_index(page_idx, order);
+               buddy = page + (buddy_idx - page_idx);
                if (!page_is_buddy(page, buddy, order))
                        break;
 
@@ -521,7 +516,7 @@ static inline void __free_one_page(struct page *page,
                list_del(&buddy->lru);
                zone->free_area[order].nr_free--;
                rmv_page_order(buddy);
-               combined_idx = __find_combined_index(page_idx, order);
+               combined_idx = buddy_idx & page_idx;
                page = page + (combined_idx - page_idx);
                page_idx = combined_idx;
                order++;
@@ -538,9 +533,10 @@ static inline void __free_one_page(struct page *page,
         */
        if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
                struct page *higher_page, *higher_buddy;
-               combined_idx = __find_combined_index(page_idx, order);
-               higher_page = page + combined_idx - page_idx;
-               higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+               combined_idx = buddy_idx & page_idx;
+               higher_page = page + (combined_idx - page_idx);
+               buddy_idx = __find_buddy_index(combined_idx, order + 1);
+               higher_buddy = page + (buddy_idx - combined_idx);
                if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
                        list_add_tail(&page->lru,
                                &zone->free_area[order].free_list[migratetype]);
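
The rework above folds the old __page_find_buddy()/__find_combined_index() pair into plain index arithmetic: a block's buddy differs only in bit `order`, so XOR finds the buddy index and AND of the two indexes gives the start of the merged block. A standalone sketch of that arithmetic (illustrative only, not kernel code):

#include <stdio.h>

/* buddy of the block starting at page_idx with size 1 << order */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

int main(void)
{
	unsigned long page_idx = 12;	/* order-2 block covering pages 12..15 */
	unsigned int order = 2;
	unsigned long buddy_idx = find_buddy_index(page_idx, order);
	unsigned long combined_idx = buddy_idx & page_idx;

	/* buddy is the block 8..11; the merged order-3 block starts at 8 */
	printf("buddy=%lu combined=%lu\n", buddy_idx, combined_idx);
	return 0;
}
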
@@ -651,13 +647,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
        trace_mm_page_free_direct(page, order);
        kmemcheck_free_shadow(page, order);
 
-       for (i = 0; i < (1 << order); i++) {
-               struct page *pg = page + i;
-
-               if (PageAnon(pg))
-                       pg->mapping = NULL;
-               bad += free_pages_check(pg);
-       }
+       if (PageAnon(page))
+               page->mapping = NULL;
+       for (i = 0; i < (1 << order); i++)
+               bad += free_pages_check(page + i);
        if (bad)
                return false;
 
@@ -1460,24 +1453,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
 
 /*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-                     int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags, long free_pages)
 {
        /* free_pages may go negative - that's OK */
        long min = mark;
-       long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
        int o;
 
+       free_pages -= (1 << order) - 1;
        if (alloc_flags & ALLOC_HIGH)
                min -= min / 2;
        if (alloc_flags & ALLOC_HARDER)
                min -= min / 4;
 
        if (free_pages <= min + z->lowmem_reserve[classzone_idx])
-               return 0;
+               return false;
        for (o = 0; o < order; o++) {
                /* At the next order, this order's pages become unavailable */
                free_pages -= z->free_area[o].nr_free << o;
@@ -1486,9 +1479,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
                min >>= 1;
 
                if (free_pages <= min)
-                       return 0;
+                       return false;
        }
-       return 1;
+       return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags)
+{
+       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+                                       zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags)
+{
+       long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+       if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+               free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+                                                               free_pages);
 }
 
 #ifdef CONFIG_NUMA
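
zone_watermark_ok() is split so the core check can be fed an explicit free-page count, which lets zone_watermark_ok_safe() substitute a drift-corrected snapshot. The sketch below models only the per-order walk of __zone_watermark_ok(); lowmem_reserve and the ALLOC_HIGH/ALLOC_HARDER adjustments are omitted and all names are illustrative:

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

/* simplified model of the per-order watermark walk in the hunk above */
static bool watermark_ok(long free_pages, long min, unsigned int order,
			 const long nr_free[MAX_ORDER])
{
	unsigned int o;

	free_pages -= (1L << order) - 1;	/* the request itself is unusable */
	if (free_pages <= min)
		return false;
	for (o = 0; o < order; o++) {
		/* pages below the requested order cannot satisfy it */
		free_pages -= nr_free[o] << o;
		min >>= 1;
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	long nr_free[MAX_ORDER] = { 512, 64, 8, 2 };	/* free blocks per order */
	long total = 512 + 64 * 2 + 8 * 4 + 2 * 8;	/* 688 free pages */

	/* plenty of order-0 pages, yet the order-3 check fails */
	printf("order-3 ok: %d\n", watermark_ok(total, 128, 3, nr_free));
	return 0;
}
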
@@ -1793,15 +1805,18 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-       int migratetype, unsigned long *did_some_progress)
+       int migratetype, unsigned long *did_some_progress,
+       bool sync_migration)
 {
        struct page *page;
 
        if (!order || compaction_deferred(preferred_zone))
                return NULL;
 
+       current->flags |= PF_MEMALLOC;
        *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
-                                                               nodemask);
+                                               nodemask, sync_migration);
+       current->flags &= ~PF_MEMALLOC;
        if (*did_some_progress != COMPACT_SKIPPED) {
 
                /* Page migration frees to the PCP lists but we want merging */
@@ -1837,7 +1852,8 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-       int migratetype, unsigned long *did_some_progress)
+       int migratetype, unsigned long *did_some_progress,
+       bool sync_migration)
 {
        return NULL;
 }
@@ -1852,23 +1868,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 {
        struct page *page = NULL;
        struct reclaim_state reclaim_state;
-       struct task_struct *p = current;
        bool drained = false;
 
        cond_resched();
 
        /* We now go into synchronous reclaim */
        cpuset_memory_pressure_bump();
-       p->flags |= PF_MEMALLOC;
+       current->flags |= PF_MEMALLOC;
        lockdep_set_current_reclaim_state(gfp_mask);
        reclaim_state.reclaimed_slab = 0;
-       p->reclaim_state = &reclaim_state;
+       current->reclaim_state = &reclaim_state;
 
        *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
 
-       p->reclaim_state = NULL;
+       current->reclaim_state = NULL;
        lockdep_clear_current_reclaim_state();
-       p->flags &= ~PF_MEMALLOC;
+       current->flags &= ~PF_MEMALLOC;
 
        cond_resched();
 
@@ -1920,19 +1935,19 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 
 static inline
 void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
-                                               enum zone_type high_zoneidx)
+                                               enum zone_type high_zoneidx,
+                                               enum zone_type classzone_idx)
 {
        struct zoneref *z;
        struct zone *zone;
 
        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-               wakeup_kswapd(zone, order);
+               wakeup_kswapd(zone, order, classzone_idx);
 }
 
 static inline int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
-       struct task_struct *p = current;
        int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
        const gfp_t wait = gfp_mask & __GFP_WAIT;
 
@@ -1948,18 +1963,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
        alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
 
        if (!wait) {
-               alloc_flags |= ALLOC_HARDER;
+               /*
+                * Not worth trying to allocate harder for
+                * __GFP_NOMEMALLOC even if it can't schedule.
+                */
+               if (!(gfp_mask & __GFP_NOMEMALLOC))
+                       alloc_flags |= ALLOC_HARDER;
                /*
                 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
                 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
                 */
                alloc_flags &= ~ALLOC_CPUSET;
-       } else if (unlikely(rt_task(p)) && !in_interrupt())
+       } else if (unlikely(rt_task(current)) && !in_interrupt())
                alloc_flags |= ALLOC_HARDER;
 
        if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
                if (!in_interrupt() &&
-                   ((p->flags & PF_MEMALLOC) ||
+                   ((current->flags & PF_MEMALLOC) ||
                     unlikely(test_thread_flag(TIF_MEMDIE))))
                        alloc_flags |= ALLOC_NO_WATERMARKS;
        }
@@ -1978,7 +1998,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        int alloc_flags;
        unsigned long pages_reclaimed = 0;
        unsigned long did_some_progress;
-       struct task_struct *p = current;
+       bool sync_migration = false;
 
        /*
         * In the slowpath, we sanity check order to avoid ever trying to
@@ -2003,7 +2023,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                goto nopage;
 
 restart:
-       wake_all_kswapd(order, zonelist, high_zoneidx);
+       if (!(gfp_mask & __GFP_NO_KSWAPD))
+               wake_all_kswapd(order, zonelist, high_zoneidx,
+                                               zone_idx(preferred_zone));
 
        /*
         * OK, we're below the kswapd watermark and have kicked background
@@ -2034,21 +2056,26 @@ rebalance:
                goto nopage;
 
        /* Avoid recursion of direct reclaim */
-       if (p->flags & PF_MEMALLOC)
+       if (current->flags & PF_MEMALLOC)
                goto nopage;
 
        /* Avoid allocations with no watermarks from looping endlessly */
        if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
                goto nopage;
 
-       /* Try direct compaction */
+       /*
+        * Try direct compaction. The first pass is asynchronous. Subsequent
+        * attempts after direct reclaim are synchronous.
+        */
        page = __alloc_pages_direct_compact(gfp_mask, order,
                                        zonelist, high_zoneidx,
                                        nodemask,
                                        alloc_flags, preferred_zone,
-                                       migratetype, &did_some_progress);
+                                       migratetype, &did_some_progress,
+                                       sync_migration);
        if (page)
                goto got_pg;
+       sync_migration = true;
 
        /* Try direct reclaim and then allocating */
        page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2102,13 +2129,27 @@ rebalance:
                /* Wait for some write requests to complete then retry */
                wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
                goto rebalance;
+       } else {
+               /*
+                * High-order allocations do not necessarily loop after
+                * direct reclaim, and reclaim/compaction depends on compaction
+                * being called after reclaim, so call it directly if necessary.
+                */
+               page = __alloc_pages_direct_compact(gfp_mask, order,
+                                       zonelist, high_zoneidx,
+                                       nodemask,
+                                       alloc_flags, preferred_zone,
+                                       migratetype, &did_some_progress,
+                                       sync_migration);
+               if (page)
+                       goto got_pg;
        }
 
 nopage:
        if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
                printk(KERN_WARNING "%s: page allocation failure."
                        " order:%d, mode:0x%x\n",
-                       p->comm, order, gfp_mask);
+                       current->comm, order, gfp_mask);
                dump_stack();
                show_mem();
        }
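
The slow path now makes its first compaction attempt asynchronous and only switches to synchronous migration for the retries that follow direct reclaim (plus the final direct call in the branch above). A toy sketch of that ordering with a stand-in success predicate (names invented for illustration):

#include <stdbool.h>
#include <stdio.h>

/* stand-in for try_to_compact_pages(): pretend async passes never succeed */
static bool try_compact(bool sync)
{
	return sync;
}

int main(void)
{
	bool sync_migration = false;	/* first compaction pass is async */
	int attempt;

	for (attempt = 1; attempt <= 3; attempt++) {
		if (try_compact(sync_migration)) {
			printf("attempt %d: compaction succeeded (sync=%d)\n",
			       attempt, sync_migration);
			break;
		}
		printf("attempt %d: async pass failed, falling back\n", attempt);
		/* after the first try (and direct reclaim) retries are sync */
		sync_migration = true;
	}
	return 0;
}
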
@@ -2442,7 +2483,7 @@ void show_free_areas(void)
                        " all_unreclaimable? %s"
                        "\n",
                        zone->name,
-                       K(zone_nr_free_pages(zone)),
+                       K(zone_page_state(zone, NR_FREE_PAGES)),
                        K(min_wmark_pages(zone)),
                        K(low_wmark_pages(zone)),
                        K(high_wmark_pages(zone)),
@@ -2585,9 +2626,16 @@ static int __parse_numa_zonelist_order(char *s)
 
 static __init int setup_numa_zonelist_order(char *s)
 {
-       if (s)
-               return __parse_numa_zonelist_order(s);
-       return 0;
+       int ret;
+
+       if (!s)
+               return 0;
+
+       ret = __parse_numa_zonelist_order(s);
+       if (ret == 0)
+               strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
+
+       return ret;
 }
 early_param("numa_zonelist_order", setup_numa_zonelist_order);
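
The fix above keeps a copy of the accepted string in the static buffer instead of discarding it after validation. A small userspace analogue of that parse-then-copy pattern, with snprintf standing in for strlcpy (buffer size and all names are assumptions for illustration):

#include <stdio.h>
#include <string.h>

#define ORDER_LEN 16
static char zonelist_order[ORDER_LEN] = "default";

/* accept only the strings we understand; 0 on success */
static int parse_order(const char *s)
{
	if (!strcmp(s, "default") || !strcmp(s, "node") || !strcmp(s, "zone"))
		return 0;
	return -1;
}

static int setup_order(const char *s)
{
	int ret = parse_order(s);

	if (ret == 0)	/* only remember strings that parsed cleanly */
		snprintf(zonelist_order, sizeof(zonelist_order), "%s", s);
	return ret;
}

int main(void)
{
	setup_order("bogus");
	printf("%s\n", zonelist_order);	/* still "default" */
	setup_order("node");
	printf("%s\n", zonelist_order);	/* now "node" */
	return 0;
}
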
 
@@ -5517,7 +5565,6 @@ static struct trace_print_flags pageflag_names[] = {
        {1UL << PG_swapcache,           "swapcache"     },
        {1UL << PG_mappedtodisk,        "mappedtodisk"  },
        {1UL << PG_reclaim,             "reclaim"       },
-       {1UL << PG_buddy,               "buddy"         },
        {1UL << PG_swapbacked,          "swapbacked"    },
        {1UL << PG_unevictable,         "unevictable"   },
 #ifdef CONFIG_MMU
@@ -5565,7 +5612,7 @@ void dump_page(struct page *page)
 {
        printk(KERN_ALERT
               "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
-               page, page_count(page), page_mapcount(page),
+               page, atomic_read(&page->_count), page_mapcount(page),
                page->mapping, page->index);
        dump_page_flags(page->flags);
 }
index 38cc58b..7cfa6ae 100644 (file)
@@ -34,6 +34,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
+               split_huge_page_pmd(walk->mm, pmd);
                if (pmd_none_or_clear_bad(pmd)) {
                        if (walk->pte_hole)
                                err = walk->pte_hole(addr, next, walk);
index 7d9c1d0..ea53496 100644 (file)
@@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
                return NULL;
 
        vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
-                               pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL);
+                               pcpu_nr_groups, pcpu_atom_size);
        if (!vms) {
                pcpu_free_chunk(chunk);
                return NULL;
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
new file mode 100644 (file)
index 0000000..d030548
--- /dev/null
+++ b/mm/pgtable-generic.c
@@ -0,0 +1,123 @@
+/*
+ *  mm/pgtable-generic.c
+ *
+ *  Generic pgtable methods declared in asm-generic/pgtable.h
+ *
+ *  Copyright (C) 2010  Linus Torvalds
+ */
+
+#include <asm/tlb.h>
+#include <asm-generic/pgtable.h>
+
+#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+/*
+ * Only sets the access flags (dirty, accessed, and
+ * writable). Furthermore, we know it always gets set to a "more
+ * permissive" setting, which allows most architectures to optimize
+ * this. We return whether the PTE actually changed, which in turn
+ * instructs the caller to do things like update_mmu_cache().  This
+ * used to be done in the caller, but sparc needs minor faults to
+ * force that call on sun4c, so we changed this macro slightly.
+ */
+int ptep_set_access_flags(struct vm_area_struct *vma,
+                         unsigned long address, pte_t *ptep,
+                         pte_t entry, int dirty)
+{
+       int changed = !pte_same(*ptep, entry);
+       if (changed) {
+               set_pte_at(vma->vm_mm, address, ptep, entry);
+               flush_tlb_page(vma, address);
+       }
+       return changed;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+                         unsigned long address, pmd_t *pmdp,
+                         pmd_t entry, int dirty)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       int changed = !pmd_same(*pmdp, entry);
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+       if (changed) {
+               set_pmd_at(vma->vm_mm, address, pmdp, entry);
+               flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+       }
+       return changed;
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+       BUG();
+       return 0;
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+}
+#endif
+
+#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+                          unsigned long address, pte_t *ptep)
+{
+       int young;
+       young = ptep_test_and_clear_young(vma, address, ptep);
+       if (young)
+               flush_tlb_page(vma, address);
+       return young;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+                          unsigned long address, pmd_t *pmdp)
+{
+       int young;
+#ifndef CONFIG_TRANSPARENT_HUGEPAGE
+       BUG();
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+       young = pmdp_test_and_clear_young(vma, address, pmdp);
+       if (young)
+               flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+       return young;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
+pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
+                      pte_t *ptep)
+{
+       pte_t pte;
+       pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
+       flush_tlb_page(vma, address);
+       return pte;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
+pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
+                      pmd_t *pmdp)
+{
+       pmd_t pmd;
+#ifndef CONFIG_TRANSPARENT_HUGEPAGE
+       BUG();
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+       pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+       flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+       return pmd;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
+                          pmd_t *pmdp)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       pmd_t pmd = pmd_mksplitting(*pmdp);
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+       set_pmd_at(vma->vm_mm, address, pmdp, pmd);
+       /* tlb flush only to serialize against gup-fast */
+       flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+       BUG();
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+}
+#endif
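
Every helper in the new file is wrapped in an #ifndef __HAVE_ARCH_* guard, so an architecture that supplies its own version (and defines the guard in its headers) simply compiles out the generic fallback. A self-contained sketch of that override pattern, with invented names:

#include <stdio.h>

/*
 * An "arch" header that wants its own version would do:
 * #define __HAVE_ARCH_SQUARE
 * static int arch_square(int x) { ... }
 */

#ifndef __HAVE_ARCH_SQUARE
static int generic_square(int x)
{
	return x * x;	/* generic fallback, used when no arch override exists */
}
#define arch_square generic_square
#endif

int main(void)
{
	printf("%d\n", arch_square(7));
	return 0;
}
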
index c95d2ba..f21f4a1 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -177,6 +177,10 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
        list_add(&avc->same_vma, &vma->anon_vma_chain);
 
        anon_vma_lock(anon_vma);
+       /*
+        * It's critical to add new vmas to the tail of the anon_vma,
+        * see comment in huge_memory.c:__split_huge_page().
+        */
        list_add_tail(&avc->same_anon_vma, &anon_vma->head);
        anon_vma_unlock(anon_vma);
 }
@@ -360,7 +364,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
  * Returns virtual address or -EFAULT if page's index/offset is not
  * within the range mapped by the @vma.
  */
-static inline unsigned long
+inline unsigned long
 vma_address(struct page *page, struct vm_area_struct *vma)
 {
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -435,6 +439,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return NULL;
+       if (pmd_trans_huge(*pmd))
+               return NULL;
 
        pte = pte_offset_map(pmd, address);
        /* Make a quick check before getting the lock */
@@ -489,35 +495,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
                        unsigned long *vm_flags)
 {
        struct mm_struct *mm = vma->vm_mm;
-       pte_t *pte;
-       spinlock_t *ptl;
        int referenced = 0;
 
-       pte = page_check_address(page, mm, address, &ptl, 0);
-       if (!pte)
-               goto out;
-
        /*
         * Don't want to elevate referenced for mlocked page that gets this far,
         * in order that it progresses to try_to_unmap and is moved to the
         * unevictable list.
         */
        if (vma->vm_flags & VM_LOCKED) {
-               *mapcount = 1;  /* break early from loop */
+               *mapcount = 0;  /* break early from loop */
                *vm_flags |= VM_LOCKED;
-               goto out_unmap;
-       }
-
-       if (ptep_clear_flush_young_notify(vma, address, pte)) {
-               /*
-                * Don't treat a reference through a sequentially read
-                * mapping as such.  If the page has been used in
-                * another mapping, we will catch it; if this other
-                * mapping is already gone, the unmap path will have
-                * set PG_referenced or activated the page.
-                */
-               if (likely(!VM_SequentialReadHint(vma)))
-                       referenced++;
+               goto out;
        }
 
        /* Pretend the page is referenced if the task has the
@@ -526,9 +514,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
                        rwsem_is_locked(&mm->mmap_sem))
                referenced++;
 
-out_unmap:
+       if (unlikely(PageTransHuge(page))) {
+               pmd_t *pmd;
+
+               spin_lock(&mm->page_table_lock);
+               pmd = page_check_address_pmd(page, mm, address,
+                                            PAGE_CHECK_ADDRESS_PMD_FLAG);
+               if (pmd && !pmd_trans_splitting(*pmd) &&
+                   pmdp_clear_flush_young_notify(vma, address, pmd))
+                       referenced++;
+               spin_unlock(&mm->page_table_lock);
+       } else {
+               pte_t *pte;
+               spinlock_t *ptl;
+
+               pte = page_check_address(page, mm, address, &ptl, 0);
+               if (!pte)
+                       goto out;
+
+               if (ptep_clear_flush_young_notify(vma, address, pte)) {
+                       /*
+                        * Don't treat a reference through a sequentially read
+                        * mapping as such.  If the page has been used in
+                        * another mapping, we will catch it; if this other
+                        * mapping is already gone, the unmap path will have
+                        * set PG_referenced or activated the page.
+                        */
+                       if (likely(!VM_SequentialReadHint(vma)))
+                               referenced++;
+               }
+               pte_unmap_unlock(pte, ptl);
+       }
+
        (*mapcount)--;
-       pte_unmap_unlock(pte, ptl);
 
        if (referenced)
                *vm_flags |= vma->vm_flags;
@@ -864,8 +882,13 @@ void do_page_add_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address, int exclusive)
 {
        int first = atomic_inc_and_test(&page->_mapcount);
-       if (first)
-               __inc_zone_page_state(page, NR_ANON_PAGES);
+       if (first) {
+               if (!PageTransHuge(page))
+                       __inc_zone_page_state(page, NR_ANON_PAGES);
+               else
+                       __inc_zone_page_state(page,
+                                             NR_ANON_TRANSPARENT_HUGEPAGES);
+       }
        if (unlikely(PageKsm(page)))
                return;
 
@@ -893,7 +916,10 @@ void page_add_new_anon_rmap(struct page *page,
        VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
        SetPageSwapBacked(page);
        atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
-       __inc_zone_page_state(page, NR_ANON_PAGES);
+       if (!PageTransHuge(page))
+               __inc_zone_page_state(page, NR_ANON_PAGES);
+       else
+               __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
        __page_set_anon_rmap(page, vma, address, 1);
        if (page_evictable(page, vma))
                lru_cache_add_lru(page, LRU_ACTIVE_ANON);
@@ -911,7 +937,7 @@ void page_add_file_rmap(struct page *page)
 {
        if (atomic_inc_and_test(&page->_mapcount)) {
                __inc_zone_page_state(page, NR_FILE_MAPPED);
-               mem_cgroup_update_file_mapped(page, 1);
+               mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
        }
 }
 
@@ -946,10 +972,14 @@ void page_remove_rmap(struct page *page)
                return;
        if (PageAnon(page)) {
                mem_cgroup_uncharge_page(page);
-               __dec_zone_page_state(page, NR_ANON_PAGES);
+               if (!PageTransHuge(page))
+                       __dec_zone_page_state(page, NR_ANON_PAGES);
+               else
+                       __dec_zone_page_state(page,
+                                             NR_ANON_TRANSPARENT_HUGEPAGES);
        } else {
                __dec_zone_page_state(page, NR_FILE_MAPPED);
-               mem_cgroup_update_file_mapped(page, -1);
+               mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
        }
        /*
         * It would be tidy to reset the PageAnon mapping here,
@@ -1202,7 +1232,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
        return ret;
 }
 
-static bool is_vma_temporary_stack(struct vm_area_struct *vma)
+bool is_vma_temporary_stack(struct vm_area_struct *vma)
 {
        int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
 
@@ -1400,6 +1430,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
        int ret;
 
        BUG_ON(!PageLocked(page));
+       VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
 
        if (unlikely(PageKsm(page)))
                ret = try_to_unmap_ksm(page, flags);
index 008cd74..c7ef007 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3636,7 +3636,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
                len += sprintf(buf + len, "%7ld ", l->count);
 
                if (l->addr)
-                       len += sprint_symbol(buf + len, (unsigned long)l->addr);
+                       len += sprintf(buf + len, "%pS", (void *)l->addr);
                else
                        len += sprintf(buf + len, "<not-available>");
 
@@ -3946,12 +3946,9 @@ SLAB_ATTR(min_partial);
 
 static ssize_t ctor_show(struct kmem_cache *s, char *buf)
 {
-       if (s->ctor) {
-               int n = sprint_symbol(buf, (unsigned long)s->ctor);
-
-               return n + sprintf(buf + n, "\n");
-       }
-       return 0;
+       if (!s->ctor)
+               return 0;
+       return sprintf(buf, "%pS\n", s->ctor);
 }
 SLAB_ATTR_RO(ctor);
 
index 95ac219..9325020 100644 (file)
@@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
 static void free_map_bootmem(struct page *page, unsigned long nr_pages)
 {
        unsigned long maps_section_nr, removing_section_nr, i;
-       int magic;
+       unsigned long magic;
 
        for (i = 0; i < nr_pages; i++, page++) {
-               magic = atomic_read(&page->_mapcount);
+               magic = (unsigned long) page->lru.next;
 
                BUG_ON(magic == NODE_INFO);
 
index 3f48542..bbc1ce9 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -56,17 +56,97 @@ static void __page_cache_release(struct page *page)
                del_page_from_lru(zone, page);
                spin_unlock_irqrestore(&zone->lru_lock, flags);
        }
+}
+
+static void __put_single_page(struct page *page)
+{
+       __page_cache_release(page);
        free_hot_cold_page(page, 0);
 }
 
-static void put_compound_page(struct page *page)
+static void __put_compound_page(struct page *page)
 {
-       page = compound_head(page);
-       if (put_page_testzero(page)) {
-               compound_page_dtor *dtor;
+       compound_page_dtor *dtor;
 
-               dtor = get_compound_page_dtor(page);
-               (*dtor)(page);
+       __page_cache_release(page);
+       dtor = get_compound_page_dtor(page);
+       (*dtor)(page);
+}
+
+static void put_compound_page(struct page *page)
+{
+       if (unlikely(PageTail(page))) {
+               /* __split_huge_page_refcount can run under us */
+               struct page *page_head = page->first_page;
+               smp_rmb();
+               /*
+                * If PageTail is still set after smp_rmb() we can be sure
+                * that the page->first_page we read wasn't a dangling pointer.
+                * See __split_huge_page_refcount() smp_wmb().
+                */
+               if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
+                       unsigned long flags;
+                       /*
+                        * Verify that our page_head wasn't converted
+                        * to a regular page before we got a
+                        * reference on it.
+                        */
+                       if (unlikely(!PageHead(page_head))) {
+                               /* PageHead is cleared after PageTail */
+                               smp_rmb();
+                               VM_BUG_ON(PageTail(page));
+                               goto out_put_head;
+                       }
+                       /*
+                        * Only run compound_lock on a valid PageHead,
+                        * after having it pinned with
+                        * get_page_unless_zero() above.
+                        */
+                       smp_mb();
+                       /* page_head wasn't a dangling pointer */
+                       flags = compound_lock_irqsave(page_head);
+                       if (unlikely(!PageTail(page))) {
+                               /* __split_huge_page_refcount run before us */
+                               compound_unlock_irqrestore(page_head, flags);
+                               VM_BUG_ON(PageHead(page_head));
+                       out_put_head:
+                               if (put_page_testzero(page_head))
+                                       __put_single_page(page_head);
+                       out_put_single:
+                               if (put_page_testzero(page))
+                                       __put_single_page(page);
+                               return;
+                       }
+                       VM_BUG_ON(page_head != page->first_page);
+                       /*
+                        * We can release the refcount taken by
+                        * get_page_unless_zero now that
+                        * split_huge_page_refcount is blocked on the
+                        * compound_lock.
+                        */
+                       if (put_page_testzero(page_head))
+                               VM_BUG_ON(1);
+                       /* __split_huge_page_refcount will wait now */
+                       VM_BUG_ON(atomic_read(&page->_count) <= 0);
+                       atomic_dec(&page->_count);
+                       VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
+                       compound_unlock_irqrestore(page_head, flags);
+                       if (put_page_testzero(page_head)) {
+                               if (PageHead(page_head))
+                                       __put_compound_page(page_head);
+                               else
+                                       __put_single_page(page_head);
+                       }
+               } else {
+                       /* page_head is a dangling pointer */
+                       VM_BUG_ON(PageTail(page));
+                       goto out_put_single;
+               }
+       } else if (put_page_testzero(page)) {
+               if (PageHead(page))
+                       __put_compound_page(page);
+               else
+                       __put_single_page(page);
        }
 }
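
The tail-page path above hinges on get_page_unless_zero(): the head page stays usable only if its count can be raised before it reaches zero. A minimal C11-atomics model of that increment-unless-zero step (plain userspace atomics, not the kernel atomic API; names are illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool get_ref_unless_zero(atomic_int *count)
{
	int old = atomic_load(count);

	while (old != 0) {
		if (atomic_compare_exchange_weak(count, &old, old + 1))
			return true;	/* we now own a reference */
		/* old was reloaded by the failed CAS; retry */
	}
	return false;	/* object is already on its way to being freed */
}

int main(void)
{
	atomic_int live = 1, dying = 0;

	printf("%d %d\n", get_ref_unless_zero(&live), get_ref_unless_zero(&dying));
	return 0;
}
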
 
@@ -75,7 +155,7 @@ void put_page(struct page *page)
        if (unlikely(PageCompound(page)))
                put_compound_page(page);
        else if (put_page_testzero(page))
-               __page_cache_release(page);
+               __put_single_page(page);
 }
 EXPORT_SYMBOL(put_page);
 
@@ -98,15 +178,13 @@ void put_pages_list(struct list_head *pages)
 }
 EXPORT_SYMBOL(put_pages_list);
 
-/*
- * pagevec_move_tail() must be called with IRQ disabled.
- * Otherwise this may cause nasty races.
- */
-static void pagevec_move_tail(struct pagevec *pvec)
+static void pagevec_lru_move_fn(struct pagevec *pvec,
+                               void (*move_fn)(struct page *page, void *arg),
+                               void *arg)
 {
        int i;
-       int pgmoved = 0;
        struct zone *zone = NULL;
+       unsigned long flags = 0;
 
        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];
@@ -114,29 +192,49 @@ static void pagevec_move_tail(struct pagevec *pvec)
 
                if (pagezone != zone) {
                        if (zone)
-                               spin_unlock(&zone->lru_lock);
+                               spin_unlock_irqrestore(&zone->lru_lock, flags);
                        zone = pagezone;
-                       spin_lock(&zone->lru_lock);
-               }
-               if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
-                       int lru = page_lru_base_type(page);
-                       list_move_tail(&page->lru, &zone->lru[lru].list);
-                       pgmoved++;
+                       spin_lock_irqsave(&zone->lru_lock, flags);
                }
+
+               (*move_fn)(page, arg);
        }
        if (zone)
-               spin_unlock(&zone->lru_lock);
-       __count_vm_events(PGROTATED, pgmoved);
-       release_pages(pvec->pages, pvec->nr, pvec->cold);
+               spin_unlock_irqrestore(&zone->lru_lock, flags);
+       release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
        pagevec_reinit(pvec);
 }
 
+static void pagevec_move_tail_fn(struct page *page, void *arg)
+{
+       int *pgmoved = arg;
+       struct zone *zone = page_zone(page);
+
+       if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+               int lru = page_lru_base_type(page);
+               list_move_tail(&page->lru, &zone->lru[lru].list);
+               (*pgmoved)++;
+       }
+}
+
+/*
+ * pagevec_move_tail() must be called with IRQ disabled.
+ * Otherwise this may cause nasty races.
+ */
+static void pagevec_move_tail(struct pagevec *pvec)
+{
+       int pgmoved = 0;
+
+       pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
+       __count_vm_events(PGROTATED, pgmoved);
+}
+
 /*
  * Writeback is about to end against a page which has been marked for immediate
  * reclaim.  If it still appears to be reclaimable, move it to the tail of the
  * inactive list.
  */
-void  rotate_reclaimable_page(struct page *page)
+void rotate_reclaimable_page(struct page *page)
 {
        if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
            !PageUnevictable(page) && PageLRU(page)) {
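
pagevec_move_tail() and the later LRU-add path are rebuilt on top of pagevec_lru_move_fn(), which walks a batch once, re-takes the per-zone lock only when the zone changes, and delegates the per-page work to a callback. A userspace model of that shape, with the lock replaced by a switch counter and all names invented:

#include <stdio.h>

#define NR_ITEMS 6

struct item { int zone; int moved; };

static void batch_move(struct item *items, int n,
		       void (*move_fn)(struct item *, void *), void *arg)
{
	int locked_zone = -1;
	int lock_switches = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (items[i].zone != locked_zone) {
			/* drop the old zone lock, take the new one */
			locked_zone = items[i].zone;
			lock_switches++;
		}
		move_fn(&items[i], arg);
	}
	printf("lock switches: %d for %d items\n", lock_switches, n);
}

static void move_to_tail(struct item *it, void *arg)
{
	int *moved = arg;

	it->moved = 1;
	(*moved)++;
}

int main(void)
{
	struct item batch[NR_ITEMS] = {
		{0, 0}, {0, 0}, {1, 0}, {1, 0}, {1, 0}, {0, 0}
	};
	int moved = 0;

	batch_move(batch, NR_ITEMS, move_to_tail, &moved);
	printf("moved: %d\n", moved);
	return 0;
}
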
@@ -173,27 +271,94 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page,
 }
 
 /*
- * FIXME: speed this up?
+ * A page can reach the active list either via activate_page() or
+ * putback_lru_page(). In the activate_page() case the page does not yet have
+ * the active bit set; it may also not be on an LRU list because it was
+ * isolated before it got a chance to be moved to the active list (the window
+ * is small because a pagevec only holds a few pages). For such a page we do
+ * nothing. In the putback_lru_page() case the page is not on an LRU list but
+ * does have the active bit set.
  */
-void activate_page(struct page *page)
+static void __activate_page(struct page *page, void *arg)
 {
        struct zone *zone = page_zone(page);
+       int file = page_is_file_cache(page);
+       int lru = page_lru_base_type(page);
+       bool putback = !PageLRU(page);
 
-       spin_lock_irq(&zone->lru_lock);
-       if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
-               int file = page_is_file_cache(page);
-               int lru = page_lru_base_type(page);
+       /* The page is isolated before it's moved to active list */
+       if (!PageLRU(page) && !PageActive(page))
+               return;
+       if ((PageLRU(page) && PageActive(page)) || PageUnevictable(page))
+               return;
+
+       if (!putback)
                del_page_from_lru_list(zone, page, lru);
+       else
+               SetPageLRU(page);
 
-               SetPageActive(page);
-               lru += LRU_ACTIVE;
-               add_page_to_lru_list(zone, page, lru);
-               __count_vm_event(PGACTIVATE);
+       SetPageActive(page);
+       lru += LRU_ACTIVE;
+       add_page_to_lru_list(zone, page, lru);
 
-               update_page_reclaim_stat(zone, page, file, 1);
+       if (putback)
+               return;
+       __count_vm_event(PGACTIVATE);
+       update_page_reclaim_stat(zone, page, file, 1);
+}
+
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
+
+static void activate_page_drain(int cpu)
+{
+       struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
+
+       if (pagevec_count(pvec))
+               pagevec_lru_move_fn(pvec, __activate_page, NULL);
+}
+
+void activate_page(struct page *page)
+{
+       if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+               struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
+
+               page_cache_get(page);
+               if (!pagevec_add(pvec, page))
+                       pagevec_lru_move_fn(pvec, __activate_page, NULL);
+               put_cpu_var(activate_page_pvecs);
+       }
+}
+
+/* Caller should hold zone->lru_lock */
+int putback_active_lru_page(struct zone *zone, struct page *page)
+{
+       struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
+
+       if (!pagevec_add(pvec, page)) {
+               spin_unlock_irq(&zone->lru_lock);
+               pagevec_lru_move_fn(pvec, __activate_page, NULL);
+               spin_lock_irq(&zone->lru_lock);
        }
+       put_cpu_var(activate_page_pvecs);
+       return 1;
+}
+
+#else
+static inline void activate_page_drain(int cpu)
+{
+}
+
+void activate_page(struct page *page)
+{
+       struct zone *zone = page_zone(page);
+
+       spin_lock_irq(&zone->lru_lock);
+       if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page))
+               __activate_page(page, NULL);
        spin_unlock_irq(&zone->lru_lock);
 }
+#endif
 
 /*
  * Mark a page as having seen activity.
@@ -292,6 +457,7 @@ static void drain_cpu_pagevecs(int cpu)
                pagevec_move_tail(pvec);
                local_irq_restore(flags);
        }
+       activate_page_drain(cpu);
 }
 
 void lru_add_drain(void)
@@ -399,44 +565,70 @@ void __pagevec_release(struct pagevec *pvec)
 
 EXPORT_SYMBOL(__pagevec_release);
 
+/* used by __split_huge_page_refcount() */
+void lru_add_page_tail(struct zone *zone,
+                      struct page *page, struct page *page_tail)
+{
+       int active;
+       enum lru_list lru;
+       const int file = 0;
+       struct list_head *head;
+
+       VM_BUG_ON(!PageHead(page));
+       VM_BUG_ON(PageCompound(page_tail));
+       VM_BUG_ON(PageLRU(page_tail));
+       VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
+
+       SetPageLRU(page_tail);
+
+       if (page_evictable(page_tail, NULL)) {
+               if (PageActive(page)) {
+                       SetPageActive(page_tail);
+                       active = 1;
+                       lru = LRU_ACTIVE_ANON;
+               } else {
+                       active = 0;
+                       lru = LRU_INACTIVE_ANON;
+               }
+               update_page_reclaim_stat(zone, page_tail, file, active);
+               if (likely(PageLRU(page)))
+                       head = page->lru.prev;
+               else
+                       head = &zone->lru[lru].list;
+               __add_page_to_lru_list(zone, page_tail, lru, head);
+       } else {
+               SetPageUnevictable(page_tail);
+               add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE);
+       }
+}
+
+static void ____pagevec_lru_add_fn(struct page *page, void *arg)
+{
+       enum lru_list lru = (enum lru_list)arg;
+       struct zone *zone = page_zone(page);
+       int file = is_file_lru(lru);
+       int active = is_active_lru(lru);
+
+       VM_BUG_ON(PageActive(page));
+       VM_BUG_ON(PageUnevictable(page));
+       VM_BUG_ON(PageLRU(page));
+
+       SetPageLRU(page);
+       if (active)
+               SetPageActive(page);
+       update_page_reclaim_stat(zone, page, file, active);
+       add_page_to_lru_list(zone, page, lru);
+}
+
 /*
  * Add the passed pages to the LRU, then drop the caller's refcount
  * on them.  Reinitialises the caller's pagevec.
  */
 void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
 {
-       int i;
-       struct zone *zone = NULL;
-
        VM_BUG_ON(is_unevictable_lru(lru));
 
-       for (i = 0; i < pagevec_count(pvec); i++) {
-               struct page *page = pvec->pages[i];
-               struct zone *pagezone = page_zone(page);
-               int file;
-               int active;
-
-               if (pagezone != zone) {
-                       if (zone)
-                               spin_unlock_irq(&zone->lru_lock);
-                       zone = pagezone;
-                       spin_lock_irq(&zone->lru_lock);
-               }
-               VM_BUG_ON(PageActive(page));
-               VM_BUG_ON(PageUnevictable(page));
-               VM_BUG_ON(PageLRU(page));
-               SetPageLRU(page);
-               active = is_active_lru(lru);
-               file = is_file_lru(lru);
-               if (active)
-                       SetPageActive(page);
-               update_page_reclaim_stat(zone, page, file, active);
-               add_page_to_lru_list(zone, page, lru);
-       }
-       if (zone)
-               spin_unlock_irq(&zone->lru_lock);
-       release_pages(pvec->pages, pvec->nr, pvec->cold);
-       pagevec_reinit(pvec);
+       pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru);
 }
 
 EXPORT_SYMBOL(____pagevec_lru_add);
index e10f583..5c8cfab 100644 (file)
@@ -157,6 +157,12 @@ int add_to_swap(struct page *page)
        if (!entry.val)
                return 0;
 
+       if (unlikely(PageTransHuge(page)))
+               if (unlikely(split_huge_page(page))) {
+                       swapcache_free(entry, NULL);
+                       return 0;
+               }
+
        /*
         * Radix-tree node allocations from PF_MEMALLOC contexts could
         * completely exhaust the page allocator. __GFP_NOMEMALLOC
index b6adcfb..07a458d 100644 (file)
@@ -964,6 +964,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
+               if (unlikely(pmd_trans_huge(*pmd)))
+                       continue;
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
index eb5cc7d..cac13b4 100644 (file)
@@ -748,7 +748,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
        va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
                                        VMALLOC_START, VMALLOC_END,
                                        node, gfp_mask);
-       if (unlikely(IS_ERR(va))) {
+       if (IS_ERR(va)) {
                kfree(vb);
                return ERR_CAST(va);
        }
@@ -1315,13 +1315,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
                                                -1, GFP_KERNEL, caller);
 }
 
-struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
-                                  int node, gfp_t gfp_mask)
-{
-       return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
-                                 node, gfp_mask, __builtin_return_address(0));
-}
-
 static struct vm_struct *find_vm_area(const void *addr)
 {
        struct vmap_area *va;
@@ -1537,25 +1530,12 @@ fail:
        return NULL;
 }
 
-void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
-{
-       void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
-                                        __builtin_return_address(0));
-
-       /*
-        * A ref_count = 3 is needed because the vm_struct and vmap_area
-        * structures allocated in the __get_vm_area_node() function contain
-        * references to the virtual address of the vmalloc'ed block.
-        */
-       kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
-
-       return addr;
-}
-
 /**
- *     __vmalloc_node  -  allocate virtually contiguous memory
+ *     __vmalloc_node_range  -  allocate virtually contiguous memory
  *     @size:          allocation size
  *     @align:         desired alignment
+ *     @start:         vm area range start
+ *     @end:           vm area range end
  *     @gfp_mask:      flags for the page level allocator
  *     @prot:          protection mask for the allocated pages
  *     @node:          node to use for allocation or -1
@@ -1565,9 +1545,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
  *     allocator with @gfp_mask flags.  Map them into contiguous
  *     kernel virtual space, using a pagetable protection of @prot.
  */
-static void *__vmalloc_node(unsigned long size, unsigned long align,
-                           gfp_t gfp_mask, pgprot_t prot,
-                           int node, void *caller)
+void *__vmalloc_node_range(unsigned long size, unsigned long align,
+                       unsigned long start, unsigned long end, gfp_t gfp_mask,
+                       pgprot_t prot, int node, void *caller)
 {
        struct vm_struct *area;
        void *addr;
@@ -1577,8 +1557,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
        if (!size || (size >> PAGE_SHIFT) > totalram_pages)
                return NULL;
 
-       area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START,
-                                 VMALLOC_END, node, gfp_mask, caller);
+       area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node,
+                                 gfp_mask, caller);
 
        if (!area)
                return NULL;
@@ -1595,6 +1575,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
        return addr;
 }
 
+/**
+ *     __vmalloc_node  -  allocate virtually contiguous memory
+ *     @size:          allocation size
+ *     @align:         desired alignment
+ *     @gfp_mask:      flags for the page level allocator
+ *     @prot:          protection mask for the allocated pages
+ *     @node:          node to use for allocation or -1
+ *     @caller:        caller's return address
+ *
+ *     Allocate enough pages to cover @size from the page level
+ *     allocator with @gfp_mask flags.  Map them into contiguous
+ *     kernel virtual space, using a pagetable protection of @prot.
+ */
+static void *__vmalloc_node(unsigned long size, unsigned long align,
+                           gfp_t gfp_mask, pgprot_t prot,
+                           int node, void *caller)
+{
+       return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+                               gfp_mask, prot, node, caller);
+}
+
 void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 {
        return __vmalloc_node(size, 1, gfp_mask, prot, -1,
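
__vmalloc_node() is reduced to a thin wrapper that forwards the default VMALLOC_START..VMALLOC_END range to the new __vmalloc_node_range(). A toy bump allocator showing the same "general range entry point plus defaulting wrapper" split (purely illustrative, not the vmalloc API):

#include <stdio.h>

#define RANGE_START 0x1000UL
#define RANGE_END   0x9000UL

static unsigned long alloc_range(unsigned long *cursor, unsigned long size,
				 unsigned long start, unsigned long end)
{
	unsigned long addr;

	if (*cursor < start)
		*cursor = start;
	if (*cursor + size > end)
		return 0;	/* requested range exhausted */
	addr = *cursor;
	*cursor += size;
	return addr;
}

/* old narrow helper: just forwards the default range */
static unsigned long alloc_default(unsigned long *cursor, unsigned long size)
{
	return alloc_range(cursor, size, RANGE_START, RANGE_END);
}

int main(void)
{
	unsigned long cursor = 0;

	printf("%#lx\n", alloc_default(&cursor, 0x2000));                /* 0x1000 */
	printf("%#lx\n", alloc_range(&cursor, 0x1000, 0x4000, 0x6000));  /* 0x4000 */
	return 0;
}
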
@@ -2203,17 +2204,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
  * @sizes: array containing size of each area
  * @nr_vms: the number of areas to allocate
  * @align: alignment, all entries in @offsets and @sizes must be aligned to this
- * @gfp_mask: allocation mask
  *
  * Returns: kmalloc'd vm_struct pointer array pointing to allocated
  *         vm_structs on success, %NULL on failure
  *
  * Percpu allocator wants to use congruent vm areas so that it can
  * maintain the offsets among percpu areas.  This function allocates
- * congruent vmalloc areas for it.  These areas tend to be scattered
- * pretty far, distance between two areas easily going up to
- * gigabytes.  To avoid interacting with regular vmallocs, these areas
- * are allocated from top.
+ * congruent vmalloc areas for it with GFP_KERNEL.  These areas tend to
+ * be scattered pretty far, distance between two areas easily going up
+ * to gigabytes.  To avoid interacting with regular vmallocs, these
+ * areas are allocated from top.
  *
  * Despite its complicated look, this allocator is rather simple.  It
  * does everything top-down and scans areas from the end looking for
@@ -2224,7 +2224,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
  */
 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
                                     const size_t *sizes, int nr_vms,
-                                    size_t align, gfp_t gfp_mask)
+                                    size_t align)
 {
        const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
        const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
@@ -2234,8 +2234,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
        unsigned long base, start, end, last_end;
        bool purged = false;
 
-       gfp_mask &= GFP_RECLAIM_MASK;
-
        /* verify parameters and allocate data structures */
        BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
        for (last_area = 0, area = 0; area < nr_vms; area++) {
@@ -2268,14 +2266,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
                return NULL;
        }
 
-       vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask);
-       vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask);
+       vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
+       vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
        if (!vas || !vms)
                goto err_free;
 
        for (area = 0; area < nr_vms; area++) {
-               vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
-               vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
+               vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
+               vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
                if (!vas[area] || !vms[area])
                        goto err_free;
        }
@@ -2456,13 +2454,8 @@ static int s_show(struct seq_file *m, void *p)
        seq_printf(m, "0x%p-0x%p %7ld",
                v->addr, v->addr + v->size, v->size);
 
-       if (v->caller) {
-               char buff[KSYM_SYMBOL_LEN];
-
-               seq_putc(m, ' ');
-               sprint_symbol(buff, (unsigned long)v->caller);
-               seq_puts(m, buff);
-       }
+       if (v->caller)
+               seq_printf(m, " %pS", v->caller);
 
        if (v->nr_pages)
                seq_printf(m, " pages=%d", v->nr_pages);
index 9ca587c..99999a9 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/topology.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/compaction.h>
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
 #include <linux/delay.h>
@@ -40,6 +41,7 @@
 #include <linux/memcontrol.h>
 #include <linux/delayacct.h>
 #include <linux/sysctl.h>
+#include <linux/compaction.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/vmscan.h>
 
-enum lumpy_mode {
-       LUMPY_MODE_NONE,
-       LUMPY_MODE_ASYNC,
-       LUMPY_MODE_SYNC,
-};
+/*
+ * reclaim_mode determines how the inactive list is shrunk
+ * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
+ * RECLAIM_MODE_ASYNC:  Do not block
+ * RECLAIM_MODE_SYNC:   Allow blocking e.g. call wait_on_page_writeback
+ * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
+ *                     page from the LRU and reclaim all pages within a
+ *                     naturally aligned range
+ * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
+ *                     order-0 pages and then compact the zone
+ */
+typedef unsigned __bitwise__ reclaim_mode_t;
+#define RECLAIM_MODE_SINGLE            ((__force reclaim_mode_t)0x01u)
+#define RECLAIM_MODE_ASYNC             ((__force reclaim_mode_t)0x02u)
+#define RECLAIM_MODE_SYNC              ((__force reclaim_mode_t)0x04u)
+#define RECLAIM_MODE_LUMPYRECLAIM      ((__force reclaim_mode_t)0x08u)
+#define RECLAIM_MODE_COMPACTION                ((__force reclaim_mode_t)0x10u)
 
 struct scan_control {
        /* Incremented by the number of inactive pages that were scanned */
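
The three-state lumpy_mode enum becomes a set of orthogonal reclaim_mode_t bits, so sync/async can be combined independently with lumpy reclaim or compaction. A small sketch of the enum-to-bitmask shift (userspace, invented names):

#include <stdio.h>

typedef unsigned int reclaim_bits;

#define MODE_SINGLE	0x01u
#define MODE_ASYNC	0x02u
#define MODE_SYNC	0x04u
#define MODE_LUMPY	0x08u
#define MODE_COMPACT	0x10u

int main(void)
{
	/* high-order reclaim: compaction plus synchronous migration */
	reclaim_bits mode = MODE_COMPACT | MODE_SYNC;

	if (mode & MODE_SYNC)
		printf("may block waiting for writeback\n");
	if (!(mode & MODE_LUMPY))
		printf("not using lumpy reclaim\n");

	/* reset to the default, as reset_reclaim_mode() does above */
	mode = MODE_SINGLE | MODE_ASYNC;
	printf("mode = %#x\n", mode);
	return 0;
}
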
@@ -88,7 +102,7 @@ struct scan_control {
         * Intend to reclaim enough continuous memory rather than reclaim
         * enough amount of memory. i.e, mode for high order allocation.
         */
-       enum lumpy_mode lumpy_reclaim_mode;
+       reclaim_mode_t reclaim_mode;
 
        /* Which cgroup do we reclaim from */
        struct mem_cgroup *mem_cgroup;
@@ -271,34 +285,37 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
        return ret;
 }
 
-static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc,
+static void set_reclaim_mode(int priority, struct scan_control *sc,
                                   bool sync)
 {
-       enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;
+       reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
 
        /*
-        * Some reclaim have alredy been failed. No worth to try synchronous
-        * lumpy reclaim.
+        * Initially assume we are entering either lumpy reclaim or
+        * reclaim/compaction. Depending on the order, we will either set the
+        * sync mode or just reclaim order-0 pages later.
         */
-       if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
-               return;
+       if (COMPACTION_BUILD)
+               sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
+       else
+               sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
 
        /*
-        * If we need a large contiguous chunk of memory, or have
-        * trouble getting a small set of contiguous pages, we
-        * will reclaim both active and inactive pages.
+        * Avoid using lumpy reclaim or reclaim/compaction if possible by
+        * restricting it to either costly allocations or to cases where the
+        * system is under memory pressure.
         */
        if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
-               sc->lumpy_reclaim_mode = mode;
+               sc->reclaim_mode |= syncmode;
        else if (sc->order && priority < DEF_PRIORITY - 2)
-               sc->lumpy_reclaim_mode = mode;
+               sc->reclaim_mode |= syncmode;
        else
-               sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
+               sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
 }
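As a worked example of the mode words set_reclaim_mode() produces, here is a minimal user-space sketch; the sparse-only __bitwise__/__force annotations are dropped, and the usual kernel values PAGE_ALLOC_COSTLY_ORDER=3, DEF_PRIORITY=12 and CONFIG_COMPACTION=y are assumed:

#include <stdio.h>

/* user-space mirror of the flags above; sparse annotations dropped */
#define RECLAIM_MODE_SINGLE		0x01u
#define RECLAIM_MODE_ASYNC		0x02u
#define RECLAIM_MODE_SYNC		0x04u
#define RECLAIM_MODE_LUMPYRECLAIM	0x08u
#define RECLAIM_MODE_COMPACTION		0x10u

#define PAGE_ALLOC_COSTLY_ORDER	3	/* kernel default */
#define DEF_PRIORITY		12
#define COMPACTION_BUILD	1	/* assume CONFIG_COMPACTION=y */

static unsigned int reclaim_mode(int priority, int order, int sync)
{
	unsigned int mode = COMPACTION_BUILD ? RECLAIM_MODE_COMPACTION :
					       RECLAIM_MODE_LUMPYRECLAIM;
	unsigned int syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;

	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return mode | syncmode;
	if (order && priority < DEF_PRIORITY - 2)
		return mode | syncmode;
	return RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
}

int main(void)
{
	/* order-9 THP request on the stall path (sync): compaction + sync */
	printf("order-9 sync:  0x%02x\n", reclaim_mode(DEF_PRIORITY, 9, 1));
	/* order-0 background scan: single + async, never lumpy/compaction */
	printf("order-0 async: 0x%02x\n", reclaim_mode(DEF_PRIORITY, 0, 0));
	return 0;
}

So an order-9 request enters compaction mode (0x12 async, 0x14 once the sync stall path is taken), while order-0 scans stay in RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC (0x03).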
 
-static void disable_lumpy_reclaim_mode(struct scan_control *sc)
+static void reset_reclaim_mode(struct scan_control *sc)
 {
-       sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
+       sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
 }
 
 static inline int is_page_cache_freeable(struct page *page)
@@ -429,7 +446,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
                 * first attempt to free a range of pages fails.
                 */
                if (PageWriteback(page) &&
-                   sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC)
+                   (sc->reclaim_mode & RECLAIM_MODE_SYNC))
                        wait_on_page_writeback(page);
 
                if (!PageWriteback(page)) {
@@ -437,7 +454,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
                        ClearPageReclaim(page);
                }
                trace_mm_vmscan_writepage(page,
-                       trace_reclaim_flags(page, sc->lumpy_reclaim_mode));
+                       trace_reclaim_flags(page, sc->reclaim_mode));
                inc_zone_page_state(page, NR_VMSCAN_WRITE);
                return PAGE_SUCCESS;
        }
@@ -622,7 +639,7 @@ static enum page_references page_check_references(struct page *page,
        referenced_page = TestClearPageReferenced(page);
 
        /* Lumpy reclaim - ignore references */
-       if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE)
+       if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
                return PAGEREF_RECLAIM;
 
        /*
@@ -739,7 +756,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         * for any page for which writeback has already
                         * started.
                         */
-                       if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC &&
+                       if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
                            may_enter_fs)
                                wait_on_page_writeback(page);
                        else {
@@ -895,7 +912,7 @@ cull_mlocked:
                        try_to_free_swap(page);
                unlock_page(page);
                putback_lru_page(page);
-               disable_lumpy_reclaim_mode(sc);
+               reset_reclaim_mode(sc);
                continue;
 
 activate_locked:
@@ -908,7 +925,7 @@ activate_locked:
 keep_locked:
                unlock_page(page);
 keep:
-               disable_lumpy_reclaim_mode(sc);
+               reset_reclaim_mode(sc);
 keep_lumpy:
                list_add(&page->lru, &ret_pages);
                VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
@@ -1028,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                case 0:
                        list_move(&page->lru, dst);
                        mem_cgroup_del_lru(page);
-                       nr_taken++;
+                       nr_taken += hpage_nr_pages(page);
                        break;
 
                case -EBUSY:
@@ -1086,7 +1103,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                        if (__isolate_lru_page(cursor_page, mode, file) == 0) {
                                list_move(&cursor_page->lru, dst);
                                mem_cgroup_del_lru(cursor_page);
-                               nr_taken++;
+                               nr_taken += hpage_nr_pages(cursor_page);
                                nr_lumpy_taken++;
                                if (PageDirty(cursor_page))
                                        nr_lumpy_dirty++;
@@ -1141,14 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list,
        struct page *page;
 
        list_for_each_entry(page, page_list, lru) {
+               int numpages = hpage_nr_pages(page);
                lru = page_lru_base_type(page);
                if (PageActive(page)) {
                        lru += LRU_ACTIVE;
                        ClearPageActive(page);
-                       nr_active++;
+                       nr_active += numpages;
                }
                if (count)
-                       count[lru]++;
+                       count[lru] += numpages;
        }
 
        return nr_active;
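The hpage_nr_pages() conversions above switch the LRU accounting from "number of struct pages isolated" to "number of base pages isolated". A minimal user-space sketch, assuming x86 with 2MB transparent hugepages (HPAGE_PMD_NR = 512) and a stand-in for struct page:

#include <stdio.h>

#define HPAGE_PMD_NR	(1 << 9)	/* 512 base pages per 2MB THP on x86 */

/* stand-in for struct page: only tracks whether it is a transparent hugepage */
struct fake_page {
	int trans_huge;
};

static int hpage_nr_pages(const struct fake_page *page)
{
	return page->trans_huge ? HPAGE_PMD_NR : 1;
}

int main(void)
{
	struct fake_page thp = { 1 }, base = { 0 };
	unsigned long nr_taken = 0;

	/* isolating one THP and one base page accounts for 513 base pages */
	nr_taken += hpage_nr_pages(&thp);
	nr_taken += hpage_nr_pages(&base);
	printf("nr_taken = %lu\n", nr_taken);
	return 0;
}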
@@ -1253,13 +1271,16 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
                        spin_lock_irq(&zone->lru_lock);
                        continue;
                }
-               SetPageLRU(page);
                lru = page_lru(page);
-               add_page_to_lru_list(zone, page, lru);
                if (is_active_lru(lru)) {
                        int file = is_file_lru(lru);
-                       reclaim_stat->recent_rotated[file]++;
+                       int numpages = hpage_nr_pages(page);
+                       reclaim_stat->recent_rotated[file] += numpages;
+                       if (putback_active_lru_page(zone, page))
+                               continue;
                }
+               SetPageLRU(page);
+               add_page_to_lru_list(zone, page, lru);
                if (!pagevec_add(&pvec, page)) {
                        spin_unlock_irq(&zone->lru_lock);
                        __pagevec_release(&pvec);
@@ -1324,7 +1345,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
                return false;
 
        /* Only stall on lumpy reclaim */
-       if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
+       if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
                return false;
 
        /* If we have reclaimed everything on the isolated list, no stall */
@@ -1368,15 +1389,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
                        return SWAP_CLUSTER_MAX;
        }
 
-       set_lumpy_reclaim_mode(priority, sc, false);
+       set_reclaim_mode(priority, sc, false);
        lru_add_drain();
        spin_lock_irq(&zone->lru_lock);
 
        if (scanning_global_lru(sc)) {
                nr_taken = isolate_pages_global(nr_to_scan,
                        &page_list, &nr_scanned, sc->order,
-                       sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
-                                       ISOLATE_INACTIVE : ISOLATE_BOTH,
+                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
+                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
                        zone, 0, file);
                zone->pages_scanned += nr_scanned;
                if (current_is_kswapd())
@@ -1388,8 +1409,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
        } else {
                nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
                        &page_list, &nr_scanned, sc->order,
-                       sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
-                                       ISOLATE_INACTIVE : ISOLATE_BOTH,
+                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
+                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
                        zone, sc->mem_cgroup,
                        0, file);
                /*
@@ -1411,7 +1432,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 
        /* Check if we should synchronously wait for writeback */
        if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
-               set_lumpy_reclaim_mode(priority, sc, true);
+               set_reclaim_mode(priority, sc, true);
                nr_reclaimed += shrink_page_list(&page_list, zone, sc);
        }
 
@@ -1426,7 +1447,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
                zone_idx(zone),
                nr_scanned, nr_reclaimed,
                priority,
-               trace_shrink_flags(file, sc->lumpy_reclaim_mode));
+               trace_shrink_flags(file, sc->reclaim_mode));
        return nr_reclaimed;
 }
 
@@ -1466,7 +1487,7 @@ static void move_active_pages_to_lru(struct zone *zone,
 
                list_move(&page->lru, &zone->lru[lru].list);
                mem_cgroup_add_lru_list(page, lru);
-               pgmoved++;
+               pgmoved += hpage_nr_pages(page);
 
                if (!pagevec_add(&pvec, page) || list_empty(list)) {
                        spin_unlock_irq(&zone->lru_lock);
@@ -1534,7 +1555,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                }
 
                if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
-                       nr_rotated++;
+                       nr_rotated += hpage_nr_pages(page);
                        /*
                         * Identify referenced, file-backed active pages and
                         * give them one more trip around the active list. So
@@ -1805,6 +1826,57 @@ out:
 }
 
 /*
+ * Reclaim/compaction depends on a number of pages being freed. To avoid
+ * disruption to the system, a small number of order-0 pages continue to be
+ * rotated and reclaimed in the normal fashion. However, by the time we get
+ * back to the allocator and call try_to_compact_zone(), we ensure that
+ * there are enough free pages for it to be likely successful
+ */
+static inline bool should_continue_reclaim(struct zone *zone,
+                                       unsigned long nr_reclaimed,
+                                       unsigned long nr_scanned,
+                                       struct scan_control *sc)
+{
+       unsigned long pages_for_compaction;
+       unsigned long inactive_lru_pages;
+
+       /* If not in reclaim/compaction mode, stop */
+       if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
+               return false;
+
+       /*
+        * If we failed to reclaim and have scanned the full list, stop.
+        * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
+        *       faster but obviously would be less likely to satisfy the
+        *       allocation. If this is desirable, use GFP_REPEAT to decide
+        *       if both reclaimed and scanned should be checked or just
+        *       reclaimed
+        */
+       if (!nr_reclaimed && !nr_scanned)
+               return false;
+
+       /*
+        * If we have not reclaimed enough pages for compaction and the
+        * inactive lists are large enough, continue reclaiming
+        */
+       pages_for_compaction = (2UL << sc->order);
+       inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
+                               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+       if (sc->nr_reclaimed < pages_for_compaction &&
+                       inactive_lru_pages > pages_for_compaction)
+               return true;
+
+       /* If compaction would go ahead or the allocation would succeed, stop */
+       switch (compaction_suitable(zone, sc->order)) {
+       case COMPACT_PARTIAL:
+       case COMPACT_CONTINUE:
+               return false;
+       default:
+               return true;
+       }
+}
+
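For a sense of scale, a short sketch of the pages_for_compaction threshold used above, assuming an order-9 (2MB) request and 4KB base pages:

#include <stdio.h>

int main(void)
{
	int order = 9;				/* a 2MB THP request */
	unsigned long pages_for_compaction = 2UL << order;

	/* reclaim keeps going until this many order-0 pages have been freed */
	printf("order %d: %lu pages (%lu KB with 4KB pages)\n",
	       order, pages_for_compaction, pages_for_compaction * 4);
	return 0;
}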
+/*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
 static void shrink_zone(int priority, struct zone *zone,
@@ -1813,9 +1885,12 @@ static void shrink_zone(int priority, struct zone *zone,
        unsigned long nr[NR_LRU_LISTS];
        unsigned long nr_to_scan;
        enum lru_list l;
-       unsigned long nr_reclaimed = sc->nr_reclaimed;
+       unsigned long nr_reclaimed;
        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+       unsigned long nr_scanned = sc->nr_scanned;
 
+restart:
+       nr_reclaimed = 0;
        get_scan_count(zone, sc, nr, priority);
 
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1841,8 +1916,7 @@ static void shrink_zone(int priority, struct zone *zone,
                if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
                        break;
        }
-
-       sc->nr_reclaimed = nr_reclaimed;
+       sc->nr_reclaimed += nr_reclaimed;
 
        /*
         * Even if we did not try to evict anon pages at all, we want to
@@ -1851,6 +1925,11 @@ static void shrink_zone(int priority, struct zone *zone,
        if (inactive_anon_is_low(zone, sc))
                shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
 
+       /* reclaim/compaction might need reclaim to continue */
+       if (should_continue_reclaim(zone, nr_reclaimed,
+                                       sc->nr_scanned - nr_scanned, sc))
+               goto restart;
+
        throttle_vm_writeout(sc->gfp_mask);
 }
 
@@ -2124,38 +2203,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 }
 #endif
 
+/*
+ * pgdat_balanced is used when checking if a node is balanced for high-order
+ * allocations. Only zones that meet watermarks and are in a zone allowed
+ * by the caller's classzone_idx are added to balanced_pages. The total of
+ * balanced pages must be at least 25% of the zones allowed by classzone_idx
+ * for the node to be considered balanced. Forcing all zones to be balanced
+ * for high orders can cause excessive reclaim when there are imbalanced zones.
+ * The choice of 25% is due to
+ *   o a 16M DMA zone that is balanced will not balance a node on any
+ *     reasonably sized machine
+ *   o On all other machines, the top zone must be at least a reasonable
+ *     percentage of the middle zones. For example, on 32-bit x86, highmem
+ *     would need to be at least 256M for it to balance a whole node.
+ *     Similarly, on x86-64 the Normal zone would need to be at least 1G
+ *     to balance a node on its own. These seemed like reasonable ratios.
+ */
+static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
+                                               int classzone_idx)
+{
+       unsigned long present_pages = 0;
+       int i;
+
+       for (i = 0; i <= classzone_idx; i++)
+               present_pages += pgdat->node_zones[i].present_pages;
+
+       return balanced_pages > (present_pages >> 2);
+}
+
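A small user-space sketch of the 25% rule, using illustrative zone sizes for a 32-bit style node (16MB DMA, 880MB Normal, 3GB HighMem, in 4KB pages) where only HighMem met its watermark; the node still counts as balanced for a high-order request:

#include <stdio.h>

int main(void)
{
	/* illustrative 32-bit style node, zone sizes in 4KB pages:
	 * DMA ~16MB, Normal ~880MB, HighMem ~3GB */
	unsigned long present[] = { 4096, 225280, 786432 };
	int classzone_idx = 2;			/* caller can use HighMem */
	unsigned long present_pages = 0;
	unsigned long balanced = present[2];	/* only HighMem met its watermark */
	int i;

	for (i = 0; i <= classzone_idx; i++)
		present_pages += present[i];

	printf("balanced %lu of %lu pages -> node %s\n", balanced, present_pages,
	       balanced > (present_pages >> 2) ? "balanced" : "not balanced");
	return 0;
}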
 /* is kswapd sleeping prematurely? */
-static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
+static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+                                       int classzone_idx)
 {
        int i;
+       unsigned long balanced = 0;
+       bool all_zones_ok = true;
 
        /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
        if (remaining)
-               return 1;
+               return true;
 
-       /* If after HZ/10, a zone is below the high mark, it's premature */
+       /* Check the watermark levels */
        for (i = 0; i < pgdat->nr_zones; i++) {
                struct zone *zone = pgdat->node_zones + i;
 
                if (!populated_zone(zone))
                        continue;
 
-               if (zone->all_unreclaimable)
+               /*
+                * balance_pgdat() skips over all_unreclaimable after
+                * DEF_PRIORITY. Effectively, it considers them balanced so
+                * they must be considered balanced here as well if kswapd
+                * is to sleep
+                */
+               if (zone->all_unreclaimable) {
+                       balanced += zone->present_pages;
                        continue;
+               }
 
-               if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
-                                                               0, 0))
-                       return 1;
+               if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
+                                                       classzone_idx, 0))
+                       all_zones_ok = false;
+               else
+                       balanced += zone->present_pages;
        }
 
-       return 0;
+       /*
+        * For high-order requests, the balanced zones must contain at least
+        * 25% of the node's pages for kswapd to sleep. For order-0, all zones
+        * must be balanced
+        */
+       if (order)
+               return pgdat_balanced(pgdat, balanced, classzone_idx);
+       else
+               return !all_zones_ok;
 }
 
 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at high_wmark_pages(zone).
  *
- * Returns the number of pages which were actually freed.
+ * Returns the final order kswapd was reclaiming at
  *
  * There is special handling here for zones which are full of pinned pages.
  * This can happen if the pages are all mlocked, or if they are all used by
@@ -2172,11 +2300,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
  * interoperates with the page allocator fallback scheme to ensure that aging
  * of pages is balanced across the zones.
  */
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
+                                                       int *classzone_idx)
 {
        int all_zones_ok;
+       unsigned long balanced;
        int priority;
        int i;
+       int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
        unsigned long total_scanned;
        struct reclaim_state *reclaim_state = current->reclaim_state;
        struct scan_control sc = {
@@ -2199,7 +2330,6 @@ loop_again:
        count_vm_event(PAGEOUTRUN);
 
        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
-               int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
                unsigned long lru_pages = 0;
                int has_under_min_watermark_zone = 0;
 
@@ -2208,6 +2338,7 @@ loop_again:
                        disable_swap_token();
 
                all_zones_ok = 1;
+               balanced = 0;
 
                /*
                 * Scan in the highmem->dma direction for the highest
@@ -2230,9 +2361,10 @@ loop_again:
                                shrink_active_list(SWAP_CLUSTER_MAX, zone,
                                                        &sc, priority, 0);
 
-                       if (!zone_watermark_ok(zone, order,
+                       if (!zone_watermark_ok_safe(zone, order,
                                        high_wmark_pages(zone), 0, 0)) {
                                end_zone = i;
+                               *classzone_idx = i;
                                break;
                        }
                }
@@ -2255,6 +2387,7 @@ loop_again:
                 * cause too much scanning of the lower zones.
                 */
                for (i = 0; i <= end_zone; i++) {
+                       int compaction;
                        struct zone *zone = pgdat->node_zones + i;
                        int nr_slab;
 
@@ -2276,7 +2409,7 @@ loop_again:
                         * We put equal pressure on every zone, unless one
                         * zone has way too many pages free already.
                         */
-                       if (!zone_watermark_ok(zone, order,
+                       if (!zone_watermark_ok_safe(zone, order,
                                        8*high_wmark_pages(zone), end_zone, 0))
                                shrink_zone(priority, zone, &sc);
                        reclaim_state->reclaimed_slab = 0;
@@ -2284,9 +2417,26 @@ loop_again:
                                                lru_pages);
                        sc.nr_reclaimed += reclaim_state->reclaimed_slab;
                        total_scanned += sc.nr_scanned;
+
+                       compaction = 0;
+                       if (order &&
+                           zone_watermark_ok(zone, 0,
+                                              high_wmark_pages(zone),
+                                             end_zone, 0) &&
+                           !zone_watermark_ok(zone, order,
+                                              high_wmark_pages(zone),
+                                              end_zone, 0)) {
+                               compact_zone_order(zone,
+                                                  order,
+                                                  sc.gfp_mask, false,
+                                                  COMPACT_MODE_KSWAPD);
+                               compaction = 1;
+                       }
+
                        if (zone->all_unreclaimable)
                                continue;
-                       if (nr_slab == 0 && !zone_reclaimable(zone))
+                       if (!compaction && nr_slab == 0 &&
+                           !zone_reclaimable(zone))
                                zone->all_unreclaimable = 1;
                        /*
                         * If we've done a decent amount of scanning and
@@ -2297,7 +2447,7 @@ loop_again:
                            total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
                                sc.may_writepage = 1;
 
-                       if (!zone_watermark_ok(zone, order,
+                       if (!zone_watermark_ok_safe(zone, order,
                                        high_wmark_pages(zone), end_zone, 0)) {
                                all_zones_ok = 0;
                                /*
@@ -2305,7 +2455,7 @@ loop_again:
                                 * means that we have a GFP_ATOMIC allocation
                                 * failure risk. Hurry up!
                                 */
-                               if (!zone_watermark_ok(zone, order,
+                               if (!zone_watermark_ok_safe(zone, order,
                                            min_wmark_pages(zone), end_zone, 0))
                                        has_under_min_watermark_zone = 1;
                        } else {
@@ -2317,10 +2467,12 @@ loop_again:
                                 * speculatively avoid congestion waits
                                 */
                                zone_clear_flag(zone, ZONE_CONGESTED);
+                               if (i <= *classzone_idx)
+                                       balanced += zone->present_pages;
                        }
 
                }
-               if (all_zones_ok)
+               if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
                        break;          /* kswapd: all done */
                /*
                 * OK, kswapd is getting into trouble.  Take a nap, then take
@@ -2343,7 +2495,13 @@ loop_again:
                        break;
        }
 out:
-       if (!all_zones_ok) {
+
+       /*
+        * order-0: All zones must meet high watermark for a balanced node
+        * high-order: Balanced zones must make up at least 25% of the node
+        *             for the node to be balanced
+        */
+       if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
                cond_resched();
 
                try_to_freeze();
@@ -2368,7 +2526,88 @@ out:
                goto loop_again;
        }
 
-       return sc.nr_reclaimed;
+       /*
+        * If kswapd was reclaiming at a higher order, it has the option of
+        * sleeping without all zones being balanced. Before it does, it must
+        * ensure that the watermarks for order-0 on *all* zones are met and
+        * that the congestion flags are cleared. The congestion flag must
+        * be cleared as kswapd is the only mechanism that clears the flag
+        * and it is potentially going to sleep here.
+        */
+       if (order) {
+               for (i = 0; i <= end_zone; i++) {
+                       struct zone *zone = pgdat->node_zones + i;
+
+                       if (!populated_zone(zone))
+                               continue;
+
+                       if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+                               continue;
+
+                       /* Confirm the zone is balanced for order-0 */
+                       if (!zone_watermark_ok(zone, 0,
+                                       high_wmark_pages(zone), 0, 0)) {
+                               order = sc.order = 0;
+                               goto loop_again;
+                       }
+
+                       /* If balanced, clear the congested flag */
+                       zone_clear_flag(zone, ZONE_CONGESTED);
+               }
+       }
+
+       /*
+        * Return the order we were reclaiming at so sleeping_prematurely()
+        * makes a decision on the order we were last reclaiming at. However,
+        * if another caller entered the allocator slow path while kswapd
+        * was awake, order will remain at the higher level
+        */
+       *classzone_idx = end_zone;
+       return order;
+}
+
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+{
+       long remaining = 0;
+       DEFINE_WAIT(wait);
+
+       if (freezing(current) || kthread_should_stop())
+               return;
+
+       prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+
+       /* Try to sleep for a short interval */
+       if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+               remaining = schedule_timeout(HZ/10);
+               finish_wait(&pgdat->kswapd_wait, &wait);
+               prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+       }
+
+       /*
+        * After a short sleep, check if it was a premature sleep. If not, then
+        * go fully to sleep until explicitly woken up.
+        */
+       if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+               trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+
+               /*
+                * vmstat counters are not perfectly accurate and the estimated
+                * value for counters such as NR_FREE_PAGES can deviate from the
+                * true value by nr_online_cpus * threshold. To avoid the zone
+                * watermarks being breached while under pressure, we reduce the
+                * per-cpu vmstat threshold while kswapd is awake and restore
+                * them before going back to sleep.
+                */
+               set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
+               schedule();
+               set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
+       } else {
+               if (remaining)
+                       count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
+               else
+                       count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
+       }
+       finish_wait(&pgdat->kswapd_wait, &wait);
 }
 
 /*
@@ -2387,9 +2626,10 @@ out:
 static int kswapd(void *p)
 {
        unsigned long order;
+       int classzone_idx;
        pg_data_t *pgdat = (pg_data_t*)p;
        struct task_struct *tsk = current;
-       DEFINE_WAIT(wait);
+
        struct reclaim_state reclaim_state = {
                .reclaimed_slab = 0,
        };
@@ -2417,49 +2657,30 @@ static int kswapd(void *p)
        set_freezable();
 
        order = 0;
+       classzone_idx = MAX_NR_ZONES - 1;
        for ( ; ; ) {
                unsigned long new_order;
+               int new_classzone_idx;
                int ret;
 
-               prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
                new_order = pgdat->kswapd_max_order;
+               new_classzone_idx = pgdat->classzone_idx;
                pgdat->kswapd_max_order = 0;
-               if (order < new_order) {
+               pgdat->classzone_idx = MAX_NR_ZONES - 1;
+               if (order < new_order || classzone_idx > new_classzone_idx) {
                        /*
                         * Don't sleep if someone wants a larger 'order'
-                        * allocation
+                        * allocation or has tighter zone constraints
                         */
                        order = new_order;
+                       classzone_idx = new_classzone_idx;
                } else {
-                       if (!freezing(current) && !kthread_should_stop()) {
-                               long remaining = 0;
-
-                               /* Try to sleep for a short interval */
-                               if (!sleeping_prematurely(pgdat, order, remaining)) {
-                                       remaining = schedule_timeout(HZ/10);
-                                       finish_wait(&pgdat->kswapd_wait, &wait);
-                                       prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
-                               }
-
-                               /*
-                                * After a short sleep, check if it was a
-                                * premature sleep. If not, then go fully
-                                * to sleep until explicitly woken up
-                                */
-                               if (!sleeping_prematurely(pgdat, order, remaining)) {
-                                       trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
-                                       schedule();
-                               } else {
-                                       if (remaining)
-                                               count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
-                                       else
-                                               count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
-                               }
-                       }
-
+                       kswapd_try_to_sleep(pgdat, order, classzone_idx);
                        order = pgdat->kswapd_max_order;
+                       classzone_idx = pgdat->classzone_idx;
+                       pgdat->kswapd_max_order = 0;
+                       pgdat->classzone_idx = MAX_NR_ZONES - 1;
                }
-               finish_wait(&pgdat->kswapd_wait, &wait);
 
                ret = try_to_freeze();
                if (kthread_should_stop())
@@ -2471,7 +2692,7 @@ static int kswapd(void *p)
                 */
                if (!ret) {
                        trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-                       balance_pgdat(pgdat, order);
+                       order = balance_pgdat(pgdat, order, &classzone_idx);
                }
        }
        return 0;
@@ -2480,23 +2701,26 @@ static int kswapd(void *p)
 /*
  * A zone is low on free memory, so wake its kswapd task to service it.
  */
-void wakeup_kswapd(struct zone *zone, int order)
+void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 {
        pg_data_t *pgdat;
 
        if (!populated_zone(zone))
                return;
 
-       pgdat = zone->zone_pgdat;
-       if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
-               return;
-       if (pgdat->kswapd_max_order < order)
-               pgdat->kswapd_max_order = order;
-       trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
        if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                return;
+       pgdat = zone->zone_pgdat;
+       if (pgdat->kswapd_max_order < order) {
+               pgdat->kswapd_max_order = order;
+               pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
+       }
        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;
+       if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+               return;
+
+       trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
        wake_up_interruptible(&pgdat->kswapd_wait);
 }
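The order/classzone_idx bookkeeping in wakeup_kswapd() accumulates the largest requested order but the tightest zone restriction until kswapd re-reads and resets them. A minimal user-space sketch with illustrative zone indices (MAX_NR_ZONES is config-dependent; 4 is assumed here):

#include <stdio.h>

#define MAX_NR_ZONES	4	/* illustrative; the real value is config-dependent */

static int min(int a, int b) { return a < b ? a : b; }

int main(void)
{
	/* pgdat state as kswapd leaves it before going to sleep */
	int kswapd_max_order = 0;
	int classzone_idx = MAX_NR_ZONES - 1;
	/* two wakeups: order-2 limited to HighMem, then order-3 limited to Normal */
	struct { int order, classzone_idx; } wake[] = { { 2, 3 }, { 3, 1 } };
	int i;

	for (i = 0; i < 2; i++) {
		if (kswapd_max_order < wake[i].order) {
			kswapd_max_order = wake[i].order;
			classzone_idx = min(classzone_idx, wake[i].classzone_idx);
		}
	}

	/* kswapd reclaims at the largest order under the tightest zone limit */
	printf("order %d, classzone_idx %d\n", kswapd_max_order, classzone_idx);
	return 0;
}

In the kswapd loop above, these values are re-read and then reset to 0 and MAX_NR_ZONES - 1 before the next attempt to sleep.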
 
index 312d728..0c3b504 100644 (file)
@@ -83,7 +83,31 @@ EXPORT_SYMBOL(vm_stat);
 
 #ifdef CONFIG_SMP
 
-static int calculate_threshold(struct zone *zone)
+int calculate_pressure_threshold(struct zone *zone)
+{
+       int threshold;
+       int watermark_distance;
+
+       /*
+        * As vmstats are not up to date, there is drift between the estimated
+        * and real values. For high thresholds and a high number of CPUs, it
+        * is possible for the min watermark to be breached while the estimated
+        * value looks fine. The pressure threshold is a reduced value such
+        * that even the maximum amount of drift will not accidentally breach
+        * the min watermark
+        */
+       watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+       threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+       /*
+        * Maximum threshold is 125
+        */
+       threshold = min(125, threshold);
+
+       return threshold;
+}
+
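A worked example of the pressure threshold, using illustrative watermarks (low = 1250 pages, min = 1000 pages) and 16 online CPUs; the point is that worst-case vmstat drift (threshold * num_online_cpus) can no longer exceed the low-to-min gap:

#include <stdio.h>

static int max(int a, int b) { return a > b ? a : b; }
static int min(int a, int b) { return a < b ? a : b; }

int main(void)
{
	/* illustrative figures, not taken from a real zone */
	int low_wmark = 1250, min_wmark = 1000, num_online_cpus = 16;
	int watermark_distance = low_wmark - min_wmark;
	int threshold = max(1, watermark_distance / num_online_cpus);

	threshold = min(125, threshold);	/* same 125 cap as above */

	/* worst-case drift across all CPUs now fits inside the low->min gap */
	printf("threshold %d, max drift %d <= %d pages\n",
	       threshold, threshold * num_online_cpus, watermark_distance);
	return 0;
}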
+int calculate_normal_threshold(struct zone *zone)
 {
        int threshold;
        int mem;        /* memory in 128 MB units */
@@ -142,7 +166,7 @@ static void refresh_zone_stat_thresholds(void)
        for_each_populated_zone(zone) {
                unsigned long max_drift, tolerate_drift;
 
-               threshold = calculate_threshold(zone);
+               threshold = calculate_normal_threshold(zone);
 
                for_each_online_cpu(cpu)
                        per_cpu_ptr(zone->pageset, cpu)->stat_threshold
@@ -161,6 +185,26 @@ static void refresh_zone_stat_thresholds(void)
        }
 }
 
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+                               int (*calculate_pressure)(struct zone *))
+{
+       struct zone *zone;
+       int cpu;
+       int threshold;
+       int i;
+
+       for (i = 0; i < pgdat->nr_zones; i++) {
+               zone = &pgdat->node_zones[i];
+               if (!zone->percpu_drift_mark)
+                       continue;
+
+               threshold = (*calculate_pressure)(zone);
+               for_each_possible_cpu(cpu)
+                       per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+                                                       = threshold;
+       }
+}
+
 /*
  * For use when we know that interrupts are disabled.
  */
@@ -836,6 +880,7 @@ static const char * const vmstat_text[] = {
        "numa_local",
        "numa_other",
 #endif
+       "nr_anon_transparent_hugepages",
        "nr_dirty_threshold",
        "nr_dirty_background_threshold",
 
@@ -911,7 +956,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                   "\n        scanned  %lu"
                   "\n        spanned  %lu"
                   "\n        present  %lu",
-                  zone_nr_free_pages(zone),
+                  zone_page_state(zone, NR_FREE_PAGES),
                   min_wmark_pages(zone),
                   low_wmark_pages(zone),
                   high_wmark_pages(zone),
index 7f68625..f29abeb 100644 (file)
@@ -104,8 +104,26 @@ static pfn_t fault_pfn;
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
        if (pfn_valid(pfn)) {
-               struct page *page = compound_head(pfn_to_page(pfn));
-               return PageReserved(page);
+               int reserved;
+               struct page *tail = pfn_to_page(pfn);
+               struct page *head = compound_trans_head(tail);
+               reserved = PageReserved(head);
+               if (head != tail) {
+                       /*
+                        * "head" is not a dangling pointer
+                        * (compound_trans_head takes care of that)
+                        * but the hugepage may have been split
+                        * from under us (and we may not hold a
+                        * reference count on the head page so it can
+                        * be reused before we run PageReferenced), so
+                        * we have to check PageTail before returning
+                        * what we just read.
+                        */
+                       smp_rmb();
+                       if (PageTail(tail))
+                               return reserved;
+               }
+               return PageReserved(tail);
        }
 
        return true;
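To make the ordering argument concrete, one hypothetical interleaving the PageTail() recheck guards against (sketched from the comment above; the split/free steps on CPU 1 are illustrative):

/*
 *   CPU 0: kvm_is_mmio_pfn()              CPU 1
 *   head = compound_trans_head(tail);
 *   reserved = PageReserved(head);
 *                                         split_huge_page(head);
 *                                         head freed and reused
 *   smp_rmb();
 *   if (PageTail(tail))      <- now false, stale 'reserved' is discarded
 *           return reserved;
 *   return PageReserved(tail);  <- re-read from the now-independent tail page
 */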
@@ -352,6 +370,22 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
        return young;
 }
 
+static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
+                                      struct mm_struct *mm,
+                                      unsigned long address)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       int young, idx;
+
+       idx = srcu_read_lock(&kvm->srcu);
+       spin_lock(&kvm->mmu_lock);
+       young = kvm_test_age_hva(kvm, address);
+       spin_unlock(&kvm->mmu_lock);
+       srcu_read_unlock(&kvm->srcu, idx);
+
+       return young;
+}
+
 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
                                     struct mm_struct *mm)
 {
@@ -368,6 +402,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
        .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
        .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
        .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
+       .test_young             = kvm_mmu_notifier_test_young,
        .change_pte             = kvm_mmu_notifier_change_pte,
        .release                = kvm_mmu_notifier_release,
 };