Merge branch 'akpm' (patches from Andrew)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 7 Aug 2020 18:39:33 +0000 (11:39 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 7 Aug 2020 18:39:33 +0000 (11:39 -0700)
Merge misc updates from Andrew Morton:

 - a few MM hotfixes

 - kthread, tools, scripts, ntfs and ocfs2

 - some of MM

Subsystems affected by this patch series: kthread, tools, scripts, ntfs,
ocfs2 and mm (hotfixes, pagealloc, slab-generic, slab, slub, kcsan,
debug, pagecache, gup, swap, shmem, memcg, pagemap, mremap, mincore,
sparsemem, vmalloc, kasan, pagealloc, hugetlb and vmscan).

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (162 commits)
  mm: vmscan: consistent update to pgrefill
  mm/vmscan.c: fix typo
  khugepaged: khugepaged_test_exit() check mmget_still_valid()
  khugepaged: retract_page_tables() remember to test exit
  khugepaged: collapse_pte_mapped_thp() protect the pmd lock
  khugepaged: collapse_pte_mapped_thp() flush the right range
  mm/hugetlb: fix calculation of adjust_range_if_pmd_sharing_possible
  mm: thp: replace HTTP links with HTTPS ones
  mm/page_alloc: fix memalloc_nocma_{save/restore} APIs
  mm/page_alloc.c: skip setting nodemask when we are in interrupt
  mm/page_alloc: fallbacks at most has 3 elements
  mm/page_alloc: silence a KASAN false positive
  mm/page_alloc.c: remove unnecessary end_bitidx for [set|get]_pfnblock_flags_mask()
  mm/page_alloc.c: simplify pageblock bitmap access
  mm/page_alloc.c: extract the common part in pfn_to_bitidx()
  mm/page_alloc.c: replace the definition of NR_MIGRATETYPE_BITS with PB_migratetype_bits
  mm/shuffle: remove dynamic reconfiguration
  mm/memory_hotplug: document why shuffle_zone() is relevant
  mm/page_alloc: remove nr_free_pagecache_pages()
  mm: remove vm_total_pages
  ...

396 files changed:
Documentation/admin-guide/kernel-parameters.txt
Documentation/dev-tools/kasan.rst
Documentation/filesystems/dlmfs.rst
Documentation/filesystems/ocfs2.rst
Documentation/filesystems/tmpfs.rst
Documentation/vm/arch_pgtable_helpers.rst [new file with mode: 0644]
Documentation/vm/memory-model.rst
Documentation/vm/slub.rst
arch/alpha/include/asm/pgalloc.h
arch/alpha/include/asm/tlbflush.h
arch/alpha/kernel/core_irongate.c
arch/alpha/kernel/core_marvel.c
arch/alpha/kernel/core_titan.c
arch/alpha/kernel/machvec_impl.h
arch/alpha/kernel/smp.c
arch/alpha/mm/numa.c
arch/arc/mm/fault.c
arch/arc/mm/init.c
arch/arm/include/asm/pgalloc.h
arch/arm/include/asm/tlb.h
arch/arm/kernel/machine_kexec.c
arch/arm/kernel/smp.c
arch/arm/kernel/suspend.c
arch/arm/mach-omap2/omap-mpuss-lowpower.c
arch/arm/mm/hugetlbpage.c
arch/arm/mm/init.c
arch/arm/mm/mmu.c
arch/arm64/include/asm/pgalloc.h
arch/arm64/kernel/setup.c
arch/arm64/kernel/smp.c
arch/arm64/mm/hugetlbpage.c
arch/arm64/mm/init.c
arch/arm64/mm/ioremap.c
arch/arm64/mm/mmu.c
arch/csky/include/asm/pgalloc.h
arch/csky/kernel/smp.c
arch/hexagon/include/asm/pgalloc.h
arch/ia64/include/asm/pgalloc.h
arch/ia64/include/asm/tlb.h
arch/ia64/kernel/process.c
arch/ia64/kernel/smp.c
arch/ia64/kernel/smpboot.c
arch/ia64/mm/contig.c
arch/ia64/mm/discontig.c
arch/ia64/mm/hugetlbpage.c
arch/ia64/mm/tlb.c
arch/m68k/include/asm/mmu_context.h
arch/m68k/include/asm/sun3_pgalloc.h
arch/m68k/kernel/dma.c
arch/m68k/kernel/traps.c
arch/m68k/mm/cache.c
arch/m68k/mm/fault.c
arch/m68k/mm/kmap.c
arch/m68k/mm/mcfmmu.c
arch/m68k/mm/memory.c
arch/m68k/sun3x/dvma.c
arch/microblaze/include/asm/pgalloc.h
arch/microblaze/include/asm/tlbflush.h
arch/microblaze/kernel/process.c
arch/microblaze/kernel/signal.c
arch/microblaze/mm/init.c
arch/mips/include/asm/pgalloc.h
arch/mips/kernel/setup.c
arch/mips/loongson64/numa.c
arch/mips/sgi-ip27/ip27-memory.c
arch/mips/sgi-ip32/ip32-memory.c
arch/nds32/mm/mm-nds32.c
arch/nios2/include/asm/pgalloc.h
arch/openrisc/include/asm/pgalloc.h
arch/openrisc/include/asm/tlbflush.h
arch/openrisc/kernel/or32_ksyms.c
arch/parisc/include/asm/mmu_context.h
arch/parisc/include/asm/pgalloc.h
arch/parisc/kernel/cache.c
arch/parisc/kernel/pci-dma.c
arch/parisc/kernel/process.c
arch/parisc/kernel/signal.c
arch/parisc/kernel/smp.c
arch/parisc/mm/hugetlbpage.c
arch/parisc/mm/init.c
arch/parisc/mm/ioremap.c
arch/powerpc/include/asm/tlb.h
arch/powerpc/mm/book3s64/hash_hugetlbpage.c
arch/powerpc/mm/book3s64/hash_pgtable.c
arch/powerpc/mm/book3s64/hash_tlb.c
arch/powerpc/mm/book3s64/radix_hugetlbpage.c
arch/powerpc/mm/init_32.c
arch/powerpc/mm/init_64.c
arch/powerpc/mm/kasan/8xx.c
arch/powerpc/mm/kasan/book3s_32.c
arch/powerpc/mm/mem.c
arch/powerpc/mm/nohash/40x.c
arch/powerpc/mm/nohash/8xx.c
arch/powerpc/mm/nohash/fsl_booke.c
arch/powerpc/mm/nohash/kaslr_booke.c
arch/powerpc/mm/nohash/tlb.c
arch/powerpc/mm/numa.c
arch/powerpc/mm/pgtable.c
arch/powerpc/mm/pgtable_64.c
arch/powerpc/mm/ptdump/hashpagetable.c
arch/powerpc/mm/ptdump/ptdump.c
arch/powerpc/platforms/pseries/cmm.c
arch/riscv/include/asm/pgalloc.h
arch/riscv/mm/fault.c
arch/riscv/mm/init.c
arch/s390/crypto/prng.c
arch/s390/include/asm/tlb.h
arch/s390/include/asm/tlbflush.h
arch/s390/kernel/machine_kexec.c
arch/s390/kernel/ptrace.c
arch/s390/kvm/diag.c
arch/s390/kvm/priv.c
arch/s390/kvm/pv.c
arch/s390/mm/cmm.c
arch/s390/mm/init.c
arch/s390/mm/mmap.c
arch/s390/mm/pgtable.c
arch/sh/include/asm/pgalloc.h
arch/sh/kernel/idle.c
arch/sh/kernel/machine_kexec.c
arch/sh/mm/cache-sh3.c
arch/sh/mm/cache-sh7705.c
arch/sh/mm/hugetlbpage.c
arch/sh/mm/init.c
arch/sh/mm/ioremap_fixed.c
arch/sh/mm/numa.c
arch/sh/mm/tlb-sh3.c
arch/sparc/include/asm/ide.h
arch/sparc/include/asm/tlb_64.h
arch/sparc/kernel/leon_smp.c
arch/sparc/kernel/process_32.c
arch/sparc/kernel/signal_32.c
arch/sparc/kernel/smp_32.c
arch/sparc/kernel/smp_64.c
arch/sparc/kernel/sun4m_irq.c
arch/sparc/mm/highmem.c
arch/sparc/mm/init_64.c
arch/sparc/mm/io-unit.c
arch/sparc/mm/iommu.c
arch/sparc/mm/tlb.c
arch/um/include/asm/pgalloc.h
arch/um/include/asm/pgtable-3level.h
arch/um/kernel/mem.c
arch/x86/ia32/ia32_aout.c
arch/x86/include/asm/mmu_context.h
arch/x86/include/asm/pgalloc.h
arch/x86/kernel/alternative.c
arch/x86/kernel/apic/apic.c
arch/x86/kernel/mpparse.c
arch/x86/kernel/traps.c
arch/x86/mm/fault.c
arch/x86/mm/hugetlbpage.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/x86/mm/kaslr.c
arch/x86/mm/pgtable_32.c
arch/x86/mm/pti.c
arch/x86/platform/uv/bios_uv.c
arch/x86/power/hibernate.c
arch/xtensa/include/asm/pgalloc.h
arch/xtensa/kernel/xtensa_ksyms.c
arch/xtensa/mm/cache.c
arch/xtensa/mm/fault.c
crypto/adiantum.c
crypto/ahash.c
crypto/api.c
crypto/asymmetric_keys/verify_pefile.c
crypto/deflate.c
crypto/drbg.c
crypto/ecc.c
crypto/ecdh.c
crypto/gcm.c
crypto/gf128mul.c
crypto/jitterentropy-kcapi.c
crypto/rng.c
crypto/rsa-pkcs1pad.c
crypto/seqiv.c
crypto/shash.c
crypto/skcipher.c
crypto/testmgr.c
crypto/zstd.c
drivers/base/node.c
drivers/block/xen-blkback/common.h
drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c
drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c
drivers/crypto/amlogic/amlogic-gxl-cipher.c
drivers/crypto/atmel-ecc.c
drivers/crypto/caam/caampkc.c
drivers/crypto/cavium/cpt/cptvf_main.c
drivers/crypto/cavium/cpt/cptvf_reqmanager.c
drivers/crypto/cavium/nitrox/nitrox_lib.c
drivers/crypto/cavium/zip/zip_crypto.c
drivers/crypto/ccp/ccp-crypto-rsa.c
drivers/crypto/ccree/cc_aead.c
drivers/crypto/ccree/cc_buffer_mgr.c
drivers/crypto/ccree/cc_cipher.c
drivers/crypto/ccree/cc_hash.c
drivers/crypto/ccree/cc_request_mgr.c
drivers/crypto/marvell/cesa/hash.c
drivers/crypto/marvell/octeontx/otx_cptvf_main.c
drivers/crypto/marvell/octeontx/otx_cptvf_reqmgr.h
drivers/crypto/nx/nx.c
drivers/crypto/virtio/virtio_crypto_algs.c
drivers/crypto/virtio/virtio_crypto_core.c
drivers/iommu/ipmmu-vmsa.c
drivers/md/dm-crypt.c
drivers/md/dm-integrity.c
drivers/misc/ibmvmc.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
drivers/net/ppp/ppp_mppe.c
drivers/net/wireguard/noise.c
drivers/net/wireguard/peer.c
drivers/net/wireless/intel/iwlwifi/pcie/rx.c
drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
drivers/net/wireless/intel/iwlwifi/pcie/tx.c
drivers/net/wireless/intersil/orinoco/wext.c
drivers/s390/crypto/ap_bus.h
drivers/staging/ks7010/ks_hostif.c
drivers/staging/rtl8723bs/core/rtw_security.c
drivers/staging/wlan-ng/p80211netdev.c
drivers/target/iscsi/iscsi_target_auth.c
drivers/xen/balloon.c
drivers/xen/privcmd.c
fs/Kconfig
fs/aio.c
fs/binfmt_elf_fdpic.c
fs/cifs/cifsencrypt.c
fs/cifs/connect.c
fs/cifs/dfs_cache.c
fs/cifs/misc.c
fs/crypto/inline_crypt.c
fs/crypto/keyring.c
fs/crypto/keysetup_v1.c
fs/ecryptfs/keystore.c
fs/ecryptfs/messaging.c
fs/hugetlbfs/inode.c
fs/ntfs/dir.c
fs/ntfs/inode.c
fs/ntfs/inode.h
fs/ntfs/mft.c
fs/ocfs2/Kconfig
fs/ocfs2/acl.c
fs/ocfs2/blockcheck.c
fs/ocfs2/dlmglue.c
fs/ocfs2/ocfs2.h
fs/ocfs2/suballoc.c
fs/ocfs2/suballoc.h
fs/ocfs2/super.c
fs/proc/meminfo.c
include/asm-generic/pgalloc.h
include/asm-generic/tlb.h
include/crypto/aead.h
include/crypto/akcipher.h
include/crypto/gf128mul.h
include/crypto/hash.h
include/crypto/internal/acompress.h
include/crypto/kpp.h
include/crypto/skcipher.h
include/linux/efi.h
include/linux/fs.h
include/linux/huge_mm.h
include/linux/kasan.h
include/linux/memcontrol.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/mman.h
include/linux/mmu_notifier.h
include/linux/mmzone.h
include/linux/pageblock-flags.h
include/linux/percpu_counter.h
include/linux/sched/mm.h
include/linux/shmem_fs.h
include/linux/slab.h
include/linux/slab_def.h
include/linux/slub_def.h
include/linux/swap.h
include/linux/vmstat.h
init/Kconfig
init/main.c
ipc/shm.c
kernel/fork.c
kernel/kthread.c
kernel/power/snapshot.c
kernel/rcu/tree.c
kernel/scs.c
kernel/sysctl.c
lib/Kconfig.kasan
lib/Makefile
lib/mpi/mpiutil.c
lib/percpu_counter.c
lib/test_kasan.c
mm/Kconfig
mm/Makefile
mm/debug.c
mm/debug_vm_pgtable.c
mm/filemap.c
mm/gup.c
mm/huge_memory.c
mm/hugetlb.c
mm/ioremap.c [moved from lib/ioremap.c with 99% similarity]
mm/kasan/common.c
mm/kasan/generic.c
mm/kasan/generic_report.c
mm/kasan/kasan.h
mm/kasan/quarantine.c
mm/kasan/report.c
mm/kasan/tags.c
mm/khugepaged.c
mm/memcontrol.c
mm/memory.c
mm/memory_hotplug.c
mm/migrate.c
mm/mm_init.c
mm/mmap.c
mm/mremap.c
mm/nommu.c
mm/oom_kill.c
mm/page-writeback.c
mm/page_alloc.c
mm/page_counter.c
mm/page_io.c
mm/pgalloc-track.h [new file with mode: 0644]
mm/shmem.c
mm/shuffle.c
mm/shuffle.h
mm/slab.c
mm/slab.h
mm/slab_common.c
mm/slob.c
mm/slub.c
mm/sparse-vmemmap.c
mm/sparse.c
mm/swap_slots.c
mm/swap_state.c
mm/util.c
mm/vmalloc.c
mm/vmscan.c
mm/vmstat.c
mm/workingset.c
net/atm/mpoa_caches.c
net/bluetooth/ecdh_helper.c
net/bluetooth/smp.c
net/core/sock.c
net/ipv4/tcp_fastopen.c
net/mac80211/aead_api.c
net/mac80211/aes_gmac.c
net/mac80211/key.c
net/mac802154/llsec.c
net/sctp/auth.c
net/sunrpc/auth_gss/gss_krb5_crypto.c
net/sunrpc/auth_gss/gss_krb5_keys.c
net/sunrpc/auth_gss/gss_krb5_mech.c
net/tipc/crypto.c
net/wireless/core.c
net/wireless/ibss.c
net/wireless/lib80211_crypt_tkip.c
net/wireless/lib80211_crypt_wep.c
net/wireless/nl80211.c
net/wireless/sme.c
net/wireless/util.c
net/wireless/wext-sme.c
scripts/Makefile.kasan
scripts/bloat-o-meter
scripts/coccinelle/free/devm_free.cocci
scripts/coccinelle/free/ifnullfree.cocci
scripts/coccinelle/free/kfree.cocci
scripts/coccinelle/free/kfreeaddr.cocci
scripts/const_structs.checkpatch
scripts/decode_stacktrace.sh
scripts/spelling.txt
scripts/tags.sh
security/apparmor/domain.c
security/apparmor/include/file.h
security/apparmor/policy.c
security/apparmor/policy_ns.c
security/apparmor/policy_unpack.c
security/keys/big_key.c
security/keys/dh.c
security/keys/encrypted-keys/encrypted.c
security/keys/trusted-keys/trusted_tpm1.c
security/keys/user_defined.c
tools/cgroup/memcg_slabinfo.py [new file with mode: 0644]
tools/include/linux/jhash.h
tools/lib/rbtree.c
tools/lib/traceevent/event-parse.h
tools/testing/ktest/examples/README
tools/testing/ktest/examples/crosstests.conf
tools/testing/selftests/Makefile
tools/testing/selftests/cgroup/.gitignore
tools/testing/selftests/cgroup/Makefile
tools/testing/selftests/cgroup/cgroup_util.c
tools/testing/selftests/cgroup/test_kmem.c [new file with mode: 0644]
tools/testing/selftests/mincore/.gitignore [new file with mode: 0644]
tools/testing/selftests/mincore/Makefile [new file with mode: 0644]
tools/testing/selftests/mincore/mincore_selftest.c [new file with mode: 0644]

index 00b993a..98ea67f 100644 (file)
                        fragmentation.  Defaults to 1 for systems with
                        more than 32MB of RAM, 0 otherwise.
 
-       slub_debug[=options[,slabs]]    [MM, SLUB]
+       slub_debug[=options[,slabs][;[options[,slabs]]...]]     [MM, SLUB]
                        Enabling slub_debug allows one to determine the
                        culprit if slab objects become corrupted. Enabling
                        slub_debug can create guard zones around objects and
index c652d74..38fd568 100644 (file)
@@ -13,11 +13,8 @@ KASAN uses compile-time instrumentation to insert validity checks before every
 memory access, and therefore requires a compiler version that supports that.
 
 Generic KASAN is supported in both GCC and Clang. With GCC it requires version
-4.9.2 or later for basic support and version 5.0 or later for detection of
-out-of-bounds accesses for stack and global variables and for inline
-instrumentation mode (see the Usage section). With Clang it requires version
-7.0.0 or later and it doesn't support detection of out-of-bounds accesses for
-global variables yet.
+8.3.0 or later. With Clang it requires version 7.0.0 or later, but detection of
+out-of-bounds accesses for global variables is only supported since Clang 11.
 
 Tag-based KASAN is only supported in Clang and requires version 7.0.0 or later.
 
@@ -193,6 +190,9 @@ function calls GCC directly inserts the code to check the shadow memory.
 This option significantly enlarges kernel but it gives x1.1-x2 performance
 boost over outline instrumented kernel.
 
+Generic KASAN prints up to 2 call_rcu() call stacks in reports, the last one
+and the second to last.
+
 Software tag-based KASAN
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
index 68daaa7..28dd41a 100644 (file)
@@ -12,7 +12,7 @@ dlmfs is built with OCFS2 as it requires most of its infrastructure.
 
 :Project web page:    http://ocfs2.wiki.kernel.org
 :Tools web page:      https://github.com/markfasheh/ocfs2-tools
-:OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+:OCFS2 mailing lists: https://oss.oracle.com/projects/ocfs2/mailman/
 
 All code copyright 2005 Oracle except when otherwise noted.
 
index 412386b..42ca9a3 100644 (file)
@@ -14,7 +14,7 @@ get "mount.ocfs2" and "ocfs2_hb_ctl".
 
 Project web page:    http://ocfs2.wiki.kernel.org
 Tools git tree:      https://github.com/markfasheh/ocfs2-tools
-OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+OCFS2 mailing lists: https://oss.oracle.com/projects/ocfs2/mailman/
 
 All code copyright 2005 Oracle except when otherwise noted.
 
index 4e95929..c44f8b1 100644 (file)
@@ -150,6 +150,22 @@ These options do not have any effect on remount. You can change these
 parameters with chmod(1), chown(1) and chgrp(1) on a mounted filesystem.
 
 
+tmpfs has a mount option to select whether it will wrap at 32- or 64-bit inode
+numbers:
+
+=======   ========================
+inode64   Use 64-bit inode numbers
+inode32   Use 32-bit inode numbers
+=======   ========================
+
+On a 32-bit kernel, inode32 is implicit, and inode64 is refused at mount time.
+On a 64-bit kernel, CONFIG_TMPFS_INODE64 sets the default.  inode64 avoids the
+possibility of multiple files with the same inode number on a single device;
+but risks glibc failing with EOVERFLOW once 33-bit inode numbers are reached -
+if a long-lived tmpfs is accessed by 32-bit applications so ancient that
+opening a file larger than 2GiB fails with EINVAL.
+
+
 So 'mount -t tmpfs -o size=10G,nr_inodes=10k,mode=700 tmpfs /mytmpfs'
 will give you tmpfs instance on /mytmpfs which can allocate 10GB
 RAM/SWAP in 10240 inodes and it is only accessible by root.
@@ -161,3 +177,5 @@ RAM/SWAP in 10240 inodes and it is only accessible by root.
    Hugh Dickins, 4 June 2007
 :Updated:
    KOSAKI Motohiro, 16 Mar 2010
+:Updated:
+   Chris Down, 13 July 2020
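
For reference, a minimal userspace sketch of the new option (hypothetical
mount point and size; assumes a 64-bit kernel with this series applied)::

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* same effect as: mount -t tmpfs -o size=10G,inode64 tmpfs /mytmpfs */
            if (mount("tmpfs", "/mytmpfs", "tmpfs", 0, "size=10G,inode64")) {
                    perror("mount");   /* a 32-bit kernel refuses inode64 here */
                    return 1;
            }
            return 0;
    }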
diff --git a/Documentation/vm/arch_pgtable_helpers.rst b/Documentation/vm/arch_pgtable_helpers.rst
new file mode 100644 (file)
index 0000000..f3591ee
--- /dev/null
@@ -0,0 +1,258 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. _arch_page_table_helpers:
+
+===============================
+Architecture Page Table Helpers
+===============================
+
+Generic MM expects architectures (with an MMU) to provide helpers to create,
+access and modify page table entries at the various levels for different
+memory functions. These page table helpers need to conform to common
+semantics across platforms. The following tables describe the expected
+semantics, which can also be tested during boot via the
+CONFIG_DEBUG_VM_PGTABLE option. Any future change here or in the debug test
+needs to keep the two in sync.
+
+======================
+PTE Page Table Helpers
+======================
+
++---------------------------+--------------------------------------------------+
+| pte_same                  | Tests whether both PTE entries are the same      |
++---------------------------+--------------------------------------------------+
+| pte_bad                   | Tests a non-table mapped PTE                     |
++---------------------------+--------------------------------------------------+
+| pte_present               | Tests a valid mapped PTE                         |
++---------------------------+--------------------------------------------------+
+| pte_young                 | Tests a young PTE                                |
++---------------------------+--------------------------------------------------+
+| pte_dirty                 | Tests a dirty PTE                                |
++---------------------------+--------------------------------------------------+
+| pte_write                 | Tests a writable PTE                             |
++---------------------------+--------------------------------------------------+
+| pte_special               | Tests a special PTE                              |
++---------------------------+--------------------------------------------------+
+| pte_protnone              | Tests a PROT_NONE PTE                            |
++---------------------------+--------------------------------------------------+
+| pte_devmap                | Tests a ZONE_DEVICE mapped PTE                   |
++---------------------------+--------------------------------------------------+
+| pte_soft_dirty            | Tests a soft dirty PTE                           |
++---------------------------+--------------------------------------------------+
+| pte_swp_soft_dirty        | Tests a soft dirty swapped PTE                   |
++---------------------------+--------------------------------------------------+
+| pte_mkyoung               | Creates a young PTE                              |
++---------------------------+--------------------------------------------------+
+| pte_mkold                 | Creates an old PTE                               |
++---------------------------+--------------------------------------------------+
+| pte_mkdirty               | Creates a dirty PTE                              |
++---------------------------+--------------------------------------------------+
+| pte_mkclean               | Creates a clean PTE                              |
++---------------------------+--------------------------------------------------+
+| pte_mkwrite               | Creates a writable PTE                           |
++---------------------------+--------------------------------------------------+
+| pte_wrprotect             | Creates a write protected PTE                    |
++---------------------------+--------------------------------------------------+
+| pte_mkspecial             | Creates a special PTE                            |
++---------------------------+--------------------------------------------------+
+| pte_mkdevmap              | Creates a ZONE_DEVICE mapped PTE                 |
++---------------------------+--------------------------------------------------+
+| pte_mksoft_dirty          | Creates a soft dirty PTE                         |
++---------------------------+--------------------------------------------------+
+| pte_clear_soft_dirty      | Clears a soft dirty PTE                          |
++---------------------------+--------------------------------------------------+
+| pte_swp_mksoft_dirty      | Creates a soft dirty swapped PTE                 |
++---------------------------+--------------------------------------------------+
+| pte_swp_clear_soft_dirty  | Clears a soft dirty swapped PTE                  |
++---------------------------+--------------------------------------------------+
+| pte_mknotpresent          | Invalidates a mapped PTE                         |
++---------------------------+--------------------------------------------------+
+| ptep_get_and_clear        | Clears a PTE                                     |
++---------------------------+--------------------------------------------------+
+| ptep_get_and_clear_full   | Clears a PTE                                     |
++---------------------------+--------------------------------------------------+
+| ptep_test_and_clear_young | Clears young from a PTE                          |
++---------------------------+--------------------------------------------------+
+| ptep_set_wrprotect        | Converts into a write protected PTE              |
++---------------------------+--------------------------------------------------+
+| ptep_set_access_flags     | Converts into a more permissive PTE              |
++---------------------------+--------------------------------------------------+
+
+======================
+PMD Page Table Helpers
+======================
+
++---------------------------+--------------------------------------------------+
+| pmd_same                  | Tests whether both PMD entries are the same      |
++---------------------------+--------------------------------------------------+
+| pmd_bad                   | Tests a non-table mapped PMD                     |
++---------------------------+--------------------------------------------------+
+| pmd_leaf                  | Tests a leaf mapped PMD                          |
++---------------------------+--------------------------------------------------+
+| pmd_huge                  | Tests a HugeTLB mapped PMD                       |
++---------------------------+--------------------------------------------------+
+| pmd_trans_huge            | Tests a Transparent Huge Page (THP) at PMD       |
++---------------------------+--------------------------------------------------+
+| pmd_present               | Tests a valid mapped PMD                         |
++---------------------------+--------------------------------------------------+
+| pmd_young                 | Tests a young PMD                                |
++---------------------------+--------------------------------------------------+
+| pmd_dirty                 | Tests a dirty PMD                                |
++---------------------------+--------------------------------------------------+
+| pmd_write                 | Tests a writable PMD                             |
++---------------------------+--------------------------------------------------+
+| pmd_special               | Tests a special PMD                              |
++---------------------------+--------------------------------------------------+
+| pmd_protnone              | Tests a PROT_NONE PMD                            |
++---------------------------+--------------------------------------------------+
+| pmd_devmap                | Tests a ZONE_DEVICE mapped PMD                   |
++---------------------------+--------------------------------------------------+
+| pmd_soft_dirty            | Tests a soft dirty PMD                           |
++---------------------------+--------------------------------------------------+
+| pmd_swp_soft_dirty        | Tests a soft dirty swapped PMD                   |
++---------------------------+--------------------------------------------------+
+| pmd_mkyoung               | Creates a young PMD                              |
++---------------------------+--------------------------------------------------+
+| pmd_mkold                 | Creates an old PMD                               |
++---------------------------+--------------------------------------------------+
+| pmd_mkdirty               | Creates a dirty PMD                              |
++---------------------------+--------------------------------------------------+
+| pmd_mkclean               | Creates a clean PMD                              |
++---------------------------+--------------------------------------------------+
+| pmd_mkwrite               | Creates a writable PMD                           |
++---------------------------+--------------------------------------------------+
+| pmd_wrprotect             | Creates a write protected PMD                    |
++---------------------------+--------------------------------------------------+
+| pmd_mkspecial             | Creates a special PMD                            |
++---------------------------+--------------------------------------------------+
+| pmd_mkdevmap              | Creates a ZONE_DEVICE mapped PMD                 |
++---------------------------+--------------------------------------------------+
+| pmd_mksoft_dirty          | Creates a soft dirty PMD                         |
++---------------------------+--------------------------------------------------+
+| pmd_clear_soft_dirty      | Clears a soft dirty PMD                          |
++---------------------------+--------------------------------------------------+
+| pmd_swp_mksoft_dirty      | Creates a soft dirty swapped PMD                 |
++---------------------------+--------------------------------------------------+
+| pmd_swp_clear_soft_dirty  | Clears a soft dirty swapped PMD                  |
++---------------------------+--------------------------------------------------+
+| pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
++---------------------------+--------------------------------------------------+
+| pmd_set_huge              | Creates a PMD huge mapping                       |
++---------------------------+--------------------------------------------------+
+| pmd_clear_huge            | Clears a PMD huge mapping                        |
++---------------------------+--------------------------------------------------+
+| pmdp_get_and_clear        | Clears a PMD                                     |
++---------------------------+--------------------------------------------------+
+| pmdp_get_and_clear_full   | Clears a PMD                                     |
++---------------------------+--------------------------------------------------+
+| pmdp_test_and_clear_young | Clears young from a PMD                          |
++---------------------------+--------------------------------------------------+
+| pmdp_set_wrprotect        | Converts into a write protected PMD              |
++---------------------------+--------------------------------------------------+
+| pmdp_set_access_flags     | Converts into a more permissive PMD              |
++---------------------------+--------------------------------------------------+
+
+======================
+PUD Page Table Helpers
+======================
+
++---------------------------+--------------------------------------------------+
+| pud_same                  | Tests whether both PUD entries are the same      |
++---------------------------+--------------------------------------------------+
+| pud_bad                   | Tests a non-table mapped PUD                     |
++---------------------------+--------------------------------------------------+
+| pud_leaf                  | Tests a leaf mapped PUD                          |
++---------------------------+--------------------------------------------------+
+| pud_huge                  | Tests a HugeTLB mapped PUD                       |
++---------------------------+--------------------------------------------------+
+| pud_trans_huge            | Tests a Transparent Huge Page (THP) at PUD       |
++---------------------------+--------------------------------------------------+
+| pud_present               | Tests a valid mapped PUD                         |
++---------------------------+--------------------------------------------------+
+| pud_young                 | Tests a young PUD                                |
++---------------------------+--------------------------------------------------+
+| pud_dirty                 | Tests a dirty PUD                                |
++---------------------------+--------------------------------------------------+
+| pud_write                 | Tests a writable PUD                             |
++---------------------------+--------------------------------------------------+
+| pud_devmap                | Tests a ZONE_DEVICE mapped PUD                   |
++---------------------------+--------------------------------------------------+
+| pud_mkyoung               | Creates a young PUD                              |
++---------------------------+--------------------------------------------------+
+| pud_mkold                 | Creates an old PUD                               |
++---------------------------+--------------------------------------------------+
+| pud_mkdirty               | Creates a dirty PUD                              |
++---------------------------+--------------------------------------------------+
+| pud_mkclean               | Creates a clean PUD                              |
++---------------------------+--------------------------------------------------+
+| pud_mkwrite               | Creates a writable PUD                           |
++---------------------------+--------------------------------------------------+
+| pud_wrprotect             | Creates a write protected PUD                    |
++---------------------------+--------------------------------------------------+
+| pud_mkdevmap              | Creates a ZONE_DEVICE mapped PUD                 |
++---------------------------+--------------------------------------------------+
+| pud_mkinvalid             | Invalidates a mapped PUD [1]                     |
++---------------------------+--------------------------------------------------+
+| pud_set_huge              | Creates a PUD huge mapping                       |
++---------------------------+--------------------------------------------------+
+| pud_clear_huge            | Clears a PUD huge mapping                        |
++---------------------------+--------------------------------------------------+
+| pudp_get_and_clear        | Clears a PUD                                     |
++---------------------------+--------------------------------------------------+
+| pudp_get_and_clear_full   | Clears a PUD                                     |
++---------------------------+--------------------------------------------------+
+| pudp_test_and_clear_young | Clears young from a PUD                          |
++---------------------------+--------------------------------------------------+
+| pudp_set_wrprotect        | Converts into a write protected PUD              |
++---------------------------+--------------------------------------------------+
+| pudp_set_access_flags     | Converts into a more permissive PUD              |
++---------------------------+--------------------------------------------------+
+
+==========================
+HugeTLB Page Table Helpers
+==========================
+
++---------------------------+--------------------------------------------------+
+| pte_huge                  | Tests a HugeTLB                                  |
++---------------------------+--------------------------------------------------+
+| pte_mkhuge                | Creates a HugeTLB                                |
++---------------------------+--------------------------------------------------+
+| huge_pte_dirty            | Tests a dirty HugeTLB                            |
++---------------------------+--------------------------------------------------+
+| huge_pte_write            | Tests a writable HugeTLB                         |
++---------------------------+--------------------------------------------------+
+| huge_pte_mkdirty          | Creates a dirty HugeTLB                          |
++---------------------------+--------------------------------------------------+
+| huge_pte_mkwrite          | Creates a writable HugeTLB                       |
++---------------------------+--------------------------------------------------+
+| huge_pte_wrprotect        | Creates a write protected HugeTLB                |
++---------------------------+--------------------------------------------------+
+| huge_ptep_get_and_clear   | Clears a HugeTLB                                 |
++---------------------------+--------------------------------------------------+
+| huge_ptep_set_wrprotect   | Converts into a write protected HugeTLB          |
++---------------------------+--------------------------------------------------+
+| huge_ptep_set_access_flags  | Converts into a more permissive HugeTLB        |
++---------------------------+--------------------------------------------------+
+
+========================
+SWAP Page Table Helpers
+========================
+
++---------------------------+--------------------------------------------------+
+| __pte_to_swp_entry        | Creates a swapped entry (arch) from a mapped PTE |
++---------------------------+--------------------------------------------------+
+| __swp_entry_to_pte        | Creates a mapped PTE from a swapped entry (arch) |
++---------------------------+--------------------------------------------------+
+| __pmd_to_swp_entry        | Creates a swapped entry (arch) from a mapped PMD |
++---------------------------+--------------------------------------------------+
+| __swp_entry_to_pmd        | Creates a mapped PMD from a swapped entry (arch) |
++---------------------------+--------------------------------------------------+
+| is_migration_entry        | Tests a migration (read or write) swapped entry  |
++---------------------------+--------------------------------------------------+
+| is_write_migration_entry  | Tests a write migration swapped entry            |
++---------------------------+--------------------------------------------------+
+| make_migration_entry_read | Converts into read migration swapped entry       |
++---------------------------+--------------------------------------------------+
+| make_migration_entry      | Creates a migration swapped entry (read or write)|
++---------------------------+--------------------------------------------------+
+
+[1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/
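
For a flavour of how these semantics can be exercised, a condensed and
hypothetical sketch in the spirit of the boot-time CONFIG_DEBUG_VM_PGTABLE
checks (mm/debug_vm_pgtable.c); this is not the actual test code::

    #include <linux/init.h>
    #include <linux/mm.h>
    #include <linux/pgtable.h>

    static void __init pte_semantics_sketch(unsigned long pfn, pgprot_t prot)
    {
            pte_t pte = pfn_pte(pfn, prot);

            /* the "Creates ..." helpers must be visible via the "Tests ..." helpers */
            WARN_ON(!pte_dirty(pte_mkdirty(pte)));
            WARN_ON(!pte_young(pte_mkyoung(pte)));
            WARN_ON(!pte_write(pte_mkwrite(pte)));

            /* ...and their inverse helpers must clear the bits again */
            WARN_ON(pte_dirty(pte_mkclean(pte_mkdirty(pte))));
            WARN_ON(pte_young(pte_mkold(pte_mkyoung(pte))));
            WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte))));
    }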
index cc65bc8..7694497 100644 (file)
@@ -141,11 +141,8 @@ sections:
   `mem_section` objects and the number of rows is calculated to fit
   all the memory sections.
 
-The architecture setup code should call :c:func:`memory_present` for
-each active memory range or use :c:func:`memblocks_present` or
-:c:func:`sparse_memory_present_with_active_regions` wrappers to
-initialize the memory sections. Next, the actual memory maps should be
-set up using :c:func:`sparse_init`.
+The architecture setup code should call sparse_init() to
+initialize the memory sections and the memory maps.
 
 With SPARSEMEM there are two possible ways to convert a PFN to the
 corresponding `struct page` - a "classic sparse" and "sparse
@@ -178,7 +175,7 @@ for persistent memory devices in pre-allocated storage on those
 devices. This storage is represented with :c:type:`struct vmem_altmap`
 that is eventually passed to vmemmap_populate() through a long chain
 of function calls. The vmemmap_populate() implementation may use the
-`vmem_altmap` along with :c:func:`altmap_alloc_block_buf` helper to
+`vmem_altmap` along with :c:func:`vmemmap_alloc_block_buf` helper to
 allocate memory map on the persistent memory device.
 
 ZONE_DEVICE
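
Condensed, the arch-side ordering implied by this change looks as follows
(a hypothetical sketch; the arm and arm64 bootmem_init() hunks later in this
diff make exactly this simplification)::

    #include <linux/init.h>
    #include <linux/mmzone.h>

    void __init bootmem_init(void)
    {
            /* ... fixed memblock reservations are done first ... */

            /*
             * No explicit memory_present()/memblocks_present() call any more:
             * sparse_init() walks memblock itself to register the sections
             * and then sets up the memory maps.
             */
            sparse_init();

            /* ... zone_sizes_init() etc. follow ... */
    }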
index 4eee598..289d231 100644 (file)
@@ -41,6 +41,11 @@ slub_debug=<Debug-Options>,<slab name1>,<slab name2>,...
        Enable options only for select slabs (no spaces
        after a comma)
 
+Multiple blocks of options for all slabs or selected slabs can be given, with
+the blocks delimited by ';'. The last "all slabs" block is applied to all
+slabs except those that match one of the "select slabs" blocks. For a slab
+that does match, the options of the first matching "select slabs" block are
+applied.
+
 Possible debug options are::
 
        F               Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS
@@ -83,17 +88,33 @@ switch off debugging for such caches by default, use::
 
        slub_debug=O
 
-In case you forgot to enable debugging on the kernel command line: It is
-possible to enable debugging manually when the kernel is up. Look at the
-contents of::
+You can apply different options to different lists of slab names, using blocks
+of options. The following enables red zoning for dentry and user tracking for
+kmalloc caches, while all other slabs get no debugging::
+
+       slub_debug=Z,dentry;U,kmalloc-*
+
+You can also enable options (e.g. sanity checks and poisoning) for all caches
+except some that are deemed too performance-critical and don't need to be
+debugged, by specifying global debug options followed by a list of slab names
+with "-" as options::
+
+       slub_debug=FZ;-,zs_handle,zspage
+
+The state of each debug option for a slab can be found in the respective files
+under::
 
        /sys/kernel/slab/<slab name>/
 
-Look at the writable files. Writing 1 to them will enable the
-corresponding debug option. All options can be set on a slab that does
-not contain objects. If the slab already contains objects then sanity checks
-and tracing may only be enabled. The other options may cause the realignment
-of objects.
+If a file contains 1, the option is enabled; 0 means disabled. The debug
+options from the ``slub_debug`` parameter translate to the following files::
+
+       F       sanity_checks
+       Z       red_zone
+       P       poison
+       U       store_user
+       T       trace
+       A       failslab
 
 Careful with tracing: It may spew out lots of information and never stop if
 used on the wrong slab.
index a1a29f6..9c6a24f 100644 (file)
@@ -5,7 +5,7 @@
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 
-#include <asm-generic/pgalloc.h>       /* for pte_{alloc,free}_one */
+#include <asm-generic/pgalloc.h>
 
 /*      
  * Allocate and free page tables. The xxx_kernel() versions are
@@ -34,23 +34,4 @@ pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 
-static inline void
-pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-       free_page((unsigned long)pgd);
-}
-
-static inline pmd_t *
-pmd_alloc_one(struct mm_struct *mm, unsigned long address)
-{
-       pmd_t *ret = (pmd_t *)__get_free_page(GFP_PGTABLE_USER);
-       return ret;
-}
-
-static inline void
-pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
-       free_page((unsigned long)pmd);
-}
-
 #endif /* _ALPHA_PGALLOC_H */
index f8b4924..94dc37c 100644 (file)
@@ -5,7 +5,6 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <asm/compiler.h>
-#include <asm/pgalloc.h>
 
 #ifndef __EXTERN_INLINE
 #define __EXTERN_INLINE extern inline
index a9fd133..72af1e7 100644 (file)
@@ -302,7 +302,6 @@ irongate_init_arch(void)
 #include <linux/agp_backend.h>
 #include <linux/agpgart.h>
 #include <linux/export.h>
-#include <asm/pgalloc.h>
 
 #define GET_PAGE_DIR_OFF(addr) (addr >> 22)
 #define GET_PAGE_DIR_IDX(addr) (GET_PAGE_DIR_OFF(addr))
index 1db9d0e..4c80d99 100644 (file)
@@ -23,7 +23,6 @@
 #include <asm/ptrace.h>
 #include <asm/smp.h>
 #include <asm/gct.h>
-#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/vga.h>
 
index 2a2820f..77f5d68 100644 (file)
@@ -20,7 +20,6 @@
 
 #include <asm/ptrace.h>
 #include <asm/smp.h>
-#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/vga.h>
 
index 38f045e..393d5d6 100644 (file)
@@ -7,8 +7,6 @@
  * This file has goodies to help simplify instantiation of machine vectors.
  */
 
-#include <asm/pgalloc.h>
-
 /* Whee.  These systems don't have an HAE:
        IRONGATE, MARVEL, POLARIS, TSUNAMI, TITAN, WILDFIRE
    Fix things up for the GENERIC kernel by defining the HAE address
index 631cc17..f4dd9f3 100644 (file)
@@ -36,7 +36,6 @@
 
 #include <asm/io.h>
 #include <asm/irq.h>
-#include <asm/pgalloc.h>
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
 
index 5ad6087..0636e25 100644 (file)
@@ -17,7 +17,6 @@
 #include <linux/module.h>
 
 #include <asm/hwrpb.h>
-#include <asm/pgalloc.h>
 #include <asm/sections.h>
 
 pg_data_t node_data[MAX_NUMNODES];
index 72f5405..7287c79 100644 (file)
@@ -13,7 +13,6 @@
 #include <linux/kdebug.h>
 #include <linux/perf_event.h>
 #include <linux/mm_types.h>
-#include <asm/pgalloc.h>
 #include <asm/mmu.h>
 
 /*
index e7bdc2a..f886ac6 100644 (file)
@@ -14,7 +14,6 @@
 #include <linux/module.h>
 #include <linux/highmem.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 #include <asm/sections.h>
 #include <asm/arcregs.h>
 
index 069da39..15f4674 100644 (file)
 
 #ifdef CONFIG_ARM_LPAE
 
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
-       return (pmd_t *)get_zeroed_page(GFP_KERNEL);
-}
-
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
-       BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
-       free_page((unsigned long)pmd);
-}
-
 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 {
        set_pud(pud, __pud(__pa(pmd) | PMD_TYPE_TABLE));
@@ -76,6 +65,7 @@ static inline void clean_pte_table(pte_t *pte)
 
 #define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
 #define __HAVE_ARCH_PTE_ALLOC_ONE
+#define __HAVE_ARCH_PGD_FREE
 #include <asm-generic/pgalloc.h>
 
 static inline pte_t *
index 4d4e7b6..9415222 100644 (file)
@@ -27,7 +27,6 @@
 #else /* !CONFIG_MMU */
 
 #include <linux/swap.h>
-#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 
 static inline void __tlb_remove_table(void *_table)
index 974b6c6..5d84ad3 100644 (file)
@@ -11,7 +11,6 @@
 #include <linux/irq.h>
 #include <linux/memblock.h>
 #include <linux/of_fdt.h>
-#include <asm/pgalloc.h>
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
 #include <asm/fncpy.h>
index 9a64325..5d9da61 100644 (file)
@@ -37,7 +37,6 @@
 #include <asm/idmap.h>
 #include <asm/topology.h>
 #include <asm/mmu_context.h>
-#include <asm/pgalloc.h>
 #include <asm/procinfo.h>
 #include <asm/processor.h>
 #include <asm/sections.h>
index d2c9338..24bd205 100644 (file)
@@ -7,7 +7,6 @@
 #include <asm/bugs.h>
 #include <asm/cacheflush.h>
 #include <asm/idmap.h>
-#include <asm/pgalloc.h>
 #include <asm/memory.h>
 #include <asm/smp_plat.h>
 #include <asm/suspend.h>
index 67fa285..9fba98c 100644 (file)
@@ -42,7 +42,6 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 #include <asm/smp_scu.h>
-#include <asm/pgalloc.h>
 #include <asm/suspend.h>
 #include <asm/virt.h>
 #include <asm/hardware/cache-l2x0.h>
index a1e5aac..dd7a027 100644 (file)
@@ -17,7 +17,6 @@
 #include <asm/mman.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
 
 /*
  * On ARM, huge pages are backed by pmd's rather than pte's, so we do a lot
index 01e18e4..000c1b4 100644 (file)
@@ -243,13 +243,8 @@ void __init bootmem_init(void)
                      (phys_addr_t)max_low_pfn << PAGE_SHIFT);
 
        /*
-        * Sparsemem tries to allocate bootmem in memory_present(),
-        * so must be done after the fixed reservations
-        */
-       memblocks_present();
-
-       /*
-        * sparse_init() needs the bootmem allocator up and running.
+        * sparse_init() tries to allocate memory from memblock, so must be
+        * done after the fixed reservations
         */
        sparse_init();
 
index cc3c9a6..c36f977 100644 (file)
@@ -29,6 +29,7 @@
 #include <asm/traps.h>
 #include <asm/procinfo.h>
 #include <asm/memory.h>
+#include <asm/pgalloc.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
index 58e9358..3c6a7f5 100644 (file)
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-#include <asm-generic/pgalloc.h>       /* for pte_{alloc,free}_one */
+#define __HAVE_ARCH_PGD_FREE
+#include <asm-generic/pgalloc.h>
 
 #define PGD_SIZE       (PTRS_PER_PGD * sizeof(pgd_t))
 
 #if CONFIG_PGTABLE_LEVELS > 2
 
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
-       gfp_t gfp = GFP_PGTABLE_USER;
-       struct page *page;
-
-       if (mm == &init_mm)
-               gfp = GFP_PGTABLE_KERNEL;
-
-       page = alloc_page(gfp);
-       if (!page)
-               return NULL;
-       if (!pgtable_pmd_page_ctor(page)) {
-               __free_page(page);
-               return NULL;
-       }
-       return page_address(page);
-}
-
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmdp)
-{
-       BUG_ON((unsigned long)pmdp & (PAGE_SIZE-1));
-       pgtable_pmd_page_dtor(virt_to_page(pmdp));
-       free_page((unsigned long)pmdp);
-}
-
 static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot)
 {
        set_pud(pudp, __pud(__phys_to_pud_val(pmdp) | prot));
@@ -62,17 +38,6 @@ static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot)
 
 #if CONFIG_PGTABLE_LEVELS > 3
 
-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
-       return (pud_t *)__get_free_page(GFP_PGTABLE_USER);
-}
-
-static inline void pud_free(struct mm_struct *mm, pud_t *pudp)
-{
-       BUG_ON((unsigned long)pudp & (PAGE_SIZE-1));
-       free_page((unsigned long)pudp);
-}
-
 static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot)
 {
        set_p4d(p4dp, __p4d(__phys_to_p4d_val(pudp) | prot));
index c793276..87e81d2 100644 (file)
@@ -276,7 +276,7 @@ arch_initcall(reserve_memblock_reserved_regions);
 
 u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };
 
-void __init setup_arch(char **cmdline_p)
+void __init __no_sanitize_address setup_arch(char **cmdline_p)
 {
        init_mm.start_code = (unsigned long) _text;
        init_mm.end_code   = (unsigned long) _etext;
index e43a8ff..8059d50 100644 (file)
@@ -43,7 +43,6 @@
 #include <asm/kvm_mmu.h>
 #include <asm/mmu_context.h>
 #include <asm/numa.h>
-#include <asm/pgalloc.h>
 #include <asm/processor.h>
 #include <asm/smp_plat.h>
 #include <asm/sections.h>
index aa421bf..55ecf6d 100644 (file)
@@ -17,7 +17,6 @@
 #include <asm/mman.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
 
 /*
  * HugeTLB Support Matrix
index f8c19c6..481d22c 100644 (file)
@@ -430,11 +430,9 @@ void __init bootmem_init(void)
 #endif
 
        /*
-        * Sparsemem tries to allocate bootmem in memory_present(), so must be
-        * done after the fixed reservations.
+        * sparse_init() tries to allocate memory from memblock, so must be
+        * done after the fixed reservations
         */
-       memblocks_present();
-
        sparse_init();
        zone_sizes_init(min, max);
 
index 9be71be..b5e83c4 100644 (file)
@@ -16,7 +16,6 @@
 
 #include <asm/fixmap.h>
 #include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
 
 static void __iomem *__ioremap_caller(phys_addr_t phys_addr, size_t size,
                                      pgprot_t prot, void *caller)
index 1df25f2..75df62f 100644 (file)
@@ -35,6 +35,7 @@
 #include <asm/mmu_context.h>
 #include <asm/ptdump.h>
 #include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
 
 #define NO_BLOCK_MAPPINGS      BIT(0)
 #define NO_CONT_MAPPINGS       BIT(1)
@@ -760,15 +761,20 @@ int kern_addr_valid(unsigned long addr)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-static void free_hotplug_page_range(struct page *page, size_t size)
+static void free_hotplug_page_range(struct page *page, size_t size,
+                                   struct vmem_altmap *altmap)
 {
-       WARN_ON(PageReserved(page));
-       free_pages((unsigned long)page_address(page), get_order(size));
+       if (altmap) {
+               vmem_altmap_free(altmap, size >> PAGE_SHIFT);
+       } else {
+               WARN_ON(PageReserved(page));
+               free_pages((unsigned long)page_address(page), get_order(size));
+       }
 }
 
 static void free_hotplug_pgtable_page(struct page *page)
 {
-       free_hotplug_page_range(page, PAGE_SIZE);
+       free_hotplug_page_range(page, PAGE_SIZE, NULL);
 }
 
 static bool pgtable_range_aligned(unsigned long start, unsigned long end,
@@ -791,7 +797,8 @@ static bool pgtable_range_aligned(unsigned long start, unsigned long end,
 }
 
 static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
-                                   unsigned long end, bool free_mapped)
+                                   unsigned long end, bool free_mapped,
+                                   struct vmem_altmap *altmap)
 {
        pte_t *ptep, pte;
 
@@ -805,12 +812,14 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
                pte_clear(&init_mm, addr, ptep);
                flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
                if (free_mapped)
-                       free_hotplug_page_range(pte_page(pte), PAGE_SIZE);
+                       free_hotplug_page_range(pte_page(pte),
+                                               PAGE_SIZE, altmap);
        } while (addr += PAGE_SIZE, addr < end);
 }
 
 static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
-                                   unsigned long end, bool free_mapped)
+                                   unsigned long end, bool free_mapped,
+                                   struct vmem_altmap *altmap)
 {
        unsigned long next;
        pmd_t *pmdp, pmd;
@@ -833,16 +842,17 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
                        flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
                        if (free_mapped)
                                free_hotplug_page_range(pmd_page(pmd),
-                                                       PMD_SIZE);
+                                                       PMD_SIZE, altmap);
                        continue;
                }
                WARN_ON(!pmd_table(pmd));
-               unmap_hotplug_pte_range(pmdp, addr, next, free_mapped);
+               unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap);
        } while (addr = next, addr < end);
 }
 
 static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
-                                   unsigned long end, bool free_mapped)
+                                   unsigned long end, bool free_mapped,
+                                   struct vmem_altmap *altmap)
 {
        unsigned long next;
        pud_t *pudp, pud;
@@ -865,16 +875,17 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
                        flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
                        if (free_mapped)
                                free_hotplug_page_range(pud_page(pud),
-                                                       PUD_SIZE);
+                                                       PUD_SIZE, altmap);
                        continue;
                }
                WARN_ON(!pud_table(pud));
-               unmap_hotplug_pmd_range(pudp, addr, next, free_mapped);
+               unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap);
        } while (addr = next, addr < end);
 }
 
 static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
-                                   unsigned long end, bool free_mapped)
+                                   unsigned long end, bool free_mapped,
+                                   struct vmem_altmap *altmap)
 {
        unsigned long next;
        p4d_t *p4dp, p4d;
@@ -887,16 +898,24 @@ static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
                        continue;
 
                WARN_ON(!p4d_present(p4d));
-               unmap_hotplug_pud_range(p4dp, addr, next, free_mapped);
+               unmap_hotplug_pud_range(p4dp, addr, next, free_mapped, altmap);
        } while (addr = next, addr < end);
 }
 
 static void unmap_hotplug_range(unsigned long addr, unsigned long end,
-                               bool free_mapped)
+                               bool free_mapped, struct vmem_altmap *altmap)
 {
        unsigned long next;
        pgd_t *pgdp, pgd;
 
+       /*
+        * altmap can only be used as vmemmap mapping backing memory.
+        * In case the backing memory itself is not being freed, then
+        * altmap is irrelevant. Warn about this inconsistency when
+        * encountered.
+        */
+       WARN_ON(!free_mapped && altmap);
+
        do {
                next = pgd_addr_end(addr, end);
                pgdp = pgd_offset_k(addr);
@@ -905,7 +924,7 @@ static void unmap_hotplug_range(unsigned long addr, unsigned long end,
                        continue;
 
                WARN_ON(!pgd_present(pgd));
-               unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped);
+               unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
        } while (addr = next, addr < end);
 }
 
@@ -1069,7 +1088,7 @@ static void free_empty_tables(unsigned long addr, unsigned long end,
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
                struct vmem_altmap *altmap)
 {
-       return vmemmap_populate_basepages(start, end, node);
+       return vmemmap_populate_basepages(start, end, node, altmap);
 }
 #else  /* !ARM64_SWAPPER_USES_SECTION_MAPS */
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
@@ -1101,7 +1120,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
                if (pmd_none(READ_ONCE(*pmdp))) {
                        void *p = NULL;
 
-                       p = vmemmap_alloc_block_buf(PMD_SIZE, node);
+                       p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
                        if (!p)
                                return -ENOMEM;
 
@@ -1119,7 +1138,7 @@ void vmemmap_free(unsigned long start, unsigned long end,
 #ifdef CONFIG_MEMORY_HOTPLUG
        WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
 
-       unmap_hotplug_range(start, end, true);
+       unmap_hotplug_range(start, end, true, altmap);
        free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
 #endif
 }
@@ -1410,7 +1429,7 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
        WARN_ON(pgdir != init_mm.pgd);
        WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END));
 
-       unmap_hotplug_range(start, end, false);
+       unmap_hotplug_range(start, end, false, NULL);
        free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
 }
 
index c7c1ed2..d58d814 100644 (file)
@@ -9,7 +9,7 @@
 #include <linux/sched.h>
 
 #define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
-#include <asm-generic/pgalloc.h>       /* for pte_{alloc,free}_one */
+#include <asm-generic/pgalloc.h>
 
 static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
                                        pte_t *pte)
@@ -42,11 +42,6 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
        return pte;
 }
 
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-       free_pages((unsigned long)pgd, PGD_ORDER);
-}
-
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
        pgd_t *ret;
index e7425e6..041d0de 100644 (file)
@@ -23,7 +23,6 @@
 #include <asm/traps.h>
 #include <asm/sections.h>
 #include <asm/mmu_context.h>
-#include <asm/pgalloc.h>
 #ifdef CONFIG_CPU_HAS_FPU
 #include <abi/fpu.h>
 #endif
index cc9be51..f0c47e6 100644 (file)
@@ -11,7 +11,7 @@
 #include <asm/mem-layout.h>
 #include <asm/atomic.h>
 
-#include <asm-generic/pgalloc.h>       /* for pte_{alloc,free}_one */
+#include <asm-generic/pgalloc.h>
 
 extern unsigned long long kmap_generation;
 
@@ -41,11 +41,6 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
        return pgd;
 }
 
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-       free_page((unsigned long) pgd);
-}
-
 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
                                pgtable_t pte)
 {
index 2a30503..9601cfe 100644 (file)
@@ -29,11 +29,6 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
        return (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
 }
 
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-       free_page((unsigned long)pgd);
-}
-
 #if CONFIG_PGTABLE_LEVELS == 4
 static inline void
 p4d_populate(struct mm_struct *mm, p4d_t * p4d_entry, pud_t * pud)
@@ -41,15 +36,6 @@ p4d_populate(struct mm_struct *mm, p4d_t * p4d_entry, pud_t * pud)
        p4d_val(*p4d_entry) = __pa(pud);
 }
 
-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
-       return (pud_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-}
-
-static inline void pud_free(struct mm_struct *mm, pud_t *pud)
-{
-       free_page((unsigned long)pud);
-}
 #define __pud_free_tlb(tlb, pud, address)      pud_free((tlb)->mm, pud)
 #endif /* CONFIG_PGTABLE_LEVELS == 4 */
 
@@ -59,16 +45,6 @@ pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd)
        pud_val(*pud_entry) = __pa(pmd);
 }
 
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
-       return (pmd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-}
-
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
-       free_page((unsigned long)pmd);
-}
-
 #define __pmd_free_tlb(tlb, pmd, address)      pmd_free((tlb)->mm, pmd)
 
 static inline void
index f1f257d..8d9da6f 100644 (file)
@@ -42,7 +42,6 @@
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 
-#include <asm/pgalloc.h>
 #include <asm/processor.h>
 #include <asm/tlbflush.h>
 
index 7a4de9d..ec0b40f 100644 (file)
@@ -40,7 +40,6 @@
 #include <asm/elf.h>
 #include <asm/irq.h>
 #include <asm/kexec.h>
-#include <asm/pgalloc.h>
 #include <asm/processor.h>
 #include <asm/sal.h>
 #include <asm/switch_to.h>
index bbfd421..0e27420 100644 (file)
@@ -39,7 +39,6 @@
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 #include <asm/processor.h>
 #include <asm/ptrace.h>
 #include <asm/sal.h>
index 016683b..c29c600 100644 (file)
@@ -49,7 +49,6 @@
 #include <asm/irq.h>
 #include <asm/mca.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 #include <asm/processor.h>
 #include <asm/ptrace.h>
 #include <asm/sal.h>
index d7d31c7..e30e360 100644 (file)
@@ -21,7 +21,6 @@
 #include <linux/swap.h>
 
 #include <asm/meminit.h>
-#include <asm/pgalloc.h>
 #include <asm/sections.h>
 #include <asm/mca.h>
 
index da810ca..dbe829f 100644 (file)
@@ -24,7 +24,6 @@
 #include <linux/efi.h>
 #include <linux/nodemask.h>
 #include <linux/slab.h>
-#include <asm/pgalloc.h>
 #include <asm/tlb.h>
 #include <asm/meminit.h>
 #include <asm/numa.h>
@@ -601,7 +600,6 @@ void __init paging_init(void)
 
        max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 
-       sparse_memory_present_with_active_regions(MAX_NUMNODES);
        sparse_init();
 
 #ifdef CONFIG_VIRTUAL_MEM_MAP
@@ -656,7 +654,7 @@ void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat)
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
                struct vmem_altmap *altmap)
 {
-       return vmemmap_populate_basepages(start, end, node);
+       return vmemmap_populate_basepages(start, end, node, NULL);
 }
 
 void vmemmap_free(unsigned long start, unsigned long end,
index 32352a7..b331f94 100644 (file)
@@ -18,7 +18,6 @@
 #include <linux/sysctl.h>
 #include <linux/log2.h>
 #include <asm/mman.h>
-#include <asm/pgalloc.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
index 71c1991..135b513 100644 (file)
@@ -27,7 +27,6 @@
 
 #include <asm/delay.h>
 #include <asm/mmu_context.h>
-#include <asm/pgalloc.h>
 #include <asm/pal.h>
 #include <asm/tlbflush.h>
 #include <asm/dma.h>
index cac9f28..993fd7e 100644 (file)
@@ -222,7 +222,7 @@ static inline void activate_mm(struct mm_struct *prev_mm,
 
 #include <asm/setup.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
 
 static inline int init_new_context(struct task_struct *tsk,
                                   struct mm_struct *mm)
index 11b95da..000f648 100644 (file)
@@ -13,7 +13,7 @@
 
 #include <asm/tlb.h>
 
-#include <asm-generic/pgalloc.h>       /* for pte_{alloc,free}_one */
+#include <asm-generic/pgalloc.h>
 
 extern const char bad_pmd_string[];
 
@@ -40,11 +40,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t page
  */
 #define pmd_free(mm, x)                        do { } while (0)
 
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-        free_page((unsigned long) pgd);
-}
-
 static inline pgd_t * pgd_alloc(struct mm_struct *mm)
 {
      pgd_t *new_pgd;
index 871a0e1..b1ca352 100644 (file)
@@ -15,7 +15,7 @@
 #include <linux/vmalloc.h>
 #include <linux/export.h>
 
-#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
 
 #if defined(CONFIG_MMU) && !defined(CONFIG_COLDFIRE)
 void arch_dma_prep_coherent(struct page *page, size_t size)
index df6fc78..546e819 100644 (file)
 #include <asm/fpu.h>
 #include <linux/uaccess.h>
 #include <asm/traps.h>
-#include <asm/pgalloc.h>
 #include <asm/machdep.h>
 #include <asm/siginfo.h>
-
+#include <asm/tlbflush.h>
 
 static const char *vec_names[] = {
        [VEC_RESETSP]   = "RESET SP",
index 5ecb331..b486c08 100644 (file)
@@ -8,7 +8,7 @@
  */
 
 #include <linux/module.h>
-#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
 #include <asm/traps.h>
 
 
index a94a814..508abb6 100644 (file)
@@ -15,7 +15,6 @@
 
 #include <asm/setup.h>
 #include <asm/traps.h>
-#include <asm/pgalloc.h>
 
 extern void die_if_kernel(char *, struct pt_regs *, long);
 
index 14d31d2..1269d51 100644 (file)
@@ -19,8 +19,8 @@
 #include <asm/setup.h>
 #include <asm/segment.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 #include <asm/io.h>
+#include <asm/tlbflush.h>
 
 #undef DEBUG
 
index a3e7e4e..2b9cb4a 100644 (file)
@@ -20,6 +20,7 @@
 #include <asm/mmu_context.h>
 #include <asm/mcf_pgalloc.h>
 #include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
 
 #define KMAPAREA(x)    ((x >= VMALLOC_START) && (x < KMAP_END))
 
index 65e0c40..fe75aec 100644 (file)
@@ -17,7 +17,6 @@
 #include <asm/setup.h>
 #include <asm/segment.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 #include <asm/traps.h>
 #include <asm/machdep.h>
 
index fef52d2..08bb921 100644 (file)
@@ -22,7 +22,7 @@
 #include <asm/dvma.h>
 #include <asm/io.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
 
 /* IOMMU support */
 
index ebb6b79..8839ce0 100644 (file)
@@ -28,12 +28,6 @@ static inline pgd_t *get_pgd(void)
        return (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0);
 }
 
-static inline void free_pgd(pgd_t *pgd)
-{
-       free_page((unsigned long)pgd);
-}
-
-#define pgd_free(mm, pgd)      free_pgd(pgd)
 #define pgd_alloc(mm)          get_pgd()
 
 #define pmd_pgtable(pmd)       pmd_page(pmd)
index 6f8f5c7..1200e2b 100644 (file)
@@ -15,7 +15,6 @@
 #include <asm/processor.h>     /* For TASK_SIZE */
 #include <asm/mmu.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 
 extern void _tlbie(unsigned long address);
 extern void _tlbia(void);
index 6cabeab..a9e46e5 100644 (file)
@@ -18,7 +18,6 @@
 #include <linux/tick.h>
 #include <linux/bitops.h>
 #include <linux/ptrace.h>
-#include <asm/pgalloc.h>
 #include <linux/uaccess.h> /* for USER_DS macros */
 #include <asm/cacheflush.h>
 
index bdd6d0c..65bf5fd 100644 (file)
@@ -35,7 +35,6 @@
 #include <asm/entry.h>
 #include <asm/ucontext.h>
 #include <linux/uaccess.h>
-#include <asm/pgalloc.h>
 #include <linux/syscalls.h>
 #include <asm/cacheflush.h>
 #include <asm/syscalls.h>
index 521b59b..0880a00 100644 (file)
@@ -172,9 +172,6 @@ void __init setup_memory(void)
                                  &memblock.memory, 0);
        }
 
-       /* XXX need to clip this if using highmem? */
-       sparse_memory_present_with_active_regions(0);
-
        paging_init();
 }
 
index fa77cb7..8b18424 100644 (file)
@@ -13,7 +13,9 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 
-#include <asm-generic/pgalloc.h>       /* for pte_{alloc,free}_one */
+#define __HAVE_ARCH_PMD_ALLOC_ONE
+#define __HAVE_ARCH_PUD_ALLOC_ONE
+#include <asm-generic/pgalloc.h>
 
 static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
        pte_t *pte)
@@ -47,11 +49,6 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 extern void pgd_init(unsigned long page);
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-       free_pages((unsigned long)pgd, PGD_ORDER);
-}
-
 #define __pte_free_tlb(tlb,pte,address)                        \
 do {                                                   \
        pgtable_pte_page_dtor(pte);                     \
@@ -70,11 +67,6 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
        return pmd;
 }
 
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
-       free_pages((unsigned long)pmd, PMD_ORDER);
-}
-
 #define __pmd_free_tlb(tlb, x, addr)   pmd_free((tlb)->mm, x)
 
 #endif
@@ -91,11 +83,6 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
        return pud;
 }
 
-static inline void pud_free(struct mm_struct *mm, pud_t *pud)
-{
-       free_pages((unsigned long)pud, PUD_ORDER);
-}
-
 static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
 {
        set_p4d(p4d, __p4d((unsigned long)pud));
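
The pgalloc.h hunks in this series all follow one pattern: the open-coded pgd_free()/pmd_alloc_one()/pmd_free()/pud_alloc_one()/pud_free() helpers are deleted and <asm-generic/pgalloc.h> supplies the fallbacks, with an architecture keeping its own variant only by defining the matching __HAVE_ARCH_* macro before the include, as MIPS does here for pmd_alloc_one() and pud_alloc_one(). An approximate, condensed sketch of the opt-out pattern in the generic header (not a verbatim copy):

    #ifndef __HAVE_ARCH_PMD_ALLOC_ONE
    static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
    {
            gfp_t gfp = GFP_PGTABLE_USER;
            struct page *page;

            if (mm == &init_mm)
                    gfp = GFP_PGTABLE_KERNEL;   /* no __GFP_ACCOUNT for kernel tables */
            page = alloc_pages(gfp, 0);
            if (!page)
                    return NULL;
            if (!pgtable_pmd_page_ctor(page)) {
                    __free_pages(page, 0);
                    return NULL;
            }
            return (pmd_t *)page_address(page);
    }
    #endif

    #ifndef __HAVE_ARCH_PMD_FREE
    static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
    {
            pgtable_pmd_page_dtor(virt_to_page(pmd));
            free_page((unsigned long)pmd);
    }
    #endif
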
index 588b212..bf5f5ac 100644 (file)
@@ -371,14 +371,6 @@ static void __init bootmem_init(void)
 #endif
        }
 
-
-       /*
-        * In any case the added to the memblock memory regions
-        * (highmem/lowmem, available/reserved, etc) are considered
-        * as present, so inform sparsemem about them.
-        */
-       memblocks_present();
-
        /*
         * Reserve initrd memory if needed.
         */
index 901f5be..ea8bb1b 100644 (file)
@@ -220,7 +220,6 @@ static __init void prom_meminit(void)
                        cpumask_clear(&__node_cpumask[node]);
                }
        }
-       memblocks_present();
        max_low_pfn = PHYS_PFN(memblock_end_of_DRAM());
 
        for (cpu = 0; cpu < loongson_sysconf.nr_cpus; cpu++) {
index 1213215..d411e0a 100644 (file)
@@ -402,8 +402,6 @@ void __init prom_meminit(void)
                }
                __node_data[node] = &null_node;
        }
-
-       memblocks_present();
 }
 
 void __init prom_free_prom_memory(void)
index be1b2cf..62b956c 100644 (file)
@@ -14,7 +14,6 @@
 #include <asm/ip32/crime.h>
 #include <asm/bootinfo.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 
 extern void crime_init(void);
 
index 8503bee..55bec50 100644 (file)
@@ -2,6 +2,8 @@
 // Copyright (C) 2005-2017 Andes Technology Corporation
 
 #include <linux/init_task.h>
+
+#define __HAVE_ARCH_PGD_FREE
 #include <asm/pgalloc.h>
 
 #define FIRST_KERNEL_PGD_NR    (USER_PTRS_PER_PGD)
index 0b146d7..e6600d2 100644 (file)
@@ -12,7 +12,7 @@
 
 #include <linux/mm.h>
 
-#include <asm-generic/pgalloc.h>       /* for pte_{alloc,free}_one */
+#include <asm-generic/pgalloc.h>
 
 static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
        pte_t *pte)
@@ -34,11 +34,6 @@ extern void pmd_init(unsigned long page, unsigned long pagetable);
 
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-       free_pages((unsigned long)pgd, PGD_ORDER);
-}
-
 #define __pte_free_tlb(tlb, pte, addr)                         \
        do {                                                    \
                pgtable_pte_page_dtor(pte);                     \
index da12a4c..8882029 100644 (file)
@@ -20,6 +20,9 @@
 #include <linux/mm.h>
 #include <linux/memblock.h>
 
+#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
+#include <asm-generic/pgalloc.h>
+
 extern int mem_init_done;
 
 #define pmd_populate_kernel(mm, pmd, pte) \
@@ -61,38 +64,8 @@ extern inline pgd_t *pgd_alloc(struct mm_struct *mm)
 }
 #endif
 
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-       free_page((unsigned long)pgd);
-}
-
 extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);
 
-static inline struct page *pte_alloc_one(struct mm_struct *mm)
-{
-       struct page *pte;
-       pte = alloc_pages(GFP_KERNEL, 0);
-       if (!pte)
-               return NULL;
-       clear_page(page_address(pte));
-       if (!pgtable_pte_page_ctor(pte)) {
-               __free_page(pte);
-               return NULL;
-       }
-       return pte;
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-       free_page((unsigned long)pte);
-}
-
-static inline void pte_free(struct mm_struct *mm, struct page *pte)
-{
-       pgtable_pte_page_dtor(pte);
-       __free_page(pte);
-}
-
 #define __pte_free_tlb(tlb, pte, addr) \
 do {                                   \
        pgtable_pte_page_dtor(pte);     \
index 4a4639c..185dcd3 100644 (file)
@@ -17,7 +17,6 @@
 
 #include <linux/mm.h>
 #include <asm/processor.h>
-#include <asm/pgalloc.h>
 #include <asm/current.h>
 #include <linux/sched.h>
 
index 277ac7a..212e5f8 100644 (file)
@@ -26,7 +26,6 @@
 #include <asm/io.h>
 #include <asm/hardirq.h>
 #include <asm/delay.h>
-#include <asm/pgalloc.h>
 
 #define DECLARE_EXPORT(name) extern void name(void); EXPORT_SYMBOL(name)
 
index 07b89c7..cb5f2f7 100644 (file)
@@ -5,7 +5,6 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/atomic.h>
-#include <asm/pgalloc.h>
 #include <asm-generic/mm_hooks.h>
 
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
index 9ac74da..cc7ecc2 100644 (file)
@@ -10,7 +10,9 @@
 
 #include <asm/cache.h>
 
-#include <asm-generic/pgalloc.h>       /* for pte_{alloc,free}_one */
+#define __HAVE_ARCH_PMD_FREE
+#define __HAVE_ARCH_PGD_FREE
+#include <asm-generic/pgalloc.h>
 
 /* Allocate the top level pgd (page directory)
  *
@@ -65,14 +67,6 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
                        (__u32)(__pa((unsigned long)pmd) >> PxD_VALUE_SHIFT)));
 }
 
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
-{
-       pmd_t *pmd = (pmd_t *)__get_free_pages(GFP_KERNEL, PMD_ORDER);
-       if (pmd)
-               memset(pmd, 0, PAGE_SIZE<<PMD_ORDER);
-       return pmd;
-}
-
 static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
        if (pmd_flag(*pmd) & PxD_FLAG_ATTACHED) {
index 1eedfec..b5e1d9f 100644 (file)
@@ -24,7 +24,6 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 #include <asm/processor.h>
 #include <asm/sections.h>
 #include <asm/shmparam.h>
index 4f1596b..38c68e1 100644 (file)
@@ -32,7 +32,6 @@
 #include <asm/dma.h>    /* for DMA_CHUNK_SIZE */
 #include <asm/io.h>
 #include <asm/page.h>  /* get_order */
-#include <asm/pgalloc.h>
 #include <linux/uaccess.h>
 #include <asm/tlbflush.h>      /* for purge_tlb_*() macros */
 
index de6299f..86ec30c 100644 (file)
@@ -47,7 +47,6 @@
 #include <asm/assembly.h>
 #include <asm/pdc.h>
 #include <asm/pdc_chassis.h>
-#include <asm/pgalloc.h>
 #include <asm/unwind.h>
 #include <asm/sections.h>
 
index 02895a8..5df5d4c 100644 (file)
@@ -30,7 +30,6 @@
 #include <asm/ucontext.h>
 #include <asm/rt_sigframe.h>
 #include <linux/uaccess.h>
-#include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
 #include <asm/asm-offsets.h>
 
index f8a842d..6271139 100644 (file)
@@ -39,7 +39,6 @@
 #include <asm/irq.h>           /* for CPU_IRQ_REGION and friends */
 #include <asm/mmu_context.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 #include <asm/processor.h>
 #include <asm/ptrace.h>
 #include <asm/unistd.h>
index 0e1e212..d7ba014 100644 (file)
@@ -15,7 +15,6 @@
 #include <linux/sysctl.h>
 
 #include <asm/mman.h>
-#include <asm/pgalloc.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
index 39ea464..4381b65 100644 (file)
@@ -689,11 +689,6 @@ void __init paging_init(void)
        flush_cache_all_local(); /* start with known state */
        flush_tlb_all_local(NULL);
 
-       /*
-        * Mark all memblocks as present for sparsemem using
-        * memory_present() and then initialize sparsemem.
-        */
-       memblocks_present();
        sparse_init();
        parisc_bootmem_free();
 }
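
Here (and in the ia64, microblaze, riscv, s390, sh, sparc and x86 hunks of this series) the explicit memblocks_present() / sparse_memory_present_with_active_regions() calls in front of sparse_init() are dropped; the implication is that sparse_init() now marks memory present itself by walking memblock. A rough sketch of that behaviour, assuming the walk is folded into sparse_init() (simplified, not the literal mm/sparse.c code):

    /* Sketch: what callers previously had to do by hand before sparse_init(). */
    static void __init memblocks_present(void)
    {
            unsigned long start, end;
            int i, nid;

            for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
                    memory_present(nid, start, end);
    }

    void __init sparse_init(void)
    {
            memblocks_present();            /* assumed: now done internally */
            /* ... existing section and memmap setup ... */
    }
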
index 6e7c005..345ff0b 100644 (file)
@@ -11,7 +11,7 @@
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/io.h>
-#include <asm/pgalloc.h>
+#include <linux/mm.h>
 
 /*
  * Generic mapping function (not visible outside):
index 862985c..fbc6f30 100644 (file)
@@ -12,7 +12,6 @@
 #ifndef __powerpc64__
 #include <linux/pgtable.h>
 #endif
-#include <asm/pgalloc.h>
 #ifndef __powerpc64__
 #include <asm/page.h>
 #include <asm/mmu.h>
index 25acb9c..964467b 100644 (file)
@@ -10,7 +10,6 @@
 
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
-#include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
 #include <asm/machdep.h>
 
index 2a99167..fd9c7f9 100644 (file)
@@ -9,7 +9,6 @@
 #include <linux/mm_types.h>
 #include <linux/mm.h>
 
-#include <asm/pgalloc.h>
 #include <asm/sections.h>
 #include <asm/mmu.h>
 #include <asm/tlb.h>
index 0fbf3dc..eb0bcca 100644 (file)
@@ -21,7 +21,6 @@
 #include <linux/mm.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
-#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/bug.h>
index c812b40..cb91071 100644 (file)
@@ -2,7 +2,6 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/security.h>
-#include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
 #include <asm/machdep.h>
 #include <asm/mman.h>
index bf1717f..02c7db4 100644 (file)
@@ -29,7 +29,6 @@
 #include <linux/slab.h>
 #include <linux/hugetlb.h>
 
-#include <asm/pgalloc.h>
 #include <asm/prom.h>
 #include <asm/io.h>
 #include <asm/mmu.h>
index 4ae5fc0..02e127f 100644 (file)
@@ -225,12 +225,12 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
                 * fall back to system memory if the altmap allocation fails.
                 */
                if (altmap && !altmap_cross_boundary(altmap, start, page_size)) {
-                       p = altmap_alloc_block_buf(page_size, altmap);
+                       p = vmemmap_alloc_block_buf(page_size, node, altmap);
                        if (!p)
                                pr_debug("altmap block allocation failed, falling back to system memory");
                }
                if (!p)
-                       p = vmemmap_alloc_block_buf(page_size, node);
+                       p = vmemmap_alloc_block_buf(page_size, node, NULL);
                if (!p)
                        return -ENOMEM;
 
index 569d98a..2784224 100644 (file)
@@ -5,7 +5,6 @@
 #include <linux/kasan.h>
 #include <linux/memblock.h>
 #include <linux/hugetlb.h>
-#include <asm/pgalloc.h>
 
 static int __init
 kasan_init_shadow_8M(unsigned long k_start, unsigned long k_end, void *block)
index a32b464..202bd26 100644 (file)
@@ -4,7 +4,6 @@
 
 #include <linux/kasan.h>
 #include <linux/memblock.h>
-#include <asm/pgalloc.h>
 #include <mm/mmu_decl.h>
 
 int __init kasan_init_region(void *start, size_t size)
index 9dafc63..42e2587 100644 (file)
@@ -34,7 +34,6 @@
 #include <linux/dma-direct.h>
 #include <linux/kprobes.h>
 
-#include <asm/pgalloc.h>
 #include <asm/prom.h>
 #include <asm/io.h>
 #include <asm/mmu_context.h>
@@ -179,8 +178,6 @@ void __init mem_topology_setup(void)
 
 void __init initmem_init(void)
 {
-       /* XXX need to clip this if using highmem? */
-       sparse_memory_present_with_active_regions(0);
        sparse_init();
 }
 
index 13e74bc..95751c3 100644 (file)
@@ -32,7 +32,6 @@
 #include <linux/highmem.h>
 #include <linux/memblock.h>
 
-#include <asm/pgalloc.h>
 #include <asm/prom.h>
 #include <asm/io.h>
 #include <asm/mmu_context.h>
index 92e8929..d2b3714 100644 (file)
@@ -13,7 +13,6 @@
 #include <asm/fixmap.h>
 #include <asm/code-patching.h>
 #include <asm/inst.h>
-#include <asm/pgalloc.h>
 
 #include <mm/mmu_decl.h>
 
index c06dfbb..0c29482 100644 (file)
@@ -37,7 +37,6 @@
 #include <linux/highmem.h>
 #include <linux/memblock.h>
 
-#include <asm/pgalloc.h>
 #include <asm/prom.h>
 #include <asm/io.h>
 #include <asm/mmu_context.h>
index bce0e53..4c74e8a 100644 (file)
@@ -15,7 +15,6 @@
 #include <linux/libfdt.h>
 #include <linux/crash_core.h>
 #include <asm/cacheflush.h>
-#include <asm/pgalloc.h>
 #include <asm/prom.h>
 #include <asm/kdump.h>
 #include <mm/mmu_decl.h>
index 696f568..1451458 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/of_fdt.h>
 #include <linux/hugetlb.h>
 
+#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/code-patching.h>
index 058fee9..1f61fa2 100644 (file)
@@ -953,7 +953,6 @@ void __init initmem_init(void)
 
                get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
                setup_node_data(nid, start_pfn, end_pfn);
-               sparse_memory_present_with_active_regions(nid);
        }
 
        sparse_init();
index 1136257..9c0547d 100644 (file)
@@ -23,7 +23,6 @@
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
 #include <linux/hugetlb.h>
-#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/hugetlb.h>
index bb43a8c..cc6e2f9 100644 (file)
@@ -31,7 +31,6 @@
 #include <linux/slab.h>
 #include <linux/hugetlb.h>
 
-#include <asm/pgalloc.h>
 #include <asm/page.h>
 #include <asm/prom.h>
 #include <asm/mmu_context.h>
index 5b8bd34..ad6df9a 100644 (file)
 #include <linux/seq_file.h>
 #include <linux/const.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 #include <asm/plpar_wrappers.h>
 #include <linux/memblock.h>
 #include <asm/firmware.h>
+#include <asm/pgalloc.h>
 
 struct pg_state {
        struct seq_file *seq;
index c911cd7..aca354f 100644 (file)
@@ -21,7 +21,6 @@
 #include <asm/fixmap.h>
 #include <linux/const.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 #include <asm/hugetlb.h>
 
 #include <mm/mmu_decl.h>
index 9dba7e8..45a3a30 100644 (file)
@@ -26,7 +26,6 @@
 #include <asm/firmware.h>
 #include <asm/hvcall.h>
 #include <asm/mmu.h>
-#include <asm/pgalloc.h>
 #include <linux/uaccess.h>
 #include <linux/memory.h>
 #include <asm/plpar_wrappers.h>
index 3f601ee..23b1544 100644 (file)
@@ -11,7 +11,7 @@
 #include <asm/tlb.h>
 
 #ifdef CONFIG_MMU
-#include <asm-generic/pgalloc.h>       /* for pte_{alloc,free}_one */
+#include <asm-generic/pgalloc.h>
 
 static inline void pmd_populate_kernel(struct mm_struct *mm,
        pmd_t *pmd, pte_t *pte)
@@ -55,24 +55,8 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
        return pgd;
 }
 
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-       free_page((unsigned long)pgd);
-}
-
 #ifndef __PAGETABLE_PMD_FOLDED
 
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
-       return (pmd_t *)__get_free_page(
-               GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_ZERO);
-}
-
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
-       free_page((unsigned long)pmd);
-}
-
 #define __pmd_free_tlb(tlb, pmd, addr)  pmd_free((tlb)->mm, pmd)
 
 #endif /* __PAGETABLE_PMD_FOLDED */
index ae7b7fe..5873835 100644 (file)
@@ -14,7 +14,6 @@
 #include <linux/signal.h>
 #include <linux/uaccess.h>
 
-#include <asm/pgalloc.h>
 #include <asm/ptrace.h>
 #include <asm/tlbflush.h>
 
index 50bcd9f..787c75f 100644 (file)
@@ -570,7 +570,6 @@ static void __init resource_init(void)
 void __init paging_init(void)
 {
        setup_vm_final();
-       memblocks_present();
        sparse_init();
        setup_zero_page();
        zone_sizes_init();
@@ -581,6 +580,6 @@ void __init paging_init(void)
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
                               struct vmem_altmap *altmap)
 {
-       return vmemmap_populate_basepages(start, end, node);
+       return vmemmap_populate_basepages(start, end, node, NULL);
 }
 #endif
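
As on riscv here, every vmemmap_populate() implementation now forwards (or explicitly NULLs) an altmap when calling into the common code, because the shared helper's prototype gained the extra argument. Assumed prototype, as implied by the call sites in this series:

    int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
                                             int node, struct vmem_altmap *altmap);
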
index e1ae239..5057773 100644 (file)
@@ -249,7 +249,7 @@ static void prng_tdes_deinstantiate(void)
 {
        pr_debug("The prng module stopped "
                 "after running in triple DES mode\n");
-       kzfree(prng_data);
+       kfree_sensitive(prng_data);
 }
 
 
@@ -442,7 +442,7 @@ outfree:
 static void prng_sha512_deinstantiate(void)
 {
        pr_debug("The prng module stopped after running in SHA-512 mode\n");
-       kzfree(prng_data);
+       kfree_sensitive(prng_data);
 }
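
From this point on, the crypto and driver hunks are a mechanical rename of kzfree() to kfree_sensitive(). The new name makes it clear that the helper wipes the buffer before handing it back to the allocator (it is not merely the free side of kzalloc()). Its behaviour is approximately the following (paraphrased, not quoted from mm/slab_common.c):

    void kfree_sensitive(const void *p)
    {
            size_t ks;
            void *mem = (void *)p;

            ks = ksize(mem);                        /* full usable size of the allocation */
            if (ks)
                    memzero_explicit(mem, ks);      /* wipe before the memory is reused */
            kfree(mem);
    }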
 
 
index aa406c0..954fa8c 100644 (file)
@@ -36,7 +36,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
 #define p4d_free_tlb p4d_free_tlb
 #define pud_free_tlb pud_free_tlb
 
-#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm-generic/tlb.h>
 
index 2204704..acce6a0 100644 (file)
@@ -5,7 +5,6 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <asm/processor.h>
-#include <asm/pgalloc.h>
 
 /*
  * Flush all TLB entries on the local CPU.
index 93c6b89..d91989c 100644 (file)
@@ -16,7 +16,6 @@
 #include <linux/debug_locks.h>
 #include <asm/cio.h>
 #include <asm/setup.h>
-#include <asm/pgalloc.h>
 #include <asm/smp.h>
 #include <asm/ipl.h>
 #include <asm/diag.h>
index a09b9e9..11d2f7d 100644 (file)
@@ -25,7 +25,6 @@
 #include <linux/compat.h>
 #include <trace/syscall.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
 #include <asm/switch_to.h>
index 563429d..5b8ec1c 100644 (file)
@@ -10,7 +10,6 @@
 
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
-#include <asm/pgalloc.h>
 #include <asm/gmap.h>
 #include <asm/virtio-ccw.h>
 #include "kvm-s390.h"
index 96ae368..2f721a9 100644 (file)
@@ -22,7 +22,6 @@
 #include <asm/ebcdic.h>
 #include <asm/sysinfo.h>
 #include <asm/page-states.h>
-#include <asm/pgalloc.h>
 #include <asm/gmap.h>
 #include <asm/io.h>
 #include <asm/ptrace.h>
index 63e3301..eb99e2f 100644 (file)
@@ -9,7 +9,6 @@
 #include <linux/kvm_host.h>
 #include <linux/pagemap.h>
 #include <linux/sched/signal.h>
-#include <asm/pgalloc.h>
 #include <asm/gmap.h>
 #include <asm/uv.h>
 #include <asm/mman.h>
index 5c15ae3..1141c8d 100644 (file)
@@ -21,7 +21,6 @@
 #include <linux/oom.h>
 #include <linux/uaccess.h>
 
-#include <asm/pgalloc.h>
 #include <asm/diag.h>
 
 #ifdef CONFIG_CMM_IUCV
index 6dc7c3b..0d28208 100644 (file)
@@ -115,7 +115,6 @@ void __init paging_init(void)
        __load_psw_mask(psw.mask);
        kasan_free_early_identity();
 
-       sparse_memory_present_with_active_regions(MAX_NUMNODES);
        sparse_init();
        zone_dma_bits = 31;
        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
index 1b78f63..e54f928 100644 (file)
@@ -17,7 +17,6 @@
 #include <linux/random.h>
 #include <linux/compat.h>
 #include <linux/security.h>
-#include <asm/pgalloc.h>
 #include <asm/elf.h>
 
 static unsigned long stack_maxrandom_size(void)
index 2e0cc19..0d25f74 100644 (file)
@@ -19,7 +19,6 @@
 #include <linux/ksm.h>
 #include <linux/mman.h>
 
-#include <asm/pgalloc.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
index d770da3..0e6b0be 100644 (file)
@@ -3,6 +3,10 @@
 #define __ASM_SH_PGALLOC_H
 
 #include <asm/page.h>
+
+#define __HAVE_ARCH_PMD_ALLOC_ONE
+#define __HAVE_ARCH_PMD_FREE
+#define __HAVE_ARCH_PGD_FREE
 #include <asm-generic/pgalloc.h>
 
 extern pgd_t *pgd_alloc(struct mm_struct *);
index c20fc54..0dc0f52 100644 (file)
@@ -14,7 +14,6 @@
 #include <linux/irqflags.h>
 #include <linux/smp.h>
 #include <linux/atomic.h>
-#include <asm/pgalloc.h>
 #include <asm/smp.h>
 #include <asm/bl_bit.h>
 
index 4a98980..223c14f 100644 (file)
@@ -14,7 +14,6 @@
 #include <linux/ftrace.h>
 #include <linux/suspend.h>
 #include <linux/memblock.h>
-#include <asm/pgalloc.h>
 #include <asm/mmu_context.h>
 #include <asm/io.h>
 #include <asm/cacheflush.h>
index 26f3bd4..bc59598 100644 (file)
@@ -16,7 +16,6 @@
 #include <asm/cache.h>
 #include <asm/io.h>
 #include <linux/uaccess.h>
-#include <asm/pgalloc.h>
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
 
index 4897829..4c67b3d 100644 (file)
@@ -20,7 +20,6 @@
 #include <asm/cache.h>
 #include <asm/io.h>
 #include <linux/uaccess.h>
-#include <asm/pgalloc.h>
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
 
index acd5652..220d7bc 100644 (file)
@@ -17,7 +17,6 @@
 #include <linux/sysctl.h>
 
 #include <asm/mman.h>
-#include <asm/pgalloc.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
index a70ba0f..613de80 100644 (file)
@@ -27,6 +27,7 @@
 #include <asm/sections.h>
 #include <asm/setup.h>
 #include <asm/cache.h>
+#include <asm/pgalloc.h>
 #include <linux/sizes.h>
 
 pgd_t swapper_pg_dir[PTRS_PER_PGD];
@@ -240,12 +241,6 @@ static void __init do_init_bootmem(void)
 
        plat_mem_setup();
 
-       for_each_memblock(memory, reg) {
-               int nid = memblock_get_region_node(reg);
-
-               memory_present(nid, memblock_region_memory_base_pfn(reg),
-                       memblock_region_memory_end_pfn(reg));
-       }
        sparse_init();
 }
 
index 07e744d..aab3f82 100644 (file)
@@ -18,7 +18,6 @@
 #include <linux/proc_fs.h>
 #include <asm/fixmap.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 #include <asm/addrspace.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
index f7e4439..50f0dc1 100644 (file)
@@ -53,7 +53,4 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end)
 
        /* It's up */
        node_set_online(nid);
-
-       /* Kick sparsemem */
-       sparse_memory_present_with_active_regions(nid);
 }
index 8692435..fb400af 100644 (file)
@@ -21,7 +21,6 @@
 
 #include <asm/io.h>
 #include <linux/uaccess.h>
-#include <asm/pgalloc.h>
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
 
index 499aa2e..904cc6c 100644 (file)
@@ -13,7 +13,6 @@
 
 #include <asm/io.h>
 #ifdef CONFIG_SPARC64
-#include <asm/pgalloc.h>
 #include <asm/spitfire.h>
 #include <asm/cacheflush.h>
 #include <asm/page.h>
index 6820d35..e841cae 100644 (file)
@@ -4,7 +4,6 @@
 
 #include <linux/swap.h>
 #include <linux/pagemap.h>
-#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
 
index 41829c0..1eed26d 100644 (file)
@@ -38,7 +38,6 @@
 #include <asm/delay.h>
 #include <asm/irq.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 #include <asm/oplib.h>
 #include <asm/cpudata.h>
 #include <asm/asi.h>
index bd123f1..3f519e1 100644 (file)
@@ -34,7 +34,6 @@
 #include <asm/oplib.h>
 #include <linux/uaccess.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 #include <asm/delay.h>
 #include <asm/processor.h>
 #include <asm/psr.h>
index 3b005b6..f1f8c8e 100644 (file)
@@ -23,7 +23,6 @@
 
 #include <linux/uaccess.h>
 #include <asm/ptrace.h>
-#include <asm/pgalloc.h>
 #include <asm/cacheflush.h>    /* flush_sig_insns */
 #include <asm/switch_to.h>
 
index 76ce290..50c127a 100644 (file)
@@ -29,7 +29,6 @@
 
 #include <asm/irq.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 #include <asm/oplib.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
index 0085e28..e286e2b 100644 (file)
@@ -47,6 +47,7 @@
 #include <linux/uaccess.h>
 #include <asm/starfire.h>
 #include <asm/tlb.h>
+#include <asm/pgalloc.h>
 #include <asm/sections.h>
 #include <asm/prom.h>
 #include <asm/mdesc.h>
index 91b61f0..1079638 100644 (file)
@@ -16,7 +16,6 @@
 
 #include <asm/timer.h>
 #include <asm/traps.h>
-#include <asm/pgalloc.h>
 #include <asm/irq.h>
 #include <asm/io.h>
 #include <asm/cacheflush.h>
index d1fc9a7..8f2a2af 100644 (file)
@@ -29,7 +29,6 @@
 
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
 #include <asm/vaddrs.h>
 
 static pte_t *kmap_pte;
index 02e6e5e..fad6d31 100644 (file)
@@ -1610,7 +1610,6 @@ static unsigned long __init bootmem_init(unsigned long phys_base)
 
        /* XXX cpu notifier XXX */
 
-       sparse_memory_present_with_active_regions(MAX_NUMNODES);
        sparse_init();
 
        return end_pfn;
index bfcc04b..430a47a 100644 (file)
@@ -15,7 +15,6 @@
 #include <linux/of.h>
 #include <linux/of_device.h>
 
-#include <asm/pgalloc.h>
 #include <asm/io.h>
 #include <asm/io-unit.h>
 #include <asm/mxcc.h>
index 35b002e..3a388b1 100644 (file)
@@ -16,7 +16,6 @@
 #include <linux/of.h>
 #include <linux/of_device.h>
 
-#include <asm/pgalloc.h>
 #include <asm/io.h>
 #include <asm/mxcc.h>
 #include <asm/mbus.h>
index a32a16c..20ee147 100644 (file)
@@ -10,7 +10,6 @@
 #include <linux/swap.h>
 #include <linux/preempt.h>
 
-#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
index 881e76d..5393e13 100644 (file)
@@ -10,7 +10,7 @@
 
 #include <linux/mm.h>
 
-#include <asm-generic/pgalloc.h>       /* for pte_{alloc,free}_one */
+#include <asm-generic/pgalloc.h>
 
 #define pmd_populate_kernel(mm, pmd, pte) \
        set_pmd(pmd, __pmd(_PAGE_TABLE + (unsigned long) __pa(pte)))
@@ -25,7 +25,6 @@
  * Allocate and free page tables.
  */
 extern pgd_t *pgd_alloc(struct mm_struct *);
-extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
 
 #define __pte_free_tlb(tlb,pte, address)               \
 do {                                                   \
@@ -34,12 +33,6 @@ do {                                                 \
 } while (0)
 
 #ifdef CONFIG_3_LEVEL_PGTABLES
-
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
-       free_page((unsigned long)pmd);
-}
-
 #define __pmd_free_tlb(tlb,x, address)   tlb_remove_page((tlb),virt_to_page(x))
 #endif
 
index 36f4529..7e6a418 100644 (file)
@@ -78,9 +78,6 @@ static inline void pgd_mkuptodate(pgd_t pgd) { pgd_val(pgd) &= ~_PAGE_NEWPAGE; }
 #define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval))
 #endif
 
-struct mm_struct;
-extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address);
-
 static inline void pud_clear (pud_t *pud)
 {
        set_pud(pud, __pud(_PAGE_NEWPAGE));
index c2ff76c..9242dc9 100644 (file)
@@ -196,23 +196,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
        return pgd;
 }
 
-void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-       free_page((unsigned long) pgd);
-}
-
-#ifdef CONFIG_3_LEVEL_PGTABLES
-pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
-{
-       pmd_t *pmd = (pmd_t *) __get_free_page(GFP_KERNEL);
-
-       if (pmd)
-               memset(pmd, 0, PAGE_SIZE);
-
-       return pmd;
-}
-#endif
-
 void *uml_kmalloc(int size, int flags)
 {
        return kmalloc(size, flags);
index 385d3d1..ca8a657 100644 (file)
@@ -30,7 +30,6 @@
 #include <linux/sched/task_stack.h>
 
 #include <linux/uaccess.h>
-#include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
 #include <asm/user32.h>
 #include <asm/ia32.h>
index 4756214..d98016b 100644 (file)
@@ -9,7 +9,6 @@
 
 #include <trace/events/tlb.h>
 
-#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/paravirt.h>
 #include <asm/debugreg.h>
index 29aa785..62ad61d 100644 (file)
@@ -7,7 +7,8 @@
 #include <linux/pagemap.h>
 
 #define __HAVE_ARCH_PTE_ALLOC_ONE
-#include <asm-generic/pgalloc.h>       /* for pte_{alloc,free}_one */
+#define __HAVE_ARCH_PGD_FREE
+#include <asm-generic/pgalloc.h>
 
 static inline int  __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
 
@@ -86,30 +87,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 #define pmd_pgtable(pmd) pmd_page(pmd)
 
 #if CONFIG_PGTABLE_LEVELS > 2
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
-       struct page *page;
-       gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
-
-       if (mm == &init_mm)
-               gfp &= ~__GFP_ACCOUNT;
-       page = alloc_pages(gfp, 0);
-       if (!page)
-               return NULL;
-       if (!pgtable_pmd_page_ctor(page)) {
-               __free_pages(page, 0);
-               return NULL;
-       }
-       return (pmd_t *)page_address(page);
-}
-
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
-       BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
-       pgtable_pmd_page_dtor(virt_to_page(pmd));
-       free_page((unsigned long)pmd);
-}
-
 extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
 
 static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
@@ -147,21 +124,6 @@ static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, pud_t *pu
        set_p4d_safe(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
 }
 
-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
-       gfp_t gfp = GFP_KERNEL_ACCOUNT;
-
-       if (mm == &init_mm)
-               gfp &= ~__GFP_ACCOUNT;
-       return (pud_t *)get_zeroed_page(gfp);
-}
-
-static inline void pud_free(struct mm_struct *mm, pud_t *pud)
-{
-       BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
-       free_page((unsigned long)pud);
-}
-
 extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
 
 static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
index c826cdd..d117553 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/stringify.h>
+#include <linux/highmem.h>
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/memory.h>
index e0e2f02..ccf726c 100644 (file)
@@ -40,7 +40,6 @@
 #include <asm/irq_remapping.h>
 #include <asm/perf_event.h>
 #include <asm/x86_init.h>
-#include <asm/pgalloc.h>
 #include <linux/atomic.h>
 #include <asm/mpspec.h>
 #include <asm/i8259.h>
index afac7cc..c27b82b 100644 (file)
@@ -22,7 +22,6 @@
 #include <asm/irqdomain.h>
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
-#include <asm/pgalloc.h>
 #include <asm/io_apic.h>
 #include <asm/proto.h>
 #include <asm/bios_ebda.h>
index 438fc55..1f66d2d 100644 (file)
@@ -62,7 +62,6 @@
 
 #ifdef CONFIG_X86_64
 #include <asm/x86_init.h>
-#include <asm/pgalloc.h>
 #include <asm/proto.h>
 #else
 #include <asm/processor-flags.h>
index 5e5edd2..0c7643d 100644 (file)
@@ -21,7 +21,6 @@
 
 #include <asm/cpufeature.h>            /* boot_cpu_has, ...            */
 #include <asm/traps.h>                 /* dotraplinkage, ...           */
-#include <asm/pgalloc.h>               /* pgd_*(), ...                 */
 #include <asm/fixmap.h>                        /* VSYSCALL_ADDR                */
 #include <asm/vsyscall.h>              /* emulate_vsyscall             */
 #include <asm/vm86.h>                  /* struct vm86                  */
index cf57811..a0d023c 100644 (file)
@@ -17,7 +17,6 @@
 #include <asm/mman.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
 #include <asm/elf.h>
 
 #if 0  /* This is just for testing */
index 8b4afad..4cb9584 100644 (file)
@@ -678,7 +678,6 @@ void __init initmem_init(void)
 #endif
 
        memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
-       sparse_memory_present_with_active_regions(0);
 
 #ifdef CONFIG_FLATMEM
        max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? highend_pfn : max_low_pfn;
@@ -718,7 +717,6 @@ void __init paging_init(void)
         * NOTE: at this point the bootmem allocator is fully available.
         */
        olpc_dt_build_devicetree();
-       sparse_memory_present_with_active_regions(MAX_NUMNODES);
        sparse_init();
        zone_sizes_init();
 }
index 449e071..3b246ae 100644 (file)
@@ -817,7 +817,6 @@ void __init initmem_init(void)
 
 void __init paging_init(void)
 {
-       sparse_memory_present_with_active_regions(MAX_NUMNODES);
        sparse_init();
 
        /*
@@ -1510,10 +1509,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
                if (pmd_none(*pmd)) {
                        void *p;
 
-                       if (altmap)
-                               p = altmap_alloc_block_buf(PMD_SIZE, altmap);
-                       else
-                               p = vmemmap_alloc_block_buf(PMD_SIZE, node);
+                       p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
                        if (p) {
                                pte_t entry;
 
@@ -1540,7 +1536,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
                        vmemmap_verify((pte_t *)pmd, node, addr, next);
                        continue;
                }
-               if (vmemmap_populate_basepages(addr, next, node))
+               if (vmemmap_populate_basepages(addr, next, node, NULL))
                        return -ENOMEM;
        }
        return 0;
@@ -1552,7 +1548,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
        int err;
 
        if (end - start < PAGES_PER_SECTION * sizeof(struct page))
-               err = vmemmap_populate_basepages(start, end, node);
+               err = vmemmap_populate_basepages(start, end, node, NULL);
        else if (boot_cpu_has(X86_FEATURE_PSE))
                err = vmemmap_populate_hugepages(start, end, node, altmap);
        else if (altmap) {
@@ -1560,7 +1556,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
                                __func__);
                err = -ENOMEM;
        } else
-               err = vmemmap_populate_basepages(start, end, node);
+               err = vmemmap_populate_basepages(start, end, node, NULL);
        if (!err)
                sync_global_pgds(start, end - 1);
        return err;
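
The x86 hunks above (like the powerpc ones earlier) no longer open-code the choice between altmap_alloc_block_buf() and vmemmap_alloc_block_buf(); the latter now takes the struct vmem_altmap directly and picks the backing store itself, and vmemmap_populate_basepages() grew the same parameter (callers without an altmap pass NULL). An approximate sketch of the consolidated allocator (condensed, not quoted from mm/sparse-vmemmap.c):

    void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
                                             struct vmem_altmap *altmap)
    {
            void *ptr;

            if (altmap)                             /* device-provided reservation */
                    return altmap_alloc_block_buf(size, altmap);

            ptr = sparse_buffer_alloc(size);        /* early-boot buffer, if any */
            if (!ptr)
                    ptr = vmemmap_alloc_block(size, node);
            return ptr;
    }
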
index fb620fd..6e6b397 100644 (file)
@@ -26,7 +26,6 @@
 #include <linux/memblock.h>
 #include <linux/pgtable.h>
 
-#include <asm/pgalloc.h>
 #include <asm/setup.h>
 #include <asm/kaslr.h>
 
index 1953685..c234634 100644 (file)
@@ -11,7 +11,6 @@
 #include <linux/spinlock.h>
 
 #include <asm/cpu_entry_area.h>
-#include <asm/pgalloc.h>
 #include <asm/fixmap.h>
 #include <asm/e820/api.h>
 #include <asm/tlb.h>
index a8a924b..1aab929 100644 (file)
@@ -34,7 +34,6 @@
 #include <asm/vsyscall.h>
 #include <asm/cmdline.h>
 #include <asm/pti.h>
-#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/desc.h>
 #include <asm/sections.h>
index a6e5f2c..a2f447d 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/slab.h>
 #include <asm/efi.h>
 #include <linux/io.h>
+#include <asm/pgalloc.h>
 #include <asm/uv/bios.h>
 #include <asm/uv/uv_hub.h>
 
index d147f1b..cd3914f 100644 (file)
@@ -98,7 +98,7 @@ static int get_e820_md5(struct e820_table *table, void *buf)
        if (crypto_shash_digest(desc, (u8 *)table, size, buf))
                ret = -EINVAL;
 
-       kzfree(desc);
+       kfree_sensitive(desc);
 
 free_tfm:
        crypto_free_shash(tfm);
index 1d38f0e..d3a22da 100644 (file)
@@ -8,9 +8,14 @@
 #ifndef _XTENSA_PGALLOC_H
 #define _XTENSA_PGALLOC_H
 
+#ifdef CONFIG_MMU
 #include <linux/highmem.h>
 #include <linux/slab.h>
 
+#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
+#define __HAVE_ARCH_PTE_ALLOC_ONE
+#include <asm-generic/pgalloc.h>
+
 /*
  * Allocating and freeing a pmd is trivial: the 1-entry pmd is
  * inside the pgd, so has no extra memory associated with it.
@@ -28,50 +33,37 @@ pgd_alloc(struct mm_struct *mm)
        return (pgd_t*) __get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
 }
 
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+static inline void ptes_clear(pte_t *ptep)
 {
-       free_page((unsigned long)pgd);
+       int i;
+
+       for (i = 0; i < PTRS_PER_PTE; i++)
+               pte_clear(NULL, 0, ptep + i);
 }
 
 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
 {
        pte_t *ptep;
-       int i;
 
-       ptep = (pte_t *)__get_free_page(GFP_KERNEL);
+       ptep = (pte_t *)__pte_alloc_one_kernel(mm);
        if (!ptep)
                return NULL;
-       for (i = 0; i < 1024; i++)
-               pte_clear(NULL, 0, ptep + i);
+       ptes_clear(ptep);
        return ptep;
 }
 
 static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
 {
-       pte_t *pte;
        struct page *page;
 
-       pte = pte_alloc_one_kernel(mm);
-       if (!pte)
+       page = __pte_alloc_one(mm, GFP_PGTABLE_USER);
+       if (!page)
                return NULL;
-       page = virt_to_page(pte);
-       if (!pgtable_pte_page_ctor(page)) {
-               __free_page(page);
-               return NULL;
-       }
+       ptes_clear(page_address(page));
        return page;
 }
 
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-       free_page((unsigned long)pte);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
-{
-       pgtable_pte_page_dtor(pte);
-       __free_page(pte);
-}
 #define pmd_pgtable(pmd) pmd_page(pmd)
+#endif /* CONFIG_MMU */
 
 #endif /* _XTENSA_PGALLOC_H */
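
The xtensa conversion keeps thin pte_alloc_one()/pte_alloc_one_kernel() wrappers (hence the __HAVE_ARCH_PTE_ALLOC_ONE* defines) because the architecture must pre-fill every new page table with its non-zero "empty pte" pattern via ptes_clear(), while the allocation, the pgtable page constructor and the matching free/dtor paths now come from the generic helpers. The page-backed generic helper looks approximately like this (condensed from <asm-generic/pgalloc.h>, not verbatim):

    static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
    {
            struct page *pte;

            pte = alloc_page(gfp);
            if (!pte)
                    return NULL;
            if (!pgtable_pte_page_ctor(pte)) {      /* ptlock init + accounting */
                    __free_page(pte);
                    return NULL;
            }
            return pte;
    }
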
index 24cf697..415fe7f 100644 (file)
@@ -25,7 +25,6 @@
 #include <asm/dma.h>
 #include <asm/io.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 #include <asm/ftrace.h>
 #ifdef CONFIG_BLK_DEV_FD
 #include <asm/floppy.h>
index 2369433..5835406 100644 (file)
@@ -31,7 +31,6 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 
 /* 
  * Note:
index c4decc7..c128dcc 100644 (file)
@@ -20,7 +20,6 @@
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
 #include <asm/hardirq.h>
-#include <asm/pgalloc.h>
 
 DEFINE_PER_CPU(unsigned long, asid_cache) = ASID_USER_FIRST;
 void bad_page_fault(struct pt_regs*, unsigned long, int);
index 7fbdc32..ce4d572 100644 (file)
@@ -177,7 +177,7 @@ static int adiantum_setkey(struct crypto_skcipher *tfm, const u8 *key,
        keyp += NHPOLY1305_KEY_SIZE;
        WARN_ON(keyp != &data->derived_keys[ARRAY_SIZE(data->derived_keys)]);
 out:
-       kzfree(data);
+       kfree_sensitive(data);
        return err;
 }
 
index 68a0f0c..d9d65d1 100644 (file)
@@ -183,7 +183,7 @@ static int ahash_setkey_unaligned(struct crypto_ahash *tfm, const u8 *key,
        alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
        memcpy(alignbuffer, key, keylen);
        ret = tfm->setkey(tfm, alignbuffer, keylen);
-       kzfree(buffer);
+       kfree_sensitive(buffer);
        return ret;
 }
 
@@ -302,7 +302,7 @@ static void ahash_restore_req(struct ahash_request *req, int err)
        req->priv = NULL;
 
        /* Free the req->priv.priv from the ADJUSTED request. */
-       kzfree(priv);
+       kfree_sensitive(priv);
 }
 
 static void ahash_notify_einprogress(struct ahash_request *req)
index 5d8fe60..ed08cbd 100644 (file)
@@ -571,7 +571,7 @@ void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm)
                alg->cra_exit(tfm);
        crypto_exit_ops(tfm);
        crypto_mod_put(alg);
-       kzfree(mem);
+       kfree_sensitive(mem);
 }
 EXPORT_SYMBOL_GPL(crypto_destroy_tfm);
 
index cc9dbce..7553ab1 100644 (file)
@@ -376,7 +376,7 @@ static int pefile_digest_pe(const void *pebuf, unsigned int pelen,
        }
 
 error:
-       kzfree(desc);
+       kfree_sensitive(desc);
 error_no_desc:
        crypto_free_shash(tfm);
        kleave(" = %d", ret);
@@ -447,6 +447,6 @@ int verify_pefile_signature(const void *pebuf, unsigned pelen,
        ret = pefile_digest_pe(pebuf, pelen, &ctx);
 
 error:
-       kzfree(ctx.digest);
+       kfree_sensitive(ctx.digest);
        return ret;
 }
index 4c0e6c9..b2a46f6 100644 (file)
@@ -163,7 +163,7 @@ static void __deflate_exit(void *ctx)
 static void deflate_free_ctx(struct crypto_scomp *tfm, void *ctx)
 {
        __deflate_exit(ctx);
-       kzfree(ctx);
+       kfree_sensitive(ctx);
 }
 
 static void deflate_exit(struct crypto_tfm *tfm)
index 8d80d93..e99fe34 100644 (file)
@@ -1218,19 +1218,19 @@ static inline void drbg_dealloc_state(struct drbg_state *drbg)
 {
        if (!drbg)
                return;
-       kzfree(drbg->Vbuf);
+       kfree_sensitive(drbg->Vbuf);
        drbg->Vbuf = NULL;
        drbg->V = NULL;
-       kzfree(drbg->Cbuf);
+       kfree_sensitive(drbg->Cbuf);
        drbg->Cbuf = NULL;
        drbg->C = NULL;
-       kzfree(drbg->scratchpadbuf);
+       kfree_sensitive(drbg->scratchpadbuf);
        drbg->scratchpadbuf = NULL;
        drbg->reseed_ctr = 0;
        drbg->d_ops = NULL;
        drbg->core = NULL;
        if (IS_ENABLED(CONFIG_CRYPTO_FIPS)) {
-               kzfree(drbg->prev);
+               kfree_sensitive(drbg->prev);
                drbg->prev = NULL;
                drbg->fips_primed = false;
        }
@@ -1701,7 +1701,7 @@ static int drbg_fini_hash_kernel(struct drbg_state *drbg)
        struct sdesc *sdesc = (struct sdesc *)drbg->priv_data;
        if (sdesc) {
                crypto_free_shash(sdesc->shash.tfm);
-               kzfree(sdesc);
+               kfree_sensitive(sdesc);
        }
        drbg->priv_data = NULL;
        return 0;
index 8acf843..c80aa25 100644 (file)
@@ -67,7 +67,7 @@ static u64 *ecc_alloc_digits_space(unsigned int ndigits)
 
 static void ecc_free_digits_space(u64 *space)
 {
-       kzfree(space);
+       kfree_sensitive(space);
 }
 
 static struct ecc_point *ecc_alloc_point(unsigned int ndigits)
@@ -101,9 +101,9 @@ static void ecc_free_point(struct ecc_point *p)
        if (!p)
                return;
 
-       kzfree(p->x);
-       kzfree(p->y);
-       kzfree(p);
+       kfree_sensitive(p->x);
+       kfree_sensitive(p->y);
+       kfree_sensitive(p);
 }
 
 static void vli_clear(u64 *vli, unsigned int ndigits)
index bd59905..b0232d6 100644 (file)
@@ -124,7 +124,7 @@ static int ecdh_compute_value(struct kpp_request *req)
 
        /* fall through */
 free_all:
-       kzfree(shared_secret);
+       kfree_sensitive(shared_secret);
 free_pubkey:
        kfree(public_key);
        return ret;
index 3a36a95..338ee07 100644 (file)
@@ -139,7 +139,7 @@ static int crypto_gcm_setkey(struct crypto_aead *aead, const u8 *key,
                               CRYPTO_TFM_REQ_MASK);
        err = crypto_ahash_setkey(ghash, (u8 *)&data->hash, sizeof(be128));
 out:
-       kzfree(data);
+       kfree_sensitive(data);
        return err;
 }
 
index a4b1c02..a69ae3e 100644 (file)
@@ -304,8 +304,8 @@ void gf128mul_free_64k(struct gf128mul_64k *t)
        int i;
 
        for (i = 0; i < 16; i++)
-               kzfree(t->t[i]);
-       kzfree(t);
+               kfree_sensitive(t->t[i]);
+       kfree_sensitive(t);
 }
 EXPORT_SYMBOL(gf128mul_free_64k);
 
index b43684c..eb7d1dd 100644 (file)
@@ -57,7 +57,7 @@ void *jent_zalloc(unsigned int len)
 
 void jent_zfree(void *ptr)
 {
-       kzfree(ptr);
+       kfree_sensitive(ptr);
 }
 
 int jent_fips_enabled(void)
index 1490d21..a888d84 100644 (file)
@@ -53,7 +53,7 @@ int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen)
        err = crypto_rng_alg(tfm)->seed(tfm, seed, slen);
        crypto_stats_rng_seed(alg, err);
 out:
-       kzfree(buf);
+       kfree_sensitive(buf);
        return err;
 }
 EXPORT_SYMBOL_GPL(crypto_rng_reset);
index 4983b2b..ddd3d10 100644 (file)
@@ -199,7 +199,7 @@ static int pkcs1pad_encrypt_sign_complete(struct akcipher_request *req, int err)
        sg_copy_from_buffer(req->dst,
                            sg_nents_for_len(req->dst, ctx->key_size),
                            out_buf, ctx->key_size);
-       kzfree(out_buf);
+       kfree_sensitive(out_buf);
 
 out:
        req->dst_len = ctx->key_size;
@@ -322,7 +322,7 @@ static int pkcs1pad_decrypt_complete(struct akcipher_request *req, int err)
                                out_buf + pos, req->dst_len);
 
 done:
-       kzfree(req_ctx->out_buf);
+       kfree_sensitive(req_ctx->out_buf);
 
        return err;
 }
@@ -500,7 +500,7 @@ static int pkcs1pad_verify_complete(struct akcipher_request *req, int err)
                   req->dst_len) != 0)
                err = -EKEYREJECTED;
 done:
-       kzfree(req_ctx->out_buf);
+       kfree_sensitive(req_ctx->out_buf);
 
        return err;
 }
index 23e22d8..0899d52 100644 (file)
@@ -33,7 +33,7 @@ static void seqiv_aead_encrypt_complete2(struct aead_request *req, int err)
        memcpy(req->iv, subreq->iv, crypto_aead_ivsize(geniv));
 
 out:
-       kzfree(subreq->iv);
+       kfree_sensitive(subreq->iv);
 }
 
 static void seqiv_aead_encrypt_complete(struct crypto_async_request *base,
index e6a4b5f..2e3433a 100644 (file)
@@ -44,7 +44,7 @@ static int shash_setkey_unaligned(struct crypto_shash *tfm, const u8 *key,
        alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
        memcpy(alignbuffer, key, keylen);
        err = shash->setkey(tfm, alignbuffer, keylen);
-       kzfree(buffer);
+       kfree_sensitive(buffer);
        return err;
 }
 
index 467af52..b4dae64 100644 (file)
@@ -592,7 +592,7 @@ static int skcipher_setkey_unaligned(struct crypto_skcipher *tfm,
        alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
        memcpy(alignbuffer, key, keylen);
        ret = cipher->setkey(tfm, alignbuffer, keylen);
-       kzfree(buffer);
+       kfree_sensitive(buffer);
        return ret;
 }
 
index 6863f91..23c27fc 100644 (file)
@@ -1744,7 +1744,7 @@ out:
        kfree(vec.plaintext);
        kfree(vec.digest);
        crypto_free_shash(generic_tfm);
-       kzfree(generic_desc);
+       kfree_sensitive(generic_desc);
        return err;
 }
 #else /* !CONFIG_CRYPTO_MANAGER_EXTRA_TESTS */
@@ -3665,7 +3665,7 @@ static int drbg_cavs_test(const struct drbg_testvec *test, int pr,
        if (IS_ERR(drng)) {
                printk(KERN_ERR "alg: drbg: could not allocate DRNG handle for "
                       "%s\n", driver);
-               kzfree(buf);
+               kfree_sensitive(buf);
                return -ENOMEM;
        }
 
@@ -3712,7 +3712,7 @@ static int drbg_cavs_test(const struct drbg_testvec *test, int pr,
 
 outbuf:
        crypto_free_rng(drng);
-       kzfree(buf);
+       kfree_sensitive(buf);
        return ret;
 }
 
index 5a3ff25..1a3309f 100644 (file)
@@ -137,7 +137,7 @@ static void __zstd_exit(void *ctx)
 static void zstd_free_ctx(struct crypto_scomp *tfm, void *ctx)
 {
        __zstd_exit(ctx);
-       kzfree(ctx);
+       kfree_sensitive(ctx);
 }
 
 static void zstd_exit(struct crypto_tfm *tfm)
index 5b02f69..508b80f 100644 (file)
@@ -368,8 +368,8 @@ static ssize_t node_read_meminfo(struct device *dev,
        unsigned long sreclaimable, sunreclaimable;
 
        si_meminfo_node(&i, nid);
-       sreclaimable = node_page_state(pgdat, NR_SLAB_RECLAIMABLE);
-       sunreclaimable = node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE);
+       sreclaimable = node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B);
+       sunreclaimable = node_page_state_pages(pgdat, NR_SLAB_UNRECLAIMABLE_B);
        n = sprintf(buf,
                       "Node %d MemTotal:       %8lu kB\n"
                       "Node %d MemFree:        %8lu kB\n"
@@ -440,9 +440,9 @@ static ssize_t node_read_meminfo(struct device *dev,
                       nid, K(node_page_state(pgdat, NR_FILE_MAPPED)),
                       nid, K(node_page_state(pgdat, NR_ANON_MAPPED)),
                       nid, K(i.sharedram),
-                      nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB),
+                      nid, node_page_state(pgdat, NR_KERNEL_STACK_KB),
 #ifdef CONFIG_SHADOW_CALL_STACK
-                      nid, sum_zone_node_page_state(nid, NR_KERNEL_SCS_KB),
+                      nid, node_page_state(pgdat, NR_KERNEL_SCS_KB),
 #endif
                       nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)),
                       nid, 0UL,
@@ -513,7 +513,7 @@ static ssize_t node_read_vmstat(struct device *dev,
 
        for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
                n += sprintf(buf+n, "%s %lu\n", node_stat_name(i),
-                            node_page_state(pgdat, i));
+                            node_page_state_pages(pgdat, i));
 
        return n;
 }
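
In the node sysfs hunks above, the per-node slab counters move to the byte-based NR_SLAB_{RECLAIMABLE,UNRECLAIMABLE}_B items read through node_page_state_pages(), and the kernel-stack counters become node rather than zone statistics. A hypothetical usage sketch of how such a reader would arrive at the kilobyte values shown in sysfs (slab_kb() is illustrative, not a kernel function; the shift mirrors the file's existing pages-to-KiB K() macro):

    static unsigned long slab_kb(struct pglist_data *pgdat)
    {
            unsigned long pages;

            /* _B items are byte-accounted; this accessor reports them in pages. */
            pages = node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) +
                    node_page_state_pages(pgdat, NR_SLAB_UNRECLAIMABLE_B);
            return pages << (PAGE_SHIFT - 10);      /* pages -> KiB */
    }
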
index a3eeccf..c6ea5d3 100644 (file)
@@ -36,7 +36,6 @@
 #include <linux/io.h>
 #include <linux/rbtree.h>
 #include <asm/setup.h>
-#include <asm/pgalloc.h>
 #include <asm/hypervisor.h>
 #include <xen/grant_table.h>
 #include <xen/page.h>
index 1e4f9a5..b4d5fea 100644 (file)
@@ -254,7 +254,7 @@ theend_iv:
                offset = areq->cryptlen - ivsize;
                if (rctx->op_dir & CE_DECRYPTION) {
                        memcpy(areq->iv, backup_iv, ivsize);
-                       kzfree(backup_iv);
+                       kfree_sensitive(backup_iv);
                } else {
                        scatterwalk_map_and_copy(areq->iv, areq->dst, offset,
                                                 ivsize, 0);
index 7a13167..7b39b44 100644 (file)
@@ -249,7 +249,7 @@ theend_iv:
                        if (rctx->op_dir & SS_DECRYPTION) {
                                memcpy(areq->iv, backup_iv, ivsize);
                                memzero_explicit(backup_iv, ivsize);
-                               kzfree(backup_iv);
+                               kfree_sensitive(backup_iv);
                        } else {
                                scatterwalk_map_and_copy(areq->iv, areq->dst, offset,
                                                         ivsize, 0);
index 5880b94..d932107 100644 (file)
@@ -252,8 +252,8 @@ static int meson_cipher(struct skcipher_request *areq)
                }
        }
 theend:
-       kzfree(bkeyiv);
-       kzfree(backup_iv);
+       kfree_sensitive(bkeyiv);
+       kfree_sensitive(backup_iv);
 
        return err;
 }
index ff02cc0..9bd8e51 100644 (file)
@@ -69,7 +69,7 @@ static void atmel_ecdh_done(struct atmel_i2c_work_data *work_data, void *areq,
 
        /* fall through */
 free_work_data:
-       kzfree(work_data);
+       kfree_sensitive(work_data);
        kpp_request_complete(req, status);
 }
 
index 2e44d68..dd5f101 100644 (file)
@@ -854,14 +854,14 @@ static int caam_rsa_dec(struct akcipher_request *req)
 
 static void caam_rsa_free_key(struct caam_rsa_key *key)
 {
-       kzfree(key->d);
-       kzfree(key->p);
-       kzfree(key->q);
-       kzfree(key->dp);
-       kzfree(key->dq);
-       kzfree(key->qinv);
-       kzfree(key->tmp1);
-       kzfree(key->tmp2);
+       kfree_sensitive(key->d);
+       kfree_sensitive(key->p);
+       kfree_sensitive(key->q);
+       kfree_sensitive(key->dp);
+       kfree_sensitive(key->dq);
+       kfree_sensitive(key->qinv);
+       kfree_sensitive(key->tmp1);
+       kfree_sensitive(key->tmp2);
        kfree(key->e);
        kfree(key->n);
        memset(key, 0, sizeof(*key));
@@ -1018,17 +1018,17 @@ static void caam_rsa_set_priv_key_form(struct caam_rsa_ctx *ctx,
        return;
 
 free_dq:
-       kzfree(rsa_key->dq);
+       kfree_sensitive(rsa_key->dq);
 free_dp:
-       kzfree(rsa_key->dp);
+       kfree_sensitive(rsa_key->dp);
 free_tmp2:
-       kzfree(rsa_key->tmp2);
+       kfree_sensitive(rsa_key->tmp2);
 free_tmp1:
-       kzfree(rsa_key->tmp1);
+       kfree_sensitive(rsa_key->tmp1);
 free_q:
-       kzfree(rsa_key->q);
+       kfree_sensitive(rsa_key->q);
 free_p:
-       kzfree(rsa_key->p);
+       kfree_sensitive(rsa_key->p);
 }
 
 static int caam_rsa_set_priv_key(struct crypto_akcipher *tfm, const void *key,
index 0f72e9a..a152459 100644 (file)
@@ -74,7 +74,7 @@ static void cleanup_worker_threads(struct cpt_vf *cptvf)
        for (i = 0; i < cptvf->nr_queues; i++)
                tasklet_kill(&cwqe_info->vq_wqe[i].twork);
 
-       kzfree(cwqe_info);
+       kfree_sensitive(cwqe_info);
        cptvf->wqe_info = NULL;
 }
 
@@ -88,7 +88,7 @@ static void free_pending_queues(struct pending_qinfo *pqinfo)
                        continue;
 
                /* free single queue */
-               kzfree((queue->head));
+               kfree_sensitive((queue->head));
 
                queue->front = 0;
                queue->rear = 0;
@@ -189,7 +189,7 @@ static void free_command_queues(struct cpt_vf *cptvf,
                        chunk->head = NULL;
                        chunk->dma_addr = 0;
                        hlist_del(&chunk->nextchunk);
-                       kzfree(chunk);
+                       kfree_sensitive(chunk);
                }
 
                queue->nchunks = 0;
index 3878b01..dc5fda5 100644 (file)
@@ -305,12 +305,12 @@ static void do_request_cleanup(struct cpt_vf *cptvf,
                }
        }
 
-       kzfree(info->scatter_components);
-       kzfree(info->gather_components);
-       kzfree(info->out_buffer);
-       kzfree(info->in_buffer);
-       kzfree((void *)info->completion_addr);
-       kzfree(info);
+       kfree_sensitive(info->scatter_components);
+       kfree_sensitive(info->gather_components);
+       kfree_sensitive(info->out_buffer);
+       kfree_sensitive(info->in_buffer);
+       kfree_sensitive((void *)info->completion_addr);
+       kfree_sensitive(info);
 }
 
 static void do_post_process(struct cpt_vf *cptvf, struct cpt_info_buffer *info)
index 5cbc64b..a5cdc2b 100644 (file)
@@ -90,7 +90,7 @@ static void nitrox_free_aqm_queues(struct nitrox_device *ndev)
 
        for (i = 0; i < ndev->nr_queues; i++) {
                nitrox_cmdq_cleanup(ndev->aqmq[i]);
-               kzfree(ndev->aqmq[i]);
+               kfree_sensitive(ndev->aqmq[i]);
                ndev->aqmq[i] = NULL;
        }
 }
@@ -122,7 +122,7 @@ static int nitrox_alloc_aqm_queues(struct nitrox_device *ndev)
 
                err = nitrox_cmdq_init(cmdq, AQM_Q_ALIGN_BYTES);
                if (err) {
-                       kzfree(cmdq);
+                       kfree_sensitive(cmdq);
                        goto aqmq_fail;
                }
                ndev->aqmq[i] = cmdq;
index 4985bc8..7df71fc 100644 (file)
@@ -260,7 +260,7 @@ void *zip_alloc_scomp_ctx_deflate(struct crypto_scomp *tfm)
        ret = zip_ctx_init(zip_ctx, 0);
 
        if (ret) {
-               kzfree(zip_ctx);
+               kfree_sensitive(zip_ctx);
                return ERR_PTR(ret);
        }
 
@@ -279,7 +279,7 @@ void *zip_alloc_scomp_ctx_lzs(struct crypto_scomp *tfm)
        ret = zip_ctx_init(zip_ctx, 1);
 
        if (ret) {
-               kzfree(zip_ctx);
+               kfree_sensitive(zip_ctx);
                return ERR_PTR(ret);
        }
 
@@ -291,7 +291,7 @@ void zip_free_scomp_ctx(struct crypto_scomp *tfm, void *ctx)
        struct zip_kernel_ctx *zip_ctx = ctx;
 
        zip_ctx_exit(zip_ctx);
-       kzfree(zip_ctx);
+       kfree_sensitive(zip_ctx);
 }
 
 int zip_scomp_compress(struct crypto_scomp *tfm,
index 649c91d..1223ac7 100644 (file)
@@ -112,13 +112,13 @@ static int ccp_check_key_length(unsigned int len)
 static void ccp_rsa_free_key_bufs(struct ccp_ctx *ctx)
 {
        /* Clean up old key data */
-       kzfree(ctx->u.rsa.e_buf);
+       kfree_sensitive(ctx->u.rsa.e_buf);
        ctx->u.rsa.e_buf = NULL;
        ctx->u.rsa.e_len = 0;
-       kzfree(ctx->u.rsa.n_buf);
+       kfree_sensitive(ctx->u.rsa.n_buf);
        ctx->u.rsa.n_buf = NULL;
        ctx->u.rsa.n_len = 0;
-       kzfree(ctx->u.rsa.d_buf);
+       kfree_sensitive(ctx->u.rsa.d_buf);
        ctx->u.rsa.d_buf = NULL;
        ctx->u.rsa.d_len = 0;
 }
index 1cf51ed..35794c7 100644 (file)
@@ -448,7 +448,7 @@ static int cc_get_plain_hmac_key(struct crypto_aead *tfm, const u8 *authkey,
                if (dma_mapping_error(dev, key_dma_addr)) {
                        dev_err(dev, "Mapping key va=0x%p len=%u for DMA failed\n",
                                key, keylen);
-                       kzfree(key);
+                       kfree_sensitive(key);
                        return -ENOMEM;
                }
                if (keylen > blocksize) {
@@ -533,7 +533,7 @@ static int cc_get_plain_hmac_key(struct crypto_aead *tfm, const u8 *authkey,
        if (key_dma_addr)
                dma_unmap_single(dev, key_dma_addr, keylen, DMA_TO_DEVICE);
 
-       kzfree(key);
+       kfree_sensitive(key);
 
        return rc;
 }
index b2bd093..a5e041d 100644 (file)
@@ -488,7 +488,7 @@ void cc_unmap_aead_request(struct device *dev, struct aead_request *req)
        if (areq_ctx->gen_ctx.iv_dma_addr) {
                dma_unmap_single(dev, areq_ctx->gen_ctx.iv_dma_addr,
                                 hw_iv_size, DMA_BIDIRECTIONAL);
-               kzfree(areq_ctx->gen_ctx.iv);
+               kfree_sensitive(areq_ctx->gen_ctx.iv);
        }
 
        /* Release pool */
@@ -559,7 +559,7 @@ static int cc_aead_chain_iv(struct cc_drvdata *drvdata,
        if (dma_mapping_error(dev, areq_ctx->gen_ctx.iv_dma_addr)) {
                dev_err(dev, "Mapping iv %u B at va=%pK for DMA failed\n",
                        hw_iv_size, req->iv);
-               kzfree(areq_ctx->gen_ctx.iv);
+               kfree_sensitive(areq_ctx->gen_ctx.iv);
                areq_ctx->gen_ctx.iv = NULL;
                rc = -ENOMEM;
                goto chain_iv_exit;
index 076669d..d77ae98 100644 (file)
@@ -257,7 +257,7 @@ static void cc_cipher_exit(struct crypto_tfm *tfm)
                &ctx_p->user.key_dma_addr);
 
        /* Free key buffer in context */
-       kzfree(ctx_p->user.key);
+       kfree_sensitive(ctx_p->user.key);
        dev_dbg(dev, "Free key buffer in context. key=@%p\n", ctx_p->user.key);
 }
 
@@ -881,7 +881,7 @@ static void cc_cipher_complete(struct device *dev, void *cc_req, int err)
                /* Not a BACKLOG notification */
                cc_unmap_cipher_request(dev, req_ctx, ivsize, src, dst);
                memcpy(req->iv, req_ctx->iv, ivsize);
-               kzfree(req_ctx->iv);
+               kfree_sensitive(req_ctx->iv);
        }
 
        skcipher_request_complete(req, err);
@@ -994,7 +994,7 @@ static int cc_cipher_process(struct skcipher_request *req,
 
 exit_process:
        if (rc != -EINPROGRESS && rc != -EBUSY) {
-               kzfree(req_ctx->iv);
+               kfree_sensitive(req_ctx->iv);
        }
 
        return rc;
index d531078..683c9a4 100644 (file)
@@ -764,7 +764,7 @@ static int cc_hash_setkey(struct crypto_ahash *ahash, const u8 *key,
                if (dma_mapping_error(dev, ctx->key_params.key_dma_addr)) {
                        dev_err(dev, "Mapping key va=0x%p len=%u for DMA failed\n",
                                ctx->key_params.key, keylen);
-                       kzfree(ctx->key_params.key);
+                       kfree_sensitive(ctx->key_params.key);
                        return -ENOMEM;
                }
                dev_dbg(dev, "mapping key-buffer: key_dma_addr=%pad keylen=%u\n",
@@ -913,7 +913,7 @@ out:
                        &ctx->key_params.key_dma_addr, ctx->key_params.keylen);
        }
 
-       kzfree(ctx->key_params.key);
+       kfree_sensitive(ctx->key_params.key);
 
        return rc;
 }
@@ -950,7 +950,7 @@ static int cc_xcbc_setkey(struct crypto_ahash *ahash,
        if (dma_mapping_error(dev, ctx->key_params.key_dma_addr)) {
                dev_err(dev, "Mapping key va=0x%p len=%u for DMA failed\n",
                        key, keylen);
-               kzfree(ctx->key_params.key);
+               kfree_sensitive(ctx->key_params.key);
                return -ENOMEM;
        }
        dev_dbg(dev, "mapping key-buffer: key_dma_addr=%pad keylen=%u\n",
@@ -999,7 +999,7 @@ static int cc_xcbc_setkey(struct crypto_ahash *ahash,
        dev_dbg(dev, "Unmapped key-buffer: key_dma_addr=%pad keylen=%u\n",
                &ctx->key_params.key_dma_addr, ctx->key_params.keylen);
 
-       kzfree(ctx->key_params.key);
+       kfree_sensitive(ctx->key_params.key);
 
        return rc;
 }
index 1d7649e..33fb277 100644 (file)
@@ -107,7 +107,7 @@ void cc_req_mgr_fini(struct cc_drvdata *drvdata)
        /* Kill tasklet */
        tasklet_kill(&req_mgr_h->comptask);
 #endif
-       kzfree(req_mgr_h);
+       kfree_sensitive(req_mgr_h);
        drvdata->request_mgr_handle = NULL;
 }
 
index bd0bd9f..f2a2fc1 100644 (file)
@@ -1157,7 +1157,7 @@ static int mv_cesa_ahmac_pad_init(struct ahash_request *req,
                }
 
                /* Set the memory region to 0 to avoid any leak. */
-               kzfree(keydup);
+               kfree_sensitive(keydup);
 
                if (ret)
                        return ret;
index ce31683..228fe8e 100644 (file)
@@ -68,7 +68,7 @@ static void cleanup_worker_threads(struct otx_cptvf *cptvf)
        for (i = 0; i < cptvf->num_queues; i++)
                tasklet_kill(&cwqe_info->vq_wqe[i].twork);
 
-       kzfree(cwqe_info);
+       kfree_sensitive(cwqe_info);
        cptvf->wqe_info = NULL;
 }
 
@@ -82,7 +82,7 @@ static void free_pending_queues(struct otx_cpt_pending_qinfo *pqinfo)
                        continue;
 
                /* free single queue */
-               kzfree((queue->head));
+               kfree_sensitive((queue->head));
                queue->front = 0;
                queue->rear = 0;
                queue->qlen = 0;
@@ -176,7 +176,7 @@ static void free_command_queues(struct otx_cptvf *cptvf,
                        chunk->head = NULL;
                        chunk->dma_addr = 0;
                        list_del(&chunk->nextchunk);
-                       kzfree(chunk);
+                       kfree_sensitive(chunk);
                }
                queue->num_chunks = 0;
                queue->idx = 0;
index d912fe0..a02d059 100644 (file)
@@ -215,7 +215,7 @@ static inline void do_request_cleanup(struct pci_dev *pdev,
                                                 DMA_BIDIRECTIONAL);
                }
        }
-       kzfree(info);
+       kfree_sensitive(info);
 }
 
 struct otx_cptvf_wqe;
index f03c238..40882d6 100644 (file)
@@ -746,7 +746,7 @@ void nx_crypto_ctx_exit(struct crypto_tfm *tfm)
 {
        struct nx_crypto_ctx *nx_ctx = crypto_tfm_ctx(tfm);
 
-       kzfree(nx_ctx->kmem);
+       kfree_sensitive(nx_ctx->kmem);
        nx_ctx->csbcpb = NULL;
        nx_ctx->csbcpb_aead = NULL;
        nx_ctx->in_sg = NULL;
@@ -762,7 +762,7 @@ void nx_crypto_ctx_aead_exit(struct crypto_aead *tfm)
 {
        struct nx_crypto_ctx *nx_ctx = crypto_aead_ctx(tfm);
 
-       kzfree(nx_ctx->kmem);
+       kfree_sensitive(nx_ctx->kmem);
 }
 
 static int nx_probe(struct vio_dev *viodev, const struct vio_device_id *id)
index b260195..583c0b5 100644 (file)
@@ -167,7 +167,7 @@ static int virtio_crypto_alg_skcipher_init_session(
                                num_in, vcrypto, GFP_ATOMIC);
        if (err < 0) {
                spin_unlock(&vcrypto->ctrl_lock);
-               kzfree(cipher_key);
+               kfree_sensitive(cipher_key);
                return err;
        }
        virtqueue_kick(vcrypto->ctrl_vq);
@@ -184,7 +184,7 @@ static int virtio_crypto_alg_skcipher_init_session(
                spin_unlock(&vcrypto->ctrl_lock);
                pr_err("virtio_crypto: Create session failed status: %u\n",
                        le32_to_cpu(vcrypto->input.status));
-               kzfree(cipher_key);
+               kfree_sensitive(cipher_key);
                return -EINVAL;
        }
 
@@ -197,7 +197,7 @@ static int virtio_crypto_alg_skcipher_init_session(
 
        spin_unlock(&vcrypto->ctrl_lock);
 
-       kzfree(cipher_key);
+       kfree_sensitive(cipher_key);
        return 0;
 }
 
@@ -472,9 +472,9 @@ __virtio_crypto_skcipher_do_req(struct virtio_crypto_sym_request *vc_sym_req,
        return 0;
 
 free_iv:
-       kzfree(iv);
+       kfree_sensitive(iv);
 free:
-       kzfree(req_data);
+       kfree_sensitive(req_data);
        kfree(sgs);
        return err;
 }
@@ -583,7 +583,7 @@ static void virtio_crypto_skcipher_finalize_req(
                scatterwalk_map_and_copy(req->iv, req->dst,
                                         req->cryptlen - AES_BLOCK_SIZE,
                                         AES_BLOCK_SIZE, 0);
-       kzfree(vc_sym_req->iv);
+       kfree_sensitive(vc_sym_req->iv);
        virtcrypto_clear_request(&vc_sym_req->base);
 
        crypto_finalize_skcipher_request(vc_sym_req->base.dataq->engine,
index 77e744e..0c66d61 100644 (file)
@@ -17,7 +17,7 @@ void
 virtcrypto_clear_request(struct virtio_crypto_request *vc_req)
 {
        if (vc_req) {
-               kzfree(vc_req->req_data);
+               kfree_sensitive(vc_req->req_data);
                kfree(vc_req->sgs);
        }
 }
index 4c2972f..6de86e7 100644 (file)
@@ -28,7 +28,6 @@
 
 #if defined(CONFIG_ARM) && !defined(CONFIG_IOMMU_DMA)
 #include <asm/dma-iommu.h>
-#include <asm/pgalloc.h>
 #else
 #define arm_iommu_create_mapping(...)  NULL
 #define arm_iommu_attach_device(...)   -ENODEV
index b437a14..37dcc52 100644 (file)
@@ -407,7 +407,7 @@ static void crypt_iv_lmk_dtr(struct crypt_config *cc)
                crypto_free_shash(lmk->hash_tfm);
        lmk->hash_tfm = NULL;
 
-       kzfree(lmk->seed);
+       kfree_sensitive(lmk->seed);
        lmk->seed = NULL;
 }
 
@@ -558,9 +558,9 @@ static void crypt_iv_tcw_dtr(struct crypt_config *cc)
 {
        struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
 
-       kzfree(tcw->iv_seed);
+       kfree_sensitive(tcw->iv_seed);
        tcw->iv_seed = NULL;
-       kzfree(tcw->whitening);
+       kfree_sensitive(tcw->whitening);
        tcw->whitening = NULL;
 
        if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm))
@@ -994,8 +994,8 @@ static int crypt_iv_elephant(struct crypt_config *cc, struct dm_crypt_request *d
 
        kunmap_atomic(data);
 out:
-       kzfree(ks);
-       kzfree(es);
+       kfree_sensitive(ks);
+       kfree_sensitive(es);
        skcipher_request_free(req);
        return r;
 }
@@ -2294,7 +2294,7 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
 
        key = request_key(type, key_desc + 1, NULL);
        if (IS_ERR(key)) {
-               kzfree(new_key_string);
+               kfree_sensitive(new_key_string);
                return PTR_ERR(key);
        }
 
@@ -2304,7 +2304,7 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
        if (ret < 0) {
                up_read(&key->sem);
                key_put(key);
-               kzfree(new_key_string);
+               kfree_sensitive(new_key_string);
                return ret;
        }
 
@@ -2318,10 +2318,10 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
 
        if (!ret) {
                set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
-               kzfree(cc->key_string);
+               kfree_sensitive(cc->key_string);
                cc->key_string = new_key_string;
        } else
-               kzfree(new_key_string);
+               kfree_sensitive(new_key_string);
 
        return ret;
 }
@@ -2382,7 +2382,7 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
        clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
 
        /* wipe references to any kernel keyring key */
-       kzfree(cc->key_string);
+       kfree_sensitive(cc->key_string);
        cc->key_string = NULL;
 
        /* Decode key from its hex representation. */
@@ -2414,7 +2414,7 @@ static int crypt_wipe_key(struct crypt_config *cc)
                        return r;
        }
 
-       kzfree(cc->key_string);
+       kfree_sensitive(cc->key_string);
        cc->key_string = NULL;
        r = crypt_setkey(cc);
        memset(&cc->key, 0, cc->key_size * sizeof(u8));
@@ -2493,15 +2493,15 @@ static void crypt_dtr(struct dm_target *ti)
        if (cc->dev)
                dm_put_device(ti, cc->dev);
 
-       kzfree(cc->cipher_string);
-       kzfree(cc->key_string);
-       kzfree(cc->cipher_auth);
-       kzfree(cc->authenc_key);
+       kfree_sensitive(cc->cipher_string);
+       kfree_sensitive(cc->key_string);
+       kfree_sensitive(cc->cipher_auth);
+       kfree_sensitive(cc->authenc_key);
 
        mutex_destroy(&cc->bio_alloc_lock);
 
        /* Must zero key material before freeing */
-       kzfree(cc);
+       kfree_sensitive(cc);
 
        spin_lock(&dm_crypt_clients_lock);
        WARN_ON(!dm_crypt_clients_n);
index 5da3eb6..8c8d940 100644 (file)
@@ -3405,8 +3405,8 @@ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_int
 
 static void free_alg(struct alg_spec *a)
 {
-       kzfree(a->alg_string);
-       kzfree(a->key);
+       kfree_sensitive(a->alg_string);
+       kfree_sensitive(a->key);
        memset(a, 0, sizeof *a);
 }
 
@@ -4337,7 +4337,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
                for (i = 0; i < ic->journal_sections; i++) {
                        struct skcipher_request *req = ic->sk_requests[i];
                        if (req) {
-                               kzfree(req->iv);
+                               kfree_sensitive(req->iv);
                                skcipher_request_free(req);
                        }
                }
index c0d139c..2d778d0 100644 (file)
@@ -286,7 +286,7 @@ static void *alloc_dma_buffer(struct vio_dev *vdev, size_t size,
 
        if (dma_mapping_error(&vdev->dev, *dma_handle)) {
                *dma_handle = 0;
-               kzfree(buffer);
+               kfree_sensitive(buffer);
                return NULL;
        }
 
@@ -310,7 +310,7 @@ static void free_dma_buffer(struct vio_dev *vdev, size_t size, void *vaddr,
        dma_unmap_single(&vdev->dev, dma_handle, size, DMA_BIDIRECTIONAL);
 
        /* deallocate memory */
-       kzfree(vaddr);
+       kfree_sensitive(vaddr);
 }
 
 /**
@@ -883,7 +883,7 @@ static int ibmvmc_close(struct inode *inode, struct file *file)
                spin_unlock_irqrestore(&hmc->lock, flags);
        }
 
-       kzfree(session);
+       kfree_sensitive(session);
 
        return rc;
 }
index 0874ae4..3ab6db2 100644 (file)
@@ -137,7 +137,7 @@ static void hclge_free_vector_ring_chain(struct hnae3_ring_chain_node *head)
 
        while (chain) {
                chain_tmp = chain->next;
-               kzfree(chain);
+               kfree_sensitive(chain);
                chain = chain_tmp;
        }
 }
index 6516980..eca7352 100644 (file)
@@ -960,9 +960,9 @@ int ixgbe_ipsec_vf_add_sa(struct ixgbe_adapter *adapter, u32 *msgbuf, u32 vf)
        return 0;
 
 err_aead:
-       kzfree(xs->aead);
+       kfree_sensitive(xs->aead);
 err_xs:
-       kzfree(xs);
+       kfree_sensitive(xs);
 err_out:
        msgbuf[1] = err;
        return err;
@@ -1047,7 +1047,7 @@ int ixgbe_ipsec_vf_del_sa(struct ixgbe_adapter *adapter, u32 *msgbuf, u32 vf)
        ixgbe_ipsec_del_sa(xs);
 
        /* remove the xs that was made-up in the add request */
-       kzfree(xs);
+       kfree_sensitive(xs);
 
        return 0;
 }
index de3b57d..208f6e2 100644 (file)
@@ -222,7 +222,7 @@ out_free:
        kfree(state->sha1_digest);
        if (state->sha1) {
                crypto_free_shash(state->sha1->tfm);
-               kzfree(state->sha1);
+               kfree_sensitive(state->sha1);
        }
        kfree(state);
 out:
@@ -238,8 +238,8 @@ static void mppe_free(void *arg)
        if (state) {
                kfree(state->sha1_digest);
                crypto_free_shash(state->sha1->tfm);
-               kzfree(state->sha1);
-               kzfree(state);
+               kfree_sensitive(state->sha1);
+               kfree_sensitive(state);
        }
 }
 
index 201a226..3dd3b76 100644 (file)
@@ -114,7 +114,7 @@ static struct noise_keypair *keypair_create(struct wg_peer *peer)
 
 static void keypair_free_rcu(struct rcu_head *rcu)
 {
-       kzfree(container_of(rcu, struct noise_keypair, rcu));
+       kfree_sensitive(container_of(rcu, struct noise_keypair, rcu));
 }
 
 static void keypair_free_kref(struct kref *kref)
@@ -821,7 +821,7 @@ bool wg_noise_handshake_begin_session(struct noise_handshake *handshake,
                        handshake->entry.peer->device->index_hashtable,
                        &handshake->entry, &new_keypair->entry);
        } else {
-               kzfree(new_keypair);
+               kfree_sensitive(new_keypair);
        }
        rcu_read_unlock_bh();
 
index 1d634bd..b3b6370 100644 (file)
@@ -203,7 +203,7 @@ static void rcu_release(struct rcu_head *rcu)
        /* The final zeroing takes care of clearing any remaining handshake key
         * material and other potentially sensitive information.
         */
-       kzfree(peer);
+       kfree_sensitive(peer);
 }
 
 static void kref_release(struct kref *refcount)
index 24cb1b1..9463c10 100644 (file)
@@ -1369,7 +1369,7 @@ static void iwl_pcie_rx_handle_rb(struct iwl_trans *trans,
                                           &rxcb, rxq->id);
 
                if (reclaim) {
-                       kzfree(txq->entries[cmd_index].free_buf);
+                       kfree_sensitive(txq->entries[cmd_index].free_buf);
                        txq->entries[cmd_index].free_buf = NULL;
                }
 
index 7fc7542..606bef2 100644 (file)
@@ -1026,7 +1026,7 @@ static int iwl_pcie_gen2_enqueue_hcmd(struct iwl_trans *trans,
        BUILD_BUG_ON(IWL_TFH_NUM_TBS > sizeof(out_meta->tbs) * BITS_PER_BYTE);
        out_meta->flags = cmd->flags;
        if (WARN_ON_ONCE(txq->entries[idx].free_buf))
-               kzfree(txq->entries[idx].free_buf);
+               kfree_sensitive(txq->entries[idx].free_buf);
        txq->entries[idx].free_buf = dup_buf;
 
        trace_iwlwifi_dev_hcmd(trans->dev, cmd, cmd_size, &out_cmd->hdr_wide);
@@ -1257,8 +1257,8 @@ static void iwl_pcie_gen2_txq_free(struct iwl_trans *trans, int txq_id)
        /* De-alloc array of command/tx buffers */
        if (txq_id == trans->txqs.cmd.q_id)
                for (i = 0; i < txq->n_window; i++) {
-                       kzfree(txq->entries[i].cmd);
-                       kzfree(txq->entries[i].free_buf);
+                       kfree_sensitive(txq->entries[i].cmd);
+                       kfree_sensitive(txq->entries[i].free_buf);
                }
        del_timer_sync(&txq->stuck_timer);
 
index 5c6c3fa..eb396c0 100644 (file)
@@ -721,8 +721,8 @@ static void iwl_pcie_txq_free(struct iwl_trans *trans, int txq_id)
        /* De-alloc array of command/tx buffers */
        if (txq_id == trans->txqs.cmd.q_id)
                for (i = 0; i < txq->n_window; i++) {
-                       kzfree(txq->entries[i].cmd);
-                       kzfree(txq->entries[i].free_buf);
+                       kfree_sensitive(txq->entries[i].cmd);
+                       kfree_sensitive(txq->entries[i].free_buf);
                }
 
        /* De-alloc circular buffer of TFDs */
@@ -1765,7 +1765,7 @@ static int iwl_pcie_enqueue_hcmd(struct iwl_trans *trans,
        BUILD_BUG_ON(IWL_TFH_NUM_TBS > sizeof(out_meta->tbs) * BITS_PER_BYTE);
        out_meta->flags = cmd->flags;
        if (WARN_ON_ONCE(txq->entries[idx].free_buf))
-               kzfree(txq->entries[idx].free_buf);
+               kfree_sensitive(txq->entries[idx].free_buf);
        txq->entries[idx].free_buf = dup_buf;
 
        trace_iwlwifi_dev_hcmd(trans->dev, cmd, cmd_size, &out_cmd->hdr_wide);
index 1d4dae4..7b6c4ae 100644 (file)
@@ -31,8 +31,8 @@ static int orinoco_set_key(struct orinoco_private *priv, int index,
                           enum orinoco_alg alg, const u8 *key, int key_len,
                           const u8 *seq, int seq_len)
 {
-       kzfree(priv->keys[index].key);
-       kzfree(priv->keys[index].seq);
+       kfree_sensitive(priv->keys[index].key);
+       kfree_sensitive(priv->keys[index].seq);
 
        if (key_len) {
                priv->keys[index].key = kzalloc(key_len, GFP_ATOMIC);
index 1a1d5e3..1ea0463 100644 (file)
@@ -219,8 +219,8 @@ static inline void ap_init_message(struct ap_message *ap_msg)
  */
 static inline void ap_release_message(struct ap_message *ap_msg)
 {
-       kzfree(ap_msg->msg);
-       kzfree(ap_msg->private);
+       kfree_sensitive(ap_msg->msg);
+       kfree_sensitive(ap_msg->private);
 }
 
 /*
index b10a92a..eaaf6a5 100644 (file)
@@ -245,7 +245,7 @@ michael_mic(u8 *key, u8 *data, unsigned int len, u8 priority, u8 *result)
        ret = crypto_shash_finup(desc, data + 12, len - 12, result);
 
 err_free_desc:
-       kzfree(desc);
+       kfree_sensitive(desc);
 
 err_free_tfm:
        crypto_free_shash(tfm);
index 0f15c96..7f74e1d 100644 (file)
@@ -2251,7 +2251,7 @@ static void gf_mulx(u8 *pad)
 
 static void aes_encrypt_deinit(void *ctx)
 {
-       kzfree(ctx);
+       kfree_sensitive(ctx);
 }
 
 
index b809c00..7b091c5 100644 (file)
@@ -429,7 +429,7 @@ static netdev_tx_t p80211knetdev_hard_start_xmit(struct sk_buff *skb,
 failed:
        /* Free up the WEP buffer if it's not the same as the skb */
        if ((p80211_wep.data) && (p80211_wep.data != skb->data))
-               kzfree(p80211_wep.data);
+               kfree_sensitive(p80211_wep.data);
 
        /* we always free the skb here, never in a lower level. */
        if (!result)
index 0e54627..62d912b 100644 (file)
@@ -484,7 +484,7 @@ static int chap_server_compute_hash(
        pr_debug("[server] Sending CHAP_R=0x%s\n", response);
        auth_ret = 0;
 out:
-       kzfree(desc);
+       kfree_sensitive(desc);
        if (tfm)
                crypto_free_shash(tfm);
        kfree(initiatorchg);
index b1d8b02..37ffccd 100644 (file)
@@ -58,7 +58,6 @@
 #include <linux/sysctl.h>
 
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 #include <asm/tlb.h>
 
 #include <asm/xen/hypervisor.h>
index 095d683..63abe6c 100644 (file)
@@ -25,7 +25,6 @@
 #include <linux/miscdevice.h>
 #include <linux/moduleparam.h>
 
-#include <asm/pgalloc.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
 
index a88aa3a..aa4c122 100644 (file)
@@ -201,6 +201,27 @@ config TMPFS_XATTR
 
          If unsure, say N.
 
+config TMPFS_INODE64
+       bool "Use 64-bit ino_t by default in tmpfs"
+       depends on TMPFS && 64BIT
+       default n
+       help
+         tmpfs has historically used only inode numbers as wide as an unsigned
+         int. In some cases this can cause wraparound, potentially resulting
+         in multiple files with the same inode number on a single device. This
+         option makes tmpfs use the full width of ino_t by default, without
+         needing to specify the inode64 option when mounting.
+
+         But if a long-lived tmpfs is to be accessed by 32-bit applications so
+         ancient that opening a file larger than 2GiB fails with EINVAL, then
+         the INODE64 config option and inode64 mount option risk operations
+         failing with EOVERFLOW once 33-bit inode numbers are reached.
+
+         To override this configured default, use the inode32 or inode64
+         option when mounting.
+
+         If unsure, say N.
+
 config HUGETLBFS
        bool "HugeTLB file system support"
        depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
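
    As a hedged userspace sketch of the help text above: the configured default can be
    overridden per mount with the inode64 or inode32 option passed in the data argument
    of mount(2). The mount point "/mnt/scratch" is hypothetical.

	#include <sys/mount.h>
	#include <stdio.h>

	int main(void)
	{
		/* "inode64" requests full-width ino_t even when CONFIG_TMPFS_INODE64=n;
		 * "inode32" would do the reverse on an inode64-by-default kernel.
		 */
		if (mount("tmpfs", "/mnt/scratch", "tmpfs", 0, "inode64") != 0) {
			perror("mount tmpfs with inode64");
			return 1;
		}
		return 0;
	}
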
index 91e7cc4..5736bff 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -525,9 +525,9 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
                return -EINTR;
        }
 
-       ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
-                                      PROT_READ | PROT_WRITE,
-                                      MAP_SHARED, 0, &unused, NULL);
+       ctx->mmap_base = do_mmap(ctx->aio_ring_file, 0, ctx->mmap_size,
+                                PROT_READ | PROT_WRITE,
+                                MAP_SHARED, 0, &unused, NULL);
        mmap_write_unlock(mm);
        if (IS_ERR((void *)ctx->mmap_base)) {
                ctx->mmap_size = 0;
index 0f45521..cf306e0 100644 (file)
@@ -38,7 +38,6 @@
 
 #include <linux/uaccess.h>
 #include <asm/param.h>
-#include <asm/pgalloc.h>
 
 typedef char *elf_caddr_t;
 
index 874a551..9daa256 100644 (file)
@@ -797,7 +797,7 @@ calc_seckey(struct cifs_ses *ses)
        ses->auth_key.len = CIFS_SESS_KEY_SIZE;
 
        memzero_explicit(sec_key, CIFS_SESS_KEY_SIZE);
-       kzfree(ctx_arc4);
+       kfree_sensitive(ctx_arc4);
        return 0;
 }
 
index 7e3e5e2..0ad1309 100644 (file)
@@ -2183,7 +2183,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
                        tmp_end++;
                        if (!(tmp_end < end && tmp_end[1] == delim)) {
                                /* No it is not. Set the password to NULL */
-                               kzfree(vol->password);
+                               kfree_sensitive(vol->password);
                                vol->password = NULL;
                                break;
                        }
@@ -2221,7 +2221,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
                                        options = end;
                        }
 
-                       kzfree(vol->password);
+                       kfree_sensitive(vol->password);
                        /* Now build new password string */
                        temp_len = strlen(value);
                        vol->password = kzalloc(temp_len+1, GFP_KERNEL);
@@ -3199,7 +3199,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
                        rc = -ENOMEM;
                        kfree(vol->username);
                        vol->username = NULL;
-                       kzfree(vol->password);
+                       kfree_sensitive(vol->password);
                        vol->password = NULL;
                        goto out_key_put;
                }
@@ -4220,7 +4220,7 @@ void
 cifs_cleanup_volume_info_contents(struct smb_vol *volume_info)
 {
        kfree(volume_info->username);
-       kzfree(volume_info->password);
+       kfree_sensitive(volume_info->password);
        kfree(volume_info->UNC);
        kfree(volume_info->domainname);
        kfree(volume_info->iocharset);
@@ -5339,7 +5339,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
 
 out:
        kfree(vol_info->username);
-       kzfree(vol_info->password);
+       kfree_sensitive(vol_info->password);
        kfree(vol_info);
 
        return tcon;
index a44f58b..6ee8496 100644 (file)
@@ -1191,7 +1191,7 @@ err_free_domainname:
 err_free_unc:
        kfree(new->UNC);
 err_free_password:
-       kzfree(new->password);
+       kfree_sensitive(new->password);
 err_free_username:
        kfree(new->username);
        kfree(new);
index 7642348..1c14cf0 100644 (file)
@@ -103,12 +103,12 @@ sesInfoFree(struct cifs_ses *buf_to_free)
        kfree(buf_to_free->serverOS);
        kfree(buf_to_free->serverDomain);
        kfree(buf_to_free->serverNOS);
-       kzfree(buf_to_free->password);
+       kfree_sensitive(buf_to_free->password);
        kfree(buf_to_free->user_name);
        kfree(buf_to_free->domainName);
-       kzfree(buf_to_free->auth_key.response);
+       kfree_sensitive(buf_to_free->auth_key.response);
        kfree(buf_to_free->iface_list);
-       kzfree(buf_to_free);
+       kfree_sensitive(buf_to_free);
 }
 
 struct cifs_tcon *
@@ -148,7 +148,7 @@ tconInfoFree(struct cifs_tcon *buf_to_free)
        }
        atomic_dec(&tconInfoAllocCount);
        kfree(buf_to_free->nativeFileSystem);
-       kzfree(buf_to_free->password);
+       kfree_sensitive(buf_to_free->password);
        kfree(buf_to_free->crfid.fid);
 #ifdef CONFIG_CIFS_DFS_UPCALL
        kfree(buf_to_free->dfs_path);
index b6b8574..faa2554 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/blkdev.h>
 #include <linux/buffer_head.h>
 #include <linux/sched/mm.h>
+#include <linux/slab.h>
 
 #include "fscrypt_private.h"
 
@@ -187,7 +188,7 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
 fail:
        for (i = 0; i < queue_refs; i++)
                blk_put_queue(blk_key->devs[i]);
-       kzfree(blk_key);
+       kfree_sensitive(blk_key);
        return err;
 }
 
@@ -201,7 +202,7 @@ void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key)
                        blk_crypto_evict_key(blk_key->devs[i], &blk_key->base);
                        blk_put_queue(blk_key->devs[i]);
                }
-               kzfree(blk_key);
+               kfree_sensitive(blk_key);
        }
 }
 
index 71d56f8..e74f239 100644 (file)
@@ -51,7 +51,7 @@ static void free_master_key(struct fscrypt_master_key *mk)
        }
 
        key_put(mk->mk_users);
-       kzfree(mk);
+       kfree_sensitive(mk);
 }
 
 static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec)
@@ -531,7 +531,7 @@ static int fscrypt_provisioning_key_preparse(struct key_preparsed_payload *prep)
 static void fscrypt_provisioning_key_free_preparse(
                                        struct key_preparsed_payload *prep)
 {
-       kzfree(prep->payload.data[0]);
+       kfree_sensitive(prep->payload.data[0]);
 }
 
 static void fscrypt_provisioning_key_describe(const struct key *key,
@@ -548,7 +548,7 @@ static void fscrypt_provisioning_key_describe(const struct key *key,
 
 static void fscrypt_provisioning_key_destroy(struct key *key)
 {
-       kzfree(key->payload.data[0]);
+       kfree_sensitive(key->payload.data[0]);
 }
 
 static struct key_type key_type_fscrypt_provisioning = {
index e4e707f..a3cb525 100644 (file)
@@ -155,7 +155,7 @@ static void free_direct_key(struct fscrypt_direct_key *dk)
 {
        if (dk) {
                fscrypt_destroy_prepared_key(&dk->dk_key);
-               kzfree(dk);
+               kfree_sensitive(dk);
        }
 }
 
@@ -283,7 +283,7 @@ static int setup_v1_file_key_derived(struct fscrypt_info *ci,
 
        err = fscrypt_set_per_file_enc_key(ci, derived_key);
 out:
-       kzfree(derived_key);
+       kfree_sensitive(derived_key);
        return err;
 }
 
index af3eb02..f6a17d2 100644 (file)
@@ -838,7 +838,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 out_release_free_unlock:
        crypto_free_shash(s->hash_tfm);
 out_free_unlock:
-       kzfree(s->block_aligned_filename);
+       kfree_sensitive(s->block_aligned_filename);
 out_unlock:
        mutex_unlock(s->tfm_mutex);
 out:
@@ -847,7 +847,7 @@ out:
                key_put(auth_tok_key);
        }
        skcipher_request_free(s->skcipher_req);
-       kzfree(s->hash_desc);
+       kfree_sensitive(s->hash_desc);
        kfree(s);
        return rc;
 }
index 8646ba7..c0dfd96 100644 (file)
@@ -175,7 +175,7 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon)
        }
        hlist_del(&daemon->euid_chain);
        mutex_unlock(&daemon->mux);
-       kzfree(daemon);
+       kfree_sensitive(daemon);
 out:
        return rc;
 }
index ef5313f..523954d 100644 (file)
@@ -140,7 +140,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
         * already been checked by prepare_hugepage_range.  If you add
         * any error returns here, do so after setting VM_HUGETLB, so
         * is_vm_hugetlb_page tests below unmap_region go the right
-        * way when do_mmap_pgoff unwinds (may be important on powerpc
+        * way when do_mmap unwinds (may be important on powerpc
         * and ia64).
         */
        vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
index a87d439..cd96083 100644 (file)
@@ -1504,7 +1504,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
        na.type = AT_BITMAP;
        na.name = I30;
        na.name_len = 4;
-       bmp_vi = ilookup5(vi->i_sb, vi->i_ino, (test_t)ntfs_test_inode, &na);
+       bmp_vi = ilookup5(vi->i_sb, vi->i_ino, ntfs_test_inode, &na);
        if (bmp_vi) {
                write_inode_now(bmp_vi, !datasync);
                iput(bmp_vi);
index d4359a1..9bb9f09 100644 (file)
 /**
  * ntfs_test_inode - compare two (possibly fake) inodes for equality
  * @vi:                vfs inode which to test
- * @na:                ntfs attribute which is being tested with
+ * @data:      data which is being tested with
  *
  * Compare the ntfs attribute embedded in the ntfs specific part of the vfs
- * inode @vi for equality with the ntfs attribute @na.
+ * inode @vi for equality with the ntfs attribute @data.
  *
  * If searching for the normal file/directory inode, set @na->type to AT_UNUSED.
  * @na->name and @na->name_len are then ignored.
@@ -43,8 +43,9 @@
  * NOTE: This function runs with the inode_hash_lock spin lock held so it is not
  * allowed to sleep.
  */
-int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
+int ntfs_test_inode(struct inode *vi, void *data)
 {
+       ntfs_attr *na = (ntfs_attr *)data;
        ntfs_inode *ni;
 
        if (vi->i_ino != na->mft_no)
@@ -72,9 +73,9 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
 /**
  * ntfs_init_locked_inode - initialize an inode
  * @vi:                vfs inode to initialize
- * @na:                ntfs attribute which to initialize @vi to
+ * @data:      data which to initialize @vi to
  *
- * Initialize the vfs inode @vi with the values from the ntfs attribute @na in
+ * Initialize the vfs inode @vi with the values from the ntfs attribute @data in
  * order to enable ntfs_test_inode() to do its work.
  *
  * If initializing the normal file/directory inode, set @na->type to AT_UNUSED.
@@ -87,8 +88,9 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
  * NOTE: This function runs with the inode->i_lock spin lock held so it is not
  * allowed to sleep. (Hence the GFP_ATOMIC allocation.)
  */
-static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
+static int ntfs_init_locked_inode(struct inode *vi, void *data)
 {
+       ntfs_attr *na = (ntfs_attr *)data;
        ntfs_inode *ni = NTFS_I(vi);
 
        vi->i_ino = na->mft_no;
@@ -131,7 +133,6 @@ static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
        return 0;
 }
 
-typedef int (*set_t)(struct inode *, void *);
 static int ntfs_read_locked_inode(struct inode *vi);
 static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi);
 static int ntfs_read_locked_index_inode(struct inode *base_vi,
@@ -164,8 +165,8 @@ struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no)
        na.name = NULL;
        na.name_len = 0;
 
-       vi = iget5_locked(sb, mft_no, (test_t)ntfs_test_inode,
-                       (set_t)ntfs_init_locked_inode, &na);
+       vi = iget5_locked(sb, mft_no, ntfs_test_inode,
+                       ntfs_init_locked_inode, &na);
        if (unlikely(!vi))
                return ERR_PTR(-ENOMEM);
 
@@ -225,8 +226,8 @@ struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
        na.name = name;
        na.name_len = name_len;
 
-       vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode,
-                       (set_t)ntfs_init_locked_inode, &na);
+       vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode,
+                       ntfs_init_locked_inode, &na);
        if (unlikely(!vi))
                return ERR_PTR(-ENOMEM);
 
@@ -280,8 +281,8 @@ struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
        na.name = name;
        na.name_len = name_len;
 
-       vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode,
-                       (set_t)ntfs_init_locked_inode, &na);
+       vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode,
+                       ntfs_init_locked_inode, &na);
        if (unlikely(!vi))
                return ERR_PTR(-ENOMEM);
 
index 98e670f..363e4e8 100644 (file)
@@ -253,9 +253,7 @@ typedef struct {
        ATTR_TYPE type;
 } ntfs_attr;
 
-typedef int (*test_t)(struct inode *, void *);
-
-extern int ntfs_test_inode(struct inode *vi, ntfs_attr *na);
+extern int ntfs_test_inode(struct inode *vi, void *data);
 
 extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no);
 extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
index fbb9f1b..0d62cd5 100644 (file)
@@ -958,7 +958,7 @@ bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
                 * dirty code path of the inode dirty code path when writing
                 * $MFT occurs.
                 */
-               vi = ilookup5_nowait(sb, mft_no, (test_t)ntfs_test_inode, &na);
+               vi = ilookup5_nowait(sb, mft_no, ntfs_test_inode, &na);
        }
        if (vi) {
                ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
@@ -1019,7 +1019,7 @@ bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
                vi = igrab(mft_vi);
                BUG_ON(vi != mft_vi);
        } else
-               vi = ilookup5_nowait(sb, na.mft_no, (test_t)ntfs_test_inode,
+               vi = ilookup5_nowait(sb, na.mft_no, ntfs_test_inode,
                                &na);
        if (!vi) {
                /*
index aca1662..5d11380 100644 (file)
@@ -16,9 +16,9 @@ config OCFS2_FS
          You'll want to install the ocfs2-tools package in order to at least
          get "mount.ocfs2".
 
-         Project web page:    http://oss.oracle.com/projects/ocfs2
-         Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
-         OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+         Project web page:    https://oss.oracle.com/projects/ocfs2
+         Tools web page:      https://oss.oracle.com/projects/ocfs2-tools
+         OCFS2 mailing lists: https://oss.oracle.com/projects/ocfs2/mailman/
 
          For more information on OCFS2, see the file
          <file:Documentation/filesystems/ocfs2.rst>.
index bb981ec..7b07f5d 100644 (file)
@@ -256,6 +256,8 @@ static int ocfs2_set_acl(handle_t *handle,
                ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0);
 
        kfree(value);
+       if (!ret)
+               set_cached_acl(inode, type, acl);
 
        return ret;
 }
index eaf042f..6e07ddb 100644 (file)
@@ -124,7 +124,7 @@ u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr
                 * parity bits that are part of the bit number
                 * representation.  Huh?
                 *
-                * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
+                * <wikipedia href="https://en.wikipedia.org/wiki/Hamming_code">
                 * In other words, the parity bit at position 2^k
                 * checks bits in positions having bit k set in
                 * their binary representation.  Conversely, for
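
    A small illustrative program (not from the patch) enumerating the rule in the
    comment above for the parity bit at position 4 (2^k with k = 2): it covers every
    1-based position whose binary representation has bit 2 set.

	#include <stdio.h>

	int main(void)
	{
		unsigned int parity_pos = 4;	/* 2^2 */
		unsigned int pos;

		for (pos = 1; pos <= 15; pos++)
			if (pos & parity_pos)
				printf("%u ", pos);
		printf("\n");	/* prints: 4 5 6 7 12 13 14 15 */
		return 0;
	}
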
index 751bc4d..8e3a369 100644 (file)
@@ -2871,9 +2871,15 @@ int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
 
        status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
                                    0, 0);
-       if (status < 0)
+       if (status < 0) {
                mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
 
+               if (ex)
+                       up_write(&osb->nfs_sync_rwlock);
+               else
+                       up_read(&osb->nfs_sync_rwlock);
+       }
+
        return status;
 }
 
index 2dd71d6..7993d52 100644 (file)
@@ -327,8 +327,8 @@ struct ocfs2_super
        spinlock_t osb_lock;
        u32 s_next_generation;
        unsigned long osb_flags;
-       s16 s_inode_steal_slot;
-       s16 s_meta_steal_slot;
+       u16 s_inode_steal_slot;
+       u16 s_meta_steal_slot;
        atomic_t s_num_inodes_stolen;
        atomic_t s_num_meta_stolen;
 
index 45745cc..8c8cf7f 100644 (file)
@@ -879,9 +879,9 @@ static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
 {
        spin_lock(&osb->osb_lock);
        if (type == INODE_ALLOC_SYSTEM_INODE)
-               osb->s_inode_steal_slot = slot;
+               osb->s_inode_steal_slot = (u16)slot;
        else if (type == EXTENT_ALLOC_SYSTEM_INODE)
-               osb->s_meta_steal_slot = slot;
+               osb->s_meta_steal_slot = (u16)slot;
        spin_unlock(&osb->osb_lock);
 }
 
index f0a5d30..50b3625 100644 (file)
@@ -40,7 +40,7 @@ struct ocfs2_alloc_context {
 
        u64    ac_last_group;
        u64    ac_max_block;  /* Highest block number to allocate. 0 is
-                                is the same as ~0 - unlimited */
+                                the same as ~0 - unlimited */
 
        int    ac_find_loc_only;  /* hack for reflink operation ordering */
        struct ocfs2_suballoc_result *ac_find_loc_priv; /* */
index 71ea9ce..1d91dd1 100644 (file)
@@ -78,7 +78,7 @@ struct mount_options
        unsigned long   commit_interval;
        unsigned long   mount_opt;
        unsigned int    atime_quantum;
-       signed short    slot;
+       unsigned short  slot;
        int             localalloc_opt;
        unsigned int    resv_level;
        int             dir_resv_level;
@@ -1349,7 +1349,7 @@ static int ocfs2_parse_options(struct super_block *sb,
                                goto bail;
                        }
                        if (option)
-                               mopt->slot = (s16)option;
+                               mopt->slot = (u16)option;
                        break;
                case Opt_commit:
                        if (match_int(&args[0], &option)) {
index e9a6841..887a553 100644 (file)
@@ -41,7 +41,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 
        si_meminfo(&i);
        si_swapinfo(&i);
-       committed = percpu_counter_read_positive(&vm_committed_as);
+       committed = vm_memory_committed();
 
        cached = global_node_page_state(NR_FILE_PAGES) -
                        total_swapcache_pages() - i.bufferram;
@@ -52,8 +52,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
                pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
 
        available = si_mem_available();
-       sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE);
-       sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE);
+       sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B);
+       sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);
 
        show_val_kb(m, "MemTotal:       ", i.totalram);
        show_val_kb(m, "MemFree:        ", i.freeram);
@@ -101,10 +101,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
        show_val_kb(m, "SReclaimable:   ", sreclaimable);
        show_val_kb(m, "SUnreclaim:     ", sunreclaim);
        seq_printf(m, "KernelStack:    %8lu kB\n",
-                  global_zone_page_state(NR_KERNEL_STACK_KB));
+                  global_node_page_state(NR_KERNEL_STACK_KB));
 #ifdef CONFIG_SHADOW_CALL_STACK
        seq_printf(m, "ShadowCallStack:%8lu kB\n",
-                  global_zone_page_state(NR_KERNEL_SCS_KB));
+                  global_node_page_state(NR_KERNEL_SCS_KB));
 #endif
        show_val_kb(m, "PageTables:     ",
                    global_zone_page_state(NR_PAGETABLE));
index 73f7421..6f44810 100644 (file)
@@ -102,6 +102,86 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
        __free_page(pte_page);
 }
 
+
+#if CONFIG_PGTABLE_LEVELS > 2
+
+#ifndef __HAVE_ARCH_PMD_ALLOC_ONE
+/**
+ * pmd_alloc_one - allocate a page for PMD-level page table
+ * @mm: the mm_struct of the current context
+ *
+ * Allocates a page and runs the pgtable_pmd_page_ctor().
+ * Allocations use %GFP_PGTABLE_USER in user context and
+ * %GFP_PGTABLE_KERNEL in kernel context.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+       struct page *page;
+       gfp_t gfp = GFP_PGTABLE_USER;
+
+       if (mm == &init_mm)
+               gfp = GFP_PGTABLE_KERNEL;
+       page = alloc_pages(gfp, 0);
+       if (!page)
+               return NULL;
+       if (!pgtable_pmd_page_ctor(page)) {
+               __free_pages(page, 0);
+               return NULL;
+       }
+       return (pmd_t *)page_address(page);
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMD_FREE
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+{
+       BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
+       pgtable_pmd_page_dtor(virt_to_page(pmd));
+       free_page((unsigned long)pmd);
+}
+#endif
+
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
+
+#if CONFIG_PGTABLE_LEVELS > 3
+
+#ifndef __HAVE_ARCH_PUD_ALLOC_ONE

+/**
+ * pud_alloc_one - allocate a page for PUD-level page table
+ * @mm: the mm_struct of the current context
+ *
+ * Allocates a page using %GFP_PGTABLE_USER for user context and
+ * %GFP_PGTABLE_KERNEL for kernel context.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+       gfp_t gfp = GFP_PGTABLE_USER;
+
+       if (mm == &init_mm)
+               gfp = GFP_PGTABLE_KERNEL;
+       return (pud_t *)get_zeroed_page(gfp);
+}
+#endif
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+       BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
+       free_page((unsigned long)pud);
+}
+
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
+
+#ifndef __HAVE_ARCH_PGD_FREE
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+       free_page((unsigned long)pgd);
+}
+#endif
+
 #endif /* CONFIG_MMU */
 
 #endif /* __ASM_GENERIC_PGALLOC_H */
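
    A hedged sketch of how an architecture opts out of one of the generic helpers added
    above while keeping the rest: define the matching __HAVE_ARCH_* guard before the
    generic header is included, then supply the replacement. "pud_cache" is a made-up
    kmem_cache standing in for whatever arch-specific constraint makes the generic
    get_zeroed_page() version unsuitable.

	/* in the arch's <asm/pgalloc.h> */
	#define __HAVE_ARCH_PUD_ALLOC_ONE
	static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
	{
		/* GFP_PGTABLE_USER already implies __GFP_ZERO */
		return kmem_cache_alloc(pud_cache, GFP_PGTABLE_USER);
	}

	#include <asm-generic/pgalloc.h>	/* generic pmd_*, pud_free(), pgd_free() still apply */
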
index ef75ec8..6661ee1 100644 (file)
@@ -14,7 +14,6 @@
 #include <linux/mmu_notifier.h>
 #include <linux/swap.h>
 #include <linux/hugetlb_inline.h>
-#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
 
index 62c6855..c32a6f5 100644 (file)
@@ -425,7 +425,7 @@ static inline struct aead_request *aead_request_alloc(struct crypto_aead *tfm,
  */
 static inline void aead_request_free(struct aead_request *req)
 {
-       kzfree(req);
+       kfree_sensitive(req);
 }
 
 /**
index 6924b09..1d3aa25 100644 (file)
@@ -207,7 +207,7 @@ static inline struct akcipher_request *akcipher_request_alloc(
  */
 static inline void akcipher_request_free(struct akcipher_request *req)
 {
-       kzfree(req);
+       kfree_sensitive(req);
 }
 
 /**
index fa0a63d..81330c6 100644 (file)
@@ -230,7 +230,7 @@ void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t);
 void gf128mul_x8_ble(le128 *r, const le128 *x);
 static inline void gf128mul_free_4k(struct gf128mul_4k *t)
 {
-       kzfree(t);
+       kfree_sensitive(t);
 }
 
 
index 19ce91f..0d1b403 100644 (file)
@@ -606,7 +606,7 @@ static inline struct ahash_request *ahash_request_alloc(
  */
 static inline void ahash_request_free(struct ahash_request *req)
 {
-       kzfree(req);
+       kfree_sensitive(req);
 }
 
 static inline void ahash_request_zero(struct ahash_request *req)
index cf47868..cfc47e1 100644 (file)
@@ -46,7 +46,7 @@ static inline struct acomp_req *__acomp_request_alloc(struct crypto_acomp *tfm)
 
 static inline void __acomp_request_free(struct acomp_req *req)
 {
-       kzfree(req);
+       kfree_sensitive(req);
 }
 
 /**
index cd9a9b5..88b5912 100644 (file)
@@ -187,7 +187,7 @@ static inline struct kpp_request *kpp_request_alloc(struct crypto_kpp *tfm,
  */
 static inline void kpp_request_free(struct kpp_request *req)
 {
-       kzfree(req);
+       kfree_sensitive(req);
 }
 
 /**
index 5663f71..6a733b1 100644 (file)
@@ -508,7 +508,7 @@ static inline struct skcipher_request *skcipher_request_alloc(
  */
 static inline void skcipher_request_free(struct skcipher_request *req)
 {
-       kzfree(req);
+       kfree_sensitive(req);
 }
 
 static inline void skcipher_request_zero(struct skcipher_request *req)
index 05c47f8..73db1ae 100644 (file)
@@ -606,7 +606,11 @@ extern void *efi_get_pal_addr (void);
 extern void efi_map_pal_code (void);
 extern void efi_memmap_walk (efi_freemem_callback_t callback, void *arg);
 extern void efi_gettimeofday (struct timespec64 *ts);
+#ifdef CONFIG_EFI
 extern void efi_enter_virtual_mode (void);     /* switch EFI to virtual mode, if possible */
+#else
+static inline void efi_enter_virtual_mode (void) {}
+#endif
 #ifdef CONFIG_X86
 extern efi_status_t efi_query_variable_store(u32 attributes,
                                             unsigned long size,
index 6d19769..2df72de 100644 (file)
@@ -528,7 +528,7 @@ static inline int mapping_mapped(struct address_space *mapping)
 
 /*
  * Might pages of this file have been modified in userspace?
- * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff
+ * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap
  * marks vma as VM_SHARED if it is shared, and the file was opened for
  * writing i.e. vma may be mprotected writable even if now readonly.
  *
@@ -2950,6 +2950,21 @@ extern void discard_new_inode(struct inode *);
 extern unsigned int get_next_ino(void);
 extern void evict_inodes(struct super_block *sb);
 
+/*
+ * Userspace may rely on the inode number being non-zero. For example, glibc
+ * simply ignores files with zero i_ino in unlink() and other places.
+ *
+ * As an additional complication, if userspace was compiled with
+ * _FILE_OFFSET_BITS=32 on a 64-bit kernel we'll only end up reading out the
+ * lower 32 bits, so we need to check that those aren't zero explicitly. With
+ * _FILE_OFFSET_BITS=64, this may cause some harmless false-negatives, but
+ * better safe than sorry.
+ */
+static inline bool is_zero_ino(ino_t ino)
+{
+       return (u32)ino == 0;
+}
+
 extern void __iget(struct inode * inode);
 extern void iget_failed(struct inode *);
 extern void clear_inode(struct inode *);
index 71f2077..17c4c49 100644 (file)
@@ -42,7 +42,7 @@ extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                        unsigned long addr, unsigned long end,
                        unsigned char *vec);
 extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
-                        unsigned long new_addr, unsigned long old_end,
+                        unsigned long new_addr,
                         pmd_t *old_pmd, pmd_t *new_pmd);
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                        unsigned long addr, pgprot_t newprot,
index 82522e9..087fba3 100644 (file)
@@ -38,7 +38,6 @@ extern void kasan_disable_current(void);
 void kasan_unpoison_shadow(const void *address, size_t size);
 
 void kasan_unpoison_task_stack(struct task_struct *task);
-void kasan_unpoison_stack_above_sp_to(const void *watermark);
 
 void kasan_alloc_pages(struct page *page, unsigned int order);
 void kasan_free_pages(struct page *page, unsigned int order);
@@ -101,7 +100,6 @@ void kasan_restore_multi_shot(bool enabled);
 static inline void kasan_unpoison_shadow(const void *address, size_t size) {}
 
 static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
-static inline void kasan_unpoison_stack_above_sp_to(const void *watermark) {}
 
 static inline void kasan_enable_current(void) {}
 static inline void kasan_disable_current(void) {}
@@ -174,11 +172,13 @@ static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
 
 void kasan_cache_shrink(struct kmem_cache *cache);
 void kasan_cache_shutdown(struct kmem_cache *cache);
+void kasan_record_aux_stack(void *ptr);
 
 #else /* CONFIG_KASAN_GENERIC */
 
 static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
 static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
+static inline void kasan_record_aux_stack(void *ptr) {}
 
 #endif /* CONFIG_KASAN_GENERIC */
 
index e77197a..1bb49b6 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/page-flags.h>
 
 struct mem_cgroup;
+struct obj_cgroup;
 struct page;
 struct mm_struct;
 struct kmem_cache;
@@ -31,8 +32,6 @@ struct kmem_cache;
 enum memcg_stat_item {
        MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
        MEMCG_SOCK,
-       /* XXX: why are these zone and not node counters? */
-       MEMCG_KERNEL_STACK_KB,
        MEMCG_NR_STAT,
 };
 
@@ -48,12 +47,6 @@ enum memcg_memory_event {
        MEMCG_NR_MEMORY_EVENTS,
 };
 
-enum mem_cgroup_protection {
-       MEMCG_PROT_NONE,
-       MEMCG_PROT_LOW,
-       MEMCG_PROT_MIN,
-};
-
 struct mem_cgroup_reclaim_cookie {
        pg_data_t *pgdat;
        unsigned int generation;
@@ -193,6 +186,22 @@ struct memcg_cgwb_frn {
 };
 
 /*
+ * Bucket for arbitrarily byte-sized objects charged to a memory
+ * cgroup. The bucket can be reparented in one piece when the cgroup
+ * is destroyed, without having to round up the individual references
+ * of all live memory objects in the wild.
+ */
+struct obj_cgroup {
+       struct percpu_ref refcnt;
+       struct mem_cgroup *memcg;
+       atomic_t nr_charged_bytes;
+       union {
+               struct list_head list;
+               struct rcu_head rcu;
+       };
+};
+
+/*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
@@ -300,7 +309,8 @@ struct mem_cgroup {
         /* Index in the kmem_cache->memcg_params.memcg_caches array */
        int kmemcg_id;
        enum memcg_kmem_state kmem_state;
-       struct list_head kmem_caches;
+       struct obj_cgroup __rcu *objcg;
+       struct list_head objcg_list; /* list of inherited objcgs */
 #endif
 
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -339,12 +349,49 @@ static inline bool mem_cgroup_disabled(void)
        return !cgroup_subsys_enabled(memory_cgrp_subsys);
 }
 
-static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root,
+                                                 struct mem_cgroup *memcg,
                                                  bool in_low_reclaim)
 {
        if (mem_cgroup_disabled())
                return 0;
 
+       /*
+        * There is no reclaim protection applied to a targeted reclaim.
+        * We special-case this here because the effective-protection
+        * calculation is not robust enough to keep the protection
+        * invariant when parallel reclaimers have different reclaim
+        * targets. This is especially a problem for tail memcgs (as
+        * they have pages on the LRU), which would want effective
+        * values of 0 for targeted reclaim but different values for
+        * external reclaim.
+        *
+        * Example
+        * Let's have global and A's reclaim in parallel:
+        *  |
+        *  A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G)
+        *  |\
+        *  | C (low = 1G, usage = 2.5G)
+        *  B (low = 1G, usage = 0.5G)
+        *
+        * For the global reclaim
+        * A.elow = A.low
+        * B.elow = min(B.usage, B.low) because children_low_usage <= A.elow
+        * C.elow = min(C.usage, C.low)
+        *
+        * With the effective values reset, A's reclaim sees
+        * A.elow = 0
+        * B.elow = B.low
+        * C.elow = C.low
+        *
+        * If the global reclaim races with A's reclaim then
+        * B.elow = C.elow = 0 (because children_low_usage > A.elow)
+        * is possible, and reclaiming B would violate the protection.
+        *
+        */
+       if (root == memcg)
+               return 0;
+
        if (in_low_reclaim)
                return READ_ONCE(memcg->memory.emin);
 
@@ -352,8 +399,36 @@ static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
                   READ_ONCE(memcg->memory.elow));
 }
 
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
-                                               struct mem_cgroup *memcg);
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
+                                    struct mem_cgroup *memcg);
+
+static inline bool mem_cgroup_supports_protection(struct mem_cgroup *memcg)
+{
+       /*
+        * The root memcg doesn't account charges, and doesn't support
+        * protection.
+        */
+       return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg);
+
+}
+
+static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg)
+{
+       if (!mem_cgroup_supports_protection(memcg))
+               return false;
+
+       return READ_ONCE(memcg->memory.elow) >=
+               page_counter_read(&memcg->memory);
+}
+
+static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
+{
+       if (!mem_cgroup_supports_protection(memcg))
+               return false;
+
+       return READ_ONCE(memcg->memory.emin) >=
+               page_counter_read(&memcg->memory);
+}
 
 int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);
 
@@ -416,6 +491,33 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
        return css ? container_of(css, struct mem_cgroup, css) : NULL;
 }
 
+static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg)
+{
+       return percpu_ref_tryget(&objcg->refcnt);
+}
+
+static inline void obj_cgroup_get(struct obj_cgroup *objcg)
+{
+       percpu_ref_get(&objcg->refcnt);
+}
+
+static inline void obj_cgroup_put(struct obj_cgroup *objcg)
+{
+       percpu_ref_put(&objcg->refcnt);
+}
+
+/*
+ * After initialization, objcg->memcg always points at a valid
+ * memcg, but it can be atomically swapped to the parent memcg.
+ *
+ * The caller must ensure that the returned memcg won't be released:
+ * e.g. acquire the rcu_read_lock or css_set_lock.
+ */
+static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
+{
+       return READ_ONCE(objcg->memcg);
+}
+
 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
        if (memcg)
@@ -679,11 +781,34 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
        return x;
 }
 
+void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+                             int val);
 void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
                        int val);
 void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val);
+
 void mod_memcg_obj_state(void *p, int idx, int val);
 
+static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx,
+                                        int val)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       __mod_lruvec_slab_state(p, idx, val);
+       local_irq_restore(flags);
+}
+
+static inline void mod_memcg_lruvec_state(struct lruvec *lruvec,
+                                         enum node_stat_item idx, int val)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       __mod_memcg_lruvec_state(lruvec, idx, val);
+       local_irq_restore(flags);
+}
+
 static inline void mod_lruvec_state(struct lruvec *lruvec,
                                    enum node_stat_item idx, int val)
 {
@@ -825,16 +950,26 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
 {
 }
 
-static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root,
+                                                 struct mem_cgroup *memcg,
                                                  bool in_low_reclaim)
 {
        return 0;
 }
 
-static inline enum mem_cgroup_protection mem_cgroup_protected(
-       struct mem_cgroup *root, struct mem_cgroup *memcg)
+static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root,
+                                                  struct mem_cgroup *memcg)
+{
+}
+
+static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg)
+{
+       return false;
+}
+
+static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
 {
-       return MEMCG_PROT_NONE;
+       return false;
 }
 
 static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
@@ -1057,6 +1192,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
        return node_page_state(lruvec_pgdat(lruvec), idx);
 }
 
+static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec,
+                                           enum node_stat_item idx, int val)
+{
+}
+
 static inline void __mod_lruvec_state(struct lruvec *lruvec,
                                      enum node_stat_item idx, int val)
 {
@@ -1089,6 +1229,14 @@ static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx,
        __mod_node_page_state(page_pgdat(page), idx, val);
 }
 
+static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx,
+                                        int val)
+{
+       struct page *page = virt_to_head_page(p);
+
+       mod_node_page_state(page_pgdat(page), idx, val);
+}
+
 static inline void mod_memcg_obj_state(void *p, int idx, int val)
 {
 }
@@ -1341,9 +1489,6 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
 }
 #endif
 
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);
-void memcg_kmem_put_cache(struct kmem_cache *cachep);
-
 #ifdef CONFIG_MEMCG_KMEM
 int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
                        unsigned int nr_pages);
@@ -1351,8 +1496,12 @@ void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages);
 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
 void __memcg_kmem_uncharge_page(struct page *page, int order);
 
+struct obj_cgroup *get_obj_cgroup_from_current(void);
+
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);
+
 extern struct static_key_false memcg_kmem_enabled_key;
-extern struct workqueue_struct *memcg_kmem_cache_wq;
 
 extern int memcg_nr_cache_ids;
 void memcg_get_cache_ids(void);
@@ -1368,7 +1517,19 @@ void memcg_put_cache_ids(void);
 
 static inline bool memcg_kmem_enabled(void)
 {
-       return static_branch_unlikely(&memcg_kmem_enabled_key);
+       return static_branch_likely(&memcg_kmem_enabled_key);
+}
+
+static inline bool memcg_kmem_bypass(void)
+{
+       if (in_interrupt())
+               return true;
+
+       /* Allow remote memcg charging in kthread contexts. */
+       if ((!current->mm || (current->flags & PF_KTHREAD)) &&
+            !current->active_memcg)
+               return true;
+       return false;
 }
 
 static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
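
As a usage sketch (not part of this diff), a reclaim loop is expected to compute the effective protection against its reclaim root first and then consult the new predicates; 'sc' stands in for the reclaim control state as used in mm/vmscan.c and the surrounding loop is assumed:

        /* Simplified per-memcg step inside a reclaim iteration. */
        mem_cgroup_calculate_protection(target_memcg, memcg);

        if (mem_cgroup_below_min(memcg)) {
                /* Hard protection: never reclaim from this memcg. */
                continue;
        } else if (mem_cgroup_below_low(memcg)) {
                /* Soft protection: skip unless low reclaim was requested. */
                if (!sc->memcg_low_reclaim) {
                        sc->memcg_low_skipped = 1;
                        continue;
                }
        }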
index 6c8333d..f6a82f9 100644 (file)
@@ -206,6 +206,8 @@ int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
 int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
+int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
+               loff_t *);
 
 #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
 
@@ -777,6 +779,11 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t flags)
 extern void kvfree(const void *addr);
 extern void kvfree_sensitive(const void *addr, size_t len);
 
+static inline int head_mapcount(struct page *head)
+{
+       return atomic_read(compound_mapcount_ptr(head)) + 1;
+}
+
 /*
  * Mapcount of compound page as a whole, does not include mapped sub-pages.
  *
@@ -786,7 +793,7 @@ static inline int compound_mapcount(struct page *page)
 {
        VM_BUG_ON_PAGE(!PageCompound(page), page);
        page = compound_head(page);
-       return atomic_read(compound_mapcount_ptr(page)) + 1;
+       return head_mapcount(page);
 }
 
 /*
@@ -899,11 +906,16 @@ static inline bool hpage_pincount_available(struct page *page)
        return PageCompound(page) && compound_order(page) > 1;
 }
 
+static inline int head_pincount(struct page *head)
+{
+       return atomic_read(compound_pincount_ptr(head));
+}
+
 static inline int compound_pincount(struct page *page)
 {
        VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
        page = compound_head(page);
-       return atomic_read(compound_pincount_ptr(page));
+       return head_pincount(page);
 }
 
 static inline void set_compound_order(struct page *page, unsigned int order)
@@ -2091,51 +2103,11 @@ static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d,
                NULL : pud_offset(p4d, address);
 }
 
-static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
-                                    unsigned long address,
-                                    pgtbl_mod_mask *mod_mask)
-
-{
-       if (unlikely(pgd_none(*pgd))) {
-               if (__p4d_alloc(mm, pgd, address))
-                       return NULL;
-               *mod_mask |= PGTBL_PGD_MODIFIED;
-       }
-
-       return p4d_offset(pgd, address);
-}
-
-static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d,
-                                    unsigned long address,
-                                    pgtbl_mod_mask *mod_mask)
-{
-       if (unlikely(p4d_none(*p4d))) {
-               if (__pud_alloc(mm, p4d, address))
-                       return NULL;
-               *mod_mask |= PGTBL_P4D_MODIFIED;
-       }
-
-       return pud_offset(p4d, address);
-}
-
 static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
        return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
                NULL: pmd_offset(pud, address);
 }
-
-static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
-                                    unsigned long address,
-                                    pgtbl_mod_mask *mod_mask)
-{
-       if (unlikely(pud_none(*pud))) {
-               if (__pmd_alloc(mm, pud, address))
-                       return NULL;
-               *mod_mask |= PGTBL_PUD_MODIFIED;
-       }
-
-       return pmd_offset(pud, address);
-}
 #endif /* CONFIG_MMU */
 
 #if USE_SPLIT_PTE_PTLOCKS
@@ -2251,11 +2223,6 @@ static inline void pgtable_pte_page_dtor(struct page *page)
        ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
                NULL: pte_offset_kernel(pmd, address))
 
-#define pte_alloc_kernel_track(pmd, address, mask)                     \
-       ((unlikely(pmd_none(*(pmd))) &&                                 \
-         (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
-               NULL: pte_offset_kernel(pmd, address))
-
 #if USE_SPLIT_PMD_PTLOCKS
 
 static struct page *pmd_to_page(pmd_t *pmd)
@@ -2413,9 +2380,6 @@ static inline unsigned long get_num_physpages(void)
  * for_each_valid_physical_page_range()
  *     memblock_add_node(base, size, nid)
  * free_area_init(max_zone_pfns);
- *
- * sparse_memory_present_with_active_regions() calls memory_present() for
- * each range when SPARSEMEM is enabled.
  */
 void free_area_init(unsigned long *max_zone_pfn);
 unsigned long node_map_pfn_alignment(void);
@@ -2426,7 +2390,6 @@ extern unsigned long absent_pages_in_range(unsigned long start_pfn,
 extern void get_pfn_range_for_nid(unsigned int nid,
                        unsigned long *start_pfn, unsigned long *end_pfn);
 extern unsigned long find_min_pfn_with_active_regions(void);
-extern void sparse_memory_present_with_active_regions(int nid);
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 static inline int early_pfn_to_nid(unsigned long pfn)
@@ -2577,23 +2540,13 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr,
        struct list_head *uf);
 extern unsigned long do_mmap(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot, unsigned long flags,
-       vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
-       struct list_head *uf);
+       unsigned long pgoff, unsigned long *populate, struct list_head *uf);
 extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
                       struct list_head *uf, bool downgrade);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t,
                     struct list_head *uf);
 extern int do_madvise(unsigned long start, size_t len_in, int behavior);
 
-static inline unsigned long
-do_mmap_pgoff(struct file *file, unsigned long addr,
-       unsigned long len, unsigned long prot, unsigned long flags,
-       unsigned long pgoff, unsigned long *populate,
-       struct list_head *uf)
-{
-       return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf);
-}
-
 #ifdef CONFIG_MMU
 extern int __mm_populate(unsigned long addr, unsigned long len,
                         int ignore_errors);
@@ -3009,14 +2962,15 @@ pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
 p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
 pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
 pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
-pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node);
+pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
+                           struct vmem_altmap *altmap);
 void *vmemmap_alloc_block(unsigned long size, int node);
 struct vmem_altmap;
-void *vmemmap_alloc_block_buf(unsigned long size, int node);
-void *altmap_alloc_block_buf(unsigned long size, struct vmem_altmap *altmap);
+void *vmemmap_alloc_block_buf(unsigned long size, int node,
+                             struct vmem_altmap *altmap);
 void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
 int vmemmap_populate_basepages(unsigned long start, unsigned long end,
-                              int node);
+                              int node, struct vmem_altmap *altmap);
 int vmemmap_populate(unsigned long start, unsigned long end, int node,
                struct vmem_altmap *altmap);
 void vmemmap_populate_print_last(void);
index 64ede5f..0277fba 100644 (file)
@@ -198,7 +198,10 @@ struct page {
        atomic_t _refcount;
 
 #ifdef CONFIG_MEMCG
-       struct mem_cgroup *mem_cgroup;
+       union {
+               struct mem_cgroup *mem_cgroup;
+               struct obj_cgroup **obj_cgroups;
+       };
 #endif
 
        /*
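
A hedged sketch of the byte-level charging flow that the new obj_cgroups vector enables (simplified relative to what the slab allocator actually does; per-page vector setup and error handling are omitted):

        /* Charge 'size' bytes of a kernel object to the current task's cgroup. */
        struct obj_cgroup *objcg = get_obj_cgroup_from_current();

        if (objcg && !obj_cgroup_charge(objcg, GFP_KERNEL, size)) {
                /* ... object is accounted; the owner later undoes it with: */
                obj_cgroup_uncharge(objcg, size);
        }
        if (objcg)
                obj_cgroup_put(objcg);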
index 4b08e9c..6f34c33 100644 (file)
@@ -57,8 +57,12 @@ extern struct percpu_counter vm_committed_as;
 
 #ifdef CONFIG_SMP
 extern s32 vm_committed_as_batch;
+extern void mm_compute_batch(int overcommit_policy);
 #else
 #define vm_committed_as_batch 0
+static inline void mm_compute_batch(int overcommit_policy)
+{
+}
 #endif
 
 unsigned long vm_memory_committed(void);
index c6f0708..b820078 100644 (file)
@@ -521,6 +521,16 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
        range->flags = flags;
 }
 
+static inline void mmu_notifier_range_init_migrate(
+                       struct mmu_notifier_range *range, unsigned int flags,
+                       struct vm_area_struct *vma, struct mm_struct *mm,
+                       unsigned long start, unsigned long end, void *pgmap)
+{
+       mmu_notifier_range_init(range, MMU_NOTIFY_MIGRATE, flags, vma, mm,
+                               start, end);
+       range->migrate_pgmap_owner = pgmap;
+}
+
 #define ptep_clear_flush_young_notify(__vma, __address, __ptep)                \
 ({                                                                     \
        int __young;                                                    \
@@ -645,6 +655,9 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
 
 #define mmu_notifier_range_init(range,event,flags,vma,mm,start,end)  \
        _mmu_notifier_range_init(range, start, end)
+#define mmu_notifier_range_init_migrate(range, flags, vma, mm, start, end, \
+                                       pgmap) \
+       _mmu_notifier_range_init(range, start, end)
 
 static inline bool
 mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
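
A short sketch of the intended call pattern for the new migrate variant (the vma, start/end and pgmap_owner variables are assumptions modelled on device-private page migration):

        struct mmu_notifier_range range;

        /* Pass the pgmap owner so a driver can ignore its own invalidations. */
        mmu_notifier_range_init_migrate(&range, 0, vma, vma->vm_mm,
                                        start, end, pgmap_owner);
        mmu_notifier_invalidate_range_start(&range);
        /* ... unmap and collect the pages being migrated ... */
        mmu_notifier_invalidate_range_end(&range);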
index f6f8849..635a96c 100644 (file)
@@ -88,12 +88,10 @@ static inline bool is_migrate_movable(int mt)
 
 extern int page_group_by_mobility_disabled;
 
-#define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1)
-#define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1)
+#define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1)
 
 #define get_pageblock_migratetype(page)                                        \
-       get_pfnblock_flags_mask(page, page_to_pfn(page),                \
-                       PB_migrate_end, MIGRATETYPE_MASK)
+       get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK)
 
 struct free_area {
        struct list_head        free_list[MIGRATE_TYPES];
@@ -155,10 +153,6 @@ enum zone_stat_item {
        NR_ZONE_WRITE_PENDING,  /* Count of dirty, writeback and unstable pages */
        NR_MLOCK,               /* mlock()ed pages found and moved off LRU */
        NR_PAGETABLE,           /* used for pagetables */
-       NR_KERNEL_STACK_KB,     /* measured in KiB */
-#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
-       NR_KERNEL_SCS_KB,       /* measured in KiB */
-#endif
        /* Second 128 byte cacheline */
        NR_BOUNCE,
 #if IS_ENABLED(CONFIG_ZSMALLOC)
@@ -174,8 +168,8 @@ enum node_stat_item {
        NR_INACTIVE_FILE,       /*  "     "     "   "       "         */
        NR_ACTIVE_FILE,         /*  "     "     "   "       "         */
        NR_UNEVICTABLE,         /*  "     "     "   "       "         */
-       NR_SLAB_RECLAIMABLE,
-       NR_SLAB_UNRECLAIMABLE,
+       NR_SLAB_RECLAIMABLE_B,
+       NR_SLAB_UNRECLAIMABLE_B,
        NR_ISOLATED_ANON,       /* Temporary isolated pages from anon lru */
        NR_ISOLATED_FILE,       /* Temporary isolated pages from file lru */
        WORKINGSET_NODES,
@@ -203,10 +197,34 @@ enum node_stat_item {
        NR_KERNEL_MISC_RECLAIMABLE,     /* reclaimable non-slab kernel pages */
        NR_FOLL_PIN_ACQUIRED,   /* via: pin_user_page(), gup flag: FOLL_PIN */
        NR_FOLL_PIN_RELEASED,   /* pages returned via unpin_user_page() */
+       NR_KERNEL_STACK_KB,     /* measured in KiB */
+#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
+       NR_KERNEL_SCS_KB,       /* measured in KiB */
+#endif
        NR_VM_NODE_STAT_ITEMS
 };
 
 /*
+ * Returns true if the value is measured in bytes (most vmstat values are
+ * measured in pages). This defines the API part, the internal representation
+ * might be different.
+ */
+static __always_inline bool vmstat_item_in_bytes(int idx)
+{
+       /*
+        * Global and per-node slab counters track slab pages.
+        * It's expected that changes are multiples of PAGE_SIZE.
+        * Internally values are stored in pages.
+        *
+        * Per-memcg and per-lruvec counters track memory consumed
+        * by individual slab objects. These counters are actually
+        * byte-precise.
+        */
+       return (idx == NR_SLAB_RECLAIMABLE_B ||
+               idx == NR_SLAB_UNRECLAIMABLE_B);
+}
+
+/*
  * We do arithmetic on the LRU lists in various places in the code,
  * so it is important to keep the active lists LRU_ACTIVE higher in
  * the array than the corresponding inactive lists, and to keep
@@ -819,18 +837,6 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
 
 extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx);
 
-#ifdef CONFIG_HAVE_MEMORY_PRESENT
-void memory_present(int nid, unsigned long start, unsigned long end);
-#else
-static inline void memory_present(int nid, unsigned long start, unsigned long end) {}
-#endif
-
-#if defined(CONFIG_SPARSEMEM)
-void memblocks_present(void);
-#else
-static inline void memblocks_present(void) {}
-#endif
-
 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
 int local_memory_node(int node_id);
 #else
@@ -1387,8 +1393,6 @@ struct mminit_pfnnid_cache {
 #define early_pfn_valid(pfn)   (1)
 #endif
 
-void memory_present(int nid, unsigned long start, unsigned long end);
-
 /*
  * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we
  * need to check pfn validity within that MAX_ORDER_NR_PAGES block.
index c066fec..fff52ad 100644 (file)
@@ -56,35 +56,25 @@ struct page;
 
 unsigned long get_pfnblock_flags_mask(struct page *page,
                                unsigned long pfn,
-                               unsigned long end_bitidx,
                                unsigned long mask);
 
 void set_pfnblock_flags_mask(struct page *page,
                                unsigned long flags,
                                unsigned long pfn,
-                               unsigned long end_bitidx,
                                unsigned long mask);
 
 /* Declarations for getting and setting flags. See mm/page_alloc.c */
-#define get_pageblock_flags_group(page, start_bitidx, end_bitidx) \
-       get_pfnblock_flags_mask(page, page_to_pfn(page),                \
-                       end_bitidx,                                     \
-                       (1 << (end_bitidx - start_bitidx + 1)) - 1)
-#define set_pageblock_flags_group(page, flags, start_bitidx, end_bitidx) \
-       set_pfnblock_flags_mask(page, flags, page_to_pfn(page),         \
-                       end_bitidx,                                     \
-                       (1 << (end_bitidx - start_bitidx + 1)) - 1)
-
 #ifdef CONFIG_COMPACTION
 #define get_pageblock_skip(page) \
-                       get_pageblock_flags_group(page, PB_migrate_skip,     \
-                                                       PB_migrate_skip)
+       get_pfnblock_flags_mask(page, page_to_pfn(page),        \
+                       (1 << (PB_migrate_skip)))
 #define clear_pageblock_skip(page) \
-                       set_pageblock_flags_group(page, 0, PB_migrate_skip,  \
-                                                       PB_migrate_skip)
+       set_pfnblock_flags_mask(page, 0, page_to_pfn(page),     \
+                       (1 << PB_migrate_skip))
 #define set_pageblock_skip(page) \
-                       set_pageblock_flags_group(page, 1, PB_migrate_skip,  \
-                                                       PB_migrate_skip)
+       set_pfnblock_flags_mask(page, (1 << PB_migrate_skip),   \
+                       page_to_pfn(page),                      \
+                       (1 << PB_migrate_skip))
 #else
 static inline bool get_pageblock_skip(struct page *page)
 {
index 0a4f54d..01861ee 100644 (file)
@@ -44,6 +44,7 @@ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
                              s32 batch);
 s64 __percpu_counter_sum(struct percpu_counter *fbc);
 int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
+void percpu_counter_sync(struct percpu_counter *fbc);
 
 static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
 {
@@ -172,6 +173,9 @@ static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
        return true;
 }
 
+static inline void percpu_counter_sync(struct percpu_counter *fbc)
+{
+}
 #endif /* CONFIG_SMP */
 
 static inline void percpu_counter_inc(struct percpu_counter *fbc)
index 6be66f5..85023dd 100644 (file)
@@ -175,12 +175,10 @@ static inline bool in_vfork(struct task_struct *tsk)
  * Applies per-task gfp context to the given allocation flags.
  * PF_MEMALLOC_NOIO implies GFP_NOIO
  * PF_MEMALLOC_NOFS implies GFP_NOFS
- * PF_MEMALLOC_NOCMA implies no allocation from CMA region.
  */
 static inline gfp_t current_gfp_context(gfp_t flags)
 {
-       if (unlikely(current->flags &
-                    (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_NOCMA))) {
+       if (unlikely(current->flags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS))) {
                /*
                 * NOIO implies both NOIO and NOFS and it is a weaker context
                 * so always make sure it makes precedence
@@ -189,10 +187,6 @@ static inline gfp_t current_gfp_context(gfp_t flags)
                        flags &= ~(__GFP_IO | __GFP_FS);
                else if (current->flags & PF_MEMALLOC_NOFS)
                        flags &= ~__GFP_FS;
-#ifdef CONFIG_CMA
-               if (current->flags & PF_MEMALLOC_NOCMA)
-                       flags &= ~__GFP_MOVABLE;
-#endif
        }
        return flags;
 }
index 7a35a69..a5a5d1d 100644 (file)
@@ -36,6 +36,9 @@ struct shmem_sb_info {
        unsigned char huge;         /* Whether to try for hugepages */
        kuid_t uid;                 /* Mount uid for root directory */
        kgid_t gid;                 /* Mount gid for root directory */
+       bool full_inums;            /* If i_ino should be uint or ino_t */
+       ino_t next_ino;             /* The next per-sb inode number to use */
+       ino_t __percpu *ino_batch;  /* The next per-cpu inode number to use */
        struct mempolicy *mpol;     /* default memory policy for mappings */
        spinlock_t shrinklist_lock;   /* Protects shrinklist */
        struct list_head shrinklist;  /* List of shrinkable inodes */
index 6d45488..24df239 100644 (file)
@@ -155,9 +155,6 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name,
 void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
 
-void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
-void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *);
-
 /*
  * Please use this macro to create slab caches. Simply specify the
  * name of the structure and maybe some flags that are listed above.
@@ -186,10 +183,12 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *);
  */
 void * __must_check krealloc(const void *, size_t, gfp_t);
 void kfree(const void *);
-void kzfree(const void *);
+void kfree_sensitive(const void *);
 size_t __ksize(const void *);
 size_t ksize(const void *);
 
+#define kzfree(x)      kfree_sensitive(x)      /* For backward compatibility */
+
 #ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
 void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
                        bool to_user);
@@ -578,8 +577,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
        return __kmalloc_node(size, flags, node);
 }
 
-int memcg_update_all_caches(int num_memcgs);
-
 /**
  * kmalloc_array - allocate memory for an array.
  * @n: number of elements.
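
For context (not part of this hunk), a minimal sketch of the renamed helper in use; the buffer and length names are illustrative:

        /* Zero key material before freeing so it cannot leak via the heap. */
        u8 *session_key = kmalloc(key_len, GFP_KERNEL);

        if (!session_key)
                return -ENOMEM;
        /* ... derive and use the key ... */
        kfree_sensitive(session_key);   /* clears the buffer, then kfree()s it */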
index abc7de7..9eb430c 100644 (file)
@@ -72,9 +72,6 @@ struct kmem_cache {
        int obj_offset;
 #endif /* CONFIG_DEBUG_SLAB */
 
-#ifdef CONFIG_MEMCG
-       struct memcg_cache_params memcg_params;
-#endif
 #ifdef CONFIG_KASAN
        struct kasan_cache kasan_info;
 #endif
@@ -114,4 +111,10 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache,
        return reciprocal_divide(offset, cache->reciprocal_buffer_size);
 }
 
+static inline int objs_per_slab_page(const struct kmem_cache *cache,
+                                    const struct page *page)
+{
+       return cache->num;
+}
+
 #endif /* _LINUX_SLAB_DEF_H */
index d215378..1be0ed5 100644 (file)
@@ -8,6 +8,7 @@
  * (C) 2007 SGI, Christoph Lameter
  */
 #include <linux/kobject.h>
+#include <linux/reciprocal_div.h>
 
 enum stat_item {
        ALLOC_FASTPATH,         /* Allocation from cpu slab */
@@ -86,6 +87,7 @@ struct kmem_cache {
        unsigned long min_partial;
        unsigned int size;      /* The size of an object including metadata */
        unsigned int object_size;/* The size of an object without metadata */
+       struct reciprocal_value reciprocal_size;
        unsigned int offset;    /* Free pointer offset */
 #ifdef CONFIG_SLUB_CPU_PARTIAL
        /* Number of per cpu partial objects to keep around */
@@ -106,17 +108,7 @@ struct kmem_cache {
        struct list_head list;  /* List of slab caches */
 #ifdef CONFIG_SYSFS
        struct kobject kobj;    /* For sysfs */
-       struct work_struct kobj_remove_work;
 #endif
-#ifdef CONFIG_MEMCG
-       struct memcg_cache_params memcg_params;
-       /* For propagation, maximum size of a stored attr */
-       unsigned int max_attr_size;
-#ifdef CONFIG_SYSFS
-       struct kset *memcg_kset;
-#endif
-#endif
-
 #ifdef CONFIG_SLAB_FREELIST_HARDENED
        unsigned long random;
 #endif
@@ -182,4 +174,23 @@ static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
        return result;
 }
 
+/* Determine object index from a given position */
+static inline unsigned int __obj_to_index(const struct kmem_cache *cache,
+                                         void *addr, void *obj)
+{
+       return reciprocal_divide(kasan_reset_tag(obj) - addr,
+                                cache->reciprocal_size);
+}
+
+static inline unsigned int obj_to_index(const struct kmem_cache *cache,
+                                       const struct page *page, void *obj)
+{
+       return __obj_to_index(cache, page_address(page), obj);
+}
+
+static inline int objs_per_slab_page(const struct kmem_cache *cache,
+                                    const struct page *page)
+{
+       return page->objects;
+}
 #endif /* _LINUX_SLUB_DEF_H */
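
A small sketch of the reciprocal-division scheme the new reciprocal_size field enables, using the helpers from <linux/reciprocal_div.h>; the setup line is shown only to make the example self-contained (the real initialization happens in the SLUB core at cache creation), and obj_addr/slab_base are assumed addresses:

        /* Once, at cache creation time: precompute the multiplier. */
        cache->reciprocal_size = reciprocal_value(cache->size);

        /* Per object: index = offset / cache->size, without a hardware divide. */
        unsigned int index = reciprocal_divide(obj_addr - slab_base,
                                               cache->reciprocal_size);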
index 5b3216b..7eb59bc 100644 (file)
@@ -328,7 +328,6 @@ void workingset_update_node(struct xa_node *node);
 /* linux/mm/page_alloc.c */
 extern unsigned long totalreserve_pages;
 extern unsigned long nr_free_buffer_pages(void);
-extern unsigned long nr_free_pagecache_pages(void);
 
 /* Definition of global_zone_page_state not available yet */
 #define nr_free_pages() global_zone_page_state(NR_FREE_PAGES)
@@ -372,7 +371,6 @@ extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
-extern unsigned long vm_total_pages;
 
 extern unsigned long reclaim_pages(struct list_head *page_list);
 #ifdef CONFIG_NUMA
index aa96108..91220ac 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/vm_event_item.h>
 #include <linux/atomic.h>
 #include <linux/static_key.h>
+#include <linux/mmdebug.h>
 
 extern int sysctl_stat_interval;
 
@@ -192,7 +193,8 @@ static inline unsigned long global_zone_page_state(enum zone_stat_item item)
        return x;
 }
 
-static inline unsigned long global_node_page_state(enum node_stat_item item)
+static inline
+unsigned long global_node_page_state_pages(enum node_stat_item item)
 {
        long x = atomic_long_read(&vm_node_stat[item]);
 #ifdef CONFIG_SMP
@@ -202,6 +204,13 @@ static inline unsigned long global_node_page_state(enum node_stat_item item)
        return x;
 }
 
+static inline unsigned long global_node_page_state(enum node_stat_item item)
+{
+       VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
+       return global_node_page_state_pages(item);
+}
+
 static inline unsigned long zone_page_state(struct zone *zone,
                                        enum zone_stat_item item)
 {
@@ -242,9 +251,12 @@ extern unsigned long sum_zone_node_page_state(int node,
 extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item);
 extern unsigned long node_page_state(struct pglist_data *pgdat,
                                                enum node_stat_item item);
+extern unsigned long node_page_state_pages(struct pglist_data *pgdat,
+                                          enum node_stat_item item);
 #else
 #define sum_zone_node_page_state(node, item) global_zone_page_state(item)
 #define node_page_state(node, item) global_node_page_state(item)
+#define node_page_state_pages(node, item) global_node_page_state_pages(item)
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_SMP
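
A short illustration (not from the patch) of how callers are expected to treat the now byte-based slab counters:

        /* Byte-based items must go through the page-converting accessor. */
        unsigned long slab_pages =
                global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
                global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);

        /* Page-based items keep using the old accessor, which now warns
         * (once, under VM debugging) if handed a byte-based item. */
        unsigned long anon_pages = global_node_page_state(NR_ACTIVE_ANON);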
index 9082ed3..d6a0b31 100644 (file)
@@ -1913,9 +1913,8 @@ config SLAB_MERGE_DEFAULT
          command line.
 
 config SLAB_FREELIST_RANDOM
-       default n
+       bool "Randomize slab freelist"
        depends on SLAB || SLUB
-       bool "SLAB freelist randomization"
        help
          Randomizes the freelist order used on creating new pages. This
          security feature reduces the predictability of the kernel slab
@@ -1923,12 +1922,14 @@ config SLAB_FREELIST_RANDOM
 
 config SLAB_FREELIST_HARDENED
        bool "Harden slab freelist metadata"
-       depends on SLUB
+       depends on SLAB || SLUB
        help
          Many kernel heap attacks try to target slab cache metadata and
          other infrastructure. This option makes minor performance
          sacrifices to harden the kernel slab allocator against common
-         freelist exploit methods.
+         freelist exploit methods. Some slab implementations have more
+         sanity-checking than others. This option is most effective with
+         CONFIG_SLUB.
 
 config SHUFFLE_PAGE_ALLOCATOR
        bool "Page allocator randomization"
index 276ca31..de2f9fa 100644 (file)
@@ -830,7 +830,7 @@ void __init __weak arch_call_rest_init(void)
        rest_init();
 }
 
-asmlinkage __visible void __init start_kernel(void)
+asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
 {
        char *command_line;
        char *after_dashes;
index 0a6dd94..bf38d7e 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1558,7 +1558,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg,
                        goto invalid;
        }
 
-       addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate, NULL);
+       addr = do_mmap(file, addr, size, prot, flags, 0, &populate, NULL);
        *raddr = addr;
        err = 0;
        if (IS_ERR_VALUE(addr))
index 76d3f33..35e9894 100644 (file)
@@ -261,7 +261,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
                                             THREAD_SIZE_ORDER);
 
        if (likely(page)) {
-               tsk->stack = page_address(page);
+               tsk->stack = kasan_reset_tag(page_address(page));
                return tsk->stack;
        }
        return NULL;
@@ -276,13 +276,8 @@ static inline void free_thread_stack(struct task_struct *tsk)
        if (vm) {
                int i;
 
-               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
-                       mod_memcg_page_state(vm->pages[i],
-                                            MEMCG_KERNEL_STACK_KB,
-                                            -(int)(PAGE_SIZE / 1024));
-
+               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
                        memcg_kmem_uncharge_page(vm->pages[i], 0);
-               }
 
                for (i = 0; i < NR_CACHED_STACKS; i++) {
                        if (this_cpu_cmpxchg(cached_stacks[i],
@@ -307,6 +302,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 {
        unsigned long *stack;
        stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
+       stack = kasan_reset_tag(stack);
        tsk->stack = stack;
        return stack;
 }
@@ -382,31 +378,14 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
        void *stack = task_stack_page(tsk);
        struct vm_struct *vm = task_stack_vm_area(tsk);
 
-       BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
-
-       if (vm) {
-               int i;
-
-               BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
 
-               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
-                       mod_zone_page_state(page_zone(vm->pages[i]),
-                                           NR_KERNEL_STACK_KB,
-                                           PAGE_SIZE / 1024 * account);
-               }
-       } else {
-               /*
-                * All stack pages are in the same zone and belong to the
-                * same memcg.
-                */
-               struct page *first_page = virt_to_page(stack);
-
-               mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
-                                   THREAD_SIZE / 1024 * account);
-
-               mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB,
-                                   account * (THREAD_SIZE / 1024));
-       }
+       /* All stack pages are in the same node. */
+       if (vm)
+               mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB,
+                                     account * (THREAD_SIZE / 1024));
+       else
+               mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB,
+                                     account * (THREAD_SIZE / 1024));
 }
 
 static int memcg_charge_kernel_stack(struct task_struct *tsk)
@@ -415,24 +394,23 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk)
        struct vm_struct *vm = task_stack_vm_area(tsk);
        int ret;
 
+       BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
        if (vm) {
                int i;
 
+               BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
                for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
                        /*
                         * If memcg_kmem_charge_page() fails, page->mem_cgroup
-                        * pointer is NULL, and both memcg_kmem_uncharge_page()
-                        * and mod_memcg_page_state() in free_thread_stack()
-                        * will ignore this page. So it's safe.
+                        * pointer is NULL, and memcg_kmem_uncharge_page() in
+                        * free_thread_stack() will ignore this page.
                         */
                        ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
                                                     0);
                        if (ret)
                                return ret;
-
-                       mod_memcg_page_state(vm->pages[i],
-                                            MEMCG_KERNEL_STACK_KB,
-                                            PAGE_SIZE / 1024);
                }
        }
 #endif
index 1d9e2fd..b2807e7 100644 (file)
@@ -480,7 +480,6 @@ EXPORT_SYMBOL(kthread_bind);
  *          to "name.*%u". Code fills in cpu number.
  *
  * Description: This helper function creates and names a kernel thread
- * The thread will be woken and put into park mode.
  */
 struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
                                          void *data, unsigned int cpu,
@@ -1241,13 +1240,16 @@ void kthread_use_mm(struct mm_struct *mm)
        WARN_ON_ONCE(tsk->mm);
 
        task_lock(tsk);
+       /* Hold off tlb flush IPIs while switching mm's */
+       local_irq_disable();
        active_mm = tsk->active_mm;
        if (active_mm != mm) {
                mmgrab(mm);
                tsk->active_mm = mm;
        }
        tsk->mm = mm;
-       switch_mm(active_mm, mm, tsk);
+       switch_mm_irqs_off(active_mm, mm, tsk);
+       local_irq_enable();
        task_unlock(tsk);
 #ifdef finish_arch_post_lock_switch
        finish_arch_post_lock_switch();
@@ -1276,9 +1278,11 @@ void kthread_unuse_mm(struct mm_struct *mm)
 
        task_lock(tsk);
        sync_mm_rss(mm);
+       local_irq_disable();
        tsk->mm = NULL;
        /* active_mm is still 'mm' */
        enter_lazy_tlb(mm, tsk);
+       local_irq_enable();
        task_unlock(tsk);
 }
 EXPORT_SYMBOL_GPL(kthread_unuse_mm);
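
Sketch of the usual caller pattern around these helpers (the mm, buffers and return variable are assumptions); the diff above only changes how the mm switch is serialized against TLB-flush IPIs:

        /* Temporarily adopt a user mm from a kernel thread to do uaccess. */
        kthread_use_mm(mm);
        if (copy_from_user(kbuf, ubuf, len))
                ret = -EFAULT;
        kthread_unuse_mm(mm);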
index cef1542..d25749b 100644 (file)
@@ -1663,7 +1663,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
 {
        unsigned long size;
 
-       size = global_node_page_state(NR_SLAB_RECLAIMABLE)
+       size = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B)
                + global_node_page_state(NR_ACTIVE_ANON)
                + global_node_page_state(NR_INACTIVE_ANON)
                + global_node_page_state(NR_ACTIVE_FILE)
index ac7198e..8ce77d9 100644 (file)
@@ -59,6 +59,7 @@
 #include <linux/sched/clock.h>
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
+#include <linux/kasan.h>
 #include "../time/tick-internal.h"
 
 #include "tree.h"
@@ -2890,6 +2891,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
        head->func = func;
        head->next = NULL;
        local_irq_save(flags);
+       kasan_record_aux_stack(head);
        rdp = this_cpu_ptr(&rcu_data);
 
        /* Add the callback to our list. */
index 5d4d9bb..4ff4a7b 100644 (file)
@@ -17,7 +17,7 @@ static void __scs_account(void *s, int account)
 {
        struct page *scs_page = virt_to_page(s);
 
-       mod_zone_page_state(page_zone(scs_page), NR_KERNEL_SCS_KB,
+       mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB,
                            account * (SCS_SIZE / SZ_1K));
 }
 
index 1b4d2dc..f785de3 100644 (file)
@@ -2671,7 +2671,7 @@ static struct ctl_table vm_table[] = {
                .data           = &sysctl_overcommit_memory,
                .maxlen         = sizeof(sysctl_overcommit_memory),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
+               .proc_handler   = overcommit_policy_handler,
                .extra1         = SYSCTL_ZERO,
                .extra2         = &two,
        },
index 34b84bc..047b53d 100644 (file)
@@ -18,7 +18,7 @@ config CC_HAS_KASAN_SW_TAGS
 config CC_HAS_WORKING_NOSANITIZE_ADDRESS
        def_bool !CC_IS_GCC || GCC_VERSION >= 80300
 
-config KASAN
+menuconfig KASAN
        bool "KASAN: runtime memory debugger"
        depends on (HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC) || \
                   (HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS)
@@ -29,9 +29,10 @@ config KASAN
          designed to find out-of-bounds accesses and use-after-free bugs.
          See Documentation/dev-tools/kasan.rst for details.
 
+if KASAN
+
 choice
        prompt "KASAN mode"
-       depends on KASAN
        default KASAN_GENERIC
        help
          KASAN has two modes: generic KASAN (similar to userspace ASan,
@@ -39,6 +40,7 @@ choice
          software tag-based KASAN (a version based on software memory
          tagging, arm64 only, similar to userspace HWASan, enabled with
          CONFIG_KASAN_SW_TAGS).
+
          Both generic and tag-based KASAN are strictly debugging features.
 
 config KASAN_GENERIC
@@ -50,16 +52,18 @@ config KASAN_GENERIC
        select STACKDEPOT
        help
          Enables generic KASAN mode.
-         Supported in both GCC and Clang. With GCC it requires version 4.9.2
-         or later for basic support and version 5.0 or later for detection of
-         out-of-bounds accesses for stack and global variables and for inline
-         instrumentation mode (CONFIG_KASAN_INLINE). With Clang it requires
-         version 3.7.0 or later and it doesn't support detection of
-         out-of-bounds accesses for global variables yet.
+
+         This mode is supported in both GCC and Clang. With GCC it requires
+         version 8.3.0 or later. With Clang it requires version 7.0.0 or
+         later, but detection of out-of-bounds accesses for global variables
+         is supported only since Clang 11.
+
          This mode consumes about 1/8th of available memory at kernel start
          and introduces an overhead of ~x1.5 for the rest of the allocations.
          The performance slowdown is ~x3.
+
          For better error detection enable CONFIG_STACKTRACE.
+
          Currently CONFIG_KASAN_GENERIC doesn't work with CONFIG_DEBUG_SLAB
          (the resulting kernel does not boot).
 
@@ -72,15 +76,19 @@ config KASAN_SW_TAGS
        select STACKDEPOT
        help
          Enables software tag-based KASAN mode.
+
          This mode requires Top Byte Ignore support by the CPU and therefore
-         is only supported for arm64.
-         This mode requires Clang version 7.0.0 or later.
+         is only supported for arm64. This mode requires Clang version 7.0.0
+         or later.
+
          This mode consumes about 1/16th of available memory at kernel start
          and introduces an overhead of ~20% for the rest of the allocations.
          This mode may potentially introduce problems relating to pointer
          casting and comparison, as it embeds tags into the top byte of each
          pointer.
+
          For better error detection enable CONFIG_STACKTRACE.
+
          Currently CONFIG_KASAN_SW_TAGS doesn't work with CONFIG_DEBUG_SLAB
          (the resulting kernel does not boot).
 
@@ -88,7 +96,6 @@ endchoice
 
 choice
        prompt "Instrumentation type"
-       depends on KASAN
        default KASAN_OUTLINE
 
 config KASAN_OUTLINE
@@ -107,13 +114,11 @@ config KASAN_INLINE
          memory accesses. This is faster than outline (in some workloads
          it gives about x2 boost over outline instrumentation), but
          makes the kernel's .text size much bigger.
-         For CONFIG_KASAN_GENERIC this requires GCC 5.0 or later.
 
 endchoice
 
 config KASAN_STACK_ENABLE
        bool "Enable stack instrumentation (unsafe)" if CC_IS_CLANG && !COMPILE_TEST
-       depends on KASAN
        help
          The LLVM stack address sanitizer has a known problem that
          causes excessive stack usage in a lot of functions, see
@@ -134,7 +139,7 @@ config KASAN_STACK
 
 config KASAN_S390_4_LEVEL_PAGING
        bool "KASan: use 4-level paging"
-       depends on KASAN && S390
+       depends on S390
        help
          Compiling the kernel with KASan disables automatic 3-level vs
          4-level paging selection. 3-level paging is used by default (up
@@ -151,7 +156,7 @@ config KASAN_SW_TAGS_IDENTIFY
 
 config KASAN_VMALLOC
        bool "Back mappings in vmalloc space with real shadow memory"
-       depends on KASAN && HAVE_ARCH_KASAN_VMALLOC
+       depends on HAVE_ARCH_KASAN_VMALLOC
        help
          By default, the shadow region for vmalloc space is the read-only
          zero page. This means that KASAN cannot detect errors involving
@@ -164,8 +169,10 @@ config KASAN_VMALLOC
 
 config TEST_KASAN
        tristate "Module for testing KASAN for bug detection"
-       depends on m && KASAN
+       depends on m
        help
          This is a test module doing various nasty things like
          out of bounds accesses, use after free. It is useful for testing
          kernel debugging features like KASAN.
+
+endif # KASAN
index 435f7f1..f399621 100644 (file)
@@ -37,7 +37,6 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
         nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o
 
 lib-$(CONFIG_PRINTK) += dump_stack.o
-lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
 
 lib-y  += kobject.o klist.o
index 20ed0f7..4cd2b33 100644 (file)
@@ -69,7 +69,7 @@ void mpi_free_limb_space(mpi_ptr_t a)
        if (!a)
                return;
 
-       kzfree(a);
+       kfree_sensitive(a);
 }
 
 void mpi_assign_limb_space(MPI a, mpi_ptr_t ap, unsigned nlimbs)
@@ -95,7 +95,7 @@ int mpi_resize(MPI a, unsigned nlimbs)
                if (!p)
                        return -ENOMEM;
                memcpy(p, a->d, a->alloced * sizeof(mpi_limb_t));
-               kzfree(a->d);
+               kfree_sensitive(a->d);
                a->d = p;
        } else {
                a->d = kcalloc(nlimbs, sizeof(mpi_limb_t), GFP_KERNEL);
@@ -112,7 +112,7 @@ void mpi_free(MPI a)
                return;
 
        if (a->flags & 4)
-               kzfree(a->d);
+               kfree_sensitive(a->d);
        else
                mpi_free_limb_space(a->d);
 
index a66595b..a2345de 100644 (file)
@@ -99,6 +99,25 @@ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
 EXPORT_SYMBOL(percpu_counter_add_batch);
 
 /*
+ * For a percpu_counter with a big batch, the deviation of its count can
+ * be large, and sometimes that deviation needs to be reduced, e.g. when
+ * the counter's batch is decreased at runtime for better accuracy. This
+ * can be achieved by running this sync function on each CPU.
+ */
+void percpu_counter_sync(struct percpu_counter *fbc)
+{
+       unsigned long flags;
+       s64 count;
+
+       raw_spin_lock_irqsave(&fbc->lock, flags);
+       count = __this_cpu_read(*fbc->counters);
+       fbc->count += count;
+       __this_cpu_sub(*fbc->counters, count);
+       raw_spin_unlock_irqrestore(&fbc->lock, flags);
+}
+EXPORT_SYMBOL(percpu_counter_sync);
+
+/*
  * Add up all the per-cpu counts, return the result.  This is a more accurate
  * but much slower version of percpu_counter_read_positive()
  */
index dc2c6a5..53e953b 100644 (file)
 
 #include <asm/page.h>
 
+#include "../mm/kasan/kasan.h"
+
+#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_SHADOW_SCALE_SIZE)
+
 /*
  * We assign some test results to these globals to make sure the tests
  * are not eliminated as dead code.
@@ -48,7 +52,8 @@ static noinline void __init kmalloc_oob_right(void)
                return;
        }
 
-       ptr[size] = 'x';
+       ptr[size + OOB_TAG_OFF] = 'x';
+
        kfree(ptr);
 }
 
@@ -100,7 +105,8 @@ static noinline void __init kmalloc_pagealloc_oob_right(void)
                return;
        }
 
-       ptr[size] = 0;
+       ptr[size + OOB_TAG_OFF] = 0;
+
        kfree(ptr);
 }
 
@@ -170,7 +176,8 @@ static noinline void __init kmalloc_oob_krealloc_more(void)
                return;
        }
 
-       ptr2[size2] = 'x';
+       ptr2[size2 + OOB_TAG_OFF] = 'x';
+
        kfree(ptr2);
 }
 
@@ -188,7 +195,9 @@ static noinline void __init kmalloc_oob_krealloc_less(void)
                kfree(ptr1);
                return;
        }
-       ptr2[size2] = 'x';
+
+       ptr2[size2 + OOB_TAG_OFF] = 'x';
+
        kfree(ptr2);
 }
 
@@ -224,7 +233,8 @@ static noinline void __init kmalloc_oob_memset_2(void)
                return;
        }
 
-       memset(ptr+7, 0, 2);
+       memset(ptr + 7 + OOB_TAG_OFF, 0, 2);
+
        kfree(ptr);
 }
 
@@ -240,7 +250,8 @@ static noinline void __init kmalloc_oob_memset_4(void)
                return;
        }
 
-       memset(ptr+5, 0, 4);
+       memset(ptr + 5 + OOB_TAG_OFF, 0, 4);
+
        kfree(ptr);
 }
 
@@ -257,7 +268,8 @@ static noinline void __init kmalloc_oob_memset_8(void)
                return;
        }
 
-       memset(ptr+1, 0, 8);
+       memset(ptr + 1 + OOB_TAG_OFF, 0, 8);
+
        kfree(ptr);
 }
 
@@ -273,7 +285,8 @@ static noinline void __init kmalloc_oob_memset_16(void)
                return;
        }
 
-       memset(ptr+1, 0, 16);
+       memset(ptr + 1 + OOB_TAG_OFF, 0, 16);
+
        kfree(ptr);
 }
 
@@ -289,7 +302,8 @@ static noinline void __init kmalloc_oob_in_memset(void)
                return;
        }
 
-       memset(ptr, 0, size+5);
+       memset(ptr, 0, size + 5 + OOB_TAG_OFF);
+
        kfree(ptr);
 }
 
@@ -423,7 +437,8 @@ static noinline void __init kmem_cache_oob(void)
                return;
        }
 
-       *p = p[size];
+       *p = p[size + OOB_TAG_OFF];
+
        kmem_cache_free(cache, p);
        kmem_cache_destroy(cache);
 }
@@ -473,7 +488,7 @@ static noinline void __init kasan_global_oob(void)
 static noinline void __init kasan_stack_oob(void)
 {
        char stack_array[10];
-       volatile int i = 0;
+       volatile int i = OOB_TAG_OFF;
        char *p = &stack_array[ARRAY_SIZE(stack_array) + i];
 
        pr_info("out-of-bounds on stack\n");
@@ -520,25 +535,25 @@ static noinline void __init copy_user_test(void)
        }
 
        pr_info("out-of-bounds in copy_from_user()\n");
-       unused = copy_from_user(kmem, usermem, size + 1);
+       unused = copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF);
 
        pr_info("out-of-bounds in copy_to_user()\n");
-       unused = copy_to_user(usermem, kmem, size + 1);
+       unused = copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF);
 
        pr_info("out-of-bounds in __copy_from_user()\n");
-       unused = __copy_from_user(kmem, usermem, size + 1);
+       unused = __copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF);
 
        pr_info("out-of-bounds in __copy_to_user()\n");
-       unused = __copy_to_user(usermem, kmem, size + 1);
+       unused = __copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF);
 
        pr_info("out-of-bounds in __copy_from_user_inatomic()\n");
-       unused = __copy_from_user_inatomic(kmem, usermem, size + 1);
+       unused = __copy_from_user_inatomic(kmem, usermem, size + 1 + OOB_TAG_OFF);
 
        pr_info("out-of-bounds in __copy_to_user_inatomic()\n");
-       unused = __copy_to_user_inatomic(usermem, kmem, size + 1);
+       unused = __copy_to_user_inatomic(usermem, kmem, size + 1 + OOB_TAG_OFF);
 
        pr_info("out-of-bounds in strncpy_from_user()\n");
-       unused = strncpy_from_user(kmem, usermem, size + 1);
+       unused = strncpy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF);
 
        vm_munmap((unsigned long)usermem, PAGE_SIZE);
        kfree(kmem);
@@ -766,15 +781,15 @@ static noinline void __init kmalloc_double_kzfree(void)
        char *ptr;
        size_t size = 16;
 
-       pr_info("double-free (kzfree)\n");
+       pr_info("double-free (kfree_sensitive)\n");
        ptr = kmalloc(size, GFP_KERNEL);
        if (!ptr) {
                pr_err("Allocation failed\n");
                return;
        }
 
-       kzfree(ptr);
-       kzfree(ptr);
+       kfree_sensitive(ptr);
+       kfree_sensitive(ptr);
 }
 
 #ifdef CONFIG_KASAN_VMALLOC
@@ -801,6 +816,35 @@ static noinline void __init vmalloc_oob(void)
 static void __init vmalloc_oob(void) {}
 #endif
 
+static struct kasan_rcu_info {
+       int i;
+       struct rcu_head rcu;
+} *global_rcu_ptr;
+
+static noinline void __init kasan_rcu_reclaim(struct rcu_head *rp)
+{
+       struct kasan_rcu_info *fp = container_of(rp,
+                                               struct kasan_rcu_info, rcu);
+
+       kfree(fp);
+       fp->i = 1;
+}
+
+static noinline void __init kasan_rcu_uaf(void)
+{
+       struct kasan_rcu_info *ptr;
+
+       pr_info("use-after-free in kasan_rcu_reclaim\n");
+       ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL);
+       if (!ptr) {
+               pr_err("Allocation failed\n");
+               return;
+       }
+
+       global_rcu_ptr = rcu_dereference_protected(ptr, NULL);
+       call_rcu(&global_rcu_ptr->rcu, kasan_rcu_reclaim);
+}
+
 static int __init kmalloc_tests_init(void)
 {
        /*
@@ -848,6 +892,7 @@ static int __init kmalloc_tests_init(void)
        kasan_bitops();
        kmalloc_double_kzfree();
        vmalloc_oob();
+       kasan_rcu_uaf();
 
        kasan_restore_multi_shot(multishot);
 
index d41f3fa..6c97488 100644 (file)
@@ -88,13 +88,9 @@ config NEED_MULTIPLE_NODES
        def_bool y
        depends on DISCONTIGMEM || NUMA
 
-config HAVE_MEMORY_PRESENT
-       def_bool y
-       depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
-
 #
 # SPARSEMEM_EXTREME (which is the default) does some bootmem
-# allocations when memory_present() is called.  If this cannot
+# allocations when sparse_init() is called.  If this cannot
 # be done on your architecture, select this option.  However,
 # statically allocating the mem_section[] array can potentially
 # consume vast quantities of .bss, so be careful.
index 6e9d46b..d5649f1 100644 (file)
@@ -38,7 +38,7 @@ mmu-y                 := nommu.o
 mmu-$(CONFIG_MMU)      := highmem.o memory.o mincore.o \
                           mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
                           msync.o page_vma_mapped.o pagewalk.o \
-                          pgtable-generic.o rmap.o vmalloc.o
+                          pgtable-generic.o rmap.o vmalloc.o ioremap.o
 
 
 ifdef CONFIG_CROSS_MEMORY_ATTACH
index 4f37651..ca8d1ca 100644 (file)
@@ -69,8 +69,19 @@ void __dump_page(struct page *page, const char *reason)
        }
 
        if (page < head || (page >= head + MAX_ORDER_NR_PAGES)) {
-               /* Corrupt page, cannot call page_mapping */
-               mapping = page->mapping;
+               /*
+                * Corrupt page, so we cannot call page_mapping. Instead, do a
+                * safe subset of the steps that page_mapping() does. Caution:
+                * this will be misleading for tail pages, PageSwapCache pages,
+                * and potentially other situations. (See the page_mapping()
+                * implementation for what's missing here.)
+                */
+               unsigned long tmp = (unsigned long)page->mapping;
+
+               if (tmp & PAGE_MAPPING_ANON)
+                       mapping = NULL;
+               else
+                       mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS);
                head = page;
                compound = false;
        } else {
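
For orientation, the bits masked off from page->mapping above are the pointer type tags from include/linux/page-flags.h (reproduced here as a reminder; they are not part of this diff):

    #define PAGE_MAPPING_ANON	0x1	/* pointer is an anon_vma, not an address_space */
    #define PAGE_MAPPING_MOVABLE	0x2
    #define PAGE_MAPPING_KSM	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
    #define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
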
@@ -84,86 +95,76 @@ void __dump_page(struct page *page, const char *reason)
         */
        mapcount = PageSlab(head) ? 0 : page_mapcount(page);
 
-       if (compound)
+       pr_warn("page:%p refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
+                       page, page_ref_count(head), mapcount, mapping,
+                       page_to_pgoff(page), page_to_pfn(page));
+       if (compound) {
                if (hpage_pincount_available(page)) {
-                       pr_warn("page:%px refcount:%d mapcount:%d mapping:%p "
-                               "index:%#lx head:%px order:%u "
-                               "compound_mapcount:%d compound_pincount:%d\n",
-                               page, page_ref_count(head), mapcount,
-                               mapping, page_to_pgoff(page), head,
-                               compound_order(head), compound_mapcount(page),
-                               compound_pincount(page));
+                       pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n",
+                                       head, compound_order(head),
+                                       head_mapcount(head),
+                                       head_pincount(head));
                } else {
-                       pr_warn("page:%px refcount:%d mapcount:%d mapping:%p "
-                               "index:%#lx head:%px order:%u "
-                               "compound_mapcount:%d\n",
-                               page, page_ref_count(head), mapcount,
-                               mapping, page_to_pgoff(page), head,
-                               compound_order(head), compound_mapcount(page));
+                       pr_warn("head:%p order:%u compound_mapcount:%d\n",
+                                       head, compound_order(head),
+                                       head_mapcount(head));
                }
-       else
-               pr_warn("page:%px refcount:%d mapcount:%d mapping:%p index:%#lx\n",
-                       page, page_ref_count(page), mapcount,
-                       mapping, page_to_pgoff(page));
+       }
        if (PageKsm(page))
                type = "ksm ";
        else if (PageAnon(page))
                type = "anon ";
        else if (mapping) {
-               const struct inode *host;
+               struct inode *host;
                const struct address_space_operations *a_ops;
-               const struct hlist_node *dentry_first;
-               const struct dentry *dentry_ptr;
+               struct hlist_node *dentry_first;
+               struct dentry *dentry_ptr;
                struct dentry dentry;
 
                /*
                 * mapping can be invalid pointer and we don't want to crash
                 * accessing it, so probe everything depending on it carefully
                 */
-               if (copy_from_kernel_nofault(&host, &mapping->host,
-                                       sizeof(struct inode *)) ||
-                   copy_from_kernel_nofault(&a_ops, &mapping->a_ops,
-                               sizeof(struct address_space_operations *))) {
-                       pr_warn("failed to read mapping->host or a_ops, mapping not a valid kernel address?\n");
+               if (get_kernel_nofault(host, &mapping->host) ||
+                   get_kernel_nofault(a_ops, &mapping->a_ops)) {
+                       pr_warn("failed to read mapping contents, not a valid kernel address?\n");
                        goto out_mapping;
                }
 
                if (!host) {
-                       pr_warn("mapping->a_ops:%ps\n", a_ops);
+                       pr_warn("aops:%ps\n", a_ops);
                        goto out_mapping;
                }
 
-               if (copy_from_kernel_nofault(&dentry_first,
-                       &host->i_dentry.first, sizeof(struct hlist_node *))) {
-                       pr_warn("mapping->a_ops:%ps with invalid mapping->host inode address %px\n",
-                               a_ops, host);
+               if (get_kernel_nofault(dentry_first, &host->i_dentry.first)) {
+                       pr_warn("aops:%ps with invalid host inode %px\n",
+                                       a_ops, host);
                        goto out_mapping;
                }
 
                if (!dentry_first) {
-                       pr_warn("mapping->a_ops:%ps\n", a_ops);
+                       pr_warn("aops:%ps ino:%lx\n", a_ops, host->i_ino);
                        goto out_mapping;
                }
 
                dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
-               if (copy_from_kernel_nofault(&dentry, dentry_ptr,
-                                                       sizeof(struct dentry))) {
-                       pr_warn("mapping->aops:%ps with invalid mapping->host->i_dentry.first %px\n",
-                               a_ops, dentry_ptr);
+               if (get_kernel_nofault(dentry, dentry_ptr)) {
+                       pr_warn("aops:%ps with invalid dentry %px\n", a_ops,
+                                       dentry_ptr);
                } else {
                        /*
                         * if dentry is corrupted, the %pd handler may still
                         * crash, but it's unlikely that we reach here with a
                         * corrupted struct page
                         */
-                       pr_warn("mapping->aops:%ps dentry name:\"%pd\"\n",
-                                                               a_ops, &dentry);
+                       pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n",
+                                       a_ops, host->i_ino, &dentry);
                }
        }
 out_mapping:
        BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
 
-       pr_warn("%sflags: %#lx(%pGp)%s\n", type, page->flags, &page->flags,
+       pr_warn("%sflags: %#lx(%pGp)%s\n", type, head->flags, &head->flags,
                page_cma ? " CMA" : "");
 
 hex_only:
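
The conversion in this hunk leans on get_kernel_nofault() being a typed wrapper that infers the access size from its destination, whereas copy_from_kernel_nofault() needs the destination address and byte count spelled out. Roughly, the two forms below are equivalent (an illustrative sketch, not a quote of the headers):

    struct inode *host;

    /* before: explicit destination pointer and size */
    if (copy_from_kernel_nofault(&host, &mapping->host, sizeof(host)))
            goto out_mapping;

    /* after: type and size taken from 'host' itself */
    if (get_kernel_nofault(host, &mapping->host))
            goto out_mapping;
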
index d315ff5..086309f 100644 (file)
@@ -8,7 +8,7 @@
  *
  * Author: Anshuman Khandual <anshuman.khandual@arm.com>
  */
-#define pr_fmt(fmt) "debug_vm_pgtable: %s: " fmt, __func__
+#define pr_fmt(fmt) "debug_vm_pgtable: [%-25s]: " fmt, __func__
 
 #include <linux/gfp.h>
 #include <linux/highmem.h>
@@ -21,6 +21,7 @@
 #include <linux/module.h>
 #include <linux/pfn_t.h>
 #include <linux/printk.h>
+#include <linux/pgtable.h>
 #include <linux/random.h>
 #include <linux/spinlock.h>
 #include <linux/swap.h>
 #include <linux/start_kernel.h>
 #include <linux/sched/mm.h>
 #include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Please refer to Documentation/vm/arch_pgtable_helpers.rst for the semantics
+ * expectations that are being validated here. All future changes here or in
+ * the documentation need to be kept in sync.
+ */
 
 #define VMFLAGS        (VM_READ|VM_WRITE|VM_EXEC)
 
@@ -46,6 +54,7 @@ static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
 {
        pte_t pte = pfn_pte(pfn, prot);
 
+       pr_debug("Validating PTE basic\n");
        WARN_ON(!pte_same(pte, pte));
        WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte))));
        WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte))));
@@ -55,6 +64,57 @@ static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
        WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte))));
 }
 
+static void __init pte_advanced_tests(struct mm_struct *mm,
+                                     struct vm_area_struct *vma, pte_t *ptep,
+                                     unsigned long pfn, unsigned long vaddr,
+                                     pgprot_t prot)
+{
+       pte_t pte = pfn_pte(pfn, prot);
+
+       pr_debug("Validating PTE advanced\n");
+       pte = pfn_pte(pfn, prot);
+       set_pte_at(mm, vaddr, ptep, pte);
+       ptep_set_wrprotect(mm, vaddr, ptep);
+       pte = ptep_get(ptep);
+       WARN_ON(pte_write(pte));
+
+       pte = pfn_pte(pfn, prot);
+       set_pte_at(mm, vaddr, ptep, pte);
+       ptep_get_and_clear(mm, vaddr, ptep);
+       pte = ptep_get(ptep);
+       WARN_ON(!pte_none(pte));
+
+       pte = pfn_pte(pfn, prot);
+       pte = pte_wrprotect(pte);
+       pte = pte_mkclean(pte);
+       set_pte_at(mm, vaddr, ptep, pte);
+       pte = pte_mkwrite(pte);
+       pte = pte_mkdirty(pte);
+       ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
+       pte = ptep_get(ptep);
+       WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
+
+       pte = pfn_pte(pfn, prot);
+       set_pte_at(mm, vaddr, ptep, pte);
+       ptep_get_and_clear_full(mm, vaddr, ptep, 1);
+       pte = ptep_get(ptep);
+       WARN_ON(!pte_none(pte));
+
+       pte = pte_mkyoung(pte);
+       set_pte_at(mm, vaddr, ptep, pte);
+       ptep_test_and_clear_young(vma, vaddr, ptep);
+       pte = ptep_get(ptep);
+       WARN_ON(pte_young(pte));
+}
+
+static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+{
+       pte_t pte = pfn_pte(pfn, prot);
+
+       pr_debug("Validating PTE saved write\n");
+       WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
+       WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte))));
+}
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
 {
@@ -63,6 +123,7 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
        if (!has_transparent_hugepage())
                return;
 
+       pr_debug("Validating PMD basic\n");
        WARN_ON(!pmd_same(pmd, pmd));
        WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd))));
        WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd))));
@@ -77,6 +138,95 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
        WARN_ON(!pmd_bad(pmd_mkhuge(pmd)));
 }
 
+static void __init pmd_advanced_tests(struct mm_struct *mm,
+                                     struct vm_area_struct *vma, pmd_t *pmdp,
+                                     unsigned long pfn, unsigned long vaddr,
+                                     pgprot_t prot)
+{
+       pmd_t pmd = pfn_pmd(pfn, prot);
+
+       if (!has_transparent_hugepage())
+               return;
+
+       pr_debug("Validating PMD advanced\n");
+       /* Align the address wrt HPAGE_PMD_SIZE */
+       vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
+
+       pmd = pfn_pmd(pfn, prot);
+       set_pmd_at(mm, vaddr, pmdp, pmd);
+       pmdp_set_wrprotect(mm, vaddr, pmdp);
+       pmd = READ_ONCE(*pmdp);
+       WARN_ON(pmd_write(pmd));
+
+       pmd = pfn_pmd(pfn, prot);
+       set_pmd_at(mm, vaddr, pmdp, pmd);
+       pmdp_huge_get_and_clear(mm, vaddr, pmdp);
+       pmd = READ_ONCE(*pmdp);
+       WARN_ON(!pmd_none(pmd));
+
+       pmd = pfn_pmd(pfn, prot);
+       pmd = pmd_wrprotect(pmd);
+       pmd = pmd_mkclean(pmd);
+       set_pmd_at(mm, vaddr, pmdp, pmd);
+       pmd = pmd_mkwrite(pmd);
+       pmd = pmd_mkdirty(pmd);
+       pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
+       pmd = READ_ONCE(*pmdp);
+       WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
+
+       pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+       set_pmd_at(mm, vaddr, pmdp, pmd);
+       pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
+       pmd = READ_ONCE(*pmdp);
+       WARN_ON(!pmd_none(pmd));
+
+       pmd = pmd_mkyoung(pmd);
+       set_pmd_at(mm, vaddr, pmdp, pmd);
+       pmdp_test_and_clear_young(vma, vaddr, pmdp);
+       pmd = READ_ONCE(*pmdp);
+       WARN_ON(pmd_young(pmd));
+}
+
+static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
+{
+       pmd_t pmd = pfn_pmd(pfn, prot);
+
+       pr_debug("Validating PMD leaf\n");
+       /*
+        * PMD based THP is a leaf entry.
+        */
+       pmd = pmd_mkhuge(pmd);
+       WARN_ON(!pmd_leaf(pmd));
+}
+
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
+{
+       pmd_t pmd;
+
+       if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+               return;
+
+       pr_debug("Validating PMD huge\n");
+       /*
+        * X86 defined pmd_set_huge() verifies that the given
+        * PMD is not a populated non-leaf entry.
+        */
+       WRITE_ONCE(*pmdp, __pmd(0));
+       WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
+       WARN_ON(!pmd_clear_huge(pmdp));
+       pmd = READ_ONCE(*pmdp);
+       WARN_ON(!pmd_none(pmd));
+}
+
+static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+{
+       pmd_t pmd = pfn_pmd(pfn, prot);
+
+       pr_debug("Validating PMD saved write\n");
+       WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
+       WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
+}
+
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
 {
@@ -85,6 +235,7 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
        if (!has_transparent_hugepage())
                return;
 
+       pr_debug("Validating PUD basic\n");
        WARN_ON(!pud_same(pud, pud));
        WARN_ON(!pud_young(pud_mkyoung(pud_mkold(pud))));
        WARN_ON(!pud_write(pud_mkwrite(pud_wrprotect(pud))));
@@ -100,18 +251,130 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
         */
        WARN_ON(!pud_bad(pud_mkhuge(pud)));
 }
+
+static void __init pud_advanced_tests(struct mm_struct *mm,
+                                     struct vm_area_struct *vma, pud_t *pudp,
+                                     unsigned long pfn, unsigned long vaddr,
+                                     pgprot_t prot)
+{
+       pud_t pud = pfn_pud(pfn, prot);
+
+       if (!has_transparent_hugepage())
+               return;
+
+       pr_debug("Validating PUD advanced\n");
+       /* Align the address wrt HPAGE_PUD_SIZE */
+       vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE;
+
+       set_pud_at(mm, vaddr, pudp, pud);
+       pudp_set_wrprotect(mm, vaddr, pudp);
+       pud = READ_ONCE(*pudp);
+       WARN_ON(pud_write(pud));
+
+#ifndef __PAGETABLE_PMD_FOLDED
+       pud = pfn_pud(pfn, prot);
+       set_pud_at(mm, vaddr, pudp, pud);
+       pudp_huge_get_and_clear(mm, vaddr, pudp);
+       pud = READ_ONCE(*pudp);
+       WARN_ON(!pud_none(pud));
+
+       pud = pfn_pud(pfn, prot);
+       set_pud_at(mm, vaddr, pudp, pud);
+       pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
+       pud = READ_ONCE(*pudp);
+       WARN_ON(!pud_none(pud));
+#endif /* __PAGETABLE_PMD_FOLDED */
+       pud = pfn_pud(pfn, prot);
+       pud = pud_wrprotect(pud);
+       pud = pud_mkclean(pud);
+       set_pud_at(mm, vaddr, pudp, pud);
+       pud = pud_mkwrite(pud);
+       pud = pud_mkdirty(pud);
+       pudp_set_access_flags(vma, vaddr, pudp, pud, 1);
+       pud = READ_ONCE(*pudp);
+       WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
+
+       pud = pud_mkyoung(pud);
+       set_pud_at(mm, vaddr, pudp, pud);
+       pudp_test_and_clear_young(vma, vaddr, pudp);
+       pud = READ_ONCE(*pudp);
+       WARN_ON(pud_young(pud));
+}
+
+static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
+{
+       pud_t pud = pfn_pud(pfn, prot);
+
+       pr_debug("Validating PUD leaf\n");
+       /*
+        * PUD based THP is a leaf entry.
+        */
+       pud = pud_mkhuge(pud);
+       WARN_ON(!pud_leaf(pud));
+}
+
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+{
+       pud_t pud;
+
+       if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+               return;
+
+       pr_debug("Validating PUD huge\n");
+       /*
+        * X86 defined pud_set_huge() verifies that the given
+        * PUD is not a populated non-leaf entry.
+        */
+       WRITE_ONCE(*pudp, __pud(0));
+       WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
+       WARN_ON(!pud_clear_huge(pudp));
+       pud = READ_ONCE(*pudp);
+       WARN_ON(!pud_none(pud));
+}
 #else  /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_advanced_tests(struct mm_struct *mm,
+                                     struct vm_area_struct *vma, pud_t *pudp,
+                                     unsigned long pfn, unsigned long vaddr,
+                                     pgprot_t prot)
+{
+}
+static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+{
+}
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 #else  /* !CONFIG_TRANSPARENT_HUGEPAGE */
 static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { }
 static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_advanced_tests(struct mm_struct *mm,
+                                     struct vm_area_struct *vma, pmd_t *pmdp,
+                                     unsigned long pfn, unsigned long vaddr,
+                                     pgprot_t prot)
+{
+}
+static void __init pud_advanced_tests(struct mm_struct *mm,
+                                     struct vm_area_struct *vma, pud_t *pudp,
+                                     unsigned long pfn, unsigned long vaddr,
+                                     pgprot_t prot)
+{
+}
+static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
+{
+}
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+{
+}
+static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
 {
        p4d_t p4d;
 
+       pr_debug("Validating P4D basic\n");
        memset(&p4d, RANDOM_NZVALUE, sizeof(p4d_t));
        WARN_ON(!p4d_same(p4d, p4d));
 }
@@ -120,6 +383,7 @@ static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot)
 {
        pgd_t pgd;
 
+       pr_debug("Validating PGD basic\n");
        memset(&pgd, RANDOM_NZVALUE, sizeof(pgd_t));
        WARN_ON(!pgd_same(pgd, pgd));
 }
@@ -132,6 +396,7 @@ static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp)
        if (mm_pmd_folded(mm))
                return;
 
+       pr_debug("Validating PUD clear\n");
        pud = __pud(pud_val(pud) | RANDOM_ORVALUE);
        WRITE_ONCE(*pudp, pud);
        pud_clear(pudp);
@@ -146,6 +411,8 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
 
        if (mm_pmd_folded(mm))
                return;
+
+       pr_debug("Validating PUD populate\n");
        /*
         * This entry points to next level page table page.
         * Hence this must not qualify as pud_bad().
@@ -172,6 +439,7 @@ static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp)
        if (mm_pud_folded(mm))
                return;
 
+       pr_debug("Validating P4D clear\n");
        p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE);
        WRITE_ONCE(*p4dp, p4d);
        p4d_clear(p4dp);
@@ -187,6 +455,7 @@ static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
        if (mm_pud_folded(mm))
                return;
 
+       pr_debug("Validating P4D populate\n");
        /*
         * This entry points to next level page table page.
         * Hence this must not qualify as p4d_bad().
@@ -205,6 +474,7 @@ static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp)
        if (mm_p4d_folded(mm))
                return;
 
+       pr_debug("Validating PGD clear\n");
        pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE);
        WRITE_ONCE(*pgdp, pgd);
        pgd_clear(pgdp);
@@ -220,6 +490,7 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
        if (mm_p4d_folded(mm))
                return;
 
+       pr_debug("Validating PGD populate\n");
        /*
         * This entry points to next level page table page.
         * Hence this must not qualify as pgd_bad().
@@ -248,6 +519,7 @@ static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
 {
        pte_t pte = ptep_get(ptep);
 
+       pr_debug("Validating PTE clear\n");
        pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
        set_pte_at(mm, vaddr, ptep, pte);
        barrier();
@@ -260,6 +532,7 @@ static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp)
 {
        pmd_t pmd = READ_ONCE(*pmdp);
 
+       pr_debug("Validating PMD clear\n");
        pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE);
        WRITE_ONCE(*pmdp, pmd);
        pmd_clear(pmdp);
@@ -272,6 +545,7 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
 {
        pmd_t pmd;
 
+       pr_debug("Validating PMD populate\n");
        /*
         * This entry points to next level page table page.
         * Hence this must not qualify as pmd_bad().
@@ -282,6 +556,344 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
        WARN_ON(pmd_bad(pmd));
 }
 
+static void __init pte_special_tests(unsigned long pfn, pgprot_t prot)
+{
+       pte_t pte = pfn_pte(pfn, prot);
+
+       if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL))
+               return;
+
+       pr_debug("Validating PTE special\n");
+       WARN_ON(!pte_special(pte_mkspecial(pte)));
+}
+
+static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot)
+{
+       pte_t pte = pfn_pte(pfn, prot);
+
+       if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+               return;
+
+       pr_debug("Validating PTE protnone\n");
+       WARN_ON(!pte_protnone(pte));
+       WARN_ON(!pte_present(pte));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot)
+{
+       pmd_t pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+
+       if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+               return;
+
+       pr_debug("Validating PMD protnone\n");
+       WARN_ON(!pmd_protnone(pmd));
+       WARN_ON(!pmd_present(pmd));
+}
+#else  /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
+static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot)
+{
+       pte_t pte = pfn_pte(pfn, prot);
+
+       pr_debug("Validating PTE devmap\n");
+       WARN_ON(!pte_devmap(pte_mkdevmap(pte)));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot)
+{
+       pmd_t pmd = pfn_pmd(pfn, prot);
+
+       pr_debug("Validating PMD devmap\n");
+       WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd)));
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot)
+{
+       pud_t pud = pfn_pud(pfn, prot);
+
+       pr_debug("Validating PUD devmap\n");
+       WARN_ON(!pud_devmap(pud_mkdevmap(pud)));
+}
+#else  /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#else  /* CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#else
+static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
+
+static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+       pte_t pte = pfn_pte(pfn, prot);
+
+       if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+               return;
+
+       pr_debug("Validating PTE soft dirty\n");
+       WARN_ON(!pte_soft_dirty(pte_mksoft_dirty(pte)));
+       WARN_ON(pte_soft_dirty(pte_clear_soft_dirty(pte)));
+}
+
+static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+       pte_t pte = pfn_pte(pfn, prot);
+
+       if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+               return;
+
+       pr_debug("Validating PTE swap soft dirty\n");
+       WARN_ON(!pte_swp_soft_dirty(pte_swp_mksoft_dirty(pte)));
+       WARN_ON(pte_swp_soft_dirty(pte_swp_clear_soft_dirty(pte)));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+       pmd_t pmd = pfn_pmd(pfn, prot);
+
+       if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+               return;
+
+       pr_debug("Validating PMD soft dirty\n");
+       WARN_ON(!pmd_soft_dirty(pmd_mksoft_dirty(pmd)));
+       WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd)));
+}
+
+static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+       pmd_t pmd = pfn_pmd(pfn, prot);
+
+       if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) ||
+               !IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION))
+               return;
+
+       pr_debug("Validating PMD swap soft dirty\n");
+       WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
+       WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
+}
+#else  /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot)
+{
+       swp_entry_t swp;
+       pte_t pte;
+
+       pr_debug("Validating PTE swap\n");
+       pte = pfn_pte(pfn, prot);
+       swp = __pte_to_swp_entry(pte);
+       pte = __swp_entry_to_pte(swp);
+       WARN_ON(pfn != pte_pfn(pte));
+}
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot)
+{
+       swp_entry_t swp;
+       pmd_t pmd;
+
+       pr_debug("Validating PMD swap\n");
+       pmd = pfn_pmd(pfn, prot);
+       swp = __pmd_to_swp_entry(pmd);
+       pmd = __swp_entry_to_pmd(swp);
+       WARN_ON(pfn != pmd_pfn(pmd));
+}
+#else  /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
+static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
+static void __init swap_migration_tests(void)
+{
+       struct page *page;
+       swp_entry_t swp;
+
+       if (!IS_ENABLED(CONFIG_MIGRATION))
+               return;
+
+       pr_debug("Validating swap migration\n");
+       /*
+        * swap_migration_tests() requires a dedicated page as it needs to
+        * be locked before creating a migration entry from it. Locking the
+        * page that actually maps kernel text ('start_kernel') can be real
+        * page that actually maps kernel text ('start_kernel') can be really
+        * problematic. Let's allocate a dedicated page explicitly for this
+        */
+       page = alloc_page(GFP_KERNEL);
+       if (!page) {
+               pr_err("page allocation failed\n");
+               return;
+       }
+
+       /*
+        * make_migration_entry() expects given page to be
+        * locked, otherwise it stumbles upon a BUG_ON().
+        */
+       __SetPageLocked(page);
+       swp = make_migration_entry(page, 1);
+       WARN_ON(!is_migration_entry(swp));
+       WARN_ON(!is_write_migration_entry(swp));
+
+       make_migration_entry_read(&swp);
+       WARN_ON(!is_migration_entry(swp));
+       WARN_ON(is_write_migration_entry(swp));
+
+       swp = make_migration_entry(page, 0);
+       WARN_ON(!is_migration_entry(swp));
+       WARN_ON(is_write_migration_entry(swp));
+       __ClearPageLocked(page);
+       __free_page(page);
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+       struct page *page;
+       pte_t pte;
+
+       pr_debug("Validating HugeTLB basic\n");
+       /*
+        * Accessing the page associated with the pfn is safe here,
+        * as it was previously derived from a real kernel symbol.
+        */
+       page = pfn_to_page(pfn);
+       pte = mk_huge_pte(page, prot);
+
+       WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte)));
+       WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte))));
+       WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte))));
+
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+       pte = pfn_pte(pfn, prot);
+
+       WARN_ON(!pte_huge(pte_mkhuge(pte)));
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+}
+
+static void __init hugetlb_advanced_tests(struct mm_struct *mm,
+                                         struct vm_area_struct *vma,
+                                         pte_t *ptep, unsigned long pfn,
+                                         unsigned long vaddr, pgprot_t prot)
+{
+       struct page *page = pfn_to_page(pfn);
+       pte_t pte = ptep_get(ptep);
+       unsigned long paddr = __pfn_to_phys(pfn) & PMD_MASK;
+
+       pr_debug("Validating HugeTLB advanced\n");
+       pte = pte_mkhuge(mk_pte(pfn_to_page(PHYS_PFN(paddr)), prot));
+       set_huge_pte_at(mm, vaddr, ptep, pte);
+       barrier();
+       WARN_ON(!pte_same(pte, huge_ptep_get(ptep)));
+       huge_pte_clear(mm, vaddr, ptep, PMD_SIZE);
+       pte = huge_ptep_get(ptep);
+       WARN_ON(!huge_pte_none(pte));
+
+       pte = mk_huge_pte(page, prot);
+       set_huge_pte_at(mm, vaddr, ptep, pte);
+       barrier();
+       huge_ptep_set_wrprotect(mm, vaddr, ptep);
+       pte = huge_ptep_get(ptep);
+       WARN_ON(huge_pte_write(pte));
+
+       pte = mk_huge_pte(page, prot);
+       set_huge_pte_at(mm, vaddr, ptep, pte);
+       barrier();
+       huge_ptep_get_and_clear(mm, vaddr, ptep);
+       pte = huge_ptep_get(ptep);
+       WARN_ON(!huge_pte_none(pte));
+
+       pte = mk_huge_pte(page, prot);
+       pte = huge_pte_wrprotect(pte);
+       set_huge_pte_at(mm, vaddr, ptep, pte);
+       barrier();
+       pte = huge_pte_mkwrite(pte);
+       pte = huge_pte_mkdirty(pte);
+       huge_ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
+       pte = huge_ptep_get(ptep);
+       WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte)));
+}
+#else  /* !CONFIG_HUGETLB_PAGE */
+static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init hugetlb_advanced_tests(struct mm_struct *mm,
+                                         struct vm_area_struct *vma,
+                                         pte_t *ptep, unsigned long pfn,
+                                         unsigned long vaddr, pgprot_t prot)
+{
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
+{
+       pmd_t pmd;
+
+       if (!has_transparent_hugepage())
+               return;
+
+       pr_debug("Validating PMD based THP\n");
+       /*
+        * pmd_trans_huge() and pmd_present() must return true after
+        * MMU invalidation with pmd_mkinvalid(). This behavior is an
+        * optimization for transparent huge pages. pmd_trans_huge() must
+        * be true if pmd_page() returns a valid THP to avoid taking the
+        * pmd_lock when others walk over non-transhuge pmds (i.e. there
+        * are no THPs allocated). Especially when splitting a THP and
+        * removing the present bit from the pmd, pmd_trans_huge() still
+        * needs to return true. pmd_present() should be true whenever
+        * pmd_trans_huge() returns true.
+        */
+       pmd = pfn_pmd(pfn, prot);
+       WARN_ON(!pmd_trans_huge(pmd_mkhuge(pmd)));
+
+#ifndef __HAVE_ARCH_PMDP_INVALIDATE
+       WARN_ON(!pmd_trans_huge(pmd_mkinvalid(pmd_mkhuge(pmd))));
+       WARN_ON(!pmd_present(pmd_mkinvalid(pmd_mkhuge(pmd))));
+#endif /* __HAVE_ARCH_PMDP_INVALIDATE */
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
+{
+       pud_t pud;
+
+       if (!has_transparent_hugepage())
+               return;
+
+       pr_debug("Validating PUD based THP\n");
+       pud = pfn_pud(pfn, prot);
+       WARN_ON(!pud_trans_huge(pud_mkhuge(pud)));
+
+       /*
+        * pud_mkinvalid() has been dropped for now. Enable back
+        * these tests when it comes back with a modified pud_present().
+        *
+        * WARN_ON(!pud_trans_huge(pud_mkinvalid(pud_mkhuge(pud))));
+        * WARN_ON(!pud_present(pud_mkinvalid(pud_mkhuge(pud))));
+        */
+}
+#else  /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#else  /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 static unsigned long __init get_random_vaddr(void)
 {
        unsigned long random_vaddr, random_pages, total_user_pages;
@@ -296,6 +908,7 @@ static unsigned long __init get_random_vaddr(void)
 
 static int __init debug_vm_pgtable(void)
 {
+       struct vm_area_struct *vma;
        struct mm_struct *mm;
        pgd_t *pgdp;
        p4d_t *p4dp, *saved_p4dp;
@@ -303,7 +916,7 @@ static int __init debug_vm_pgtable(void)
        pmd_t *pmdp, *saved_pmdp, pmd;
        pte_t *ptep;
        pgtable_t saved_ptep;
-       pgprot_t prot;
+       pgprot_t prot, protnone;
        phys_addr_t paddr;
        unsigned long vaddr, pte_aligned, pmd_aligned;
        unsigned long pud_aligned, p4d_aligned, pgd_aligned;
@@ -319,6 +932,18 @@ static int __init debug_vm_pgtable(void)
        }
 
        /*
+        * __P000 (or even __S000) will help create page table entries with
+        * PROT_NONE permission as required for pxx_protnone_tests().
+        */
+       protnone = __P000;
+
+       vma = vm_area_alloc(mm);
+       if (!vma) {
+               pr_err("vma allocation failed\n");
+               return 1;
+       }
+
+       /*
         * PFN for mapping at PTE level is determined from a standard kernel
         * text symbol. But pfns for higher page table levels are derived by
         * masking lower bits of this real pfn. These derived pfns might not
@@ -366,6 +991,20 @@ static int __init debug_vm_pgtable(void)
        p4d_clear_tests(mm, p4dp);
        pgd_clear_tests(mm, pgdp);
 
+       pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+       pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot);
+       pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
+       hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+
+       pmd_leaf_tests(pmd_aligned, prot);
+       pud_leaf_tests(pud_aligned, prot);
+
+       pmd_huge_tests(pmdp, pmd_aligned, prot);
+       pud_huge_tests(pudp, pud_aligned, prot);
+
+       pte_savedwrite_tests(pte_aligned, prot);
+       pmd_savedwrite_tests(pmd_aligned, prot);
+
        pte_unmap_unlock(ptep, ptl);
 
        pmd_populate_tests(mm, pmdp, saved_ptep);
@@ -373,11 +1012,34 @@ static int __init debug_vm_pgtable(void)
        p4d_populate_tests(mm, p4dp, saved_pudp);
        pgd_populate_tests(mm, pgdp, saved_p4dp);
 
+       pte_special_tests(pte_aligned, prot);
+       pte_protnone_tests(pte_aligned, protnone);
+       pmd_protnone_tests(pmd_aligned, protnone);
+
+       pte_devmap_tests(pte_aligned, prot);
+       pmd_devmap_tests(pmd_aligned, prot);
+       pud_devmap_tests(pud_aligned, prot);
+
+       pte_soft_dirty_tests(pte_aligned, prot);
+       pmd_soft_dirty_tests(pmd_aligned, prot);
+       pte_swap_soft_dirty_tests(pte_aligned, prot);
+       pmd_swap_soft_dirty_tests(pmd_aligned, prot);
+
+       pte_swap_tests(pte_aligned, prot);
+       pmd_swap_tests(pmd_aligned, prot);
+
+       swap_migration_tests();
+       hugetlb_basic_tests(pte_aligned, prot);
+
+       pmd_thp_tests(pmd_aligned, prot);
+       pud_thp_tests(pud_aligned, prot);
+
        p4d_free(mm, saved_p4dp);
        pud_free(mm, saved_pudp);
        pmd_free(mm, saved_pmdp);
        pte_free(mm, saved_ptep);
 
+       vm_area_free(vma);
        mm_dec_nr_puds(mm);
        mm_dec_nr_pmds(mm);
        mm_dec_nr_ptes(mm);
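
Every helper added to debug_vm_pgtable.c in this series follows the same pattern: bail out early when the relevant config is off, print a pr_debug() banner, then WARN_ON() round-trip checks against the architecture's page table accessors. A hypothetical template (not part of the patch; the config symbol and name are invented for illustration):

    static void __init pte_example_tests(unsigned long pfn, pgprot_t prot)
    {
            pte_t pte = pfn_pte(pfn, prot);

            if (!IS_ENABLED(CONFIG_EXAMPLE_FEATURE))        /* hypothetical guard */
                    return;

            pr_debug("Validating PTE example\n");
            /* round-trip style check: setter followed by the matching predicate */
            WARN_ON(!pte_dirty(pte_mkdirty(pte)));
    }
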
index 9f131f1..f2bb5ff 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/delayacct.h>
 #include <linux/psi.h>
 #include <linux/ramfs.h>
+#include <linux/page_idle.h>
 #include "internal.h"
 
 #define CREATE_TRACE_POINTS
@@ -1648,6 +1649,9 @@ EXPORT_SYMBOL(find_lock_entry);
  * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
  *   page is already in cache.  If the page was allocated, unlock it before
  *   returning so the caller can do the same dance.
+ * * %FGP_WRITE - The page will be written to
+ * * %FGP_NOFS - __GFP_FS will be cleared from the gfp mask
+ * * %FGP_NOWAIT - Don't block on the page lock
  *
  * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
  * if the %GFP flags specified for %FGP_CREAT are atomic.
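
For context, these FGP_* bits are what callers pass to pagecache_get_page(); a typical lookup-or-create call might look like the sketch below, where mapping and index stand for the caller's address_space and page offset (illustrative only, not code from this patch):

    struct page *page;

    /* find the page at 'index', allocating and locking it if it is absent */
    page = pagecache_get_page(mapping, index,
                              FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
                              mapping_gfp_mask(mapping));
    if (!page)
            return -ENOMEM;
    /* ... operate on the locked page ... */
    unlock_page(page);
    put_page(page);
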
@@ -1689,6 +1693,11 @@ repeat:
 
        if (fgp_flags & FGP_ACCESSED)
                mark_page_accessed(page);
+       else if (fgp_flags & FGP_WRITE) {
+               /* Clear idle flag for buffer write */
+               if (page_is_idle(page))
+                       clear_page_idle(page);
+       }
 
 no_page:
        if (!page && (fgp_flags & FGP_CREAT)) {
index 6f47697..d8a33dd 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1404,7 +1404,8 @@ retry:
  *
  * This takes care of mlocking the pages too if VM_LOCKED is set.
  *
- * return 0 on success, negative error code on error.
+ * Return either the number of pages pinned in the vma, or a negative error
+ * code on error.
  *
  * vma->vm_mm->mmap_lock must be held.
  *
index 78c84be..206f52b 100644 (file)
@@ -1722,19 +1722,13 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd)
 }
 
 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
-                 unsigned long new_addr, unsigned long old_end,
-                 pmd_t *old_pmd, pmd_t *new_pmd)
+                 unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
 {
        spinlock_t *old_ptl, *new_ptl;
        pmd_t pmd;
        struct mm_struct *mm = vma->vm_mm;
        bool force_flush = false;
 
-       if ((old_addr & ~HPAGE_PMD_MASK) ||
-           (new_addr & ~HPAGE_PMD_MASK) ||
-           old_end - old_addr < HPAGE_PMD_SIZE)
-               return false;
-
        /*
         * The destination pmd shouldn't be established, free_pgtables()
         * should have release it.
@@ -2069,8 +2063,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
         * free), userland could trigger a small page size TLB miss on the
         * small sized TLB while the hugepage TLB entry is still established in
         * the huge TLB. Some CPU doesn't like that.
-        * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
-        * 383 on page 93. Intel should be safe but is also warns that it's
+        * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
+        * 383 on page 105. Intel should be safe but it also warns that it's
         * only safe if the permission and cache attributes of the two entries
         * loaded in the two TLB is identical (which should be the case here).
         * But it is generally safer to never allow small and huge TLB entries
index 590111e..e52c878 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/cma.h>
 
 #include <asm/page.h>
+#include <asm/pgalloc.h>
 #include <asm/tlb.h>
 
 #include <linux/io.h>
@@ -5313,25 +5314,21 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end)
 {
-       unsigned long check_addr;
+       unsigned long a_start, a_end;
 
        if (!(vma->vm_flags & VM_MAYSHARE))
                return;
 
-       for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
-               unsigned long a_start = check_addr & PUD_MASK;
-               unsigned long a_end = a_start + PUD_SIZE;
+       /* Extend the range to be PUD aligned for a worst case scenario */
+       a_start = ALIGN_DOWN(*start, PUD_SIZE);
+       a_end = ALIGN(*end, PUD_SIZE);
 
-               /*
-                * If sharing is possible, adjust start/end if necessary.
-                */
-               if (range_in_vma(vma, a_start, a_end)) {
-                       if (a_start < *start)
-                               *start = a_start;
-                       if (a_end > *end)
-                               *end = a_end;
-               }
-       }
+       /*
+        * Intersect the range with the vma range, since pmd sharing won't
+        * cross a vma boundary after all
+        */
+       *start = max(vma->vm_start, a_start);
+       *end = min(vma->vm_end, a_end);
 }
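
As a worked example of the new logic (numbers chosen purely for illustration):

    /*
     * With PUD_SIZE = 1 GiB:
     *
     *   *start = 0x40200000, *end = 0x40600000
     *   a_start = ALIGN_DOWN(0x40200000, SZ_1G) = 0x40000000
     *   a_end   = ALIGN(0x40600000, SZ_1G)      = 0x80000000
     *
     * The result is then clamped to [vma->vm_start, vma->vm_end], so a vma
     * covering only 0x40000000-0x60000000 yields the range
     * 0x40000000-0x60000000.
     */
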
 
 /*
similarity index 99%
rename from lib/ioremap.c
rename to mm/ioremap.c
index 5ee3526..5fa1ab4 100644 (file)
@@ -13,6 +13,8 @@
 #include <linux/export.h>
 #include <asm/cacheflush.h>
 
+#include "pgalloc-track.h"
+
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
 static int __read_mostly ioremap_p4d_capable;
 static int __read_mostly ioremap_pud_capable;
index 757d407..950fd37 100644 (file)
@@ -40,7 +40,7 @@
 #include "kasan.h"
 #include "../slab.h"
 
-static inline depot_stack_handle_t save_stack(gfp_t flags)
+depot_stack_handle_t kasan_save_stack(gfp_t flags)
 {
        unsigned long entries[KASAN_STACK_DEPTH];
        unsigned int nr_entries;
@@ -50,10 +50,10 @@ static inline depot_stack_handle_t save_stack(gfp_t flags)
        return stack_depot_save(entries, nr_entries, flags);
 }
 
-static inline void set_track(struct kasan_track *track, gfp_t flags)
+void kasan_set_track(struct kasan_track *track, gfp_t flags)
 {
        track->pid = current->pid;
-       track->stack = save_stack(flags);
+       track->stack = kasan_save_stack(flags);
 }
 
 void kasan_enable_current(void)
@@ -180,21 +180,6 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
        kasan_unpoison_shadow(base, watermark - base);
 }
 
-/*
- * Clear all poison for the region between the current SP and a provided
- * watermark value, as is sometimes required prior to hand-crafted asm function
- * returns in the middle of functions.
- */
-void kasan_unpoison_stack_above_sp_to(const void *watermark)
-{
-       const void *sp = __builtin_frame_address(0);
-       size_t size = watermark - sp;
-
-       if (WARN_ON(sp > watermark))
-               return;
-       kasan_unpoison_shadow(sp, size);
-}
-
 void kasan_alloc_pages(struct page *page, unsigned int order)
 {
        u8 tag;
@@ -298,24 +283,6 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
        return (void *)object + cache->kasan_info.free_meta_offset;
 }
 
-
-static void kasan_set_free_info(struct kmem_cache *cache,
-               void *object, u8 tag)
-{
-       struct kasan_alloc_meta *alloc_meta;
-       u8 idx = 0;
-
-       alloc_meta = get_alloc_info(cache, object);
-
-#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
-       idx = alloc_meta->free_track_idx;
-       alloc_meta->free_pointer_tag[idx] = tag;
-       alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
-#endif
-
-       set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
-}
-
 void kasan_poison_slab(struct page *page)
 {
        unsigned long i;
@@ -491,7 +458,7 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
                KASAN_KMALLOC_REDZONE);
 
        if (cache->flags & SLAB_KASAN)
-               set_track(&get_alloc_info(cache, object)->alloc_track, flags);
+               kasan_set_track(&get_alloc_info(cache, object)->alloc_track, flags);
 
        return set_tag(object, tag);
 }
index 098a7db..248264b 100644 (file)
@@ -324,3 +324,46 @@ DEFINE_ASAN_SET_SHADOW(f2);
 DEFINE_ASAN_SET_SHADOW(f3);
 DEFINE_ASAN_SET_SHADOW(f5);
 DEFINE_ASAN_SET_SHADOW(f8);
+
+void kasan_record_aux_stack(void *addr)
+{
+       struct page *page = kasan_addr_to_page(addr);
+       struct kmem_cache *cache;
+       struct kasan_alloc_meta *alloc_info;
+       void *object;
+
+       if (!(page && PageSlab(page)))
+               return;
+
+       cache = page->slab_cache;
+       object = nearest_obj(cache, page, addr);
+       alloc_info = get_alloc_info(cache, object);
+
+       /*
+        * record the last two call_rcu() call stacks.
+        */
+       alloc_info->aux_stack[1] = alloc_info->aux_stack[0];
+       alloc_info->aux_stack[0] = kasan_save_stack(GFP_NOWAIT);
+}
+
+void kasan_set_free_info(struct kmem_cache *cache,
+                               void *object, u8 tag)
+{
+       struct kasan_free_meta *free_meta;
+
+       free_meta = get_free_info(cache, object);
+       kasan_set_track(&free_meta->free_track, GFP_NOWAIT);
+
+       /*
+        *  the object was freed and has free track set
+        */
+       *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREETRACK;
+}
+
+struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+                               void *object, u8 tag)
+{
+       if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_KMALLOC_FREETRACK)
+               return NULL;
+       return &get_free_info(cache, object)->free_track;
+}
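
kasan_record_aux_stack() only pays off if something calls it when a callback is queued. The companion change to the RCU core is not among the hunks shown here, so the snippet below is an assumption about its shape rather than a quote of it:

    /* kernel/rcu/tree.c, __call_rcu() -- assumed shape of the companion hunk */
    head->func = func;
    head->next = NULL;
    kasan_record_aux_stack(head);   /* remember who queued this callback */
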
index e200acb..a38c7a9 100644 (file)
@@ -80,6 +80,7 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info)
                break;
        case KASAN_FREE_PAGE:
        case KASAN_KMALLOC_FREE:
+       case KASAN_KMALLOC_FREETRACK:
                bug_type = "use-after-free";
                break;
        case KASAN_ALLOCA_LEFT:
index cfade64..ac49945 100644 (file)
 #define KASAN_PAGE_REDZONE      0xFE  /* redzone for kmalloc_large allocations */
 #define KASAN_KMALLOC_REDZONE   0xFC  /* redzone inside slub object */
 #define KASAN_KMALLOC_FREE      0xFB  /* object was freed (kmem_cache_free/kfree) */
+#define KASAN_KMALLOC_FREETRACK 0xFA  /* object was freed and has free track set */
 #else
 #define KASAN_FREE_PAGE         KASAN_TAG_INVALID
 #define KASAN_PAGE_REDZONE      KASAN_TAG_INVALID
 #define KASAN_KMALLOC_REDZONE   KASAN_TAG_INVALID
 #define KASAN_KMALLOC_FREE      KASAN_TAG_INVALID
+#define KASAN_KMALLOC_FREETRACK KASAN_TAG_INVALID
 #endif
 
-#define KASAN_GLOBAL_REDZONE    0xFA  /* redzone for global variable */
-#define KASAN_VMALLOC_INVALID   0xF9  /* unallocated space in vmapped page */
+#define KASAN_GLOBAL_REDZONE    0xF9  /* redzone for global variable */
+#define KASAN_VMALLOC_INVALID   0xF8  /* unallocated space in vmapped page */
 
 /*
  * Stack redzone shadow values
@@ -104,7 +106,15 @@ struct kasan_track {
 
 struct kasan_alloc_meta {
        struct kasan_track alloc_track;
+#ifdef CONFIG_KASAN_GENERIC
+       /*
+        * call_rcu() call stack is stored into struct kasan_alloc_meta.
+        * The free stack is stored into struct kasan_free_meta.
+        */
+       depot_stack_handle_t aux_stack[2];
+#else
        struct kasan_track free_track[KASAN_NR_FREE_STACKS];
+#endif
 #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
        u8 free_pointer_tag[KASAN_NR_FREE_STACKS];
        u8 free_track_idx;
@@ -119,6 +129,9 @@ struct kasan_free_meta {
         * Otherwise it might be used for the allocator freelist.
         */
        struct qlist_node quarantine_link;
+#ifdef CONFIG_KASAN_GENERIC
+       struct kasan_track free_track;
+#endif
 };
 
 struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
@@ -159,6 +172,12 @@ void kasan_report_invalid_free(void *object, unsigned long ip);
 
 struct page *kasan_addr_to_page(const void *addr);
 
+depot_stack_handle_t kasan_save_stack(gfp_t flags);
+void kasan_set_track(struct kasan_track *track, gfp_t flags);
+void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag);
+struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+                               void *object, u8 tag);
+
 #if defined(CONFIG_KASAN_GENERIC) && \
        (defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
 void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
index 978bc4a..4c53758 100644 (file)
@@ -145,6 +145,7 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
        if (IS_ENABLED(CONFIG_SLAB))
                local_irq_save(flags);
 
+       *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE;
        ___cache_free(cache, object, _THIS_IP_);
 
        if (IS_ENABLED(CONFIG_SLAB))
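
Taken together with kasan_set_free_info() and kasan_get_free_track() in the generic.c hunk above, the first shadow byte of a freed object now doubles as a validity flag for the recorded free stack. A summary of the lifecycle visible in these hunks (comment-only sketch, not additional patch code):

    /*
     * Generic KASAN, freed-object shadow state as wired up by this series:
     *
     *   kfree()/kmem_cache_free()  ->  first shadow byte = KASAN_KMALLOC_FREETRACK
     *       kasan_set_free_info() records the free stack; kasan_get_free_track()
     *       reports it only while this marker is present.
     *
     *   object leaves quarantine   ->  first shadow byte = KASAN_KMALLOC_FREE
     *       qlink_free() downgrades the marker just before ___cache_free(), so a
     *       recycled object no longer claims a stale free track.
     */
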
index 51ec454..4f49fa6 100644 (file)
@@ -106,15 +106,20 @@ static void end_report(unsigned long *flags)
        kasan_enable_current();
 }
 
+static void print_stack(depot_stack_handle_t stack)
+{
+       unsigned long *entries;
+       unsigned int nr_entries;
+
+       nr_entries = stack_depot_fetch(stack, &entries);
+       stack_trace_print(entries, nr_entries, 0);
+}
+
 static void print_track(struct kasan_track *track, const char *prefix)
 {
        pr_err("%s by task %u:\n", prefix, track->pid);
        if (track->stack) {
-               unsigned long *entries;
-               unsigned int nr_entries;
-
-               nr_entries = stack_depot_fetch(track->stack, &entries);
-               stack_trace_print(entries, nr_entries, 0);
+               print_stack(track->stack);
        } else {
                pr_err("(stack is not available)\n");
        }
@@ -160,26 +165,6 @@ static void describe_object_addr(struct kmem_cache *cache, void *object,
                (void *)(object_addr + cache->object_size));
 }
 
-static struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
-               void *object, u8 tag)
-{
-       struct kasan_alloc_meta *alloc_meta;
-       int i = 0;
-
-       alloc_meta = get_alloc_info(cache, object);
-
-#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
-       for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
-               if (alloc_meta->free_pointer_tag[i] == tag)
-                       break;
-       }
-       if (i == KASAN_NR_FREE_STACKS)
-               i = alloc_meta->free_track_idx;
-#endif
-
-       return &alloc_meta->free_track[i];
-}
-
 static void describe_object(struct kmem_cache *cache, void *object,
                                const void *addr, u8 tag)
 {
@@ -191,8 +176,23 @@ static void describe_object(struct kmem_cache *cache, void *object,
                print_track(&alloc_info->alloc_track, "Allocated");
                pr_err("\n");
                free_track = kasan_get_free_track(cache, object, tag);
-               print_track(free_track, "Freed");
-               pr_err("\n");
+               if (free_track) {
+                       print_track(free_track, "Freed");
+                       pr_err("\n");
+               }
+
+#ifdef CONFIG_KASAN_GENERIC
+               if (alloc_info->aux_stack[0]) {
+                       pr_err("Last call_rcu():\n");
+                       print_stack(alloc_info->aux_stack[0]);
+                       pr_err("\n");
+               }
+               if (alloc_info->aux_stack[1]) {
+                       pr_err("Second to last call_rcu():\n");
+                       print_stack(alloc_info->aux_stack[1]);
+                       pr_err("\n");
+               }
+#endif
        }
 
        describe_object_addr(cache, object, addr);
index 8a959fd..e02a36a 100644 (file)
@@ -161,3 +161,40 @@ void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
        kasan_poison_shadow((void *)addr, size, tag);
 }
 EXPORT_SYMBOL(__hwasan_tag_memory);
+
+void kasan_set_free_info(struct kmem_cache *cache,
+                               void *object, u8 tag)
+{
+       struct kasan_alloc_meta *alloc_meta;
+       u8 idx = 0;
+
+       alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+       idx = alloc_meta->free_track_idx;
+       alloc_meta->free_pointer_tag[idx] = tag;
+       alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
+#endif
+
+       kasan_set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
+}
+
+struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+                               void *object, u8 tag)
+{
+       struct kasan_alloc_meta *alloc_meta;
+       int i = 0;
+
+       alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+       for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
+               if (alloc_meta->free_pointer_tag[i] == tag)
+                       break;
+       }
+       if (i == KASAN_NR_FREE_STACKS)
+               i = alloc_meta->free_track_idx;
+#endif
+
+       return &alloc_meta->free_track[i];
+}
index 700f516..b52bd46 100644 (file)
@@ -431,7 +431,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
 
 static inline int khugepaged_test_exit(struct mm_struct *mm)
 {
-       return atomic_read(&mm->mm_users) == 0;
+       return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm);
 }
 
 static bool hugepage_vma_check(struct vm_area_struct *vma,
@@ -1100,9 +1100,6 @@ static void collapse_huge_page(struct mm_struct *mm,
         * handled by the anon_vma lock + PG_lock.
         */
        mmap_write_lock(mm);
-       result = SCAN_ANY_PROCESS;
-       if (!mmget_still_valid(mm))
-               goto out;
        result = hugepage_vma_revalidate(mm, address, &vma);
        if (result)
                goto out;
@@ -1412,7 +1409,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 {
        unsigned long haddr = addr & HPAGE_PMD_MASK;
        struct vm_area_struct *vma = find_vma(mm, haddr);
-       struct page *hpage = NULL;
+       struct page *hpage;
        pte_t *start_pte, *pte;
        pmd_t *pmd, _pmd;
        spinlock_t *ptl;
@@ -1432,9 +1429,17 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
        if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
                return;
 
+       hpage = find_lock_page(vma->vm_file->f_mapping,
+                              linear_page_index(vma, haddr));
+       if (!hpage)
+               return;
+
+       if (!PageHead(hpage))
+               goto drop_hpage;
+
        pmd = mm_find_pmd(mm, haddr);
        if (!pmd)
-               return;
+               goto drop_hpage;
 
        start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
 
@@ -1453,30 +1458,11 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 
                page = vm_normal_page(vma, addr, *pte);
 
-               if (!page || !PageCompound(page))
-                       goto abort;
-
-               if (!hpage) {
-                       hpage = compound_head(page);
-                       /*
-                        * The mapping of the THP should not change.
-                        *
-                        * Note that uprobe, debugger, or MAP_PRIVATE may
-                        * change the page table, but the new page will
-                        * not pass PageCompound() check.
-                        */
-                       if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping))
-                               goto abort;
-               }
-
                /*
-                * Confirm the page maps to the correct subpage.
-                *
-                * Note that uprobe, debugger, or MAP_PRIVATE may change
-                * the page table, but the new page will not pass
-                * PageCompound() check.
+                * Note that uprobe, debugger, or MAP_PRIVATE may change the
+                * page table, but the new page will not be a subpage of hpage.
                 */
-               if (WARN_ON(hpage + i != page))
+               if (hpage + i != page)
                        goto abort;
                count++;
        }
@@ -1495,21 +1481,26 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
        pte_unmap_unlock(start_pte, ptl);
 
        /* step 3: set proper refcount and mm_counters. */
-       if (hpage) {
+       if (count) {
                page_ref_sub(hpage, count);
                add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
        }
 
        /* step 4: collapse pmd */
        ptl = pmd_lock(vma->vm_mm, pmd);
-       _pmd = pmdp_collapse_flush(vma, addr, pmd);
+       _pmd = pmdp_collapse_flush(vma, haddr, pmd);
        spin_unlock(ptl);
        mm_dec_nr_ptes(mm);
        pte_free(mm, pmd_pgtable(_pmd));
+
+drop_hpage:
+       unlock_page(hpage);
+       put_page(hpage);
        return;
 
 abort:
        pte_unmap_unlock(start_pte, ptl);
+       goto drop_hpage;
 }
 
 static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
@@ -1538,6 +1529,7 @@ out:
 static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 {
        struct vm_area_struct *vma;
+       struct mm_struct *mm;
        unsigned long addr;
        pmd_t *pmd, _pmd;
 
@@ -1566,7 +1558,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                        continue;
                if (vma->vm_end < addr + HPAGE_PMD_SIZE)
                        continue;
-               pmd = mm_find_pmd(vma->vm_mm, addr);
+               mm = vma->vm_mm;
+               pmd = mm_find_pmd(mm, addr);
                if (!pmd)
                        continue;
                /*
@@ -1576,17 +1569,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                 * mmap_lock while holding page lock. Fault path does it in
                 * reverse order. Trylock is a way to avoid deadlock.
                 */
-               if (mmap_write_trylock(vma->vm_mm)) {
-                       spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
-                       /* assume page table is clear */
-                       _pmd = pmdp_collapse_flush(vma, addr, pmd);
-                       spin_unlock(ptl);
-                       mmap_write_unlock(vma->vm_mm);
-                       mm_dec_nr_ptes(vma->vm_mm);
-                       pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+               if (mmap_write_trylock(mm)) {
+                       if (!khugepaged_test_exit(mm)) {
+                               spinlock_t *ptl = pmd_lock(mm, pmd);
+                               /* assume page table is clear */
+                               _pmd = pmdp_collapse_flush(vma, addr, pmd);
+                               spin_unlock(ptl);
+                               mm_dec_nr_ptes(mm);
+                               pte_free(mm, pmd_pgtable(_pmd));
+                       }
+                       mmap_write_unlock(mm);
                } else {
                        /* Try again later */
-                       khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
+                       khugepaged_add_pte_mapped_thp(mm, addr);
                }
        }
        i_mmap_unlock_write(mapping);
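
The retract_page_tables() change above follows a common shape: take the lock opportunistically with a trylock, re-validate the precondition (here khugepaged_test_exit()) before touching shared state, and defer the work if the trylock fails. A rough userspace analogy of that pattern, using pthreads instead of the kernel's mmap lock and an invented mm_like structure:

/* build: cc -pthread retract_demo.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct mm_like {
        pthread_mutex_t lock;           /* stands in for mmap_lock */
        bool exiting;                   /* models khugepaged_test_exit() */
        int nr_ptes;                    /* shared state touched only if valid */
};

/* Try to do the work; skip quietly if contended or if the mm is dying. */
static void retract_one(struct mm_like *mm)
{
        if (pthread_mutex_trylock(&mm->lock) != 0)
                return;                 /* contended: remember and try later */

        if (!mm->exiting) {             /* re-validate under the lock */
                mm->nr_ptes--;
                printf("retracted, nr_ptes=%d\n", mm->nr_ptes);
        } else {
                printf("mm is exiting, skipped\n");
        }

        pthread_mutex_unlock(&mm->lock);
}

int main(void)
{
        struct mm_like mm = { PTHREAD_MUTEX_INITIALIZER, false, 1 };

        retract_one(&mm);               /* precondition holds: does the work */
        mm.exiting = true;
        retract_one(&mm);               /* bails out after the re-check */
        return 0;
}
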
index 8cc617e..8d9ceea 100644 (file)
@@ -73,8 +73,6 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
 
 struct mem_cgroup *root_mem_cgroup __read_mostly;
 
-#define MEM_CGROUP_RECLAIM_RETRIES     5
-
 /* Socket memory accounting disabled? */
 static bool cgroup_memory_nosocket;
 
@@ -257,8 +255,100 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
 }
 
 #ifdef CONFIG_MEMCG_KMEM
+extern spinlock_t css_set_lock;
+
+static void obj_cgroup_release(struct percpu_ref *ref)
+{
+       struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
+       struct mem_cgroup *memcg;
+       unsigned int nr_bytes;
+       unsigned int nr_pages;
+       unsigned long flags;
+
+       /*
+        * At this point all allocated objects are freed, and
+        * objcg->nr_charged_bytes can't have an arbitrary byte value.
+        * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
+        *
+        * The following sequence can lead to it:
+        * 1) CPU0: objcg == stock->cached_objcg
+        * 2) CPU1: we do a small allocation (e.g. 92 bytes),
+        *          PAGE_SIZE bytes are charged
+        * 3) CPU1: a process from another memcg is allocating something,
+        *          the stock is flushed,
+        *          objcg->nr_charged_bytes = PAGE_SIZE - 92
+        * 4) CPU0: we release this object,
+        *          92 bytes are added to stock->nr_bytes
+        * 5) CPU0: stock is flushed,
+        *          92 bytes are added to objcg->nr_charged_bytes
+        *
+        * As a result, nr_charged_bytes == PAGE_SIZE.
+        * This page will be uncharged in obj_cgroup_release().
+        */
+       nr_bytes = atomic_read(&objcg->nr_charged_bytes);
+       WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
+       nr_pages = nr_bytes >> PAGE_SHIFT;
+
+       spin_lock_irqsave(&css_set_lock, flags);
+       memcg = obj_cgroup_memcg(objcg);
+       if (nr_pages)
+               __memcg_kmem_uncharge(memcg, nr_pages);
+       list_del(&objcg->list);
+       mem_cgroup_put(memcg);
+       spin_unlock_irqrestore(&css_set_lock, flags);
+
+       percpu_ref_exit(ref);
+       kfree_rcu(objcg, rcu);
+}
+
+static struct obj_cgroup *obj_cgroup_alloc(void)
+{
+       struct obj_cgroup *objcg;
+       int ret;
+
+       objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
+       if (!objcg)
+               return NULL;
+
+       ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
+                             GFP_KERNEL);
+       if (ret) {
+               kfree(objcg);
+               return NULL;
+       }
+       INIT_LIST_HEAD(&objcg->list);
+       return objcg;
+}
+
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
+                                 struct mem_cgroup *parent)
+{
+       struct obj_cgroup *objcg, *iter;
+
+       objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
+
+       spin_lock_irq(&css_set_lock);
+
+       /* Move active objcg to the parent's list */
+       xchg(&objcg->memcg, parent);
+       css_get(&parent->css);
+       list_add(&objcg->list, &parent->objcg_list);
+
+       /* Move already reparented objcgs to the parent's list */
+       list_for_each_entry(iter, &memcg->objcg_list, list) {
+               css_get(&parent->css);
+               xchg(&iter->memcg, parent);
+               css_put(&memcg->css);
+       }
+       list_splice(&memcg->objcg_list, &parent->objcg_list);
+
+       spin_unlock_irq(&css_set_lock);
+
+       percpu_ref_kill(&objcg->refcnt);
+}
+
 /*
- * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
+ * This will be used as a shrinker list's index.
  * The main reason for not using cgroup id for this:
  *  this works better in sparse environments, where we have a lot of memcgs,
  *  but only a few kmem-limited. Or also, if we have, for instance, 200
@@ -301,14 +391,12 @@ void memcg_put_cache_ids(void)
 
 /*
  * A lot of the calls to the cache allocation functions are expected to be
- * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
+ * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
  * conditional to this static branch, we'll have to allow modules that do
  * kmem_cache_alloc and the like to see this symbol as well
  */
 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
-
-struct workqueue_struct *memcg_kmem_cache_wq;
 #endif
 
 static int memcg_shrinker_map_size;
@@ -477,10 +565,17 @@ ino_t page_cgroup_ino(struct page *page)
        unsigned long ino = 0;
 
        rcu_read_lock();
-       if (PageSlab(page) && !PageTail(page))
-               memcg = memcg_from_slab_page(page);
-       else
-               memcg = READ_ONCE(page->mem_cgroup);
+       memcg = page->mem_cgroup;
+
+       /*
+        * The lowest bit set means that memcg isn't a valid
+        * memcg pointer, but an obj_cgroups pointer.
+        * In this case the page is shared and doesn't belong
+        * to any specific memory cgroup.
+        */
+       if ((unsigned long) memcg & 0x1UL)
+               memcg = NULL;
+
        while (memcg && !(memcg->css.flags & CSS_ONLINE))
                memcg = parent_mem_cgroup(memcg);
        if (memcg)
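
The check above relies on bit 0 of page->mem_cgroup acting as a type tag: a plain mem_cgroup pointer has it clear, while memcg_alloc_page_obj_cgroups() further down in this patch stores the obj_cgroups vector with bit 0 set (the "| 0x1UL"), which is safe because the kcalloc'ed vector is at least pointer-aligned. A minimal sketch of that low-bit tagging, with made-up names:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define OBJCGS_TAG 0x1UL

/* Tag an aligned vector pointer before storing it in the shared slot. */
static void *tag_objcgs(void *vec)
{
        assert(((uintptr_t)vec & OBJCGS_TAG) == 0);     /* alignment frees bit 0 */
        return (void *)((uintptr_t)vec | OBJCGS_TAG);
}

static int slot_is_objcgs(void *slot)
{
        return ((uintptr_t)slot & OBJCGS_TAG) != 0;
}

static void *untag(void *slot)
{
        return (void *)((uintptr_t)slot & ~OBJCGS_TAG);
}

int main(void)
{
        long memcg = 42;                        /* stands in for a mem_cgroup */
        void *vec = calloc(4, sizeof(void *));  /* stands in for obj_cgroups */
        void *slot;

        slot = &memcg;                          /* plain pointer: tag clear */
        printf("memcg slot tagged? %d\n", slot_is_objcgs(slot));

        slot = tag_objcgs(vec);                 /* vector: tag set */
        printf("objcgs slot tagged? %d, vector recovered: %s\n",
               slot_is_objcgs(slot), untag(slot) == vec ? "yes" : "no");

        free(vec);
        return 0;
}
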
@@ -681,13 +776,16 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
  */
 void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 {
-       long x;
+       long x, threshold = MEMCG_CHARGE_BATCH;
 
        if (mem_cgroup_disabled())
                return;
 
+       if (vmstat_item_in_bytes(idx))
+               threshold <<= PAGE_SHIFT;
+
        x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
-       if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+       if (unlikely(abs(x) > threshold)) {
                struct mem_cgroup *mi;
 
                /*
@@ -713,29 +811,12 @@ parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
        return mem_cgroup_nodeinfo(parent, nid);
 }
 
-/**
- * __mod_lruvec_state - update lruvec memory statistics
- * @lruvec: the lruvec
- * @idx: the stat item
- * @val: delta to add to the counter, can be negative
- *
- * The lruvec is the intersection of the NUMA node and a cgroup. This
- * function updates the all three counters that are affected by a
- * change of state at this level: per-node, per-cgroup, per-lruvec.
- */
-void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
-                       int val)
+void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+                             int val)
 {
-       pg_data_t *pgdat = lruvec_pgdat(lruvec);
        struct mem_cgroup_per_node *pn;
        struct mem_cgroup *memcg;
-       long x;
-
-       /* Update node */
-       __mod_node_page_state(pgdat, idx, val);
-
-       if (mem_cgroup_disabled())
-               return;
+       long x, threshold = MEMCG_CHARGE_BATCH;
 
        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
        memcg = pn->memcg;
@@ -746,8 +827,12 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
        /* Update lruvec */
        __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
 
+       if (vmstat_item_in_bytes(idx))
+               threshold <<= PAGE_SHIFT;
+
        x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
-       if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+       if (unlikely(abs(x) > threshold)) {
+               pg_data_t *pgdat = lruvec_pgdat(lruvec);
                struct mem_cgroup_per_node *pi;
 
                for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
@@ -757,6 +842,27 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
        __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
 }
 
+/**
+ * __mod_lruvec_state - update lruvec memory statistics
+ * @lruvec: the lruvec
+ * @idx: the stat item
+ * @val: delta to add to the counter, can be negative
+ *
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
+ * function updates all three counters that are affected by a
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
+ */
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+                       int val)
+{
+       /* Update node */
+       __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+
+       /* Update memcg and lruvec */
+       if (!mem_cgroup_disabled())
+               __mod_memcg_lruvec_state(lruvec, idx, val);
+}
+
 void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
 {
        pg_data_t *pgdat = page_pgdat(virt_to_page(p));
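
In the hunks above, both __mod_memcg_state() and the new __mod_memcg_lruvec_state() keep accumulating deltas in a per-CPU counter and only push them up the hierarchy once the pending value exceeds a threshold; for byte-counted items (the new *_B slab counters) that threshold is the same page batch shifted into bytes. A simplified single-CPU model of the batching, assuming a 32-page batch and 4 KiB pages purely for illustration:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT   12                 /* assume 4 KiB pages */
#define CHARGE_BATCH 32L                /* models MEMCG_CHARGE_BATCH (pages) */

static long percpu_pending;             /* models the per-CPU stat counter */
static long hierarchy_total;            /* models the shared atomic counter */

/* Add a delta; flush to the hierarchy only when the batch threshold is hit. */
static void mod_state(long val, int in_bytes)
{
        long threshold = CHARGE_BATCH;
        long x;

        if (in_bytes)
                threshold <<= PAGE_SHIFT;       /* same batch, counted in bytes */

        x = percpu_pending + val;
        if (labs(x) > threshold) {
                hierarchy_total += x;           /* expensive shared update */
                x = 0;
        }
        percpu_pending = x;                     /* cheap local update */
}

int main(void)
{
        int i;

        /* 1000 slab objects of 192 bytes each: only a few flushes happen. */
        for (i = 0; i < 1000; i++)
                mod_state(192, 1);

        printf("hierarchy sees %ld bytes, %ld bytes still pending per-cpu\n",
               hierarchy_total, percpu_pending);
        return 0;
}
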
@@ -1377,12 +1483,11 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
                       (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
                       PAGE_SIZE);
        seq_buf_printf(&s, "kernel_stack %llu\n",
-                      (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
+                      (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
                       1024);
        seq_buf_printf(&s, "slab %llu\n",
-                      (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
-                            memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
-                      PAGE_SIZE);
+                      (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
+                            memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
        seq_buf_printf(&s, "sock %llu\n",
                       (u64)memcg_page_state(memcg, MEMCG_SOCK) *
                       PAGE_SIZE);
@@ -1412,11 +1517,9 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
                               PAGE_SIZE);
 
        seq_buf_printf(&s, "slab_reclaimable %llu\n",
-                      (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
-                      PAGE_SIZE);
+                      (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B));
        seq_buf_printf(&s, "slab_unreclaimable %llu\n",
-                      (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
-                      PAGE_SIZE);
+                      (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B));
 
        /* Accumulated memory events */
 
@@ -1560,15 +1663,21 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                .gfp_mask = gfp_mask,
                .order = order,
        };
-       bool ret;
+       bool ret = true;
 
        if (mutex_lock_killable(&oom_lock))
                return true;
+
+       if (mem_cgroup_margin(memcg) >= (1 << order))
+               goto unlock;
+
        /*
         * A few threads which were not waiting at mutex_lock_killable() can
         * fail to bail out. Therefore, check again after holding oom_lock.
         */
        ret = should_force_charge() || out_of_memory(&oc);
+
+unlock:
        mutex_unlock(&oom_lock);
        return ret;
 }
@@ -2039,6 +2148,12 @@ EXPORT_SYMBOL(unlock_page_memcg);
 struct memcg_stock_pcp {
        struct mem_cgroup *cached; /* this never be root cgroup */
        unsigned int nr_pages;
+
+#ifdef CONFIG_MEMCG_KMEM
+       struct obj_cgroup *cached_objcg;
+       unsigned int nr_bytes;
+#endif
+
        struct work_struct work;
        unsigned long flags;
 #define FLUSHING_CACHED_CHARGE 0
@@ -2046,6 +2161,22 @@ struct memcg_stock_pcp {
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
 static DEFINE_MUTEX(percpu_charge_mutex);
 
+#ifdef CONFIG_MEMCG_KMEM
+static void drain_obj_stock(struct memcg_stock_pcp *stock);
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+                                    struct mem_cgroup *root_memcg);
+
+#else
+static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
+{
+}
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+                                    struct mem_cgroup *root_memcg)
+{
+       return false;
+}
+#endif
+
 /**
  * consume_stock: Try to consume stocked charge on this cpu.
  * @memcg: memcg to consume from.
@@ -2086,13 +2217,17 @@ static void drain_stock(struct memcg_stock_pcp *stock)
 {
        struct mem_cgroup *old = stock->cached;
 
+       if (!old)
+               return;
+
        if (stock->nr_pages) {
                page_counter_uncharge(&old->memory, stock->nr_pages);
                if (do_memsw_account())
                        page_counter_uncharge(&old->memsw, stock->nr_pages);
-               css_put_many(&old->css, stock->nr_pages);
                stock->nr_pages = 0;
        }
+
+       css_put(&old->css);
        stock->cached = NULL;
 }
 
@@ -2108,6 +2243,7 @@ static void drain_local_stock(struct work_struct *dummy)
        local_irq_save(flags);
 
        stock = this_cpu_ptr(&memcg_stock);
+       drain_obj_stock(stock);
        drain_stock(stock);
        clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 
@@ -2128,6 +2264,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
        stock = this_cpu_ptr(&memcg_stock);
        if (stock->cached != memcg) { /* reset if necessary */
                drain_stock(stock);
+               css_get(&memcg->css);
                stock->cached = memcg;
        }
        stock->nr_pages += nr_pages;
@@ -2166,6 +2303,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
                if (memcg && stock->nr_pages &&
                    mem_cgroup_is_descendant(memcg, root_memcg))
                        flush = true;
+               if (obj_stock_flush_required(stock, root_memcg))
+                       flush = true;
                rcu_read_unlock();
 
                if (flush &&
@@ -2228,18 +2367,29 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
        return 0;
 }
 
-static void reclaim_high(struct mem_cgroup *memcg,
-                        unsigned int nr_pages,
-                        gfp_t gfp_mask)
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
+                                 unsigned int nr_pages,
+                                 gfp_t gfp_mask)
 {
+       unsigned long nr_reclaimed = 0;
+
        do {
+               unsigned long pflags;
+
                if (page_counter_read(&memcg->memory) <=
                    READ_ONCE(memcg->memory.high))
                        continue;
+
                memcg_memory_event(memcg, MEMCG_HIGH);
-               try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+
+               psi_memstall_enter(&pflags);
+               nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
+                                                            gfp_mask, true);
+               psi_memstall_leave(&pflags);
        } while ((memcg = parent_mem_cgroup(memcg)) &&
                 !mem_cgroup_is_root(memcg));
+
+       return nr_reclaimed;
 }
 
 static void high_work_func(struct work_struct *work)
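
reclaim_high() now reports how much it actually reclaimed, and the next hunk uses that in mem_cgroup_handle_over_high(): the first pass targets the full overage, later passes only SWAP_CLUSTER_MAX, and the loop ends once reclaim makes no progress and the retry budget is gone. A rough sketch of that control flow with a stubbed reclaimer; the stub, its numbers, and the 32/16 constants are illustrative, and the penalty/throttling calculation that can also end the real loop is omitted:

#include <stdbool.h>
#include <stdio.h>

#define SWAP_CLUSTER_MAX    32UL        /* pages per retry pass (illustrative) */
#define MAX_RECLAIM_RETRIES 16

/* Stand-in for reclaim: pretend at most 40 pages come back per call. */
static unsigned long fake_reclaim(unsigned long target)
{
        static unsigned long reclaimable = 100;
        unsigned long got = target < reclaimable ? target : reclaimable;

        if (got > 40)
                got = 40;
        reclaimable -= got;
        return got;
}

/* Mirrors the retry shape: full overage first, small passes afterwards. */
static void handle_over_high(unsigned long nr_pages)
{
        int nr_retries = MAX_RECLAIM_RETRIES;
        bool in_retry = false;
        unsigned long nr_reclaimed;

        for (;;) {
                nr_reclaimed = fake_reclaim(in_retry ? SWAP_CLUSTER_MAX
                                                     : nr_pages);
                printf("pass reclaimed %lu pages\n", nr_reclaimed);

                /* keep going while there is progress or budget left */
                if (nr_reclaimed || nr_retries--) {
                        in_retry = true;
                        continue;
                }
                break;                  /* no progress, no retries: throttle */
        }
        printf("giving up on reclaim, would throttle the allocator now\n");
}

int main(void)
{
        handle_over_high(64);           /* 64 pages over memory.high */
        return 0;
}
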
@@ -2395,16 +2545,32 @@ void mem_cgroup_handle_over_high(void)
 {
        unsigned long penalty_jiffies;
        unsigned long pflags;
+       unsigned long nr_reclaimed;
        unsigned int nr_pages = current->memcg_nr_pages_over_high;
+       int nr_retries = MAX_RECLAIM_RETRIES;
        struct mem_cgroup *memcg;
+       bool in_retry = false;
 
        if (likely(!nr_pages))
                return;
 
        memcg = get_mem_cgroup_from_mm(current->mm);
-       reclaim_high(memcg, nr_pages, GFP_KERNEL);
        current->memcg_nr_pages_over_high = 0;
 
+retry_reclaim:
+       /*
+        * The allocating task should reclaim at least the batch size, but for
+        * subsequent retries we only want to do what's necessary to prevent oom
+        * or breaching resource isolation.
+        *
+        * This is distinct from memory.max or page allocator behaviour because
+        * memory.high is currently batched, whereas memory.max and the page
+        * allocator run every time an allocation is made.
+        */
+       nr_reclaimed = reclaim_high(memcg,
+                                   in_retry ? SWAP_CLUSTER_MAX : nr_pages,
+                                   GFP_KERNEL);
+
        /*
         * memory.high is breached and reclaim is unable to keep up. Throttle
         * allocators proactively to slow down excessive growth.
@@ -2432,6 +2598,16 @@ void mem_cgroup_handle_over_high(void)
                goto out;
 
        /*
+        * If reclaim is making forward progress but we're still over
+        * memory.high, we want to encourage that rather than doing allocator
+        * throttling.
+        */
+       if (nr_reclaimed || nr_retries--) {
+               in_retry = true;
+               goto retry_reclaim;
+       }
+
+       /*
         * If we exit early, we're guaranteed to die (since
         * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
         * need to account for any ill-begotten jiffies to pay them off later.
@@ -2448,13 +2624,14 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                      unsigned int nr_pages)
 {
        unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
-       int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+       int nr_retries = MAX_RECLAIM_RETRIES;
        struct mem_cgroup *mem_over_limit;
        struct page_counter *counter;
+       enum oom_status oom_status;
        unsigned long nr_reclaimed;
        bool may_swap = true;
        bool drained = false;
-       enum oom_status oom_status;
+       unsigned long pflags;
 
        if (mem_cgroup_is_root(memcg))
                return 0;
@@ -2514,8 +2691,10 @@ retry:
 
        memcg_memory_event(mem_over_limit, MEMCG_MAX);
 
+       psi_memstall_enter(&pflags);
        nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
                                                    gfp_mask, may_swap);
+       psi_memstall_leave(&pflags);
 
        if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
                goto retry;
@@ -2567,7 +2746,7 @@ retry:
                       get_order(nr_pages * PAGE_SIZE));
        switch (oom_status) {
        case OOM_SUCCESS:
-               nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+               nr_retries = MAX_RECLAIM_RETRIES;
                goto retry;
        case OOM_FAILED:
                goto force;
@@ -2586,12 +2765,10 @@ force:
        page_counter_charge(&memcg->memory, nr_pages);
        if (do_memsw_account())
                page_counter_charge(&memcg->memsw, nr_pages);
-       css_get_many(&memcg->css, nr_pages);
 
        return 0;
 
 done_restock:
-       css_get_many(&memcg->css, batch);
        if (batch > nr_pages)
                refill_stock(memcg, batch - nr_pages);
 
@@ -2649,8 +2826,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
        page_counter_uncharge(&memcg->memory, nr_pages);
        if (do_memsw_account())
                page_counter_uncharge(&memcg->memsw, nr_pages);
-
-       css_put_many(&memcg->css, nr_pages);
 }
 #endif
 
@@ -2669,6 +2844,26 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
 }
 
 #ifdef CONFIG_MEMCG_KMEM
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
+                                gfp_t gfp)
+{
+       unsigned int objects = objs_per_slab_page(s, page);
+       void *vec;
+
+       vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
+                          page_to_nid(page));
+       if (!vec)
+               return -ENOMEM;
+
+       if (cmpxchg(&page->obj_cgroups, NULL,
+                   (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
+               kfree(vec);
+       else
+               kmemleak_not_leak(vec);
+
+       return 0;
+}
+
 /*
  * Returns a pointer to the memory cgroup to which the kernel object is charged.
  *
@@ -2685,17 +2880,50 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
        page = virt_to_head_page(p);
 
        /*
-        * Slab pages don't have page->mem_cgroup set because corresponding
-        * kmem caches can be reparented during the lifetime. That's why
-        * memcg_from_slab_page() should be used instead.
+        * Slab objects are accounted individually, not per-page.
+        * Memcg membership data for each individual object is saved in
+        * the page->obj_cgroups.
         */
-       if (PageSlab(page))
-               return memcg_from_slab_page(page);
+       if (page_has_obj_cgroups(page)) {
+               struct obj_cgroup *objcg;
+               unsigned int off;
+
+               off = obj_to_index(page->slab_cache, page, p);
+               objcg = page_obj_cgroups(page)[off];
+               if (objcg)
+                       return obj_cgroup_memcg(objcg);
+
+               return NULL;
+       }
 
        /* All other pages use page->mem_cgroup */
        return page->mem_cgroup;
 }
 
+__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
+{
+       struct obj_cgroup *objcg = NULL;
+       struct mem_cgroup *memcg;
+
+       if (unlikely(!current->mm && !current->active_memcg))
+               return NULL;
+
+       rcu_read_lock();
+       if (unlikely(current->active_memcg))
+               memcg = rcu_dereference(current->active_memcg);
+       else
+               memcg = mem_cgroup_from_task(current);
+
+       for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+               objcg = rcu_dereference(memcg->objcg);
+               if (objcg && obj_cgroup_tryget(objcg))
+                       break;
+       }
+       rcu_read_unlock();
+
+       return objcg;
+}
+
 static int memcg_alloc_cache_id(void)
 {
        int id, size;
@@ -2721,9 +2949,7 @@ static int memcg_alloc_cache_id(void)
        else if (size > MEMCG_CACHES_MAX_SIZE)
                size = MEMCG_CACHES_MAX_SIZE;
 
-       err = memcg_update_all_caches(size);
-       if (!err)
-               err = memcg_update_all_list_lrus(size);
+       err = memcg_update_all_list_lrus(size);
        if (!err)
                memcg_nr_cache_ids = size;
 
@@ -2741,150 +2967,6 @@ static void memcg_free_cache_id(int id)
        ida_simple_remove(&memcg_cache_ida, id);
 }
 
-struct memcg_kmem_cache_create_work {
-       struct mem_cgroup *memcg;
-       struct kmem_cache *cachep;
-       struct work_struct work;
-};
-
-static void memcg_kmem_cache_create_func(struct work_struct *w)
-{
-       struct memcg_kmem_cache_create_work *cw =
-               container_of(w, struct memcg_kmem_cache_create_work, work);
-       struct mem_cgroup *memcg = cw->memcg;
-       struct kmem_cache *cachep = cw->cachep;
-
-       memcg_create_kmem_cache(memcg, cachep);
-
-       css_put(&memcg->css);
-       kfree(cw);
-}
-
-/*
- * Enqueue the creation of a per-memcg kmem_cache.
- */
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
-                                              struct kmem_cache *cachep)
-{
-       struct memcg_kmem_cache_create_work *cw;
-
-       if (!css_tryget_online(&memcg->css))
-               return;
-
-       cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
-       if (!cw) {
-               css_put(&memcg->css);
-               return;
-       }
-
-       cw->memcg = memcg;
-       cw->cachep = cachep;
-       INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
-
-       queue_work(memcg_kmem_cache_wq, &cw->work);
-}
-
-static inline bool memcg_kmem_bypass(void)
-{
-       if (in_interrupt())
-               return true;
-
-       /* Allow remote memcg charging in kthread contexts. */
-       if ((!current->mm || (current->flags & PF_KTHREAD)) &&
-            !current->active_memcg)
-               return true;
-       return false;
-}
-
-/**
- * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
- * @cachep: the original global kmem cache
- *
- * Return the kmem_cache we're supposed to use for a slab allocation.
- * We try to use the current memcg's version of the cache.
- *
- * If the cache does not exist yet, if we are the first user of it, we
- * create it asynchronously in a workqueue and let the current allocation
- * go through with the original cache.
- *
- * This function takes a reference to the cache it returns to assure it
- * won't get destroyed while we are working with it. Once the caller is
- * done with it, memcg_kmem_put_cache() must be called to release the
- * reference.
- */
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
-{
-       struct mem_cgroup *memcg;
-       struct kmem_cache *memcg_cachep;
-       struct memcg_cache_array *arr;
-       int kmemcg_id;
-
-       VM_BUG_ON(!is_root_cache(cachep));
-
-       if (memcg_kmem_bypass())
-               return cachep;
-
-       rcu_read_lock();
-
-       if (unlikely(current->active_memcg))
-               memcg = current->active_memcg;
-       else
-               memcg = mem_cgroup_from_task(current);
-
-       if (!memcg || memcg == root_mem_cgroup)
-               goto out_unlock;
-
-       kmemcg_id = READ_ONCE(memcg->kmemcg_id);
-       if (kmemcg_id < 0)
-               goto out_unlock;
-
-       arr = rcu_dereference(cachep->memcg_params.memcg_caches);
-
-       /*
-        * Make sure we will access the up-to-date value. The code updating
-        * memcg_caches issues a write barrier to match the data dependency
-        * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
-        */
-       memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
-
-       /*
-        * If we are in a safe context (can wait, and not in interrupt
-        * context), we could be be predictable and return right away.
-        * This would guarantee that the allocation being performed
-        * already belongs in the new cache.
-        *
-        * However, there are some clashes that can arrive from locking.
-        * For instance, because we acquire the slab_mutex while doing
-        * memcg_create_kmem_cache, this means no further allocation
-        * could happen with the slab_mutex held. So it's better to
-        * defer everything.
-        *
-        * If the memcg is dying or memcg_cache is about to be released,
-        * don't bother creating new kmem_caches. Because memcg_cachep
-        * is ZEROed as the fist step of kmem offlining, we don't need
-        * percpu_ref_tryget_live() here. css_tryget_online() check in
-        * memcg_schedule_kmem_cache_create() will prevent us from
-        * creation of a new kmem_cache.
-        */
-       if (unlikely(!memcg_cachep))
-               memcg_schedule_kmem_cache_create(memcg, cachep);
-       else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
-               cachep = memcg_cachep;
-out_unlock:
-       rcu_read_unlock();
-       return cachep;
-}
-
-/**
- * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
- * @cachep: the cache returned by memcg_kmem_get_cache
- */
-void memcg_kmem_put_cache(struct kmem_cache *cachep)
-{
-       if (!is_root_cache(cachep))
-               percpu_ref_put(&cachep->memcg_params.refcnt);
-}
-
 /**
  * __memcg_kmem_charge: charge a number of kernel pages to a memcg
  * @memcg: memory cgroup to charge
@@ -2958,6 +3040,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
                if (!ret) {
                        page->mem_cgroup = memcg;
                        __SetPageKmemcg(page);
+                       return 0;
                }
        }
        css_put(&memcg->css);
@@ -2980,13 +3063,146 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
        VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
        __memcg_kmem_uncharge(memcg, nr_pages);
        page->mem_cgroup = NULL;
+       css_put(&memcg->css);
 
        /* slab pages do not have PageKmemcg flag set */
        if (PageKmemcg(page))
                __ClearPageKmemcg(page);
+}
+
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+{
+       struct memcg_stock_pcp *stock;
+       unsigned long flags;
+       bool ret = false;
+
+       local_irq_save(flags);
+
+       stock = this_cpu_ptr(&memcg_stock);
+       if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
+               stock->nr_bytes -= nr_bytes;
+               ret = true;
+       }
+
+       local_irq_restore(flags);
+
+       return ret;
+}
+
+static void drain_obj_stock(struct memcg_stock_pcp *stock)
+{
+       struct obj_cgroup *old = stock->cached_objcg;
+
+       if (!old)
+               return;
+
+       if (stock->nr_bytes) {
+               unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
+               unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
+
+               if (nr_pages) {
+                       rcu_read_lock();
+                       __memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages);
+                       rcu_read_unlock();
+               }
+
+               /*
+                * The leftover is flushed to the centralized per-memcg value.
+                * On the next attempt to refill obj stock it will be moved
+                * to a per-cpu stock (probably on another CPU), see
+                * refill_obj_stock().
+                *
+                * How often it's flushed is a trade-off between the memory
+                * limit enforcement accuracy and potential CPU contention,
+                * so it might be changed in the future.
+                */
+               atomic_add(nr_bytes, &old->nr_charged_bytes);
+               stock->nr_bytes = 0;
+       }
+
+       obj_cgroup_put(old);
+       stock->cached_objcg = NULL;
+}
+
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+                                    struct mem_cgroup *root_memcg)
+{
+       struct mem_cgroup *memcg;
+
+       if (stock->cached_objcg) {
+               memcg = obj_cgroup_memcg(stock->cached_objcg);
+               if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
+                       return true;
+       }
+
+       return false;
+}
+
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+{
+       struct memcg_stock_pcp *stock;
+       unsigned long flags;
+
+       local_irq_save(flags);
+
+       stock = this_cpu_ptr(&memcg_stock);
+       if (stock->cached_objcg != objcg) { /* reset if necessary */
+               drain_obj_stock(stock);
+               obj_cgroup_get(objcg);
+               stock->cached_objcg = objcg;
+               stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
+       }
+       stock->nr_bytes += nr_bytes;
+
+       if (stock->nr_bytes > PAGE_SIZE)
+               drain_obj_stock(stock);
+
+       local_irq_restore(flags);
+}
+
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+{
+       struct mem_cgroup *memcg;
+       unsigned int nr_pages, nr_bytes;
+       int ret;
+
+       if (consume_obj_stock(objcg, size))
+               return 0;
+
+       /*
+        * In theory, objcg->nr_charged_bytes can have enough
+        * pre-charged bytes to satisfy the allocation. However,
+        * flushing objcg->nr_charged_bytes requires two atomic
+        * operations, and objcg->nr_charged_bytes can't be big,
+        * so it's better to ignore it and try to grab some new pages.
+        * objcg->nr_charged_bytes will be flushed in
+        * refill_obj_stock(), called from this function or
+        * independently later.
+        */
+       rcu_read_lock();
+       memcg = obj_cgroup_memcg(objcg);
+       css_get(&memcg->css);
+       rcu_read_unlock();
+
+       nr_pages = size >> PAGE_SHIFT;
+       nr_bytes = size & (PAGE_SIZE - 1);
+
+       if (nr_bytes)
+               nr_pages += 1;
 
-       css_put_many(&memcg->css, nr_pages);
+       ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
+       if (!ret && nr_bytes)
+               refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
+
+       css_put(&memcg->css);
+       return ret;
 }
+
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
+{
+       refill_obj_stock(objcg, size);
+}
+
 #endif /* CONFIG_MEMCG_KMEM */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
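
obj_cgroup_charge() above rounds a sub-page charge up to whole pages for the page counters and parks the unused remainder in the per-CPU byte stock, so that consume_obj_stock() can serve later small charges without touching the page counters at all (obj_cgroup_uncharge() refills the same stock). A single-CPU toy model of that bookkeeping, with simplified names and no locking:

#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned long charged_pages;     /* models the memcg page counter */
static unsigned long stock_bytes;       /* models stock->nr_bytes on one CPU */

/* Charge "size" bytes: use the byte stock if possible, else whole pages. */
static void objcg_charge(unsigned long size)
{
        unsigned long nr_pages, nr_bytes;

        if (stock_bytes >= size) {              /* consume_obj_stock() path */
                stock_bytes -= size;
                return;
        }

        nr_pages = size / PAGE_SIZE;
        nr_bytes = size & (PAGE_SIZE - 1);
        if (nr_bytes)
                nr_pages++;                     /* round up to whole pages */

        charged_pages += nr_pages;              /* __memcg_kmem_charge() */
        if (nr_bytes)
                stock_bytes += PAGE_SIZE - nr_bytes;    /* refill_obj_stock() */
}

int main(void)
{
        objcg_charge(92);       /* charges 1 page, leaves 4004 bytes stocked */
        objcg_charge(92);       /* served from the stock, no page charged */
        objcg_charge(92);       /* still from the stock */

        printf("pages charged: %lu, bytes left in stock: %lu\n",
               charged_pages, stock_bytes);
        return 0;
}
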
@@ -2997,13 +3213,16 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
  */
 void mem_cgroup_split_huge_fixup(struct page *head)
 {
+       struct mem_cgroup *memcg = head->mem_cgroup;
        int i;
 
        if (mem_cgroup_disabled())
                return;
 
-       for (i = 1; i < HPAGE_PMD_NR; i++)
-               head[i].mem_cgroup = head->mem_cgroup;
+       for (i = 1; i < HPAGE_PMD_NR; i++) {
+               css_get(&memcg->css);
+               head[i].mem_cgroup = memcg;
+       }
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
@@ -3207,7 +3426,7 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
  */
 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 {
-       int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+       int nr_retries = MAX_RECLAIM_RETRIES;
 
        /* we call try-to-free pages for make this cgroup empty */
        lru_add_drain_all();
@@ -3404,6 +3623,7 @@ static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
 #ifdef CONFIG_MEMCG_KMEM
 static int memcg_online_kmem(struct mem_cgroup *memcg)
 {
+       struct obj_cgroup *objcg;
        int memcg_id;
 
        if (cgroup_memory_nokmem)
@@ -3416,7 +3636,16 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
        if (memcg_id < 0)
                return memcg_id;
 
-       static_branch_inc(&memcg_kmem_enabled_key);
+       objcg = obj_cgroup_alloc();
+       if (!objcg) {
+               memcg_free_cache_id(memcg_id);
+               return -ENOMEM;
+       }
+       objcg->memcg = memcg;
+       rcu_assign_pointer(memcg->objcg, objcg);
+
+       static_branch_enable(&memcg_kmem_enabled_key);
+
        /*
         * A memory cgroup is considered kmem-online as soon as it gets
         * kmemcg_id. Setting the id after enabling static branching will
@@ -3425,7 +3654,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
         */
        memcg->kmemcg_id = memcg_id;
        memcg->kmem_state = KMEM_ONLINE;
-       INIT_LIST_HEAD(&memcg->kmem_caches);
 
        return 0;
 }
@@ -3438,22 +3666,14 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
 
        if (memcg->kmem_state != KMEM_ONLINE)
                return;
-       /*
-        * Clear the online state before clearing memcg_caches array
-        * entries. The slab_mutex in memcg_deactivate_kmem_caches()
-        * guarantees that no cache will be created for this cgroup
-        * after we are done (see memcg_create_kmem_cache()).
-        */
+
        memcg->kmem_state = KMEM_ALLOCATED;
 
        parent = parent_mem_cgroup(memcg);
        if (!parent)
                parent = root_mem_cgroup;
 
-       /*
-        * Deactivate and reparent kmem_caches.
-        */
-       memcg_deactivate_kmem_caches(memcg, parent);
+       memcg_reparent_objcgs(memcg, parent);
 
        kmemcg_id = memcg->kmemcg_id;
        BUG_ON(kmemcg_id < 0);
@@ -3486,11 +3706,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
        /* css_alloc() failed, offlining didn't happen */
        if (unlikely(memcg->kmem_state == KMEM_ONLINE))
                memcg_offline_kmem(memcg);
-
-       if (memcg->kmem_state == KMEM_ALLOCATED) {
-               WARN_ON(!list_empty(&memcg->kmem_caches));
-               static_branch_dec(&memcg_kmem_enabled_key);
-       }
 }
 #else
 static int memcg_online_kmem(struct mem_cgroup *memcg)
@@ -4800,9 +5015,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
        (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
        {
                .name = "kmem.slabinfo",
-               .seq_start = memcg_slab_start,
-               .seq_next = memcg_slab_next,
-               .seq_stop = memcg_slab_stop,
                .seq_show = memcg_slab_show,
        },
 #endif
@@ -5022,6 +5234,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
        memcg->socket_pressure = jiffies;
 #ifdef CONFIG_MEMCG_KMEM
        memcg->kmemcg_id = -1;
+       INIT_LIST_HEAD(&memcg->objcg_list);
 #endif
 #ifdef CONFIG_CGROUP_WRITEBACK
        INIT_LIST_HEAD(&memcg->cgwb_list);
@@ -5084,9 +5297,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 
        /* The following stuff does not apply to the root */
        if (!parent) {
-#ifdef CONFIG_MEMCG_KMEM
-               INIT_LIST_HEAD(&memcg->kmem_caches);
-#endif
                root_mem_cgroup = memcg;
                return &memcg->css;
        }
@@ -5448,7 +5658,10 @@ static int mem_cgroup_move_account(struct page *page,
         */
        smp_mb();
 
-       page->mem_cgroup = to;  /* caller should have done css_get */
+       css_get(&to->css);
+       css_put(&from->css);
+
+       page->mem_cgroup = to;
 
        __unlock_page_memcg(from);
 
@@ -5669,8 +5882,6 @@ static void __mem_cgroup_clear_mc(void)
                if (!mem_cgroup_is_root(mc.to))
                        page_counter_uncharge(&mc.to->memory, mc.moved_swap);
 
-               css_put_many(&mc.to->css, mc.moved_swap);
-
                mc.moved_swap = 0;
        }
        memcg_oom_recover(from);
@@ -6036,7 +6247,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
                                 char *buf, size_t nbytes, loff_t off)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
-       unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+       unsigned int nr_retries = MAX_RECLAIM_RETRIES;
        bool drained = false;
        unsigned long high;
        int err;
@@ -6046,8 +6257,6 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
        if (err)
                return err;
 
-       page_counter_set_high(&memcg->memory, high);
-
        for (;;) {
                unsigned long nr_pages = page_counter_read(&memcg->memory);
                unsigned long reclaimed;
@@ -6071,6 +6280,10 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
                        break;
        }
 
+       page_counter_set_high(&memcg->memory, high);
+
+       memcg_wb_domain_size_changed(memcg);
+
        return nbytes;
 }
 
@@ -6084,7 +6297,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
                                char *buf, size_t nbytes, loff_t off)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
-       unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
+       unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
        bool drained = false;
        unsigned long max;
        int err;
@@ -6391,40 +6604,42 @@ static unsigned long effective_protection(unsigned long usage,
  *
  * WARNING: This function is not stateless! It can only be used as part
  *          of a top-down tree iteration, not for isolated queries.
- *
- * Returns one of the following:
- *   MEMCG_PROT_NONE: cgroup memory is not protected
- *   MEMCG_PROT_LOW: cgroup memory is protected as long there is
- *     an unprotected supply of reclaimable memory from other cgroups.
- *   MEMCG_PROT_MIN: cgroup memory is protected
  */
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
-                                               struct mem_cgroup *memcg)
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
+                                    struct mem_cgroup *memcg)
 {
        unsigned long usage, parent_usage;
        struct mem_cgroup *parent;
 
        if (mem_cgroup_disabled())
-               return MEMCG_PROT_NONE;
+               return;
 
        if (!root)
                root = root_mem_cgroup;
+
+       /*
+        * Effective values of the reclaim targets are ignored so they
+        * can be stale. Have a look at mem_cgroup_protection for more
+        * details.
+        * TODO: calculation should be more robust so that we do not need
+        * that special casing.
+        */
        if (memcg == root)
-               return MEMCG_PROT_NONE;
+               return;
 
        usage = page_counter_read(&memcg->memory);
        if (!usage)
-               return MEMCG_PROT_NONE;
+               return;
 
        parent = parent_mem_cgroup(memcg);
        /* No parent means a non-hierarchical mode on v1 memcg */
        if (!parent)
-               return MEMCG_PROT_NONE;
+               return;
 
        if (parent == root) {
                memcg->memory.emin = READ_ONCE(memcg->memory.min);
                memcg->memory.elow = READ_ONCE(memcg->memory.low);
-               goto out;
+               return;
        }
 
        parent_usage = page_counter_read(&parent->memory);
@@ -6438,14 +6653,6 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
                        READ_ONCE(memcg->memory.low),
                        READ_ONCE(parent->memory.elow),
                        atomic_long_read(&parent->memory.children_low_usage)));
-
-out:
-       if (usage <= memcg->memory.emin)
-               return MEMCG_PROT_MIN;
-       else if (usage <= memcg->memory.elow)
-               return MEMCG_PROT_LOW;
-       else
-               return MEMCG_PROT_NONE;
 }
 
 /**
@@ -6498,6 +6705,7 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
        if (ret)
                goto out_put;
 
+       css_get(&memcg->css);
        commit_charge(page, memcg);
 
        local_irq_disable();
@@ -6552,9 +6760,6 @@ static void uncharge_batch(const struct uncharge_gather *ug)
        __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
        memcg_check_events(ug->memcg, ug->dummy_page);
        local_irq_restore(flags);
-
-       if (!mem_cgroup_is_root(ug->memcg))
-               css_put_many(&ug->memcg->css, ug->nr_pages);
 }
 
 static void uncharge_page(struct page *page, struct uncharge_gather *ug)
@@ -6592,6 +6797,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
 
        ug->dummy_page = page;
        page->mem_cgroup = NULL;
+       css_put(&ug->memcg->css);
 }
 
 static void uncharge_list(struct list_head *page_list)
@@ -6697,8 +6903,8 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
        page_counter_charge(&memcg->memory, nr_pages);
        if (do_memsw_account())
                page_counter_charge(&memcg->memsw, nr_pages);
-       css_get_many(&memcg->css, nr_pages);
 
+       css_get(&memcg->css);
        commit_charge(newpage, memcg);
 
        local_irq_save(flags);
@@ -6821,17 +7027,6 @@ static int __init mem_cgroup_init(void)
 {
        int cpu, node;
 
-#ifdef CONFIG_MEMCG_KMEM
-       /*
-        * Kmem cache creation is mostly done with the slab_mutex held,
-        * so use a workqueue with limited concurrency to avoid stalling
-        * all worker threads in case lots of cgroups are created and
-        * destroyed simultaneously.
-        */
-       memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
-       BUG_ON(!memcg_kmem_cache_wq);
-#endif
-
        cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
                                  memcg_hotplug_cpu_dead);
 
@@ -6935,8 +7130,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
        mem_cgroup_charge_statistics(memcg, page, -nr_entries);
        memcg_check_events(memcg, page);
 
-       if (!mem_cgroup_is_root(memcg))
-               css_put_many(&memcg->css, nr_entries);
+       css_put(&memcg->css);
 }
 
 /**
index 0da48f6..c39a13b 100644 (file)
@@ -1098,7 +1098,7 @@ again:
                }
 
                entry = pte_to_swp_entry(ptent);
-               if (non_swap_entry(entry) && is_device_private_entry(entry)) {
+               if (is_device_private_entry(entry)) {
                        struct page *page = device_private_entry_to_page(entry);
 
                        if (unlikely(details && details->check_mapping)) {
@@ -2082,7 +2082,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 /**
  * remap_pfn_range - remap kernel memory to userspace
  * @vma: user vma to map to
- * @addr: target user address to start at
+ * @addr: target page-aligned user address to start at
  * @pfn: page frame number of kernel physical memory address
  * @size: size of mapping area
  * @prot: page protection flags for this mapping
@@ -2101,6 +2101,9 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
        unsigned long remap_pfn = pfn;
        int err;
 
+       if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
+               return -EINVAL;
+
        /*
         * Physically remapped pages are special. Tell the
         * rest of the world about it:
index da374cd..ac6961a 100644 (file)
@@ -831,6 +831,14 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
        zone->zone_pgdat->node_present_pages += onlined_pages;
        pgdat_resize_unlock(zone->zone_pgdat, &flags);
 
+       /*
+        * When exposing larger, physically contiguous memory areas to the
+        * buddy, shuffling in the buddy (when freeing onlined pages, putting
+        * them either to the head or the tail of the freelist) is only helpful
+        * for maintaining the shuffle, but not for creating the initial
+        * shuffle. Shuffle the whole zone to make sure the just onlined pages
+        * are properly distributed across the whole freelist.
+        */
        shuffle_zone(zone);
 
        node_states_set_node(nid, &arg);
@@ -844,8 +852,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
        kswapd_run(nid);
        kcompactd_run(nid);
 
-       vm_total_pages = nr_free_pagecache_pages();
-
        writeback_set_ratelimit();
 
        memory_notify(MEM_ONLINE, &arg);
@@ -1595,7 +1601,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
                kcompactd_stop(node);
        }
 
-       vm_total_pages = nr_free_pagecache_pages();
        writeback_set_ratelimit();
 
        memory_notify(MEM_OFFLINE, &arg);
index 4fcc465..d179657 100644 (file)
@@ -2386,9 +2386,9 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
         * that the registered device driver can skip invalidating device
         * private page mappings that won't be migrated.
         */
-       mmu_notifier_range_init(&range, MMU_NOTIFY_MIGRATE, 0, migrate->vma,
-                       migrate->vma->vm_mm, migrate->start, migrate->end);
-       range.migrate_pgmap_owner = migrate->pgmap_owner;
+       mmu_notifier_range_init_migrate(&range, 0, migrate->vma,
+               migrate->vma->vm_mm, migrate->start, migrate->end,
+               migrate->pgmap_owner);
        mmu_notifier_invalidate_range_start(&range);
 
        walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
index 435e5f7..b06a30f 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/memory.h>
 #include <linux/notifier.h>
 #include <linux/sched.h>
+#include <linux/mman.h>
 #include "internal.h"
 
 #ifdef CONFIG_DEBUG_MEMORY_INIT
@@ -144,14 +145,23 @@ EXPORT_SYMBOL_GPL(mm_kobj);
 #ifdef CONFIG_SMP
 s32 vm_committed_as_batch = 32;
 
-static void __meminit mm_compute_batch(void)
+void mm_compute_batch(int overcommit_policy)
 {
        u64 memsized_batch;
        s32 nr = num_present_cpus();
        s32 batch = max_t(s32, nr*2, 32);
-
-       /* batch size set to 0.4% of (total memory/#cpus), or max int32 */
-       memsized_batch = min_t(u64, (totalram_pages()/nr)/256, 0x7fffffff);
+       unsigned long ram_pages = totalram_pages();
+
+       /*
+        * For policy OVERCOMMIT_NEVER, set batch size to 0.4% of
+        * (total memory/#cpus), and lift it to 25% for other policies
+        * to ease the possible lock contention on the percpu_counter
+        * vm_committed_as, while the max limit is INT_MAX.
+        */
+       if (overcommit_policy == OVERCOMMIT_NEVER)
+               memsized_batch = min_t(u64, ram_pages/nr/256, INT_MAX);
+       else
+               memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX);
 
        vm_committed_as_batch = max_t(s32, memsized_batch, batch);
 }
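
Worked through with assumed machine sizes (16 GiB of RAM, 4 KiB pages, 8 CPUs; the numbers are examples, not kernel defaults), the formula above gives a much smaller batch under OVERCOMMIT_NEVER, where vm_committed_as has to stay accurate, than under the other policies:

#include <stdint.h>
#include <stdio.h>

#define OVERCOMMIT_GUESS 0
#define OVERCOMMIT_NEVER 2

/* Mirror of the batch formula; ram_pages and ncpus are passed in for the demo. */
static int32_t compute_batch(int policy, uint64_t ram_pages, int32_t ncpus)
{
        int32_t min_batch = ncpus * 2 > 32 ? ncpus * 2 : 32;
        uint64_t memsized;

        if (policy == OVERCOMMIT_NEVER)
                memsized = ram_pages / ncpus / 256;     /* ~0.4% per cpu */
        else
                memsized = ram_pages / ncpus / 4;       /* 25% per cpu */

        if (memsized > INT32_MAX)
                memsized = INT32_MAX;

        return memsized > (uint64_t)min_batch ? (int32_t)memsized : min_batch;
}

int main(void)
{
        uint64_t ram_pages = (16ULL << 30) / 4096;      /* 16 GiB of 4 KiB pages */
        int32_t ncpus = 8;

        printf("OVERCOMMIT_NEVER batch: %d pages\n",
               compute_batch(OVERCOMMIT_NEVER, ram_pages, ncpus));
        printf("other policies batch:   %d pages\n",
               compute_batch(OVERCOMMIT_GUESS, ram_pages, ncpus));
        return 0;
}

With these inputs the batch is 2048 pages under OVERCOMMIT_NEVER and 131072 pages otherwise, both above the max(2*ncpus, 32) floor.
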
@@ -162,7 +172,7 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
        switch (action) {
        case MEM_ONLINE:
        case MEM_OFFLINE:
-               mm_compute_batch();
+               mm_compute_batch(sysctl_overcommit_memory);
        default:
                break;
        }
@@ -176,7 +186,7 @@ static struct notifier_block compute_batch_nb __meminitdata = {
 
 static int __init mm_compute_batch_init(void)
 {
-       mm_compute_batch();
+       mm_compute_batch(sysctl_overcommit_memory);
        register_hotmemory_notifier(&compute_batch_nb);
 
        return 0;
index dcdab26..40248d8 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1030,7 +1030,7 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
  * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
  *
  * We don't check here for the merged mmap wrapping around the end of pagecache
- * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
+ * indices (16TB on ia32) because do_mmap() does not permit mmap's which
  * wrap, nor mmaps which cover the final page at index -1UL.
  */
 static int
@@ -1365,11 +1365,11 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode,
  */
 unsigned long do_mmap(struct file *file, unsigned long addr,
                        unsigned long len, unsigned long prot,
-                       unsigned long flags, vm_flags_t vm_flags,
-                       unsigned long pgoff, unsigned long *populate,
-                       struct list_head *uf)
+                       unsigned long flags, unsigned long pgoff,
+                       unsigned long *populate, struct list_head *uf)
 {
        struct mm_struct *mm = current->mm;
+       vm_flags_t vm_flags;
        int pkey = 0;
 
        *populate = 0;
@@ -1431,7 +1431,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
         * to. we assume access permissions have been handled by the open
         * of the memory object, so we don't do any here.
         */
-       vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
+       vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
                        mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
        if (flags & MAP_LOCKED)
@@ -1562,11 +1562,12 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
                file = fget(fd);
                if (!file)
                        return -EBADF;
-               if (is_file_hugepages(file))
+               if (is_file_hugepages(file)) {
                        len = ALIGN(len, huge_page_size(hstate_file(file)));
-               retval = -EINVAL;
-               if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
+               } else if (unlikely(flags & MAP_HUGETLB)) {
+                       retval = -EINVAL;
                        goto out_fput;
+               }
        } else if (flags & MAP_HUGETLB) {
                struct user_struct *user = NULL;
                struct hstate *hs;
@@ -1689,7 +1690,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
                struct list_head *uf)
 {
        struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma, *prev;
+       struct vm_area_struct *vma, *prev, *merge;
        int error;
        struct rb_node **rb_link, *rb_parent;
        unsigned long charged = 0;
@@ -1773,6 +1774,25 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
                if (error)
                        goto unmap_and_free_vma;
 
+               /*
+                * If vm_flags changed after call_mmap(), we should try to
+                * merge the vma again, as we may succeed this time.
+                */
+               if (unlikely(vm_flags != vma->vm_flags && prev)) {
+                       merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
+                               NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
+                       if (merge) {
+                               fput(file);
+                               vm_area_free(vma);
+                               vma = merge;
+                               /*
+                                * Update vm_flags and possibly addr to pick up the change.
+                                * We don't warn here if addr changed, as the vma is not yet
+                                * linked by vma_link().
+                                */
+                               addr = vma->vm_start;
+                               vm_flags = vma->vm_flags;
+                               goto unmap_writable;
+                       }
+               }
+
                /* Can addr have changed??
                 *
                 * Answer: Yes, several device drivers can do it in their
@@ -1795,6 +1815,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
        vma_link(mm, vma, prev, rb_link, rb_parent);
        /* Once vma denies write, undo our temporary denial count */
        if (file) {
+unmap_writable:
                if (vm_flags & VM_SHARED)
                        mapping_unmap_writable(file->f_mapping);
                if (vm_flags & VM_DENYWRITE)
@@ -2209,7 +2230,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                /*
                 * mmap_region() will call shmem_zero_setup() to create a file,
                 * so use shmem's get_unmapped_area in case it can be huge.
-                * do_mmap_pgoff() will clear pgoff, so match alignment.
+                * do_mmap() will clear pgoff, so match alignment.
                 */
                pgoff = 0;
                get_area = shmem_get_unmapped_area;
@@ -2982,7 +3003,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
        }
 
        file = get_file(vma->vm_file);
-       ret = do_mmap_pgoff(vma->vm_file, start, size,
+       ret = do_mmap(vma->vm_file, start, size,
                        prot, flags, pgoff, &populate, NULL);
        fput(file);
 out:
@@ -3202,7 +3223,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
         * By setting it to reflect the virtual start address of the
         * vma, merges and splits can happen in a seamless way, just
         * using the existing file pgoff checks and manipulations.
-        * Similarly in do_mmap_pgoff and in do_brk.
+        * Similarly in do_mmap and in do_brk.
         */
        if (vma_is_anonymous(vma)) {
                BUG_ON(vma->anon_vma);
index 6b153dc..138abba 100644 (file)
@@ -193,17 +193,12 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 
 #ifdef CONFIG_HAVE_MOVE_PMD
 static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
-                 unsigned long new_addr, unsigned long old_end,
-                 pmd_t *old_pmd, pmd_t *new_pmd)
+                 unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
 {
        spinlock_t *old_ptl, *new_ptl;
        struct mm_struct *mm = vma->vm_mm;
        pmd_t pmd;
 
-       if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK)
-           || old_end - old_addr < PMD_SIZE)
-               return false;
-
        /*
         * The destination pmd shouldn't be established, free_pgtables()
         * should have released it.
@@ -279,6 +274,9 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                extent = next - old_addr;
                if (extent > old_end - old_addr)
                        extent = old_end - old_addr;
+               next = (new_addr + PMD_SIZE) & PMD_MASK;
+               if (extent > next - new_addr)
+                       extent = next - new_addr;
                old_pmd = get_old_pmd(vma->vm_mm, old_addr);
                if (!old_pmd)
                        continue;
@@ -292,7 +290,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                                if (need_rmap_locks)
                                        take_rmap_locks(vma);
                                moved = move_huge_pmd(vma, old_addr, new_addr,
-                                                   old_end, old_pmd, new_pmd);
+                                                     old_pmd, new_pmd);
                                if (need_rmap_locks)
                                        drop_rmap_locks(vma);
                                if (moved)
@@ -312,7 +310,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                        if (need_rmap_locks)
                                take_rmap_locks(vma);
                        moved = move_normal_pmd(vma, old_addr, new_addr,
-                                       old_end, old_pmd, new_pmd);
+                                               old_pmd, new_pmd);
                        if (need_rmap_locks)
                                drop_rmap_locks(vma);
                        if (moved)
@@ -322,9 +320,6 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 
                if (pte_alloc(new_vma->vm_mm, new_pmd))
                        break;
-               next = (new_addr + PMD_SIZE) & PMD_MASK;
-               if (extent > next - new_addr)
-                       extent = next - new_addr;
                move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
                          new_pmd, new_addr, need_rmap_locks);
        }
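
A minimal stand-alone sketch of the extent clamping above (not kernel code; PMD_SIZE is assumed to be 2 MiB, as with 4 KiB pages on x86-64): the per-iteration copy length is bounded by the next PMD boundary after old_addr, by the end of the old range, and by the next PMD boundary after new_addr.

#include <stdio.h>

#define PMD_SIZE	(2UL * 1024 * 1024)
#define PMD_MASK	(~(PMD_SIZE - 1))

static unsigned long clamp_extent(unsigned long old_addr, unsigned long old_end,
				  unsigned long new_addr)
{
	unsigned long next, extent;

	next = (old_addr + PMD_SIZE) & PMD_MASK;	/* next PMD boundary (old side) */
	extent = next - old_addr;
	if (extent > old_end - old_addr)
		extent = old_end - old_addr;

	next = (new_addr + PMD_SIZE) & PMD_MASK;	/* next PMD boundary (new side) */
	if (extent > next - new_addr)
		extent = next - new_addr;

	return extent;
}

int main(void)
{
	/* old range is PMD aligned, new address is offset by 1 MiB: extent is 1 MiB */
	printf("extent: 0x%lx\n", clamp_extent(0x200000UL, 0x600000UL, 0x300000UL));
	return 0;
}
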
index 3141748..340ae77 100644 (file)
@@ -1078,7 +1078,6 @@ unsigned long do_mmap(struct file *file,
                        unsigned long len,
                        unsigned long prot,
                        unsigned long flags,
-                       vm_flags_t vm_flags,
                        unsigned long pgoff,
                        unsigned long *populate,
                        struct list_head *uf)
@@ -1086,6 +1085,7 @@ unsigned long do_mmap(struct file *file,
        struct vm_area_struct *vma;
        struct vm_region *region;
        struct rb_node *rb;
+       vm_flags_t vm_flags;
        unsigned long capabilities, result;
        int ret;
 
@@ -1104,7 +1104,7 @@ unsigned long do_mmap(struct file *file,
 
        /* we've determined that we can make the mapping, now translate what we
         * now know into VMA flags */
-       vm_flags |= determine_vm_flags(file, prot, flags, capabilities);
+       vm_flags = determine_vm_flags(file, prot, flags, capabilities);
 
        /* we're going to need to record the mapping */
        region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
@@ -1763,7 +1763,7 @@ EXPORT_SYMBOL_GPL(access_process_vm);
  *
  * Check the shared mappings on an inode on behalf of a shrinking truncate to
  * make sure that any outstanding VMAs aren't broken and then shrink the
- * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't
+ * vm_regions that extend beyond it so that do_mmap() doesn't
  * automatically grant mappings that are too large.
  */
 int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
index 6e94962..d30ce75 100644 (file)
@@ -184,7 +184,7 @@ static bool is_dump_unreclaim_slabs(void)
                 global_node_page_state(NR_ISOLATED_FILE) +
                 global_node_page_state(NR_UNEVICTABLE);
 
-       return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru);
+       return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
 }
 
 /**
index 28b3e7a..4e4ddd6 100644 (file)
@@ -2076,13 +2076,11 @@ static int page_writeback_cpu_online(unsigned int cpu)
  * Called early on to tune the page writeback dirty limits.
  *
  * We used to scale dirty pages according to how total memory
- * related to pages that could be allocated for buffers (by
- * comparing nr_free_buffer_pages() to vm_total_pages.
+ * related to pages that could be allocated for buffers.
  *
  * However, that was when we used "dirty_ratio" to scale with
  * all memory, and we don't do that any more. "dirty_ratio"
- * is now applied to total non-HIGHPAGE memory (by subtracting
- * totalhigh_pages from vm_total_pages), and as such we can't
+ * is now applied to total non-HIGHPAGE memory, and as such we can't
  * get into the old insane situation any more where we had
  * large amounts of dirty pages compared to a small amount of
  * non-HIGHMEM memory.
index 901a21f..167732f 100644 (file)
@@ -459,25 +459,23 @@ static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
 {
 #ifdef CONFIG_SPARSEMEM
        pfn &= (PAGES_PER_SECTION-1);
-       return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
 #else
        pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
-       return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
 #endif /* CONFIG_SPARSEMEM */
+       return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
 }
 
 /**
  * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
  * @page: The page within the block of interest
  * @pfn: The target page frame number
- * @end_bitidx: The last bit of interest to retrieve
  * @mask: mask of bits that the caller is interested in
  *
  * Return: pageblock_bits flags
  */
-static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
+static __always_inline
+unsigned long __get_pfnblock_flags_mask(struct page *page,
                                        unsigned long pfn,
-                                       unsigned long end_bitidx,
                                        unsigned long mask)
 {
        unsigned long *bitmap;
@@ -490,20 +488,18 @@ static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page
        bitidx &= (BITS_PER_LONG-1);
 
        word = bitmap[word_bitidx];
-       bitidx += end_bitidx;
-       return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
+       return (word >> bitidx) & mask;
 }
 
 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
-                                       unsigned long end_bitidx,
                                        unsigned long mask)
 {
-       return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
+       return __get_pfnblock_flags_mask(page, pfn, mask);
 }
 
 static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
 {
-       return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
+       return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
 }
 
 /**
@@ -511,12 +507,10 @@ static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned
  * @page: The page within the block of interest
  * @flags: The flags to set
  * @pfn: The target page frame number
- * @end_bitidx: The last bit of interest
  * @mask: mask of bits that the caller is interested in
  */
 void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
                                        unsigned long pfn,
-                                       unsigned long end_bitidx,
                                        unsigned long mask)
 {
        unsigned long *bitmap;
@@ -533,9 +527,8 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
 
        VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
 
-       bitidx += end_bitidx;
-       mask <<= (BITS_PER_LONG - bitidx - 1);
-       flags <<= (BITS_PER_LONG - bitidx - 1);
+       mask <<= bitidx;
+       flags <<= bitidx;
 
        word = READ_ONCE(bitmap[word_bitidx]);
        for (;;) {
@@ -552,8 +545,8 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
                     migratetype < MIGRATE_PCPTYPES))
                migratetype = MIGRATE_UNMOVABLE;
 
-       set_pageblock_flags_group(page, (unsigned long)migratetype,
-                                       PB_migrate, PB_migrate_end);
+       set_pfnblock_flags_mask(page, (unsigned long)migratetype,
+                               page_to_pfn(page), MIGRATETYPE_MASK);
 }
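
A small stand-alone sketch of the simplified bitmap access above (not kernel code; it keeps the 4 bits per pageblock and the 3-bit migratetype mask, but replaces the kernel's cmpxchg retry loop with a plain read-modify-write):

#include <stdio.h>

#define NR_PAGEBLOCK_BITS	4
#define MIGRATETYPE_MASK	0x7UL	/* low 3 of the 4 per-block bits */

static unsigned long get_block_flags(unsigned long word, unsigned int bitidx,
				     unsigned long mask)
{
	return (word >> bitidx) & mask;
}

static unsigned long set_block_flags(unsigned long word, unsigned long flags,
				     unsigned int bitidx, unsigned long mask)
{
	mask <<= bitidx;
	flags <<= bitidx;
	return (word & ~mask) | (flags & mask);
}

int main(void)
{
	unsigned long word = 0;
	unsigned int bitidx = 5 * NR_PAGEBLOCK_BITS;	/* 6th pageblock in this word */

	word = set_block_flags(word, 2UL /* e.g. a migratetype value */, bitidx,
			       MIGRATETYPE_MASK);
	printf("migratetype: %lu\n", get_block_flags(word, bitidx, MIGRATETYPE_MASK));
	return 0;
}
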
 
 #ifdef CONFIG_DEBUG_VM
@@ -813,11 +806,10 @@ static inline struct capture_control *task_capc(struct zone *zone)
 {
        struct capture_control *capc = current->capture_control;
 
-       return capc &&
+       return unlikely(capc) &&
                !(current->flags & PF_KTHREAD) &&
                !capc->page &&
-               capc->cc->zone == zone &&
-               capc->cc->direct_compaction ? capc : NULL;
+               capc->cc->zone == zone ? capc : NULL;
 }
 
 static inline bool
@@ -1164,8 +1156,11 @@ static void kernel_init_free_pages(struct page *page, int numpages)
 {
        int i;
 
+       /* s390's use of memset() could overwrite KASAN redzones. */
+       kasan_disable_current();
        for (i = 0; i < numpages; i++)
                clear_highpage(page + i);
+       kasan_enable_current();
 }
 
 static __always_inline bool free_pages_prepare(struct page *page,
@@ -2273,7 +2268,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
  * This array describes the order lists are fallen back to when
  * the free lists for the desirable migrate type are depleted
  */
-static int fallbacks[MIGRATE_TYPES][4] = {
+static int fallbacks[MIGRATE_TYPES][3] = {
        [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
        [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
        [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
@@ -2790,7 +2785,7 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
         * allocating from CMA when over half of the zone's free memory
         * is in the CMA area.
         */
-       if (migratetype == MIGRATE_MOVABLE &&
+       if (alloc_flags & ALLOC_CMA &&
            zone_page_state(zone, NR_FREE_CMA_PAGES) >
            zone_page_state(zone, NR_FREE_PAGES) / 2) {
                page = __rmqueue_cma_fallback(zone, order);
@@ -2801,7 +2796,7 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
 retry:
        page = __rmqueue_smallest(zone, order, migratetype);
        if (unlikely(!page)) {
-               if (migratetype == MIGRATE_MOVABLE)
+               if (alloc_flags & ALLOC_CMA)
                        page = __rmqueue_cma_fallback(zone, order);
 
                if (!page && __rmqueue_fallback(zone, order, migratetype,
@@ -3487,6 +3482,29 @@ static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 }
 ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
 
+static inline long __zone_watermark_unusable_free(struct zone *z,
+                               unsigned int order, unsigned int alloc_flags)
+{
+       const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
+       long unusable_free = (1 << order) - 1;
+
+       /*
+        * If the caller does not have rights to ALLOC_HARDER then subtract
+        * the high-atomic reserves. This will over-estimate the size of the
+        * atomic reserve but it avoids a search.
+        */
+       if (likely(!alloc_harder))
+               unusable_free += z->nr_reserved_highatomic;
+
+#ifdef CONFIG_CMA
+       /* If allocation can't use CMA areas don't use free CMA pages */
+       if (!(alloc_flags & ALLOC_CMA))
+               unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
+
+       return unusable_free;
+}
+
 /*
  * Return true if free base pages are above 'mark'. For high-order checks it
  * will return true of the order-0 watermark is reached and there is at least
@@ -3502,19 +3520,12 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
        const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
 
        /* free_pages may go negative - that's OK */
-       free_pages -= (1 << order) - 1;
+       free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
 
        if (alloc_flags & ALLOC_HIGH)
                min -= min / 2;
 
-       /*
-        * If the caller does not have rights to ALLOC_HARDER then subtract
-        * the high-atomic reserves. This will over-estimate the size of the
-        * atomic reserve but it avoids a search.
-        */
-       if (likely(!alloc_harder)) {
-               free_pages -= z->nr_reserved_highatomic;
-       } else {
+       if (unlikely(alloc_harder)) {
                /*
                 * OOM victims can try even harder than normal ALLOC_HARDER
                 * users on the grounds that it's definitely going to be in
@@ -3527,13 +3538,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                        min -= min / 4;
        }
 
-
-#ifdef CONFIG_CMA
-       /* If allocation can't use CMA areas don't use free CMA pages */
-       if (!(alloc_flags & ALLOC_CMA))
-               free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
-#endif
-
        /*
         * Check watermarks for an order-0 allocation request. If these
         * are not met, then a high-order request also cannot go ahead
@@ -3580,30 +3584,42 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 
 static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
                                unsigned long mark, int highest_zoneidx,
-                               unsigned int alloc_flags)
+                               unsigned int alloc_flags, gfp_t gfp_mask)
 {
-       long free_pages = zone_page_state(z, NR_FREE_PAGES);
-       long cma_pages = 0;
+       long free_pages;
 
-#ifdef CONFIG_CMA
-       /* If allocation can't use CMA areas don't use free CMA pages */
-       if (!(alloc_flags & ALLOC_CMA))
-               cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
-#endif
+       free_pages = zone_page_state(z, NR_FREE_PAGES);
 
        /*
         * Fast check for order-0 only. If this fails then the reserves
-        * need to be calculated. There is a corner case where the check
-        * passes but only the high-order atomic reserve are free. If
-        * the caller is !atomic then it'll uselessly search the free
-        * list. That corner case is then slower but it is harmless.
+        * need to be calculated.
         */
-       if (!order && (free_pages - cma_pages) >
-                               mark + z->lowmem_reserve[highest_zoneidx])
+       if (!order) {
+               long fast_free;
+
+               fast_free = free_pages;
+               fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags);
+               if (fast_free > mark + z->lowmem_reserve[highest_zoneidx])
+                       return true;
+       }
+
+       if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
+                                       free_pages))
                return true;
+       /*
+        * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
+        * when checking the min watermark. The min watermark is the
+        * point where boosting is ignored so that kswapd is woken up
+        * when below the low watermark.
+        */
+       if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
+               && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
+               mark = z->_watermark[WMARK_MIN];
+               return __zone_watermark_ok(z, order, mark, highest_zoneidx,
+                                       alloc_flags, free_pages);
+       }
 
-       return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
-                                       free_pages);
+       return false;
 }
 
 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
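
The following stand-alone sketch (not kernel code; the flag values, struct fields and numbers are made up) illustrates only the order-0 arithmetic behind the helper above: the "unusable" portion of the free count is subtracted before comparing against the watermark plus the lowmem reserve.

#include <stdbool.h>
#include <stdio.h>

#define ALLOC_HARDER	0x1u	/* made-up flag values for the sketch */
#define ALLOC_CMA	0x2u

struct zone_sketch {
	long free_pages;
	long free_cma_pages;
	long nr_reserved_highatomic;
	long lowmem_reserve;
};

static long unusable_free(const struct zone_sketch *z, unsigned int order,
			  unsigned int alloc_flags)
{
	long unusable = (1L << order) - 1;

	if (!(alloc_flags & ALLOC_HARDER))
		unusable += z->nr_reserved_highatomic;
	if (!(alloc_flags & ALLOC_CMA))
		unusable += z->free_cma_pages;
	return unusable;
}

static bool watermark_ok(const struct zone_sketch *z, unsigned int order,
			 long mark, unsigned int alloc_flags)
{
	long free = z->free_pages - unusable_free(z, order, alloc_flags);

	return free > mark + z->lowmem_reserve;
}

int main(void)
{
	struct zone_sketch z = {
		.free_pages = 10000, .free_cma_pages = 6000,
		.nr_reserved_highatomic = 512, .lowmem_reserve = 256,
	};

	printf("with ALLOC_CMA:    %d\n", watermark_ok(&z, 0, 3500, ALLOC_CMA));
	printf("without ALLOC_CMA: %d\n", watermark_ok(&z, 0, 3500, 0));
	return 0;
}
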
@@ -3671,6 +3687,20 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
        return alloc_flags;
 }
 
+static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
+                                       unsigned int alloc_flags)
+{
+#ifdef CONFIG_CMA
+       unsigned int pflags = current->flags;
+
+       if (!(pflags & PF_MEMALLOC_NOCMA) &&
+                       gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+               alloc_flags |= ALLOC_CMA;
+
+#endif
+       return alloc_flags;
+}
+
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
@@ -3747,7 +3777,8 @@ retry:
 
                mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
                if (!zone_watermark_fast(zone, order, mark,
-                                      ac->highest_zoneidx, alloc_flags)) {
+                                      ac->highest_zoneidx, alloc_flags,
+                                      gfp_mask)) {
                        int ret;
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -4316,10 +4347,8 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
        } else if (unlikely(rt_task(current)) && !in_interrupt())
                alloc_flags |= ALLOC_HARDER;
 
-#ifdef CONFIG_CMA
-       if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
-               alloc_flags |= ALLOC_CMA;
-#endif
+       alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
+
        return alloc_flags;
 }
 
@@ -4620,7 +4649,7 @@ retry:
 
        reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
        if (reserve_flags)
-               alloc_flags = reserve_flags;
+               alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
 
        /*
         * Reset the nodemask and zonelist iterators if memory policies can be
@@ -4697,7 +4726,7 @@ retry:
 
        /* Avoid allocations with no watermarks from looping endlessly */
        if (tsk_is_oom_victim(current) &&
-           (alloc_flags == ALLOC_OOM ||
+           (alloc_flags & ALLOC_OOM ||
             (gfp_mask & __GFP_NOMEMALLOC)))
                goto nopage;
 
@@ -4771,7 +4800,11 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 
        if (cpusets_enabled()) {
                *alloc_mask |= __GFP_HARDWALL;
-               if (!ac->nodemask)
+               /*
+                * When we are in interrupt context, the cpuset of the
+                * current task is irrelevant, so any node is OK.
+                */
+               if (!in_interrupt() && !ac->nodemask)
                        ac->nodemask = &cpuset_current_mems_allowed;
                else
                        *alloc_flags |= ALLOC_CPUSET;
@@ -4785,8 +4818,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
        if (should_fail_alloc_page(gfp_mask, order))
                return false;
 
-       if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
-               *alloc_flags |= ALLOC_CMA;
+       *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
 
        return true;
 }
@@ -5165,19 +5197,6 @@ unsigned long nr_free_buffer_pages(void)
 }
 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
 
-/**
- * nr_free_pagecache_pages - count number of pages beyond high watermark
- *
- * nr_free_pagecache_pages() counts the number of pages which are beyond the
- * high watermark within all zones.
- *
- * Return: number of pages beyond high watermark within all zones.
- */
-unsigned long nr_free_pagecache_pages(void)
-{
-       return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
-}
-
 static inline void show_node(struct zone *zone)
 {
        if (IS_ENABLED(CONFIG_NUMA))
@@ -5220,8 +5239,8 @@ long si_mem_available(void)
         * items that are in use, and cannot be freed. Cap this estimate at the
         * low watermark.
         */
-       reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
-                       global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
+       reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
+               global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
        available += reclaimable - min(reclaimable / 2, wmark_low);
 
        if (available < 0)
@@ -5364,8 +5383,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                global_node_page_state(NR_UNEVICTABLE),
                global_node_page_state(NR_FILE_DIRTY),
                global_node_page_state(NR_WRITEBACK),
-               global_node_page_state(NR_SLAB_RECLAIMABLE),
-               global_node_page_state(NR_SLAB_UNRECLAIMABLE),
+               global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
+               global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
                global_node_page_state(NR_FILE_MAPPED),
                global_node_page_state(NR_SHMEM),
                global_zone_page_state(NR_PAGETABLE),
@@ -5396,6 +5415,10 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                        " anon_thp: %lukB"
 #endif
                        " writeback_tmp:%lukB"
+                       " kernel_stack:%lukB"
+#ifdef CONFIG_SHADOW_CALL_STACK
+                       " shadow_call_stack:%lukB"
+#endif
                        " all_unreclaimable? %s"
                        "\n",
                        pgdat->node_id,
@@ -5417,6 +5440,10 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                        K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
 #endif
                        K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+                       node_page_state(pgdat, NR_KERNEL_STACK_KB),
+#ifdef CONFIG_SHADOW_CALL_STACK
+                       node_page_state(pgdat, NR_KERNEL_SCS_KB),
+#endif
                        pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
                                "yes" : "no");
        }
@@ -5448,10 +5475,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                        " present:%lukB"
                        " managed:%lukB"
                        " mlocked:%lukB"
-                       " kernel_stack:%lukB"
-#ifdef CONFIG_SHADOW_CALL_STACK
-                       " shadow_call_stack:%lukB"
-#endif
                        " pagetables:%lukB"
                        " bounce:%lukB"
                        " free_pcp:%lukB"
@@ -5473,10 +5496,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                        K(zone->present_pages),
                        K(zone_managed_pages(zone)),
                        K(zone_page_state(zone, NR_MLOCK)),
-                       zone_page_state(zone, NR_KERNEL_STACK_KB),
-#ifdef CONFIG_SHADOW_CALL_STACK
-                       zone_page_state(zone, NR_KERNEL_SCS_KB),
-#endif
                        K(zone_page_state(zone, NR_PAGETABLE)),
                        K(zone_page_state(zone, NR_BOUNCE)),
                        K(free_pcp),
@@ -5891,13 +5910,16 @@ build_all_zonelists_init(void)
  */
 void __ref build_all_zonelists(pg_data_t *pgdat)
 {
+       unsigned long vm_total_pages;
+
        if (system_state == SYSTEM_BOOTING) {
                build_all_zonelists_init();
        } else {
                __build_all_zonelists(pgdat);
                /* cpuset refresh routine should be here */
        }
-       vm_total_pages = nr_free_pagecache_pages();
+       /* Get the number of free pages beyond high watermark in all zones. */
+       vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
        /*
         * Disable grouping by mobility if the number of pages in the
         * system is too low to allow the mechanism to work. It would be
@@ -6325,22 +6347,6 @@ void __meminit init_currently_empty_zone(struct zone *zone,
 }
 
 /**
- * sparse_memory_present_with_active_regions - Call memory_present for each active range
- * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
- *
- * If an architecture guarantees that all ranges registered contain no holes and may
- * be freed, this function may be used instead of calling memory_present() manually.
- */
-void __init sparse_memory_present_with_active_regions(int nid)
-{
-       unsigned long start_pfn, end_pfn;
-       int i, this_nid;
-
-       for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
-               memory_present(this_nid, start_pfn, end_pfn);
-}
-
-/**
  * get_pfn_range_for_nid - Return the start and end page frames for a node
  * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
  * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
index c56db2d..b466384 100644 (file)
@@ -72,7 +72,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
                long new;
 
                new = atomic_long_add_return(nr_pages, &c->usage);
-               propagate_protected_usage(counter, new);
+               propagate_protected_usage(c, new);
                /*
                 * This is indeed racy, but we can live with some
                 * inaccuracy in the watermark.
@@ -116,7 +116,7 @@ bool page_counter_try_charge(struct page_counter *counter,
                new = atomic_long_add_return(nr_pages, &c->usage);
                if (new > c->max) {
                        atomic_long_sub(nr_pages, &c->usage);
-                       propagate_protected_usage(counter, new);
+                       propagate_protected_usage(c, new);
                        /*
                         * This is racy, but we can live with some
                         * inaccuracy in the failcnt.
@@ -125,7 +125,7 @@ bool page_counter_try_charge(struct page_counter *counter,
                        *fail = c;
                        goto failed;
                }
-               propagate_protected_usage(counter, new);
+               propagate_protected_usage(c, new);
                /*
                 * Just like with failcnt, we can live with some
                 * inaccuracy in the watermark.
index ccda767..9e36256 100644 (file)
@@ -441,7 +441,7 @@ int swap_readpage(struct page *page, bool synchronous)
                        break;
 
                if (!blk_poll(disk->queue, qc, true))
-                       io_schedule();
+                       blk_io_schedule();
        }
        __set_current_state(TASK_RUNNING);
        bio_put(bio);
diff --git a/mm/pgalloc-track.h b/mm/pgalloc-track.h
new file mode 100644 (file)
index 0000000..1dcc865
--- /dev/null
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PGALLOC_TRACK_H
+#define _LINUX_PGALLOC_TRACK_H
+
+#if defined(CONFIG_MMU)
+static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
+                                    unsigned long address,
+                                    pgtbl_mod_mask *mod_mask)
+{
+       if (unlikely(pgd_none(*pgd))) {
+               if (__p4d_alloc(mm, pgd, address))
+                       return NULL;
+               *mod_mask |= PGTBL_PGD_MODIFIED;
+       }
+
+       return p4d_offset(pgd, address);
+}
+
+static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d,
+                                    unsigned long address,
+                                    pgtbl_mod_mask *mod_mask)
+{
+       if (unlikely(p4d_none(*p4d))) {
+               if (__pud_alloc(mm, p4d, address))
+                       return NULL;
+               *mod_mask |= PGTBL_P4D_MODIFIED;
+       }
+
+       return pud_offset(p4d, address);
+}
+
+static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
+                                    unsigned long address,
+                                    pgtbl_mod_mask *mod_mask)
+{
+       if (unlikely(pud_none(*pud))) {
+               if (__pmd_alloc(mm, pud, address))
+                       return NULL;
+               *mod_mask |= PGTBL_PUD_MODIFIED;
+       }
+
+       return pmd_offset(pud, address);
+}
+#endif /* CONFIG_MMU */
+
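+/*
+ * pte_alloc_kernel_track - allocate a kernel PTE page for @pmd if none is
+ * present, recording PGTBL_PMD_MODIFIED in *@mask when a new page table is
+ * installed. Evaluates to the PTE entry for @address, or NULL if the
+ * allocation failed.
+ */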
+#define pte_alloc_kernel_track(pmd, address, mask)                     \
+       ((unlikely(pmd_none(*(pmd))) &&                                 \
+         (__pte_alloc_kernel(pmd) || ({ *(mask) |= PGTBL_PMD_MODIFIED; 0; }))) ? \
+               NULL : pte_offset_kernel(pmd, address))
+
+#endif /* _LINUX_PGALLOC_TRACK_H */
index b2abca3..eb6b36d 100644 (file)
@@ -114,11 +114,13 @@ struct shmem_options {
        kuid_t uid;
        kgid_t gid;
        umode_t mode;
+       bool full_inums;
        int huge;
        int seen;
 #define SHMEM_SEEN_BLOCKS 1
 #define SHMEM_SEEN_INODES 2
 #define SHMEM_SEEN_HUGE 4
+#define SHMEM_SEEN_INUMS 8
 };
 
 #ifdef CONFIG_TMPFS
@@ -260,18 +262,76 @@ bool vma_is_shmem(struct vm_area_struct *vma)
 static LIST_HEAD(shmem_swaplist);
 static DEFINE_MUTEX(shmem_swaplist_mutex);
 
-static int shmem_reserve_inode(struct super_block *sb)
+/*
+ * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
+ * produces a novel ino for the newly allocated inode.
+ *
+ * It may also be called when making a hard link to permit the space needed by
+ * each dentry. However, in that case, no new inode number is needed since that
+ * internally draws from another pool of inode numbers (currently global
+ * get_next_ino()). This case is indicated by passing NULL as inop.
+ */
+#define SHMEM_INO_BATCH 1024
+static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
 {
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
-       if (sbinfo->max_inodes) {
+       ino_t ino;
+
+       if (!(sb->s_flags & SB_KERNMOUNT)) {
                spin_lock(&sbinfo->stat_lock);
                if (!sbinfo->free_inodes) {
                        spin_unlock(&sbinfo->stat_lock);
                        return -ENOSPC;
                }
                sbinfo->free_inodes--;
+               if (inop) {
+                       ino = sbinfo->next_ino++;
+                       if (unlikely(is_zero_ino(ino)))
+                               ino = sbinfo->next_ino++;
+                       if (unlikely(!sbinfo->full_inums &&
+                                    ino > UINT_MAX)) {
+                               /*
+                                * Emulate get_next_ino uint wraparound for
+                                * compatibility
+                                */
+                               if (IS_ENABLED(CONFIG_64BIT))
+                                       pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
+                                               __func__, MINOR(sb->s_dev));
+                               sbinfo->next_ino = 1;
+                               ino = sbinfo->next_ino++;
+                       }
+                       *inop = ino;
+               }
                spin_unlock(&sbinfo->stat_lock);
+       } else if (inop) {
+               /*
+                * __shmem_file_setup, one of our callers, is lock-free: it
+                * doesn't hold stat_lock in shmem_reserve_inode since
+                * max_inodes is always 0, and is called from potentially
+                * unknown contexts. As such, use a per-cpu batched allocator
+                * which doesn't require the per-sb stat_lock unless we are at
+                * the batch boundary.
+                *
+                * We don't need to worry about inode{32,64} since SB_KERNMOUNT
+                * shmem mounts are not exposed to userspace, so things like
+                * glibc compatibility are not a concern.
+                */
+               ino_t *next_ino;
+               next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
+               ino = *next_ino;
+               if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
+                       spin_lock(&sbinfo->stat_lock);
+                       ino = sbinfo->next_ino;
+                       sbinfo->next_ino += SHMEM_INO_BATCH;
+                       spin_unlock(&sbinfo->stat_lock);
+                       if (unlikely(is_zero_ino(ino)))
+                               ino++;
+               }
+               *inop = ino;
+               *next_ino = ++ino;
+               put_cpu();
        }
+
        return 0;
 }
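
A rough stand-alone illustration of the batching above (not kernel code; the per-cpu pointer is modelled as a plain array and preemption/migration is ignored): each "cpu" hands out inode numbers from a private window and only takes the shared lock once per INO_BATCH allocations.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define INO_BATCH	1024
#define NCPUS		4

static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t next_ino = 1;		/* shared pool, lock protected */
static uint64_t percpu_next[NCPUS];	/* per-"cpu" cached window */

static uint64_t reserve_ino(int cpu)
{
	uint64_t ino = percpu_next[cpu];

	if (ino % INO_BATCH == 0) {	/* window empty or exhausted: refill */
		pthread_mutex_lock(&stat_lock);
		ino = next_ino;
		next_ino += INO_BATCH;
		pthread_mutex_unlock(&stat_lock);
		if (ino == 0)		/* skip the invalid zero ino */
			ino++;
	}
	percpu_next[cpu] = ino + 1;
	return ino;
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		printf("cpu0: %llu\n", (unsigned long long)reserve_ino(0));
	printf("cpu1: %llu\n", (unsigned long long)reserve_ino(1));
	return 0;
}
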
 
@@ -2222,13 +2282,14 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
        struct inode *inode;
        struct shmem_inode_info *info;
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+       ino_t ino;
 
-       if (shmem_reserve_inode(sb))
+       if (shmem_reserve_inode(sb, &ino))
                return NULL;
 
        inode = new_inode(sb);
        if (inode) {
-               inode->i_ino = get_next_ino();
+               inode->i_ino = ino;
                inode_init_owner(inode, dir, mode);
                inode->i_blocks = 0;
                inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
@@ -2932,7 +2993,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
         * first link must skip that, to get the accounting right.
         */
        if (inode->i_nlink) {
-               ret = shmem_reserve_inode(inode->i_sb);
+               ret = shmem_reserve_inode(inode->i_sb, NULL);
                if (ret)
                        goto out;
        }
@@ -3347,6 +3408,8 @@ enum shmem_param {
        Opt_nr_inodes,
        Opt_size,
        Opt_uid,
+       Opt_inode32,
+       Opt_inode64,
 };
 
 static const struct constant_table shmem_param_enums_huge[] = {
@@ -3366,6 +3429,8 @@ const struct fs_parameter_spec shmem_fs_parameters[] = {
        fsparam_string("nr_inodes",     Opt_nr_inodes),
        fsparam_string("size",          Opt_size),
        fsparam_u32   ("uid",           Opt_uid),
+       fsparam_flag  ("inode32",       Opt_inode32),
+       fsparam_flag  ("inode64",       Opt_inode64),
        {}
 };
 
@@ -3437,6 +3502,18 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
                        break;
                }
                goto unsupported_parameter;
+       case Opt_inode32:
+               ctx->full_inums = false;
+               ctx->seen |= SHMEM_SEEN_INUMS;
+               break;
+       case Opt_inode64:
+               if (sizeof(ino_t) < 8) {
+                       return invalfc(fc,
+                                      "Cannot use inode64 with <64bit inums in kernel\n");
+               }
+               ctx->full_inums = true;
+               ctx->seen |= SHMEM_SEEN_INUMS;
+               break;
        }
        return 0;
 
@@ -3528,8 +3605,16 @@ static int shmem_reconfigure(struct fs_context *fc)
                }
        }
 
+       if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
+           sbinfo->next_ino > UINT_MAX) {
+               err = "Current inum too high to switch to 32-bit inums";
+               goto out;
+       }
+
        if (ctx->seen & SHMEM_SEEN_HUGE)
                sbinfo->huge = ctx->huge;
+       if (ctx->seen & SHMEM_SEEN_INUMS)
+               sbinfo->full_inums = ctx->full_inums;
        if (ctx->seen & SHMEM_SEEN_BLOCKS)
                sbinfo->max_blocks  = ctx->blocks;
        if (ctx->seen & SHMEM_SEEN_INODES) {
@@ -3569,6 +3654,29 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
        if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
                seq_printf(seq, ",gid=%u",
                                from_kgid_munged(&init_user_ns, sbinfo->gid));
+
+       /*
+        * Showing inode{64,32} might be useful even if it's the system default,
+        * since then people don't have to resort to checking both here and
+        * /proc/config.gz to confirm 64-bit inums were successfully applied
+        * (which may not even exist if IKCONFIG_PROC isn't enabled).
+        *
+        * We hide it when inode64 isn't the default and we are using 32-bit
+        * inodes, since that probably just means the feature isn't even under
+        * consideration.
+        *
+        * As such:
+        *
+        *                     +-----------------+-----------------+
+        *                     | TMPFS_INODE64=y | TMPFS_INODE64=n |
+        *  +------------------+-----------------+-----------------+
+        *  | full_inums=true  | show            | show            |
+        *  | full_inums=false | show            | hide            |
+        *  +------------------+-----------------+-----------------+
+        *
+        */
+       if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
+               seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
        if (sbinfo->huge)
@@ -3584,6 +3692,7 @@ static void shmem_put_super(struct super_block *sb)
 {
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 
+       free_percpu(sbinfo->ino_batch);
        percpu_counter_destroy(&sbinfo->used_blocks);
        mpol_put(sbinfo->mpol);
        kfree(sbinfo);
@@ -3616,6 +3725,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
                        ctx->blocks = shmem_default_max_blocks();
                if (!(ctx->seen & SHMEM_SEEN_INODES))
                        ctx->inodes = shmem_default_max_inodes();
+               if (!(ctx->seen & SHMEM_SEEN_INUMS))
+                       ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
        } else {
                sb->s_flags |= SB_NOUSER;
        }
@@ -3626,8 +3737,14 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
 #endif
        sbinfo->max_blocks = ctx->blocks;
        sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
+       if (sb->s_flags & SB_KERNMOUNT) {
+               sbinfo->ino_batch = alloc_percpu(ino_t);
+               if (!sbinfo->ino_batch)
+                       goto failed;
+       }
        sbinfo->uid = ctx->uid;
        sbinfo->gid = ctx->gid;
+       sbinfo->full_inums = ctx->full_inums;
        sbinfo->mode = ctx->mode;
        sbinfo->huge = ctx->huge;
        sbinfo->mpol = ctx->mpol;
@@ -4128,7 +4245,7 @@ EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
 
 /**
  * shmem_zero_setup - setup a shared anonymous mapping
- * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
+ * @vma: the vma to be mmapped is prepared by do_mmap
  */
 int shmem_zero_setup(struct vm_area_struct *vma)
 {
index 44406d9..9b5cd4b 100644 (file)
 #include "shuffle.h"
 
 DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
-static unsigned long shuffle_state __ro_after_init;
-
-/*
- * Depending on the architecture, module parameter parsing may run
- * before, or after the cache detection. SHUFFLE_FORCE_DISABLE prevents,
- * or reverts the enabling of the shuffle implementation. SHUFFLE_ENABLE
- * attempts to turn on the implementation, but aborts if it finds
- * SHUFFLE_FORCE_DISABLE already set.
- */
-__meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
-{
-       if (ctl == SHUFFLE_FORCE_DISABLE)
-               set_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state);
-
-       if (test_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state)) {
-               if (test_and_clear_bit(SHUFFLE_ENABLE, &shuffle_state))
-                       static_branch_disable(&page_alloc_shuffle_key);
-       } else if (ctl == SHUFFLE_ENABLE
-                       && !test_and_set_bit(SHUFFLE_ENABLE, &shuffle_state))
-               static_branch_enable(&page_alloc_shuffle_key);
-}
 
 static bool shuffle_param;
 static int shuffle_show(char *buffer, const struct kernel_param *kp)
 {
-       return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state)
-                       ? 'Y' : 'N');
+       return sprintf(buffer, "%c\n", shuffle_param ? 'Y' : 'N');
 }
 
 static __meminit int shuffle_store(const char *val,
@@ -47,9 +25,7 @@ static __meminit int shuffle_store(const char *val,
        if (rc < 0)
                return rc;
        if (shuffle_param)
-               page_alloc_shuffle(SHUFFLE_ENABLE);
-       else
-               page_alloc_shuffle(SHUFFLE_FORCE_DISABLE);
+               static_branch_enable(&page_alloc_shuffle_key);
        return 0;
 }
 module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
@@ -58,25 +34,25 @@ module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
  * For two pages to be swapped in the shuffle, they must be free (on a
  * 'free_area' lru), have the same order, and have the same migratetype.
  */
-static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order)
+static struct page * __meminit shuffle_valid_page(struct zone *zone,
+                                                 unsigned long pfn, int order)
 {
-       struct page *page;
+       struct page *page = pfn_to_online_page(pfn);
 
        /*
         * Given we're dealing with randomly selected pfns in a zone we
         * need to ask questions like...
         */
 
-       /* ...is the pfn even in the memmap? */
-       if (!pfn_valid_within(pfn))
+       /* ... is the page managed by the buddy? */
+       if (!page)
                return NULL;
 
-       /* ...is the pfn in a present section or a hole? */
-       if (!pfn_in_present_section(pfn))
+       /* ... is the page assigned to the same zone? */
+       if (page_zone(page) != zone)
                return NULL;
 
        /* ...is the page free and currently on a free_area list? */
-       page = pfn_to_page(pfn);
        if (!PageBuddy(page))
                return NULL;
 
@@ -123,7 +99,7 @@ void __meminit __shuffle_zone(struct zone *z)
                 * page_j randomly selected in the span @zone_start_pfn to
                 * @spanned_pages.
                 */
-               page_i = shuffle_valid_page(i, order);
+               page_i = shuffle_valid_page(z, i, order);
                if (!page_i)
                        continue;
 
@@ -137,7 +113,7 @@ void __meminit __shuffle_zone(struct zone *z)
                        j = z->zone_start_pfn +
                                ALIGN_DOWN(get_random_long() % z->spanned_pages,
                                                order_pages);
-                       page_j = shuffle_valid_page(j, order);
+                       page_j = shuffle_valid_page(z, j, order);
                        if (page_j && page_j != page_i)
                                break;
                }
index 4d79f03..71b784f 100644 (file)
@@ -4,23 +4,10 @@
 #define _MM_SHUFFLE_H
 #include <linux/jump_label.h>
 
-/*
- * SHUFFLE_ENABLE is called from the command line enabling path, or by
- * platform-firmware enabling that indicates the presence of a
- * direct-mapped memory-side-cache. SHUFFLE_FORCE_DISABLE is called from
- * the command line path and overrides any previous or future
- * SHUFFLE_ENABLE.
- */
-enum mm_shuffle_ctl {
-       SHUFFLE_ENABLE,
-       SHUFFLE_FORCE_DISABLE,
-};
-
 #define SHUFFLE_ORDER (MAX_ORDER-1)
 
 #ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
 DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
-extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl);
 extern void __shuffle_free_memory(pg_data_t *pgdat);
 extern bool shuffle_pick_tail(void);
 static inline void shuffle_free_memory(pg_data_t *pgdat)
@@ -58,10 +45,6 @@ static inline void shuffle_zone(struct zone *z)
 {
 }
 
-static inline void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
-{
-}
-
 static inline bool is_shuffle_order(int order)
 {
        return false;
index 9350062..3160dff 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -588,6 +588,16 @@ static int transfer_objects(struct array_cache *to,
        return nr;
 }
 
+/* &alien->lock must be held by alien callers. */
+static __always_inline void __free_one(struct array_cache *ac, void *objp)
+{
+       /* Avoid trivial double-free. */
+       if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
+           WARN_ON_ONCE(ac->avail > 0 && ac->entry[ac->avail - 1] == objp))
+               return;
+       ac->entry[ac->avail++] = objp;
+}
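
A stand-alone sketch of the check above (not kernel code; in the kernel it is active only with CONFIG_SLAB_FREELIST_HARDENED and, as noted, only catches the immediate back-to-back case):

#include <stdio.h>

#define CACHE_SLOTS	16

struct array_cache_sketch {
	unsigned int avail;
	void *entry[CACHE_SLOTS];
};

/* Returns 0 on success, -1 on a detected back-to-back double free. */
static int free_one(struct array_cache_sketch *ac, void *objp)
{
	if (ac->avail > 0 && ac->entry[ac->avail - 1] == objp) {
		fprintf(stderr, "double free of %p detected\n", objp);
		return -1;
	}
	if (ac->avail >= CACHE_SLOTS)	/* flushing a full cache is omitted here */
		return -1;
	ac->entry[ac->avail++] = objp;
	return 0;
}

int main(void)
{
	struct array_cache_sketch ac = { 0 };
	int x;

	free_one(&ac, &x);
	free_one(&ac, &x);	/* caught: same object freed twice in a row */
	return 0;
}
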
+
 #ifndef CONFIG_NUMA
 
 #define drain_alien_cache(cachep, alien) do { } while (0)
@@ -767,7 +777,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
                        STATS_INC_ACOVERFLOW(cachep);
                        __drain_alien_cache(cachep, ac, page_node, &list);
                }
-               ac->entry[ac->avail++] = objp;
+               __free_one(ac, objp);
                spin_unlock(&alien->lock);
                slabs_destroy(cachep, &list);
        } else {
@@ -1050,7 +1060,7 @@ int slab_prepare_cpu(unsigned int cpu)
  * offline.
  *
  * Even if all the cpus of a node are down, we don't free the
- * kmem_list3 of any cache. This to avoid a race between cpu_down, and
+ * kmem_cache_node of any cache. This to avoid a race between cpu_down, and
  * a kmalloc allocation from another cpu for memory from the node of
  * the cpu going down.  The list3 structure is usually allocated from
  * kmem_cache_create() and gets destroyed at kmem_cache_destroy().
@@ -1239,7 +1249,6 @@ void __init kmem_cache_init(void)
                                  nr_node_ids * sizeof(struct kmem_cache_node *),
                                  SLAB_HWCACHE_ALIGN, 0, 0);
        list_add(&kmem_cache->list, &slab_caches);
-       memcg_link_cache(kmem_cache, NULL);
        slab_state = PARTIAL;
 
        /*
@@ -1370,11 +1379,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
                return NULL;
        }
 
-       if (charge_slab_page(page, flags, cachep->gfporder, cachep)) {
-               __free_pages(page, cachep->gfporder);
-               return NULL;
-       }
-
+       account_slab_page(page, cachep->gfporder, cachep);
        __SetPageSlab(page);
        /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
        if (sk_memalloc_socks() && page_is_pfmemalloc(page))
@@ -1398,7 +1403,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
 
        if (current->reclaim_state)
                current->reclaim_state->reclaimed_slab += 1 << order;
-       uncharge_slab_page(page, order, cachep);
+       unaccount_slab_page(page, order, cachep);
        __free_pages(page, order);
 }
 
@@ -2243,17 +2248,6 @@ int __kmem_cache_shrink(struct kmem_cache *cachep)
        return (ret ? 1 : 0);
 }
 
-#ifdef CONFIG_MEMCG
-void __kmemcg_cache_deactivate(struct kmem_cache *cachep)
-{
-       __kmem_cache_shrink(cachep);
-}
-
-void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
-{
-}
-#endif
-
 int __kmem_cache_shutdown(struct kmem_cache *cachep)
 {
        return __kmem_cache_shrink(cachep);
@@ -2579,13 +2573,9 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
         * Be lazy and only check for valid flags here,  keeping it out of the
         * critical path in kmem_cache_alloc().
         */
-       if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
-               gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
-               flags &= ~GFP_SLAB_BUG_MASK;
-               pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
-                               invalid_mask, &invalid_mask, flags, &flags);
-               dump_stack();
-       }
+       if (unlikely(flags & GFP_SLAB_BUG_MASK))
+               flags = kmalloc_fix_flags(flags);
+
        WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
        local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
 
@@ -3222,9 +3212,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
        unsigned long save_flags;
        void *ptr;
        int slab_node = numa_mem_id();
+       struct obj_cgroup *objcg = NULL;
 
        flags &= gfp_allowed_mask;
-       cachep = slab_pre_alloc_hook(cachep, flags);
+       cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
        if (unlikely(!cachep))
                return NULL;
 
@@ -3260,7 +3251,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
        if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
                memset(ptr, 0, cachep->object_size);
 
-       slab_post_alloc_hook(cachep, flags, 1, &ptr);
+       slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr);
        return ptr;
 }
 
@@ -3301,9 +3292,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
 {
        unsigned long save_flags;
        void *objp;
+       struct obj_cgroup *objcg = NULL;
 
        flags &= gfp_allowed_mask;
-       cachep = slab_pre_alloc_hook(cachep, flags);
+       cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
        if (unlikely(!cachep))
                return NULL;
 
@@ -3317,7 +3309,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
        if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
                memset(objp, 0, cachep->object_size);
 
-       slab_post_alloc_hook(cachep, flags, 1, &objp);
+       slab_post_alloc_hook(cachep, objcg, flags, 1, &objp);
        return objp;
 }
 
@@ -3426,6 +3418,11 @@ static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
        if (kasan_slab_free(cachep, objp, _RET_IP_))
                return;
 
+       /* Use KCSAN to help debug racy use-after-free. */
+       if (!(cachep->flags & SLAB_TYPESAFE_BY_RCU))
+               __kcsan_check_access(objp, cachep->object_size,
+                                    KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
+
        ___cache_free(cachep, objp, caller);
 }
 
@@ -3439,6 +3436,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
                memset(objp, 0, cachep->object_size);
        kmemleak_free_recursive(objp, cachep->flags);
        objp = cache_free_debugcheck(cachep, objp, caller);
+       memcg_slab_free_hook(cachep, virt_to_head_page(objp), objp);
 
        /*
         * Skip calling cache_free_alien() when the platform is not numa.
@@ -3466,7 +3464,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
                }
        }
 
-       ac->entry[ac->avail++] = objp;
+       __free_one(ac, objp);
 }
 
 /**
@@ -3504,8 +3502,9 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
                          void **p)
 {
        size_t i;
+       struct obj_cgroup *objcg = NULL;
 
-       s = slab_pre_alloc_hook(s, flags);
+       s = slab_pre_alloc_hook(s, &objcg, size, flags);
        if (!s)
                return 0;
 
@@ -3528,13 +3527,13 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
                for (i = 0; i < size; i++)
                        memset(p[i], 0, s->object_size);
 
-       slab_post_alloc_hook(s, flags, size, p);
+       slab_post_alloc_hook(s, objcg, flags, size, p);
        /* FIXME: Trace call missing. Christoph would like a bulk variant */
        return size;
 error:
        local_irq_enable();
        cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
-       slab_post_alloc_hook(s, flags, i, p);
+       slab_post_alloc_hook(s, objcg, flags, i, p);
        __kmem_cache_free_bulk(s, i, p);
        return 0;
 }
@@ -3796,8 +3795,8 @@ fail:
 }
 
 /* Always called with the slab_mutex held */
-static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
-                               int batchcount, int shared, gfp_t gfp)
+static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
+                           int batchcount, int shared, gfp_t gfp)
 {
        struct array_cache __percpu *cpu_cache, *prev;
        int cpu;
@@ -3842,29 +3841,6 @@ setup_node:
        return setup_kmem_cache_nodes(cachep, gfp);
 }
 
-static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
-                               int batchcount, int shared, gfp_t gfp)
-{
-       int ret;
-       struct kmem_cache *c;
-
-       ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
-
-       if (slab_state < FULL)
-               return ret;
-
-       if ((ret < 0) || !is_root_cache(cachep))
-               return ret;
-
-       lockdep_assert_held(&slab_mutex);
-       for_each_memcg_cache(c, cachep) {
-               /* return value determined by the root cache only */
-               __do_tune_cpucache(c, limit, batchcount, shared, gfp);
-       }
-
-       return ret;
-}
-
 /* Called with slab_mutex held always */
 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
 {
@@ -3877,13 +3853,6 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
        if (err)
                goto end;
 
-       if (!is_root_cache(cachep)) {
-               struct kmem_cache *root = memcg_root_cache(cachep);
-               limit = root->limit;
-               shared = root->shared;
-               batchcount = root->batchcount;
-       }
-
        if (limit && shared && batchcount)
                goto skip_setup;
        /*
index 74f7e09..6cc323f 100644 (file)
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -30,69 +30,6 @@ struct kmem_cache {
        struct list_head list;  /* List of all slab caches on the system */
 };
 
-#else /* !CONFIG_SLOB */
-
-struct memcg_cache_array {
-       struct rcu_head rcu;
-       struct kmem_cache *entries[0];
-};
-
-/*
- * This is the main placeholder for memcg-related information in kmem caches.
- * Both the root cache and the child caches will have it. For the root cache,
- * this will hold a dynamically allocated array large enough to hold
- * information about the currently limited memcgs in the system. To allow the
- * array to be accessed without taking any locks, on relocation we free the old
- * version only after a grace period.
- *
- * Root and child caches hold different metadata.
- *
- * @root_cache:        Common to root and child caches.  NULL for root, pointer to
- *             the root cache for children.
- *
- * The following fields are specific to root caches.
- *
- * @memcg_caches: kmemcg ID indexed table of child caches.  This table is
- *             used to index child cachces during allocation and cleared
- *             early during shutdown.
- *
- * @root_caches_node: List node for slab_root_caches list.
- *
- * @children:  List of all child caches.  While the child caches are also
- *             reachable through @memcg_caches, a child cache remains on
- *             this list until it is actually destroyed.
- *
- * The following fields are specific to child caches.
- *
- * @memcg:     Pointer to the memcg this cache belongs to.
- *
- * @children_node: List node for @root_cache->children list.
- *
- * @kmem_caches_node: List node for @memcg->kmem_caches list.
- */
-struct memcg_cache_params {
-       struct kmem_cache *root_cache;
-       union {
-               struct {
-                       struct memcg_cache_array __rcu *memcg_caches;
-                       struct list_head __root_caches_node;
-                       struct list_head children;
-                       bool dying;
-               };
-               struct {
-                       struct mem_cgroup *memcg;
-                       struct list_head children_node;
-                       struct list_head kmem_caches_node;
-                       struct percpu_ref refcnt;
-
-                       void (*work_fn)(struct kmem_cache *);
-                       union {
-                               struct rcu_head rcu_head;
-                               struct work_struct work;
-                       };
-               };
-       };
-};
 #endif /* CONFIG_SLOB */
 
 #ifdef CONFIG_SLAB
@@ -109,6 +46,7 @@ struct memcg_cache_params {
 #include <linux/kmemleak.h>
 #include <linux/random.h>
 #include <linux/sched/mm.h>
+#include <linux/kmemleak.h>
 
 /*
  * State of the slab allocator.
@@ -152,6 +90,7 @@ void create_kmalloc_caches(slab_flags_t);
 struct kmem_cache *kmalloc_slab(size_t, gfp_t);
 #endif
 
+gfp_t kmalloc_fix_flags(gfp_t flags);
 
 /* Functions provided by the slab allocators */
 int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
@@ -234,10 +173,7 @@ bool __kmem_cache_empty(struct kmem_cache *);
 int __kmem_cache_shutdown(struct kmem_cache *);
 void __kmem_cache_release(struct kmem_cache *);
 int __kmem_cache_shrink(struct kmem_cache *);
-void __kmemcg_cache_deactivate(struct kmem_cache *s);
-void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
 void slab_kmem_cache_release(struct kmem_cache *);
-void kmem_cache_shrink_all(struct kmem_cache *s);
 
 struct seq_file;
 struct file;
@@ -272,199 +208,208 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
 static inline int cache_vmstat_idx(struct kmem_cache *s)
 {
        return (s->flags & SLAB_RECLAIM_ACCOUNT) ?
-               NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE;
+               NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B;
 }
 
-#ifdef CONFIG_MEMCG_KMEM
-
-/* List of all root caches. */
-extern struct list_head                slab_root_caches;
-#define root_caches_node       memcg_params.__root_caches_node
+#ifdef CONFIG_SLUB_DEBUG
+#ifdef CONFIG_SLUB_DEBUG_ON
+DECLARE_STATIC_KEY_TRUE(slub_debug_enabled);
+#else
+DECLARE_STATIC_KEY_FALSE(slub_debug_enabled);
+#endif
+extern void print_tracking(struct kmem_cache *s, void *object);
+#else
+static inline void print_tracking(struct kmem_cache *s, void *object)
+{
+}
+#endif
 
 /*
- * Iterate over all memcg caches of the given root cache. The caller must hold
- * slab_mutex.
+ * Returns true if any of the specified slub_debug flags is enabled for the
+ * cache. Use only for flags parsed by setup_slub_debug() as it also enables
+ * the static key.
  */
-#define for_each_memcg_cache(iter, root) \
-       list_for_each_entry(iter, &(root)->memcg_params.children, \
-                           memcg_params.children_node)
-
-static inline bool is_root_cache(struct kmem_cache *s)
+static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags)
 {
-       return !s->memcg_params.root_cache;
+#ifdef CONFIG_SLUB_DEBUG
+       VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS));
+       if (static_branch_unlikely(&slub_debug_enabled))
+               return s->flags & flags;
+#endif
+       return false;
 }
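
kmem_cache_debug_flags() above reads s->flags only after a static branch confirms that slub_debug was enabled at all, so with debugging off the check costs a single patched-out jump. A hedged sketch of the pattern with made-up names (kernel-style, built on <linux/jump_label.h>):

#include <linux/jump_label.h>

/* Flipped at most once, e.g. from a boot parameter handler. */
DEFINE_STATIC_KEY_FALSE(my_debug_enabled);

static inline bool my_debug_check(unsigned long obj_flags, unsigned long mask)
{
	/* Compiles to a no-op branch unless the key has been enabled. */
	if (static_branch_unlikely(&my_debug_enabled))
		return obj_flags & mask;
	return false;
}
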
 
-static inline bool slab_equal_or_root(struct kmem_cache *s,
-                                     struct kmem_cache *p)
+#ifdef CONFIG_MEMCG_KMEM
+static inline struct obj_cgroup **page_obj_cgroups(struct page *page)
 {
-       return p == s || p == s->memcg_params.root_cache;
+       /*
+        * page->mem_cgroup and page->obj_cgroups are sharing the same
+        * space. To distinguish between them in case we don't know for sure
+        * that the page is a slab page (e.g. page_cgroup_ino()), let's
+        * always set the lowest bit of obj_cgroups.
+        */
+       return (struct obj_cgroup **)
+               ((unsigned long)page->obj_cgroups & ~0x1UL);
 }
 
-/*
- * We use suffixes to the name in memcg because we can't have caches
- * created in the system with the same name. But when we print them
- * locally, better refer to them with the base name
- */
-static inline const char *cache_name(struct kmem_cache *s)
+static inline bool page_has_obj_cgroups(struct page *page)
 {
-       if (!is_root_cache(s))
-               s = s->memcg_params.root_cache;
-       return s->name;
+       return ((unsigned long)page->obj_cgroups & 0x1UL);
 }
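
Because page->obj_cgroups shares storage with page->mem_cgroup, the helpers above tag the obj_cgroups pointer by setting its lowest bit and mask the bit off again on every read; pointer alignment guarantees that bit is otherwise zero. A standalone demonstration of the tag/untag arithmetic (illustrative userspace C, not the kernel code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static void *tag_ptr(void *p)       { return (void *)((uintptr_t)p | 0x1UL); }
static void *untag_ptr(void *p)     { return (void *)((uintptr_t)p & ~0x1UL); }
static int   ptr_is_tagged(void *p) { return (uintptr_t)p & 0x1UL; }

int main(void)
{
	void **vec = calloc(4, sizeof(*vec));	/* stands in for the obj_cgroups array */
	void *slot = tag_ptr(vec);		/* value that would be stored in the page */

	assert(ptr_is_tagged(slot));
	assert(untag_ptr(slot) == (void *)vec);
	printf("stored %p, real array at %p\n", slot, untag_ptr(slot));
	free(vec);
	return 0;
}
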
 
-static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
+                                gfp_t gfp);
+
+static inline void memcg_free_page_obj_cgroups(struct page *page)
 {
-       if (is_root_cache(s))
-               return s;
-       return s->memcg_params.root_cache;
+       kfree(page_obj_cgroups(page));
+       page->obj_cgroups = NULL;
 }
 
-/*
- * Expects a pointer to a slab page. Please note, that PageSlab() check
- * isn't sufficient, as it returns true also for tail compound slab pages,
- * which do not have slab_cache pointer set.
- * So this function assumes that the page can pass PageSlab() && !PageTail()
- * check.
- *
- * The kmem_cache can be reparented asynchronously. The caller must ensure
- * the memcg lifetime, e.g. by taking rcu_read_lock() or cgroup_mutex.
- */
-static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
+static inline size_t obj_full_size(struct kmem_cache *s)
 {
-       struct kmem_cache *s;
-
-       s = READ_ONCE(page->slab_cache);
-       if (s && !is_root_cache(s))
-               return READ_ONCE(s->memcg_params.memcg);
-
-       return NULL;
+       /*
+        * For each accounted object there is an extra space which is used
+        * to store obj_cgroup membership. Charge it too.
+        */
+       return s->size + sizeof(struct obj_cgroup *);
 }
 
-/*
- * Charge the slab page belonging to the non-root kmem_cache.
- * Can be called for non-root kmem_caches only.
- */
-static __always_inline int memcg_charge_slab(struct page *page,
-                                            gfp_t gfp, int order,
-                                            struct kmem_cache *s)
+static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+                                                          size_t objects,
+                                                          gfp_t flags)
 {
-       int nr_pages = 1 << order;
-       struct mem_cgroup *memcg;
-       struct lruvec *lruvec;
-       int ret;
-
-       rcu_read_lock();
-       memcg = READ_ONCE(s->memcg_params.memcg);
-       while (memcg && !css_tryget_online(&memcg->css))
-               memcg = parent_mem_cgroup(memcg);
-       rcu_read_unlock();
+       struct obj_cgroup *objcg;
 
-       if (unlikely(!memcg || mem_cgroup_is_root(memcg))) {
-               mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
-                                   nr_pages);
-               percpu_ref_get_many(&s->memcg_params.refcnt, nr_pages);
-               return 0;
-       }
+       if (memcg_kmem_bypass())
+               return NULL;
 
-       ret = memcg_kmem_charge(memcg, gfp, nr_pages);
-       if (ret)
-               goto out;
+       objcg = get_obj_cgroup_from_current();
+       if (!objcg)
+               return NULL;
 
-       lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
-       mod_lruvec_state(lruvec, cache_vmstat_idx(s), nr_pages);
+       if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) {
+               obj_cgroup_put(objcg);
+               return NULL;
+       }
 
-       /* transer try_charge() page references to kmem_cache */
-       percpu_ref_get_many(&s->memcg_params.refcnt, nr_pages);
-       css_put_many(&memcg->css, nr_pages);
-out:
-       css_put(&memcg->css);
-       return ret;
+       return objcg;
 }
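
obj_full_size() charges one extra pointer per object for its obj_cgroup slot, and the pre-alloc hook charges the whole batch up front; the post-alloc hook later refunds any objects that did not materialize. With invented numbers the arithmetic looks like this (purely illustrative C):

#include <stdio.h>

int main(void)
{
	size_t obj_size = 192;				/* hypothetical s->size */
	size_t full = obj_size + sizeof(void *);	/* obj_full_size() analogue */
	size_t objects = 16;				/* bulk request */

	printf("per object: %zu bytes, charged up front: %zu bytes\n",
	       full, objects * full);
	return 0;
}
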
 
-/*
- * Uncharge a slab page belonging to a non-root kmem_cache.
- * Can be called for non-root kmem_caches only.
- */
-static __always_inline void memcg_uncharge_slab(struct page *page, int order,
-                                               struct kmem_cache *s)
+static inline void mod_objcg_state(struct obj_cgroup *objcg,
+                                  struct pglist_data *pgdat,
+                                  int idx, int nr)
 {
-       int nr_pages = 1 << order;
        struct mem_cgroup *memcg;
        struct lruvec *lruvec;
 
        rcu_read_lock();
-       memcg = READ_ONCE(s->memcg_params.memcg);
-       if (likely(!mem_cgroup_is_root(memcg))) {
-               lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
-               mod_lruvec_state(lruvec, cache_vmstat_idx(s), -nr_pages);
-               memcg_kmem_uncharge(memcg, nr_pages);
-       } else {
-               mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
-                                   -nr_pages);
-       }
+       memcg = obj_cgroup_memcg(objcg);
+       lruvec = mem_cgroup_lruvec(memcg, pgdat);
+       mod_memcg_lruvec_state(lruvec, idx, nr);
        rcu_read_unlock();
+}
+
+static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
+                                             struct obj_cgroup *objcg,
+                                             gfp_t flags, size_t size,
+                                             void **p)
+{
+       struct page *page;
+       unsigned long off;
+       size_t i;
+
+       if (!objcg)
+               return;
 
-       percpu_ref_put_many(&s->memcg_params.refcnt, nr_pages);
+       flags &= ~__GFP_ACCOUNT;
+       for (i = 0; i < size; i++) {
+               if (likely(p[i])) {
+                       page = virt_to_head_page(p[i]);
+
+                       if (!page_has_obj_cgroups(page) &&
+                           memcg_alloc_page_obj_cgroups(page, s, flags)) {
+                               obj_cgroup_uncharge(objcg, obj_full_size(s));
+                               continue;
+                       }
+
+                       off = obj_to_index(s, page, p[i]);
+                       obj_cgroup_get(objcg);
+                       page_obj_cgroups(page)[off] = objcg;
+                       mod_objcg_state(objcg, page_pgdat(page),
+                                       cache_vmstat_idx(s), obj_full_size(s));
+               } else {
+                       obj_cgroup_uncharge(objcg, obj_full_size(s));
+               }
+       }
+       obj_cgroup_put(objcg);
 }
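
obj_to_index() locates an object's slot in the per-page obj_cgroups array from its offset within the slab divided by the object size (the kernel avoids the runtime division with a precomputed reciprocal). A toy standalone version of the calculation:

#include <assert.h>
#include <stdio.h>

#define OBJ_SIZE 64	/* invented fixed object size */
#define NR_OBJS   8

static unsigned int toy_obj_to_index(const char *slab_base, const void *obj)
{
	return (unsigned int)(((const char *)obj - slab_base) / OBJ_SIZE);
}

int main(void)
{
	static char slab[OBJ_SIZE * NR_OBJS];	/* stands in for one slab page */
	void *third = slab + 2 * OBJ_SIZE;

	assert(toy_obj_to_index(slab, third) == 2);
	printf("object %p -> index %u\n", third, toy_obj_to_index(slab, third));
	return 0;
}
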
 
-extern void slab_init_memcg_params(struct kmem_cache *);
-extern void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg);
+static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
+                                       void *p)
+{
+       struct obj_cgroup *objcg;
+       unsigned int off;
 
-#else /* CONFIG_MEMCG_KMEM */
+       if (!memcg_kmem_enabled())
+               return;
 
-/* If !memcg, all caches are root. */
-#define slab_root_caches       slab_caches
-#define root_caches_node       list
+       if (!page_has_obj_cgroups(page))
+               return;
 
-#define for_each_memcg_cache(iter, root) \
-       for ((void)(iter), (void)(root); 0; )
+       off = obj_to_index(s, page, p);
+       objcg = page_obj_cgroups(page)[off];
+       page_obj_cgroups(page)[off] = NULL;
 
-static inline bool is_root_cache(struct kmem_cache *s)
-{
-       return true;
-}
+       if (!objcg)
+               return;
 
-static inline bool slab_equal_or_root(struct kmem_cache *s,
-                                     struct kmem_cache *p)
-{
-       return s == p;
-}
+       obj_cgroup_uncharge(objcg, obj_full_size(s));
+       mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
+                       -obj_full_size(s));
 
-static inline const char *cache_name(struct kmem_cache *s)
-{
-       return s->name;
+       obj_cgroup_put(objcg);
 }
 
-static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
+#else /* CONFIG_MEMCG_KMEM */
+static inline bool page_has_obj_cgroups(struct page *page)
 {
-       return s;
+       return false;
 }
 
-static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
+static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
 {
        return NULL;
 }
 
-static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
-                                   struct kmem_cache *s)
+static inline int memcg_alloc_page_obj_cgroups(struct page *page,
+                                              struct kmem_cache *s, gfp_t gfp)
 {
        return 0;
 }
 
-static inline void memcg_uncharge_slab(struct page *page, int order,
-                                      struct kmem_cache *s)
+static inline void memcg_free_page_obj_cgroups(struct page *page)
 {
 }
 
-static inline void slab_init_memcg_params(struct kmem_cache *s)
+static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+                                                          size_t objects,
+                                                          gfp_t flags)
 {
+       return NULL;
 }
 
-static inline void memcg_link_cache(struct kmem_cache *s,
-                                   struct mem_cgroup *memcg)
+static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
+                                             struct obj_cgroup *objcg,
+                                             gfp_t flags, size_t size,
+                                             void **p)
 {
 }
 
+static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
+                                       void *p)
+{
+}
 #endif /* CONFIG_MEMCG_KMEM */
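
The !CONFIG_MEMCG_KMEM branch above keeps every hook as an empty static inline, so slab.c and slub.c call them unconditionally and the compiler discards the calls when the feature is compiled out. The same stubbing pattern in miniature, with hypothetical names:

#ifdef CONFIG_MY_FEATURE
void my_feature_hook(void *obj);	/* real implementation elsewhere */
#else
static inline void my_feature_hook(void *obj)
{
}
#endif
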
 
 static inline struct kmem_cache *virt_to_cache(const void *obj)
@@ -478,51 +423,36 @@ static inline struct kmem_cache *virt_to_cache(const void *obj)
        return page->slab_cache;
 }
 
-static __always_inline int charge_slab_page(struct page *page,
-                                           gfp_t gfp, int order,
-                                           struct kmem_cache *s)
+static __always_inline void account_slab_page(struct page *page, int order,
+                                             struct kmem_cache *s)
 {
-       if (is_root_cache(s)) {
-               mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
-                                   1 << order);
-               return 0;
-       }
-
-       return memcg_charge_slab(page, gfp, order, s);
+       mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+                           PAGE_SIZE << order);
 }
 
-static __always_inline void uncharge_slab_page(struct page *page, int order,
-                                              struct kmem_cache *s)
+static __always_inline void unaccount_slab_page(struct page *page, int order,
+                                               struct kmem_cache *s)
 {
-       if (is_root_cache(s)) {
-               mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
-                                   -(1 << order));
-               return;
-       }
+       if (memcg_kmem_enabled())
+               memcg_free_page_obj_cgroups(page);
 
-       memcg_uncharge_slab(page, order, s);
+       mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+                           -(PAGE_SIZE << order));
 }
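
account_slab_page()/unaccount_slab_page() now update the node vmstat counters in bytes (PAGE_SIZE << order against the new NR_SLAB_*_B items) instead of pages (1 << order), which is what allows sub-page, per-object charges to land in the same counters. A quick standalone look at the two scales, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumption: 4 KiB pages */

int main(void)
{
	for (unsigned int order = 0; order <= 3; order++)
		printf("order %u: %lu pages = %lu bytes\n",
		       order, 1UL << order, PAGE_SIZE << order);
	return 0;
}
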
 
 static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
 {
        struct kmem_cache *cachep;
 
-       /*
-        * When kmemcg is not being used, both assignments should return the
-        * same value. but we don't want to pay the assignment price in that
-        * case. If it is not compiled in, the compiler should be smart enough
-        * to not do even the assignment. In that case, slab_equal_or_root
-        * will also be a constant.
-        */
-       if (!memcg_kmem_enabled() &&
-           !IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
-           !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS))
+       if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
+           !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS))
                return s;
 
        cachep = virt_to_cache(x);
-       WARN_ONCE(cachep && !slab_equal_or_root(cachep, s),
+       if (WARN(cachep && cachep != s,
                  "%s: Wrong slab cache. %s but object is from %s\n",
-                 __func__, s->name, cachep->name);
+                 __func__, s->name, cachep->name))
+               print_tracking(cachep, x);
        return cachep;
 }
 
@@ -557,7 +487,8 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
 }
 
 static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
-                                                    gfp_t flags)
+                                                    struct obj_cgroup **objcgp,
+                                                    size_t size, gfp_t flags)
 {
        flags &= gfp_allowed_mask;
 
@@ -571,13 +502,14 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
 
        if (memcg_kmem_enabled() &&
            ((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)))
-               return memcg_kmem_get_cache(s);
+               *objcgp = memcg_slab_pre_alloc_hook(s, size, flags);
 
        return s;
 }
 
-static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
-                                       size_t size, void **p)
+static inline void slab_post_alloc_hook(struct kmem_cache *s,
+                                       struct obj_cgroup *objcg,
+                                       gfp_t flags, size_t size, void **p)
 {
        size_t i;
 
@@ -590,7 +522,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
        }
 
        if (memcg_kmem_enabled())
-               memcg_kmem_put_cache(s);
+               memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
 }
 
 #ifndef CONFIG_SLOB
@@ -645,9 +577,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
 void *slab_start(struct seq_file *m, loff_t *pos);
 void *slab_next(struct seq_file *m, void *p, loff_t *pos);
 void slab_stop(struct seq_file *m, void *p);
-void *memcg_slab_start(struct seq_file *m, loff_t *pos);
-void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos);
-void memcg_slab_stop(struct seq_file *m, void *p);
 int memcg_slab_show(struct seq_file *m, void *p);
 
 #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
index fe8b684..a513f32 100644 (file)
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -26,6 +26,8 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/kmem.h>
 
+#include "internal.h"
+
 #include "slab.h"
 
 enum slab_state slab_state;
@@ -128,152 +130,6 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
        return i;
 }
 
-#ifdef CONFIG_MEMCG_KMEM
-
-LIST_HEAD(slab_root_caches);
-static DEFINE_SPINLOCK(memcg_kmem_wq_lock);
-
-static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref);
-
-void slab_init_memcg_params(struct kmem_cache *s)
-{
-       s->memcg_params.root_cache = NULL;
-       RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
-       INIT_LIST_HEAD(&s->memcg_params.children);
-       s->memcg_params.dying = false;
-}
-
-static int init_memcg_params(struct kmem_cache *s,
-                            struct kmem_cache *root_cache)
-{
-       struct memcg_cache_array *arr;
-
-       if (root_cache) {
-               int ret = percpu_ref_init(&s->memcg_params.refcnt,
-                                         kmemcg_cache_shutdown,
-                                         0, GFP_KERNEL);
-               if (ret)
-                       return ret;
-
-               s->memcg_params.root_cache = root_cache;
-               INIT_LIST_HEAD(&s->memcg_params.children_node);
-               INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
-               return 0;
-       }
-
-       slab_init_memcg_params(s);
-
-       if (!memcg_nr_cache_ids)
-               return 0;
-
-       arr = kvzalloc(sizeof(struct memcg_cache_array) +
-                      memcg_nr_cache_ids * sizeof(void *),
-                      GFP_KERNEL);
-       if (!arr)
-               return -ENOMEM;
-
-       RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr);
-       return 0;
-}
-
-static void destroy_memcg_params(struct kmem_cache *s)
-{
-       if (is_root_cache(s)) {
-               kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
-       } else {
-               mem_cgroup_put(s->memcg_params.memcg);
-               WRITE_ONCE(s->memcg_params.memcg, NULL);
-               percpu_ref_exit(&s->memcg_params.refcnt);
-       }
-}
-
-static void free_memcg_params(struct rcu_head *rcu)
-{
-       struct memcg_cache_array *old;
-
-       old = container_of(rcu, struct memcg_cache_array, rcu);
-       kvfree(old);
-}
-
-static int update_memcg_params(struct kmem_cache *s, int new_array_size)
-{
-       struct memcg_cache_array *old, *new;
-
-       new = kvzalloc(sizeof(struct memcg_cache_array) +
-                      new_array_size * sizeof(void *), GFP_KERNEL);
-       if (!new)
-               return -ENOMEM;
-
-       old = rcu_dereference_protected(s->memcg_params.memcg_caches,
-                                       lockdep_is_held(&slab_mutex));
-       if (old)
-               memcpy(new->entries, old->entries,
-                      memcg_nr_cache_ids * sizeof(void *));
-
-       rcu_assign_pointer(s->memcg_params.memcg_caches, new);
-       if (old)
-               call_rcu(&old->rcu, free_memcg_params);
-       return 0;
-}
-
-int memcg_update_all_caches(int num_memcgs)
-{
-       struct kmem_cache *s;
-       int ret = 0;
-
-       mutex_lock(&slab_mutex);
-       list_for_each_entry(s, &slab_root_caches, root_caches_node) {
-               ret = update_memcg_params(s, num_memcgs);
-               /*
-                * Instead of freeing the memory, we'll just leave the caches
-                * up to this point in an updated state.
-                */
-               if (ret)
-                       break;
-       }
-       mutex_unlock(&slab_mutex);
-       return ret;
-}
-
-void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg)
-{
-       if (is_root_cache(s)) {
-               list_add(&s->root_caches_node, &slab_root_caches);
-       } else {
-               css_get(&memcg->css);
-               s->memcg_params.memcg = memcg;
-               list_add(&s->memcg_params.children_node,
-                        &s->memcg_params.root_cache->memcg_params.children);
-               list_add(&s->memcg_params.kmem_caches_node,
-                        &s->memcg_params.memcg->kmem_caches);
-       }
-}
-
-static void memcg_unlink_cache(struct kmem_cache *s)
-{
-       if (is_root_cache(s)) {
-               list_del(&s->root_caches_node);
-       } else {
-               list_del(&s->memcg_params.children_node);
-               list_del(&s->memcg_params.kmem_caches_node);
-       }
-}
-#else
-static inline int init_memcg_params(struct kmem_cache *s,
-                                   struct kmem_cache *root_cache)
-{
-       return 0;
-}
-
-static inline void destroy_memcg_params(struct kmem_cache *s)
-{
-}
-
-static inline void memcg_unlink_cache(struct kmem_cache *s)
-{
-}
-#endif /* CONFIG_MEMCG_KMEM */
-
 /*
  * Figure out what the alignment of the objects will be given a set of
  * flags, a user specified alignment and the size of the objects.
@@ -311,9 +167,6 @@ int slab_unmergeable(struct kmem_cache *s)
        if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
                return 1;
 
-       if (!is_root_cache(s))
-               return 1;
-
        if (s->ctor)
                return 1;
 
@@ -326,14 +179,6 @@ int slab_unmergeable(struct kmem_cache *s)
        if (s->refcount < 0)
                return 1;
 
-#ifdef CONFIG_MEMCG_KMEM
-       /*
-        * Skip the dying kmem_cache.
-        */
-       if (s->memcg_params.dying)
-               return 1;
-#endif
-
        return 0;
 }
 
@@ -356,7 +201,7 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
        if (flags & SLAB_NEVER_MERGE)
                return NULL;
 
-       list_for_each_entry_reverse(s, &slab_root_caches, root_caches_node) {
+       list_for_each_entry_reverse(s, &slab_caches, list) {
                if (slab_unmergeable(s))
                        continue;
 
@@ -388,7 +233,7 @@ static struct kmem_cache *create_cache(const char *name,
                unsigned int object_size, unsigned int align,
                slab_flags_t flags, unsigned int useroffset,
                unsigned int usersize, void (*ctor)(void *),
-               struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+               struct kmem_cache *root_cache)
 {
        struct kmem_cache *s;
        int err;
@@ -408,24 +253,18 @@ static struct kmem_cache *create_cache(const char *name,
        s->useroffset = useroffset;
        s->usersize = usersize;
 
-       err = init_memcg_params(s, root_cache);
-       if (err)
-               goto out_free_cache;
-
        err = __kmem_cache_create(s, flags);
        if (err)
                goto out_free_cache;
 
        s->refcount = 1;
        list_add(&s->list, &slab_caches);
-       memcg_link_cache(s, memcg);
 out:
        if (err)
                return ERR_PTR(err);
        return s;
 
 out_free_cache:
-       destroy_memcg_params(s);
        kmem_cache_free(kmem_cache, s);
        goto out;
 }
@@ -471,7 +310,6 @@ kmem_cache_create_usercopy(const char *name,
 
        get_online_cpus();
        get_online_mems();
-       memcg_get_cache_ids();
 
        mutex_lock(&slab_mutex);
 
@@ -512,7 +350,7 @@ kmem_cache_create_usercopy(const char *name,
 
        s = create_cache(cache_name, size,
                         calculate_alignment(flags, align, size),
-                        flags, useroffset, usersize, ctor, NULL, NULL);
+                        flags, useroffset, usersize, ctor, NULL);
        if (IS_ERR(s)) {
                err = PTR_ERR(s);
                kfree_const(cache_name);
@@ -521,7 +359,6 @@ kmem_cache_create_usercopy(const char *name,
 out_unlock:
        mutex_unlock(&slab_mutex);
 
-       memcg_put_cache_ids();
        put_online_mems();
        put_online_cpus();
 
@@ -614,7 +451,6 @@ static int shutdown_cache(struct kmem_cache *s)
        if (__kmem_cache_shutdown(s) != 0)
                return -EBUSY;
 
-       memcg_unlink_cache(s);
        list_del(&s->list);
 
        if (s->flags & SLAB_TYPESAFE_BY_RCU) {
@@ -635,311 +471,9 @@ static int shutdown_cache(struct kmem_cache *s)
        return 0;
 }
 
-#ifdef CONFIG_MEMCG_KMEM
-/*
- * memcg_create_kmem_cache - Create a cache for a memory cgroup.
- * @memcg: The memory cgroup the new cache is for.
- * @root_cache: The parent of the new cache.
- *
- * This function attempts to create a kmem cache that will serve allocation
- * requests going from @memcg to @root_cache. The new cache inherits properties
- * from its parent.
- */
-void memcg_create_kmem_cache(struct mem_cgroup *memcg,
-                            struct kmem_cache *root_cache)
-{
-       static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
-       struct cgroup_subsys_state *css = &memcg->css;
-       struct memcg_cache_array *arr;
-       struct kmem_cache *s = NULL;
-       char *cache_name;
-       int idx;
-
-       get_online_cpus();
-       get_online_mems();
-
-       mutex_lock(&slab_mutex);
-
-       /*
-        * The memory cgroup could have been offlined while the cache
-        * creation work was pending.
-        */
-       if (memcg->kmem_state != KMEM_ONLINE)
-               goto out_unlock;
-
-       idx = memcg_cache_id(memcg);
-       arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
-                                       lockdep_is_held(&slab_mutex));
-
-       /*
-        * Since per-memcg caches are created asynchronously on first
-        * allocation (see memcg_kmem_get_cache()), several threads can try to
-        * create the same cache, but only one of them may succeed.
-        */
-       if (arr->entries[idx])
-               goto out_unlock;
-
-       cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf));
-       cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", root_cache->name,
-                              css->serial_nr, memcg_name_buf);
-       if (!cache_name)
-               goto out_unlock;
-
-       s = create_cache(cache_name, root_cache->object_size,
-                        root_cache->align,
-                        root_cache->flags & CACHE_CREATE_MASK,
-                        root_cache->useroffset, root_cache->usersize,
-                        root_cache->ctor, memcg, root_cache);
-       /*
-        * If we could not create a memcg cache, do not complain, because
-        * that's not critical at all as we can always proceed with the root
-        * cache.
-        */
-       if (IS_ERR(s)) {
-               kfree(cache_name);
-               goto out_unlock;
-       }
-
-       /*
-        * Since readers won't lock (see memcg_kmem_get_cache()), we need a
-        * barrier here to ensure nobody will see the kmem_cache partially
-        * initialized.
-        */
-       smp_wmb();
-       arr->entries[idx] = s;
-
-out_unlock:
-       mutex_unlock(&slab_mutex);
-
-       put_online_mems();
-       put_online_cpus();
-}
-
-static void kmemcg_workfn(struct work_struct *work)
-{
-       struct kmem_cache *s = container_of(work, struct kmem_cache,
-                                           memcg_params.work);
-
-       get_online_cpus();
-       get_online_mems();
-
-       mutex_lock(&slab_mutex);
-       s->memcg_params.work_fn(s);
-       mutex_unlock(&slab_mutex);
-
-       put_online_mems();
-       put_online_cpus();
-}
-
-static void kmemcg_rcufn(struct rcu_head *head)
-{
-       struct kmem_cache *s = container_of(head, struct kmem_cache,
-                                           memcg_params.rcu_head);
-
-       /*
-        * We need to grab blocking locks.  Bounce to ->work.  The
-        * work item shares the space with the RCU head and can't be
-        * initialized earlier.
-        */
-       INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
-       queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
-}
-
-static void kmemcg_cache_shutdown_fn(struct kmem_cache *s)
-{
-       WARN_ON(shutdown_cache(s));
-}
-
-static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref)
-{
-       struct kmem_cache *s = container_of(percpu_ref, struct kmem_cache,
-                                           memcg_params.refcnt);
-       unsigned long flags;
-
-       spin_lock_irqsave(&memcg_kmem_wq_lock, flags);
-       if (s->memcg_params.root_cache->memcg_params.dying)
-               goto unlock;
-
-       s->memcg_params.work_fn = kmemcg_cache_shutdown_fn;
-       INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
-       queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
-
-unlock:
-       spin_unlock_irqrestore(&memcg_kmem_wq_lock, flags);
-}
-
-static void kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
-{
-       __kmemcg_cache_deactivate_after_rcu(s);
-       percpu_ref_kill(&s->memcg_params.refcnt);
-}
-
-static void kmemcg_cache_deactivate(struct kmem_cache *s)
-{
-       if (WARN_ON_ONCE(is_root_cache(s)))
-               return;
-
-       __kmemcg_cache_deactivate(s);
-       s->flags |= SLAB_DEACTIVATED;
-
-       /*
-        * memcg_kmem_wq_lock is used to synchronize memcg_params.dying
-        * flag and make sure that no new kmem_cache deactivation tasks
-        * are queued (see flush_memcg_workqueue() ).
-        */
-       spin_lock_irq(&memcg_kmem_wq_lock);
-       if (s->memcg_params.root_cache->memcg_params.dying)
-               goto unlock;
-
-       s->memcg_params.work_fn = kmemcg_cache_deactivate_after_rcu;
-       call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn);
-unlock:
-       spin_unlock_irq(&memcg_kmem_wq_lock);
-}
-
-void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg,
-                                 struct mem_cgroup *parent)
-{
-       int idx;
-       struct memcg_cache_array *arr;
-       struct kmem_cache *s, *c;
-       unsigned int nr_reparented;
-
-       idx = memcg_cache_id(memcg);
-
-       get_online_cpus();
-       get_online_mems();
-
-       mutex_lock(&slab_mutex);
-       list_for_each_entry(s, &slab_root_caches, root_caches_node) {
-               arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
-                                               lockdep_is_held(&slab_mutex));
-               c = arr->entries[idx];
-               if (!c)
-                       continue;
-
-               kmemcg_cache_deactivate(c);
-               arr->entries[idx] = NULL;
-       }
-       nr_reparented = 0;
-       list_for_each_entry(s, &memcg->kmem_caches,
-                           memcg_params.kmem_caches_node) {
-               WRITE_ONCE(s->memcg_params.memcg, parent);
-               css_put(&memcg->css);
-               nr_reparented++;
-       }
-       if (nr_reparented) {
-               list_splice_init(&memcg->kmem_caches,
-                                &parent->kmem_caches);
-               css_get_many(&parent->css, nr_reparented);
-       }
-       mutex_unlock(&slab_mutex);
-
-       put_online_mems();
-       put_online_cpus();
-}
-
-static int shutdown_memcg_caches(struct kmem_cache *s)
-{
-       struct memcg_cache_array *arr;
-       struct kmem_cache *c, *c2;
-       LIST_HEAD(busy);
-       int i;
-
-       BUG_ON(!is_root_cache(s));
-
-       /*
-        * First, shutdown active caches, i.e. caches that belong to online
-        * memory cgroups.
-        */
-       arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
-                                       lockdep_is_held(&slab_mutex));
-       for_each_memcg_cache_index(i) {
-               c = arr->entries[i];
-               if (!c)
-                       continue;
-               if (shutdown_cache(c))
-                       /*
-                        * The cache still has objects. Move it to a temporary
-                        * list so as not to try to destroy it for a second
-                        * time while iterating over inactive caches below.
-                        */
-                       list_move(&c->memcg_params.children_node, &busy);
-               else
-                       /*
-                        * The cache is empty and will be destroyed soon. Clear
-                        * the pointer to it in the memcg_caches array so that
-                        * it will never be accessed even if the root cache
-                        * stays alive.
-                        */
-                       arr->entries[i] = NULL;
-       }
-
-       /*
-        * Second, shutdown all caches left from memory cgroups that are now
-        * offline.
-        */
-       list_for_each_entry_safe(c, c2, &s->memcg_params.children,
-                                memcg_params.children_node)
-               shutdown_cache(c);
-
-       list_splice(&busy, &s->memcg_params.children);
-
-       /*
-        * A cache being destroyed must be empty. In particular, this means
-        * that all per memcg caches attached to it must be empty too.
-        */
-       if (!list_empty(&s->memcg_params.children))
-               return -EBUSY;
-       return 0;
-}
-
-static void memcg_set_kmem_cache_dying(struct kmem_cache *s)
-{
-       spin_lock_irq(&memcg_kmem_wq_lock);
-       s->memcg_params.dying = true;
-       spin_unlock_irq(&memcg_kmem_wq_lock);
-}
-
-static void flush_memcg_workqueue(struct kmem_cache *s)
-{
-       /*
-        * SLAB and SLUB deactivate the kmem_caches through call_rcu. Make
-        * sure all registered rcu callbacks have been invoked.
-        */
-       rcu_barrier();
-
-       /*
-        * SLAB and SLUB create memcg kmem_caches through workqueue and SLUB
-        * deactivates the memcg kmem_caches through workqueue. Make sure all
-        * previous workitems on workqueue are processed.
-        */
-       if (likely(memcg_kmem_cache_wq))
-               flush_workqueue(memcg_kmem_cache_wq);
-
-       /*
-        * If we're racing with children kmem_cache deactivation, it might
-        * take another rcu grace period to complete their destruction.
-        * At this moment the corresponding percpu_ref_kill() call should be
-        * done, but it might take another rcu grace period to complete
-        * switching to the atomic mode.
-        * Please, note that we check without grabbing the slab_mutex. It's safe
-        * because at this moment the children list can't grow.
-        */
-       if (!list_empty(&s->memcg_params.children))
-               rcu_barrier();
-}
-#else
-static inline int shutdown_memcg_caches(struct kmem_cache *s)
-{
-       return 0;
-}
-#endif /* CONFIG_MEMCG_KMEM */
-
 void slab_kmem_cache_release(struct kmem_cache *s)
 {
        __kmem_cache_release(s);
-       destroy_memcg_params(s);
        kfree_const(s->name);
        kmem_cache_free(kmem_cache, s);
 }
@@ -960,26 +494,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
        if (s->refcount)
                goto out_unlock;
 
-#ifdef CONFIG_MEMCG_KMEM
-       memcg_set_kmem_cache_dying(s);
-
-       mutex_unlock(&slab_mutex);
-
-       put_online_mems();
-       put_online_cpus();
-
-       flush_memcg_workqueue(s);
-
-       get_online_cpus();
-       get_online_mems();
-
-       mutex_lock(&slab_mutex);
-#endif
-
-       err = shutdown_memcg_caches(s);
-       if (!err)
-               err = shutdown_cache(s);
-
+       err = shutdown_cache(s);
        if (err) {
                pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
                       s->name);
@@ -1016,43 +531,6 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
 }
 EXPORT_SYMBOL(kmem_cache_shrink);
 
-/**
- * kmem_cache_shrink_all - shrink a cache and all memcg caches for root cache
- * @s: The cache pointer
- */
-void kmem_cache_shrink_all(struct kmem_cache *s)
-{
-       struct kmem_cache *c;
-
-       if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || !is_root_cache(s)) {
-               kmem_cache_shrink(s);
-               return;
-       }
-
-       get_online_cpus();
-       get_online_mems();
-       kasan_cache_shrink(s);
-       __kmem_cache_shrink(s);
-
-       /*
-        * We have to take the slab_mutex to protect from the memcg list
-        * modification.
-        */
-       mutex_lock(&slab_mutex);
-       for_each_memcg_cache(c, s) {
-               /*
-                * Don't need to shrink deactivated memcg caches.
-                */
-               if (s->flags & SLAB_DEACTIVATED)
-                       continue;
-               kasan_cache_shrink(c);
-               __kmem_cache_shrink(c);
-       }
-       mutex_unlock(&slab_mutex);
-       put_online_mems();
-       put_online_cpus();
-}
-
 bool slab_is_available(void)
 {
        return slab_state >= UP;
@@ -1081,8 +559,6 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name,
        s->useroffset = useroffset;
        s->usersize = usersize;
 
-       slab_init_memcg_params(s);
-
        err = __kmem_cache_create(s, flags);
 
        if (err)
@@ -1103,7 +579,6 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
 
        create_boot_cache(s, name, size, flags, useroffset, usersize);
        list_add(&s->list, &slab_caches);
-       memcg_link_cache(s, NULL);
        s->refcount = 1;
        return s;
 }
@@ -1332,6 +807,18 @@ void __init create_kmalloc_caches(slab_flags_t flags)
 }
 #endif /* !CONFIG_SLOB */
 
+gfp_t kmalloc_fix_flags(gfp_t flags)
+{
+       gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
+
+       flags &= ~GFP_SLAB_BUG_MASK;
+       pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
+                       invalid_mask, &invalid_mask, flags, &flags);
+       dump_stack();
+
+       return flags;
+}
+
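
kmalloc_fix_flags() warns about the caller (including a stack dump) and returns the gfp mask with the GFP_SLAB_BUG_MASK bits cleared so the allocation can still proceed; kmalloc_order() below applies it before calling the page allocator. The masking itself is plain bit arithmetic, sketched here with invented flag values (the real bits live in include/linux/gfp.h):

#include <stdio.h>

#define FAKE_GFP_HIGHMEM 0x02u		/* invented value */
#define FAKE_GFP_KERNEL  0x10u		/* invented value */
#define FAKE_BUG_MASK    FAKE_GFP_HIGHMEM

static unsigned int fix_flags(unsigned int flags)
{
	unsigned int invalid = flags & FAKE_BUG_MASK;

	if (invalid)
		fprintf(stderr, "unexpected gfp 0x%x, dropping 0x%x\n",
			flags, invalid);
	return flags & ~FAKE_BUG_MASK;
}

int main(void)
{
	printf("fixed: 0x%x\n", fix_flags(FAKE_GFP_KERNEL | FAKE_GFP_HIGHMEM));
	return 0;
}
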
 /*
  * To avoid unnecessary overhead, we pass through large allocation requests
  * directly to the page allocator. We use __GFP_COMP, because we will need to
@@ -1342,12 +829,15 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
        void *ret = NULL;
        struct page *page;
 
+       if (unlikely(flags & GFP_SLAB_BUG_MASK))
+               flags = kmalloc_fix_flags(flags);
+
        flags |= __GFP_COMP;
        page = alloc_pages(flags, order);
        if (likely(page)) {
                ret = page_address(page);
-               mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
-                                   1 << order);
+               mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+                                   PAGE_SIZE << order);
        }
        ret = kasan_kmalloc_large(ret, size, flags);
        /* As ret might get tagged, call kmemleak hook after KASAN. */
@@ -1444,12 +934,12 @@ static void print_slabinfo_header(struct seq_file *m)
 void *slab_start(struct seq_file *m, loff_t *pos)
 {
        mutex_lock(&slab_mutex);
-       return seq_list_start(&slab_root_caches, *pos);
+       return seq_list_start(&slab_caches, *pos);
 }
 
 void *slab_next(struct seq_file *m, void *p, loff_t *pos)
 {
-       return seq_list_next(p, &slab_root_caches, pos);
+       return seq_list_next(p, &slab_caches, pos);
 }
 
 void slab_stop(struct seq_file *m, void *p)
@@ -1457,27 +947,6 @@ void slab_stop(struct seq_file *m, void *p)
        mutex_unlock(&slab_mutex);
 }
 
-static void
-memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
-{
-       struct kmem_cache *c;
-       struct slabinfo sinfo;
-
-       if (!is_root_cache(s))
-               return;
-
-       for_each_memcg_cache(c, s) {
-               memset(&sinfo, 0, sizeof(sinfo));
-               get_slabinfo(c, &sinfo);
-
-               info->active_slabs += sinfo.active_slabs;
-               info->num_slabs += sinfo.num_slabs;
-               info->shared_avail += sinfo.shared_avail;
-               info->active_objs += sinfo.active_objs;
-               info->num_objs += sinfo.num_objs;
-       }
-}
-
 static void cache_show(struct kmem_cache *s, struct seq_file *m)
 {
        struct slabinfo sinfo;
@@ -1485,10 +954,8 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m)
        memset(&sinfo, 0, sizeof(sinfo));
        get_slabinfo(s, &sinfo);
 
-       memcg_accumulate_slabinfo(s, &sinfo);
-
        seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
-                  cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
+                  s->name, sinfo.active_objs, sinfo.num_objs, s->size,
                   sinfo.objects_per_slab, (1 << sinfo.cache_order));
 
        seq_printf(m, " : tunables %4u %4u %4u",
@@ -1501,9 +968,9 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m)
 
 static int slab_show(struct seq_file *m, void *p)
 {
-       struct kmem_cache *s = list_entry(p, struct kmem_cache, root_caches_node);
+       struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
 
-       if (p == slab_root_caches.next)
+       if (p == slab_caches.next)
                print_slabinfo_header(m);
        cache_show(s, m);
        return 0;
@@ -1530,13 +997,13 @@ void dump_unreclaimable_slab(void)
        pr_info("Name                      Used          Total\n");
 
        list_for_each_entry_safe(s, s2, &slab_caches, list) {
-               if (!is_root_cache(s) || (s->flags & SLAB_RECLAIM_ACCOUNT))
+               if (s->flags & SLAB_RECLAIM_ACCOUNT)
                        continue;
 
                get_slabinfo(s, &sinfo);
 
                if (sinfo.num_objs > 0)
-                       pr_info("%-17s %10luKB %10luKB\n", cache_name(s),
+                       pr_info("%-17s %10luKB %10luKB\n", s->name,
                                (sinfo.active_objs * s->size) / 1024,
                                (sinfo.num_objs * s->size) / 1024);
        }
@@ -1544,35 +1011,12 @@ void dump_unreclaimable_slab(void)
 }
 
 #if defined(CONFIG_MEMCG_KMEM)
-void *memcg_slab_start(struct seq_file *m, loff_t *pos)
-{
-       struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
-
-       mutex_lock(&slab_mutex);
-       return seq_list_start(&memcg->kmem_caches, *pos);
-}
-
-void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos)
-{
-       struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
-
-       return seq_list_next(p, &memcg->kmem_caches, pos);
-}
-
-void memcg_slab_stop(struct seq_file *m, void *p)
-{
-       mutex_unlock(&slab_mutex);
-}
-
 int memcg_slab_show(struct seq_file *m, void *p)
 {
-       struct kmem_cache *s = list_entry(p, struct kmem_cache,
-                                         memcg_params.kmem_caches_node);
-       struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
-
-       if (p == memcg->kmem_caches.next)
-               print_slabinfo_header(m);
-       cache_show(s, m);
+       /*
+        * Deprecated.
+        * Please, take a look at tools/cgroup/slabinfo.py .
+        */
        return 0;
 }
 #endif
@@ -1618,73 +1062,15 @@ static int __init slab_proc_init(void)
 }
 module_init(slab_proc_init);
 
-#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_MEMCG_KMEM)
-/*
- * Display information about kmem caches that have child memcg caches.
- */
-static int memcg_slabinfo_show(struct seq_file *m, void *unused)
-{
-       struct kmem_cache *s, *c;
-       struct slabinfo sinfo;
-
-       mutex_lock(&slab_mutex);
-       seq_puts(m, "# <name> <css_id[:dead|deact]> <active_objs> <num_objs>");
-       seq_puts(m, " <active_slabs> <num_slabs>\n");
-       list_for_each_entry(s, &slab_root_caches, root_caches_node) {
-               /*
-                * Skip kmem caches that don't have any memcg children.
-                */
-               if (list_empty(&s->memcg_params.children))
-                       continue;
-
-               memset(&sinfo, 0, sizeof(sinfo));
-               get_slabinfo(s, &sinfo);
-               seq_printf(m, "%-17s root       %6lu %6lu %6lu %6lu\n",
-                          cache_name(s), sinfo.active_objs, sinfo.num_objs,
-                          sinfo.active_slabs, sinfo.num_slabs);
-
-               for_each_memcg_cache(c, s) {
-                       struct cgroup_subsys_state *css;
-                       char *status = "";
-
-                       css = &c->memcg_params.memcg->css;
-                       if (!(css->flags & CSS_ONLINE))
-                               status = ":dead";
-                       else if (c->flags & SLAB_DEACTIVATED)
-                               status = ":deact";
-
-                       memset(&sinfo, 0, sizeof(sinfo));
-                       get_slabinfo(c, &sinfo);
-                       seq_printf(m, "%-17s %4d%-6s %6lu %6lu %6lu %6lu\n",
-                                  cache_name(c), css->id, status,
-                                  sinfo.active_objs, sinfo.num_objs,
-                                  sinfo.active_slabs, sinfo.num_slabs);
-               }
-       }
-       mutex_unlock(&slab_mutex);
-       return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(memcg_slabinfo);
-
-static int __init memcg_slabinfo_init(void)
-{
-       debugfs_create_file("memcg_slabinfo", S_IFREG | S_IRUGO,
-                           NULL, NULL, &memcg_slabinfo_fops);
-       return 0;
-}
-
-late_initcall(memcg_slabinfo_init);
-#endif /* CONFIG_DEBUG_FS && CONFIG_MEMCG_KMEM */
 #endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */
 
 static __always_inline void *__do_krealloc(const void *p, size_t new_size,
                                           gfp_t flags)
 {
        void *ret;
-       size_t ks = 0;
+       size_t ks;
 
-       if (p)
-               ks = ksize(p);
+       ks = ksize(p);
 
        if (ks >= new_size) {
                p = kasan_krealloc((void *)p, new_size, flags);
@@ -1729,28 +1115,27 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
 EXPORT_SYMBOL(krealloc);
 
 /**
- * kzfree - like kfree but zero memory
+ * kfree_sensitive - Clear sensitive information in memory before freeing
  * @p: object to free memory of
  *
  * The memory of the object @p points to is zeroed before freed.
- * If @p is %NULL, kzfree() does nothing.
+ * If @p is %NULL, kfree_sensitive() does nothing.
  *
  * Note: this function zeroes the whole allocated buffer which can be a good
  * deal bigger than the requested buffer size passed to kmalloc(). So be
  * careful when using this function in performance sensitive code.
  */
-void kzfree(const void *p)
+void kfree_sensitive(const void *p)
 {
        size_t ks;
        void *mem = (void *)p;
 
-       if (unlikely(ZERO_OR_NULL_PTR(mem)))
-               return;
        ks = ksize(mem);
-       memzero_explicit(mem, ks);
+       if (ks)
+               memzero_explicit(mem, ks);
        kfree(mem);
 }
-EXPORT_SYMBOL(kzfree);
+EXPORT_SYMBOL(kfree_sensitive);
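
kfree_sensitive() (the renamed kzfree()) derives the size to clear from ksize() and wipes with memzero_explicit(), which the compiler is not allowed to drop as a dead store. A userspace sketch of the same idea for a buffer of known length, where a volatile function pointer plays the role of memzero_explicit() (illustrative only):

#include <stdlib.h>
#include <string.h>

/* Calling through a volatile pointer keeps the wipe from being optimized out. */
static void *(*volatile secure_memset)(void *, int, size_t) = memset;

static void free_sensitive(void *p, size_t len)
{
	if (p) {
		secure_memset(p, 0, len);
		free(p);
	}
}

int main(void)
{
	char *secret = malloc(32);

	if (!secret)
		return 1;
	/* ... use secret for key material ... */
	free_sensitive(secret, 32);
	return 0;
}
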
 
 /**
  * ksize - get the actual amount of memory allocated for a given object
@@ -1770,8 +1155,6 @@ size_t ksize(const void *objp)
 {
        size_t size;
 
-       if (WARN_ON_ONCE(!objp))
-               return 0;
        /*
         * We need to check that the pointed to object is valid, and only then
         * unpoison the shadow memory below. We use __kasan_check_read(), to
@@ -1785,7 +1168,7 @@ size_t ksize(const void *objp)
         * We want to perform the check before __ksize(), to avoid potentially
         * crashing in __ksize() due to accessing invalid metadata.
         */
-       if (unlikely(objp == ZERO_SIZE_PTR) || !__kasan_check_read(objp, 1))
+       if (unlikely(ZERO_OR_NULL_PTR(objp)) || !__kasan_check_read(objp, 1))
                return 0;
 
        size = __ksize(objp);
index ac2aecf..7cc9805 100644 (file)
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -202,8 +202,8 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
        if (!page)
                return NULL;
 
-       mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
-                           1 << order);
+       mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+                           PAGE_SIZE << order);
        return page_address(page);
 }
 
@@ -214,8 +214,8 @@ static void slob_free_pages(void *b, int order)
        if (current->reclaim_state)
                current->reclaim_state->reclaimed_slab += 1 << order;
 
-       mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
-                           -(1 << order));
+       mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
+                           -(PAGE_SIZE << order));
        __free_pages(sp, order);
 }
 
@@ -552,8 +552,8 @@ void kfree(const void *block)
                slob_free(m, *m + align);
        } else {
                unsigned int order = compound_order(sp);
-               mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
-                                   -(1 << order));
+               mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
+                                   -(PAGE_SIZE << order));
                __free_pages(sp, order);
 
        }
index f226d66..68c02b2 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
  *                     the fast path and disables lockless freelists.
  */
 
-static inline int kmem_cache_debug(struct kmem_cache *s)
-{
 #ifdef CONFIG_SLUB_DEBUG
-       return unlikely(s->flags & SLAB_DEBUG_FLAGS);
+#ifdef CONFIG_SLUB_DEBUG_ON
+DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
 #else
-       return 0;
+DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
 #endif
+#endif
+
+static inline bool kmem_cache_debug(struct kmem_cache *s)
+{
+       return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
 }
 
 void *fixup_red_left(struct kmem_cache *s, void *p)
 {
-       if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
+       if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
                p += s->red_left_pad;
 
        return p;
@@ -214,14 +218,10 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
 #ifdef CONFIG_SYSFS
 static int sysfs_slab_add(struct kmem_cache *);
 static int sysfs_slab_alias(struct kmem_cache *, const char *);
-static void memcg_propagate_slab_attrs(struct kmem_cache *s);
-static void sysfs_slab_remove(struct kmem_cache *s);
 #else
 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
                                                        { return 0; }
-static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
-static inline void sysfs_slab_remove(struct kmem_cache *s) { }
 #endif
 
 static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -313,12 +313,6 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
                __p < (__addr) + (__objects) * (__s)->size; \
                __p += (__s)->size)
 
-/* Determine object index from a given position */
-static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
-{
-       return (kasan_reset_tag(p) - addr) / s->size;
-}
-
 static inline unsigned int order_objects(unsigned int order, unsigned int size)
 {
        return ((unsigned int)PAGE_SIZE << order) / size;
@@ -461,7 +455,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page)
        bitmap_zero(object_map, page->objects);
 
        for (p = page->freelist; p; p = get_freepointer(s, p))
-               set_bit(slab_index(p, s, addr), object_map);
+               set_bit(__obj_to_index(s, addr, p), object_map);
 
        return object_map;
 }
@@ -469,8 +463,6 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page)
 static void put_map(unsigned long *map) __releases(&object_map_lock)
 {
        VM_BUG_ON(map != object_map);
-       lockdep_assert_held(&object_map_lock);
-
        spin_unlock(&object_map_lock);
 }
 
@@ -499,7 +491,7 @@ static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
 static slab_flags_t slub_debug;
 #endif
 
-static char *slub_debug_slabs;
+static char *slub_debug_string;
 static int disable_higher_order_debug;
 
 /*
@@ -634,7 +626,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time)
 #endif
 }
 
-static void print_tracking(struct kmem_cache *s, void *object)
+void print_tracking(struct kmem_cache *s, void *object)
 {
        unsigned long pr_time = jiffies;
        if (!(s->flags & SLAB_STORE_USER))
@@ -1112,7 +1104,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
 static void setup_object_debug(struct kmem_cache *s, struct page *page,
                                                                void *object)
 {
-       if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
+       if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
                return;
 
        init_object(s, object, SLUB_RED_INACTIVE);
@@ -1122,7 +1114,7 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
 static
 void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
 {
-       if (!(s->flags & SLAB_POISON))
+       if (!kmem_cache_debug_flags(s, SLAB_POISON))
                return;
 
        metadata_access_enable();
@@ -1262,69 +1254,135 @@ out:
        return ret;
 }
 
-static int __init setup_slub_debug(char *str)
+/*
+ * Parse a block of slub_debug options. Blocks are delimited by ';'
+ *
+ * @str:    start of block
+ * @flags:  returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
+ * @slabs:  return start of list of slabs, or NULL when there's no list
+ * @init:   assume this is initial parsing and not per-kmem-create parsing
+ *
+ * returns the start of next block if there's any, or NULL
+ */
+static char *
+parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
 {
-       slub_debug = DEBUG_DEFAULT_FLAGS;
-       if (*str++ != '=' || !*str)
-               /*
-                * No options specified. Switch on full debugging.
-                */
-               goto out;
+       bool higher_order_disable = false;
 
-       if (*str == ',')
+       /* Skip any completely empty blocks */
+       while (*str && *str == ';')
+               str++;
+
+       if (*str == ',') {
                /*
                 * No options but restriction on slabs. This means full
                 * debugging for slabs matching a pattern.
                 */
+               *flags = DEBUG_DEFAULT_FLAGS;
                goto check_slabs;
+       }
+       *flags = 0;
 
-       slub_debug = 0;
-       if (*str == '-')
-               /*
-                * Switch off all debugging measures.
-                */
-               goto out;
-
-       /*
-        * Determine which debug features should be switched on
-        */
-       for (; *str && *str != ','; str++) {
+       /* Determine which debug features should be switched on */
+       for (; *str && *str != ',' && *str != ';'; str++) {
                switch (tolower(*str)) {
+               case '-':
+                       *flags = 0;
+                       break;
                case 'f':
-                       slub_debug |= SLAB_CONSISTENCY_CHECKS;
+                       *flags |= SLAB_CONSISTENCY_CHECKS;
                        break;
                case 'z':
-                       slub_debug |= SLAB_RED_ZONE;
+                       *flags |= SLAB_RED_ZONE;
                        break;
                case 'p':
-                       slub_debug |= SLAB_POISON;
+                       *flags |= SLAB_POISON;
                        break;
                case 'u':
-                       slub_debug |= SLAB_STORE_USER;
+                       *flags |= SLAB_STORE_USER;
                        break;
                case 't':
-                       slub_debug |= SLAB_TRACE;
+                       *flags |= SLAB_TRACE;
                        break;
                case 'a':
-                       slub_debug |= SLAB_FAILSLAB;
+                       *flags |= SLAB_FAILSLAB;
                        break;
                case 'o':
                        /*
                         * Avoid enabling debugging on caches if its minimum
                         * order would increase as a result.
                         */
-                       disable_higher_order_debug = 1;
+                       higher_order_disable = true;
                        break;
                default:
-                       pr_err("slub_debug option '%c' unknown. skipped\n",
-                              *str);
+                       if (init)
+                               pr_err("slub_debug option '%c' unknown. skipped\n", *str);
                }
        }
-
 check_slabs:
        if (*str == ',')
-               slub_debug_slabs = str + 1;
+               *slabs = ++str;
+       else
+               *slabs = NULL;
+
+       /* Skip over the slab list */
+       while (*str && *str != ';')
+               str++;
+
+       /* Skip any completely empty blocks */
+       while (*str && *str == ';')
+               str++;
+
+       if (init && higher_order_disable)
+               disable_higher_order_debug = 1;
+
+       if (*str)
+               return str;
+       else
+               return NULL;
+}
+
+static int __init setup_slub_debug(char *str)
+{
+       slab_flags_t flags;
+       char *saved_str;
+       char *slab_list;
+       bool global_slub_debug_changed = false;
+       bool slab_list_specified = false;
+
+       slub_debug = DEBUG_DEFAULT_FLAGS;
+       if (*str++ != '=' || !*str)
+               /*
+                * No options specified. Switch on full debugging.
+                */
+               goto out;
+
+       saved_str = str;
+       while (str) {
+               str = parse_slub_debug_flags(str, &flags, &slab_list, true);
+
+               if (!slab_list) {
+                       slub_debug = flags;
+                       global_slub_debug_changed = true;
+               } else {
+                       slab_list_specified = true;
+               }
+       }
+
+       /*
+        * For backwards compatibility, a single list of flags with a list of
+        * slabs means debugging is only enabled for those slabs, so the global
+        * slub_debug should be 0. We can extend that to multiple lists as
+        * long as there is no option specifying flags without a slab list.
+        */
+       if (slab_list_specified) {
+               if (!global_slub_debug_changed)
+                       slub_debug = 0;
+               slub_debug_string = saved_str;
+       }
 out:
+       if (slub_debug != 0 || slub_debug_string)
+               static_branch_enable(&slub_debug_enabled);
        if ((static_branch_unlikely(&init_on_alloc) ||
             static_branch_unlikely(&init_on_free)) &&
            (slub_debug & SLAB_POISON))
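
The new parse_slub_debug_flags()/setup_slub_debug() pair accepts several ';'-separated blocks, each carrying its own debug flags and an optional ','-separated list of slab names. A small userspace sketch (illustrative only, not the kernel parser) of how such a string decomposes:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* e.g. booting with: slub_debug=FZ;P,kmalloc-64,dentry;,zs_handle */
	char str[] = "FZ;P,kmalloc-64,dentry;,zs_handle";
	char *block;

	for (block = strtok(str, ";"); block; block = strtok(NULL, ";")) {
		char *slabs = strchr(block, ',');

		if (slabs)
			*slabs++ = '\0';
		printf("flags '%s' apply to %s\n",
		       *block ? block : "(default)",
		       slabs ? slabs : "all slabs (global slub_debug)");
	}
	return 0;
}

A block without a slab list sets the global slub_debug flags; blocks with a list only affect the named caches, which is why the backwards-compatibility comment above zeroes the global flags in that case.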
@@ -1352,36 +1410,47 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
 {
        char *iter;
        size_t len;
+       char *next_block;
+       slab_flags_t block_flags;
 
        /* If slub_debug = 0, it folds into the if conditional. */
-       if (!slub_debug_slabs)
+       if (!slub_debug_string)
                return flags | slub_debug;
 
        len = strlen(name);
-       iter = slub_debug_slabs;
-       while (*iter) {
-               char *end, *glob;
-               size_t cmplen;
-
-               end = strchrnul(iter, ',');
+       next_block = slub_debug_string;
+       /* Go through all blocks of debug options, see if any matches our slab's name */
+       while (next_block) {
+               next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
+               if (!iter)
+                       continue;
+               /* Found a block that has a slab list, search it */
+               while (*iter) {
+                       char *end, *glob;
+                       size_t cmplen;
+
+                       end = strchrnul(iter, ',');
+                       if (next_block && next_block < end)
+                               end = next_block - 1;
+
+                       glob = strnchr(iter, end - iter, '*');
+                       if (glob)
+                               cmplen = glob - iter;
+                       else
+                               cmplen = max_t(size_t, len, (end - iter));
 
-               glob = strnchr(iter, end - iter, '*');
-               if (glob)
-                       cmplen = glob - iter;
-               else
-                       cmplen = max_t(size_t, len, (end - iter));
+                       if (!strncmp(name, iter, cmplen)) {
+                               flags |= block_flags;
+                               return flags;
+                       }
 
-               if (!strncmp(name, iter, cmplen)) {
-                       flags |= slub_debug;
-                       break;
+                       if (!*end || *end == ';')
+                               break;
+                       iter = end + 1;
                }
-
-               if (!*end)
-                       break;
-               iter = end + 1;
        }
 
-       return flags;
+       return slub_debug;
 }
 #else /* !CONFIG_SLUB_DEBUG */
 static inline void setup_object_debug(struct kmem_cache *s,
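
Within each block's slab list, kmem_cache_flags() above compares the cache name against every entry, treating a trailing '*' as a prefix wildcard. A standalone sketch of that comparison (the helper and names are made up for illustration):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool slab_name_matches(const char *pattern, const char *name)
{
	const char *glob = strchr(pattern, '*');
	size_t cmplen;

	if (glob)
		cmplen = glob - pattern;	/* compare only the prefix */
	else
		cmplen = strlen(name) > strlen(pattern) ?
			 strlen(name) : strlen(pattern);	/* require an exact match */

	return strncmp(name, pattern, cmplen) == 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       slab_name_matches("kmalloc-*", "kmalloc-64"),	/* 1 */
	       slab_name_matches("dentry", "dentry"),		/* 1 */
	       slab_name_matches("dentry", "dentry2"));		/* 0 */
	return 0;
}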
@@ -1470,6 +1539,11 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
        if (!(s->flags & SLAB_DEBUG_OBJECTS))
                debug_check_no_obj_freed(x, s->object_size);
 
+       /* Use KCSAN to help debug racy use-after-free. */
+       if (!(s->flags & SLAB_TYPESAFE_BY_RCU))
+               __kcsan_check_access(x, s->object_size,
+                                    KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
+
        /* KASAN might put x into memory quarantine, delaying its reuse */
        return kasan_slab_free(s, x, _RET_IP_);
 }
@@ -1546,10 +1620,8 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
        else
                page = __alloc_pages_node(node, flags, order);
 
-       if (page && charge_slab_page(page, flags, order, s)) {
-               __free_pages(page, order);
-               page = NULL;
-       }
+       if (page)
+               account_slab_page(page, order, s);
 
        return page;
 }
@@ -1745,13 +1817,8 @@ out:
 
 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 {
-       if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
-               gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
-               flags &= ~GFP_SLAB_BUG_MASK;
-               pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
-                               invalid_mask, &invalid_mask, flags, &flags);
-               dump_stack();
-       }
+       if (unlikely(flags & GFP_SLAB_BUG_MASK))
+               flags = kmalloc_fix_flags(flags);
 
        return allocate_slab(s,
                flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
@@ -1762,7 +1829,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
        int order = compound_order(page);
        int pages = 1 << order;
 
-       if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+       if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
                void *p;
 
                slab_pad_check(s, page);
@@ -1777,7 +1844,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
        page->mapping = NULL;
        if (current->reclaim_state)
                current->reclaim_state->reclaimed_slab += pages;
-       uncharge_slab_page(page, order, s);
+       unaccount_slab_page(page, order, s);
        __free_pages(page, order);
 }
 
@@ -2744,8 +2811,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
        struct kmem_cache_cpu *c;
        struct page *page;
        unsigned long tid;
+       struct obj_cgroup *objcg = NULL;
 
-       s = slab_pre_alloc_hook(s, gfpflags);
+       s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
        if (!s)
                return NULL;
 redo:
@@ -2821,7 +2889,7 @@ redo:
        if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
                memset(object, 0, s->object_size);
 
-       slab_post_alloc_hook(s, gfpflags, 1, &object);
+       slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);
 
        return object;
 }
@@ -3026,6 +3094,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
        void *tail_obj = tail ? : head;
        struct kmem_cache_cpu *c;
        unsigned long tid;
+
+       memcg_slab_free_hook(s, page, head);
 redo:
        /*
         * Determine the per cpu slab of the currently executing cpu.
@@ -3205,9 +3275,10 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 {
        struct kmem_cache_cpu *c;
        int i;
+       struct obj_cgroup *objcg = NULL;
 
        /* memcg and kmem_cache debug support */
-       s = slab_pre_alloc_hook(s, flags);
+       s = slab_pre_alloc_hook(s, &objcg, size, flags);
        if (unlikely(!s))
                return false;
        /*
@@ -3261,11 +3332,11 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
        }
 
        /* memcg and kmem_cache debug support */
-       slab_post_alloc_hook(s, flags, size, p);
+       slab_post_alloc_hook(s, objcg, flags, size, p);
        return i;
 error:
        local_irq_enable();
-       slab_post_alloc_hook(s, flags, i, p);
+       slab_post_alloc_hook(s, objcg, flags, i, p);
        __kmem_cache_free_bulk(s, i, p);
        return 0;
 }
@@ -3675,6 +3746,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
         */
        size = ALIGN(size, s->align);
        s->size = size;
+       s->reciprocal_size = reciprocal_value(size);
        if (forced_order >= 0)
                order = forced_order;
        else
@@ -3779,7 +3851,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
        map = get_map(s, page);
        for_each_object(p, s, addr, page->objects) {
 
-               if (!test_bit(slab_index(p, s, addr), map)) {
+               if (!test_bit(__obj_to_index(s, addr, p), map)) {
                        pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
                        print_tracking(s, p);
                }
@@ -3842,7 +3914,6 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
                if (n->nr_partial || slabs_node(s, node))
                        return 1;
        }
-       sysfs_slab_remove(s);
        return 0;
 }
 
@@ -3912,8 +3983,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
        page = alloc_pages_node(node, flags, order);
        if (page) {
                ptr = page_address(page);
-               mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
-                                   1 << order);
+               mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+                                   PAGE_SIZE << order);
        }
 
        return kmalloc_large_node_hook(ptr, size, flags);
@@ -3980,7 +4051,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
        offset = (ptr - page_address(page)) % s->size;
 
        /* Adjust for redzone and reject if within the redzone. */
-       if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) {
+       if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
                if (offset < s->red_left_pad)
                        usercopy_abort("SLUB object in left red zone",
                                       s->name, to_user, offset, n);
@@ -4044,8 +4115,8 @@ void kfree(const void *x)
 
                BUG_ON(!PageCompound(page));
                kfree_hook(object);
-               mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
-                                   -(1 << order));
+               mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+                                   -(PAGE_SIZE << order));
                __free_pages(page, order);
                return;
        }
@@ -4126,36 +4197,6 @@ int __kmem_cache_shrink(struct kmem_cache *s)
        return ret;
 }
 
-#ifdef CONFIG_MEMCG
-void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
-{
-       /*
-        * Called with all the locks held after a sched RCU grace period.
-        * Even if @s becomes empty after shrinking, we can't know that @s
-        * doesn't have allocations already in-flight and thus can't
-        * destroy @s until the associated memcg is released.
-        *
-        * However, let's remove the sysfs files for empty caches here.
-        * Each cache has a lot of interface files which aren't
-        * particularly useful for empty draining caches; otherwise, we can
-        * easily end up with millions of unnecessary sysfs files on
-        * systems which have a lot of memory and transient cgroups.
-        */
-       if (!__kmem_cache_shrink(s))
-               sysfs_slab_remove(s);
-}
-
-void __kmemcg_cache_deactivate(struct kmem_cache *s)
-{
-       /*
-        * Disable empty slabs caching. Used to avoid pinning offline
-        * memory cgroups by kmem pages that can be freed.
-        */
-       slub_set_cpu_partial(s, 0);
-       s->min_partial = 0;
-}
-#endif /* CONFIG_MEMCG */
-
 static int slab_mem_going_offline_callback(void *arg)
 {
        struct kmem_cache *s;
@@ -4310,9 +4351,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
                        p->slab_cache = s;
 #endif
        }
-       slab_init_memcg_params(s);
        list_add(&s->list, &slab_caches);
-       memcg_link_cache(s, NULL);
        return s;
 }
 
@@ -4367,7 +4406,7 @@ struct kmem_cache *
 __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
                   slab_flags_t flags, void (*ctor)(void *))
 {
-       struct kmem_cache *s, *c;
+       struct kmem_cache *s;
 
        s = find_mergeable(size, align, flags, name, ctor);
        if (s) {
@@ -4380,11 +4419,6 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
                s->object_size = max(s->object_size, size);
                s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
 
-               for_each_memcg_cache(c, s) {
-                       c->object_size = s->object_size;
-                       c->inuse = max(c->inuse, ALIGN(size, sizeof(void *)));
-               }
-
                if (sysfs_slab_alias(s, name)) {
                        s->refcount--;
                        s = NULL;
@@ -4406,7 +4440,6 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
        if (slab_state <= UP)
                return 0;
 
-       memcg_propagate_slab_attrs(s);
        err = sysfs_slab_add(s);
        if (err)
                __kmem_cache_release(s);
@@ -4495,7 +4528,7 @@ static void validate_slab(struct kmem_cache *s, struct page *page)
        /* Now we know that a valid freelist exists */
        map = get_map(s, page);
        for_each_object(p, s, addr, page->objects) {
-               u8 val = test_bit(slab_index(p, s, addr), map) ?
+               u8 val = test_bit(__obj_to_index(s, addr, p), map) ?
                         SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
 
                if (!check_object(s, page, p, val))
@@ -4686,7 +4719,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
 
        map = get_map(s, page);
        for_each_object(p, s, addr, page->objects)
-               if (!test_bit(slab_index(p, s, addr), map))
+               if (!test_bit(__obj_to_index(s, addr, p), map))
                        add_location(t, s, get_track(s, p, alloc));
        put_map(map);
 }
@@ -4970,20 +5003,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
        return x + sprintf(buf + x, "\n");
 }
 
-#ifdef CONFIG_SLUB_DEBUG
-static int any_slab_objects(struct kmem_cache *s)
-{
-       int node;
-       struct kmem_cache_node *n;
-
-       for_each_kmem_cache_node(s, node, n)
-               if (atomic_long_read(&n->total_objects))
-                       return 1;
-
-       return 0;
-}
-#endif
-
 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
 #define to_slab(n) container_of(n, struct kmem_cache, kobj)
 
@@ -5025,28 +5044,11 @@ static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
 }
 SLAB_ATTR_RO(objs_per_slab);
 
-static ssize_t order_store(struct kmem_cache *s,
-                               const char *buf, size_t length)
-{
-       unsigned int order;
-       int err;
-
-       err = kstrtouint(buf, 10, &order);
-       if (err)
-               return err;
-
-       if (order > slub_max_order || order < slub_min_order)
-               return -EINVAL;
-
-       calculate_sizes(s, order);
-       return length;
-}
-
 static ssize_t order_show(struct kmem_cache *s, char *buf)
 {
        return sprintf(buf, "%u\n", oo_order(s->oo));
 }
-SLAB_ATTR(order);
+SLAB_ATTR_RO(order);
 
 static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
 {
@@ -5168,16 +5170,7 @@ static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
 {
        return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
 }
-
-static ssize_t reclaim_account_store(struct kmem_cache *s,
-                               const char *buf, size_t length)
-{
-       s->flags &= ~SLAB_RECLAIM_ACCOUNT;
-       if (buf[0] == '1')
-               s->flags |= SLAB_RECLAIM_ACCOUNT;
-       return length;
-}
-SLAB_ATTR(reclaim_account);
+SLAB_ATTR_RO(reclaim_account);
 
 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
 {
@@ -5222,104 +5215,34 @@ static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
 {
        return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
 }
-
-static ssize_t sanity_checks_store(struct kmem_cache *s,
-                               const char *buf, size_t length)
-{
-       s->flags &= ~SLAB_CONSISTENCY_CHECKS;
-       if (buf[0] == '1') {
-               s->flags &= ~__CMPXCHG_DOUBLE;
-               s->flags |= SLAB_CONSISTENCY_CHECKS;
-       }
-       return length;
-}
-SLAB_ATTR(sanity_checks);
+SLAB_ATTR_RO(sanity_checks);
 
 static ssize_t trace_show(struct kmem_cache *s, char *buf)
 {
        return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
 }
-
-static ssize_t trace_store(struct kmem_cache *s, const char *buf,
-                                                       size_t length)
-{
-       /*
-        * Tracing a merged cache is going to give confusing results
-        * as well as cause other issues like converting a mergeable
-        * cache into an umergeable one.
-        */
-       if (s->refcount > 1)
-               return -EINVAL;
-
-       s->flags &= ~SLAB_TRACE;
-       if (buf[0] == '1') {
-               s->flags &= ~__CMPXCHG_DOUBLE;
-               s->flags |= SLAB_TRACE;
-       }
-       return length;
-}
-SLAB_ATTR(trace);
+SLAB_ATTR_RO(trace);
 
 static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
 {
        return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
 }
 
-static ssize_t red_zone_store(struct kmem_cache *s,
-                               const char *buf, size_t length)
-{
-       if (any_slab_objects(s))
-               return -EBUSY;
-
-       s->flags &= ~SLAB_RED_ZONE;
-       if (buf[0] == '1') {
-               s->flags |= SLAB_RED_ZONE;
-       }
-       calculate_sizes(s, -1);
-       return length;
-}
-SLAB_ATTR(red_zone);
+SLAB_ATTR_RO(red_zone);
 
 static ssize_t poison_show(struct kmem_cache *s, char *buf)
 {
        return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
 }
 
-static ssize_t poison_store(struct kmem_cache *s,
-                               const char *buf, size_t length)
-{
-       if (any_slab_objects(s))
-               return -EBUSY;
-
-       s->flags &= ~SLAB_POISON;
-       if (buf[0] == '1') {
-               s->flags |= SLAB_POISON;
-       }
-       calculate_sizes(s, -1);
-       return length;
-}
-SLAB_ATTR(poison);
+SLAB_ATTR_RO(poison);
 
 static ssize_t store_user_show(struct kmem_cache *s, char *buf)
 {
        return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
 }
 
-static ssize_t store_user_store(struct kmem_cache *s,
-                               const char *buf, size_t length)
-{
-       if (any_slab_objects(s))
-               return -EBUSY;
-
-       s->flags &= ~SLAB_STORE_USER;
-       if (buf[0] == '1') {
-               s->flags &= ~__CMPXCHG_DOUBLE;
-               s->flags |= SLAB_STORE_USER;
-       }
-       calculate_sizes(s, -1);
-       return length;
-}
-SLAB_ATTR(store_user);
+SLAB_ATTR_RO(store_user);
 
 static ssize_t validate_show(struct kmem_cache *s, char *buf)
 {
@@ -5362,19 +5285,7 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf)
 {
        return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
 }
-
-static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
-                                                       size_t length)
-{
-       if (s->refcount > 1)
-               return -EINVAL;
-
-       s->flags &= ~SLAB_FAILSLAB;
-       if (buf[0] == '1')
-               s->flags |= SLAB_FAILSLAB;
-       return length;
-}
-SLAB_ATTR(failslab);
+SLAB_ATTR_RO(failslab);
 #endif
 
 static ssize_t shrink_show(struct kmem_cache *s, char *buf)
@@ -5386,7 +5297,7 @@ static ssize_t shrink_store(struct kmem_cache *s,
                        const char *buf, size_t length)
 {
        if (buf[0] == '1')
-               kmem_cache_shrink_all(s);
+               kmem_cache_shrink(s);
        else
                return -EINVAL;
        return length;
@@ -5610,98 +5521,9 @@ static ssize_t slab_attr_store(struct kobject *kobj,
                return -EIO;
 
        err = attribute->store(s, buf, len);
-#ifdef CONFIG_MEMCG
-       if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
-               struct kmem_cache *c;
-
-               mutex_lock(&slab_mutex);
-               if (s->max_attr_size < len)
-                       s->max_attr_size = len;
-
-               /*
-                * This is a best effort propagation, so this function's return
-                * value will be determined by the parent cache only. This is
-                * basically because not all attributes will have a well
-                * defined semantics for rollbacks - most of the actions will
-                * have permanent effects.
-                *
-                * Returning the error value of any of the children that fail
-                * is not 100 % defined, in the sense that users seeing the
-                * error code won't be able to know anything about the state of
-                * the cache.
-                *
-                * Only returning the error code for the parent cache at least
-                * has well defined semantics. The cache being written to
-                * directly either failed or succeeded, in which case we loop
-                * through the descendants with best-effort propagation.
-                */
-               for_each_memcg_cache(c, s)
-                       attribute->store(c, buf, len);
-               mutex_unlock(&slab_mutex);
-       }
-#endif
        return err;
 }
 
-static void memcg_propagate_slab_attrs(struct kmem_cache *s)
-{
-#ifdef CONFIG_MEMCG
-       int i;
-       char *buffer = NULL;
-       struct kmem_cache *root_cache;
-
-       if (is_root_cache(s))
-               return;
-
-       root_cache = s->memcg_params.root_cache;
-
-       /*
-        * This mean this cache had no attribute written. Therefore, no point
-        * in copying default values around
-        */
-       if (!root_cache->max_attr_size)
-               return;
-
-       for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
-               char mbuf[64];
-               char *buf;
-               struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
-               ssize_t len;
-
-               if (!attr || !attr->store || !attr->show)
-                       continue;
-
-               /*
-                * It is really bad that we have to allocate here, so we will
-                * do it only as a fallback. If we actually allocate, though,
-                * we can just use the allocated buffer until the end.
-                *
-                * Most of the slub attributes will tend to be very small in
-                * size, but sysfs allows buffers up to a page, so they can
-                * theoretically happen.
-                */
-               if (buffer)
-                       buf = buffer;
-               else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf) &&
-                        !IS_ENABLED(CONFIG_SLUB_STATS))
-                       buf = mbuf;
-               else {
-                       buffer = (char *) get_zeroed_page(GFP_KERNEL);
-                       if (WARN_ON(!buffer))
-                               continue;
-                       buf = buffer;
-               }
-
-               len = attr->show(root_cache, buf);
-               if (len > 0)
-                       attr->store(s, buf, len);
-       }
-
-       if (buffer)
-               free_page((unsigned long)buffer);
-#endif /* CONFIG_MEMCG */
-}
-
 static void kmem_cache_release(struct kobject *k)
 {
        slab_kmem_cache_release(to_slab(k));
@@ -5721,10 +5543,6 @@ static struct kset *slab_kset;
 
 static inline struct kset *cache_kset(struct kmem_cache *s)
 {
-#ifdef CONFIG_MEMCG
-       if (!is_root_cache(s))
-               return s->memcg_params.root_cache->memcg_kset;
-#endif
        return slab_kset;
 }
 
@@ -5767,27 +5585,6 @@ static char *create_unique_id(struct kmem_cache *s)
        return name;
 }
 
-static void sysfs_slab_remove_workfn(struct work_struct *work)
-{
-       struct kmem_cache *s =
-               container_of(work, struct kmem_cache, kobj_remove_work);
-
-       if (!s->kobj.state_in_sysfs)
-               /*
-                * For a memcg cache, this may be called during
-                * deactivation and again on shutdown.  Remove only once.
-                * A cache is never shut down before deactivation is
-                * complete, so no need to worry about synchronization.
-                */
-               goto out;
-
-#ifdef CONFIG_MEMCG
-       kset_unregister(s->memcg_kset);
-#endif
-out:
-       kobject_put(&s->kobj);
-}
-
 static int sysfs_slab_add(struct kmem_cache *s)
 {
        int err;
@@ -5795,8 +5592,6 @@ static int sysfs_slab_add(struct kmem_cache *s)
        struct kset *kset = cache_kset(s);
        int unmergeable = slab_unmergeable(s);
 
-       INIT_WORK(&s->kobj_remove_work, sysfs_slab_remove_workfn);
-
        if (!kset) {
                kobject_init(&s->kobj, &slab_ktype);
                return 0;
@@ -5833,16 +5628,6 @@ static int sysfs_slab_add(struct kmem_cache *s)
        if (err)
                goto out_del_kobj;
 
-#ifdef CONFIG_MEMCG
-       if (is_root_cache(s) && memcg_sysfs_enabled) {
-               s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
-               if (!s->memcg_kset) {
-                       err = -ENOMEM;
-                       goto out_del_kobj;
-               }
-       }
-#endif
-
        if (!unmergeable) {
                /* Setup first alias */
                sysfs_slab_alias(s, s->name);
@@ -5856,19 +5641,6 @@ out_del_kobj:
        goto out;
 }
 
-static void sysfs_slab_remove(struct kmem_cache *s)
-{
-       if (slab_state < FULL)
-               /*
-                * Sysfs has not been setup yet so no need to remove the
-                * cache from sysfs.
-                */
-               return;
-
-       kobject_get(&s->kobj);
-       schedule_work(&s->kobj_remove_work);
-}
-
 void sysfs_slab_unlink(struct kmem_cache *s)
 {
        if (slab_state >= FULL)
index 0db7738..16183d8 100644 (file)
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -69,11 +69,19 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
                                __pa(MAX_DMA_ADDRESS));
 }
 
+static void * __meminit altmap_alloc_block_buf(unsigned long size,
+                                              struct vmem_altmap *altmap);
+
 /* need to make sure size is all the same during early stage */
-void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
+void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
+                                        struct vmem_altmap *altmap)
 {
-       void *ptr = sparse_buffer_alloc(size);
+       void *ptr;
+
+       if (altmap)
+               return altmap_alloc_block_buf(size, altmap);
 
+       ptr = sparse_buffer_alloc(size);
        if (!ptr)
                ptr = vmemmap_alloc_block(size, node);
        return ptr;
@@ -94,15 +102,8 @@ static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
        return 0;
 }
 
-/**
- * altmap_alloc_block_buf - allocate pages from the device page map
- * @altmap:    device page map
- * @size:      size (in bytes) of the allocation
- *
- * Allocations are aligned to the size of the request.
- */
-void * __meminit altmap_alloc_block_buf(unsigned long size,
-               struct vmem_altmap *altmap)
+static void * __meminit altmap_alloc_block_buf(unsigned long size,
+                                              struct vmem_altmap *altmap)
 {
        unsigned long pfn, nr_pfns, nr_align;
 
@@ -139,12 +140,15 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
                        start, end - 1);
 }
 
-pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
+pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
+                                      struct vmem_altmap *altmap)
 {
        pte_t *pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte)) {
                pte_t entry;
-               void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
+               void *p;
+
+               p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
                if (!p)
                        return NULL;
                entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
@@ -212,8 +216,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
        return pgd;
 }
 
-int __meminit vmemmap_populate_basepages(unsigned long start,
-                                        unsigned long end, int node)
+int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
+                                        int node, struct vmem_altmap *altmap)
 {
        unsigned long addr = start;
        pgd_t *pgd;
@@ -235,7 +239,7 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
                pmd = vmemmap_pmd_populate(pud, addr, node);
                if (!pmd)
                        return -ENOMEM;
-               pte = vmemmap_pte_populate(pmd, addr, node);
+               pte = vmemmap_pte_populate(pmd, addr, node, altmap);
                if (!pte)
                        return -ENOMEM;
                vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
@@ -247,20 +251,12 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
 struct page * __meminit __populate_section_memmap(unsigned long pfn,
                unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 {
-       unsigned long start;
-       unsigned long end;
-
-       /*
-        * The minimum granularity of memmap extensions is
-        * PAGES_PER_SUBSECTION as allocations are tracked in the
-        * 'subsection_map' bitmap of the section.
-        */
-       end = ALIGN(pfn + nr_pages, PAGES_PER_SUBSECTION);
-       pfn &= PAGE_SUBSECTION_MASK;
-       nr_pages = end - pfn;
-
-       start = (unsigned long) pfn_to_page(pfn);
-       end = start + nr_pages * sizeof(struct page);
+       unsigned long start = (unsigned long) pfn_to_page(pfn);
+       unsigned long end = start + nr_pages * sizeof(struct page);
+
+       if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
+               !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
+               return NULL;
 
        if (vmemmap_populate(start, end, nid, altmap))
                return NULL;
index b2b9a3e..fcc3d17 100644 (file)
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -16,7 +16,6 @@
 
 #include "internal.h"
 #include <asm/dma.h>
-#include <asm/pgalloc.h>
 
 /*
  * Permanent SPARSEMEM data:
@@ -250,7 +249,7 @@ void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
 #endif
 
 /* Record a memory area against a node. */
-void __init memory_present(int nid, unsigned long start, unsigned long end)
+static void __init memory_present(int nid, unsigned long start, unsigned long end)
 {
        unsigned long pfn;
 
@@ -286,11 +285,11 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
 }
 
 /*
- * Mark all memblocks as present using memory_present(). This is a
- * convenience function that is useful for a number of arches
- * to mark all of the systems memory as present during initialization.
+ * Mark all memblocks as present using memory_present().
+ * This is a convenience function that is useful to mark all of the system's
+ * memory as present during initialization.
  */
-void __init memblocks_present(void)
+static void __init memblocks_present(void)
 {
        struct memblock_region *reg;
 
@@ -575,9 +574,13 @@ failed:
  */
 void __init sparse_init(void)
 {
-       unsigned long pnum_begin = first_present_section_nr();
-       int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
-       unsigned long pnum_end, map_count = 1;
+       unsigned long pnum_end, pnum_begin, map_count = 1;
+       int nid_begin;
+
+       memblocks_present();
+
+       pnum_begin = first_present_section_nr();
+       nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
 
        /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
        set_pageblock_order();
@@ -825,10 +828,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
                ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
        }
 
-       if (section_is_early && memmap)
-               free_map_bootmem(memmap);
-       else
+       /*
+        * The memmap of early sections is always fully populated. See
+        * section_activate() and pfn_valid().
+        */
+       if (!section_is_early)
                depopulate_section_memmap(pfn, nr_pages, altmap);
+       else if (memmap)
+               free_map_bootmem(memmap);
 
        if (empty)
                ms->section_mem_map = (unsigned long)NULL;
index 0975adc..3e64535 100644 (file)
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -46,8 +46,7 @@ static void __drain_swap_slots_cache(unsigned int type);
 static void deactivate_swap_slots_cache(void);
 static void reactivate_swap_slots_cache(void);
 
-#define use_swap_slot_cache (swap_slot_cache_active && \
-               swap_slot_cache_enabled && swap_slot_cache_initialized)
+#define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled)
 #define SLOTS_CACHE 0x1
 #define SLOTS_CACHE_RET 0x2
 
@@ -94,7 +93,7 @@ static bool check_cache_active(void)
 {
        long pages;
 
-       if (!swap_slot_cache_enabled || !swap_slot_cache_initialized)
+       if (!swap_slot_cache_enabled)
                return false;
 
        pages = get_nr_swap_pages();
@@ -136,9 +135,16 @@ static int alloc_swap_slot_cache(unsigned int cpu)
 
        mutex_lock(&swap_slots_cache_mutex);
        cache = &per_cpu(swp_slots, cpu);
-       if (cache->slots || cache->slots_ret)
+       if (cache->slots || cache->slots_ret) {
                /* cache already allocated */
-               goto out;
+               mutex_unlock(&swap_slots_cache_mutex);
+
+               kvfree(slots);
+               kvfree(slots_ret);
+
+               return 0;
+       }
+
        if (!cache->lock_initialized) {
                mutex_init(&cache->alloc_lock);
                spin_lock_init(&cache->free_lock);
@@ -155,15 +161,8 @@ static int alloc_swap_slot_cache(unsigned int cpu)
         */
        mb();
        cache->slots = slots;
-       slots = NULL;
        cache->slots_ret = slots_ret;
-       slots_ret = NULL;
-out:
        mutex_unlock(&swap_slots_cache_mutex);
-       if (slots)
-               kvfree(slots);
-       if (slots_ret)
-               kvfree(slots_ret);
        return 0;
 }
 
@@ -240,21 +239,19 @@ static int free_slot_cache(unsigned int cpu)
 
 int enable_swap_slots_cache(void)
 {
-       int ret = 0;
-
        mutex_lock(&swap_slots_cache_enable_mutex);
-       if (swap_slot_cache_initialized) {
-               __reenable_swap_slots_cache();
-               goto out_unlock;
-       }
+       if (!swap_slot_cache_initialized) {
+               int ret;
 
-       ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
-                               alloc_swap_slot_cache, free_slot_cache);
-       if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
-                              "without swap slots cache.\n", __func__))
-               goto out_unlock;
+               ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
+                                       alloc_swap_slot_cache, free_slot_cache);
+               if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
+                                      "without swap slots cache.\n", __func__))
+                       goto out_unlock;
+
+               swap_slot_cache_initialized = true;
+       }
 
-       swap_slot_cache_initialized = true;
        __reenable_swap_slots_cache();
 out_unlock:
        mutex_unlock(&swap_slots_cache_enable_mutex);
index 05889e8..e82f4f8 100644 (file)
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -725,7 +725,7 @@ static void swap_ra_info(struct vm_fault *vmf,
 
 /**
  * swap_vma_readahead - swap in pages in hope we need them soon
- * @entry: swap entry of this memory
+ * @fentry: swap entry of this memory
  * @gfp_mask: memory allocation flags
  * @vmf: fault information
  *
index c63c8e4..5ef378a 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -503,8 +503,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
        if (!ret) {
                if (mmap_write_lock_killable(mm))
                        return -EINTR;
-               ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
-                                   &populate, &uf);
+               ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
+                             &uf);
                mmap_write_unlock(mm);
                userfaultfd_unmap_complete(mm, &uf);
                if (populate)
@@ -746,6 +746,47 @@ int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
        return ret;
 }
 
+static void sync_overcommit_as(struct work_struct *dummy)
+{
+       percpu_counter_sync(&vm_committed_as);
+}
+
+int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
+               size_t *lenp, loff_t *ppos)
+{
+       struct ctl_table t;
+       int new_policy;
+       int ret;
+
+       /*
+        * The deviation of sync_overcommit_as could be big with a loose policy
+        * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
+        * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
+        * with the strict "NEVER", and to avoid a possible race condition (even
+        * though users usually won't switch the policy to OVERCOMMIT_NEVER
+        * very frequently), the switch is done in the following order:
+        *      1. changing the batch
+        *      2. sync percpu count on each CPU
+        *      3. switch the policy
+        */
+       if (write) {
+               t = *table;
+               t.data = &new_policy;
+               ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+               if (ret)
+                       return ret;
+
+               mm_compute_batch(new_policy);
+               if (new_policy == OVERCOMMIT_NEVER)
+                       schedule_on_each_cpu(sync_overcommit_as);
+               sysctl_overcommit_memory = new_policy;
+       } else {
+               ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+       }
+
+       return ret;
+}
+
 int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
 {
@@ -787,10 +828,15 @@ struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
  * balancing memory across competing virtual machines that are hosted.
  * Several metrics drive this policy engine including the guest reported
  * memory commitment.
+ *
+ * The time cost of this is very low for small platforms. On a big
+ * platform like a 2S/36C/72T Skylake server, in the worst case where
+ * vm_committed_as's spinlock is under severe contention, the time cost
+ * could be about 30~40 microseconds.
  */
 unsigned long vm_memory_committed(void)
 {
-       return percpu_counter_read_positive(&vm_committed_as);
+       return percpu_counter_sum_positive(&vm_committed_as);
 }
 EXPORT_SYMBOL_GPL(vm_memory_committed);
 
index 5a2b55c..b482d24 100644 (file)
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -7,6 +7,7 @@
  *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
  *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
  *  Numa awareness, Christoph Lameter, SGI, June 2005
+ *  Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
  */
 
 #include <linux/vmalloc.h>
@@ -25,7 +26,7 @@
 #include <linux/list.h>
 #include <linux/notifier.h>
 #include <linux/rbtree.h>
-#include <linux/radix-tree.h>
+#include <linux/xarray.h>
 #include <linux/rcupdate.h>
 #include <linux/pfn.h>
 #include <linux/kmemleak.h>
@@ -41,6 +42,7 @@
 #include <asm/shmparam.h>
 
 #include "internal.h"
+#include "pgalloc-track.h"
 
 bool is_vmalloc_addr(const void *x)
 {
@@ -173,7 +175,6 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
        pgtbl_mod_mask mask = 0;
 
        BUG_ON(addr >= end);
-       start = addr;
        pgd = pgd_offset_k(addr);
        do {
                next = pgd_addr_end(addr, end);
@@ -511,6 +512,10 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
 /*
  * This function returns back addresses of parent node
  * and its left or right link for further processing.
+ *
+ * Otherwise NULL is returned. In that case all further
+ * steps of inserting the conflicting, overlapping range
+ * have to be declined and treated as a bug.
  */
 static __always_inline struct rb_node **
 find_va_links(struct vmap_area *va,
@@ -549,8 +554,12 @@ find_va_links(struct vmap_area *va,
                else if (va->va_end > tmp_va->va_start &&
                                va->va_start >= tmp_va->va_end)
                        link = &(*link)->rb_right;
-               else
-                       BUG();
+               else {
+                       WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
+                               va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
+
+                       return NULL;
+               }
        } while (*link);
 
        *parent = &tmp_va->rb_node;
@@ -632,43 +641,17 @@ unlink_va(struct vmap_area *va, struct rb_root *root)
 
 #if DEBUG_AUGMENT_PROPAGATE_CHECK
 static void
-augment_tree_propagate_check(struct rb_node *n)
+augment_tree_propagate_check(void)
 {
        struct vmap_area *va;
-       struct rb_node *node;
-       unsigned long size;
-       bool found = false;
-
-       if (n == NULL)
-               return;
+       unsigned long computed_size;
 
-       va = rb_entry(n, struct vmap_area, rb_node);
-       size = va->subtree_max_size;
-       node = n;
-
-       while (node) {
-               va = rb_entry(node, struct vmap_area, rb_node);
-
-               if (get_subtree_max_size(node->rb_left) == size) {
-                       node = node->rb_left;
-               } else {
-                       if (va_size(va) == size) {
-                               found = true;
-                               break;
-                       }
-
-                       node = node->rb_right;
-               }
-       }
-
-       if (!found) {
-               va = rb_entry(n, struct vmap_area, rb_node);
-               pr_emerg("tree is corrupted: %lu, %lu\n",
-                       va_size(va), va->subtree_max_size);
+       list_for_each_entry(va, &free_vmap_area_list, list) {
+               computed_size = compute_subtree_max_size(va);
+               if (computed_size != va->subtree_max_size)
+                       pr_emerg("tree is corrupted: %lu, %lu\n",
+                               va_size(va), va->subtree_max_size);
        }
-
-       augment_tree_propagate_check(n->rb_left);
-       augment_tree_propagate_check(n->rb_right);
 }
 #endif
 
@@ -702,28 +685,15 @@ augment_tree_propagate_check(struct rb_node *n)
 static __always_inline void
 augment_tree_propagate_from(struct vmap_area *va)
 {
-       struct rb_node *node = &va->rb_node;
-       unsigned long new_va_sub_max_size;
-
-       while (node) {
-               va = rb_entry(node, struct vmap_area, rb_node);
-               new_va_sub_max_size = compute_subtree_max_size(va);
-
-               /*
-                * If the newly calculated maximum available size of the
-                * subtree is equal to the current one, then it means that
-                * the tree is propagated correctly. So we have to stop at
-                * this point to save cycles.
-                */
-               if (va->subtree_max_size == new_va_sub_max_size)
-                       break;
-
-               va->subtree_max_size = new_va_sub_max_size;
-               node = rb_parent(&va->rb_node);
-       }
+       /*
+        * Populate the tree from bottom towards the root until
+        * the calculated maximum available size of the checked node
+        * is equal to its current one.
+        */
+       free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
 
 #if DEBUG_AUGMENT_PROPAGATE_CHECK
-       augment_tree_propagate_check(free_vmap_area_root.rb_node);
+       augment_tree_propagate_check();
 #endif
 }
 
@@ -735,7 +705,8 @@ insert_vmap_area(struct vmap_area *va,
        struct rb_node *parent;
 
        link = find_va_links(va, root, NULL, &parent);
-       link_va(va, root, parent, link, head);
+       if (link)
+               link_va(va, root, parent, link, head);
 }
 
 static void
@@ -751,8 +722,10 @@ insert_vmap_area_augment(struct vmap_area *va,
        else
                link = find_va_links(va, root, NULL, &parent);
 
-       link_va(va, root, parent, link, head);
-       augment_tree_propagate_from(va);
+       if (link) {
+               link_va(va, root, parent, link, head);
+               augment_tree_propagate_from(va);
+       }
 }
 
 /*
@@ -760,6 +733,11 @@ insert_vmap_area_augment(struct vmap_area *va,
  * and next free blocks. If coalesce is not done a new
  * free area is inserted. If VA has been merged, it is
  * freed.
+ *
+ * Please note that it can return NULL in case of overlapping
+ * ranges, following a WARN() report. Although this is buggy
+ * behaviour, the system stays alive and keeps going.
  */
 static __always_inline struct vmap_area *
 merge_or_add_vmap_area(struct vmap_area *va,
@@ -776,6 +754,8 @@ merge_or_add_vmap_area(struct vmap_area *va,
         * inserted, unless it is merged with its sibling/siblings.
         */
        link = find_va_links(va, root, NULL, &parent);
+       if (!link)
+               return NULL;
 
        /*
         * Get next node of VA to check if merging can be done.
@@ -796,9 +776,6 @@ merge_or_add_vmap_area(struct vmap_area *va,
                if (sibling->va_start == va->va_end) {
                        sibling->va_start = va->va_start;
 
-                       /* Check and update the tree if needed. */
-                       augment_tree_propagate_from(sibling);
-
                        /* Free vmap_area object. */
                        kmem_cache_free(vmap_area_cachep, va);
 
@@ -818,14 +795,18 @@ merge_or_add_vmap_area(struct vmap_area *va,
        if (next->prev != head) {
                sibling = list_entry(next->prev, struct vmap_area, list);
                if (sibling->va_end == va->va_start) {
-                       sibling->va_end = va->va_end;
-
-                       /* Check and update the tree if needed. */
-                       augment_tree_propagate_from(sibling);
-
+                       /*
+                        * If both neighbors are coalesced, it is important
+                        * to unlink the "next" node first, followed by merging
+                        * with "previous" one. Otherwise the tree might not be
+                        * fully populated if a sibling's augmented value is
+                        * "normalized" because of rotation operations.
+                        */
                        if (merged)
                                unlink_va(va, root);
 
+                       sibling->va_end = va->va_end;
+
                        /* Free vmap_area object. */
                        kmem_cache_free(vmap_area_cachep, va);
 
@@ -836,11 +817,13 @@ merge_or_add_vmap_area(struct vmap_area *va,
        }
 
 insert:
-       if (!merged) {
+       if (!merged)
                link_va(va, root, parent, link, head);
-               augment_tree_propagate_from(va);
-       }
 
+       /*
+        * Last step is to check and update the tree.
+        */
+       augment_tree_propagate_from(va);
        return va;
 }
 
@@ -1381,6 +1364,9 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
                va = merge_or_add_vmap_area(va, &free_vmap_area_root,
                                            &free_vmap_area_list);
 
+               if (!va)
+                       continue;
+
                if (is_vmalloc_or_module_addr((void *)orig_start))
                        kasan_release_vmalloc(orig_start, orig_end,
                                              va->va_start, va->va_end);
@@ -1513,12 +1499,11 @@ struct vmap_block {
 static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
 
 /*
- * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
+ * XArray of vmap blocks, indexed by address, to quickly find a vmap block
  * in the free path. Could get rid of this if we change the API to return a
  * "cookie" from alloc, to be passed to free. But no big deal yet.
  */
-static DEFINE_SPINLOCK(vmap_block_tree_lock);
-static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
+static DEFINE_XARRAY(vmap_blocks);
 
 /*
  * We should probably have a fallback mechanism to allocate virtual memory
@@ -1575,13 +1560,6 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
                return ERR_CAST(va);
        }
 
-       err = radix_tree_preload(gfp_mask);
-       if (unlikely(err)) {
-               kfree(vb);
-               free_vmap_area(va);
-               return ERR_PTR(err);
-       }
-
        vaddr = vmap_block_vaddr(va->va_start, 0);
        spin_lock_init(&vb->lock);
        vb->va = va;
@@ -1594,11 +1572,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
        INIT_LIST_HEAD(&vb->free_list);
 
        vb_idx = addr_to_vb_idx(va->va_start);
-       spin_lock(&vmap_block_tree_lock);
-       err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
-       spin_unlock(&vmap_block_tree_lock);
-       BUG_ON(err);
-       radix_tree_preload_end();
+       err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
+       if (err) {
+               kfree(vb);
+               free_vmap_area(va);
+               return ERR_PTR(err);
+       }
 
        vbq = &get_cpu_var(vmap_block_queue);
        spin_lock(&vbq->lock);
@@ -1612,12 +1591,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 static void free_vmap_block(struct vmap_block *vb)
 {
        struct vmap_block *tmp;
-       unsigned long vb_idx;
 
-       vb_idx = addr_to_vb_idx(vb->va->va_start);
-       spin_lock(&vmap_block_tree_lock);
-       tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
-       spin_unlock(&vmap_block_tree_lock);
+       tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
        BUG_ON(tmp != vb);
 
        free_vmap_area_noflush(vb->va);
@@ -1723,7 +1698,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
 static void vb_free(unsigned long addr, unsigned long size)
 {
        unsigned long offset;
-       unsigned long vb_idx;
        unsigned int order;
        struct vmap_block *vb;
 
@@ -1733,14 +1707,8 @@ static void vb_free(unsigned long addr, unsigned long size)
        flush_cache_vunmap(addr, addr + size);
 
        order = get_order(size);
-
        offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
-
-       vb_idx = addr_to_vb_idx(addr);
-       rcu_read_lock();
-       vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
-       rcu_read_unlock();
-       BUG_ON(!vb);
+       vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
 
        unmap_kernel_range_noflush(addr, size);
 
@@ -3383,8 +3351,9 @@ recovery:
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
                                            &free_vmap_area_list);
-               kasan_release_vmalloc(orig_start, orig_end,
-                                     va->va_start, va->va_end);
+               if (va)
+                       kasan_release_vmalloc(orig_start, orig_end,
+                               va->va_start, va->va_end);
                vas[area] = NULL;
        }
 
@@ -3432,8 +3401,9 @@ err_free_shadow:
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
                                            &free_vmap_area_list);
-               kasan_release_vmalloc(orig_start, orig_end,
-                                     va->va_start, va->va_end);
+               if (va)
+                       kasan_release_vmalloc(orig_start, orig_end,
+                               va->va_start, va->va_end);
                vas[area] = NULL;
                kfree(vms[area]);
        }
index 749d239..72da290 100644 (file)
@@ -170,11 +170,6 @@ struct scan_control {
  * From 0 .. 200.  Higher means more swappy.
  */
 int vm_swappiness = 60;
-/*
- * The total number of pages which are beyond the high watermark within all
- * zones.
- */
-unsigned long vm_total_pages;
 
 static void set_task_reclaim_state(struct task_struct *task,
                                   struct reclaim_state *rs)
@@ -915,7 +910,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                 * order to detect refaults, thus thrashing, later on.
                 *
                 * But don't store shadows in an address space that is
-                * already exiting.  This is not just an optizimation,
+                * already exiting.  This is not just an optimization,
                 * inode reclaim needs to empty out the radix tree or
                 * the nodes are lost.  Don't plant shadows behind its
                 * back.
@@ -2035,7 +2030,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
 
        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
 
-       __count_vm_events(PGREFILL, nr_scanned);
+       if (!cgroup_reclaim(sc))
+               __count_vm_events(PGREFILL, nr_scanned);
        __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
 
        spin_unlock_irq(&pgdat->lru_lock);
@@ -2331,7 +2327,8 @@ out:
                unsigned long protection;
 
                lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
-               protection = mem_cgroup_protection(memcg,
+               protection = mem_cgroup_protection(sc->target_mem_cgroup,
+                                                  memcg,
                                                   sc->memcg_low_reclaim);
 
                if (protection) {
@@ -2619,14 +2616,15 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
                unsigned long reclaimed;
                unsigned long scanned;
 
-               switch (mem_cgroup_protected(target_memcg, memcg)) {
-               case MEMCG_PROT_MIN:
+               mem_cgroup_calculate_protection(target_memcg, memcg);
+
+               if (mem_cgroup_below_min(memcg)) {
                        /*
                         * Hard protection.
                         * If there is no reclaimable memory, OOM.
                         */
                        continue;
-               case MEMCG_PROT_LOW:
+               } else if (mem_cgroup_below_low(memcg)) {
                        /*
                         * Soft protection.
                         * Respect the protection only as long as
@@ -2638,16 +2636,6 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
                                continue;
                        }
                        memcg_memory_event(memcg, MEMCG_LOW);
-                       break;
-               case MEMCG_PROT_NONE:
-                       /*
-                        * All protection thresholds breached. We may
-                        * still choose to vary the scan pressure
-                        * applied based on by how much the cgroup in
-                        * question has exceeded its protection
-                        * thresholds (see get_scan_count).
-                        */
-                       break;
                }
 
                reclaimed = sc->nr_reclaimed;
@@ -3318,7 +3306,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                           bool may_swap)
 {
        unsigned long nr_reclaimed;
-       unsigned long pflags;
        unsigned int noreclaim_flag;
        struct scan_control sc = {
                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
@@ -3339,17 +3326,12 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
        struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
 
        set_task_reclaim_state(current, &sc.reclaim_state);
-
        trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
-
-       psi_memstall_enter(&pflags);
        noreclaim_flag = memalloc_noreclaim_save();
 
        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
        memalloc_noreclaim_restore(noreclaim_flag);
-       psi_memstall_leave(&pflags);
-
        trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
        set_task_reclaim_state(current, NULL);
 
@@ -4222,7 +4204,8 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
         * unmapped file backed pages.
         */
        if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
-           node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
+           node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
+           pgdat->min_slab_pages)
                return NODE_RECLAIM_FULL;
 
        /*
index 3fb23a2..2b866cb 100644 (file)
@@ -341,6 +341,11 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
        long x;
        long t;
 
+       if (vmstat_item_in_bytes(item)) {
+               VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
+               delta >>= PAGE_SHIFT;
+       }
+
        x = delta + __this_cpu_read(*p);
 
        t = __this_cpu_read(pcp->stat_threshold);
@@ -398,6 +403,8 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
        s8 __percpu *p = pcp->vm_node_stat_diff + item;
        s8 v, t;
 
+       VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
        v = __this_cpu_inc_return(*p);
        t = __this_cpu_read(pcp->stat_threshold);
        if (unlikely(v > t)) {
@@ -442,6 +449,8 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
        s8 __percpu *p = pcp->vm_node_stat_diff + item;
        s8 v, t;
 
+       VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
        v = __this_cpu_dec_return(*p);
        t = __this_cpu_read(pcp->stat_threshold);
        if (unlikely(v < - t)) {
@@ -541,6 +550,11 @@ static inline void mod_node_state(struct pglist_data *pgdat,
        s8 __percpu *p = pcp->vm_node_stat_diff + item;
        long o, n, t, z;
 
+       if (vmstat_item_in_bytes(item)) {
+               VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
+               delta >>= PAGE_SHIFT;
+       }
+
        do {
                z = 0;  /* overflow to node counters */
 
@@ -989,8 +1003,8 @@ unsigned long sum_zone_numa_state(int node,
 /*
  * Determine the per node value of a stat item.
  */
-unsigned long node_page_state(struct pglist_data *pgdat,
-                               enum node_stat_item item)
+unsigned long node_page_state_pages(struct pglist_data *pgdat,
+                                   enum node_stat_item item)
 {
        long x = atomic_long_read(&pgdat->vm_stat[item]);
 #ifdef CONFIG_SMP
@@ -999,6 +1013,14 @@ unsigned long node_page_state(struct pglist_data *pgdat,
 #endif
        return x;
 }
+
+unsigned long node_page_state(struct pglist_data *pgdat,
+                             enum node_stat_item item)
+{
+       VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
+       return node_page_state_pages(pgdat, item);
+}
 #endif
 
 #ifdef CONFIG_COMPACTION
@@ -1118,10 +1140,6 @@ const char * const vmstat_text[] = {
        "nr_zone_write_pending",
        "nr_mlock",
        "nr_page_table_pages",
-       "nr_kernel_stack",
-#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
-       "nr_shadow_call_stack",
-#endif
        "nr_bounce",
 #if IS_ENABLED(CONFIG_ZSMALLOC)
        "nr_zspages",
@@ -1172,6 +1190,10 @@ const char * const vmstat_text[] = {
        "nr_kernel_misc_reclaimable",
        "nr_foll_pin_acquired",
        "nr_foll_pin_released",
+       "nr_kernel_stack",
+#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
+       "nr_shadow_call_stack",
+#endif
 
        /* enum writeback_stat_item counters */
        "nr_dirty_threshold",
@@ -1577,7 +1599,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                seq_printf(m, "\n  per-node stats");
                for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
                        seq_printf(m, "\n      %-12s %lu", node_stat_name(i),
-                                  node_page_state(pgdat, i));
+                                  node_page_state_pages(pgdat, i));
                }
        }
        seq_printf(m,
@@ -1698,7 +1720,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 #endif
 
        for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
-               v[i] = global_node_page_state(i);
+               v[i] = global_node_page_state_pages(i);
        v += NR_VM_NODE_STAT_ITEMS;
 
        global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
index 50b7937..b199726 100644 (file)
@@ -486,8 +486,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
                for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
                        pages += lruvec_page_state_local(lruvec,
                                                         NR_LRU_BASE + i);
-               pages += lruvec_page_state_local(lruvec, NR_SLAB_RECLAIMABLE);
-               pages += lruvec_page_state_local(lruvec, NR_SLAB_UNRECLAIMABLE);
+               pages += lruvec_page_state_local(
+                       lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
+               pages += lruvec_page_state_local(
+                       lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
        } else
 #endif
                pages = node_present_pages(sc->nid);
index 3286f9d..f7a2f0e 100644 (file)
@@ -180,7 +180,7 @@ static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc)
 static void in_cache_put(in_cache_entry *entry)
 {
        if (refcount_dec_and_test(&entry->use)) {
-               kzfree(entry);
+               kfree_sensitive(entry);
        }
 }
 
@@ -415,7 +415,7 @@ static eg_cache_entry *eg_cache_get_by_src_ip(__be32 ipaddr,
 static void eg_cache_put(eg_cache_entry *entry)
 {
        if (refcount_dec_and_test(&entry->use)) {
-               kzfree(entry);
+               kfree_sensitive(entry);
        }
 }
 
index 2155ce8..3226fe0 100644 (file)
@@ -104,7 +104,7 @@ int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 public_key[64],
 free_all:
        kpp_request_free(req);
 free_tmp:
-       kzfree(tmp);
+       kfree_sensitive(tmp);
        return err;
 }
 
@@ -151,9 +151,9 @@ int set_ecdh_privkey(struct crypto_kpp *tfm, const u8 private_key[32])
        err = crypto_kpp_set_secret(tfm, buf, buf_len);
        /* fall through */
 free_all:
-       kzfree(buf);
+       kfree_sensitive(buf);
 free_tmp:
-       kzfree(tmp);
+       kfree_sensitive(tmp);
        return err;
 }
 
index 433227f..bf4bef1 100644 (file)
@@ -753,9 +753,9 @@ static void smp_chan_destroy(struct l2cap_conn *conn)
        complete = test_bit(SMP_FLAG_COMPLETE, &smp->flags);
        mgmt_smp_complete(hcon, complete);
 
-       kzfree(smp->csrk);
-       kzfree(smp->slave_csrk);
-       kzfree(smp->link_key);
+       kfree_sensitive(smp->csrk);
+       kfree_sensitive(smp->slave_csrk);
+       kfree_sensitive(smp->link_key);
 
        crypto_free_shash(smp->tfm_cmac);
        crypto_free_kpp(smp->tfm_ecdh);
@@ -789,7 +789,7 @@ static void smp_chan_destroy(struct l2cap_conn *conn)
        }
 
        chan->data = NULL;
-       kzfree(smp);
+       kfree_sensitive(smp);
        hci_conn_drop(hcon);
 }
 
@@ -1156,7 +1156,7 @@ static void sc_generate_link_key(struct smp_chan *smp)
                const u8 salt[16] = { 0x31, 0x70, 0x6d, 0x74 };
 
                if (smp_h7(smp->tfm_cmac, smp->tk, salt, smp->link_key)) {
-                       kzfree(smp->link_key);
+                       kfree_sensitive(smp->link_key);
                        smp->link_key = NULL;
                        return;
                }
@@ -1165,14 +1165,14 @@ static void sc_generate_link_key(struct smp_chan *smp)
                const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 };
 
                if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) {
-                       kzfree(smp->link_key);
+                       kfree_sensitive(smp->link_key);
                        smp->link_key = NULL;
                        return;
                }
        }
 
        if (smp_h6(smp->tfm_cmac, smp->link_key, lebr, smp->link_key)) {
-               kzfree(smp->link_key);
+               kfree_sensitive(smp->link_key);
                smp->link_key = NULL;
                return;
        }
@@ -1407,7 +1407,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn)
 free_shash:
        crypto_free_shash(smp->tfm_cmac);
 zfree_smp:
-       kzfree(smp);
+       kfree_sensitive(smp);
        return NULL;
 }
 
@@ -3278,7 +3278,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
        tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
        if (IS_ERR(tfm_cmac)) {
                BT_ERR("Unable to create CMAC crypto context");
-               kzfree(smp);
+               kfree_sensitive(smp);
                return ERR_CAST(tfm_cmac);
        }
 
@@ -3286,7 +3286,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
        if (IS_ERR(tfm_ecdh)) {
                BT_ERR("Unable to create ECDH crypto context");
                crypto_free_shash(tfm_cmac);
-               kzfree(smp);
+               kfree_sensitive(smp);
                return ERR_CAST(tfm_ecdh);
        }
 
@@ -3300,7 +3300,7 @@ create_chan:
                if (smp) {
                        crypto_free_shash(smp->tfm_cmac);
                        crypto_free_kpp(smp->tfm_ecdh);
-                       kzfree(smp);
+                       kfree_sensitive(smp);
                }
                return ERR_PTR(-ENOMEM);
        }
@@ -3347,7 +3347,7 @@ static void smp_del_chan(struct l2cap_chan *chan)
                chan->data = NULL;
                crypto_free_shash(smp->tfm_cmac);
                crypto_free_kpp(smp->tfm_ecdh);
-               kzfree(smp);
+               kfree_sensitive(smp);
        }
 
        l2cap_chan_put(chan);
index d29709e..a2044b4 100644 (file)
@@ -2265,7 +2265,7 @@ static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
        if (WARN_ON_ONCE(!mem))
                return;
        if (nullify)
-               kzfree(mem);
+               kfree_sensitive(mem);
        else
                kfree(mem);
        atomic_sub(size, &sk->sk_omem_alloc);
index 19ad958..c1a54f3 100644 (file)
@@ -38,7 +38,7 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head)
        struct tcp_fastopen_context *ctx =
            container_of(head, struct tcp_fastopen_context, rcu);
 
-       kzfree(ctx);
+       kfree_sensitive(ctx);
 }
 
 void tcp_fastopen_destroy_cipher(struct sock *sk)
index c5fe95e..d7b3d90 100644 (file)
@@ -41,7 +41,7 @@ int aead_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len,
        aead_request_set_ad(aead_req, sg[0].length);
 
        crypto_aead_encrypt(aead_req);
-       kzfree(aead_req);
+       kfree_sensitive(aead_req);
 
        return 0;
 }
@@ -76,7 +76,7 @@ int aead_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len,
        aead_request_set_ad(aead_req, sg[0].length);
 
        err = crypto_aead_decrypt(aead_req);
-       kzfree(aead_req);
+       kfree_sensitive(aead_req);
 
        return err;
 }
index 16ba09c..6f3b3a0 100644 (file)
@@ -60,7 +60,7 @@ int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce,
        aead_request_set_ad(aead_req, GMAC_AAD_LEN + data_len);
 
        crypto_aead_encrypt(aead_req);
-       kzfree(aead_req);
+       kfree_sensitive(aead_req);
 
        return 0;
 }
index 9c28880..2df636c 100644 (file)
@@ -732,7 +732,7 @@ static void ieee80211_key_free_common(struct ieee80211_key *key)
                ieee80211_aes_gcm_key_free(key->u.gcmp.tfm);
                break;
        }
-       kzfree(key);
+       kfree_sensitive(key);
 }
 
 static void __ieee80211_key_destroy(struct ieee80211_key *key,
index c079ee6..585d331 100644 (file)
@@ -49,7 +49,7 @@ void mac802154_llsec_destroy(struct mac802154_llsec *sec)
 
                msl = container_of(sl, struct mac802154_llsec_seclevel, level);
                list_del(&sl->list);
-               kzfree(msl);
+               kfree_sensitive(msl);
        }
 
        list_for_each_entry_safe(dev, dn, &sec->table.devices, list) {
@@ -66,7 +66,7 @@ void mac802154_llsec_destroy(struct mac802154_llsec *sec)
                mkey = container_of(key->key, struct mac802154_llsec_key, key);
                list_del(&key->list);
                llsec_key_put(mkey);
-               kzfree(key);
+               kfree_sensitive(key);
        }
 }
 
@@ -155,7 +155,7 @@ err_tfm:
                if (key->tfm[i])
                        crypto_free_aead(key->tfm[i]);
 
-       kzfree(key);
+       kfree_sensitive(key);
        return NULL;
 }
 
@@ -170,7 +170,7 @@ static void llsec_key_release(struct kref *ref)
                crypto_free_aead(key->tfm[i]);
 
        crypto_free_sync_skcipher(key->tfm0);
-       kzfree(key);
+       kfree_sensitive(key);
 }
 
 static struct mac802154_llsec_key*
@@ -261,7 +261,7 @@ int mac802154_llsec_key_add(struct mac802154_llsec *sec,
        return 0;
 
 fail:
-       kzfree(new);
+       kfree_sensitive(new);
        return -ENOMEM;
 }
 
@@ -341,10 +341,10 @@ static void llsec_dev_free(struct mac802154_llsec_device *dev)
                                      devkey);
 
                list_del(&pos->list);
-               kzfree(devkey);
+               kfree_sensitive(devkey);
        }
 
-       kzfree(dev);
+       kfree_sensitive(dev);
 }
 
 int mac802154_llsec_dev_add(struct mac802154_llsec *sec,
@@ -682,7 +682,7 @@ llsec_do_encrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec,
 
        rc = crypto_aead_encrypt(req);
 
-       kzfree(req);
+       kfree_sensitive(req);
 
        return rc;
 }
@@ -886,7 +886,7 @@ llsec_do_decrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec,
 
        rc = crypto_aead_decrypt(req);
 
-       kzfree(req);
+       kfree_sensitive(req);
        skb_trim(skb, skb->len - authlen);
 
        return rc;
@@ -926,7 +926,7 @@ llsec_update_devkey_record(struct mac802154_llsec_device *dev,
                if (!devkey)
                        list_add_rcu(&next->devkey.list, &dev->dev.keys);
                else
-                       kzfree(next);
+                       kfree_sensitive(next);
 
                spin_unlock_bh(&dev->lock);
        }
index 83e97e8..9e289c7 100644 (file)
@@ -49,7 +49,7 @@ void sctp_auth_key_put(struct sctp_auth_bytes *key)
                return;
 
        if (refcount_dec_and_test(&key->refcnt)) {
-               kzfree(key);
+               kfree_sensitive(key);
                SCTP_DBG_OBJCNT_DEC(keys);
        }
 }
index e7180da..794fb30 100644 (file)
@@ -1003,7 +1003,7 @@ krb5_rc4_setup_seq_key(struct krb5_ctx *kctx,
        err = 0;
 
 out_err:
-       kzfree(desc);
+       kfree_sensitive(desc);
        crypto_free_shash(hmac);
        dprintk("%s: returning %d\n", __func__, err);
        return err;
@@ -1079,7 +1079,7 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx,
        err = 0;
 
 out_err:
-       kzfree(desc);
+       kfree_sensitive(desc);
        crypto_free_shash(hmac);
        dprintk("%s: returning %d\n", __func__, err);
        return err;
index 3b7f721..726c076 100644 (file)
@@ -228,11 +228,11 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e,
        ret = 0;
 
 err_free_raw:
-       kzfree(rawkey);
+       kfree_sensitive(rawkey);
 err_free_out:
-       kzfree(outblockdata);
+       kfree_sensitive(outblockdata);
 err_free_in:
-       kzfree(inblockdata);
+       kfree_sensitive(inblockdata);
 err_free_cipher:
        crypto_free_sync_skcipher(cipher);
 err_return:
index 75b3c2e..a84a5b2 100644 (file)
@@ -443,7 +443,7 @@ context_derive_keys_rc4(struct krb5_ctx *ctx)
        desc->tfm = hmac;
 
        err = crypto_shash_digest(desc, sigkeyconstant, slen, ctx->cksum);
-       kzfree(desc);
+       kfree_sensitive(desc);
        if (err)
                goto out_err_free_hmac;
        /*
index c8c47fc..001bcb0 100644 (file)
@@ -441,7 +441,7 @@ static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey,
        /* Allocate per-cpu TFM entry pointer */
        tmp->tfm_entry = alloc_percpu(struct tipc_tfm *);
        if (!tmp->tfm_entry) {
-               kzfree(tmp);
+               kfree_sensitive(tmp);
                return -ENOMEM;
        }
 
@@ -491,7 +491,7 @@ static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey,
        /* Not any TFM is allocated? */
        if (!tfm_cnt) {
                free_percpu(tmp->tfm_entry);
-               kzfree(tmp);
+               kfree_sensitive(tmp);
                return err;
        }
 
@@ -545,7 +545,7 @@ static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src)
 
        aead->tfm_entry = alloc_percpu_gfp(struct tipc_tfm *, GFP_ATOMIC);
        if (unlikely(!aead->tfm_entry)) {
-               kzfree(aead);
+               kfree_sensitive(aead);
                return -ENOMEM;
        }
 
@@ -1352,7 +1352,7 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net,
        /* Allocate statistic structure */
        c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC);
        if (!c->stats) {
-               kzfree(c);
+               kfree_sensitive(c);
                return -ENOMEM;
        }
 
@@ -1408,7 +1408,7 @@ void tipc_crypto_stop(struct tipc_crypto **crypto)
        free_percpu(c->stats);
 
        *crypto = NULL;
-       kzfree(c);
+       kfree_sensitive(c);
 }
 
 void tipc_crypto_timeout(struct tipc_crypto *rx)
index 1971d7e..354b0cc 100644 (file)
@@ -1125,7 +1125,7 @@ static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync)
        }
 
 #ifdef CONFIG_CFG80211_WEXT
-       kzfree(wdev->wext.keys);
+       kfree_sensitive(wdev->wext.keys);
        wdev->wext.keys = NULL;
 #endif
        /* only initialized if we have a netdev */
index ae8fe66..a0621bb 100644 (file)
@@ -127,7 +127,7 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
                return -EINVAL;
 
        if (WARN_ON(wdev->connect_keys))
-               kzfree(wdev->connect_keys);
+               kfree_sensitive(wdev->connect_keys);
        wdev->connect_keys = connkeys;
 
        wdev->ibss_fixed = params->channel_fixed;
@@ -161,7 +161,7 @@ static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext)
 
        ASSERT_WDEV_LOCK(wdev);
 
-       kzfree(wdev->connect_keys);
+       kfree_sensitive(wdev->connect_keys);
        wdev->connect_keys = NULL;
 
        rdev_set_qos_map(rdev, dev, NULL);
index f5e842b..1b4d6c8 100644 (file)
@@ -131,7 +131,7 @@ static void lib80211_tkip_deinit(void *priv)
                crypto_free_shash(_priv->tx_tfm_michael);
                crypto_free_shash(_priv->rx_tfm_michael);
        }
-       kzfree(priv);
+       kfree_sensitive(priv);
 }
 
 static inline u16 RotR1(u16 val)
index dafc6f3..6ab9957 100644 (file)
@@ -56,7 +56,7 @@ static void *lib80211_wep_init(int keyidx)
 
 static void lib80211_wep_deinit(void *priv)
 {
-       kzfree(priv);
+       kfree_sensitive(priv);
 }
 
 /* Add WEP IV/key info to a frame that has at least 4 bytes of headroom */
index 814e23d..c04fc6c 100644 (file)
@@ -9836,7 +9836,7 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
 
                if ((ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT) &&
                    no_ht) {
-                       kzfree(connkeys);
+                       kfree_sensitive(connkeys);
                        return -EINVAL;
                }
        }
@@ -9848,7 +9848,7 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
                int r = validate_pae_over_nl80211(rdev, info);
 
                if (r < 0) {
-                       kzfree(connkeys);
+                       kfree_sensitive(connkeys);
                        return r;
                }
 
@@ -9861,7 +9861,7 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
        wdev_lock(dev->ieee80211_ptr);
        err = __cfg80211_join_ibss(rdev, dev, &ibss, connkeys);
        if (err)
-               kzfree(connkeys);
+               kfree_sensitive(connkeys);
        else if (info->attrs[NL80211_ATTR_SOCKET_OWNER])
                dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid;
        wdev_unlock(dev->ieee80211_ptr);
@@ -10289,7 +10289,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 
        if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) {
                if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]) {
-                       kzfree(connkeys);
+                       kfree_sensitive(connkeys);
                        return -EINVAL;
                }
                memcpy(&connect.ht_capa,
@@ -10307,7 +10307,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 
        if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) {
                if (!info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) {
-                       kzfree(connkeys);
+                       kfree_sensitive(connkeys);
                        return -EINVAL;
                }
                memcpy(&connect.vht_capa,
@@ -10321,7 +10321,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
                       (rdev->wiphy.features & NL80211_FEATURE_QUIET)) &&
                    !wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_RRM)) {
-                       kzfree(connkeys);
+                       kfree_sensitive(connkeys);
                        return -EINVAL;
                }
                connect.flags |= ASSOC_REQ_USE_RRM;
@@ -10329,21 +10329,21 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 
        connect.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]);
        if (connect.pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) {
-               kzfree(connkeys);
+               kfree_sensitive(connkeys);
                return -EOPNOTSUPP;
        }
 
        if (info->attrs[NL80211_ATTR_BSS_SELECT]) {
                /* bss selection makes no sense if bssid is set */
                if (connect.bssid) {
-                       kzfree(connkeys);
+                       kfree_sensitive(connkeys);
                        return -EINVAL;
                }
 
                err = parse_bss_select(info->attrs[NL80211_ATTR_BSS_SELECT],
                                       wiphy, &connect.bss_select);
                if (err) {
-                       kzfree(connkeys);
+                       kfree_sensitive(connkeys);
                        return err;
                }
        }
@@ -10373,13 +10373,13 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
                   info->attrs[NL80211_ATTR_FILS_ERP_REALM] ||
                   info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] ||
                   info->attrs[NL80211_ATTR_FILS_ERP_RRK]) {
-               kzfree(connkeys);
+               kfree_sensitive(connkeys);
                return -EINVAL;
        }
 
        if (nla_get_flag(info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT])) {
                if (!info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
-                       kzfree(connkeys);
+                       kfree_sensitive(connkeys);
                        GENL_SET_ERR_MSG(info,
                                         "external auth requires connection ownership");
                        return -EINVAL;
@@ -10392,7 +10392,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
        err = cfg80211_connect(rdev, dev, &connect, connkeys,
                               connect.prev_bssid);
        if (err)
-               kzfree(connkeys);
+               kfree_sensitive(connkeys);
 
        if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
                dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid;
index 15595cf..985f3c2 100644 (file)
@@ -742,7 +742,7 @@ void __cfg80211_connect_result(struct net_device *dev,
        }
 
        if (cr->status != WLAN_STATUS_SUCCESS) {
-               kzfree(wdev->connect_keys);
+               kfree_sensitive(wdev->connect_keys);
                wdev->connect_keys = NULL;
                wdev->ssid_len = 0;
                wdev->conn_owner_nlportid = 0;
@@ -1098,7 +1098,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
        wdev->current_bss = NULL;
        wdev->ssid_len = 0;
        wdev->conn_owner_nlportid = 0;
-       kzfree(wdev->connect_keys);
+       kfree_sensitive(wdev->connect_keys);
        wdev->connect_keys = NULL;
 
        nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap);
@@ -1281,7 +1281,7 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
 
        ASSERT_WDEV_LOCK(wdev);
 
-       kzfree(wdev->connect_keys);
+       kfree_sensitive(wdev->connect_keys);
        wdev->connect_keys = NULL;
 
        wdev->conn_owner_nlportid = 0;
index 26a9773..dfad1c0 100644 (file)
@@ -871,7 +871,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev)
                }
        }
 
-       kzfree(wdev->connect_keys);
+       kfree_sensitive(wdev->connect_keys);
        wdev->connect_keys = NULL;
 }
 
index 73fd0ea..73df235 100644 (file)
@@ -57,7 +57,7 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
        err = cfg80211_connect(rdev, wdev->netdev,
                               &wdev->wext.connect, ck, prev_bssid);
        if (err)
-               kzfree(ck);
+               kfree_sensitive(ck);
 
        return err;
 }
index 03757cc..f4beee1 100644 (file)
@@ -44,7 +44,8 @@ else
 endif
 
 CFLAGS_KASAN := -fsanitize=kernel-hwaddress \
-               -mllvm -hwasan-instrument-stack=0 \
+               -mllvm -hwasan-instrument-stack=$(CONFIG_KASAN_STACK) \
+               -mllvm -hwasan-use-short-granules=0 \
                $(instrumentation_flags)
 
 endif # CONFIG_KASAN_SW_TAGS
index 8c965f6..d7ca46c 100755 (executable)
@@ -26,6 +26,8 @@ def getsizes(file, format):
     sym = {}
     with os.popen("nm --size-sort " + file) as f:
         for line in f:
+            if line.startswith("\n") or ":" in line:
+                continue
             size, type, name = line.split()
             if type in format:
                 # strip generated symbols
index 3357bf4..da80050 100644 (file)
@@ -89,7 +89,7 @@ position p;
 (
  kfree@p(x)
 |
- kzfree@p(x)
+ kfree_sensitive@p(x)
 |
  krealloc@p(x, ...)
 |
@@ -112,7 +112,7 @@ position p != safe.p;
 (
 * kfree@p(x)
 |
-* kzfree@p(x)
+* kfree_sensitive@p(x)
 |
 * krealloc@p(x, ...)
 |
index b3290c4..2045391 100644 (file)
@@ -21,7 +21,7 @@ expression E;
 (
   kfree(E);
 |
-  kzfree(E);
+  kfree_sensitive(E);
 |
   debugfs_remove(E);
 |
@@ -42,7 +42,7 @@ position p;
 @@
 
 * if (E != NULL)
-*      \(kfree@p\|kzfree@p\|debugfs_remove@p\|debugfs_remove_recursive@p\|
+*      \(kfree@p\|kfree_sensitive@p\|debugfs_remove@p\|debugfs_remove_recursive@p\|
 *         usb_free_urb@p\|kmem_cache_destroy@p\|mempool_destroy@p\|
 *         dma_pool_destroy@p\)(E);
 
index e9d50e7..1685683 100644 (file)
@@ -24,7 +24,7 @@ position p1;
 (
 * kfree@p1(E)
 |
-* kzfree@p1(E)
+* kfree_sensitive@p1(E)
 )
 
 @print expression@
@@ -68,7 +68,7 @@ while (1) { ...
 (
 * kfree@ok(E)
 |
-* kzfree@ok(E)
+* kfree_sensitive@ok(E)
 )
   ... when != break;
       when != goto l;
@@ -86,7 +86,7 @@ position free.p1!=loop.ok,p2!={print.p,sz.p};
 (
 * kfree@p1(E,...)
 |
-* kzfree@p1(E,...)
+* kfree_sensitive@p1(E,...)
 )
 ...
 (
index cfaf308..142af63 100644 (file)
@@ -20,7 +20,7 @@ position p;
 (
 * kfree@p(&e->f)
 |
-* kzfree@p(&e->f)
+* kfree_sensitive@p(&e->f)
 )
 
 @script:python depends on org@
index ac5f126..e9df9cc 100644 (file)
@@ -44,6 +44,7 @@ platform_hibernation_ops
 platform_suspend_ops
 proto_ops
 regmap_access_table
+regulator_ops
 rpc_pipe_ops
 rtc_class_ops
 sd_desc
index 0869def..9039834 100755 (executable)
@@ -3,18 +3,68 @@
 # (c) 2014, Sasha Levin <sasha.levin@oracle.com>
 #set -x
 
-if [[ $# < 2 ]]; then
+if [[ $# < 1 ]]; then
        echo "Usage:"
-       echo "  $0 [vmlinux] [base path] [modules path]"
+       echo "  $0 -r <release> | <vmlinux> [base path] [modules path]"
        exit 1
 fi
 
-vmlinux=$1
-basepath=$2
-modpath=$3
+if [[ $1 == "-r" ]] ; then
+       vmlinux=""
+       basepath="auto"
+       modpath=""
+       release=$2
+
+       for fn in {,/usr/lib/debug}/boot/vmlinux-$release{,.debug} /lib/modules/$release{,/build}/vmlinux ; do
+               if [ -e "$fn" ] ; then
+                       vmlinux=$fn
+                       break
+               fi
+       done
+
+       if [[ $vmlinux == "" ]] ; then
+               echo "ERROR! vmlinux image for release $release is not found" >&2
+               exit 2
+       fi
+else
+       vmlinux=$1
+       basepath=${2-auto}
+       modpath=$3
+       release=""
+fi
+
 declare -A cache
 declare -A modcache
 
+find_module() {
+       if [[ "$modpath" != "" ]] ; then
+               for fn in $(find "$modpath" -name "${module//_/[-_]}.ko*") ; do
+                       if readelf -WS "$fn" | grep -qwF .debug_line ; then
+                               echo $fn
+                               return
+                       fi
+               done
+               return 1
+       fi
+
+       modpath=$(dirname "$vmlinux")
+       find_module && return
+
+       if [[ $release == "" ]] ; then
+               release=$(gdb -ex 'print init_uts_ns.name.release' -ex 'quit' -quiet -batch "$vmlinux" | sed -n 's/\$1 = "\(.*\)".*/\1/p')
+       fi
+
+       for dn in {/usr/lib/debug,}/lib/modules/$release ; do
+               if [ -e "$dn" ] ; then
+                       modpath="$dn"
+                       find_module && return
+               fi
+       done
+
+       modpath=""
+       return 1
+}
+
 parse_symbol() {
        # The structure of symbol at this point is:
        #   ([name]+[offset]/[total length])
@@ -27,12 +77,11 @@ parse_symbol() {
        elif [[ "${modcache[$module]+isset}" == "isset" ]]; then
                local objfile=${modcache[$module]}
        else
-               if [[ $modpath == "" ]]; then
+               local objfile=$(find_module)
+               if [[ $objfile == "" ]] ; then
                        echo "WARNING! Modules path isn't set, but is needed to parse this symbol" >&2
                        return
                fi
-               local objfile=$(find "$modpath" -name "${module//_/[-_]}.ko*" -print -quit)
-               [[ $objfile == "" ]] && return
                modcache[$module]=$objfile
        fi
 
@@ -56,7 +105,11 @@ parse_symbol() {
        if [[ "${cache[$module,$name]+isset}" == "isset" ]]; then
                local base_addr=${cache[$module,$name]}
        else
-               local base_addr=$(nm "$objfile" | grep -i ' t ' | awk "/ $name\$/ {print \$1}" | head -n1)
+               local base_addr=$(nm "$objfile" | awk '$3 == "'$name'" && ($2 == "t" || $2 == "T") {print $1; exit}')
+               if [[ $base_addr == "" ]] ; then
+                       # address not found
+                       return
+               fi
                cache[$module,$name]="$base_addr"
        fi
        # Let's start doing the math to get the exact address into the
@@ -148,6 +201,14 @@ handle_line() {
        echo "${words[@]}" "$symbol $module"
 }
 
+if [[ $basepath == "auto" ]] ; then
+       module=""
+       symbol="kernel_init+0x0/0x0"
+       parse_symbol
+       basepath=${symbol#kernel_init (}
+       basepath=${basepath%/init/main.c:*)}
+fi
+
 while read line; do
        # Let's see if we have an address in the line
        if [[ $line =~ \[\<([^]]+)\>\] ]] ||
index c45e9af..f253681 100644 (file)
@@ -149,6 +149,7 @@ arbitary||arbitrary
 architechture||architecture
 arguement||argument
 arguements||arguments
+arithmatic||arithmetic
 aritmetic||arithmetic
 arne't||aren't
 arraival||arrival
@@ -454,6 +455,7 @@ destorys||destroys
 destroied||destroyed
 detabase||database
 deteced||detected
+detectt||detect
 develope||develop
 developement||development
 developped||developed
@@ -545,6 +547,7 @@ entires||entries
 entites||entities
 entrys||entries
 enocded||encoded
+enought||enough
 enterily||entirely
 enviroiment||environment
 enviroment||environment
@@ -556,11 +559,14 @@ equivelant||equivalent
 equivilant||equivalent
 eror||error
 errorr||error
+errror||error
 estbalishment||establishment
 etsablishment||establishment
 etsbalishment||establishment
+evalution||evaluation
 excecutable||executable
 exceded||exceeded
+exceds||exceeds
 exceeed||exceed
 excellant||excellent
 execeeded||exceeded
@@ -583,6 +589,7 @@ explictly||explicitly
 expresion||expression
 exprimental||experimental
 extened||extended
+exteneded||extended
 extensability||extensibility
 extention||extension
 extenstion||extension
@@ -610,10 +617,12 @@ feautures||features
 fetaure||feature
 fetaures||features
 fileystem||filesystem
+fimrware||firmware
 fimware||firmware
 firmare||firmware
 firmaware||firmware
 firware||firmware
+firwmare||firmware
 finanize||finalize
 findn||find
 finilizes||finalizes
@@ -661,6 +670,7 @@ globel||global
 grabing||grabbing
 grahical||graphical
 grahpical||graphical
+granularty||granularity
 grapic||graphic
 grranted||granted
 guage||gauge
@@ -906,6 +916,7 @@ miximum||maximum
 mmnemonic||mnemonic
 mnay||many
 modfiy||modify
+modifer||modifier
 modulues||modules
 momery||memory
 memomry||memory
@@ -915,6 +926,7 @@ monochromo||monochrome
 monocrome||monochrome
 mopdule||module
 mroe||more
+multipler||multiplier
 mulitplied||multiplied
 multidimensionnal||multidimensional
 multipe||multiple
@@ -952,6 +964,7 @@ occassionally||occasionally
 occationally||occasionally
 occurance||occurrence
 occurances||occurrences
+occurd||occurred
 occured||occurred
 occurence||occurrence
 occure||occurred
@@ -1058,6 +1071,7 @@ precission||precision
 preemptable||preemptible
 prefered||preferred
 prefferably||preferably
+prefitler||prefilter
 premption||preemption
 prepaired||prepared
 preperation||preparation
@@ -1101,6 +1115,7 @@ pronunce||pronounce
 propery||property
 propigate||propagate
 propigation||propagation
+propogation||propagation
 propogate||propagate
 prosess||process
 protable||portable
@@ -1316,6 +1331,7 @@ sturcture||structure
 subdirectoires||subdirectories
 suble||subtle
 substract||subtract
+submited||submitted
 submition||submission
 suceed||succeed
 succesfully||successfully
@@ -1324,6 +1340,7 @@ successed||succeeded
 successfull||successful
 successfuly||successfully
 sucessfully||successfully
+sucessful||successful
 sucess||success
 superflous||superfluous
 superseeded||superseded
@@ -1409,6 +1426,7 @@ transormed||transformed
 trasfer||transfer
 trasmission||transmission
 treshold||threshold
+triggerd||triggered
 trigerred||triggered
 trigerring||triggering
 trun||turn
@@ -1421,6 +1439,7 @@ uknown||unknown
 usccess||success
 usupported||unsupported
 uncommited||uncommitted
+uncompatible||incompatible
 unconditionaly||unconditionally
 undeflow||underflow
 underun||underrun
index 4e18ae5..32d3f53 100755 (executable)
@@ -91,20 +91,10 @@ all_sources()
 
 all_compiled_sources()
 {
-       for i in $(all_sources); do
-               case "$i" in
-                       *.[cS])
-                               j=${i/\.[cS]/\.o}
-                               j="${j#$tree}"
-                               if [ -e $j ]; then
-                                       echo $i
-                               fi
-                               ;;
-                       *)
-                               echo $i
-                               ;;
-               esac
-       done
+       realpath -es $([ -z "$KBUILD_ABS_SRCTREE" ] && echo --relative-to=.) \
+               include/generated/autoconf.h $(find -name "*.cmd" -exec \
+               grep -Poh '(?(?=^source_.* \K).*|(?=^  \K\S).*(?= \\))' {} \+ |
+               awk '!a[$0]++') | sort -u
 }
 
 all_target_sources()
index 1c89805..7b0e13c 100644 (file)
@@ -40,8 +40,8 @@ void aa_free_domain_entries(struct aa_domain *domain)
                        return;
 
                for (i = 0; i < domain->size; i++)
-                       kzfree(domain->table[i]);
-               kzfree(domain->table);
+                       kfree_sensitive(domain->table[i]);
+               kfree_sensitive(domain->table);
                domain->table = NULL;
        }
 }
index aff26fc..d4f8948 100644 (file)
@@ -72,7 +72,7 @@ static inline void aa_free_file_ctx(struct aa_file_ctx *ctx)
 {
        if (ctx) {
                aa_put_label(rcu_access_pointer(ctx->label));
-               kzfree(ctx);
+               kfree_sensitive(ctx);
        }
 }
 
index af4f50f..4c010c9 100644 (file)
@@ -187,9 +187,9 @@ static void aa_free_data(void *ptr, void *arg)
 {
        struct aa_data *data = ptr;
 
-       kzfree(data->data);
-       kzfree(data->key);
-       kzfree(data);
+       kfree_sensitive(data->data);
+       kfree_sensitive(data->key);
+       kfree_sensitive(data);
 }
 
 /**
@@ -217,19 +217,19 @@ void aa_free_profile(struct aa_profile *profile)
        aa_put_profile(rcu_access_pointer(profile->parent));
 
        aa_put_ns(profile->ns);
-       kzfree(profile->rename);
+       kfree_sensitive(profile->rename);
 
        aa_free_file_rules(&profile->file);
        aa_free_cap_rules(&profile->caps);
        aa_free_rlimit_rules(&profile->rlimits);
 
        for (i = 0; i < profile->xattr_count; i++)
-               kzfree(profile->xattrs[i]);
-       kzfree(profile->xattrs);
+               kfree_sensitive(profile->xattrs[i]);
+       kfree_sensitive(profile->xattrs);
        for (i = 0; i < profile->secmark_count; i++)
-               kzfree(profile->secmark[i].label);
-       kzfree(profile->secmark);
-       kzfree(profile->dirname);
+               kfree_sensitive(profile->secmark[i].label);
+       kfree_sensitive(profile->secmark);
+       kfree_sensitive(profile->dirname);
        aa_put_dfa(profile->xmatch);
        aa_put_dfa(profile->policy.dfa);
 
@@ -237,14 +237,14 @@ void aa_free_profile(struct aa_profile *profile)
                rht = profile->data;
                profile->data = NULL;
                rhashtable_free_and_destroy(rht, aa_free_data, NULL);
-               kzfree(rht);
+               kfree_sensitive(rht);
        }
 
-       kzfree(profile->hash);
+       kfree_sensitive(profile->hash);
        aa_put_loaddata(profile->rawdata);
        aa_label_destroy(&profile->label);
 
-       kzfree(profile);
+       kfree_sensitive(profile);
 }
 
 /**
index d7ef540..70921d9 100644 (file)
@@ -121,9 +121,9 @@ static struct aa_ns *alloc_ns(const char *prefix, const char *name)
        return ns;
 
 fail_unconfined:
-       kzfree(ns->base.hname);
+       kfree_sensitive(ns->base.hname);
 fail_ns:
-       kzfree(ns);
+       kfree_sensitive(ns);
        return NULL;
 }
 
@@ -145,7 +145,7 @@ void aa_free_ns(struct aa_ns *ns)
 
        ns->unconfined->ns = NULL;
        aa_free_profile(ns->unconfined);
-       kzfree(ns);
+       kfree_sensitive(ns);
 }
 
 /**
index b67322a..dc345ac 100644 (file)
@@ -163,10 +163,10 @@ static void do_loaddata_free(struct work_struct *work)
                aa_put_ns(ns);
        }
 
-       kzfree(d->hash);
-       kzfree(d->name);
+       kfree_sensitive(d->hash);
+       kfree_sensitive(d->name);
        kvfree(d->data);
-       kzfree(d);
+       kfree_sensitive(d);
 }
 
 void aa_loaddata_kref(struct kref *kref)
@@ -894,7 +894,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name)
                while (unpack_strdup(e, &key, NULL)) {
                        data = kzalloc(sizeof(*data), GFP_KERNEL);
                        if (!data) {
-                               kzfree(key);
+                               kfree_sensitive(key);
                                goto fail;
                        }
 
@@ -902,8 +902,8 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name)
                        data->size = unpack_blob(e, &data->data, NULL);
                        data->data = kvmemdup(data->data, data->size);
                        if (data->size && !data->data) {
-                               kzfree(data->key);
-                               kzfree(data);
+                               kfree_sensitive(data->key);
+                               kfree_sensitive(data);
                                goto fail;
                        }
 
@@ -1037,7 +1037,7 @@ void aa_load_ent_free(struct aa_load_ent *ent)
                aa_put_profile(ent->old);
                aa_put_profile(ent->new);
                kfree(ent->ns_name);
-               kzfree(ent);
+               kfree_sensitive(ent);
        }
 }
 
index dd708e8..691347d 100644 (file)
@@ -138,7 +138,7 @@ int big_key_preparse(struct key_preparsed_payload *prep)
 err_fput:
        fput(file);
 err_enckey:
-       kzfree(enckey);
+       kfree_sensitive(enckey);
 error:
        memzero_explicit(buf, enclen);
        kvfree(buf);
@@ -155,7 +155,7 @@ void big_key_free_preparse(struct key_preparsed_payload *prep)
 
                path_put(path);
        }
-       kzfree(prep->payload.data[big_key_data]);
+       kfree_sensitive(prep->payload.data[big_key_data]);
 }
 
 /*
@@ -187,7 +187,7 @@ void big_key_destroy(struct key *key)
                path->mnt = NULL;
                path->dentry = NULL;
        }
-       kzfree(key->payload.data[big_key_data]);
+       kfree_sensitive(key->payload.data[big_key_data]);
        key->payload.data[big_key_data] = NULL;
 }
 
index c4c629b..1abfa70 100644 (file)
@@ -58,9 +58,9 @@ error:
 
 static void dh_free_data(struct dh *dh)
 {
-       kzfree(dh->key);
-       kzfree(dh->p);
-       kzfree(dh->g);
+       kfree_sensitive(dh->key);
+       kfree_sensitive(dh->p);
+       kfree_sensitive(dh->g);
 }
 
 struct dh_completion {
@@ -126,7 +126,7 @@ static void kdf_dealloc(struct kdf_sdesc *sdesc)
        if (sdesc->shash.tfm)
                crypto_free_shash(sdesc->shash.tfm);
 
-       kzfree(sdesc);
+       kfree_sensitive(sdesc);
 }
 
 /*
@@ -220,7 +220,7 @@ static int keyctl_dh_compute_kdf(struct kdf_sdesc *sdesc,
                ret = -EFAULT;
 
 err:
-       kzfree(outbuf);
+       kfree_sensitive(outbuf);
        return ret;
 }
 
@@ -395,11 +395,11 @@ long __keyctl_dh_compute(struct keyctl_dh_params __user *params,
 out6:
        kpp_request_free(req);
 out5:
-       kzfree(outbuf);
+       kfree_sensitive(outbuf);
 out4:
        crypto_free_kpp(tfm);
 out3:
-       kzfree(secret);
+       kfree_sensitive(secret);
 out2:
        dh_free_data(&dh_inputs);
 out1:
index 14cf81d..deebbf1 100644 (file)
@@ -370,7 +370,7 @@ static int get_derived_key(u8 *derived_key, enum derived_key_type key_type,
               master_keylen);
        ret = crypto_shash_tfm_digest(hash_tfm, derived_buf, derived_buf_len,
                                      derived_key);
-       kzfree(derived_buf);
+       kfree_sensitive(derived_buf);
        return ret;
 }
 
@@ -812,13 +812,13 @@ static int encrypted_instantiate(struct key *key,
        ret = encrypted_init(epayload, key->description, format, master_desc,
                             decrypted_datalen, hex_encoded_iv);
        if (ret < 0) {
-               kzfree(epayload);
+               kfree_sensitive(epayload);
                goto out;
        }
 
        rcu_assign_keypointer(key, epayload);
 out:
-       kzfree(datablob);
+       kfree_sensitive(datablob);
        return ret;
 }
 
@@ -827,7 +827,7 @@ static void encrypted_rcu_free(struct rcu_head *rcu)
        struct encrypted_key_payload *epayload;
 
        epayload = container_of(rcu, struct encrypted_key_payload, rcu);
-       kzfree(epayload);
+       kfree_sensitive(epayload);
 }
 
 /*
@@ -885,7 +885,7 @@ static int encrypted_update(struct key *key, struct key_preparsed_payload *prep)
        rcu_assign_keypointer(key, new_epayload);
        call_rcu(&epayload->rcu, encrypted_rcu_free);
 out:
-       kzfree(buf);
+       kfree_sensitive(buf);
        return ret;
 }
 
@@ -946,7 +946,7 @@ static long encrypted_read(const struct key *key, char *buffer,
        memzero_explicit(derived_key, sizeof(derived_key));
 
        memcpy(buffer, ascii_buf, asciiblob_len);
-       kzfree(ascii_buf);
+       kfree_sensitive(ascii_buf);
 
        return asciiblob_len;
 out:
@@ -961,7 +961,7 @@ out:
  */
 static void encrypted_destroy(struct key *key)
 {
-       kzfree(key->payload.data[0]);
+       kfree_sensitive(key->payload.data[0]);
 }
 
 struct key_type key_type_encrypted = {
index 8001ab0..b9fe02e 100644 (file)
@@ -68,7 +68,7 @@ static int TSS_sha1(const unsigned char *data, unsigned int datalen,
        }
 
        ret = crypto_shash_digest(&sdesc->shash, data, datalen, digest);
-       kzfree(sdesc);
+       kfree_sensitive(sdesc);
        return ret;
 }
 
@@ -112,7 +112,7 @@ static int TSS_rawhmac(unsigned char *digest, const unsigned char *key,
        if (!ret)
                ret = crypto_shash_final(&sdesc->shash, digest);
 out:
-       kzfree(sdesc);
+       kfree_sensitive(sdesc);
        return ret;
 }
 
@@ -166,7 +166,7 @@ int TSS_authhmac(unsigned char *digest, const unsigned char *key,
                                  paramdigest, TPM_NONCE_SIZE, h1,
                                  TPM_NONCE_SIZE, h2, 1, &c, 0, 0);
 out:
-       kzfree(sdesc);
+       kfree_sensitive(sdesc);
        return ret;
 }
 EXPORT_SYMBOL_GPL(TSS_authhmac);
@@ -251,7 +251,7 @@ int TSS_checkhmac1(unsigned char *buffer,
        if (memcmp(testhmac, authdata, SHA1_DIGEST_SIZE))
                ret = -EINVAL;
 out:
-       kzfree(sdesc);
+       kfree_sensitive(sdesc);
        return ret;
 }
 EXPORT_SYMBOL_GPL(TSS_checkhmac1);
@@ -353,7 +353,7 @@ static int TSS_checkhmac2(unsigned char *buffer,
        if (memcmp(testhmac2, authdata2, SHA1_DIGEST_SIZE))
                ret = -EINVAL;
 out:
-       kzfree(sdesc);
+       kfree_sensitive(sdesc);
        return ret;
 }
 
@@ -563,7 +563,7 @@ static int tpm_seal(struct tpm_buf *tb, uint16_t keytype,
                *bloblen = storedsize;
        }
 out:
-       kzfree(td);
+       kfree_sensitive(td);
        return ret;
 }
 
@@ -1031,12 +1031,12 @@ static int trusted_instantiate(struct key *key,
        if (!ret && options->pcrlock)
                ret = pcrlock(options->pcrlock);
 out:
-       kzfree(datablob);
-       kzfree(options);
+       kfree_sensitive(datablob);
+       kfree_sensitive(options);
        if (!ret)
                rcu_assign_keypointer(key, payload);
        else
-               kzfree(payload);
+               kfree_sensitive(payload);
        return ret;
 }
 
@@ -1045,7 +1045,7 @@ static void trusted_rcu_free(struct rcu_head *rcu)
        struct trusted_key_payload *p;
 
        p = container_of(rcu, struct trusted_key_payload, rcu);
-       kzfree(p);
+       kfree_sensitive(p);
 }
 
 /*
@@ -1087,13 +1087,13 @@ static int trusted_update(struct key *key, struct key_preparsed_payload *prep)
        ret = datablob_parse(datablob, new_p, new_o);
        if (ret != Opt_update) {
                ret = -EINVAL;
-               kzfree(new_p);
+               kfree_sensitive(new_p);
                goto out;
        }
 
        if (!new_o->keyhandle) {
                ret = -EINVAL;
-               kzfree(new_p);
+               kfree_sensitive(new_p);
                goto out;
        }
 
@@ -1107,22 +1107,22 @@ static int trusted_update(struct key *key, struct key_preparsed_payload *prep)
        ret = key_seal(new_p, new_o);
        if (ret < 0) {
                pr_info("trusted_key: key_seal failed (%d)\n", ret);
-               kzfree(new_p);
+               kfree_sensitive(new_p);
                goto out;
        }
        if (new_o->pcrlock) {
                ret = pcrlock(new_o->pcrlock);
                if (ret < 0) {
                        pr_info("trusted_key: pcrlock failed (%d)\n", ret);
-                       kzfree(new_p);
+                       kfree_sensitive(new_p);
                        goto out;
                }
        }
        rcu_assign_keypointer(key, new_p);
        call_rcu(&p->rcu, trusted_rcu_free);
 out:
-       kzfree(datablob);
-       kzfree(new_o);
+       kfree_sensitive(datablob);
+       kfree_sensitive(new_o);
        return ret;
 }
 
@@ -1154,7 +1154,7 @@ static long trusted_read(const struct key *key, char *buffer,
  */
 static void trusted_destroy(struct key *key)
 {
-       kzfree(key->payload.data[0]);
+       kfree_sensitive(key->payload.data[0]);
 }
 
 struct key_type key_type_trusted = {
index 07d4287..749e2a4 100644 (file)
@@ -82,7 +82,7 @@ EXPORT_SYMBOL_GPL(user_preparse);
  */
 void user_free_preparse(struct key_preparsed_payload *prep)
 {
-       kzfree(prep->payload.data[0]);
+       kfree_sensitive(prep->payload.data[0]);
 }
 EXPORT_SYMBOL_GPL(user_free_preparse);
 
@@ -91,7 +91,7 @@ static void user_free_payload_rcu(struct rcu_head *head)
        struct user_key_payload *payload;
 
        payload = container_of(head, struct user_key_payload, rcu);
-       kzfree(payload);
+       kfree_sensitive(payload);
 }
 
 /*
@@ -147,7 +147,7 @@ void user_destroy(struct key *key)
 {
        struct user_key_payload *upayload = key->payload.data[0];
 
-       kzfree(upayload);
+       kfree_sensitive(upayload);
 }
 
 EXPORT_SYMBOL_GPL(user_destroy);
diff --git a/tools/cgroup/memcg_slabinfo.py b/tools/cgroup/memcg_slabinfo.py
new file mode 100644 (file)
index 0000000..c4225ed
--- /dev/null
@@ -0,0 +1,226 @@
+#!/usr/bin/env drgn
+#
+# Copyright (C) 2020 Roman Gushchin <guro@fb.com>
+# Copyright (C) 2020 Facebook
+
+from os import stat
+import argparse
+import sys
+
+from drgn.helpers.linux import list_for_each_entry, list_empty
+from drgn.helpers.linux import for_each_page
+from drgn.helpers.linux.cpumask import for_each_online_cpu
+from drgn.helpers.linux.percpu import per_cpu_ptr
+from drgn import container_of, FaultError, Object
+
+
+DESC = """
+This is a drgn script to provide slab statistics for memory cgroups.
+It supports cgroup v2 and v1 and can emulate memory.kmem.slabinfo
+interface of cgroup v1.
+For drgn, visit https://github.com/osandov/drgn.
+"""
+
+
+MEMCGS = {}
+
+OO_SHIFT = 16
+OO_MASK = ((1 << OO_SHIFT) - 1)
+
+
+def err(s):
+    print('slabinfo.py: error: %s' % s, file=sys.stderr, flush=True)
+    sys.exit(1)
+
+
+def find_memcg_ids(css=prog['root_mem_cgroup'].css, prefix=''):
+    if not list_empty(css.children.address_of_()):
+        for css in list_for_each_entry('struct cgroup_subsys_state',
+                                       css.children.address_of_(),
+                                       'sibling'):
+            name = prefix + '/' + css.cgroup.kn.name.string_().decode('utf-8')
+            memcg = container_of(css, 'struct mem_cgroup', 'css')
+            MEMCGS[css.cgroup.kn.id.value_()] = memcg
+            find_memcg_ids(css, name)
+
+
+def is_root_cache(s):
+    try:
+        return False if s.memcg_params.root_cache else True
+    except AttributeError:
+        return True
+
+
+def cache_name(s):
+    if is_root_cache(s):
+        return s.name.string_().decode('utf-8')
+    else:
+        return s.memcg_params.root_cache.name.string_().decode('utf-8')
+
+
+# SLUB
+
+def oo_order(s):
+    return s.oo.x >> OO_SHIFT
+
+
+def oo_objects(s):
+    return s.oo.x & OO_MASK
+
+
+def count_partial(n, fn):
+    nr_pages = 0
+    for page in list_for_each_entry('struct page', n.partial.address_of_(),
+                                    'lru'):
+         nr_pages += fn(page)
+    return nr_pages
+
+
+def count_free(page):
+    return page.objects - page.inuse
+
+
+def slub_get_slabinfo(s, cfg):
+    nr_slabs = 0
+    nr_objs = 0
+    nr_free = 0
+
+    for node in range(cfg['nr_nodes']):
+        n = s.node[node]
+        nr_slabs += n.nr_slabs.counter.value_()
+        nr_objs += n.total_objects.counter.value_()
+        nr_free += count_partial(n, count_free)
+
+    return {'active_objs': nr_objs - nr_free,
+            'num_objs': nr_objs,
+            'active_slabs': nr_slabs,
+            'num_slabs': nr_slabs,
+            'objects_per_slab': oo_objects(s),
+            'cache_order': oo_order(s),
+            'limit': 0,
+            'batchcount': 0,
+            'shared': 0,
+            'shared_avail': 0}
+
+
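+# Print one slabinfo-formatted line for the cache.  With shared slab pages
+# (cgroup-aware slab accounting) the per-cgroup object count computed by
+# the caller is reported instead of the cache-wide numbers.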
+def cache_show(s, cfg, objs):
+    if cfg['allocator'] == 'SLUB':
+        sinfo = slub_get_slabinfo(s, cfg)
+    else:
+        err('SLAB isn\'t supported yet')
+
+    if cfg['shared_slab_pages']:
+        sinfo['active_objs'] = objs
+        sinfo['num_objs'] = objs
+
+    print('%-17s %6lu %6lu %6u %4u %4d'
+          ' : tunables %4u %4u %4u'
+          ' : slabdata %6lu %6lu %6lu' % (
+              cache_name(s), sinfo['active_objs'], sinfo['num_objs'],
+              s.size, sinfo['objects_per_slab'], 1 << sinfo['cache_order'],
+              sinfo['limit'], sinfo['batchcount'], sinfo['shared'],
+              sinfo['active_slabs'], sinfo['num_slabs'],
+              sinfo['shared_avail']))
+
+
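+# Probe the examined kernel: the number of online NUMA nodes, which slab
+# allocator it uses (inferred from the layout of struct kmem_cache), and
+# whether slab pages are shared between cgroups (struct obj_cgroup exists).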
+def detect_kernel_config():
+    cfg = {}
+
+    cfg['nr_nodes'] = prog['nr_online_nodes'].value_()
+
+    if prog.type('struct kmem_cache').members[1][1] == 'flags':
+        cfg['allocator'] = 'SLUB'
+    elif prog.type('struct kmem_cache').members[1][1] == 'batchcount':
+        cfg['allocator'] = 'SLAB'
+    else:
+        err('Can\'t determine the slab allocator')
+
+    cfg['shared_slab_pages'] = False
+    try:
+        if prog.type('struct obj_cgroup'):
+            cfg['shared_slab_pages'] = True
+    except:
+        pass
+
+    return cfg
+
+
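+# Yield every struct page that has PG_slab set; pages whose flags cannot
+# be read (e.g. holes in the memory map) are silently skipped.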
+def for_each_slab_page(prog):
+    PGSlab = 1 << prog.constant('PG_slab')
+    PGHead = 1 << prog.constant('PG_head')
+
+    for page in for_each_page(prog):
+        try:
+            if page.flags.value_() & PGSlab:
+                yield page
+        except FaultError:
+            pass
+
+
+def main():
+    parser = argparse.ArgumentParser(description=DESC,
+                                     formatter_class=
+                                     argparse.RawTextHelpFormatter)
+    parser.add_argument('cgroup', metavar='CGROUP',
+                        help='Target memory cgroup')
+    args = parser.parse_args()
+
+    try:
+        cgroup_id = stat(args.cgroup).st_ino
+        find_memcg_ids()
+        memcg = MEMCGS[cgroup_id]
+    except (FileNotFoundError, KeyError):
+        err('Can\'t find the memory cgroup')
+
+    cfg = detect_kernel_config()
+
+    print('# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>'
+          ' : tunables <limit> <batchcount> <sharedfactor>'
+          ' : slabdata <active_slabs> <num_slabs> <sharedavail>')
+
+    if cfg['shared_slab_pages']:
+        obj_cgroups = set()
+        stats = {}
+        caches = {}
+
+        # find memcg pointers belonging to the specified cgroup
+        obj_cgroups.add(memcg.objcg.value_())
+        for ptr in list_for_each_entry('struct obj_cgroup',
+                                       memcg.objcg_list.address_of_(),
+                                       'list'):
+            obj_cgroups.add(ptr.value_())
+
+        # iterate over all slab pages belonging to non-root memcgs
+        # and count objects belonging to the given memory cgroup
+        for page in for_each_slab_page(prog):
+            objcg_vec_raw = page.obj_cgroups.value_()
+            if objcg_vec_raw == 0:
+                continue
+            cache = page.slab_cache
+            if not cache:
+                continue
+            addr = cache.value_()
+            caches[addr] = cache
+            # clear the lowest bit to get the true obj_cgroups
+            objcg_vec = Object(prog, page.obj_cgroups.type_,
+                               value=objcg_vec_raw & ~1)
+
+            if addr not in stats:
+                stats[addr] = 0
+
+            for i in range(oo_objects(cache)):
+                if objcg_vec[i].value_() in obj_cgroups:
+                    stats[addr] += 1
+
+        for addr in caches:
+            if stats[addr] > 0:
+                cache_show(caches[addr], cfg, stats[addr])
+
+    else:
+        for s in list_for_each_entry('struct kmem_cache',
+                                     memcg.kmem_caches.address_of_(),
+                                     'memcg_params.kmem_caches_node'):
+            cache_show(s, cfg, None)
+
+
+main()
index 348c6f4..af8d0fe 100644 (file)
@@ -5,7 +5,7 @@
  *
  * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net)
  *
- * http://burtleburtle.net/bob/hash/
+ * https://burtleburtle.net/bob/hash/
  *
  * These are the credits from Bob's sources:
  *
index 06ac7bd..727396d 100644 (file)
@@ -13,7 +13,7 @@
 #include <linux/export.h>
 
 /*
- * red-black trees properties:  http://en.wikipedia.org/wiki/Rbtree
+ * red-black trees properties:  https://en.wikipedia.org/wiki/Rbtree
  *
  *  1) A node is either red or black
  *  2) The root is black
index b77837f..ad7799c 100644 (file)
@@ -379,7 +379,7 @@ enum tep_errno {
         * errno since SUS requires the errno has distinct positive values.
         * See 'Issue 6' in the link below.
         *
-        * http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/errno.h.html
+        * https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/errno.h.html
         */
        __TEP_ERRNO__START                      = -100000,
 
index a12d295..4f04878 100644 (file)
@@ -11,7 +11,7 @@ crosstests.conf - this config shows an example of testing a git repo against
     lots of different architectures. It only does build tests, but makes
     it easy to compile test different archs. You can download the arch
     cross compilers from:
-  http://kernel.org/pub/tools/crosstool/files/bin/x86_64/
+  https://kernel.org/pub/tools/crosstool/files/bin/x86_64/
 
 test.conf - A generic example of a config. This is based on an actual config
      used to perform real testing.
index 6907f32..3b15e85 100644 (file)
@@ -3,7 +3,7 @@
 #
 # In this config, it is expected that the tool chains from:
 #
-#   http://kernel.org/pub/tools/crosstool/files/bin/x86_64/
+#   https://kernel.org/pub/tools/crosstool/files/bin/x86_64/
 #
 # running on a x86_64 system have been downloaded and installed into:
 #
index e03bc15..9018f45 100644 (file)
@@ -32,6 +32,7 @@ TARGETS += lkdtm
 TARGETS += membarrier
 TARGETS += memfd
 TARGETS += memory-hotplug
+TARGETS += mincore
 TARGETS += mount
 TARGETS += mqueue
 TARGETS += net
index aa6de65..84cfcab 100644 (file)
@@ -2,3 +2,4 @@
 test_memcontrol
 test_core
 test_freezer
+test_kmem
\ No newline at end of file
index 967f268..f027d93 100644 (file)
@@ -6,11 +6,13 @@ all:
 TEST_FILES     := with_stress.sh
 TEST_PROGS     := test_stress.sh
 TEST_GEN_PROGS = test_memcontrol
+TEST_GEN_PROGS += test_kmem
 TEST_GEN_PROGS += test_core
 TEST_GEN_PROGS += test_freezer
 
 include ../lib.mk
 
 $(OUTPUT)/test_memcontrol: cgroup_util.c ../clone3/clone3_selftests.h
+$(OUTPUT)/test_kmem: cgroup_util.c ../clone3/clone3_selftests.h
 $(OUTPUT)/test_core: cgroup_util.c ../clone3/clone3_selftests.h
 $(OUTPUT)/test_freezer: cgroup_util.c ../clone3/clone3_selftests.h
index 8a637ca..05853b0 100644 (file)
@@ -106,7 +106,7 @@ int cg_read_strcmp(const char *cgroup, const char *control,
 
        /* Handle the case of comparing against empty string */
        if (!expected)
-               size = 32;
+               return -1;
        else
                size = strlen(expected) + 1;
 
diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c
new file mode 100644 (file)
index 0000000..5224dae
--- /dev/null
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <linux/limits.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <errno.h>
+#include <sys/sysinfo.h>
+#include <pthread.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+
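+/*
+ * Create negative dentries by stat()-ing non-existent files with long,
+ * unique names.  The argument carries the number of lookups to perform.
+ */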
+static int alloc_dcache(const char *cgroup, void *arg)
+{
+       unsigned long i;
+       struct stat st;
+       char buf[128];
+
+       for (i = 0; i < (unsigned long)arg; i++) {
+               snprintf(buf, sizeof(buf),
+                       "/something-non-existent-with-a-long-name-%64lu-%d",
+                        i, getpid());
+               stat(buf, &st);
+       }
+
+       return 0;
+}
+
+/*
+ * This test allocates 100000 negative dentries with long names.
+ * Then it checks that "slab" in memory.stat is larger than 1M.
+ * Then it sets memory.high to 1M and checks that at least 1/2
+ * of slab memory has been reclaimed.
+ */
+static int test_kmem_basic(const char *root)
+{
+       int ret = KSFT_FAIL;
+       char *cg = NULL;
+       long slab0, slab1, current;
+
+       cg = cg_name(root, "kmem_basic_test");
+       if (!cg)
+               goto cleanup;
+
+       if (cg_create(cg))
+               goto cleanup;
+
+       if (cg_run(cg, alloc_dcache, (void *)100000))
+               goto cleanup;
+
+       slab0 = cg_read_key_long(cg, "memory.stat", "slab ");
+       if (slab0 < (1 << 20))
+               goto cleanup;
+
+       cg_write(cg, "memory.high", "1M");
+       slab1 = cg_read_key_long(cg, "memory.stat", "slab ");
+       if (slab1 <= 0)
+               goto cleanup;
+
+       current = cg_read_long(cg, "memory.current");
+       if (current <= 0)
+               goto cleanup;
+
+       if (slab1 < slab0 / 2 && current < slab0 / 2)
+               ret = KSFT_PASS;
+cleanup:
+       cg_destroy(cg);
+       free(cg);
+
+       return ret;
+}
+
+static void *alloc_kmem_fn(void *arg)
+{
+       alloc_dcache(NULL, (void *)100);
+       return NULL;
+}
+
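+/*
+ * Allocate kernel memory (negative dentries) from 2 * nr_cpus threads in
+ * parallel, so that the allocations are spread over multiple CPUs.
+ */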
+static int alloc_kmem_smp(const char *cgroup, void *arg)
+{
+       int nr_threads = 2 * get_nprocs();
+       pthread_t *tinfo;
+       unsigned long i;
+       int ret = -1;
+
+       tinfo = calloc(nr_threads, sizeof(pthread_t));
+       if (tinfo == NULL)
+               return -1;
+
+       for (i = 0; i < nr_threads; i++) {
+               if (pthread_create(&tinfo[i], NULL, &alloc_kmem_fn,
+                                  (void *)i)) {
+                       free(tinfo);
+                       return -1;
+               }
+       }
+
+       for (i = 0; i < nr_threads; i++) {
+               ret = pthread_join(tinfo[i], NULL);
+               if (ret)
+                       break;
+       }
+
+       free(tinfo);
+       return ret;
+}
+
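+/*
+ * Create @times child cgroups below @parent, run @fn in a process attached
+ * to each child and destroy the child afterwards.  Returns 0 on success.
+ */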
+static int cg_run_in_subcgroups(const char *parent,
+                               int (*fn)(const char *cgroup, void *arg),
+                               void *arg, int times)
+{
+       char *child;
+       int i;
+
+       for (i = 0; i < times; i++) {
+               child = cg_name_indexed(parent, "child", i);
+               if (!child)
+                       return -1;
+
+               if (cg_create(child)) {
+                       cg_destroy(child);
+                       free(child);
+                       return -1;
+               }
+
+               if (cg_run(child, fn, arg)) {
+                       cg_destroy(child);
+                       free(child);
+                       return -1;
+               }
+
+               cg_destroy(child);
+               free(child);
+       }
+
+       return 0;
+}
+
+/*
+ * The test creates and destroys a large number of cgroups. In each cgroup it
+ * allocates some slab memory (mostly negative dentries) using 2 * NR_CPUS
+ * threads. Then it checks that the numbers at the parent level are sane:
+ * memory.current should be roughly equal to the sum of
+ * anon + file + slab + kernel_stack.
+ */
+static int test_kmem_memcg_deletion(const char *root)
+{
+       long current, slab, anon, file, kernel_stack, sum;
+       int ret = KSFT_FAIL;
+       char *parent;
+
+       parent = cg_name(root, "kmem_memcg_deletion_test");
+       if (!parent)
+               goto cleanup;
+
+       if (cg_create(parent))
+               goto cleanup;
+
+       if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+               goto cleanup;
+
+       if (cg_run_in_subcgroups(parent, alloc_kmem_smp, NULL, 100))
+               goto cleanup;
+
+       current = cg_read_long(parent, "memory.current");
+       slab = cg_read_key_long(parent, "memory.stat", "slab ");
+       anon = cg_read_key_long(parent, "memory.stat", "anon ");
+       file = cg_read_key_long(parent, "memory.stat", "file ");
+       kernel_stack = cg_read_key_long(parent, "memory.stat", "kernel_stack ");
+       if (current < 0 || slab < 0 || anon < 0 || file < 0 ||
+           kernel_stack < 0)
+               goto cleanup;
+
+       sum = slab + anon + file + kernel_stack;
+       if (labs(sum - current) < 4096 * 32 * 2 * get_nprocs()) {
+               ret = KSFT_PASS;
+       } else {
+               printf("memory.current = %ld\n", current);
+               printf("slab + anon + file + kernel_stack = %ld\n", sum);
+               printf("slab = %ld\n", slab);
+               printf("anon = %ld\n", anon);
+               printf("file = %ld\n", file);
+               printf("kernel_stack = %ld\n", kernel_stack);
+       }
+
+cleanup:
+       cg_destroy(parent);
+       free(parent);
+
+       return ret;
+}
+
+/*
+ * The test reads the entire /proc/kpagecgroup. If the read completes
+ * successfully (and the kernel didn't panic), the test is treated as passed.
+ */
+static int test_kmem_proc_kpagecgroup(const char *root)
+{
+       unsigned long buf[128];
+       int ret = KSFT_FAIL;
+       ssize_t len;
+       int fd;
+
+       fd = open("/proc/kpagecgroup", O_RDONLY);
+       if (fd < 0)
+               return ret;
+
+       do {
+               len = read(fd, buf, sizeof(buf));
+       } while (len > 0);
+
+       if (len == 0)
+               ret = KSFT_PASS;
+
+       close(fd);
+       return ret;
+}
+
+static void *pthread_wait_fn(void *arg)
+{
+       sleep(100);
+       return NULL;
+}
+
+static int spawn_1000_threads(const char *cgroup, void *arg)
+{
+       int nr_threads = 1000;
+       pthread_t *tinfo;
+       unsigned long i;
+       long stack;
+       int ret = -1;
+
+       tinfo = calloc(nr_threads, sizeof(pthread_t));
+       if (tinfo == NULL)
+               return -1;
+
+       for (i = 0; i < nr_threads; i++) {
+               if (pthread_create(&tinfo[i], NULL, &pthread_wait_fn,
+                                  (void *)i)) {
+                       free(tinfo);
+                       return -1;
+               }
+       }
+
+       stack = cg_read_key_long(cgroup, "memory.stat", "kernel_stack ");
+       if (stack >= 4096 * 1000)
+               ret = 0;
+
+       free(tinfo);
+       return ret;
+}
+
+/*
+ * The test spawns a process, which spawns 1000 threads. Then it checks
+ * that memory.stat's kernel_stack is at least 1000 pages large.
+ */
+static int test_kmem_kernel_stacks(const char *root)
+{
+       int ret = KSFT_FAIL;
+       char *cg = NULL;
+
+       cg = cg_name(root, "kmem_kernel_stacks_test");
+       if (!cg)
+               goto cleanup;
+
+       if (cg_create(cg))
+               goto cleanup;
+
+       if (cg_run(cg, spawn_1000_threads, NULL))
+               goto cleanup;
+
+       ret = KSFT_PASS;
+cleanup:
+       cg_destroy(cg);
+       free(cg);
+
+       return ret;
+}
+
+/*
+ * This test sequentially creates 30 child cgroups, allocates some
+ * kernel memory in each of them, and deletes them. Then it checks
+ * that the number of dying cgroups on the parent level is 0.
+ */
+static int test_kmem_dead_cgroups(const char *root)
+{
+       int ret = KSFT_FAIL;
+       char *parent;
+       long dead;
+       int i;
+
+       parent = cg_name(root, "kmem_dead_cgroups_test");
+       if (!parent)
+               goto cleanup;
+
+       if (cg_create(parent))
+               goto cleanup;
+
+       if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+               goto cleanup;
+
+       if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30))
+               goto cleanup;
+
+       for (i = 0; i < 5; i++) {
+               dead = cg_read_key_long(parent, "cgroup.stat",
+                                       "nr_dying_descendants ");
+               if (dead == 0) {
+                       ret = KSFT_PASS;
+                       break;
+               }
+               /*
+                * Reclaiming cgroups might take some time,
+                * let's wait a bit and repeat.
+                */
+               sleep(1);
+       }
+
+cleanup:
+       cg_destroy(parent);
+       free(parent);
+
+       return ret;
+}
+
+#define T(x) { x, #x }
+struct kmem_test {
+       int (*fn)(const char *root);
+       const char *name;
+} tests[] = {
+       T(test_kmem_basic),
+       T(test_kmem_memcg_deletion),
+       T(test_kmem_proc_kpagecgroup),
+       T(test_kmem_kernel_stacks),
+       T(test_kmem_dead_cgroups),
+};
+#undef T
+
+int main(int argc, char **argv)
+{
+       char root[PATH_MAX];
+       int i, ret = EXIT_SUCCESS;
+
+       if (cg_find_unified_root(root, sizeof(root)))
+               ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+       /*
+        * Check that memory controller is available:
+        * memory is listed in cgroup.controllers
+        */
+       if (cg_read_strstr(root, "cgroup.controllers", "memory"))
+               ksft_exit_skip("memory controller isn't available\n");
+
+       if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
+               if (cg_write(root, "cgroup.subtree_control", "+memory"))
+                       ksft_exit_skip("Failed to set memory controller\n");
+
+       for (i = 0; i < ARRAY_SIZE(tests); i++) {
+               switch (tests[i].fn(root)) {
+               case KSFT_PASS:
+                       ksft_test_result_pass("%s\n", tests[i].name);
+                       break;
+               case KSFT_SKIP:
+                       ksft_test_result_skip("%s\n", tests[i].name);
+                       break;
+               default:
+                       ret = EXIT_FAILURE;
+                       ksft_test_result_fail("%s\n", tests[i].name);
+                       break;
+               }
+       }
+
+       return ret;
+}
diff --git a/tools/testing/selftests/mincore/.gitignore b/tools/testing/selftests/mincore/.gitignore
new file mode 100644 (file)
index 0000000..15c4dfc
--- /dev/null
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0+
+mincore_selftest
diff --git a/tools/testing/selftests/mincore/Makefile b/tools/testing/selftests/mincore/Makefile
new file mode 100644 (file)
index 0000000..38c7db1
--- /dev/null
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0+
+
+CFLAGS += -Wall
+
+TEST_GEN_PROGS := mincore_selftest
+include ../lib.mk
diff --git a/tools/testing/selftests/mincore/mincore_selftest.c b/tools/testing/selftests/mincore/mincore_selftest.c
new file mode 100644 (file)
index 0000000..5a1e85f
--- /dev/null
@@ -0,0 +1,361 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * kselftest suite for mincore().
+ *
+ * Copyright (C) 2020 Collabora, Ltd.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include "../kselftest.h"
+#include "../kselftest_harness.h"
+
+/* Default test file size: 4MB */
+#define MB (1UL << 20)
+#define FILE_SIZE (4 * MB)
+
+
+/*
+ * Tests the user interface. This test triggers most of the documented
+ * error conditions in mincore().
+ */
+TEST(basic_interface)
+{
+       int retval;
+       int page_size;
+       unsigned char vec[1];
+       char *addr;
+
+       page_size = sysconf(_SC_PAGESIZE);
+
+       /* Query a 0 byte sized range */
+       retval = mincore(0, 0, vec);
+       EXPECT_EQ(0, retval);
+
+       /* Addresses in the specified range are invalid or unmapped */
+       errno = 0;
+       retval = mincore(NULL, page_size, vec);
+       EXPECT_EQ(-1, retval);
+       EXPECT_EQ(ENOMEM, errno);
+
+       errno = 0;
+       addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+               MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+       ASSERT_NE(MAP_FAILED, addr) {
+               TH_LOG("mmap error: %s", strerror(errno));
+       }
+
+       /* <addr> argument is not page-aligned */
+       errno = 0;
+       retval = mincore(addr + 1, page_size, vec);
+       EXPECT_EQ(-1, retval);
+       EXPECT_EQ(EINVAL, errno);
+
+       /* <length> argument is too large */
+       errno = 0;
+       retval = mincore(addr, -1, vec);
+       EXPECT_EQ(-1, retval);
+       EXPECT_EQ(ENOMEM, errno);
+
+       /* <vec> argument points to an illegal address */
+       errno = 0;
+       retval = mincore(addr, page_size, NULL);
+       EXPECT_EQ(-1, retval);
+       EXPECT_EQ(EFAULT, errno);
+       munmap(addr, page_size);
+}
+
+
+/*
+ * Test mincore() behavior on a private anonymous page mapping.
+ * Check that the page is not loaded into memory right after the mapping,
+ * but only after it is accessed (on-demand allocation).
+ * Then free the page and check that it's no longer memory-resident.
+ */
+TEST(check_anonymous_locked_pages)
+{
+       unsigned char vec[1];
+       char *addr;
+       int retval;
+       int page_size;
+
+       page_size = sysconf(_SC_PAGESIZE);
+
+       /* Map one page and check it's not memory-resident */
+       errno = 0;
+       addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       ASSERT_NE(MAP_FAILED, addr) {
+               TH_LOG("mmap error: %s", strerror(errno));
+       }
+       retval = mincore(addr, page_size, vec);
+       ASSERT_EQ(0, retval);
+       ASSERT_EQ(0, vec[0]) {
+               TH_LOG("Page found in memory before use");
+       }
+
+       /* Touch the page and check again. It should now be in memory */
+       addr[0] = 1;
+       mlock(addr, page_size);
+       retval = mincore(addr, page_size, vec);
+       ASSERT_EQ(0, retval);
+       ASSERT_EQ(1, vec[0]) {
+               TH_LOG("Page not found in memory after use");
+       }
+
+       /*
+        * It shouldn't be memory-resident after unlocking it and
+        * marking it as unneeded.
+        */
+       munlock(addr, page_size);
+       madvise(addr, page_size, MADV_DONTNEED);
+       retval = mincore(addr, page_size, vec);
+       ASSERT_EQ(0, retval);
+       ASSERT_EQ(0, vec[0]) {
+               TH_LOG("Page in memory after being zapped");
+       }
+       munmap(addr, page_size);
+}
+
+
+/*
+ * Check mincore() behavior on huge pages.
+ * This test will be skipped if the mapping fails (i.e. if there are no
+ * huge pages available).
+ *
+ * Make sure the system has at least one free huge page, check
+ * "HugePages_Free" in /proc/meminfo.
+ * Increment /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages if
+ * needed.
+ */
+TEST(check_huge_pages)
+{
+       unsigned char vec[1];
+       char *addr;
+       int retval;
+       int page_size;
+
+       page_size = sysconf(_SC_PAGESIZE);
+
+       errno = 0;
+       addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+               MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+               -1, 0);
+       if (addr == MAP_FAILED && errno == ENOMEM)
+               SKIP(return, "No huge pages available.");
+       ASSERT_NE(MAP_FAILED, addr) {
+               TH_LOG("mmap error: %s", strerror(errno));
+       }
+       retval = mincore(addr, page_size, vec);
+       ASSERT_EQ(0, retval);
+       ASSERT_EQ(0, vec[0]) {
+               TH_LOG("Page found in memory before use");
+       }
+
+       addr[0] = 1;
+       mlock(addr, page_size);
+       retval = mincore(addr, page_size, vec);
+       ASSERT_EQ(0, retval);
+       ASSERT_EQ(1, vec[0]) {
+               TH_LOG("Page not found in memory after use");
+       }
+
+       munlock(addr, page_size);
+       munmap(addr, page_size);
+}
+
+
+/*
+ * Test mincore() behavior on a file-backed page.
+ * No pages should be loaded into memory right after the mapping. Then,
+ * accessing any address in the mapping range should load the page
+ * containing the address and a number of subsequent pages (readahead).
+ *
+ * The actual readahead settings depend on the test environment, so we
+ * can't make a lot of assumptions about that. This test covers the most
+ * general cases.
+ */
+TEST(check_file_mmap)
+{
+       unsigned char *vec;
+       int vec_size;
+       char *addr;
+       int retval;
+       int page_size;
+       int fd;
+       int i;
+       int ra_pages = 0;
+
+       page_size = sysconf(_SC_PAGESIZE);
+       vec_size = FILE_SIZE / page_size;
+       if (FILE_SIZE % page_size)
+               vec_size++;
+
+       vec = calloc(vec_size, sizeof(unsigned char));
+       ASSERT_NE(NULL, vec) {
+               TH_LOG("Can't allocate array");
+       }
+
+       errno = 0;
+       fd = open(".", O_TMPFILE | O_RDWR, 0600);
+       ASSERT_NE(-1, fd) {
+               TH_LOG("Can't create temporary file: %s",
+                       strerror(errno));
+       }
+       errno = 0;
+       retval = fallocate(fd, 0, 0, FILE_SIZE);
+       ASSERT_EQ(0, retval) {
+               TH_LOG("Error allocating space for the temporary file: %s",
+                       strerror(errno));
+       }
+
+       /*
+        * Map the whole file, the pages shouldn't be fetched yet.
+        */
+       errno = 0;
+       addr = mmap(NULL, FILE_SIZE, PROT_READ | PROT_WRITE,
+                       MAP_SHARED, fd, 0);
+       ASSERT_NE(MAP_FAILED, addr) {
+               TH_LOG("mmap error: %s", strerror(errno));
+       }
+       retval = mincore(addr, FILE_SIZE, vec);
+       ASSERT_EQ(0, retval);
+       for (i = 0; i < vec_size; i++) {
+               ASSERT_EQ(0, vec[i]) {
+                       TH_LOG("Unexpected page in memory");
+               }
+       }
+
+       /*
+        * Touch a page in the middle of the mapping. We expect the next
+        * few pages (the readahead window) to be populated too.
+        */
+       addr[FILE_SIZE / 2] = 1;
+       retval = mincore(addr, FILE_SIZE, vec);
+       ASSERT_EQ(0, retval);
+       ASSERT_EQ(1, vec[FILE_SIZE / 2 / page_size]) {
+               TH_LOG("Page not found in memory after use");
+       }
+
+       i = FILE_SIZE / 2 / page_size + 1;
+       while (i < vec_size && vec[i]) {
+               ra_pages++;
+               i++;
+       }
+       EXPECT_GT(ra_pages, 0) {
+               TH_LOG("No read-ahead pages found in memory");
+       }
+
+       EXPECT_LT(i, vec_size) {
+               TH_LOG("Read-ahead pages reached the end of the file");
+       }
+       /*
+        * End of the readahead window. The rest of the pages shouldn't
+        * be in memory.
+        */
+       if (i < vec_size) {
+               while (i < vec_size && !vec[i])
+                       i++;
+               EXPECT_EQ(vec_size, i) {
+                       TH_LOG("Unexpected page in memory beyond readahead window");
+               }
+       }
+
+       munmap(addr, FILE_SIZE);
+       close(fd);
+       free(vec);
+}
+
+
+/*
+ * Test mincore() behavior on a page backed by a tmpfs file.  This test
+ * performs the same steps as the previous one. However, we don't expect
+ * any readahead in this case.
+ */
+TEST(check_tmpfs_mmap)
+{
+       unsigned char *vec;
+       int vec_size;
+       char *addr;
+       int retval;
+       int page_size;
+       int fd;
+       int i;
+       int ra_pages = 0;
+
+       page_size = sysconf(_SC_PAGESIZE);
+       vec_size = FILE_SIZE / page_size;
+       if (FILE_SIZE % page_size)
+               vec_size++;
+
+       vec = calloc(vec_size, sizeof(unsigned char));
+       ASSERT_NE(NULL, vec) {
+               TH_LOG("Can't allocate array");
+       }
+
+       errno = 0;
+       fd = open("/dev/shm", O_TMPFILE | O_RDWR, 0600);
+       ASSERT_NE(-1, fd) {
+               TH_LOG("Can't create temporary file: %s",
+                       strerror(errno));
+       }
+       errno = 0;
+       retval = fallocate(fd, 0, 0, FILE_SIZE);
+       ASSERT_EQ(0, retval) {
+               TH_LOG("Error allocating space for the temporary file: %s",
+                       strerror(errno));
+       }
+
+       /*
+        * Map the whole file, the pages shouldn't be fetched yet.
+        */
+       errno = 0;
+       addr = mmap(NULL, FILE_SIZE, PROT_READ | PROT_WRITE,
+                       MAP_SHARED, fd, 0);
+       ASSERT_NE(MAP_FAILED, addr) {
+               TH_LOG("mmap error: %s", strerror(errno));
+       }
+       retval = mincore(addr, FILE_SIZE, vec);
+       ASSERT_EQ(0, retval);
+       for (i = 0; i < vec_size; i++) {
+               ASSERT_EQ(0, vec[i]) {
+                       TH_LOG("Unexpected page in memory");
+               }
+       }
+
+       /*
+        * Touch a page in the middle of the mapping. We expect only
+        * that page to be fetched into memory.
+        */
+       addr[FILE_SIZE / 2] = 1;
+       retval = mincore(addr, FILE_SIZE, vec);
+       ASSERT_EQ(0, retval);
+       ASSERT_EQ(1, vec[FILE_SIZE / 2 / page_size]) {
+               TH_LOG("Page not found in memory after use");
+       }
+
+       i = FILE_SIZE / 2 / page_size + 1;
+       while (i < vec_size && vec[i]) {
+               ra_pages++;
+               i++;
+       }
+       ASSERT_EQ(ra_pages, 0) {
+               TH_LOG("Read-ahead pages found in memory");
+       }
+
+       munmap(addr, FILE_SIZE);
+       close(fd);
+       free(vec);
+}
+
+TEST_HARNESS_MAIN