Merge branch 'xarray' of git://git.infradead.org/users/willy/linux-dax
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 28 Oct 2018 18:35:40 +0000 (11:35 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 28 Oct 2018 18:35:40 +0000 (11:35 -0700)
Pull XArray conversion from Matthew Wilcox:
 "The XArray provides an improved interface to the radix tree data
  structure, providing locking as part of the API, specifying GFP flags
  at allocation time, eliminating preloading, less re-walking the tree,
  more efficient iterations and not exposing RCU-protected pointers to
  its users.

  This patch set:

   1. Introduces the XArray implementation

   2. Converts the pagecache to use it

   3. Converts memremap to use it

  The page cache is the most complex and important user of the radix
  tree, so converting it was most important. Converting the memremap
  code removes the only other user of the multiorder code, which allows
  us to remove the radix tree code that supported it.

  I have 40+ followup patches to convert many other users of the radix
  tree over to the XArray, but I'd like to get this part in first. The
  other conversions haven't been in linux-next and aren't suitable for
  applying yet, but you can see them in the xarray-conv branch if you're
  interested"

* 'xarray' of git://git.infradead.org/users/willy/linux-dax: (90 commits)
  radix tree: Remove multiorder support
  radix tree test: Convert multiorder tests to XArray
  radix tree tests: Convert item_delete_rcu to XArray
  radix tree tests: Convert item_kill_tree to XArray
  radix tree tests: Move item_insert_order
  radix tree test suite: Remove multiorder benchmarking
  radix tree test suite: Remove __item_insert
  memremap: Convert to XArray
  xarray: Add range store functionality
  xarray: Move multiorder_check to in-kernel tests
  xarray: Move multiorder_shrink to kernel tests
  xarray: Move multiorder account test in-kernel
  radix tree test suite: Convert iteration test to XArray
  radix tree test suite: Convert tag_tagged_items to XArray
  radix tree: Remove radix_tree_clear_tags
  radix tree: Remove radix_tree_maybe_preload_order
  radix tree: Remove split/join code
  radix tree: Remove radix_tree_update_node_t
  page cache: Finish XArray conversion
  dax: Convert page fault handlers to XArray
  ...

33 files changed:
Documentation/core-api/index.rst
MAINTAINERS
arch/parisc/kernel/syscall.S
arch/powerpc/include/asm/book3s/64/pgtable.h
arch/powerpc/include/asm/nohash/64/pgtable.h
drivers/pci/hotplug/acpiphp.h
drivers/pci/hotplug/acpiphp_core.c
fs/btrfs/compression.c
fs/btrfs/extent_io.c
fs/buffer.c
fs/ext4/inode.c
fs/f2fs/data.c
fs/f2fs/dir.c
fs/f2fs/f2fs.h
fs/f2fs/inline.c
fs/f2fs/node.c
fs/proc/task_mmu.c
include/linux/fs.h
include/linux/swap.h
kernel/memremap.c
lib/Kconfig
lib/Kconfig.debug
lib/Makefile
mm/filemap.c
mm/huge_memory.c
mm/madvise.c
mm/memcontrol.c
mm/migrate.c
mm/page-writeback.c
mm/swap.c
mm/swap_state.c
mm/vmscan.c
mm/workingset.c

Simple merge
diff --cc MAINTAINERS
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
diff --cc fs/buffer.c
Simple merge
diff --cc fs/ext4/inode.c
Simple merge
diff --cc fs/f2fs/data.c
@@@ -2069,10 -2000,10 +2069,10 @@@ static int f2fs_write_cache_pages(struc
        pgoff_t index;
        pgoff_t end;            /* Inclusive */
        pgoff_t done_index;
 -      pgoff_t last_idx = ULONG_MAX;
        int cycled;
        int range_whole = 0;
-       int tag;
+       xa_mark_t tag;
 +      int nwritten = 0;
  
        pagevec_init(&pvec);
  
diff --cc fs/f2fs/dir.c
Simple merge
diff --cc fs/f2fs/f2fs.h
Simple merge
Simple merge
diff --cc fs/f2fs/node.c
Simple merge
Simple merge
Simple merge
diff --cc include/linux/swap.h
@@@ -297,20 -296,15 +297,15 @@@ struct vma_swap_readahead 
  
  /* linux/mm/workingset.c */
  void *workingset_eviction(struct address_space *mapping, struct page *page);
 -bool workingset_refault(void *shadow);
 +void workingset_refault(struct page *page, void *shadow);
  void workingset_activation(struct page *page);
  
- /* Do not use directly, use workingset_lookup_update */
- void workingset_update_node(struct radix_tree_node *node);
- /* Returns workingset_update_node() if the mapping has shadow entries. */
- #define workingset_lookup_update(mapping)                             \
- ({                                                                    \
-       radix_tree_update_node_t __helper = workingset_update_node;     \
-       if (dax_mapping(mapping) || shmem_mapping(mapping))             \
-               __helper = NULL;                                        \
-       __helper;                                                       \
- })
+ /* Only track the nodes of mappings with shadow entries */
+ void workingset_update_node(struct xa_node *node);
+ #define mapping_set_update(xas, mapping) do {                         \
+       if (!dax_mapping(mapping) && !shmem_mapping(mapping))           \
+               xas_set_update(xas, workingset_update_node);            \
+ } while (0)
  
  /* linux/mm/page_alloc.c */
  extern unsigned long totalram_pages;
diff --cc kernel/memremap.c
@@@ -175,10 -141,10 +141,9 @@@ void *devm_memremap_pages(struct devic
        struct vmem_altmap *altmap = pgmap->altmap_valid ?
                        &pgmap->altmap : NULL;
        struct resource *res = &pgmap->res;
 -      unsigned long pfn;
 +      struct dev_pagemap *conflict_pgmap;
        pgprot_t pgprot = PAGE_KERNEL;
-       unsigned long pgoff, order;
        int error, nid, is_ram;
 -      struct dev_pagemap *conflict_pgmap;
  
        align_start = res->start & ~(SECTION_SIZE - 1);
        align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
diff --cc lib/Kconfig
Simple merge
Simple merge
diff --cc lib/Makefile
Simple merge
diff --cc mm/filemap.c
Simple merge
Simple merge
diff --cc mm/madvise.c
Simple merge
diff --cc mm/memcontrol.c
Simple merge
diff --cc mm/migrate.c
Simple merge
diff --cc mm/page-writeback.c
@@@ -2169,8 -2153,9 +2160,8 @@@ int write_cache_pages(struct address_sp
        pgoff_t index;
        pgoff_t end;            /* Inclusive */
        pgoff_t done_index;
 -      int cycled;
        int range_whole = 0;
-       int tag;
+       xa_mark_t tag;
  
        pagevec_init(&pvec);
        if (wbc->range_cyclic) {
diff --cc mm/swap.c
Simple merge
diff --cc mm/swap_state.c
@@@ -433,22 -401,15 +401,16 @@@ struct page *__read_swap_cache_async(sw
                         */
                        cond_resched();
                        continue;
-               }
-               if (err) {              /* swp entry is obsolete ? */
-                       radix_tree_preload_end();
+               } else if (err)         /* swp entry is obsolete ? */
                        break;
-               }
  
-               /* May fail (-ENOMEM) if radix-tree node allocation failed. */
+               /* May fail (-ENOMEM) if XArray node allocation failed. */
                __SetPageLocked(new_page);
                __SetPageSwapBacked(new_page);
-               err = __add_to_swap_cache(new_page, entry);
+               err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
                if (likely(!err)) {
-                       radix_tree_preload_end();
-                       /*
-                        * Initiate read into locked page and return.
-                        */
+                       /* Initiate read into locked page */
 +                      SetPageWorkingset(new_page);
                        lru_cache_add_anon(new_page);
                        *new_page_allocated = true;
                        return new_page;
diff --cc mm/vmscan.c
Simple merge
diff --cc mm/workingset.c
   * refault distance will immediately activate the refaulting page.
   */
  
- #define EVICTION_SHIFT        (RADIX_TREE_EXCEPTIONAL_ENTRY + \
+ #define EVICTION_SHIFT        ((BITS_PER_LONG - BITS_PER_XA_VALUE) +  \
 -                       NODES_SHIFT +                          \
 -                       MEM_CGROUP_ID_SHIFT)
 +                       1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
  #define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
  
  /*
   */
  static unsigned int bucket_order __read_mostly;
  
 -static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
 +static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
 +                       bool workingset)
  {
        eviction >>= bucket_order;
+       eviction &= EVICTION_MASK;
        eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
        eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
-       eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
 +      eviction = (eviction << 1) | workingset;
  
-       return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
+       return xa_mk_value(eviction);
  }
  
  static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
 -                        unsigned long *evictionp)
 +                        unsigned long *evictionp, bool *workingsetp)
  {
-       unsigned long entry = (unsigned long)shadow;
+       unsigned long entry = xa_to_value(shadow);
        int memcgid, nid;
 +      bool workingset;
  
-       entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
 +      workingset = entry & 1;
 +      entry >>= 1;
        nid = entry & ((1UL << NODES_SHIFT) - 1);
        entry >>= NODES_SHIFT;
        memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
@@@ -377,20 -349,12 +376,20 @@@ void workingset_update_node(struct xa_n
         * already where they should be. The list_empty() test is safe
         * as node->private_list is protected by the i_pages lock.
         */
-       if (node->count && node->count == node->exceptional) {
 +      VM_WARN_ON_ONCE(!irqs_disabled());  /* For __inc_lruvec_page_state */
 +
 -              if (list_empty(&node->private_list))
+       if (node->count && node->count == node->nr_values) {
 +              if (list_empty(&node->private_list)) {
                        list_lru_add(&shadow_nodes, &node->private_list);
 +                      __inc_lruvec_page_state(virt_to_page(node),
 +                                              WORKINGSET_NODES);
 +              }
        } else {
 -              if (!list_empty(&node->private_list))
 +              if (!list_empty(&node->private_list)) {
                        list_lru_del(&shadow_nodes, &node->private_list);
 +                      __dec_lruvec_page_state(virt_to_page(node),
 +                                              WORKINGSET_NODES);
 +              }
        }
  }
  
@@@ -423,22 -387,16 +422,22 @@@ static unsigned long count_shadow_nodes
         * each, this will reclaim shadow entries when they consume
         * ~1.8% of available memory:
         *
-        * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE
+        * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
         */
 +#ifdef CONFIG_MEMCG
        if (sc->memcg) {
 -              cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
 -                                                   LRU_ALL_FILE);
 -      } else {
 -              cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
 -                      node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
 -      }
 -      max_nodes = cache >> (XA_CHUNK_SHIFT - 3);
 +              struct lruvec *lruvec;
 +
 +              pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
 +                                                   LRU_ALL);
 +              lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg);
 +              pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
 +              pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE);
 +      } else
 +#endif
 +              pages = node_present_pages(sc->nid);
 +
-       max_nodes = pages >> (RADIX_TREE_MAP_SHIFT - 3);
++      max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
  
        if (!nodes)
                return SHRINK_EMPTY;
@@@ -490,29 -445,21 +488,21 @@@ static enum lru_status shadow_lru_isola
         * no pages, so we expect to be able to remove them all and
         * delete and free the empty node afterwards.
         */
-       if (WARN_ON_ONCE(!node->exceptional))
+       if (WARN_ON_ONCE(!node->nr_values))
                goto out_invalid;
-       if (WARN_ON_ONCE(node->count != node->exceptional))
-               goto out_invalid;
-       for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
-               if (node->slots[i]) {
-                       if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i])))
-                               goto out_invalid;
-                       if (WARN_ON_ONCE(!node->exceptional))
-                               goto out_invalid;
-                       if (WARN_ON_ONCE(!mapping->nrexceptional))
-                               goto out_invalid;
-                       node->slots[i] = NULL;
-                       node->exceptional--;
-                       node->count--;
-                       mapping->nrexceptional--;
-               }
-       }
-       if (WARN_ON_ONCE(node->exceptional))
+       if (WARN_ON_ONCE(node->count != node->nr_values))
                goto out_invalid;
 -      inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
+       mapping->nrexceptional -= node->nr_values;
+       xas.xa_node = xa_parent_locked(&mapping->i_pages, node);
+       xas.xa_offset = node->offset;
+       xas.xa_shift = node->shift + XA_CHUNK_SHIFT;
+       xas_set_update(&xas, workingset_update_node);
+       /*
+        * We could store a shadow entry here which was the minimum of the
+        * shadow entries we were tracking ...
+        */
+       xas_store(&xas, NULL);
-       __radix_tree_delete_node(&mapping->i_pages, node,
-                                workingset_lookup_update(mapping));
 +      __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
  
  out_invalid:
        xa_unlock_irq(&mapping->i_pages);