sched/numa: Adjust scan rate in task_numa_placement

author Rik van Riel <riel@redhat.com>

Mon, 7 Oct 2013 10:29:36 +0000 (11:29 +0100)

committer Ingo Molnar <mingo@kernel.org>

Wed, 9 Oct 2013 12:48:16 +0000 (14:48 +0200)
author Rik van Riel <riel@redhat.com>
Mon, 7 Oct 2013 10:29:36 +0000 (11:29 +0100)
committer Ingo Molnar <mingo@kernel.org>
Wed, 9 Oct 2013 12:48:16 +0000 (14:48 +0200)
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 59f953b..2292f6c 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1365,6 +1365,14 @@ struct task_struct {
          */
         unsigned long *numa_faults_buffer;
  
+       /*
+        * numa_faults_locality tracks if faults recorded during the last
+        * scan window were remote/local. The task scan period is adapted
+        * based on the locality of the faults with different weights
+        * depending on whether they were shared or private faults
+        */
+       unsigned long numa_faults_locality[2];
+
         int numa_preferred_nid;
         unsigned long numa_pages_migrated;
  #endif /* CONFIG_NUMA_BALANCING */
@@ -1455,6 +1463,7 @@ struct task_struct {
  #define TNF_MIGRATED   0x01
  #define TNF_NO_GROUP   0x02
  #define TNF_SHARED     0x04
+#define TNF_FAULT_LOCAL        0x08
  
  #ifdef CONFIG_NUMA_BALANCING
  extern void task_numa_fault(int last_node, int node, int pages, int flags);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index d26a16e..66237ff 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1241,6 +1241,12 @@ static int task_numa_migrate(struct task_struct *p)
  
         sched_setnuma(p, env.dst_nid);
  
+       /*
+        * Reset the scan period if the task is being rescheduled on an
+        * alternative node to recheck if the task is now properly placed.
+        */
+       p->numa_scan_period = task_scan_min(p);
+
         if (env.best_task == NULL) {
                 int ret = migrate_task_to(p, env.best_cpu);
                 return ret;
@@ -1276,10 +1282,86 @@ static void numa_migrate_preferred(struct task_struct *p)
                 p->numa_migrate_retry = jiffies + HZ*5;
  }
  
+/*
+ * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
+ * increments. The more local the fault statistics are, the higher the scan
+ * period will be for the next scan window. If the local/(local+remote) ratio
+ * is below NUMA_PERIOD_THRESHOLD (range of ratio is 0..NUMA_PERIOD_SLOTS) the
+ * scan period will decrease.
+ */
+#define NUMA_PERIOD_SLOTS 10
+#define NUMA_PERIOD_THRESHOLD 3
+
+/*
+ * Increase the scan period (slow down scanning) if the majority of
+ * our memory is already on our local node, or if the majority of
+ * the page accesses are shared with other processes.
+ * Otherwise, decrease the scan period.
+ */
+static void update_task_scan_period(struct task_struct *p,
+                       unsigned long shared, unsigned long private)
+{
+       unsigned int period_slot;
+       int ratio;
+       int diff;
+
+       unsigned long remote = p->numa_faults_locality[0];
+       unsigned long local = p->numa_faults_locality[1];
+
+       /*
+        * If there were no recorded hinting faults then either the task is
+        * completely idle or all activity is in areas that are not of interest
+        * to automatic NUMA balancing. Scan slower.
+        */
+       if (local + shared == 0) {
+               p->numa_scan_period = min(p->numa_scan_period_max,
+                       p->numa_scan_period << 1);
+
+               p->mm->numa_next_scan = jiffies +
+                       msecs_to_jiffies(p->numa_scan_period);
+
+               return;
+       }
+
+       /*
+        * Prepare to scale scan period relative to the current period.
+        *       ratio is the local share of faults, 0..NUMA_PERIOD_SLOTS:
+        *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
+        *       >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
+        */
+       period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
+       ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+       if (ratio >= NUMA_PERIOD_THRESHOLD) {
+               int slot = ratio - NUMA_PERIOD_THRESHOLD;
+               if (!slot)
+                       slot = 1;
+               diff = slot * period_slot;
+       } else {
+               diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
+
+               /*
+                * Scale scan rate increases based on sharing. There is an
+                * inverse relationship between the degree of sharing and
+                * the adjustment made to the scanning period. Broadly
+                * speaking the intent is that there is little point
+                * scanning faster if shared accesses dominate as it may
+                * simply bounce migrations uselessly
+                */
+               period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
+               ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+               diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+       }
+
+       p->numa_scan_period = clamp(p->numa_scan_period + diff,
+                       task_scan_min(p), task_scan_max(p));
+       memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+}
+
  static void task_numa_placement(struct task_struct *p)
  {
         int seq, nid, max_nid = -1, max_group_nid = -1;
         unsigned long max_faults = 0, max_group_faults = 0;
+       unsigned long fault_types[2] = { 0, 0 };
         spinlock_t *group_lock = NULL;
  
         seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1309,6 +1391,7 @@ static void task_numa_placement(struct task_struct *p)
                         /* Decay existing window, copy faults since last scan */
                         p->numa_faults[i] >>= 1;
                         p->numa_faults[i] += p->numa_faults_buffer[i];
+                       fault_types[priv] += p->numa_faults_buffer[i];
                         p->numa_faults_buffer[i] = 0;
  
                         faults += p->numa_faults[i];
@@ -1333,6 +1416,8 @@ static void task_numa_placement(struct task_struct *p)
                 }
         }
  
+       update_task_scan_period(p, fault_types[0], fault_types[1]);
+
         if (p->numa_group) {
                 /*
                  * If the preferred task and group nids are different,
@@ -1538,6 +1623,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
                 BUG_ON(p->numa_faults_buffer);
                 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
                 p->total_numa_faults = 0;
+               memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
         }
  
         /*
@@ -1552,19 +1638,6 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
                         task_numa_group(p, last_cpupid, flags, &priv);
         }
  
-       /*
-        * If pages are properly placed (did not migrate) then scan slower.
-        * This is reset periodically in case of phase changes
-        */
-       if (!migrated) {
-               /* Initialise if necessary */
-               if (!p->numa_scan_period_max)
-                       p->numa_scan_period_max = task_scan_max(p);
-
-               p->numa_scan_period = min(p->numa_scan_period_max,
-                       p->numa_scan_period + 10);
-       }
-
         task_numa_placement(p);
  
         /* Retry task to preferred node migration if it previously failed */
@@ -1575,6 +1648,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
                 p->numa_pages_migrated += pages;
  
         p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
+       p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
  }
  
  static void reset_ptenuma_scan(struct task_struct *p)
@@ -1702,18 +1776,6 @@ void task_numa_work(struct callback_head *work)
  
  out:
         /*
-        * If the whole process was scanned without updates then no NUMA
-        * hinting faults are being recorded and scan rate should be lower.
-        */
-       if (mm->numa_scan_offset == 0 && !nr_pte_updates) {
-               p->numa_scan_period = min(p->numa_scan_period_max,
-                       p->numa_scan_period << 1);
-
-               next_scan = now + msecs_to_jiffies(p->numa_scan_period);
-               mm->numa_next_scan = next_scan;
-       }
-
-       /*
          * It is possible to reach the end of the VMA list but the last few
          * VMAs are not guaranteed to the vma_migratable. If they are not, we
          * would find the !migratable VMA on the next scan but not reset the
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index 7ab4e32..1be2a1f 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1296,8 +1296,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
         page_nid = page_to_nid(page);
         last_cpupid = page_cpupid_last(page);
         count_vm_numa_event(NUMA_HINT_FAULTS);
-       if (page_nid == this_nid)
+       if (page_nid == this_nid) {
                 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+               flags |= TNF_FAULT_LOCAL;
+       }
  
         /*
          * Avoid grouping on DSO/COW pages in specific and RO pages
diff --git a/mm/memory.c b/mm/memory.c

index 823720c..1c7501f 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3527,13 +3527,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  }
  
  int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
-                               unsigned long addr, int page_nid)
+                               unsigned long addr, int page_nid,
+                               int *flags)
  {
         get_page(page);
  
         count_vm_numa_event(NUMA_HINT_FAULTS);
-       if (page_nid == numa_node_id())
+       if (page_nid == numa_node_id()) {
                 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+               *flags |= TNF_FAULT_LOCAL;
+       }
  
         return mpol_misplaced(page, vma, addr);
  }
@@ -3593,7 +3596,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
  
         last_cpupid = page_cpupid_last(page);
         page_nid = page_to_nid(page);
-       target_nid = numa_migrate_prep(page, vma, addr, page_nid);
+       target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
         pte_unmap_unlock(ptep, ptl);
         if (target_nid == -1) {
                 put_page(page);
author	Rik van Riel <riel@redhat.com>
	Mon, 7 Oct 2013 10:29:36 +0000 (11:29 +0100)
committer	Ingo Molnar <mingo@kernel.org>
	Wed, 9 Oct 2013 12:48:16 +0000 (14:48 +0200)
include/linux/sched.h		patch \| blob \| history
kernel/sched/fair.c		patch \| blob \| history
mm/huge_memory.c		patch \| blob \| history
mm/memory.c		patch \| blob \| history