place the hot pages in the fast memory. This is implemented based on
unmapping and page fault too.
+numa_balancing_promote_rate_limit_MBps
+======================================
+
+Too high promotion/demotion throughput between different memory types
+may hurt application latency. This can be used to rate limit the
+promotion throughput. The per-node max promotion throughput in MB/s
+will be limited to be no more than the set value.
+
+A rule of thumb is to set this to less than 1/10 of the PMEM node
+write bandwidth.
+
oops_all_cpu_backtrace
======================
#endif
#ifdef CONFIG_NUMA_BALANCING
PGPROMOTE_SUCCESS, /* promote successfully */
+ PGPROMOTE_CANDIDATE, /* candidate pages to promote */
#endif
NR_VM_NODE_STAT_ITEMS
};
struct deferred_split deferred_split_queue;
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ /* start time in ms of current promote rate limit period */
+ unsigned int nbp_rl_start;
+ /* number of promote candidate pages at start time of current rate limit period */
+ unsigned long nbp_rl_nr_cand;
+#endif
/* Fields commonly accessed by the page reclaim scanner */
/*
/* The page with hint page fault latency < threshold in ms is considered hot */
unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+
struct numa_group {
refcount_t refcount;
return (time - last_time) & PAGE_ACCESS_TIME_MASK;
}
+/*
+ * For memory tiering mode, too high promotion/demotion throughput may
+ * hurt application latency. So we provide a mechanism to rate limit
+ * the number of pages that are tried to be promoted.
+ */
+static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
+ unsigned long rate_limit, int nr)
+{
+ unsigned long nr_cand;
+ unsigned int now, start;
+
+ now = jiffies_to_msecs(jiffies);
+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ start = pgdat->nbp_rl_start;
+ if (now - start > MSEC_PER_SEC &&
+ cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
+ pgdat->nbp_rl_nr_cand = nr_cand;
+ if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
+ return true;
+ return false;
+}
+
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int src_nid, int dst_cpu)
{
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
!node_is_toptier(src_nid)) {
struct pglist_data *pgdat;
- unsigned long latency, th;
+ unsigned long rate_limit, latency, th;
pgdat = NODE_DATA(dst_nid);
if (pgdat_free_space_enough(pgdat))
if (latency >= th)
return false;
- return true;
+ rate_limit = sysctl_numa_balancing_promote_rate_limit << \
+ (20 - PAGE_SHIFT);
+ return !numa_promotion_rate_limit(pgdat, rate_limit,
+ thp_nr_pages(page));
}
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);