1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/slab.h>
3 #include <linux/lockdep.h>
4 #include <linux/sysfs.h>
5 #include <linux/kobject.h>
6 #include <linux/memory.h>
7 #include <linux/memory-tiers.h>
12 /* hierarchy of memory tiers */
13 struct list_head list;
14 /* list of all memory types part of this tier */
15 struct list_head memory_types;
17 * start value of abstract distance. memory tier maps
18 * an abstract distance range,
19 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
23 /* All the nodes that are part of all the lower memory tiers. */
24 nodemask_t lower_tier_mask;
27 struct demotion_nodes {
31 struct node_memory_type_map {
32 struct memory_dev_type *memtype;
36 static DEFINE_MUTEX(memory_tier_lock);
37 static LIST_HEAD(memory_tiers);
38 static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
39 static struct memory_dev_type *default_dram_type;
41 static struct bus_type memory_tier_subsys = {
42 .name = "memory_tiering",
43 .dev_name = "memory_tier",
46 #ifdef CONFIG_MIGRATION
47 static int top_tier_adistance;
49 * node_demotion[] examples:
53 * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
65 * node_demotion[0].preferred = 2
66 * node_demotion[1].preferred = 3
67 * node_demotion[2].preferred = <empty>
68 * node_demotion[3].preferred = <empty>
72 * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node.
82 * node_demotion[0].preferred = <empty>
83 * node_demotion[1].preferred = <empty>
84 * node_demotion[2].preferred = <empty>
88 * Node 0 is CPU + DRAM nodes, Node 1 is HBM node, node 2 is PMEM node.
100 * node_demotion[0].preferred = 2
101 * node_demotion[1].preferred = 0
102 * node_demotion[2].preferred = <empty>
105 static struct demotion_nodes *node_demotion __read_mostly;
106 #endif /* CONFIG_MIGRATION */
/* Recover the memory_tier that embeds @device as its dev member. */
108 static inline struct memory_tier *to_memory_tier(struct device *device)
110 return container_of(device, struct memory_tier, dev);
/*
 * Accumulate, into a local mask that starts out empty, the node mask of
 * every memory type linked on @memtier->memory_types.
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably the accumulated mask is returned by value - confirm.
 */
113 static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
115 nodemask_t nodes = NODE_MASK_NONE;
116 struct memory_dev_type *memtype;
118 list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
/* OR the nodes of this memory type into the running union. */
119 nodes_or(nodes, nodes, memtype->nodes);
/*
 * Device release callback installed on each memory tier device by
 * find_create_memory_tier(); resolves @dev back to its memory_tier.
 * NOTE(review): the statement that actually frees the tier is not visible
 * in this excerpt.
 */
124 static void memory_tier_device_release(struct device *dev)
126 struct memory_tier *tier = to_memory_tier(dev);
128 * synchronize_rcu in clear_node_memory_tier makes sure
129 * we don't have rcu access to this memory tier.
/*
 * sysfs read handler behind DEVICE_ATTR_RO(nodelist): writes the node mask
 * of the tier that owns @dev into @buf as a bitmap list followed by a
 * newline. The mask is computed and formatted with memory_tier_lock held.
 * NOTE(review): local declarations and the return are not visible here;
 * presumably the sysfs_emit() result is returned - confirm.
 */
134 static ssize_t nodelist_show(struct device *dev,
135 struct device_attribute *attr, char *buf)
140 mutex_lock(&memory_tier_lock);
141 nmask = get_memtier_nodemask(to_memory_tier(dev));
142 ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
143 mutex_unlock(&memory_tier_lock);
146 static DEVICE_ATTR_RO(nodelist);
148 static struct attribute *memtier_dev_attrs[] = {
149 &dev_attr_nodelist.attr,
153 static const struct attribute_group memtier_dev_group = {
154 .attrs = memtier_dev_attrs,
157 static const struct attribute_group *memtier_dev_groups[] = {
/*
 * Look up, or allocate and register, the memory tier that @memtype maps to.
 *
 * A tier is identified by @memtype->adistance rounded down to a multiple of
 * MEMTIER_CHUNK_SIZE, compared against the adistance_start of each entry on
 * the memory_tiers list. The caller must hold memory_tier_lock (asserted
 * through lockdep).
 *
 * Error returns visible in this excerpt: ERR_PTR(-EINVAL) and
 * ERR_PTR(-ENOMEM).
 * NOTE(review): several branch conditions, labels and the final return are
 * not visible here, so the exact control flow between the steps below must
 * be confirmed against the full file.
 */
162 static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
165 bool found_slot = false;
166 struct memory_tier *memtier, *new_memtier;
167 int adistance = memtype->adistance;
168 unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;
170 lockdep_assert_held_once(&memory_tier_lock);
/* Normalise to the start of the chunk that contains this adistance. */
172 adistance = round_down(adistance, memtier_adistance_chunk_size);
174 * If the memtype is already part of a memory tier,
177 if (!list_empty(&memtype->tier_sibiling)) {
178 list_for_each_entry(memtier, &memory_tiers, list) {
179 if (adistance == memtier->adistance_start)
183 return ERR_PTR(-EINVAL);
/* Scan existing tiers for an exact match or for an insertion point. */
186 list_for_each_entry(memtier, &memory_tiers, list) {
187 if (adistance == memtier->adistance_start) {
189 } else if (adistance < memtier->adistance_start) {
/* kzalloc() returns zeroed memory, so unset fields of the tier start at 0. */
195 new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
197 return ERR_PTR(-ENOMEM);
199 new_memtier->adistance_start = adistance;
200 INIT_LIST_HEAD(&new_memtier->list);
201 INIT_LIST_HEAD(&new_memtier->memory_types);
/*
 * Two alternative insertions follow: in front of the tier the scan stopped
 * at, or at the tail of memory_tiers. NOTE(review): the selecting condition
 * is not visible; presumably it is found_slot - confirm.
 */
203 list_add_tail(&new_memtier->list, &memtier->list);
205 list_add_tail(&new_memtier->list, &memory_tiers);
/* The device id is the rounded adistance shifted right by MEMTIER_CHUNK_BITS. */
207 new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
208 new_memtier->dev.bus = &memory_tier_subsys;
209 new_memtier->dev.release = memory_tier_device_release;
210 new_memtier->dev.groups = memtier_dev_groups;
212 ret = device_register(&new_memtier->dev);
/*
 * Presumably the device_register() failure path (guard not visible): unlink
 * the tier and drop the device reference with put_device().
 */
214 list_del(&new_memtier->list);
215 put_device(&new_memtier->dev);
218 memtier = new_memtier;
/* Link the memory type onto the list of types of the chosen tier. */
221 list_add(&memtype->tier_sibiling, &memtier->memory_types);
/*
 * Return the memory tier recorded in the pgdat of @node. The pointer is
 * read with rcu_dereference_check() using lockdep_is_held(&memory_tier_lock)
 * as the condition, so callers are expected to hold that mutex.
 * NOTE(review): the pgdat declaration and any NULL-pgdat handling are not
 * visible in this excerpt.
 */
225 static struct memory_tier *__node_get_memory_tier(int node)
229 pgdat = NODE_DATA(node);
233 * Since we hold memory_tier_lock, we can avoid
234 * RCU read locks when accessing the details. No
235 * parallel updates are possible here.
237 return rcu_dereference_check(pgdat->memtier,
238 lockdep_is_held(&memory_tier_lock));
241 #ifdef CONFIG_MIGRATION
/*
 * Report whether @node counts as a top tier node. The memtier of the node
 * is read through rcu_dereference() and its adistance_start is compared
 * against top_tier_adistance to decide the result.
 * NOTE(review): the RCU read-side locking, the handling of a missing pgdat
 * or memtier, and the return statements are not visible in this excerpt.
 */
242 bool node_is_toptier(int node)
246 struct memory_tier *memtier;
248 pgdat = NODE_DATA(node);
253 memtier = rcu_dereference(pgdat->memtier);
258 if (memtier->adistance_start <= top_tier_adistance)
/*
 * Fill *@targets with the demotion fallback mask for @pgdat: either the
 * lower_tier_mask of its memory tier or NODE_MASK_NONE.
 * NOTE(review): the condition choosing between the two assignments and the
 * rcu_read_lock()/rcu_read_unlock() calls are not visible in this excerpt;
 * presumably NODE_MASK_NONE is used when no memtier is set - confirm.
 */
267 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
269 struct memory_tier *memtier;
272 * pg_data_t.memtier updates includes a synchronize_rcu()
273 * which ensures that we either find NULL or a valid memtier
274 * in NODE_DATA. protect the access via rcu_read_lock();
277 memtier = rcu_dereference(pgdat->memtier);
279 *targets = memtier->lower_tier_mask;
281 *targets = NODE_MASK_NONE;
286 * next_demotion_node() - Get the next node in the demotion path
287 * @node: The starting node to lookup the next node
289 * Return: node id for next memory node in the demotion path hierarchy
290 * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
291 * @node online or guarantee that it *continues* to be the next demotion
294 int next_demotion_node(int node)
296 struct demotion_nodes *nd;
/*
 * NOTE(review): node_demotion is allocated in memory_tier_init() and an
 * allocation failure there only triggers WARN_ON(); a NULL check guarding
 * the dereference below is not visible in this excerpt - confirm it exists.
 */
302 nd = &node_demotion[node];
305 * node_demotion[] is updated without excluding this
306 * function from running.
308 * Make sure to use RCU over entire code blocks if
309 * node_demotion[] reads need to be consistent.
313 * If there are multiple target nodes, just select one
314 * target node randomly.
316 * In addition, we can also use round-robin to select
317 * target node, but we should introduce another variable
318 * for node_demotion[] to record last selected target node,
319 * that may cause cache ping-pong due to the changing of
320 * last target node. Or introducing per-cpu data to avoid
321 * caching issue, which seems more complicated. So selecting
322 * target node randomly seems better until now.
324 target = node_random(&nd->preferred);
330 static void disable_all_demotion_targets(void)
332 struct memory_tier *memtier;
335 for_each_node_state(node, N_MEMORY) {
336 node_demotion[node].preferred = NODE_MASK_NONE;
338 * We are holding memory_tier_lock, it is safe
339 * to access pgdat->memtier.
341 memtier = __node_get_memory_tier(node);
343 memtier->lower_tier_mask = NODE_MASK_NONE;
346 * Ensure that the "disable" is visible across the system.
347 * Readers will see either a combination of before+disable
348 * state or disable+after. They will never see before and
349 * after state together.
355 * Find an automatic demotion target for all memory
356 * nodes. Failing here is OK. It might just indicate
357 * being at the end of a chain.
359 static void establish_demotion_targets(void)
361 struct memory_tier *memtier;
362 struct demotion_nodes *nd;
363 int target = NUMA_NO_NODE, node;
364 int distance, best_distance;
365 nodemask_t tier_nodes, lower_tier;
367 lockdep_assert_held_once(&memory_tier_lock);
372 disable_all_demotion_targets();
374 for_each_node_state(node, N_MEMORY) {
376 nd = &node_demotion[node];
378 memtier = __node_get_memory_tier(node);
379 if (!memtier || list_is_last(&memtier->list, &memory_tiers))
382 * Get the lower memtier to find the demotion node list.
384 memtier = list_next_entry(memtier, list);
385 tier_nodes = get_memtier_nodemask(memtier);
387 * find_next_best_node, use 'used' nodemask as a skip list.
388 * Add all memory nodes except the selected memory tier
389 * nodelist to skip list so that we find the best node from the
392 nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);
395 * Find all the nodes in the memory tier node list of same best distance.
396 * add them to the preferred mask. We randomly select between nodes
397 * in the preferred mask when allocating pages during demotion.
400 target = find_next_best_node(node, &tier_nodes);
401 if (target == NUMA_NO_NODE)
404 distance = node_distance(node, target);
405 if (distance == best_distance || best_distance == -1) {
406 best_distance = distance;
407 node_set(target, nd->preferred);
414 * Promotion is allowed from a memory tier to a higher
415 * memory tier only if the memory tier doesn't include
416 * compute. We want to skip promotion from a memory tier,
417 * if any node that is part of the memory tier has CPUs.
418 * Once we detect such a memory tier, we consider that tier
419 * as the top tier from which promotion is not allowed.
421 list_for_each_entry_reverse(memtier, &memory_tiers, list) {
422 tier_nodes = get_memtier_nodemask(memtier);
423 nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
424 if (!nodes_empty(tier_nodes)) {
426 * abstract distance below the max value of this memtier
427 * is considered toptier.
429 top_tier_adistance = memtier->adistance_start +
430 MEMTIER_CHUNK_SIZE - 1;
435 * Now build the lower_tier mask for each node collecting node mask from
436 * all memory tier below it. This allows us to fallback demotion page
437 * allocation to a set of nodes that is closer to the above selected
440 lower_tier = node_states[N_MEMORY];
441 list_for_each_entry(memtier, &memory_tiers, list) {
443 * Keep removing current tier from lower_tier nodes,
444 * This will remove all nodes in current and above
445 * memory tier from the lower_tier mask.
447 tier_nodes = get_memtier_nodemask(memtier);
448 nodes_andnot(lower_tier, lower_tier, tier_nodes);
449 memtier->lower_tier_mask = lower_tier;
454 static inline void establish_demotion_targets(void) {}
455 #endif /* CONFIG_MIGRATION */
457 static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
459 if (!node_memory_types[node].memtype)
460 node_memory_types[node].memtype = memtype;
462 * For each device getting added in the same NUMA node
463 * with this specific memtype, bump the map count. We
464 * only take the memtype device reference once, so that
465 * changing a node memtype can be done by dropping the
466 * only reference count taken here.
469 if (node_memory_types[node].memtype == memtype) {
470 if (!node_memory_types[node].map_count++)
471 kref_get(&memtype->kref);
/*
 * Attach @node to a memory tier. The caller must hold memory_tier_lock
 * (asserted through lockdep).
 *
 * A node without the N_MEMORY state is rejected with ERR_PTR(-EINVAL).
 * Otherwise the node falls back to default_dram_type when it has no memory
 * type yet, is added to the node mask of its type, and the tier obtained
 * from find_create_memory_tier() is published in pgdat->memtier unless it
 * is an error pointer.
 * NOTE(review): the final return is not visible; presumably memtier (which
 * may be an ERR_PTR) is returned - confirm.
 */
475 static struct memory_tier *set_node_memory_tier(int node)
477 struct memory_tier *memtier;
478 struct memory_dev_type *memtype;
479 pg_data_t *pgdat = NODE_DATA(node);
482 lockdep_assert_held_once(&memory_tier_lock);
484 if (!node_state(node, N_MEMORY))
485 return ERR_PTR(-EINVAL);
/* Only takes effect if no memory type was registered for this node. */
487 __init_node_memory_type(node, default_dram_type);
489 memtype = node_memory_types[node].memtype;
490 node_set(node, memtype->nodes);
491 memtier = find_create_memory_tier(memtype);
492 if (!IS_ERR(memtier))
493 rcu_assign_pointer(pgdat->memtier, memtier);
/* Unlink @memtier from the list it is on and unregister its device. */
497 static void destroy_memory_tier(struct memory_tier *memtier)
499 list_del(&memtier->list);
500 device_unregister(&memtier->dev);
/*
 * Detach @node from its memory tier, if it has one.
 *
 * Clears pgdat->memtier and removes @node from the node mask of its memory
 * type. When that leaves the type without nodes it is unlinked from the
 * tier, and a tier left without memory types is destroyed.
 * NOTE(review): the synchronize_rcu() call described in the comment below,
 * the update of cleared and the return are not visible in this excerpt;
 * the visible caller invokes this with memory_tier_lock held.
 */
503 static bool clear_node_memory_tier(int node)
505 bool cleared = false;
507 struct memory_tier *memtier;
509 pgdat = NODE_DATA(node);
514 * Make sure that anybody looking at NODE_DATA who finds
515 * a valid memtier finds memory_dev_types with nodes still
516 * linked to the memtier. We achieve this by waiting for
517 * rcu read section to finish using synchronize_rcu.
518 * This also enables us to free the destroyed memory tier
519 * with kfree instead of kfree_rcu
521 memtier = __node_get_memory_tier(node);
523 struct memory_dev_type *memtype;
525 rcu_assign_pointer(pgdat->memtier, NULL);
527 memtype = node_memory_types[node].memtype;
528 node_clear(node, memtype->nodes);
529 if (nodes_empty(memtype->nodes)) {
/* list_del_init() leaves tier_sibiling empty so the type can be re-linked. */
530 list_del_init(&memtype->tier_sibiling);
531 if (list_empty(&memtier->memory_types))
532 destroy_memory_tier(memtier);
/*
 * kref release callback for a memory_dev_type, passed to kref_put() by
 * put_memory_type(); recovers the type from its embedded kref.
 * NOTE(review): the statement that frees the object is not visible in this
 * excerpt.
 */
539 static void release_memtype(struct kref *kref)
541 struct memory_dev_type *memtype;
543 memtype = container_of(kref, struct memory_dev_type, kref);
/*
 * Allocate a memory_dev_type with abstract distance @adistance.
 *
 * The new type gets an empty tier_sibiling list head, an empty node mask
 * and a freshly initialised kref. ERR_PTR(-ENOMEM) is the error return
 * visible here (presumably taken when the allocation fails; the guard is
 * not visible). Exported to GPL modules.
 */
547 struct memory_dev_type *alloc_memory_type(int adistance)
549 struct memory_dev_type *memtype;
/* kmalloc() does not zero the object, so the fields are set explicitly below. */
551 memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
553 return ERR_PTR(-ENOMEM);
555 memtype->adistance = adistance;
556 INIT_LIST_HEAD(&memtype->tier_sibiling);
557 memtype->nodes = NODE_MASK_NONE;
558 kref_init(&memtype->kref);
561 EXPORT_SYMBOL_GPL(alloc_memory_type);
/*
 * Drop one reference on @memtype through kref_put(), with release_memtype()
 * as the release callback. Exported to GPL modules.
 */
563 void put_memory_type(struct memory_dev_type *memtype)
565 kref_put(&memtype->kref, release_memtype);
567 EXPORT_SYMBOL_GPL(put_memory_type);
/*
 * Locked wrapper: runs __init_node_memory_type() for @node and @memtype
 * with memory_tier_lock held. Exported to GPL modules.
 */
569 void init_node_memory_type(int node, struct memory_dev_type *memtype)
572 mutex_lock(&memory_tier_lock);
573 __init_node_memory_type(node, memtype);
574 mutex_unlock(&memory_tier_lock);
576 EXPORT_SYMBOL_GPL(init_node_memory_type);
578 void clear_node_memory_type(int node, struct memory_dev_type *memtype)
580 mutex_lock(&memory_tier_lock);
581 if (node_memory_types[node].memtype == memtype)
582 node_memory_types[node].map_count--;
584 * If we unmapped all the attached devices to this node,
585 * clear the node memory type.
587 if (!node_memory_types[node].map_count) {
588 node_memory_types[node].memtype = NULL;
589 put_memory_type(memtype);
591 mutex_unlock(&memory_tier_lock);
593 EXPORT_SYMBOL_GPL(clear_node_memory_type);
/*
 * Memory hotplug notifier, registered by memory_tier_init().
 *
 * Returns early when no node changes state (status_change_nid < 0).
 * Otherwise one of two sections runs under memory_tier_lock: clearing the
 * memory tier of the node, or setting it; demotion targets are rebuilt
 * when the clear reports true or the set does not return an error pointer.
 * NOTE(review): the dispatch on @action that selects between the two
 * sections is not visible in this excerpt.
 */
595 static int __meminit memtier_hotplug_callback(struct notifier_block *self,
596 unsigned long action, void *_arg)
598 struct memory_tier *memtier;
599 struct memory_notify *arg = _arg;
602 * Only update the node migration order when a node is
603 * changing status, like online->offline.
605 if (arg->status_change_nid < 0)
606 return notifier_from_errno(0);
610 mutex_lock(&memory_tier_lock);
611 if (clear_node_memory_tier(arg->status_change_nid))
612 establish_demotion_targets();
613 mutex_unlock(&memory_tier_lock);
616 mutex_lock(&memory_tier_lock);
617 memtier = set_node_memory_tier(arg->status_change_nid);
618 if (!IS_ERR(memtier))
619 establish_demotion_targets();
620 mutex_unlock(&memory_tier_lock);
624 return notifier_from_errno(0);
/*
 * Boot-time setup, run as a subsys_initcall.
 *
 * Visible steps, in order: register the memory tier subsystem, allocate the
 * node_demotion table under CONFIG_MIGRATION (a failed allocation only
 * triggers WARN_ON), create the default DRAM memory type (panic if that
 * returns an error pointer), assign every N_MEMORY node to a tier,
 * establish demotion targets, and register the hotplug notifier.
 * NOTE(review): the guard before the first panic(), the matching #endif,
 * the body of the per-node error handling and the return are not visible
 * in this excerpt.
 */
627 static int __init memory_tier_init(void)
630 struct memory_tier *memtier;
632 ret = subsys_virtual_register(&memory_tier_subsys, NULL);
634 panic("%s() failed to register memory tier subsystem\n", __func__);
636 #ifdef CONFIG_MIGRATION
/* One demotion_nodes entry per possible node id. */
637 node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
639 WARN_ON(!node_demotion);
641 mutex_lock(&memory_tier_lock);
643 * For now we can have 4 faster memory tiers with smaller adistance
644 * than default DRAM tier.
646 default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
647 if (IS_ERR(default_dram_type))
648 panic("%s() failed to allocate default DRAM tier\n", __func__);
651 * Look at all the existing N_MEMORY nodes and add them to
652 * default memory tier or to a tier if we already have memory
655 for_each_node_state(node, N_MEMORY) {
656 memtier = set_node_memory_tier(node);
659 * Continue with memtiers we are able to setup
663 establish_demotion_targets();
664 mutex_unlock(&memory_tier_lock);
666 hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
669 subsys_initcall(memory_tier_init);
671 bool numa_demotion_enabled = false;
673 #ifdef CONFIG_MIGRATION
/*
 * sysfs show handler for demotion_enabled: emits "true" or "false",
 * followed by a newline, according to numa_demotion_enabled.
 */
675 static ssize_t demotion_enabled_show(struct kobject *kobj,
676 struct kobj_attribute *attr, char *buf)
678 return sysfs_emit(buf, "%s\n",
679 numa_demotion_enabled ? "true" : "false");
/*
 * sysfs store handler for demotion_enabled: parses @buf with kstrtobool()
 * directly into numa_demotion_enabled.
 * NOTE(review): the handling of the kstrtobool() result and the return
 * value are not visible in this excerpt.
 */
682 static ssize_t demotion_enabled_store(struct kobject *kobj,
683 struct kobj_attribute *attr,
684 const char *buf, size_t count)
688 ret = kstrtobool(buf, &numa_demotion_enabled);
695 static struct kobj_attribute numa_demotion_enabled_attr =
696 __ATTR_RW(demotion_enabled);
698 static struct attribute *numa_attrs[] = {
699 &numa_demotion_enabled_attr.attr,
703 static const struct attribute_group numa_attr_group = {
/*
 * Run as a subsys_initcall: creates the "numa" kobject under mm_kobj and
 * attaches numa_attr_group to it, logging an error for each step that
 * fails.
 * NOTE(review): the failure guards, labels and return statements are not
 * visible in this excerpt; presumably kobject_put() below belongs to the
 * error path taken when sysfs_create_group() fails - confirm.
 */
707 static int __init numa_init_sysfs(void)
710 struct kobject *numa_kobj;
712 numa_kobj = kobject_create_and_add("numa", mm_kobj);
714 pr_err("failed to create numa kobject\n");
717 err = sysfs_create_group(numa_kobj, &numa_attr_group);
719 pr_err("failed to register numa group\n");
725 kobject_put(numa_kobj);
728 subsys_initcall(numa_init_sysfs);
729 #endif /* CONFIG_SYSFS */