mm/page_ext: move functions around for minor cleanups to page_ext
[platform/kernel/linux-starfive.git] / mm / memcontrol.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* memcontrol.c - Memory Controller
3  *
4  * Copyright IBM Corporation, 2007
5  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
6  *
7  * Copyright 2007 OpenVZ SWsoft Inc
8  * Author: Pavel Emelianov <xemul@openvz.org>
9  *
10  * Memory thresholds
11  * Copyright (C) 2009 Nokia Corporation
12  * Author: Kirill A. Shutemov
13  *
14  * Kernel Memory Controller
15  * Copyright (C) 2012 Parallels Inc. and Google Inc.
16  * Authors: Glauber Costa and Suleiman Souhlal
17  *
18  * Native page reclaim
19  * Charge lifetime sanitation
20  * Lockless page tracking & accounting
21  * Unified hierarchy configuration model
22  * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
23  *
24  * Per memcg lru locking
25  * Copyright (C) 2020 Alibaba, Inc, Alex Shi
26  */
27
28 #include <linux/page_counter.h>
29 #include <linux/memcontrol.h>
30 #include <linux/cgroup.h>
31 #include <linux/pagewalk.h>
32 #include <linux/sched/mm.h>
33 #include <linux/shmem_fs.h>
34 #include <linux/hugetlb.h>
35 #include <linux/pagemap.h>
36 #include <linux/vm_event_item.h>
37 #include <linux/smp.h>
38 #include <linux/page-flags.h>
39 #include <linux/backing-dev.h>
40 #include <linux/bit_spinlock.h>
41 #include <linux/rcupdate.h>
42 #include <linux/limits.h>
43 #include <linux/export.h>
44 #include <linux/mutex.h>
45 #include <linux/rbtree.h>
46 #include <linux/slab.h>
47 #include <linux/swap.h>
48 #include <linux/swapops.h>
49 #include <linux/spinlock.h>
50 #include <linux/eventfd.h>
51 #include <linux/poll.h>
52 #include <linux/sort.h>
53 #include <linux/fs.h>
54 #include <linux/seq_file.h>
55 #include <linux/vmpressure.h>
56 #include <linux/memremap.h>
57 #include <linux/mm_inline.h>
58 #include <linux/swap_cgroup.h>
59 #include <linux/cpu.h>
60 #include <linux/oom.h>
61 #include <linux/lockdep.h>
62 #include <linux/file.h>
63 #include <linux/resume_user_mode.h>
64 #include <linux/psi.h>
65 #include <linux/seq_buf.h>
66 #include <linux/sched/isolation.h>
67 #include "internal.h"
68 #include <net/sock.h>
69 #include <net/ip.h>
70 #include "slab.h"
71 #include "swap.h"
72
73 #include <linux/uaccess.h>
74
75 #include <trace/events/vmscan.h>
76
77 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
78 EXPORT_SYMBOL(memory_cgrp_subsys);
79
80 struct mem_cgroup *root_mem_cgroup __read_mostly;
81
82 /* Active memory cgroup to use from an interrupt context */
83 DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
84 EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
85
86 /* Socket memory accounting disabled? */
87 static bool cgroup_memory_nosocket __ro_after_init;
88
89 /* Kernel memory accounting disabled? */
90 static bool cgroup_memory_nokmem __ro_after_init;
91
92 /* BPF memory accounting disabled? */
93 static bool cgroup_memory_nobpf __ro_after_init;
94
95 #ifdef CONFIG_CGROUP_WRITEBACK
96 static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
97 #endif
98
99 /* Whether legacy memory+swap accounting is active */
100 static bool do_memsw_account(void)
101 {
102         return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
103 }
104
105 #define THRESHOLDS_EVENTS_TARGET 128
106 #define SOFTLIMIT_EVENTS_TARGET 1024
107
108 /*
109  * Cgroups above their limits are maintained in a RB-Tree, independent of
110  * their hierarchy representation
111  */
112
113 struct mem_cgroup_tree_per_node {
114         struct rb_root rb_root;
115         struct rb_node *rb_rightmost;
116         spinlock_t lock;
117 };
118
119 struct mem_cgroup_tree {
120         struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
121 };
122
123 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
124
125 /* for OOM */
126 struct mem_cgroup_eventfd_list {
127         struct list_head list;
128         struct eventfd_ctx *eventfd;
129 };
130
131 /*
132  * cgroup_event represents events which userspace want to receive.
133  */
134 struct mem_cgroup_event {
135         /*
136          * memcg which the event belongs to.
137          */
138         struct mem_cgroup *memcg;
139         /*
140          * eventfd to signal userspace about the event.
141          */
142         struct eventfd_ctx *eventfd;
143         /*
144          * Each of these stored in a list by the cgroup.
145          */
146         struct list_head list;
147         /*
148          * register_event() callback will be used to add new userspace
149          * waiter for changes related to this event.  Use eventfd_signal()
150          * on eventfd to send notification to userspace.
151          */
152         int (*register_event)(struct mem_cgroup *memcg,
153                               struct eventfd_ctx *eventfd, const char *args);
154         /*
155          * unregister_event() callback will be called when userspace closes
156          * the eventfd or on cgroup removing.  This callback must be set,
157          * if you want provide notification functionality.
158          */
159         void (*unregister_event)(struct mem_cgroup *memcg,
160                                  struct eventfd_ctx *eventfd);
161         /*
162          * All fields below needed to unregister event when
163          * userspace closes eventfd.
164          */
165         poll_table pt;
166         wait_queue_head_t *wqh;
167         wait_queue_entry_t wait;
168         struct work_struct remove;
169 };
170
171 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
172 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
173
174 /* State for moving charges at task migration. */
175 /*
176  * Types of charges to be moved.
177  */
178 #define MOVE_ANON       0x1U
179 #define MOVE_FILE       0x2U
180 #define MOVE_MASK       (MOVE_ANON | MOVE_FILE)
181
182 /* "mc" and its members are protected by cgroup_mutex */
183 static struct move_charge_struct {
184         spinlock_t        lock; /* for from, to */
185         struct mm_struct  *mm;
186         struct mem_cgroup *from;
187         struct mem_cgroup *to;
188         unsigned long flags;
189         unsigned long precharge;
190         unsigned long moved_charge;
191         unsigned long moved_swap;
192         struct task_struct *moving_task;        /* a task moving charges */
193         wait_queue_head_t waitq;                /* a waitq for other context */
194 } mc = {
195         .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
196         .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
197 };
198
199 /*
200  * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
201  * limit reclaim to prevent infinite loops, if they ever occur.
202  */
203 #define MEM_CGROUP_MAX_RECLAIM_LOOPS            100
204 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
205
/*
 * Resource types used when encoding the cft->private value of a file.
 * NOTE(review): presumably these are the "x" argument handed to
 * MEMFILE_PRIVATE() - confirm against the cftype tables.
 */
enum res_type {
	_MEM		= 0,
	_MEMSWAP	= 1,
	_KMEM		= 2,
	_TCP		= 3,
};
213
/*
 * Pack two values into one integer: @x goes above bit 16, @val is OR-ed
 * into the low bits. MEMFILE_TYPE() and MEMFILE_ATTR() recover the upper
 * and lower 16-bit fields respectively.
 */
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
217
/*
 * Iterators that visit every cgroup, either below @root or (with a NULL
 * root) all of them. A loop that is left early with break must call
 * mem_cgroup_iter_break() so that reference counting stays balanced.
 */
#define for_each_mem_cgroup_tree(iter, root)			\
	for ((iter) = mem_cgroup_iter(root, NULL, NULL);	\
	     (iter) != NULL;					\
	     (iter) = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)				\
	for ((iter) = mem_cgroup_iter(NULL, NULL, NULL);	\
	     (iter) != NULL;					\
	     (iter) = mem_cgroup_iter(NULL, iter, NULL))
232
233 static inline bool task_is_dying(void)
234 {
235         return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
236                 (current->flags & PF_EXITING);
237 }
238
239 /* Some nice accessors for the vmpressure. */
240 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
241 {
242         if (!memcg)
243                 memcg = root_mem_cgroup;
244         return &memcg->vmpressure;
245 }
246
247 struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
248 {
249         return container_of(vmpr, struct mem_cgroup, vmpressure);
250 }
251
252 #ifdef CONFIG_MEMCG_KMEM
253 static DEFINE_SPINLOCK(objcg_lock);
254
255 bool mem_cgroup_kmem_disabled(void)
256 {
257         return cgroup_memory_nokmem;
258 }
259
260 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
261                                       unsigned int nr_pages);
262
263 static void obj_cgroup_release(struct percpu_ref *ref)
264 {
265         struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
266         unsigned int nr_bytes;
267         unsigned int nr_pages;
268         unsigned long flags;
269
270         /*
271          * At this point all allocated objects are freed, and
272          * objcg->nr_charged_bytes can't have an arbitrary byte value.
273          * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
274          *
275          * The following sequence can lead to it:
276          * 1) CPU0: objcg == stock->cached_objcg
277          * 2) CPU1: we do a small allocation (e.g. 92 bytes),
278          *          PAGE_SIZE bytes are charged
279          * 3) CPU1: a process from another memcg is allocating something,
280          *          the stock if flushed,
281          *          objcg->nr_charged_bytes = PAGE_SIZE - 92
282          * 5) CPU0: we do release this object,
283          *          92 bytes are added to stock->nr_bytes
284          * 6) CPU0: stock is flushed,
285          *          92 bytes are added to objcg->nr_charged_bytes
286          *
287          * In the result, nr_charged_bytes == PAGE_SIZE.
288          * This page will be uncharged in obj_cgroup_release().
289          */
290         nr_bytes = atomic_read(&objcg->nr_charged_bytes);
291         WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
292         nr_pages = nr_bytes >> PAGE_SHIFT;
293
294         if (nr_pages)
295                 obj_cgroup_uncharge_pages(objcg, nr_pages);
296
297         spin_lock_irqsave(&objcg_lock, flags);
298         list_del(&objcg->list);
299         spin_unlock_irqrestore(&objcg_lock, flags);
300
301         percpu_ref_exit(ref);
302         kfree_rcu(objcg, rcu);
303 }
304
305 static struct obj_cgroup *obj_cgroup_alloc(void)
306 {
307         struct obj_cgroup *objcg;
308         int ret;
309
310         objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
311         if (!objcg)
312                 return NULL;
313
314         ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
315                               GFP_KERNEL);
316         if (ret) {
317                 kfree(objcg);
318                 return NULL;
319         }
320         INIT_LIST_HEAD(&objcg->list);
321         return objcg;
322 }
323
324 static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
325                                   struct mem_cgroup *parent)
326 {
327         struct obj_cgroup *objcg, *iter;
328
329         objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
330
331         spin_lock_irq(&objcg_lock);
332
333         /* 1) Ready to reparent active objcg. */
334         list_add(&objcg->list, &memcg->objcg_list);
335         /* 2) Reparent active objcg and already reparented objcgs to parent. */
336         list_for_each_entry(iter, &memcg->objcg_list, list)
337                 WRITE_ONCE(iter->memcg, parent);
338         /* 3) Move already reparented objcgs to the parent's list */
339         list_splice(&memcg->objcg_list, &parent->objcg_list);
340
341         spin_unlock_irq(&objcg_lock);
342
343         percpu_ref_kill(&objcg->refcnt);
344 }
345
346 /*
347  * A lot of the calls to the cache allocation functions are expected to be
348  * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
349  * conditional to this static branch, we'll have to allow modules that do
350  * kmem_cache_alloc and the like to see this symbol as well.
351  */
352 DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key);
353 EXPORT_SYMBOL(memcg_kmem_online_key);
354
355 DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
356 EXPORT_SYMBOL(memcg_bpf_enabled_key);
357 #endif
358
359 /**
360  * mem_cgroup_css_from_folio - css of the memcg associated with a folio
361  * @folio: folio of interest
362  *
363  * If memcg is bound to the default hierarchy, css of the memcg associated
364  * with @folio is returned.  The returned css remains associated with @folio
365  * until it is released.
366  *
367  * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
368  * is returned.
369  */
370 struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
371 {
372         struct mem_cgroup *memcg = folio_memcg(folio);
373
374         if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
375                 memcg = root_mem_cgroup;
376
377         return &memcg->css;
378 }
379
380 /**
381  * page_cgroup_ino - return inode number of the memcg a page is charged to
382  * @page: the page
383  *
384  * Look up the closest online ancestor of the memory cgroup @page is charged to
385  * and return its inode number or 0 if @page is not charged to any cgroup. It
386  * is safe to call this function without holding a reference to @page.
387  *
388  * Note, this function is inherently racy, because there is nothing to prevent
389  * the cgroup inode from getting torn down and potentially reallocated a moment
390  * after page_cgroup_ino() returns, so it only should be used by callers that
391  * do not care (such as procfs interfaces).
392  */
393 ino_t page_cgroup_ino(struct page *page)
394 {
395         struct mem_cgroup *memcg;
396         unsigned long ino = 0;
397
398         rcu_read_lock();
399         /* page_folio() is racy here, but the entire function is racy anyway */
400         memcg = folio_memcg_check(page_folio(page));
401
402         while (memcg && !(memcg->css.flags & CSS_ONLINE))
403                 memcg = parent_mem_cgroup(memcg);
404         if (memcg)
405                 ino = cgroup_ino(memcg->css.cgroup);
406         rcu_read_unlock();
407         return ino;
408 }
409
410 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
411                                          struct mem_cgroup_tree_per_node *mctz,
412                                          unsigned long new_usage_in_excess)
413 {
414         struct rb_node **p = &mctz->rb_root.rb_node;
415         struct rb_node *parent = NULL;
416         struct mem_cgroup_per_node *mz_node;
417         bool rightmost = true;
418
419         if (mz->on_tree)
420                 return;
421
422         mz->usage_in_excess = new_usage_in_excess;
423         if (!mz->usage_in_excess)
424                 return;
425         while (*p) {
426                 parent = *p;
427                 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
428                                         tree_node);
429                 if (mz->usage_in_excess < mz_node->usage_in_excess) {
430                         p = &(*p)->rb_left;
431                         rightmost = false;
432                 } else {
433                         p = &(*p)->rb_right;
434                 }
435         }
436
437         if (rightmost)
438                 mctz->rb_rightmost = &mz->tree_node;
439
440         rb_link_node(&mz->tree_node, parent, p);
441         rb_insert_color(&mz->tree_node, &mctz->rb_root);
442         mz->on_tree = true;
443 }
444
445 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
446                                          struct mem_cgroup_tree_per_node *mctz)
447 {
448         if (!mz->on_tree)
449                 return;
450
451         if (&mz->tree_node == mctz->rb_rightmost)
452                 mctz->rb_rightmost = rb_prev(&mz->tree_node);
453
454         rb_erase(&mz->tree_node, &mctz->rb_root);
455         mz->on_tree = false;
456 }
457
458 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
459                                        struct mem_cgroup_tree_per_node *mctz)
460 {
461         unsigned long flags;
462
463         spin_lock_irqsave(&mctz->lock, flags);
464         __mem_cgroup_remove_exceeded(mz, mctz);
465         spin_unlock_irqrestore(&mctz->lock, flags);
466 }
467
468 static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
469 {
470         unsigned long nr_pages = page_counter_read(&memcg->memory);
471         unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
472         unsigned long excess = 0;
473
474         if (nr_pages > soft_limit)
475                 excess = nr_pages - soft_limit;
476
477         return excess;
478 }
479
480 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
481 {
482         unsigned long excess;
483         struct mem_cgroup_per_node *mz;
484         struct mem_cgroup_tree_per_node *mctz;
485
486         if (lru_gen_enabled()) {
487                 if (soft_limit_excess(memcg))
488                         lru_gen_soft_reclaim(memcg, nid);
489                 return;
490         }
491
492         mctz = soft_limit_tree.rb_tree_per_node[nid];
493         if (!mctz)
494                 return;
495         /*
496          * Necessary to update all ancestors when hierarchy is used.
497          * because their event counter is not touched.
498          */
499         for (; memcg; memcg = parent_mem_cgroup(memcg)) {
500                 mz = memcg->nodeinfo[nid];
501                 excess = soft_limit_excess(memcg);
502                 /*
503                  * We have to update the tree if mz is on RB-tree or
504                  * mem is over its softlimit.
505                  */
506                 if (excess || mz->on_tree) {
507                         unsigned long flags;
508
509                         spin_lock_irqsave(&mctz->lock, flags);
510                         /* if on-tree, remove it */
511                         if (mz->on_tree)
512                                 __mem_cgroup_remove_exceeded(mz, mctz);
513                         /*
514                          * Insert again. mz->usage_in_excess will be updated.
515                          * If excess is 0, no tree ops.
516                          */
517                         __mem_cgroup_insert_exceeded(mz, mctz, excess);
518                         spin_unlock_irqrestore(&mctz->lock, flags);
519                 }
520         }
521 }
522
523 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
524 {
525         struct mem_cgroup_tree_per_node *mctz;
526         struct mem_cgroup_per_node *mz;
527         int nid;
528
529         for_each_node(nid) {
530                 mz = memcg->nodeinfo[nid];
531                 mctz = soft_limit_tree.rb_tree_per_node[nid];
532                 if (mctz)
533                         mem_cgroup_remove_exceeded(mz, mctz);
534         }
535 }
536
537 static struct mem_cgroup_per_node *
538 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
539 {
540         struct mem_cgroup_per_node *mz;
541
542 retry:
543         mz = NULL;
544         if (!mctz->rb_rightmost)
545                 goto done;              /* Nothing to reclaim from */
546
547         mz = rb_entry(mctz->rb_rightmost,
548                       struct mem_cgroup_per_node, tree_node);
549         /*
550          * Remove the node now but someone else can add it back,
551          * we will to add it back at the end of reclaim to its correct
552          * position in the tree.
553          */
554         __mem_cgroup_remove_exceeded(mz, mctz);
555         if (!soft_limit_excess(mz->memcg) ||
556             !css_tryget(&mz->memcg->css))
557                 goto retry;
558 done:
559         return mz;
560 }
561
562 static struct mem_cgroup_per_node *
563 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
564 {
565         struct mem_cgroup_per_node *mz;
566
567         spin_lock_irq(&mctz->lock);
568         mz = __mem_cgroup_largest_soft_limit_node(mctz);
569         spin_unlock_irq(&mctz->lock);
570         return mz;
571 }
572
573 /*
574  * memcg and lruvec stats flushing
575  *
576  * Many codepaths leading to stats update or read are performance sensitive and
577  * adding stats flushing in such codepaths is not desirable. So, to optimize the
578  * flushing the kernel does:
579  *
580  * 1) Periodically and asynchronously flush the stats every 2 seconds to not let
581  *    rstat update tree grow unbounded.
582  *
583  * 2) Flush the stats synchronously on reader side only when there are more than
584  *    (MEMCG_CHARGE_BATCH * nr_cpus) update events. This optimization lets the
585  *    stats be out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus) events, but
586  *    only for 2 seconds due to (1).
587  */
588 static void flush_memcg_stats_dwork(struct work_struct *w);
589 static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
590 static DEFINE_PER_CPU(unsigned int, stats_updates);
591 static atomic_t stats_flush_ongoing = ATOMIC_INIT(0);
592 static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
593 static u64 flush_next_time;
594
595 #define FLUSH_TIME (2UL*HZ)
596
/*
 * Accessors that make sure preemption is disabled on PREEMPT_RT, where it
 * cannot be relied upon as a side effect of holding a spinlock_t. These
 * functions are never used in hardirq context on PREEMPT_RT, so disabling
 * preemption is sufficient.
 */

/* Plain variant: only disables preemption (nested form). */
static void __memcg_stats_lock(void)
{
	preempt_disable_nested();
}

/* Checked variant: additionally runs VM_WARN_ON_IRQS_ENABLED(). */
static void memcg_stats_lock(void)
{
	__memcg_stats_lock();
	VM_WARN_ON_IRQS_ENABLED();
}

/* Counterpart of both lock variants. */
static void memcg_stats_unlock(void)
{
	preempt_enable_nested();
}
618
619 static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
620 {
621         unsigned int x;
622
623         if (!val)
624                 return;
625
626         cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
627
628         x = __this_cpu_add_return(stats_updates, abs(val));
629         if (x > MEMCG_CHARGE_BATCH) {
630                 /*
631                  * If stats_flush_threshold exceeds the threshold
632                  * (>num_online_cpus()), cgroup stats update will be triggered
633                  * in __mem_cgroup_flush_stats(). Increasing this var further
634                  * is redundant and simply adds overhead in atomic update.
635                  */
636                 if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
637                         atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
638                 __this_cpu_write(stats_updates, 0);
639         }
640 }
641
642 static void do_flush_stats(void)
643 {
644         /*
645          * We always flush the entire tree, so concurrent flushers can just
646          * skip. This avoids a thundering herd problem on the rstat global lock
647          * from memcg flushers (e.g. reclaim, refault, etc).
648          */
649         if (atomic_read(&stats_flush_ongoing) ||
650             atomic_xchg(&stats_flush_ongoing, 1))
651                 return;
652
653         WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
654
655         cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
656
657         atomic_set(&stats_flush_threshold, 0);
658         atomic_set(&stats_flush_ongoing, 0);
659 }
660
661 void mem_cgroup_flush_stats(void)
662 {
663         if (atomic_read(&stats_flush_threshold) > num_online_cpus())
664                 do_flush_stats();
665 }
666
667 void mem_cgroup_flush_stats_ratelimited(void)
668 {
669         if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
670                 mem_cgroup_flush_stats();
671 }
672
673 static void flush_memcg_stats_dwork(struct work_struct *w)
674 {
675         /*
676          * Always flush here so that flushing in latency-sensitive paths is
677          * as cheap as possible.
678          */
679         do_flush_stats();
680         queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
681 }
682
683 /* Subset of vm_event_item to report for memcg event stats */
684 static const unsigned int memcg_vm_event_stat[] = {
685         PGPGIN,
686         PGPGOUT,
687         PGSCAN_KSWAPD,
688         PGSCAN_DIRECT,
689         PGSCAN_KHUGEPAGED,
690         PGSTEAL_KSWAPD,
691         PGSTEAL_DIRECT,
692         PGSTEAL_KHUGEPAGED,
693         PGFAULT,
694         PGMAJFAULT,
695         PGREFILL,
696         PGACTIVATE,
697         PGDEACTIVATE,
698         PGLAZYFREE,
699         PGLAZYFREED,
700 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
701         ZSWPIN,
702         ZSWPOUT,
703 #endif
704 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
705         THP_FAULT_ALLOC,
706         THP_COLLAPSE_ALLOC,
707 #endif
708 };
709
710 #define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
711 static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
712
713 static void init_memcg_events(void)
714 {
715         int i;
716
717         for (i = 0; i < NR_MEMCG_EVENTS; ++i)
718                 mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1;
719 }
720
721 static inline int memcg_events_index(enum vm_event_item idx)
722 {
723         return mem_cgroup_events_index[idx] - 1;
724 }
725
726 struct memcg_vmstats_percpu {
727         /* Local (CPU and cgroup) page state & events */
728         long                    state[MEMCG_NR_STAT];
729         unsigned long           events[NR_MEMCG_EVENTS];
730
731         /* Delta calculation for lockless upward propagation */
732         long                    state_prev[MEMCG_NR_STAT];
733         unsigned long           events_prev[NR_MEMCG_EVENTS];
734
735         /* Cgroup1: threshold notifications & softlimit tree updates */
736         unsigned long           nr_page_events;
737         unsigned long           targets[MEM_CGROUP_NTARGETS];
738 };
739
740 struct memcg_vmstats {
741         /* Aggregated (CPU and subtree) page state & events */
742         long                    state[MEMCG_NR_STAT];
743         unsigned long           events[NR_MEMCG_EVENTS];
744
745         /* Pending child counts during tree propagation */
746         long                    state_pending[MEMCG_NR_STAT];
747         unsigned long           events_pending[NR_MEMCG_EVENTS];
748 };
749
750 unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
751 {
752         long x = READ_ONCE(memcg->vmstats->state[idx]);
753 #ifdef CONFIG_SMP
754         if (x < 0)
755                 x = 0;
756 #endif
757         return x;
758 }
759
760 /**
761  * __mod_memcg_state - update cgroup memory statistics
762  * @memcg: the memory cgroup
763  * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
764  * @val: delta to add to the counter, can be negative
765  */
766 void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
767 {
768         if (mem_cgroup_disabled())
769                 return;
770
771         __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
772         memcg_rstat_updated(memcg, val);
773 }
774
775 /* idx can be of type enum memcg_stat_item or node_stat_item. */
776 static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
777 {
778         long x = 0;
779         int cpu;
780
781         for_each_possible_cpu(cpu)
782                 x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
783 #ifdef CONFIG_SMP
784         if (x < 0)
785                 x = 0;
786 #endif
787         return x;
788 }
789
790 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
791                               int val)
792 {
793         struct mem_cgroup_per_node *pn;
794         struct mem_cgroup *memcg;
795
796         pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
797         memcg = pn->memcg;
798
799         /*
800          * The caller from rmap relay on disabled preemption becase they never
801          * update their counter from in-interrupt context. For these two
802          * counters we check that the update is never performed from an
803          * interrupt context while other caller need to have disabled interrupt.
804          */
805         __memcg_stats_lock();
806         if (IS_ENABLED(CONFIG_DEBUG_VM)) {
807                 switch (idx) {
808                 case NR_ANON_MAPPED:
809                 case NR_FILE_MAPPED:
810                 case NR_ANON_THPS:
811                 case NR_SHMEM_PMDMAPPED:
812                 case NR_FILE_PMDMAPPED:
813                         WARN_ON_ONCE(!in_task());
814                         break;
815                 default:
816                         VM_WARN_ON_IRQS_ENABLED();
817                 }
818         }
819
820         /* Update memcg */
821         __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
822
823         /* Update lruvec */
824         __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
825
826         memcg_rstat_updated(memcg, val);
827         memcg_stats_unlock();
828 }
829
830 /**
831  * __mod_lruvec_state - update lruvec memory statistics
832  * @lruvec: the lruvec
833  * @idx: the stat item
834  * @val: delta to add to the counter, can be negative
835  *
836  * The lruvec is the intersection of the NUMA node and a cgroup. This
837  * function updates the all three counters that are affected by a
838  * change of state at this level: per-node, per-cgroup, per-lruvec.
839  */
840 void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
841                         int val)
842 {
843         /* Update node */
844         __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
845
846         /* Update memcg and lruvec */
847         if (!mem_cgroup_disabled())
848                 __mod_memcg_lruvec_state(lruvec, idx, val);
849 }
850
851 void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
852                              int val)
853 {
854         struct page *head = compound_head(page); /* rmap on tail pages */
855         struct mem_cgroup *memcg;
856         pg_data_t *pgdat = page_pgdat(page);
857         struct lruvec *lruvec;
858
859         rcu_read_lock();
860         memcg = page_memcg(head);
861         /* Untracked pages have no memcg, no lruvec. Update only the node */
862         if (!memcg) {
863                 rcu_read_unlock();
864                 __mod_node_page_state(pgdat, idx, val);
865                 return;
866         }
867
868         lruvec = mem_cgroup_lruvec(memcg, pgdat);
869         __mod_lruvec_state(lruvec, idx, val);
870         rcu_read_unlock();
871 }
872 EXPORT_SYMBOL(__mod_lruvec_page_state);
873
874 void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
875 {
876         pg_data_t *pgdat = page_pgdat(virt_to_page(p));
877         struct mem_cgroup *memcg;
878         struct lruvec *lruvec;
879
880         rcu_read_lock();
881         memcg = mem_cgroup_from_slab_obj(p);
882
883         /*
884          * Untracked pages have no memcg, no lruvec. Update only the
885          * node. If we reparent the slab objects to the root memcg,
886          * when we free the slab object, we need to update the per-memcg
887          * vmstats to keep it correct for the root memcg.
888          */
889         if (!memcg) {
890                 __mod_node_page_state(pgdat, idx, val);
891         } else {
892                 lruvec = mem_cgroup_lruvec(memcg, pgdat);
893                 __mod_lruvec_state(lruvec, idx, val);
894         }
895         rcu_read_unlock();
896 }
897
898 /**
899  * __count_memcg_events - account VM events in a cgroup
900  * @memcg: the memory cgroup
901  * @idx: the event item
902  * @count: the number of events that occurred
903  */
904 void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
905                           unsigned long count)
906 {
907         int index = memcg_events_index(idx);
908
909         if (mem_cgroup_disabled() || index < 0)
910                 return;
911
912         memcg_stats_lock();
913         __this_cpu_add(memcg->vmstats_percpu->events[index], count);
914         memcg_rstat_updated(memcg, count);
915         memcg_stats_unlock();
916 }
917
918 static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
919 {
920         int index = memcg_events_index(event);
921
922         if (index < 0)
923                 return 0;
924         return READ_ONCE(memcg->vmstats->events[index]);
925 }
926
927 static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
928 {
929         long x = 0;
930         int cpu;
931         int index = memcg_events_index(event);
932
933         if (index < 0)
934                 return 0;
935
936         for_each_possible_cpu(cpu)
937                 x += per_cpu(memcg->vmstats_percpu->events[index], cpu);
938         return x;
939 }
940
941 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
942                                          int nr_pages)
943 {
944         /* pagein of a big page is an event. So, ignore page size */
945         if (nr_pages > 0)
946                 __count_memcg_events(memcg, PGPGIN, 1);
947         else {
948                 __count_memcg_events(memcg, PGPGOUT, 1);
949                 nr_pages = -nr_pages; /* for event */
950         }
951
952         __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
953 }
954
955 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
956                                        enum mem_cgroup_events_target target)
957 {
958         unsigned long val, next;
959
960         val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
961         next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
962         /* from time_after() in jiffies.h */
963         if ((long)(next - val) < 0) {
964                 switch (target) {
965                 case MEM_CGROUP_TARGET_THRESH:
966                         next = val + THRESHOLDS_EVENTS_TARGET;
967                         break;
968                 case MEM_CGROUP_TARGET_SOFTLIMIT:
969                         next = val + SOFTLIMIT_EVENTS_TARGET;
970                         break;
971                 default:
972                         break;
973                 }
974                 __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
975                 return true;
976         }
977         return false;
978 }
979
980 /*
981  * Check events in order.
982  *
983  */
984 static void memcg_check_events(struct mem_cgroup *memcg, int nid)
985 {
986         if (IS_ENABLED(CONFIG_PREEMPT_RT))
987                 return;
988
989         /* threshold event is triggered in finer grain than soft limit */
990         if (unlikely(mem_cgroup_event_ratelimit(memcg,
991                                                 MEM_CGROUP_TARGET_THRESH))) {
992                 bool do_softlimit;
993
994                 do_softlimit = mem_cgroup_event_ratelimit(memcg,
995                                                 MEM_CGROUP_TARGET_SOFTLIMIT);
996                 mem_cgroup_threshold(memcg);
997                 if (unlikely(do_softlimit))
998                         mem_cgroup_update_tree(memcg, nid);
999         }
1000 }
1001
1002 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1003 {
1004         /*
1005          * mm_update_next_owner() may clear mm->owner to NULL
1006          * if it races with swapoff, page migration, etc.
1007          * So this can be called with p == NULL.
1008          */
1009         if (unlikely(!p))
1010                 return NULL;
1011
1012         return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
1013 }
1014 EXPORT_SYMBOL(mem_cgroup_from_task);
1015
1016 static __always_inline struct mem_cgroup *active_memcg(void)
1017 {
1018         if (!in_task())
1019                 return this_cpu_read(int_active_memcg);
1020         else
1021                 return current->active_memcg;
1022 }
1023
1024 /**
1025  * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
1026  * @mm: mm from which memcg should be extracted. It can be NULL.
1027  *
1028  * Obtain a reference on mm->memcg and returns it if successful. If mm
1029  * is NULL, then the memcg is chosen as follows:
1030  * 1) The active memcg, if set.
1031  * 2) current->mm->memcg, if available
1032  * 3) root memcg
1033  * If mem_cgroup is disabled, NULL is returned.
1034  */
1035 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
1036 {
1037         struct mem_cgroup *memcg;
1038
1039         if (mem_cgroup_disabled())
1040                 return NULL;
1041
1042         /*
1043          * Page cache insertions can happen without an
1044          * actual mm context, e.g. during disk probing
1045          * on boot, loopback IO, acct() writes etc.
1046          *
1047          * No need to css_get on root memcg as the reference
1048          * counting is disabled on the root level in the
1049          * cgroup core. See CSS_NO_REF.
1050          */
1051         if (unlikely(!mm)) {
1052                 memcg = active_memcg();
1053                 if (unlikely(memcg)) {
1054                         /* remote memcg must hold a ref */
1055                         css_get(&memcg->css);
1056                         return memcg;
1057                 }
1058                 mm = current->mm;
1059                 if (unlikely(!mm))
1060                         return root_mem_cgroup;
1061         }
1062
1063         rcu_read_lock();
1064         do {
1065                 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1066                 if (unlikely(!memcg))
1067                         memcg = root_mem_cgroup;
1068         } while (!css_tryget(&memcg->css));
1069         rcu_read_unlock();
1070         return memcg;
1071 }
1072 EXPORT_SYMBOL(get_mem_cgroup_from_mm);
1073
1074 static __always_inline bool memcg_kmem_bypass(void)
1075 {
1076         /* Allow remote memcg charging from any context. */
1077         if (unlikely(active_memcg()))
1078                 return false;
1079
1080         /* Memcg to charge can't be determined. */
1081         if (!in_task() || !current->mm || (current->flags & PF_KTHREAD))
1082                 return true;
1083
1084         return false;
1085 }
1086
/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node in @reclaim to divide up the memcgs
 * in the hierarchy among all concurrent reclaimers operating on the
 * same node.
 *
 * A %NULL @root selects root_mem_cgroup.  %NULL is also returned when
 * the memory controller is disabled.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		/* Shared walk: the cursor lives in @root's per-node info. */
		mz = root->nodeinfo[reclaim->pgdat->node_id];
		iter = &mz->iter;

		/*
		 * On start, join the current reclaim iteration cycle.
		 * Exit when a concurrent walker completes it.
		 */
		if (!prev)
			reclaim->generation = iter->generation;
		else if (reclaim->generation != iter->generation)
			goto out_unlock;

		/* Pin the shared position, or settle on NULL if it is dying. */
		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	} else if (prev) {
		/* Full walk: continue from the caller's previous position. */
		pos = prev;
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference.  The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		if (css == &root->css || css_tryget(css)) {
			memcg = mem_cgroup_from_css(css);
			break;
		}
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		/* Drop the reference taken on the shared position above. */
		if (pos)
			css_put(&pos->css);

		/* No memcg found: this round-trip is over, bump the generation. */
		if (!memcg)
			iter->generation++;
	}

out_unlock:
	rcu_read_unlock();
	/* Release the caller's reference on @prev; no put is done for @root. */
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}
1204
1205 /**
1206  * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1207  * @root: hierarchy root
1208  * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1209  */
1210 void mem_cgroup_iter_break(struct mem_cgroup *root,
1211                            struct mem_cgroup *prev)
1212 {
1213         if (!root)
1214                 root = root_mem_cgroup;
1215         if (prev && prev != root)
1216                 css_put(&prev->css);
1217 }
1218
1219 static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1220                                         struct mem_cgroup *dead_memcg)
1221 {
1222         struct mem_cgroup_reclaim_iter *iter;
1223         struct mem_cgroup_per_node *mz;
1224         int nid;
1225
1226         for_each_node(nid) {
1227                 mz = from->nodeinfo[nid];
1228                 iter = &mz->iter;
1229                 cmpxchg(&iter->position, dead_memcg, NULL);
1230         }
1231 }
1232
1233 static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1234 {
1235         struct mem_cgroup *memcg = dead_memcg;
1236         struct mem_cgroup *last;
1237
1238         do {
1239                 __invalidate_reclaim_iterators(memcg, dead_memcg);
1240                 last = memcg;
1241         } while ((memcg = parent_mem_cgroup(memcg)));
1242
1243         /*
1244          * When cgroup1 non-hierarchy mode is used,
1245          * parent_mem_cgroup() does not walk all the way up to the
1246          * cgroup root (root_mem_cgroup). So we have to handle
1247          * dead_memcg from cgroup root separately.
1248          */
1249         if (!mem_cgroup_is_root(last))
1250                 __invalidate_reclaim_iterators(root_mem_cgroup,
1251                                                 dead_memcg);
1252 }
1253
1254 /**
1255  * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
1256  * @memcg: hierarchy root
1257  * @fn: function to call for each task
1258  * @arg: argument passed to @fn
1259  *
1260  * This function iterates over tasks attached to @memcg or to any of its
1261  * descendants and calls @fn for each task. If @fn returns a non-zero
1262  * value, the function breaks the iteration loop. Otherwise, it will iterate
1263  * over all tasks and return 0.
1264  *
1265  * This function must not be called for the root memory cgroup.
1266  */
1267 void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1268                            int (*fn)(struct task_struct *, void *), void *arg)
1269 {
1270         struct mem_cgroup *iter;
1271         int ret = 0;
1272
1273         BUG_ON(mem_cgroup_is_root(memcg));
1274
1275         for_each_mem_cgroup_tree(iter, memcg) {
1276                 struct css_task_iter it;
1277                 struct task_struct *task;
1278
1279                 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1280                 while (!ret && (task = css_task_iter_next(&it)))
1281                         ret = fn(task, arg);
1282                 css_task_iter_end(&it);
1283                 if (ret) {
1284                         mem_cgroup_iter_break(memcg, iter);
1285                         break;
1286                 }
1287         }
1288 }
1289
#ifdef CONFIG_DEBUG_VM
/*
 * Debug check that @lruvec belongs to @folio's memcg.  A folio without a
 * memcg must sit on a lruvec of the root memcg.  No-op when the memory
 * controller is disabled.
 */
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
	struct mem_cgroup *owner;

	if (mem_cgroup_disabled())
		return;

	owner = folio_memcg(folio);

	if (owner)
		VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != owner, folio);
	else
		VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
}
#endif
1306
1307 /**
1308  * folio_lruvec_lock - Lock the lruvec for a folio.
1309  * @folio: Pointer to the folio.
1310  *
1311  * These functions are safe to use under any of the following conditions:
1312  * - folio locked
1313  * - folio_test_lru false
1314  * - folio_memcg_lock()
1315  * - folio frozen (refcount of 0)
1316  *
1317  * Return: The lruvec this folio is on with its lock held.
1318  */
1319 struct lruvec *folio_lruvec_lock(struct folio *folio)
1320 {
1321         struct lruvec *lruvec = folio_lruvec(folio);
1322
1323         spin_lock(&lruvec->lru_lock);
1324         lruvec_memcg_debug(lruvec, folio);
1325
1326         return lruvec;
1327 }
1328
1329 /**
1330  * folio_lruvec_lock_irq - Lock the lruvec for a folio.
1331  * @folio: Pointer to the folio.
1332  *
1333  * These functions are safe to use under any of the following conditions:
1334  * - folio locked
1335  * - folio_test_lru false
1336  * - folio_memcg_lock()
1337  * - folio frozen (refcount of 0)
1338  *
1339  * Return: The lruvec this folio is on with its lock held and interrupts
1340  * disabled.
1341  */
1342 struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
1343 {
1344         struct lruvec *lruvec = folio_lruvec(folio);
1345
1346         spin_lock_irq(&lruvec->lru_lock);
1347         lruvec_memcg_debug(lruvec, folio);
1348
1349         return lruvec;
1350 }
1351
1352 /**
1353  * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
1354  * @folio: Pointer to the folio.
1355  * @flags: Pointer to irqsave flags.
1356  *
1357  * These functions are safe to use under any of the following conditions:
1358  * - folio locked
1359  * - folio_test_lru false
1360  * - folio_memcg_lock()
1361  * - folio frozen (refcount of 0)
1362  *
1363  * Return: The lruvec this folio is on with its lock held and interrupts
1364  * disabled.
1365  */
1366 struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
1367                 unsigned long *flags)
1368 {
1369         struct lruvec *lruvec = folio_lruvec(folio);
1370
1371         spin_lock_irqsave(&lruvec->lru_lock, *flags);
1372         lruvec_memcg_debug(lruvec, folio);
1373
1374         return lruvec;
1375 }
1376
/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @zid: zone id of the accounted pages
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called under lru_lock, just before a page is added
 * to or just after a page is removed from an lru list.
 *
 * Does nothing when the memory controller is disabled.
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int zid, int nr_pages)
{
	struct mem_cgroup_per_node *mz;
	unsigned long *lru_size;
	long size;

	if (mem_cgroup_disabled())
		return;

	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	lru_size = &mz->lru_zone_size[zid][lru];

	/* A removal is applied first, so the check below sees its result. */
	if (nr_pages < 0)
		*lru_size += nr_pages;

	/* Viewed as signed, a negative value means the counter underflowed. */
	size = *lru_size;
	if (WARN_ONCE(size < 0,
		"%s(%p, %d, %d): lru_size %ld\n",
		__func__, lruvec, lru, nr_pages, size)) {
		VM_BUG_ON(1);
		/* Reset the counter to zero after reporting the underflow. */
		*lru_size = 0;
	}

	/* An addition is applied only after the sanity check. */
	if (nr_pages > 0)
		*lru_size += nr_pages;
}
1414
1415 /**
1416  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1417  * @memcg: the memory cgroup
1418  *
1419  * Returns the maximum amount of memory @mem can be charged with, in
1420  * pages.
1421  */
1422 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1423 {
1424         unsigned long margin = 0;
1425         unsigned long count;
1426         unsigned long limit;
1427
1428         count = page_counter_read(&memcg->memory);
1429         limit = READ_ONCE(memcg->memory.max);
1430         if (count < limit)
1431                 margin = limit - count;
1432
1433         if (do_memsw_account()) {
1434                 count = page_counter_read(&memcg->memsw);
1435                 limit = READ_ONCE(memcg->memsw.max);
1436                 if (count < limit)
1437                         margin = min(margin, limit - count);
1438                 else
1439                         margin = 0;
1440         }
1441
1442         return margin;
1443 }
1444
1445 /*
1446  * A routine for checking "mem" is under move_account() or not.
1447  *
1448  * Checking a cgroup is mc.from or mc.to or under hierarchy of
1449  * moving cgroups. This is for waiting at high-memory pressure
1450  * caused by "move".
1451  */
1452 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1453 {
1454         struct mem_cgroup *from;
1455         struct mem_cgroup *to;
1456         bool ret = false;
1457         /*
1458          * Unlike task_move routines, we access mc.to, mc.from not under
1459          * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1460          */
1461         spin_lock(&mc.lock);
1462         from = mc.from;
1463         to = mc.to;
1464         if (!from)
1465                 goto unlock;
1466
1467         ret = mem_cgroup_is_descendant(from, memcg) ||
1468                 mem_cgroup_is_descendant(to, memcg);
1469 unlock:
1470         spin_unlock(&mc.lock);
1471         return ret;
1472 }
1473
1474 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1475 {
1476         if (mc.moving_task && current != mc.moving_task) {
1477                 if (mem_cgroup_under_move(memcg)) {
1478                         DEFINE_WAIT(wait);
1479                         prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1480                         /* moving charge context might have finished. */
1481                         if (mc.moving_task)
1482                                 schedule();
1483                         finish_wait(&mc.waitq, &wait);
1484                         return true;
1485                 }
1486         }
1487         return false;
1488 }
1489
/* One entry of the memory_stats[] table: a printable name and a stat item. */
struct memory_stat {
	const char *name;	/* label printed in front of the value */
	unsigned int idx;	/* stat item passed to memcg_page_state_output() */
};
1494
/*
 * Name/item pairs that memcg_stat_format() prints, one "<name> <value>"
 * line per entry, in the order listed here.  Some entries are only built
 * in when the corresponding config option is enabled.
 */
static const struct memory_stat memory_stats[] = {
	{ "anon",			NR_ANON_MAPPED			},
	{ "file",			NR_FILE_PAGES			},
	{ "kernel",			MEMCG_KMEM			},
	{ "kernel_stack",		NR_KERNEL_STACK_KB		},
	{ "pagetables",			NR_PAGETABLE			},
	{ "sec_pagetables",		NR_SECONDARY_PAGETABLE		},
	{ "percpu",			MEMCG_PERCPU_B			},
	{ "sock",			MEMCG_SOCK			},
	{ "vmalloc",			MEMCG_VMALLOC			},
	{ "shmem",			NR_SHMEM			},
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
	{ "zswap",			MEMCG_ZSWAP_B			},
	{ "zswapped",			MEMCG_ZSWAPPED			},
#endif
	{ "file_mapped",		NR_FILE_MAPPED			},
	{ "file_dirty",			NR_FILE_DIRTY			},
	{ "file_writeback",		NR_WRITEBACK			},
#ifdef CONFIG_SWAP
	{ "swapcached",			NR_SWAPCACHE			},
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	{ "anon_thp",			NR_ANON_THPS			},
	{ "file_thp",			NR_FILE_THPS			},
	{ "shmem_thp",			NR_SHMEM_THPS			},
#endif
	{ "inactive_anon",		NR_INACTIVE_ANON		},
	{ "active_anon",		NR_ACTIVE_ANON			},
	{ "inactive_file",		NR_INACTIVE_FILE		},
	{ "active_file",		NR_ACTIVE_FILE			},
	{ "unevictable",		NR_UNEVICTABLE			},
	{ "slab_reclaimable",		NR_SLAB_RECLAIMABLE_B		},
	/* memcg_stat_format() prints a combined "slab" line after this entry */
	{ "slab_unreclaimable",		NR_SLAB_UNRECLAIMABLE_B		},

	/* The memory events */
	{ "workingset_refault_anon",	WORKINGSET_REFAULT_ANON		},
	{ "workingset_refault_file",	WORKINGSET_REFAULT_FILE		},
	{ "workingset_activate_anon",	WORKINGSET_ACTIVATE_ANON	},
	{ "workingset_activate_file",	WORKINGSET_ACTIVATE_FILE	},
	{ "workingset_restore_anon",	WORKINGSET_RESTORE_ANON		},
	{ "workingset_restore_file",	WORKINGSET_RESTORE_FILE		},
	{ "workingset_nodereclaim",	WORKINGSET_NODERECLAIM		},
};
1538
1539 /* Translate stat items to the correct unit for memory.stat output */
1540 static int memcg_page_state_unit(int item)
1541 {
1542         switch (item) {
1543         case MEMCG_PERCPU_B:
1544         case MEMCG_ZSWAP_B:
1545         case NR_SLAB_RECLAIMABLE_B:
1546         case NR_SLAB_UNRECLAIMABLE_B:
1547         case WORKINGSET_REFAULT_ANON:
1548         case WORKINGSET_REFAULT_FILE:
1549         case WORKINGSET_ACTIVATE_ANON:
1550         case WORKINGSET_ACTIVATE_FILE:
1551         case WORKINGSET_RESTORE_ANON:
1552         case WORKINGSET_RESTORE_FILE:
1553         case WORKINGSET_NODERECLAIM:
1554                 return 1;
1555         case NR_KERNEL_STACK_KB:
1556                 return SZ_1K;
1557         default:
1558                 return PAGE_SIZE;
1559         }
1560 }
1561
/* Read stat @item of @memcg and scale it by memcg_page_state_unit(). */
static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
						    int item)
{
	unsigned long raw = memcg_page_state(memcg, item);
	int unit = memcg_page_state_unit(item);

	return raw * unit;
}
1567
1568 static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
1569 {
1570         int i;
1571
1572         /*
1573          * Provide statistics on the state of the memory subsystem as
1574          * well as cumulative event counters that show past behavior.
1575          *
1576          * This list is ordered following a combination of these gradients:
1577          * 1) generic big picture -> specifics and details
1578          * 2) reflecting userspace activity -> reflecting kernel heuristics
1579          *
1580          * Current memory state:
1581          */
1582         mem_cgroup_flush_stats();
1583
1584         for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1585                 u64 size;
1586
1587                 size = memcg_page_state_output(memcg, memory_stats[i].idx);
1588                 seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);
1589
1590                 if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1591                         size += memcg_page_state_output(memcg,
1592                                                         NR_SLAB_RECLAIMABLE_B);
1593                         seq_buf_printf(s, "slab %llu\n", size);
1594                 }
1595         }
1596
1597         /* Accumulated memory events */
1598         seq_buf_printf(s, "pgscan %lu\n",
1599                        memcg_events(memcg, PGSCAN_KSWAPD) +
1600                        memcg_events(memcg, PGSCAN_DIRECT) +
1601                        memcg_events(memcg, PGSCAN_KHUGEPAGED));
1602         seq_buf_printf(s, "pgsteal %lu\n",
1603                        memcg_events(memcg, PGSTEAL_KSWAPD) +
1604                        memcg_events(memcg, PGSTEAL_DIRECT) +
1605                        memcg_events(memcg, PGSTEAL_KHUGEPAGED));
1606
1607         for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
1608                 if (memcg_vm_event_stat[i] == PGPGIN ||
1609                     memcg_vm_event_stat[i] == PGPGOUT)
1610                         continue;
1611
1612                 seq_buf_printf(s, "%s %lu\n",
1613                                vm_event_name(memcg_vm_event_stat[i]),
1614                                memcg_events(memcg, memcg_vm_event_stat[i]));
1615         }
1616
1617         /* The above should easily fit into one page */
1618         WARN_ON_ONCE(seq_buf_has_overflowed(s));
1619 }
1620
1621 static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
1622
1623 static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
1624 {
1625         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1626                 memcg_stat_format(memcg, s);
1627         else
1628                 memcg1_stat_format(memcg, s);
1629         WARN_ON_ONCE(seq_buf_has_overflowed(s));
1630 }
1631
1632 #define K(x) ((x) << (PAGE_SHIFT-10))
1633 /**
1634  * mem_cgroup_print_oom_context: Print OOM information relevant to
1635  * memory controller.
1636  * @memcg: The memory cgroup that went over limit
1637  * @p: Task that is going to be killed
1638  *
1639  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1640  * enabled
1641  */
1642 void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1643 {
1644         rcu_read_lock();
1645
1646         if (memcg) {
1647                 pr_cont(",oom_memcg=");
1648                 pr_cont_cgroup_path(memcg->css.cgroup);
1649         } else
1650                 pr_cont(",global_oom");
1651         if (p) {
1652                 pr_cont(",task_memcg=");
1653                 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1654         }
1655         rcu_read_unlock();
1656 }
1657
1658 /**
1659  * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1660  * memory controller.
1661  * @memcg: The memory cgroup that went over limit
1662  */
1663 void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1664 {
1665         /* Use static buffer, for the caller is holding oom_lock. */
1666         static char buf[PAGE_SIZE];
1667         struct seq_buf s;
1668
1669         lockdep_assert_held(&oom_lock);
1670
1671         pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1672                 K((u64)page_counter_read(&memcg->memory)),
1673                 K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1674         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1675                 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1676                         K((u64)page_counter_read(&memcg->swap)),
1677                         K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1678         else {
1679                 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1680                         K((u64)page_counter_read(&memcg->memsw)),
1681                         K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1682                 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1683                         K((u64)page_counter_read(&memcg->kmem)),
1684                         K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1685         }
1686
1687         pr_info("Memory cgroup stats for ");
1688         pr_cont_cgroup_path(memcg->css.cgroup);
1689         pr_cont(":");
1690         seq_buf_init(&s, buf, sizeof(buf));
1691         memory_stat_format(memcg, &s);
1692         seq_buf_do_printk(&s, KERN_INFO);
1693 }
1694
1695 /*
1696  * Return the memory (and swap, if configured) limit for a memcg.
1697  */
1698 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1699 {
1700         unsigned long max = READ_ONCE(memcg->memory.max);
1701
1702         if (do_memsw_account()) {
1703                 if (mem_cgroup_swappiness(memcg)) {
1704                         /* Calculate swap excess capacity from memsw limit */
1705                         unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
1706
1707                         max += min(swap, (unsigned long)total_swap_pages);
1708                 }
1709         } else {
1710                 if (mem_cgroup_swappiness(memcg))
1711                         max += min(READ_ONCE(memcg->swap.max),
1712                                    (unsigned long)total_swap_pages);
1713         }
1714         return max;
1715 }
1716
1717 unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1718 {
1719         return page_counter_read(&memcg->memory);
1720 }
1721
1722 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1723                                      int order)
1724 {
1725         struct oom_control oc = {
1726                 .zonelist = NULL,
1727                 .nodemask = NULL,
1728                 .memcg = memcg,
1729                 .gfp_mask = gfp_mask,
1730                 .order = order,
1731         };
1732         bool ret = true;
1733
1734         if (mutex_lock_killable(&oom_lock))
1735                 return true;
1736
1737         if (mem_cgroup_margin(memcg) >= (1 << order))
1738                 goto unlock;
1739
1740         /*
1741          * A few threads which were not waiting at mutex_lock_killable() can
1742          * fail to bail out. Therefore, check again after holding oom_lock.
1743          */
1744         ret = task_is_dying() || out_of_memory(&oc);
1745
1746 unlock:
1747         mutex_unlock(&oom_lock);
1748         return ret;
1749 }
1750
1751 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1752                                    pg_data_t *pgdat,
1753                                    gfp_t gfp_mask,
1754                                    unsigned long *total_scanned)
1755 {
1756         struct mem_cgroup *victim = NULL;
1757         int total = 0;
1758         int loop = 0;
1759         unsigned long excess;
1760         unsigned long nr_scanned;
1761         struct mem_cgroup_reclaim_cookie reclaim = {
1762                 .pgdat = pgdat,
1763         };
1764
1765         excess = soft_limit_excess(root_memcg);
1766
1767         while (1) {
1768                 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1769                 if (!victim) {
1770                         loop++;
1771                         if (loop >= 2) {
1772                                 /*
1773                                  * If we have not been able to reclaim
1774                                  * anything, it might because there are
1775                                  * no reclaimable pages under this hierarchy
1776                                  */
1777                                 if (!total)
1778                                         break;
1779                                 /*
1780                                  * We want to do more targeted reclaim.
1781                                  * excess >> 2 is not to excessive so as to
1782                                  * reclaim too much, nor too less that we keep
1783                                  * coming back to reclaim from this cgroup
1784                                  */
1785                                 if (total >= (excess >> 2) ||
1786                                         (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1787                                         break;
1788                         }
1789                         continue;
1790                 }
1791                 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1792                                         pgdat, &nr_scanned);
1793                 *total_scanned += nr_scanned;
1794                 if (!soft_limit_excess(root_memcg))
1795                         break;
1796         }
1797         mem_cgroup_iter_break(root_memcg, victim);
1798         return total;
1799 }
1800
/*
 * Held around every hierarchy walk that reads or writes the per-memcg
 * ->oom_lock and ->under_oom fields.
 */
static DEFINE_SPINLOCK(memcg_oom_lock);

#ifdef CONFIG_LOCKDEP
/*
 * Lockdep map for the hierarchical OOM lock: acquired in
 * mem_cgroup_oom_trylock() on success, released in
 * mem_cgroup_oom_unlock().
 */
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif
1808
1809 /*
1810  * Check OOM-Killer is already running under our hierarchy.
1811  * If someone is running, return false.
1812  */
1813 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1814 {
1815         struct mem_cgroup *iter, *failed = NULL;
1816
1817         spin_lock(&memcg_oom_lock);
1818
1819         for_each_mem_cgroup_tree(iter, memcg) {
1820                 if (iter->oom_lock) {
1821                         /*
1822                          * this subtree of our hierarchy is already locked
1823                          * so we cannot give a lock.
1824                          */
1825                         failed = iter;
1826                         mem_cgroup_iter_break(memcg, iter);
1827                         break;
1828                 } else
1829                         iter->oom_lock = true;
1830         }
1831
1832         if (failed) {
1833                 /*
1834                  * OK, we failed to lock the whole subtree so we have
1835                  * to clean up what we set up to the failing subtree
1836                  */
1837                 for_each_mem_cgroup_tree(iter, memcg) {
1838                         if (iter == failed) {
1839                                 mem_cgroup_iter_break(memcg, iter);
1840                                 break;
1841                         }
1842                         iter->oom_lock = false;
1843                 }
1844         } else
1845                 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1846
1847         spin_unlock(&memcg_oom_lock);
1848
1849         return !failed;
1850 }
1851
1852 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1853 {
1854         struct mem_cgroup *iter;
1855
1856         spin_lock(&memcg_oom_lock);
1857         mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
1858         for_each_mem_cgroup_tree(iter, memcg)
1859                 iter->oom_lock = false;
1860         spin_unlock(&memcg_oom_lock);
1861 }
1862
1863 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1864 {
1865         struct mem_cgroup *iter;
1866
1867         spin_lock(&memcg_oom_lock);
1868         for_each_mem_cgroup_tree(iter, memcg)
1869                 iter->under_oom++;
1870         spin_unlock(&memcg_oom_lock);
1871 }
1872
1873 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1874 {
1875         struct mem_cgroup *iter;
1876
1877         /*
1878          * Be careful about under_oom underflows because a child memcg
1879          * could have been added after mem_cgroup_mark_under_oom.
1880          */
1881         spin_lock(&memcg_oom_lock);
1882         for_each_mem_cgroup_tree(iter, memcg)
1883                 if (iter->under_oom > 0)
1884                         iter->under_oom--;
1885         spin_unlock(&memcg_oom_lock);
1886 }
1887
1888 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1889
1890 struct oom_wait_info {
1891         struct mem_cgroup *memcg;
1892         wait_queue_entry_t      wait;
1893 };
1894
1895 static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1896         unsigned mode, int sync, void *arg)
1897 {
1898         struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1899         struct mem_cgroup *oom_wait_memcg;
1900         struct oom_wait_info *oom_wait_info;
1901
1902         oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1903         oom_wait_memcg = oom_wait_info->memcg;
1904
1905         if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1906             !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1907                 return 0;
1908         return autoremove_wake_function(wait, mode, sync, arg);
1909 }
1910
1911 static void memcg_oom_recover(struct mem_cgroup *memcg)
1912 {
1913         /*
1914          * For the following lockless ->under_oom test, the only required
1915          * guarantee is that it must see the state asserted by an OOM when
1916          * this function is called as a result of userland actions
1917          * triggered by the notification of the OOM.  This is trivially
1918          * achieved by invoking mem_cgroup_mark_under_oom() before
1919          * triggering notification.
1920          */
1921         if (memcg && memcg->under_oom)
1922                 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1923 }
1924
1925 /*
1926  * Returns true if successfully killed one or more processes. Though in some
1927  * corner cases it can return true even without killing any process.
1928  */
1929 static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1930 {
1931         bool locked, ret;
1932
1933         if (order > PAGE_ALLOC_COSTLY_ORDER)
1934                 return false;
1935
1936         memcg_memory_event(memcg, MEMCG_OOM);
1937
1938         /*
1939          * We are in the middle of the charge context here, so we
1940          * don't want to block when potentially sitting on a callstack
1941          * that holds all kinds of filesystem and mm locks.
1942          *
1943          * cgroup1 allows disabling the OOM killer and waiting for outside
1944          * handling until the charge can succeed; remember the context and put
1945          * the task to sleep at the end of the page fault when all locks are
1946          * released.
1947          *
1948          * On the other hand, in-kernel OOM killer allows for an async victim
1949          * memory reclaim (oom_reaper) and that means that we are not solely
1950          * relying on the oom victim to make a forward progress and we can
1951          * invoke the oom killer here.
1952          *
1953          * Please note that mem_cgroup_out_of_memory might fail to find a
1954          * victim and then we have to bail out from the charge path.
1955          */
1956         if (READ_ONCE(memcg->oom_kill_disable)) {
1957                 if (current->in_user_fault) {
1958                         css_get(&memcg->css);
1959                         current->memcg_in_oom = memcg;
1960                         current->memcg_oom_gfp_mask = mask;
1961                         current->memcg_oom_order = order;
1962                 }
1963                 return false;
1964         }
1965
1966         mem_cgroup_mark_under_oom(memcg);
1967
1968         locked = mem_cgroup_oom_trylock(memcg);
1969
1970         if (locked)
1971                 mem_cgroup_oom_notify(memcg);
1972
1973         mem_cgroup_unmark_under_oom(memcg);
1974         ret = mem_cgroup_out_of_memory(memcg, mask, order);
1975
1976         if (locked)
1977                 mem_cgroup_oom_unlock(memcg);
1978
1979         return ret;
1980 }
1981
1982 /**
1983  * mem_cgroup_oom_synchronize - complete memcg OOM handling
1984  * @handle: actually kill/wait or just clean up the OOM state
1985  *
1986  * This has to be called at the end of a page fault if the memcg OOM
1987  * handler was enabled.
1988  *
1989  * Memcg supports userspace OOM handling where failed allocations must
1990  * sleep on a waitqueue until the userspace task resolves the
1991  * situation.  Sleeping directly in the charge context with all kinds
1992  * of locks held is not a good idea, instead we remember an OOM state
1993  * in the task and mem_cgroup_oom_synchronize() has to be called at
1994  * the end of the page fault to complete the OOM handling.
1995  *
1996  * Returns %true if an ongoing memcg OOM situation was detected and
1997  * completed, %false otherwise.
1998  */
1999 bool mem_cgroup_oom_synchronize(bool handle)
2000 {
2001         struct mem_cgroup *memcg = current->memcg_in_oom;
2002         struct oom_wait_info owait;
2003         bool locked;
2004
2005         /* OOM is global, do not handle */
2006         if (!memcg)
2007                 return false;
2008
2009         if (!handle)
2010                 goto cleanup;
2011
2012         owait.memcg = memcg;
2013         owait.wait.flags = 0;
2014         owait.wait.func = memcg_oom_wake_function;
2015         owait.wait.private = current;
2016         INIT_LIST_HEAD(&owait.wait.entry);
2017
2018         prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2019         mem_cgroup_mark_under_oom(memcg);
2020
2021         locked = mem_cgroup_oom_trylock(memcg);
2022
2023         if (locked)
2024                 mem_cgroup_oom_notify(memcg);
2025
2026         schedule();
2027         mem_cgroup_unmark_under_oom(memcg);
2028         finish_wait(&memcg_oom_waitq, &owait.wait);
2029
2030         if (locked)
2031                 mem_cgroup_oom_unlock(memcg);
2032 cleanup:
2033         current->memcg_in_oom = NULL;
2034         css_put(&memcg->css);
2035         return true;
2036 }
2037
2038 /**
2039  * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
2040  * @victim: task to be killed by the OOM killer
2041  * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
2042  *
2043  * Returns a pointer to a memory cgroup, which has to be cleaned up
2044  * by killing all belonging OOM-killable tasks.
2045  *
2046  * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
2047  */
2048 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
2049                                             struct mem_cgroup *oom_domain)
2050 {
2051         struct mem_cgroup *oom_group = NULL;
2052         struct mem_cgroup *memcg;
2053
2054         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2055                 return NULL;
2056
2057         if (!oom_domain)
2058                 oom_domain = root_mem_cgroup;
2059
2060         rcu_read_lock();
2061
2062         memcg = mem_cgroup_from_task(victim);
2063         if (mem_cgroup_is_root(memcg))
2064                 goto out;
2065
2066         /*
2067          * If the victim task has been asynchronously moved to a different
2068          * memory cgroup, we might end up killing tasks outside oom_domain.
2069          * In this case it's better to ignore memory.group.oom.
2070          */
2071         if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
2072                 goto out;
2073
2074         /*
2075          * Traverse the memory cgroup hierarchy from the victim task's
2076          * cgroup up to the OOMing cgroup (or root) to find the
2077          * highest-level memory cgroup with oom.group set.
2078          */
2079         for (; memcg; memcg = parent_mem_cgroup(memcg)) {
2080                 if (READ_ONCE(memcg->oom_group))
2081                         oom_group = memcg;
2082
2083                 if (memcg == oom_domain)
2084                         break;
2085         }
2086
2087         if (oom_group)
2088                 css_get(&oom_group->css);
2089 out:
2090         rcu_read_unlock();
2091
2092         return oom_group;
2093 }
2094
2095 void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
2096 {
2097         pr_info("Tasks in ");
2098         pr_cont_cgroup_path(memcg->css.cgroup);
2099         pr_cont(" are going to be killed due to memory.oom.group set\n");
2100 }
2101
2102 /**
2103  * folio_memcg_lock - Bind a folio to its memcg.
2104  * @folio: The folio.
2105  *
2106  * This function prevents unlocked LRU folios from being moved to
2107  * another cgroup.
2108  *
2109  * It ensures lifetime of the bound memcg.  The caller is responsible
2110  * for the lifetime of the folio.
2111  */
2112 void folio_memcg_lock(struct folio *folio)
2113 {
2114         struct mem_cgroup *memcg;
2115         unsigned long flags;
2116
2117         /*
2118          * The RCU lock is held throughout the transaction.  The fast
2119          * path can get away without acquiring the memcg->move_lock
2120          * because page moving starts with an RCU grace period.
2121          */
2122         rcu_read_lock();
2123
2124         if (mem_cgroup_disabled())
2125                 return;
2126 again:
2127         memcg = folio_memcg(folio);
2128         if (unlikely(!memcg))
2129                 return;
2130
2131 #ifdef CONFIG_PROVE_LOCKING
2132         local_irq_save(flags);
2133         might_lock(&memcg->move_lock);
2134         local_irq_restore(flags);
2135 #endif
2136
2137         if (atomic_read(&memcg->moving_account) <= 0)
2138                 return;
2139
2140         spin_lock_irqsave(&memcg->move_lock, flags);
2141         if (memcg != folio_memcg(folio)) {
2142                 spin_unlock_irqrestore(&memcg->move_lock, flags);
2143                 goto again;
2144         }
2145
2146         /*
2147          * When charge migration first begins, we can have multiple
2148          * critical sections holding the fast-path RCU lock and one
2149          * holding the slowpath move_lock. Track the task who has the
2150          * move_lock for folio_memcg_unlock().
2151          */
2152         memcg->move_lock_task = current;
2153         memcg->move_lock_flags = flags;
2154 }
2155
2156 static void __folio_memcg_unlock(struct mem_cgroup *memcg)
2157 {
2158         if (memcg && memcg->move_lock_task == current) {
2159                 unsigned long flags = memcg->move_lock_flags;
2160
2161                 memcg->move_lock_task = NULL;
2162                 memcg->move_lock_flags = 0;
2163
2164                 spin_unlock_irqrestore(&memcg->move_lock, flags);
2165         }
2166
2167         rcu_read_unlock();
2168 }
2169
/**
 * folio_memcg_unlock - Release the binding between a folio and its memcg.
 * @folio: The folio.
 *
 * This releases the binding created by folio_memcg_lock().  This does
 * not change the accounting of this folio to its memcg, but it does
 * permit others to change it.
 */
void folio_memcg_unlock(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);

	__folio_memcg_unlock(memcg);
}
2182
2183 struct memcg_stock_pcp {
2184         local_lock_t stock_lock;
2185         struct mem_cgroup *cached; /* this never be root cgroup */
2186         unsigned int nr_pages;
2187
2188 #ifdef CONFIG_MEMCG_KMEM
2189         struct obj_cgroup *cached_objcg;
2190         struct pglist_data *cached_pgdat;
2191         unsigned int nr_bytes;
2192         int nr_slab_reclaimable_b;
2193         int nr_slab_unreclaimable_b;
2194 #endif
2195
2196         struct work_struct work;
2197         unsigned long flags;
2198 #define FLUSHING_CACHED_CHARGE  0
2199 };
2200 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
2201         .stock_lock = INIT_LOCAL_LOCK(stock_lock),
2202 };
2203 static DEFINE_MUTEX(percpu_charge_mutex);
2204
2205 #ifdef CONFIG_MEMCG_KMEM
2206 static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
2207 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2208                                      struct mem_cgroup *root_memcg);
2209 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);
2210
2211 #else
2212 static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
2213 {
2214         return NULL;
2215 }
2216 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2217                                      struct mem_cgroup *root_memcg)
2218 {
2219         return false;
2220 }
2221 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
2222 {
2223 }
2224 #endif
2225
2226 /**
2227  * consume_stock: Try to consume stocked charge on this cpu.
2228  * @memcg: memcg to consume from.
2229  * @nr_pages: how many pages to charge.
2230  *
2231  * The charges will only happen if @memcg matches the current cpu's memcg
2232  * stock, and at least @nr_pages are available in that stock.  Failure to
2233  * service an allocation will refill the stock.
2234  *
2235  * returns true if successful, false otherwise.
2236  */
2237 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2238 {
2239         struct memcg_stock_pcp *stock;
2240         unsigned long flags;
2241         bool ret = false;
2242
2243         if (nr_pages > MEMCG_CHARGE_BATCH)
2244                 return ret;
2245
2246         local_lock_irqsave(&memcg_stock.stock_lock, flags);
2247
2248         stock = this_cpu_ptr(&memcg_stock);
2249         if (memcg == READ_ONCE(stock->cached) && stock->nr_pages >= nr_pages) {
2250                 stock->nr_pages -= nr_pages;
2251                 ret = true;
2252         }
2253
2254         local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2255
2256         return ret;
2257 }
2258
2259 /*
2260  * Returns stocks cached in percpu and reset cached information.
2261  */
2262 static void drain_stock(struct memcg_stock_pcp *stock)
2263 {
2264         struct mem_cgroup *old = READ_ONCE(stock->cached);
2265
2266         if (!old)
2267                 return;
2268
2269         if (stock->nr_pages) {
2270                 page_counter_uncharge(&old->memory, stock->nr_pages);
2271                 if (do_memsw_account())
2272                         page_counter_uncharge(&old->memsw, stock->nr_pages);
2273                 stock->nr_pages = 0;
2274         }
2275
2276         css_put(&old->css);
2277         WRITE_ONCE(stock->cached, NULL);
2278 }
2279
2280 static void drain_local_stock(struct work_struct *dummy)
2281 {
2282         struct memcg_stock_pcp *stock;
2283         struct obj_cgroup *old = NULL;
2284         unsigned long flags;
2285
2286         /*
2287          * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
2288          * drain_stock races is that we always operate on local CPU stock
2289          * here with IRQ disabled
2290          */
2291         local_lock_irqsave(&memcg_stock.stock_lock, flags);
2292
2293         stock = this_cpu_ptr(&memcg_stock);
2294         old = drain_obj_stock(stock);
2295         drain_stock(stock);
2296         clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2297
2298         local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2299         if (old)
2300                 obj_cgroup_put(old);
2301 }
2302
2303 /*
2304  * Cache charges(val) to local per_cpu area.
2305  * This will be consumed by consume_stock() function, later.
2306  */
2307 static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2308 {
2309         struct memcg_stock_pcp *stock;
2310
2311         stock = this_cpu_ptr(&memcg_stock);
2312         if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
2313                 drain_stock(stock);
2314                 css_get(&memcg->css);
2315                 WRITE_ONCE(stock->cached, memcg);
2316         }
2317         stock->nr_pages += nr_pages;
2318
2319         if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2320                 drain_stock(stock);
2321 }
2322
2323 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2324 {
2325         unsigned long flags;
2326
2327         local_lock_irqsave(&memcg_stock.stock_lock, flags);
2328         __refill_stock(memcg, nr_pages);
2329         local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2330 }
2331
2332 /*
2333  * Drains all per-CPU charge caches for given root_memcg resp. subtree
2334  * of the hierarchy under it.
2335  */
2336 static void drain_all_stock(struct mem_cgroup *root_memcg)
2337 {
2338         int cpu, curcpu;
2339
2340         /* If someone's already draining, avoid adding running more workers. */
2341         if (!mutex_trylock(&percpu_charge_mutex))
2342                 return;
2343         /*
2344          * Notify other cpus that system-wide "drain" is running
2345          * We do not care about races with the cpu hotplug because cpu down
2346          * as well as workers from this path always operate on the local
2347          * per-cpu data. CPU up doesn't touch memcg_stock at all.
2348          */
2349         migrate_disable();
2350         curcpu = smp_processor_id();
2351         for_each_online_cpu(cpu) {
2352                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2353                 struct mem_cgroup *memcg;
2354                 bool flush = false;
2355
2356                 rcu_read_lock();
2357                 memcg = READ_ONCE(stock->cached);
2358                 if (memcg && stock->nr_pages &&
2359                     mem_cgroup_is_descendant(memcg, root_memcg))
2360                         flush = true;
2361                 else if (obj_stock_flush_required(stock, root_memcg))
2362                         flush = true;
2363                 rcu_read_unlock();
2364
2365                 if (flush &&
2366                     !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2367                         if (cpu == curcpu)
2368                                 drain_local_stock(&stock->work);
2369                         else if (!cpu_is_isolated(cpu))
2370                                 schedule_work_on(cpu, &stock->work);
2371                 }
2372         }
2373         migrate_enable();
2374         mutex_unlock(&percpu_charge_mutex);
2375 }
2376
2377 static int memcg_hotplug_cpu_dead(unsigned int cpu)
2378 {
2379         struct memcg_stock_pcp *stock;
2380
2381         stock = &per_cpu(memcg_stock, cpu);
2382         drain_stock(stock);
2383
2384         return 0;
2385 }
2386
2387 static unsigned long reclaim_high(struct mem_cgroup *memcg,
2388                                   unsigned int nr_pages,
2389                                   gfp_t gfp_mask)
2390 {
2391         unsigned long nr_reclaimed = 0;
2392
2393         do {
2394                 unsigned long pflags;
2395
2396                 if (page_counter_read(&memcg->memory) <=
2397                     READ_ONCE(memcg->memory.high))
2398                         continue;
2399
2400                 memcg_memory_event(memcg, MEMCG_HIGH);
2401
2402                 psi_memstall_enter(&pflags);
2403                 nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2404                                                         gfp_mask,
2405                                                         MEMCG_RECLAIM_MAY_SWAP);
2406                 psi_memstall_leave(&pflags);
2407         } while ((memcg = parent_mem_cgroup(memcg)) &&
2408                  !mem_cgroup_is_root(memcg));
2409
2410         return nr_reclaimed;
2411 }
2412
2413 static void high_work_func(struct work_struct *work)
2414 {
2415         struct mem_cgroup *memcg;
2416
2417         memcg = container_of(work, struct mem_cgroup, high_work);
2418         reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2419 }
2420
/*
 * Upper bound on the sleep time per allocation batch: 2 seconds, in
 * jiffies.  This is enough to still cause a significant slowdown in most
 * cases, while still allowing diagnostics and tracing to proceed without
 * becoming stuck.
 */
#define MEMCG_MAX_HIGH_DELAY_JIFFIES	(2UL * HZ)
2427
/*
 * When calculating the delay, we use these on either side of the
 * exponentiation to maintain precision and scale to a reasonable number of
 * jiffies (see the table below).
 *
 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
 *   overage ratio to a delay.
 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
 *   proposed penalty in order to reduce to a reasonable number of jiffies, and
 *   to produce a reasonable delay curve.
 *
 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
 * reasonable delay curve compared to precision-adjusted overage, not
 * penalising heavily at first, but still making sure that growth beyond the
 * limit penalises misbehaving cgroups by slowing them down exponentially. For
 * example, with a high of 100 megabytes:
 *
 *  +-------+------------------------+
 *  | usage | time to allocate in ms |
 *  +-------+------------------------+
 *  | 100M  |                      0 |
 *  | 101M  |                      6 |
 *  | 102M  |                     25 |
 *  | 103M  |                     57 |
 *  | 104M  |                    102 |
 *  | 105M  |                    159 |
 *  | 106M  |                    230 |
 *  | 107M  |                    313 |
 *  | 108M  |                    409 |
 *  | 109M  |                    518 |
 *  | 110M  |                    639 |
 *  | 111M  |                    774 |
 *  | 112M  |                    921 |
 *  | 113M  |                   1081 |
 *  | 114M  |                   1254 |
 *  | 115M  |                   1439 |
 *  | 116M  |                   1638 |
 *  | 117M  |                   1849 |
 *  | 118M  |                   2000 |
 *  | 119M  |                   2000 |
 *  | 120M  |                   2000 |
 *  +-------+------------------------+
 */
#define MEMCG_DELAY_PRECISION_SHIFT	20
#define MEMCG_DELAY_SCALING_SHIFT	14
2473
2474 static u64 calculate_overage(unsigned long usage, unsigned long high)
2475 {
2476         u64 overage;
2477
2478         if (usage <= high)
2479                 return 0;
2480
2481         /*
2482          * Prevent division by 0 in overage calculation by acting as if
2483          * it was a threshold of 1 page
2484          */
2485         high = max(high, 1UL);
2486
2487         overage = usage - high;
2488         overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2489         return div64_u64(overage, high);
2490 }
2491
2492 static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2493 {
2494         u64 overage, max_overage = 0;
2495
2496         do {
2497                 overage = calculate_overage(page_counter_read(&memcg->memory),
2498                                             READ_ONCE(memcg->memory.high));
2499                 max_overage = max(overage, max_overage);
2500         } while ((memcg = parent_mem_cgroup(memcg)) &&
2501                  !mem_cgroup_is_root(memcg));
2502
2503         return max_overage;
2504 }
2505
2506 static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2507 {
2508         u64 overage, max_overage = 0;
2509
2510         do {
2511                 overage = calculate_overage(page_counter_read(&memcg->swap),
2512                                             READ_ONCE(memcg->swap.high));
2513                 if (overage)
2514                         memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2515                 max_overage = max(overage, max_overage);
2516         } while ((memcg = parent_mem_cgroup(memcg)) &&
2517                  !mem_cgroup_is_root(memcg));
2518
2519         return max_overage;
2520 }
2521
2522 /*
2523  * Get the number of jiffies that we should penalise a mischievous cgroup which
2524  * is exceeding its memory.high by checking both it and its ancestors.
2525  */
2526 static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2527                                           unsigned int nr_pages,
2528                                           u64 max_overage)
2529 {
2530         unsigned long penalty_jiffies;
2531
2532         if (!max_overage)
2533                 return 0;
2534
2535         /*
2536          * We use overage compared to memory.high to calculate the number of
2537          * jiffies to sleep (penalty_jiffies). Ideally this value should be
2538          * fairly lenient on small overages, and increasingly harsh when the
2539          * memcg in question makes it clear that it has no intention of stopping
2540          * its crazy behaviour, so we exponentially increase the delay based on
2541          * overage amount.
2542          */
2543         penalty_jiffies = max_overage * max_overage * HZ;
2544         penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2545         penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2546
2547         /*
2548          * Factor in the task's own contribution to the overage, such that four
2549          * N-sized allocations are throttled approximately the same as one
2550          * 4N-sized allocation.
2551          *
2552          * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2553          * larger the current charge patch is than that.
2554          */
2555         return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2556 }
2557
2558 /*
2559  * Scheduled by try_charge() to be executed from the userland return path
2560  * and reclaims memory over the high limit.
2561  */
2562 void mem_cgroup_handle_over_high(void)
2563 {
2564         unsigned long penalty_jiffies;
2565         unsigned long pflags;
2566         unsigned long nr_reclaimed;
2567         unsigned int nr_pages = current->memcg_nr_pages_over_high;
2568         int nr_retries = MAX_RECLAIM_RETRIES;
2569         struct mem_cgroup *memcg;
2570         bool in_retry = false;
2571
2572         if (likely(!nr_pages))
2573                 return;
2574
2575         memcg = get_mem_cgroup_from_mm(current->mm);
2576         current->memcg_nr_pages_over_high = 0;
2577
2578 retry_reclaim:
2579         /*
2580          * The allocating task should reclaim at least the batch size, but for
2581          * subsequent retries we only want to do what's necessary to prevent oom
2582          * or breaching resource isolation.
2583          *
2584          * This is distinct from memory.max or page allocator behaviour because
2585          * memory.high is currently batched, whereas memory.max and the page
2586          * allocator run every time an allocation is made.
2587          */
2588         nr_reclaimed = reclaim_high(memcg,
2589                                     in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2590                                     GFP_KERNEL);
2591
2592         /*
2593          * memory.high is breached and reclaim is unable to keep up. Throttle
2594          * allocators proactively to slow down excessive growth.
2595          */
2596         penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2597                                                mem_find_max_overage(memcg));
2598
2599         penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2600                                                 swap_find_max_overage(memcg));
2601
2602         /*
2603          * Clamp the max delay per usermode return so as to still keep the
2604          * application moving forwards and also permit diagnostics, albeit
2605          * extremely slowly.
2606          */
2607         penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2608
2609         /*
2610          * Don't sleep if the amount of jiffies this memcg owes us is so low
2611          * that it's not even worth doing, in an attempt to be nice to those who
2612          * go only a small amount over their memory.high value and maybe haven't
2613          * been aggressively reclaimed enough yet.
2614          */
2615         if (penalty_jiffies <= HZ / 100)
2616                 goto out;
2617
2618         /*
2619          * If reclaim is making forward progress but we're still over
2620          * memory.high, we want to encourage that rather than doing allocator
2621          * throttling.
2622          */
2623         if (nr_reclaimed || nr_retries--) {
2624                 in_retry = true;
2625                 goto retry_reclaim;
2626         }
2627
2628         /*
2629          * If we exit early, we're guaranteed to die (since
2630          * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2631          * need to account for any ill-begotten jiffies to pay them off later.
2632          */
2633         psi_memstall_enter(&pflags);
2634         schedule_timeout_killable(penalty_jiffies);
2635         psi_memstall_leave(&pflags);
2636
2637 out:
2638         css_put(&memcg->css);
2639 }
2640
2641 static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
2642                         unsigned int nr_pages)
2643 {
2644         unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2645         int nr_retries = MAX_RECLAIM_RETRIES;
2646         struct mem_cgroup *mem_over_limit;
2647         struct page_counter *counter;
2648         unsigned long nr_reclaimed;
2649         bool passed_oom = false;
2650         unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
2651         bool drained = false;
2652         bool raised_max_event = false;
2653         unsigned long pflags;
2654
2655 retry:
2656         if (consume_stock(memcg, nr_pages))
2657                 return 0;
2658
2659         if (!do_memsw_account() ||
2660             page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2661                 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2662                         goto done_restock;
2663                 if (do_memsw_account())
2664                         page_counter_uncharge(&memcg->memsw, batch);
2665                 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2666         } else {
2667                 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2668                 reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
2669         }
2670
2671         if (batch > nr_pages) {
2672                 batch = nr_pages;
2673                 goto retry;
2674         }
2675
2676         /*
2677          * Prevent unbounded recursion when reclaim operations need to
2678          * allocate memory. This might exceed the limits temporarily,
2679          * but we prefer facilitating memory reclaim and getting back
2680          * under the limit over triggering OOM kills in these cases.
2681          */
2682         if (unlikely(current->flags & PF_MEMALLOC))
2683                 goto force;
2684
2685         if (unlikely(task_in_memcg_oom(current)))
2686                 goto nomem;
2687
2688         if (!gfpflags_allow_blocking(gfp_mask))
2689                 goto nomem;
2690
2691         memcg_memory_event(mem_over_limit, MEMCG_MAX);
2692         raised_max_event = true;
2693
2694         psi_memstall_enter(&pflags);
2695         nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2696                                                     gfp_mask, reclaim_options);
2697         psi_memstall_leave(&pflags);
2698
2699         if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2700                 goto retry;
2701
2702         if (!drained) {
2703                 drain_all_stock(mem_over_limit);
2704                 drained = true;
2705                 goto retry;
2706         }
2707
2708         if (gfp_mask & __GFP_NORETRY)
2709                 goto nomem;
2710         /*
2711          * Even though the limit is exceeded at this point, reclaim
2712          * may have been able to free some pages.  Retry the charge
2713          * before killing the task.
2714          *
2715          * Only for regular pages, though: huge pages are rather
2716          * unlikely to succeed so close to the limit, and we fall back
2717          * to regular pages anyway in case of failure.
2718          */
2719         if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2720                 goto retry;
2721         /*
2722          * At task move, charge accounts can be doubly counted. So, it's
2723          * better to wait until the end of task_move if something is going on.
2724          */
2725         if (mem_cgroup_wait_acct_move(mem_over_limit))
2726                 goto retry;
2727
2728         if (nr_retries--)
2729                 goto retry;
2730
2731         if (gfp_mask & __GFP_RETRY_MAYFAIL)
2732                 goto nomem;
2733
2734         /* Avoid endless loop for tasks bypassed by the oom killer */
2735         if (passed_oom && task_is_dying())
2736                 goto nomem;
2737
2738         /*
2739          * keep retrying as long as the memcg oom killer is able to make
2740          * a forward progress or bypass the charge if the oom killer
2741          * couldn't make any progress.
2742          */
2743         if (mem_cgroup_oom(mem_over_limit, gfp_mask,
2744                            get_order(nr_pages * PAGE_SIZE))) {
2745                 passed_oom = true;
2746                 nr_retries = MAX_RECLAIM_RETRIES;
2747                 goto retry;
2748         }
2749 nomem:
2750         /*
2751          * Memcg doesn't have a dedicated reserve for atomic
2752          * allocations. But like the global atomic pool, we need to
2753          * put the burden of reclaim on regular allocation requests
2754          * and let these go through as privileged allocations.
2755          */
2756         if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
2757                 return -ENOMEM;
2758 force:
2759         /*
2760          * If the allocation has to be enforced, don't forget to raise
2761          * a MEMCG_MAX event.
2762          */
2763         if (!raised_max_event)
2764                 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2765
2766         /*
2767          * The allocation either can't fail or will lead to more memory
2768          * being freed very soon.  Allow memory usage go over the limit
2769          * temporarily by force charging it.
2770          */
2771         page_counter_charge(&memcg->memory, nr_pages);
2772         if (do_memsw_account())
2773                 page_counter_charge(&memcg->memsw, nr_pages);
2774
2775         return 0;
2776
2777 done_restock:
2778         if (batch > nr_pages)
2779                 refill_stock(memcg, batch - nr_pages);
2780
2781         /*
2782          * If the hierarchy is above the normal consumption range, schedule
2783          * reclaim on returning to userland.  We can perform reclaim here
2784          * if __GFP_RECLAIM but let's always punt for simplicity and so that
2785          * GFP_KERNEL can consistently be used during reclaim.  @memcg is
2786          * not recorded as it most likely matches current's and won't
2787          * change in the meantime.  As high limit is checked again before
2788          * reclaim, the cost of mismatch is negligible.
2789          */
2790         do {
2791                 bool mem_high, swap_high;
2792
2793                 mem_high = page_counter_read(&memcg->memory) >
2794                         READ_ONCE(memcg->memory.high);
2795                 swap_high = page_counter_read(&memcg->swap) >
2796                         READ_ONCE(memcg->swap.high);
2797
2798                 /* Don't bother a random interrupted task */
2799                 if (!in_task()) {
2800                         if (mem_high) {
2801                                 schedule_work(&memcg->high_work);
2802                                 break;
2803                         }
2804                         continue;
2805                 }
2806
2807                 if (mem_high || swap_high) {
2808                         /*
2809                          * The allocating tasks in this cgroup will need to do
2810                          * reclaim or be throttled to prevent further growth
2811                          * of the memory or swap footprints.
2812                          *
2813                          * Target some best-effort fairness between the tasks,
2814                          * and distribute reclaim work and delay penalties
2815                          * based on how much each task is actually allocating.
2816                          */
2817                         current->memcg_nr_pages_over_high += batch;
2818                         set_notify_resume(current);
2819                         break;
2820                 }
2821         } while ((memcg = parent_mem_cgroup(memcg)));
2822
2823         if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
2824             !(current->flags & PF_MEMALLOC) &&
2825             gfpflags_allow_blocking(gfp_mask)) {
2826                 mem_cgroup_handle_over_high();
2827         }
2828         return 0;
2829 }
2830
2831 static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2832                              unsigned int nr_pages)
2833 {
2834         if (mem_cgroup_is_root(memcg))
2835                 return 0;
2836
2837         return try_charge_memcg(memcg, gfp_mask, nr_pages);
2838 }
2839
2840 static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2841 {
2842         if (mem_cgroup_is_root(memcg))
2843                 return;
2844
2845         page_counter_uncharge(&memcg->memory, nr_pages);
2846         if (do_memsw_account())
2847                 page_counter_uncharge(&memcg->memsw, nr_pages);
2848 }
2849
2850 static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
2851 {
2852         VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
2853         /*
2854          * Any of the following ensures page's memcg stability:
2855          *
2856          * - the page lock
2857          * - LRU isolation
2858          * - folio_memcg_lock()
2859          * - exclusive reference
2860          * - mem_cgroup_trylock_pages()
2861          */
2862         folio->memcg_data = (unsigned long)memcg;
2863 }
2864
2865 #ifdef CONFIG_MEMCG_KMEM
/*
 * GFP bits that are masked off when allocating the array of objcg
 * pointers: the array is not accounted directly, should not come from a
 * DMA buffer and is not readily reclaimable.
 */
#define OBJCGS_CLEAR_MASK	(__GFP_ACCOUNT | __GFP_RECLAIMABLE | __GFP_DMA)
2872
2873 /*
2874  * mod_objcg_mlstate() may be called with irq enabled, so
2875  * mod_memcg_lruvec_state() should be used.
2876  */
2877 static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
2878                                      struct pglist_data *pgdat,
2879                                      enum node_stat_item idx, int nr)
2880 {
2881         struct mem_cgroup *memcg;
2882         struct lruvec *lruvec;
2883
2884         rcu_read_lock();
2885         memcg = obj_cgroup_memcg(objcg);
2886         lruvec = mem_cgroup_lruvec(memcg, pgdat);
2887         mod_memcg_lruvec_state(lruvec, idx, nr);
2888         rcu_read_unlock();
2889 }
2890
2891 int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
2892                                  gfp_t gfp, bool new_slab)
2893 {
2894         unsigned int objects = objs_per_slab(s, slab);
2895         unsigned long memcg_data;
2896         void *vec;
2897
2898         gfp &= ~OBJCGS_CLEAR_MASK;
2899         vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
2900                            slab_nid(slab));
2901         if (!vec)
2902                 return -ENOMEM;
2903
2904         memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
2905         if (new_slab) {
2906                 /*
2907                  * If the slab is brand new and nobody can yet access its
2908                  * memcg_data, no synchronization is required and memcg_data can
2909                  * be simply assigned.
2910                  */
2911                 slab->memcg_data = memcg_data;
2912         } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) {
2913                 /*
2914                  * If the slab is already in use, somebody can allocate and
2915                  * assign obj_cgroups in parallel. In this case the existing
2916                  * objcg vector should be reused.
2917                  */
2918                 kfree(vec);
2919                 return 0;
2920         }
2921
2922         kmemleak_not_leak(vec);
2923         return 0;
2924 }
2925
2926 static __always_inline
2927 struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
2928 {
2929         /*
2930          * Slab objects are accounted individually, not per-page.
2931          * Memcg membership data for each individual object is saved in
2932          * slab->memcg_data.
2933          */
2934         if (folio_test_slab(folio)) {
2935                 struct obj_cgroup **objcgs;
2936                 struct slab *slab;
2937                 unsigned int off;
2938
2939                 slab = folio_slab(folio);
2940                 objcgs = slab_objcgs(slab);
2941                 if (!objcgs)
2942                         return NULL;
2943
2944                 off = obj_to_index(slab->slab_cache, slab, p);
2945                 if (objcgs[off])
2946                         return obj_cgroup_memcg(objcgs[off]);
2947
2948                 return NULL;
2949         }
2950
2951         /*
2952          * folio_memcg_check() is used here, because in theory we can encounter
2953          * a folio where the slab flag has been cleared already, but
2954          * slab->memcg_data has not been freed yet
2955          * folio_memcg_check() will guarantee that a proper memory
2956          * cgroup pointer or NULL will be returned.
2957          */
2958         return folio_memcg_check(folio);
2959 }
2960
2961 /*
2962  * Returns a pointer to the memory cgroup to which the kernel object is charged.
2963  *
2964  * A passed kernel object can be a slab object, vmalloc object or a generic
2965  * kernel page, so different mechanisms for getting the memory cgroup pointer
2966  * should be used.
2967  *
2968  * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
2969  * can not know for sure how the kernel object is implemented.
2970  * mem_cgroup_from_obj() can be safely used in such cases.
2971  *
2972  * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2973  * cgroup_mutex, etc.
2974  */
2975 struct mem_cgroup *mem_cgroup_from_obj(void *p)
2976 {
2977         struct folio *folio;
2978
2979         if (mem_cgroup_disabled())
2980                 return NULL;
2981
2982         if (unlikely(is_vmalloc_addr(p)))
2983                 folio = page_folio(vmalloc_to_page(p));
2984         else
2985                 folio = virt_to_folio(p);
2986
2987         return mem_cgroup_from_obj_folio(folio, p);
2988 }
2989
2990 /*
2991  * Returns a pointer to the memory cgroup to which the kernel object is charged.
2992  * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects,
2993  * allocated using vmalloc().
2994  *
2995  * A passed kernel object must be a slab object or a generic kernel page.
2996  *
2997  * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2998  * cgroup_mutex, etc.
2999  */
3000 struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
3001 {
3002         if (mem_cgroup_disabled())
3003                 return NULL;
3004
3005         return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
3006 }
3007
3008 static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
3009 {
3010         struct obj_cgroup *objcg = NULL;
3011
3012         for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
3013                 objcg = rcu_dereference(memcg->objcg);
3014                 if (objcg && obj_cgroup_tryget(objcg))
3015                         break;
3016                 objcg = NULL;
3017         }
3018         return objcg;
3019 }
3020
3021 __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
3022 {
3023         struct obj_cgroup *objcg = NULL;
3024         struct mem_cgroup *memcg;
3025
3026         if (memcg_kmem_bypass())
3027                 return NULL;
3028
3029         rcu_read_lock();
3030         if (unlikely(active_memcg()))
3031                 memcg = active_memcg();
3032         else
3033                 memcg = mem_cgroup_from_task(current);
3034         objcg = __get_obj_cgroup_from_memcg(memcg);
3035         rcu_read_unlock();
3036         return objcg;
3037 }
3038
3039 struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
3040 {
3041         struct obj_cgroup *objcg;
3042
3043         if (!memcg_kmem_online())
3044                 return NULL;
3045
3046         if (PageMemcgKmem(page)) {
3047                 objcg = __folio_objcg(page_folio(page));
3048                 obj_cgroup_get(objcg);
3049         } else {
3050                 struct mem_cgroup *memcg;
3051
3052                 rcu_read_lock();
3053                 memcg = __folio_memcg(page_folio(page));
3054                 if (memcg)
3055                         objcg = __get_obj_cgroup_from_memcg(memcg);
3056                 else
3057                         objcg = NULL;
3058                 rcu_read_unlock();
3059         }
3060         return objcg;
3061 }
3062
3063 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
3064 {
3065         mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
3066         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
3067                 if (nr_pages > 0)
3068                         page_counter_charge(&memcg->kmem, nr_pages);
3069                 else
3070                         page_counter_uncharge(&memcg->kmem, -nr_pages);
3071         }
3072 }
3073
3074
3075 /*
3076  * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
3077  * @objcg: object cgroup to uncharge
3078  * @nr_pages: number of pages to uncharge
3079  */
3080 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
3081                                       unsigned int nr_pages)
3082 {
3083         struct mem_cgroup *memcg;
3084
3085         memcg = get_mem_cgroup_from_objcg(objcg);
3086
3087         memcg_account_kmem(memcg, -nr_pages);
3088         refill_stock(memcg, nr_pages);
3089
3090         css_put(&memcg->css);
3091 }
3092
3093 /*
3094  * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg
3095  * @objcg: object cgroup to charge
3096  * @gfp: reclaim mode
3097  * @nr_pages: number of pages to charge
3098  *
3099  * Returns 0 on success, an error code on failure.
3100  */
3101 static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
3102                                    unsigned int nr_pages)
3103 {
3104         struct mem_cgroup *memcg;
3105         int ret;
3106
3107         memcg = get_mem_cgroup_from_objcg(objcg);
3108
3109         ret = try_charge_memcg(memcg, gfp, nr_pages);
3110         if (ret)
3111                 goto out;
3112
3113         memcg_account_kmem(memcg, nr_pages);
3114 out:
3115         css_put(&memcg->css);
3116
3117         return ret;
3118 }
3119
3120 /**
3121  * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
3122  * @page: page to charge
3123  * @gfp: reclaim mode
3124  * @order: allocation order
3125  *
3126  * Returns 0 on success, an error code on failure.
3127  */
3128 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
3129 {
3130         struct obj_cgroup *objcg;
3131         int ret = 0;
3132
3133         objcg = get_obj_cgroup_from_current();
3134         if (objcg) {
3135                 ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
3136                 if (!ret) {
3137                         page->memcg_data = (unsigned long)objcg |
3138                                 MEMCG_DATA_KMEM;
3139                         return 0;
3140                 }
3141                 obj_cgroup_put(objcg);
3142         }
3143         return ret;
3144 }
3145
3146 /**
3147  * __memcg_kmem_uncharge_page: uncharge a kmem page
3148  * @page: page to uncharge
3149  * @order: allocation order
3150  */
3151 void __memcg_kmem_uncharge_page(struct page *page, int order)
3152 {
3153         struct folio *folio = page_folio(page);
3154         struct obj_cgroup *objcg;
3155         unsigned int nr_pages = 1 << order;
3156
3157         if (!folio_memcg_kmem(folio))
3158                 return;
3159
3160         objcg = __folio_objcg(folio);
3161         obj_cgroup_uncharge_pages(objcg, nr_pages);
3162         folio->memcg_data = 0;
3163         obj_cgroup_put(objcg);
3164 }
3165
3166 void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
3167                      enum node_stat_item idx, int nr)
3168 {
3169         struct memcg_stock_pcp *stock;
3170         struct obj_cgroup *old = NULL;
3171         unsigned long flags;
3172         int *bytes;
3173
3174         local_lock_irqsave(&memcg_stock.stock_lock, flags);
3175         stock = this_cpu_ptr(&memcg_stock);
3176
3177         /*
3178          * Save vmstat data in stock and skip vmstat array update unless
3179          * accumulating over a page of vmstat data or when pgdat or idx
3180          * changes.
3181          */
3182         if (READ_ONCE(stock->cached_objcg) != objcg) {
3183                 old = drain_obj_stock(stock);
3184                 obj_cgroup_get(objcg);
3185                 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
3186                                 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
3187                 WRITE_ONCE(stock->cached_objcg, objcg);
3188                 stock->cached_pgdat = pgdat;
3189         } else if (stock->cached_pgdat != pgdat) {
3190                 /* Flush the existing cached vmstat data */
3191                 struct pglist_data *oldpg = stock->cached_pgdat;
3192
3193                 if (stock->nr_slab_reclaimable_b) {
3194                         mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
3195                                           stock->nr_slab_reclaimable_b);
3196                         stock->nr_slab_reclaimable_b = 0;
3197                 }
3198                 if (stock->nr_slab_unreclaimable_b) {
3199                         mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
3200                                           stock->nr_slab_unreclaimable_b);
3201                         stock->nr_slab_unreclaimable_b = 0;
3202                 }
3203                 stock->cached_pgdat = pgdat;
3204         }
3205
3206         bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
3207                                                : &stock->nr_slab_unreclaimable_b;
3208         /*
3209          * Even for large object >= PAGE_SIZE, the vmstat data will still be
3210          * cached locally at least once before pushing it out.
3211          */
3212         if (!*bytes) {
3213                 *bytes = nr;
3214                 nr = 0;
3215         } else {
3216                 *bytes += nr;
3217                 if (abs(*bytes) > PAGE_SIZE) {
3218                         nr = *bytes;
3219                         *bytes = 0;
3220                 } else {
3221                         nr = 0;
3222                 }
3223         }
3224         if (nr)
3225                 mod_objcg_mlstate(objcg, pgdat, idx, nr);
3226
3227         local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
3228         if (old)
3229                 obj_cgroup_put(old);
3230 }
3231
3232 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3233 {
3234         struct memcg_stock_pcp *stock;
3235         unsigned long flags;
3236         bool ret = false;
3237
3238         local_lock_irqsave(&memcg_stock.stock_lock, flags);
3239
3240         stock = this_cpu_ptr(&memcg_stock);
3241         if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
3242                 stock->nr_bytes -= nr_bytes;
3243                 ret = true;
3244         }
3245
3246         local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
3247
3248         return ret;
3249 }
3250
3251 static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
3252 {
3253         struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
3254
3255         if (!old)
3256                 return NULL;
3257
3258         if (stock->nr_bytes) {
3259                 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3260                 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
3261
3262                 if (nr_pages) {
3263                         struct mem_cgroup *memcg;
3264
3265                         memcg = get_mem_cgroup_from_objcg(old);
3266
3267                         memcg_account_kmem(memcg, -nr_pages);
3268                         __refill_stock(memcg, nr_pages);
3269
3270                         css_put(&memcg->css);
3271                 }
3272
3273                 /*
3274                  * The leftover is flushed to the centralized per-memcg value.
3275                  * On the next attempt to refill obj stock it will be moved
3276                  * to a per-cpu stock (probably, on an other CPU), see
3277                  * refill_obj_stock().
3278                  *
3279                  * How often it's flushed is a trade-off between the memory
3280                  * limit enforcement accuracy and potential CPU contention,
3281                  * so it might be changed in the future.
3282                  */
3283                 atomic_add(nr_bytes, &old->nr_charged_bytes);
3284                 stock->nr_bytes = 0;
3285         }
3286
3287         /*
3288          * Flush the vmstat data in current stock
3289          */
3290         if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
3291                 if (stock->nr_slab_reclaimable_b) {
3292                         mod_objcg_mlstate(old, stock->cached_pgdat,
3293                                           NR_SLAB_RECLAIMABLE_B,
3294                                           stock->nr_slab_reclaimable_b);
3295                         stock->nr_slab_reclaimable_b = 0;
3296                 }
3297                 if (stock->nr_slab_unreclaimable_b) {
3298                         mod_objcg_mlstate(old, stock->cached_pgdat,
3299                                           NR_SLAB_UNRECLAIMABLE_B,
3300                                           stock->nr_slab_unreclaimable_b);
3301                         stock->nr_slab_unreclaimable_b = 0;
3302                 }
3303                 stock->cached_pgdat = NULL;
3304         }
3305
3306         WRITE_ONCE(stock->cached_objcg, NULL);
3307         /*
3308          * The `old' objects needs to be released by the caller via
3309          * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
3310          */
3311         return old;
3312 }
3313
3314 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
3315                                      struct mem_cgroup *root_memcg)
3316 {
3317         struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
3318         struct mem_cgroup *memcg;
3319
3320         if (objcg) {
3321                 memcg = obj_cgroup_memcg(objcg);
3322                 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3323                         return true;
3324         }
3325
3326         return false;
3327 }
3328
3329 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
3330                              bool allow_uncharge)
3331 {
3332         struct memcg_stock_pcp *stock;
3333         struct obj_cgroup *old = NULL;
3334         unsigned long flags;
3335         unsigned int nr_pages = 0;
3336
3337         local_lock_irqsave(&memcg_stock.stock_lock, flags);
3338
3339         stock = this_cpu_ptr(&memcg_stock);
3340         if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
3341                 old = drain_obj_stock(stock);
3342                 obj_cgroup_get(objcg);
3343                 WRITE_ONCE(stock->cached_objcg, objcg);
3344                 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
3345                                 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
3346                 allow_uncharge = true;  /* Allow uncharge when objcg changes */
3347         }
3348         stock->nr_bytes += nr_bytes;
3349
3350         if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
3351                 nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3352                 stock->nr_bytes &= (PAGE_SIZE - 1);
3353         }
3354
3355         local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
3356         if (old)
3357                 obj_cgroup_put(old);
3358
3359         if (nr_pages)
3360                 obj_cgroup_uncharge_pages(objcg, nr_pages);
3361 }
3362
3363 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3364 {
3365         unsigned int nr_pages, nr_bytes;
3366         int ret;
3367
3368         if (consume_obj_stock(objcg, size))
3369                 return 0;
3370
3371         /*
3372          * In theory, objcg->nr_charged_bytes can have enough
3373          * pre-charged bytes to satisfy the allocation. However,
3374          * flushing objcg->nr_charged_bytes requires two atomic
3375          * operations, and objcg->nr_charged_bytes can't be big.
3376          * The shared objcg->nr_charged_bytes can also become a
3377          * performance bottleneck if all tasks of the same memcg are
3378          * trying to update it. So it's better to ignore it and try
3379          * grab some new pages. The stock's nr_bytes will be flushed to
3380          * objcg->nr_charged_bytes later on when objcg changes.
3381          *
3382          * The stock's nr_bytes may contain enough pre-charged bytes
3383          * to allow one less page from being charged, but we can't rely
3384          * on the pre-charged bytes not being changed outside of
3385          * consume_obj_stock() or refill_obj_stock(). So ignore those
3386          * pre-charged bytes as well when charging pages. To avoid a
3387          * page uncharge right after a page charge, we set the
3388          * allow_uncharge flag to false when calling refill_obj_stock()
3389          * to temporarily allow the pre-charged bytes to exceed the page
3390          * size limit. The maximum reachable value of the pre-charged
3391          * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data
3392          * race.
3393          */
3394         nr_pages = size >> PAGE_SHIFT;
3395         nr_bytes = size & (PAGE_SIZE - 1);
3396
3397         if (nr_bytes)
3398                 nr_pages += 1;
3399
3400         ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
3401         if (!ret && nr_bytes)
3402                 refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
3403
3404         return ret;
3405 }
3406
3407 void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3408 {
3409         refill_obj_stock(objcg, size, true);
3410 }
3411
3412 #endif /* CONFIG_MEMCG_KMEM */
3413
3414 /*
3415  * Because page_memcg(head) is not set on tails, set it now.
3416  */
3417 void split_page_memcg(struct page *head, unsigned int nr)
3418 {
3419         struct folio *folio = page_folio(head);
3420         struct mem_cgroup *memcg = folio_memcg(folio);
3421         int i;
3422
3423         if (mem_cgroup_disabled() || !memcg)
3424                 return;
3425
3426         for (i = 1; i < nr; i++)
3427                 folio_page(folio, i)->memcg_data = folio->memcg_data;
3428
3429         if (folio_memcg_kmem(folio))
3430                 obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
3431         else
3432                 css_get_many(&memcg->css, nr - 1);
3433 }
3434
3435 #ifdef CONFIG_SWAP
3436 /**
3437  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
3438  * @entry: swap entry to be moved
3439  * @from:  mem_cgroup which the entry is moved from
3440  * @to:  mem_cgroup which the entry is moved to
3441  *
3442  * It succeeds only when the swap_cgroup's record for this entry is the same
3443  * as the mem_cgroup's id of @from.
3444  *
3445  * Returns 0 on success, -EINVAL on failure.
3446  *
3447  * The caller must have charged to @to, IOW, called page_counter_charge() about
3448  * both res and memsw, and called css_get().
3449  */
3450 static int mem_cgroup_move_swap_account(swp_entry_t entry,
3451                                 struct mem_cgroup *from, struct mem_cgroup *to)
3452 {
3453         unsigned short old_id, new_id;
3454
3455         old_id = mem_cgroup_id(from);
3456         new_id = mem_cgroup_id(to);
3457
3458         if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3459                 mod_memcg_state(from, MEMCG_SWAP, -1);
3460                 mod_memcg_state(to, MEMCG_SWAP, 1);
3461                 return 0;
3462         }
3463         return -EINVAL;
3464 }
3465 #else
3466 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3467                                 struct mem_cgroup *from, struct mem_cgroup *to)
3468 {
3469         return -EINVAL;
3470 }
3471 #endif
3472
/*
 * Held around the limit updates done by mem_cgroup_resize_max() and
 * memcg_update_tcp_max().
 */
static DEFINE_MUTEX(memcg_max_mutex);
3474
3475 static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
3476                                  unsigned long max, bool memsw)
3477 {
3478         bool enlarge = false;
3479         bool drained = false;
3480         int ret;
3481         bool limits_invariant;
3482         struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
3483
3484         do {
3485                 if (signal_pending(current)) {
3486                         ret = -EINTR;
3487                         break;
3488                 }
3489
3490                 mutex_lock(&memcg_max_mutex);
3491                 /*
3492                  * Make sure that the new limit (memsw or memory limit) doesn't
3493                  * break our basic invariant rule memory.max <= memsw.max.
3494                  */
3495                 limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
3496                                            max <= memcg->memsw.max;
3497                 if (!limits_invariant) {
3498                         mutex_unlock(&memcg_max_mutex);
3499                         ret = -EINVAL;
3500                         break;
3501                 }
3502                 if (max > counter->max)
3503                         enlarge = true;
3504                 ret = page_counter_set_max(counter, max);
3505                 mutex_unlock(&memcg_max_mutex);
3506
3507                 if (!ret)
3508                         break;
3509
3510                 if (!drained) {
3511                         drain_all_stock(memcg);
3512                         drained = true;
3513                         continue;
3514                 }
3515
3516                 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
3517                                         memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
3518                         ret = -EBUSY;
3519                         break;
3520                 }
3521         } while (true);
3522
3523         if (!ret && enlarge)
3524                 memcg_oom_recover(memcg);
3525
3526         return ret;
3527 }
3528
3529 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
3530                                             gfp_t gfp_mask,
3531                                             unsigned long *total_scanned)
3532 {
3533         unsigned long nr_reclaimed = 0;
3534         struct mem_cgroup_per_node *mz, *next_mz = NULL;
3535         unsigned long reclaimed;
3536         int loop = 0;
3537         struct mem_cgroup_tree_per_node *mctz;
3538         unsigned long excess;
3539
3540         if (lru_gen_enabled())
3541                 return 0;
3542
3543         if (order > 0)
3544                 return 0;
3545
3546         mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
3547
3548         /*
3549          * Do not even bother to check the largest node if the root
3550          * is empty. Do it lockless to prevent lock bouncing. Races
3551          * are acceptable as soft limit is best effort anyway.
3552          */
3553         if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
3554                 return 0;
3555
3556         /*
3557          * This loop can run a while, specially if mem_cgroup's continuously
3558          * keep exceeding their soft limit and putting the system under
3559          * pressure
3560          */
3561         do {
3562                 if (next_mz)
3563                         mz = next_mz;
3564                 else
3565                         mz = mem_cgroup_largest_soft_limit_node(mctz);
3566                 if (!mz)
3567                         break;
3568
3569                 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3570                                                     gfp_mask, total_scanned);
3571                 nr_reclaimed += reclaimed;
3572                 spin_lock_irq(&mctz->lock);
3573
3574                 /*
3575                  * If we failed to reclaim anything from this memory cgroup
3576                  * it is time to move on to the next cgroup
3577                  */
3578                 next_mz = NULL;
3579                 if (!reclaimed)
3580                         next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3581
3582                 excess = soft_limit_excess(mz->memcg);
3583                 /*
3584                  * One school of thought says that we should not add
3585                  * back the node to the tree if reclaim returns 0.
3586                  * But our reclaim could return 0, simply because due
3587                  * to priority we are exposing a smaller subset of
3588                  * memory to reclaim from. Consider this as a longer
3589                  * term TODO.
3590                  */
3591                 /* If excess == 0, no tree ops */
3592                 __mem_cgroup_insert_exceeded(mz, mctz, excess);
3593                 spin_unlock_irq(&mctz->lock);
3594                 css_put(&mz->memcg->css);
3595                 loop++;
3596                 /*
3597                  * Could not reclaim anything and there are no more
3598                  * mem cgroups to try or we seem to be looping without
3599                  * reclaiming anything.
3600                  */
3601                 if (!nr_reclaimed &&
3602                         (next_mz == NULL ||
3603                         loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3604                         break;
3605         } while (!nr_reclaimed);
3606         if (next_mz)
3607                 css_put(&next_mz->memcg->css);
3608         return nr_reclaimed;
3609 }
3610
3611 /*
3612  * Reclaims as many pages from the given memcg as possible.
3613  *
3614  * Caller is responsible for holding css reference for memcg.
3615  */
3616 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3617 {
3618         int nr_retries = MAX_RECLAIM_RETRIES;
3619
3620         /* we call try-to-free pages for make this cgroup empty */
3621         lru_add_drain_all();
3622
3623         drain_all_stock(memcg);
3624
3625         /* try to free all pages in this cgroup */
3626         while (nr_retries && page_counter_read(&memcg->memory)) {
3627                 if (signal_pending(current))
3628                         return -EINTR;
3629
3630                 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
3631                                                   MEMCG_RECLAIM_MAY_SWAP))
3632                         nr_retries--;
3633         }
3634
3635         return 0;
3636 }
3637
3638 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3639                                             char *buf, size_t nbytes,
3640                                             loff_t off)
3641 {
3642         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3643
3644         if (mem_cgroup_is_root(memcg))
3645                 return -EINVAL;
3646         return mem_cgroup_force_empty(memcg) ?: nbytes;
3647 }
3648
3649 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3650                                      struct cftype *cft)
3651 {
3652         return 1;
3653 }
3654
3655 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3656                                       struct cftype *cft, u64 val)
3657 {
3658         if (val == 1)
3659                 return 0;
3660
3661         pr_warn_once("Non-hierarchical mode is deprecated. "
3662                      "Please report your usecase to linux-mm@kvack.org if you "
3663                      "depend on this functionality.\n");
3664
3665         return -EINVAL;
3666 }
3667
3668 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3669 {
3670         unsigned long val;
3671
3672         if (mem_cgroup_is_root(memcg)) {
3673                 /*
3674                  * Approximate root's usage from global state. This isn't
3675                  * perfect, but the root usage was always an approximation.
3676                  */
3677                 val = global_node_page_state(NR_FILE_PAGES) +
3678                         global_node_page_state(NR_ANON_MAPPED);
3679                 if (swap)
3680                         val += total_swap_pages - get_nr_swap_pages();
3681         } else {
3682                 if (!swap)
3683                         val = page_counter_read(&memcg->memory);
3684                 else
3685                         val = page_counter_read(&memcg->memsw);
3686         }
3687         return val;
3688 }
3689
/* Attribute selectors decoded from cftype->private via MEMFILE_ATTR(). */
enum {
	RES_USAGE = 0,		/* current usage of the counter */
	RES_LIMIT,		/* counter->max */
	RES_MAX_USAGE,		/* counter->watermark */
	RES_FAILCNT,		/* counter->failcnt */
	RES_SOFT_LIMIT,		/* memcg->soft_limit */
};
3697
3698 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3699                                struct cftype *cft)
3700 {
3701         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3702         struct page_counter *counter;
3703
3704         switch (MEMFILE_TYPE(cft->private)) {
3705         case _MEM:
3706                 counter = &memcg->memory;
3707                 break;
3708         case _MEMSWAP:
3709                 counter = &memcg->memsw;
3710                 break;
3711         case _KMEM:
3712                 counter = &memcg->kmem;
3713                 break;
3714         case _TCP:
3715                 counter = &memcg->tcpmem;
3716                 break;
3717         default:
3718                 BUG();
3719         }
3720
3721         switch (MEMFILE_ATTR(cft->private)) {
3722         case RES_USAGE:
3723                 if (counter == &memcg->memory)
3724                         return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3725                 if (counter == &memcg->memsw)
3726                         return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3727                 return (u64)page_counter_read(counter) * PAGE_SIZE;
3728         case RES_LIMIT:
3729                 return (u64)counter->max * PAGE_SIZE;
3730         case RES_MAX_USAGE:
3731                 return (u64)counter->watermark * PAGE_SIZE;
3732         case RES_FAILCNT:
3733                 return counter->failcnt;
3734         case RES_SOFT_LIMIT:
3735                 return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
3736         default:
3737                 BUG();
3738         }
3739 }
3740
3741 /*
3742  * This function doesn't do anything useful. Its only job is to provide a read
3743  * handler for a file so that cgroup_file_mode() will add read permissions.
3744  */
3745 static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m,
3746                                      __always_unused void *v)
3747 {
3748         return -EINVAL;
3749 }
3750
3751 #ifdef CONFIG_MEMCG_KMEM
3752 static int memcg_online_kmem(struct mem_cgroup *memcg)
3753 {
3754         struct obj_cgroup *objcg;
3755
3756         if (mem_cgroup_kmem_disabled())
3757                 return 0;
3758
3759         if (unlikely(mem_cgroup_is_root(memcg)))
3760                 return 0;
3761
3762         objcg = obj_cgroup_alloc();
3763         if (!objcg)
3764                 return -ENOMEM;
3765
3766         objcg->memcg = memcg;
3767         rcu_assign_pointer(memcg->objcg, objcg);
3768
3769         static_branch_enable(&memcg_kmem_online_key);
3770
3771         memcg->kmemcg_id = memcg->id.id;
3772
3773         return 0;
3774 }
3775
3776 static void memcg_offline_kmem(struct mem_cgroup *memcg)
3777 {
3778         struct mem_cgroup *parent;
3779
3780         if (mem_cgroup_kmem_disabled())
3781                 return;
3782
3783         if (unlikely(mem_cgroup_is_root(memcg)))
3784                 return;
3785
3786         parent = parent_mem_cgroup(memcg);
3787         if (!parent)
3788                 parent = root_mem_cgroup;
3789
3790         memcg_reparent_objcgs(memcg, parent);
3791
3792         /*
3793          * After we have finished memcg_reparent_objcgs(), all list_lrus
3794          * corresponding to this cgroup are guaranteed to remain empty.
3795          * The ordering is imposed by list_lru_node->lock taken by
3796          * memcg_reparent_list_lrus().
3797          */
3798         memcg_reparent_list_lrus(memcg, parent);
3799 }
3800 #else
/* !CONFIG_MEMCG_KMEM: onlining has nothing to set up and cannot fail. */
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
	return 0;
}

/* !CONFIG_MEMCG_KMEM: offlining has nothing to tear down. */
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
}
3808 #endif /* CONFIG_MEMCG_KMEM */
3809
3810 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3811 {
3812         int ret;
3813
3814         mutex_lock(&memcg_max_mutex);
3815
3816         ret = page_counter_set_max(&memcg->tcpmem, max);
3817         if (ret)
3818                 goto out;
3819
3820         if (!memcg->tcpmem_active) {
3821                 /*
3822                  * The active flag needs to be written after the static_key
3823                  * update. This is what guarantees that the socket activation
3824                  * function is the last one to run. See mem_cgroup_sk_alloc()
3825                  * for details, and note that we don't mark any socket as
3826                  * belonging to this memcg until that flag is up.
3827                  *
3828                  * We need to do this, because static_keys will span multiple
3829                  * sites, but we can't control their order. If we mark a socket
3830                  * as accounted, but the accounting functions are not patched in
3831                  * yet, we'll lose accounting.
3832                  *
3833                  * We never race with the readers in mem_cgroup_sk_alloc(),
3834                  * because when this value change, the code to process it is not
3835                  * patched in yet.
3836                  */
3837                 static_branch_inc(&memcg_sockets_enabled_key);
3838                 memcg->tcpmem_active = true;
3839         }
3840 out:
3841         mutex_unlock(&memcg_max_mutex);
3842         return ret;
3843 }
3844
3845 /*
3846  * The user of this function is...
3847  * RES_LIMIT.
3848  */
3849 static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3850                                 char *buf, size_t nbytes, loff_t off)
3851 {
3852         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3853         unsigned long nr_pages;
3854         int ret;
3855
3856         buf = strstrip(buf);
3857         ret = page_counter_memparse(buf, "-1", &nr_pages);
3858         if (ret)
3859                 return ret;
3860
3861         switch (MEMFILE_ATTR(of_cft(of)->private)) {
3862         case RES_LIMIT:
3863                 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3864                         ret = -EINVAL;
3865                         break;
3866                 }
3867                 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3868                 case _MEM:
3869                         ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3870                         break;
3871                 case _MEMSWAP:
3872                         ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3873                         break;
3874                 case _TCP:
3875                         ret = memcg_update_tcp_max(memcg, nr_pages);
3876                         break;
3877                 }
3878                 break;
3879         case RES_SOFT_LIMIT:
3880                 if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
3881                         ret = -EOPNOTSUPP;
3882                 } else {
3883                         WRITE_ONCE(memcg->soft_limit, nr_pages);
3884                         ret = 0;
3885                 }
3886                 break;
3887         }
3888         return ret ?: nbytes;
3889 }
3890
3891 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3892                                 size_t nbytes, loff_t off)
3893 {
3894         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3895         struct page_counter *counter;
3896
3897         switch (MEMFILE_TYPE(of_cft(of)->private)) {
3898         case _MEM:
3899                 counter = &memcg->memory;
3900                 break;
3901         case _MEMSWAP:
3902                 counter = &memcg->memsw;
3903                 break;
3904         case _KMEM:
3905                 counter = &memcg->kmem;
3906                 break;
3907         case _TCP:
3908                 counter = &memcg->tcpmem;
3909                 break;
3910         default:
3911                 BUG();
3912         }
3913
3914         switch (MEMFILE_ATTR(of_cft(of)->private)) {
3915         case RES_MAX_USAGE:
3916                 page_counter_reset_watermark(counter);
3917                 break;
3918         case RES_FAILCNT:
3919                 counter->failcnt = 0;
3920                 break;
3921         default:
3922                 BUG();
3923         }
3924
3925         return nbytes;
3926 }
3927
3928 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3929                                         struct cftype *cft)
3930 {
3931         return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3932 }
3933
3934 #ifdef CONFIG_MMU
3935 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3936                                         struct cftype *cft, u64 val)
3937 {
3938         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3939
3940         pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
3941                      "Please report your usecase to linux-mm@kvack.org if you "
3942                      "depend on this functionality.\n");
3943
3944         if (val & ~MOVE_MASK)
3945                 return -EINVAL;
3946
3947         /*
3948          * No kind of locking is needed in here, because ->can_attach() will
3949          * check this value once in the beginning of the process, and then carry
3950          * on with stale data. This means that changes to this value will only
3951          * affect task migrations starting after the change.
3952          */
3953         memcg->move_charge_at_immigrate = val;
3954         return 0;
3955 }
3956 #else
3957 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3958                                         struct cftype *cft, u64 val)
3959 {
3960         return -ENOSYS;
3961 }
3962 #endif
3963
3964 #ifdef CONFIG_NUMA
3965
/* Bitmasks over enum lru_list: bit N selects LRU list N. */
#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
/* Every list below NR_LRU_LISTS. */
#define LRU_ALL      ((1 << NR_LRU_LISTS) - 1)
3969
3970 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
3971                                 int nid, unsigned int lru_mask, bool tree)
3972 {
3973         struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
3974         unsigned long nr = 0;
3975         enum lru_list lru;
3976
3977         VM_BUG_ON((unsigned)nid >= nr_node_ids);
3978
3979         for_each_lru(lru) {
3980                 if (!(BIT(lru) & lru_mask))
3981                         continue;
3982                 if (tree)
3983                         nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
3984                 else
3985                         nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
3986         }
3987         return nr;
3988 }
3989
3990 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
3991                                              unsigned int lru_mask,
3992                                              bool tree)
3993 {
3994         unsigned long nr = 0;
3995         enum lru_list lru;
3996
3997         for_each_lru(lru) {
3998                 if (!(BIT(lru) & lru_mask))
3999                         continue;
4000                 if (tree)
4001                         nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
4002                 else
4003                         nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
4004         }
4005         return nr;
4006 }
4007
4008 static int memcg_numa_stat_show(struct seq_file *m, void *v)
4009 {
4010         struct numa_stat {
4011                 const char *name;
4012                 unsigned int lru_mask;
4013         };
4014
4015         static const struct numa_stat stats[] = {
4016                 { "total", LRU_ALL },
4017                 { "file", LRU_ALL_FILE },
4018                 { "anon", LRU_ALL_ANON },
4019                 { "unevictable", BIT(LRU_UNEVICTABLE) },
4020         };
4021         const struct numa_stat *stat;
4022         int nid;
4023         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4024
4025         mem_cgroup_flush_stats();
4026
4027         for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
4028                 seq_printf(m, "%s=%lu", stat->name,
4029                            mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4030                                                    false));
4031                 for_each_node_state(nid, N_MEMORY)
4032                         seq_printf(m, " N%d=%lu", nid,
4033                                    mem_cgroup_node_nr_lru_pages(memcg, nid,
4034                                                         stat->lru_mask, false));
4035                 seq_putc(m, '\n');
4036         }
4037
4038         for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
4039
4040                 seq_printf(m, "hierarchical_%s=%lu", stat->name,
4041                            mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4042                                                    true));
4043                 for_each_node_state(nid, N_MEMORY)
4044                         seq_printf(m, " N%d=%lu", nid,
4045                                    mem_cgroup_node_nr_lru_pages(memcg, nid,
4046                                                         stat->lru_mask, true));
4047                 seq_putc(m, '\n');
4048         }
4049
4050         return 0;
4051 }
4052 #endif /* CONFIG_NUMA */
4053
4054 static const unsigned int memcg1_stats[] = {
4055         NR_FILE_PAGES,
4056         NR_ANON_MAPPED,
4057 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4058         NR_ANON_THPS,
4059 #endif
4060         NR_SHMEM,
4061         NR_FILE_MAPPED,
4062         NR_FILE_DIRTY,
4063         NR_WRITEBACK,
4064         WORKINGSET_REFAULT_ANON,
4065         WORKINGSET_REFAULT_FILE,
4066         MEMCG_SWAP,
4067 };
4068
/*
 * Labels for the entries of memcg1_stats[], index for index. The
 * "rss_huge" label is only present under CONFIG_TRANSPARENT_HUGEPAGE,
 * like its stat item.
 */
static const char *const memcg1_stat_names[] = {
	"cache",
	"rss",
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	"rss_huge",
#endif
	"shmem",
	"mapped_file",
	"dirty",
	"writeback",
	"workingset_refault_anon",
	"workingset_refault_file",
	"swap",
};
4083
4084 /* Universal VM events cgroup1 shows, original sort order */
4085 static const unsigned int memcg1_events[] = {
4086         PGPGIN,
4087         PGPGOUT,
4088         PGFAULT,
4089         PGMAJFAULT,
4090 };
4091
4092 static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
4093 {
4094         unsigned long memory, memsw;
4095         struct mem_cgroup *mi;
4096         unsigned int i;
4097
4098         BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
4099
4100         mem_cgroup_flush_stats();
4101
4102         for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4103                 unsigned long nr;
4104
4105                 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
4106                         continue;
4107                 nr = memcg_page_state_local(memcg, memcg1_stats[i]);
4108                 seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i],
4109                            nr * memcg_page_state_unit(memcg1_stats[i]));
4110         }
4111
4112         for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
4113                 seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
4114                                memcg_events_local(memcg, memcg1_events[i]));
4115
4116         for (i = 0; i < NR_LRU_LISTS; i++)
4117                 seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
4118                                memcg_page_state_local(memcg, NR_LRU_BASE + i) *
4119                                PAGE_SIZE);
4120
4121         /* Hierarchical information */
4122         memory = memsw = PAGE_COUNTER_MAX;
4123         for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
4124                 memory = min(memory, READ_ONCE(mi->memory.max));
4125                 memsw = min(memsw, READ_ONCE(mi->memsw.max));
4126         }
4127         seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
4128                        (u64)memory * PAGE_SIZE);
4129         if (do_memsw_account())
4130                 seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
4131                                (u64)memsw * PAGE_SIZE);
4132
4133         for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4134                 unsigned long nr;
4135
4136                 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
4137                         continue;
4138                 nr = memcg_page_state(memcg, memcg1_stats[i]);
4139                 seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
4140                            (u64)nr * memcg_page_state_unit(memcg1_stats[i]));
4141         }
4142
4143         for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
4144                 seq_buf_printf(s, "total_%s %llu\n",
4145                                vm_event_name(memcg1_events[i]),
4146                                (u64)memcg_events(memcg, memcg1_events[i]));
4147
4148         for (i = 0; i < NR_LRU_LISTS; i++)
4149                 seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
4150                                (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
4151                                PAGE_SIZE);
4152
4153 #ifdef CONFIG_DEBUG_VM
4154         {
4155                 pg_data_t *pgdat;
4156                 struct mem_cgroup_per_node *mz;
4157                 unsigned long anon_cost = 0;
4158                 unsigned long file_cost = 0;
4159
4160                 for_each_online_pgdat(pgdat) {
4161                         mz = memcg->nodeinfo[pgdat->node_id];
4162
4163                         anon_cost += mz->lruvec.anon_cost;
4164                         file_cost += mz->lruvec.file_cost;
4165                 }
4166                 seq_buf_printf(s, "anon_cost %lu\n", anon_cost);
4167                 seq_buf_printf(s, "file_cost %lu\n", file_cost);
4168         }
4169 #endif
4170 }
4171
4172 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
4173                                       struct cftype *cft)
4174 {
4175         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4176
4177         return mem_cgroup_swappiness(memcg);
4178 }
4179
4180 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
4181                                        struct cftype *cft, u64 val)
4182 {
4183         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4184
4185         if (val > 200)
4186                 return -EINVAL;
4187
4188         if (!mem_cgroup_is_root(memcg))
4189                 WRITE_ONCE(memcg->swappiness, val);
4190         else
4191                 WRITE_ONCE(vm_swappiness, val);
4192
4193         return 0;
4194 }
4195
4196 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4197 {
4198         struct mem_cgroup_threshold_ary *t;
4199         unsigned long usage;
4200         int i;
4201
4202         rcu_read_lock();
4203         if (!swap)
4204                 t = rcu_dereference(memcg->thresholds.primary);
4205         else
4206                 t = rcu_dereference(memcg->memsw_thresholds.primary);
4207
4208         if (!t)
4209                 goto unlock;
4210
4211         usage = mem_cgroup_usage(memcg, swap);
4212
4213         /*
4214          * current_threshold points to threshold just below or equal to usage.
4215          * If it's not true, a threshold was crossed after last
4216          * call of __mem_cgroup_threshold().
4217          */
4218         i = t->current_threshold;
4219
4220         /*
4221          * Iterate backward over array of thresholds starting from
4222          * current_threshold and check if a threshold is crossed.
4223          * If none of thresholds below usage is crossed, we read
4224          * only one element of the array here.
4225          */
4226         for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4227                 eventfd_signal(t->entries[i].eventfd, 1);
4228
4229         /* i = current_threshold + 1 */
4230         i++;
4231
4232         /*
4233          * Iterate forward over array of thresholds starting from
4234          * current_threshold+1 and check if a threshold is crossed.
4235          * If none of thresholds above usage is crossed, we read
4236          * only one element of the array here.
4237          */
4238         for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4239                 eventfd_signal(t->entries[i].eventfd, 1);
4240
4241         /* Update current_threshold */
4242         t->current_threshold = i - 1;
4243 unlock:
4244         rcu_read_unlock();
4245 }
4246
4247 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4248 {
4249         while (memcg) {
4250                 __mem_cgroup_threshold(memcg, false);
4251                 if (do_memsw_account())
4252                         __mem_cgroup_threshold(memcg, true);
4253
4254                 memcg = parent_mem_cgroup(memcg);
4255         }
4256 }
4257
4258 static int compare_thresholds(const void *a, const void *b)
4259 {
4260         const struct mem_cgroup_threshold *_a = a;
4261         const struct mem_cgroup_threshold *_b = b;
4262
4263         if (_a->threshold > _b->threshold)
4264                 return 1;
4265
4266         if (_a->threshold < _b->threshold)
4267                 return -1;
4268
4269         return 0;
4270 }
4271
4272 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4273 {
4274         struct mem_cgroup_eventfd_list *ev;
4275
4276         spin_lock(&memcg_oom_lock);
4277
4278         list_for_each_entry(ev, &memcg->oom_notify, list)
4279                 eventfd_signal(ev->eventfd, 1);
4280
4281         spin_unlock(&memcg_oom_lock);
4282         return 0;
4283 }
4284
4285 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4286 {
4287         struct mem_cgroup *iter;
4288
4289         for_each_mem_cgroup_tree(iter, memcg)
4290                 mem_cgroup_oom_notify_cb(iter);
4291 }
4292
4293 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4294         struct eventfd_ctx *eventfd, const char *args, enum res_type type)
4295 {
4296         struct mem_cgroup_thresholds *thresholds;
4297         struct mem_cgroup_threshold_ary *new;
4298         unsigned long threshold;
4299         unsigned long usage;
4300         int i, size, ret;
4301
4302         ret = page_counter_memparse(args, "-1", &threshold);
4303         if (ret)
4304                 return ret;
4305
4306         mutex_lock(&memcg->thresholds_lock);
4307
4308         if (type == _MEM) {
4309                 thresholds = &memcg->thresholds;
4310                 usage = mem_cgroup_usage(memcg, false);
4311         } else if (type == _MEMSWAP) {
4312                 thresholds = &memcg->memsw_thresholds;
4313                 usage = mem_cgroup_usage(memcg, true);
4314         } else
4315                 BUG();
4316
4317         /* Check if a threshold crossed before adding a new one */
4318         if (thresholds->primary)
4319                 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4320
4321         size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4322
4323         /* Allocate memory for new array of thresholds */
4324         new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
4325         if (!new) {
4326                 ret = -ENOMEM;
4327                 goto unlock;
4328         }
4329         new->size = size;
4330
4331         /* Copy thresholds (if any) to new array */
4332         if (thresholds->primary)
4333                 memcpy(new->entries, thresholds->primary->entries,
4334                        flex_array_size(new, entries, size - 1));
4335
4336         /* Add new threshold */
4337         new->entries[size - 1].eventfd = eventfd;
4338         new->entries[size - 1].threshold = threshold;
4339
4340         /* Sort thresholds. Registering of new threshold isn't time-critical */
4341         sort(new->entries, size, sizeof(*new->entries),
4342                         compare_thresholds, NULL);
4343
4344         /* Find current threshold */
4345         new->current_threshold = -1;
4346         for (i = 0; i < size; i++) {
4347                 if (new->entries[i].threshold <= usage) {
4348                         /*
4349                          * new->current_threshold will not be used until
4350                          * rcu_assign_pointer(), so it's safe to increment
4351                          * it here.
4352                          */
4353                         ++new->current_threshold;
4354                 } else
4355                         break;
4356         }
4357
4358         /* Free old spare buffer and save old primary buffer as spare */
4359         kfree(thresholds->spare);
4360         thresholds->spare = thresholds->primary;
4361
4362         rcu_assign_pointer(thresholds->primary, new);
4363
4364         /* To be sure that nobody uses thresholds */
4365         synchronize_rcu();
4366
4367 unlock:
4368         mutex_unlock(&memcg->thresholds_lock);
4369
4370         return ret;
4371 }
4372
4373 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4374         struct eventfd_ctx *eventfd, const char *args)
4375 {
4376         return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4377 }
4378
4379 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4380         struct eventfd_ctx *eventfd, const char *args)
4381 {
4382         return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4383 }
4384
4385 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4386         struct eventfd_ctx *eventfd, enum res_type type)
4387 {
4388         struct mem_cgroup_thresholds *thresholds;
4389         struct mem_cgroup_threshold_ary *new;
4390         unsigned long usage;
4391         int i, j, size, entries;
4392
4393         mutex_lock(&memcg->thresholds_lock);
4394
4395         if (type == _MEM) {
4396                 thresholds = &memcg->thresholds;
4397                 usage = mem_cgroup_usage(memcg, false);
4398         } else if (type == _MEMSWAP) {
4399                 thresholds = &memcg->memsw_thresholds;
4400                 usage = mem_cgroup_usage(memcg, true);
4401         } else
4402                 BUG();
4403
4404         if (!thresholds->primary)
4405                 goto unlock;
4406
4407         /* Check if a threshold crossed before removing */
4408         __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4409
4410         /* Calculate new number of threshold */
4411         size = entries = 0;
4412         for (i = 0; i < thresholds->primary->size; i++) {
4413                 if (thresholds->primary->entries[i].eventfd != eventfd)
4414                         size++;
4415                 else
4416                         entries++;
4417         }
4418
4419         new = thresholds->spare;
4420
4421         /* If no items related to eventfd have been cleared, nothing to do */
4422         if (!entries)
4423                 goto unlock;
4424
4425         /* Set thresholds array to NULL if we don't have thresholds */
4426         if (!size) {
4427                 kfree(new);
4428                 new = NULL;
4429                 goto swap_buffers;
4430         }
4431
4432         new->size = size;
4433
4434         /* Copy thresholds and find current threshold */
4435         new->current_threshold = -1;
4436         for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4437                 if (thresholds->primary->entries[i].eventfd == eventfd)
4438                         continue;
4439
4440                 new->entries[j] = thresholds->primary->entries[i];
4441                 if (new->entries[j].threshold <= usage) {
4442                         /*
4443                          * new->current_threshold will not be used
4444                          * until rcu_assign_pointer(), so it's safe to increment
4445                          * it here.
4446                          */
4447                         ++new->current_threshold;
4448                 }
4449                 j++;
4450         }
4451
4452 swap_buffers:
4453         /* Swap primary and spare array */
4454         thresholds->spare = thresholds->primary;
4455
4456         rcu_assign_pointer(thresholds->primary, new);
4457
4458         /* To be sure that nobody uses thresholds */
4459         synchronize_rcu();
4460
4461         /* If all events are unregistered, free the spare array */
4462         if (!new) {
4463                 kfree(thresholds->spare);
4464                 thresholds->spare = NULL;
4465         }
4466 unlock:
4467         mutex_unlock(&memcg->thresholds_lock);
4468 }
4469
4470 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4471         struct eventfd_ctx *eventfd)
4472 {
4473         return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4474 }
4475
4476 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4477         struct eventfd_ctx *eventfd)
4478 {
4479         return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4480 }
4481
4482 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4483         struct eventfd_ctx *eventfd, const char *args)
4484 {
4485         struct mem_cgroup_eventfd_list *event;
4486
4487         event = kmalloc(sizeof(*event), GFP_KERNEL);
4488         if (!event)
4489                 return -ENOMEM;
4490
4491         spin_lock(&memcg_oom_lock);
4492
4493         event->eventfd = eventfd;
4494         list_add(&event->list, &memcg->oom_notify);
4495
4496         /* already in OOM ? */
4497         if (memcg->under_oom)
4498                 eventfd_signal(eventfd, 1);
4499         spin_unlock(&memcg_oom_lock);
4500
4501         return 0;
4502 }
4503
4504 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4505         struct eventfd_ctx *eventfd)
4506 {
4507         struct mem_cgroup_eventfd_list *ev, *tmp;
4508
4509         spin_lock(&memcg_oom_lock);
4510
4511         list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4512                 if (ev->eventfd == eventfd) {
4513                         list_del(&ev->list);
4514                         kfree(ev);
4515                 }
4516         }
4517
4518         spin_unlock(&memcg_oom_lock);
4519 }
4520
4521 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4522 {
4523         struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
4524
4525         seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable));
4526         seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
4527         seq_printf(sf, "oom_kill %lu\n",
4528                    atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
4529         return 0;
4530 }
4531
4532 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4533         struct cftype *cft, u64 val)
4534 {
4535         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4536
4537         /* cannot set to root cgroup and only 0 and 1 are allowed */
4538         if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
4539                 return -EINVAL;
4540
4541         WRITE_ONCE(memcg->oom_kill_disable, val);
4542         if (!val)
4543                 memcg_oom_recover(memcg);
4544
4545         return 0;
4546 }
4547
4548 #ifdef CONFIG_CGROUP_WRITEBACK
4549
4550 #include <trace/events/writeback.h>
4551
4552 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4553 {
4554         return wb_domain_init(&memcg->cgwb_domain, gfp);
4555 }
4556
4557 static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4558 {
4559         wb_domain_exit(&memcg->cgwb_domain);
4560 }
4561
4562 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4563 {
4564         wb_domain_size_changed(&memcg->cgwb_domain);
4565 }
4566
4567 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
4568 {
4569         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4570
4571         if (!memcg->css.parent)
4572                 return NULL;
4573
4574         return &memcg->cgwb_domain;
4575 }
4576
4577 /**
4578  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
4579  * @wb: bdi_writeback in question
4580  * @pfilepages: out parameter for number of file pages
4581  * @pheadroom: out parameter for number of allocatable pages according to memcg
4582  * @pdirty: out parameter for number of dirty pages
4583  * @pwriteback: out parameter for number of pages under writeback
4584  *
4585  * Determine the numbers of file, headroom, dirty, and writeback pages in
4586  * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
4587  * is a bit more involved.
4588  *
4589  * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
4590  * headroom is calculated as the lowest headroom of itself and the
4591  * ancestors.  Note that this doesn't consider the actual amount of
4592  * available memory in the system.  The caller should further cap
4593  * *@pheadroom accordingly.
4594  */
4595 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
4596                          unsigned long *pheadroom, unsigned long *pdirty,
4597                          unsigned long *pwriteback)
4598 {
4599         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4600         struct mem_cgroup *parent;
4601
4602         mem_cgroup_flush_stats();
4603
4604         *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
4605         *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
4606         *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
4607                         memcg_page_state(memcg, NR_ACTIVE_FILE);
4608
4609         *pheadroom = PAGE_COUNTER_MAX;
4610         while ((parent = parent_mem_cgroup(memcg))) {
4611                 unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4612                                             READ_ONCE(memcg->memory.high));
4613                 unsigned long used = page_counter_read(&memcg->memory);
4614
4615                 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
4616                 memcg = parent;
4617         }
4618 }
4619
4620 /*
4621  * Foreign dirty flushing
4622  *
4623  * There's an inherent mismatch between memcg and writeback.  The former
4624  * tracks ownership per-page while the latter per-inode.  This was a
4625  * deliberate design decision because honoring per-page ownership in the
4626  * writeback path is complicated, may lead to higher CPU and IO overheads
4627  * and deemed unnecessary given that write-sharing an inode across
4628  * different cgroups isn't a common use-case.
4629  *
4630  * Combined with inode majority-writer ownership switching, this works well
4631  * enough in most cases but there are some pathological cases.  For
4632  * example, let's say there are two cgroups A and B which keep writing to
4633  * different but confined parts of the same inode.  B owns the inode and
4634  * A's memory is limited far below B's.  A's dirty ratio can rise enough to
4635  * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
4636  * triggering background writeback.  A will be slowed down without a way to
4637  * make writeback of the dirty pages happen.
4638  *
4639  * Conditions like the above can lead to a cgroup getting repeatedly and
4640  * severely throttled after making some progress after each
4641  * dirty_expire_interval while the underlying IO device is almost
4642  * completely idle.
4643  *
4644  * Solving this problem completely requires matching the ownership tracking
4645  * granularities between memcg and writeback in either direction.  However,
4646  * the more egregious behaviors can be avoided by simply remembering the
4647  * most recent foreign dirtying events and initiating remote flushes on
4648  * them when local writeback isn't enough to keep the memory clean enough.
4649  *
4650  * The following two functions implement such mechanism.  When a foreign
4651  * page - a page whose memcg and writeback ownerships don't match - is
4652  * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
4653  * bdi_writeback on the page owning memcg.  When balance_dirty_pages()
4654  * decides that the memcg needs to sleep due to high dirty ratio, it calls
4655  * mem_cgroup_flush_foreign() which queues writeback on the recorded
4656  * foreign bdi_writebacks which haven't expired.  Both the numbers of
4657  * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
4658  * limited to MEMCG_CGWB_FRN_CNT.
4659  *
4660  * The mechanism only remembers IDs and doesn't hold any object references.
4661  * As being wrong occasionally doesn't matter, updates and accesses to the
4662  * records are lockless and racy.
4663  */
4664 void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
4665                                              struct bdi_writeback *wb)
4666 {
4667         struct mem_cgroup *memcg = folio_memcg(folio);
4668         struct memcg_cgwb_frn *frn;
4669         u64 now = get_jiffies_64();
4670         u64 oldest_at = now;
4671         int oldest = -1;
4672         int i;
4673
4674         trace_track_foreign_dirty(folio, wb);
4675
4676         /*
4677          * Pick the slot to use.  If there is already a slot for @wb, keep
4678          * using it.  If not replace the oldest one which isn't being
4679          * written out.
4680          */
4681         for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4682                 frn = &memcg->cgwb_frn[i];
4683                 if (frn->bdi_id == wb->bdi->id &&
4684                     frn->memcg_id == wb->memcg_css->id)
4685                         break;
4686                 if (time_before64(frn->at, oldest_at) &&
4687                     atomic_read(&frn->done.cnt) == 1) {
4688                         oldest = i;
4689                         oldest_at = frn->at;
4690                 }
4691         }
4692
4693         if (i < MEMCG_CGWB_FRN_CNT) {
4694                 /*
4695                  * Re-using an existing one.  Update timestamp lazily to
4696                  * avoid making the cacheline hot.  We want them to be
4697                  * reasonably up-to-date and significantly shorter than
4698                  * dirty_expire_interval as that's what expires the record.
4699                  * Use the shorter of 1s and dirty_expire_interval / 8.
4700                  */
4701                 unsigned long update_intv =
4702                         min_t(unsigned long, HZ,
4703                               msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4704
4705                 if (time_before64(frn->at, now - update_intv))
4706                         frn->at = now;
4707         } else if (oldest >= 0) {
4708                 /* replace the oldest free one */
4709                 frn = &memcg->cgwb_frn[oldest];
4710                 frn->bdi_id = wb->bdi->id;
4711                 frn->memcg_id = wb->memcg_css->id;
4712                 frn->at = now;
4713         }
4714 }
4715
4716 /* issue foreign writeback flushes for recorded foreign dirtying events */
4717 void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4718 {
4719         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4720         unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4721         u64 now = jiffies_64;
4722         int i;
4723
4724         for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4725                 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4726
4727                 /*
4728                  * If the record is older than dirty_expire_interval,
4729                  * writeback on it has already started.  No need to kick it
4730                  * off again.  Also, don't start a new one if there's
4731                  * already one in flight.
4732                  */
4733                 if (time_after64(frn->at, now - intv) &&
4734                     atomic_read(&frn->done.cnt) == 1) {
4735                         frn->at = 0;
4736                         trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4737                         cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
4738                                                WB_REASON_FOREIGN_FLUSH,
4739                                                &frn->done);
4740                 }
4741         }
4742 }
4743
4744 #else   /* CONFIG_CGROUP_WRITEBACK */
4745
/* Stub for builds without CONFIG_CGROUP_WRITEBACK: does nothing, returns 0. */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
        return 0;
}
4750
/* Stub for builds without CONFIG_CGROUP_WRITEBACK: intentionally empty. */
static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
{
}
4754
/* Stub for builds without CONFIG_CGROUP_WRITEBACK: intentionally empty. */
static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
{
}
4758
4759 #endif  /* CONFIG_CGROUP_WRITEBACK */
4760
4761 /*
4762  * DO NOT USE IN NEW FILES.
4763  *
4764  * "cgroup.event_control" implementation.
4765  *
4766  * This is way over-engineered.  It tries to support fully configurable
4767  * events for each user.  Such level of flexibility is completely
4768  * unnecessary especially in the light of the planned unified hierarchy.
4769  *
4770  * Please deprecate this and replace with something simpler if at all
4771  * possible.
4772  */
4773
4774 /*
4775  * Unregister event and free resources.
4776  *
4777  * Gets called from workqueue.
4778  */
4779 static void memcg_event_remove(struct work_struct *work)
4780 {
4781         struct mem_cgroup_event *event =
4782                 container_of(work, struct mem_cgroup_event, remove);
4783         struct mem_cgroup *memcg = event->memcg;
4784
4785         remove_wait_queue(event->wqh, &event->wait);
4786
4787         event->unregister_event(memcg, event->eventfd);
4788
4789         /* Notify userspace the event is going away. */
4790         eventfd_signal(event->eventfd, 1);
4791
4792         eventfd_ctx_put(event->eventfd);
4793         kfree(event);
4794         css_put(&memcg->css);
4795 }
4796
4797 /*
4798  * Gets called on EPOLLHUP on eventfd when user closes it.
4799  *
4800  * Called with wqh->lock held and interrupts disabled.
4801  */
4802 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
4803                             int sync, void *key)
4804 {
4805         struct mem_cgroup_event *event =
4806                 container_of(wait, struct mem_cgroup_event, wait);
4807         struct mem_cgroup *memcg = event->memcg;
4808         __poll_t flags = key_to_poll(key);
4809
4810         if (flags & EPOLLHUP) {
4811                 /*
4812                  * If the event has been detached at cgroup removal, we
4813                  * can simply return knowing the other side will cleanup
4814                  * for us.
4815                  *
4816                  * We can't race against event freeing since the other
4817                  * side will require wqh->lock via remove_wait_queue(),
4818                  * which we hold.
4819                  */
4820                 spin_lock(&memcg->event_list_lock);
4821                 if (!list_empty(&event->list)) {
4822                         list_del_init(&event->list);
4823                         /*
4824                          * We are in atomic context, but cgroup_event_remove()
4825                          * may sleep, so we have to call it in workqueue.
4826                          */
4827                         schedule_work(&event->remove);
4828                 }
4829                 spin_unlock(&memcg->event_list_lock);
4830         }
4831
4832         return 0;
4833 }
4834
4835 static void memcg_event_ptable_queue_proc(struct file *file,
4836                 wait_queue_head_t *wqh, poll_table *pt)
4837 {
4838         struct mem_cgroup_event *event =
4839                 container_of(pt, struct mem_cgroup_event, pt);
4840
4841         event->wqh = wqh;
4842         add_wait_queue(wqh, &event->wait);
4843 }
4844
4845 /*
4846  * DO NOT USE IN NEW FILES.
4847  *
4848  * Parse input and register new cgroup event handler.
4849  *
4850  * Input must be in format '<event_fd> <control_fd> <args>'.
4851  * Interpretation of args is defined by control file implementation.
4852  */
4853 static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4854                                          char *buf, size_t nbytes, loff_t off)
4855 {
4856         struct cgroup_subsys_state *css = of_css(of);
4857         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4858         struct mem_cgroup_event *event;
4859         struct cgroup_subsys_state *cfile_css;
4860         unsigned int efd, cfd;
4861         struct fd efile;
4862         struct fd cfile;
4863         struct dentry *cdentry;
4864         const char *name;
4865         char *endp;
4866         int ret;
4867
4868         if (IS_ENABLED(CONFIG_PREEMPT_RT))
4869                 return -EOPNOTSUPP;
4870
4871         buf = strstrip(buf);
4872
4873         efd = simple_strtoul(buf, &endp, 10);
4874         if (*endp != ' ')
4875                 return -EINVAL;
4876         buf = endp + 1;
4877
4878         cfd = simple_strtoul(buf, &endp, 10);
4879         if ((*endp != ' ') && (*endp != '\0'))
4880                 return -EINVAL;
4881         buf = endp + 1;
4882
4883         event = kzalloc(sizeof(*event), GFP_KERNEL);
4884         if (!event)
4885                 return -ENOMEM;
4886
4887         event->memcg = memcg;
4888         INIT_LIST_HEAD(&event->list);
4889         init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4890         init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4891         INIT_WORK(&event->remove, memcg_event_remove);
4892
4893         efile = fdget(efd);
4894         if (!efile.file) {
4895                 ret = -EBADF;
4896                 goto out_kfree;
4897         }
4898
4899         event->eventfd = eventfd_ctx_fileget(efile.file);
4900         if (IS_ERR(event->eventfd)) {
4901                 ret = PTR_ERR(event->eventfd);
4902                 goto out_put_efile;
4903         }
4904
4905         cfile = fdget(cfd);
4906         if (!cfile.file) {
4907                 ret = -EBADF;
4908                 goto out_put_eventfd;
4909         }
4910
4911         /* the process need read permission on control file */
4912         /* AV: shouldn't we check that it's been opened for read instead? */
4913         ret = file_permission(cfile.file, MAY_READ);
4914         if (ret < 0)
4915                 goto out_put_cfile;
4916
4917         /*
4918          * The control file must be a regular cgroup1 file. As a regular cgroup
4919          * file can't be renamed, it's safe to access its name afterwards.
4920          */
4921         cdentry = cfile.file->f_path.dentry;
4922         if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
4923                 ret = -EINVAL;
4924                 goto out_put_cfile;
4925         }
4926
4927         /*
4928          * Determine the event callbacks and set them in @event.  This used
4929          * to be done via struct cftype but cgroup core no longer knows
4930          * about these events.  The following is crude but the whole thing
4931          * is for compatibility anyway.
4932          *
4933          * DO NOT ADD NEW FILES.
4934          */
4935         name = cdentry->d_name.name;
4936
4937         if (!strcmp(name, "memory.usage_in_bytes")) {
4938                 event->register_event = mem_cgroup_usage_register_event;
4939                 event->unregister_event = mem_cgroup_usage_unregister_event;
4940         } else if (!strcmp(name, "memory.oom_control")) {
4941                 event->register_event = mem_cgroup_oom_register_event;
4942                 event->unregister_event = mem_cgroup_oom_unregister_event;
4943         } else if (!strcmp(name, "memory.pressure_level")) {
4944                 event->register_event = vmpressure_register_event;
4945                 event->unregister_event = vmpressure_unregister_event;
4946         } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4947                 event->register_event = memsw_cgroup_usage_register_event;
4948                 event->unregister_event = memsw_cgroup_usage_unregister_event;
4949         } else {
4950                 ret = -EINVAL;
4951                 goto out_put_cfile;
4952         }
4953
4954         /*
4955          * Verify @cfile should belong to @css.  Also, remaining events are
4956          * automatically removed on cgroup destruction but the removal is
4957          * asynchronous, so take an extra ref on @css.
4958          */
4959         cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
4960                                                &memory_cgrp_subsys);
4961         ret = -EINVAL;
4962         if (IS_ERR(cfile_css))
4963                 goto out_put_cfile;
4964         if (cfile_css != css) {
4965                 css_put(cfile_css);
4966                 goto out_put_cfile;
4967         }
4968
4969         ret = event->register_event(memcg, event->eventfd, buf);
4970         if (ret)
4971                 goto out_put_css;
4972
4973         vfs_poll(efile.file, &event->pt);
4974
4975         spin_lock_irq(&memcg->event_list_lock);
4976         list_add(&event->list, &memcg->event_list);
4977         spin_unlock_irq(&memcg->event_list_lock);
4978
4979         fdput(cfile);
4980         fdput(efile);
4981
4982         return nbytes;
4983
4984 out_put_css:
4985         css_put(css);
4986 out_put_cfile:
4987         fdput(cfile);
4988 out_put_eventfd:
4989         eventfd_ctx_put(event->eventfd);
4990 out_put_efile:
4991         fdput(efile);
4992 out_kfree:
4993         kfree(event);
4994
4995         return ret;
4996 }
4997
4998 #if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
/* seq_show handler that emits no output and always returns 0. */
static int mem_cgroup_slab_show(struct seq_file *m, void *p)
{
        /*
         * Deprecated.
         * Please, take a look at tools/cgroup/memcg_slabinfo.py .
         */
        return 0;
}
5007 #endif
5008
5009 static int memory_stat_show(struct seq_file *m, void *v);
5010
/*
 * Table of control files for the legacy hierarchy. Each entry names one
 * file and the handlers that serve it; entries that share a generic
 * handler (mem_cgroup_read_u64/mem_cgroup_write/mem_cgroup_reset) select
 * the counter and attribute through .private = MEMFILE_PRIVATE(type, attr).
 * The list ends with an empty entry.
 */
5011 static struct cftype mem_cgroup_legacy_files[] = {
             /* Files backed by the _MEM counter. */
5012         {
5013                 .name = "usage_in_bytes",
5014                 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
5015                 .read_u64 = mem_cgroup_read_u64,
5016         },
5017         {
5018                 .name = "max_usage_in_bytes",
5019                 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
5020                 .write = mem_cgroup_reset,
5021                 .read_u64 = mem_cgroup_read_u64,
5022         },
5023         {
5024                 .name = "limit_in_bytes",
5025                 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
5026                 .write = mem_cgroup_write,
5027                 .read_u64 = mem_cgroup_read_u64,
5028         },
5029         {
5030                 .name = "soft_limit_in_bytes",
5031                 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
5032                 .write = mem_cgroup_write,
5033                 .read_u64 = mem_cgroup_read_u64,
5034         },
5035         {
5036                 .name = "failcnt",
5037                 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
5038                 .write = mem_cgroup_reset,
5039                 .read_u64 = mem_cgroup_read_u64,
5040         },
             /* Files with dedicated handlers. */
5041         {
5042                 .name = "stat",
5043                 .seq_show = memory_stat_show,
5044         },
5045         {
5046                 .name = "force_empty",
5047                 .write = mem_cgroup_force_empty_write,
5048         },
5049         {
5050                 .name = "use_hierarchy",
5051                 .write_u64 = mem_cgroup_hierarchy_write,
5052                 .read_u64 = mem_cgroup_hierarchy_read,
5053         },
5054         {
5055                 .name = "cgroup.event_control",         /* XXX: for compat */
5056                 .write = memcg_write_event_control,
5057                 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
5058         },
5059         {
5060                 .name = "swappiness",
5061                 .read_u64 = mem_cgroup_swappiness_read,
5062                 .write_u64 = mem_cgroup_swappiness_write,
5063         },
5064         {
5065                 .name = "move_charge_at_immigrate",
5066                 .read_u64 = mem_cgroup_move_charge_read,
5067                 .write_u64 = mem_cgroup_move_charge_write,
5068         },
5069         {
5070                 .name = "oom_control",
5071                 .seq_show = mem_cgroup_oom_control_read,
5072                 .write_u64 = mem_cgroup_oom_control_write,
5073         },
5074         {
5075                 .name = "pressure_level",
5076                 .seq_show = mem_cgroup_dummy_seq_show,
5077         },
5078 #ifdef CONFIG_NUMA
5079         {
5080                 .name = "numa_stat",
5081                 .seq_show = memcg_numa_stat_show,
5082         },
5083 #endif
             /* Files backed by the _KMEM counter. */
5084         {
5085                 .name = "kmem.usage_in_bytes",
5086                 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
5087                 .read_u64 = mem_cgroup_read_u64,
5088         },
5089         {
5090                 .name = "kmem.failcnt",
5091                 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
5092                 .write = mem_cgroup_reset,
5093                 .read_u64 = mem_cgroup_read_u64,
5094         },
5095         {
5096                 .name = "kmem.max_usage_in_bytes",
5097                 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
5098                 .write = mem_cgroup_reset,
5099                 .read_u64 = mem_cgroup_read_u64,
5100         },
             /* Only present when mem_cgroup_slab_show() is built (same condition). */
5101 #if defined(CONFIG_MEMCG_KMEM) && \
5102         (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
5103         {
5104                 .name = "kmem.slabinfo",
5105                 .seq_show = mem_cgroup_slab_show,
5106         },
5107 #endif
             /* Files backed by the _TCP counter. */
5108         {
5109                 .name = "kmem.tcp.limit_in_bytes",
5110                 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
5111                 .write = mem_cgroup_write,
5112                 .read_u64 = mem_cgroup_read_u64,
5113         },
5114         {
5115                 .name = "kmem.tcp.usage_in_bytes",
5116                 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
5117                 .read_u64 = mem_cgroup_read_u64,
5118         },
5119         {
5120                 .name = "kmem.tcp.failcnt",
5121                 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
5122                 .write = mem_cgroup_reset,
5123                 .read_u64 = mem_cgroup_read_u64,
5124         },
5125         {
5126                 .name = "kmem.tcp.max_usage_in_bytes",
5127                 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
5128                 .write = mem_cgroup_reset,
5129                 .read_u64 = mem_cgroup_read_u64,
5130         },
5131         { },    /* terminate */
5132 };
5133
5134 /*
5135  * Private memory cgroup IDR
5136  *
5137  * Swap-out records and page cache shadow entries need to store memcg
5138  * references in constrained space, so we maintain an ID space that is
5139  * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
5140  * memory-controlled cgroups to 64k.
5141  *
5142  * However, there usually are many references to the offline CSS after
5143  * the cgroup has been destroyed, such as page cache or reclaimable
5144  * slab objects, that don't need to hang on to the ID. We want to keep
5145  * those dead CSS from occupying IDs, or we might quickly exhaust the
5146  * relatively small ID space and prevent the creation of new cgroups
5147  * even when there are much fewer than 64k cgroups - possibly none.
5148  *
5149  * Maintain a private 16-bit ID space for memcg, and allow the ID to
5150  * be freed and recycled when it's no longer needed, which is usually
5151  * when the CSS is offlined.
5152  *
5153  * The only exception to that are records of swapped out tmpfs/shmem
5154  * pages that need to be attributed to live ancestors on swapin. But
5155  * those references are manageable from userspace.
5156  */
5157
/* Largest ID representable in MEM_CGROUP_ID_SHIFT bits. */
5158 #define MEM_CGROUP_ID_MAX       ((1UL << MEM_CGROUP_ID_SHIFT) - 1)
/* ID -> memcg lookup table; IDs are allocated from 1 up to MEM_CGROUP_ID_MAX. */
5159 static DEFINE_IDR(mem_cgroup_idr);
5160
5161 static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
5162 {
5163         if (memcg->id.id > 0) {
5164                 idr_remove(&mem_cgroup_idr, memcg->id.id);
5165                 memcg->id.id = 0;
5166         }
5167 }
5168
5169 static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
5170                                                   unsigned int n)
5171 {
5172         refcount_add(n, &memcg->id.ref);
5173 }
5174
5175 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
5176 {
5177         if (refcount_sub_and_test(n, &memcg->id.ref)) {
5178                 mem_cgroup_id_remove(memcg);
5179
5180                 /* Memcg ID pins CSS */
5181                 css_put(&memcg->css);
5182         }
5183 }
5184
/* Drop a single reference on @memcg's ID. */
static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
{
	const unsigned int single_ref = 1;

	mem_cgroup_id_put_many(memcg, single_ref);
}
5189
5190 /**
5191  * mem_cgroup_from_id - look up a memcg from a memcg id
5192  * @id: the memcg id to look up
5193  *
5194  * Caller must hold rcu_read_lock().
5195  */
5196 struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
5197 {
5198         WARN_ON_ONCE(!rcu_read_lock_held());
5199         return idr_find(&mem_cgroup_idr, id);
5200 }
5201
5202 #ifdef CONFIG_SHRINKER_DEBUG
5203 struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
5204 {
5205         struct cgroup *cgrp;
5206         struct cgroup_subsys_state *css;
5207         struct mem_cgroup *memcg;
5208
5209         cgrp = cgroup_get_from_id(ino);
5210         if (IS_ERR(cgrp))
5211                 return ERR_CAST(cgrp);
5212
5213         css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
5214         if (css)
5215                 memcg = container_of(css, struct mem_cgroup, css);
5216         else
5217                 memcg = ERR_PTR(-ENOENT);
5218
5219         cgroup_put(cgrp);
5220
5221         return memcg;
5222 }
5223 #endif
5224
5225 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5226 {
5227         struct mem_cgroup_per_node *pn;
5228
5229         pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
5230         if (!pn)
5231                 return 1;
5232
5233         pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
5234                                                    GFP_KERNEL_ACCOUNT);
5235         if (!pn->lruvec_stats_percpu) {
5236                 kfree(pn);
5237                 return 1;
5238         }
5239
5240         lruvec_init(&pn->lruvec);
5241         pn->memcg = memcg;
5242
5243         memcg->nodeinfo[node] = pn;
5244         return 0;
5245 }
5246
5247 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5248 {
5249         struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
5250
5251         if (!pn)
5252                 return;
5253
5254         free_percpu(pn->lruvec_stats_percpu);
5255         kfree(pn);
5256 }
5257
5258 static void __mem_cgroup_free(struct mem_cgroup *memcg)
5259 {
5260         int node;
5261
5262         for_each_node(node)
5263                 free_mem_cgroup_per_node_info(memcg, node);
5264         kfree(memcg->vmstats);
5265         free_percpu(memcg->vmstats_percpu);
5266         kfree(memcg);
5267 }
5268
/*
 * Tear down a fully constructed memcg: run the lru_gen and writeback
 * domain exit hooks, then free the memory via __mem_cgroup_free().
 */
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
	/* Subsystem teardown first ... */
	lru_gen_exit_memcg(memcg);
	memcg_wb_domain_exit(memcg);

	/* ... then the memory itself. */
	__mem_cgroup_free(memcg);
}
5275
5276 static struct mem_cgroup *mem_cgroup_alloc(void)
5277 {
5278         struct mem_cgroup *memcg;
5279         int node;
5280         int __maybe_unused i;
5281         long error = -ENOMEM;
5282
5283         memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
5284         if (!memcg)
5285                 return ERR_PTR(error);
5286
5287         memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
5288                                  1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL);
5289         if (memcg->id.id < 0) {
5290                 error = memcg->id.id;
5291                 goto fail;
5292         }
5293
5294         memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL);
5295         if (!memcg->vmstats)
5296                 goto fail;
5297
5298         memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5299                                                  GFP_KERNEL_ACCOUNT);
5300         if (!memcg->vmstats_percpu)
5301                 goto fail;
5302
5303         for_each_node(node)
5304                 if (alloc_mem_cgroup_per_node_info(memcg, node))
5305                         goto fail;
5306
5307         if (memcg_wb_domain_init(memcg, GFP_KERNEL))
5308                 goto fail;
5309
5310         INIT_WORK(&memcg->high_work, high_work_func);
5311         INIT_LIST_HEAD(&memcg->oom_notify);
5312         mutex_init(&memcg->thresholds_lock);
5313         spin_lock_init(&memcg->move_lock);
5314         vmpressure_init(&memcg->vmpressure);
5315         INIT_LIST_HEAD(&memcg->event_list);
5316         spin_lock_init(&memcg->event_list_lock);
5317         memcg->socket_pressure = jiffies;
5318 #ifdef CONFIG_MEMCG_KMEM
5319         memcg->kmemcg_id = -1;
5320         INIT_LIST_HEAD(&memcg->objcg_list);
5321 #endif
5322 #ifdef CONFIG_CGROUP_WRITEBACK
5323         INIT_LIST_HEAD(&memcg->cgwb_list);
5324         for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5325                 memcg->cgwb_frn[i].done =
5326                         __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5327 #endif
5328 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5329         spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5330         INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5331         memcg->deferred_split_queue.split_queue_len = 0;
5332 #endif
5333         idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5334         lru_gen_init_memcg(memcg);
5335         return memcg;
5336 fail:
5337         mem_cgroup_id_remove(memcg);
5338         __mem_cgroup_free(memcg);
5339         return ERR_PTR(error);
5340 }
5341
5342 static struct cgroup_subsys_state * __ref
5343 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5344 {
5345         struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
5346         struct mem_cgroup *memcg, *old_memcg;
5347
5348         old_memcg = set_active_memcg(parent);
5349         memcg = mem_cgroup_alloc();
5350         set_active_memcg(old_memcg);
5351         if (IS_ERR(memcg))
5352                 return ERR_CAST(memcg);
5353
5354         page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5355         WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
5356 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
5357         memcg->zswap_max = PAGE_COUNTER_MAX;
5358 #endif
5359         page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5360         if (parent) {
5361                 WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
5362                 WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
5363
5364                 page_counter_init(&memcg->memory, &parent->memory);
5365                 page_counter_init(&memcg->swap, &parent->swap);
5366                 page_counter_init(&memcg->kmem, &parent->kmem);
5367                 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
5368         } else {
5369                 init_memcg_events();
5370                 page_counter_init(&memcg->memory, NULL);
5371                 page_counter_init(&memcg->swap, NULL);
5372                 page_counter_init(&memcg->kmem, NULL);
5373                 page_counter_init(&memcg->tcpmem, NULL);
5374
5375                 root_mem_cgroup = memcg;
5376                 return &memcg->css;
5377         }
5378
5379         if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5380                 static_branch_inc(&memcg_sockets_enabled_key);
5381
5382 #if defined(CONFIG_MEMCG_KMEM)
5383         if (!cgroup_memory_nobpf)
5384                 static_branch_inc(&memcg_bpf_enabled_key);
5385 #endif
5386
5387         return &memcg->css;
5388 }
5389
5390 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
5391 {
5392         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5393
5394         if (memcg_online_kmem(memcg))
5395                 goto remove_id;
5396
5397         /*
5398          * A memcg must be visible for expand_shrinker_info()
5399          * by the time the maps are allocated. So, we allocate maps
5400          * here, when for_each_mem_cgroup() can't skip it.
5401          */
5402         if (alloc_shrinker_info(memcg))
5403                 goto offline_kmem;
5404
5405         /* Online state pins memcg ID, memcg ID pins CSS */
5406         refcount_set(&memcg->id.ref, 1);
5407         css_get(css);
5408
5409         if (unlikely(mem_cgroup_is_root(memcg)))
5410                 queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
5411                                    FLUSH_TIME);
5412         lru_gen_online_memcg(memcg);
5413         return 0;
5414 offline_kmem:
5415         memcg_offline_kmem(memcg);
5416 remove_id:
5417         mem_cgroup_id_remove(memcg);
5418         return -ENOMEM;
5419 }
5420
5421 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5422 {
5423         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5424         struct mem_cgroup_event *event, *tmp;
5425
5426         /*
5427          * Unregister events and notify userspace.
5428          * Notify userspace about cgroup removing only after rmdir of cgroup
5429          * directory to avoid race between userspace and kernelspace.
5430          */
5431         spin_lock_irq(&memcg->event_list_lock);
5432         list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
5433                 list_del_init(&event->list);
5434                 schedule_work(&event->remove);
5435         }
5436         spin_unlock_irq(&memcg->event_list_lock);
5437
5438         page_counter_set_min(&memcg->memory, 0);
5439         page_counter_set_low(&memcg->memory, 0);
5440
5441         memcg_offline_kmem(memcg);
5442         reparent_shrinker_deferred(memcg);
5443         wb_memcg_offline(memcg);
5444         lru_gen_offline_memcg(memcg);
5445
5446         drain_all_stock(memcg);
5447
5448         mem_cgroup_id_put(memcg);
5449 }
5450
/*
 * css release callback: invalidate reclaim iterators for this memcg and
 * run the lru_gen release hook.
 */
static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *released = mem_cgroup_from_css(css);

	invalidate_reclaim_iterators(released);
	lru_gen_release_memcg(released);
}
5458
5459 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5460 {
5461         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5462         int __maybe_unused i;
5463
5464 #ifdef CONFIG_CGROUP_WRITEBACK
5465         for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5466                 wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5467 #endif
5468         if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5469                 static_branch_dec(&memcg_sockets_enabled_key);
5470
5471         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
5472                 static_branch_dec(&memcg_sockets_enabled_key);
5473
5474 #if defined(CONFIG_MEMCG_KMEM)
5475         if (!cgroup_memory_nobpf)
5476                 static_branch_dec(&memcg_bpf_enabled_key);
5477 #endif
5478
5479         vmpressure_cleanup(&memcg->vmpressure);
5480         cancel_work_sync(&memcg->high_work);
5481         mem_cgroup_remove_from_trees(memcg);
5482         free_shrinker_info(memcg);
5483         mem_cgroup_free(memcg);
5484 }
5485
5486 /**
5487  * mem_cgroup_css_reset - reset the states of a mem_cgroup
5488  * @css: the target css
5489  *
5490  * Reset the states of the mem_cgroup associated with @css.  This is
5491  * invoked when the userland requests disabling on the default hierarchy
5492  * but the memcg is pinned through dependency.  The memcg should stop
5493  * applying policies and should revert to the vanilla state as it may be
5494  * made visible again.
5495  *
5496  * The current implementation only resets the essential configurations.
5497  * This needs to be expanded to cover all the visible parts.
5498  */
5499 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5500 {
5501         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5502
5503         page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
5504         page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
5505         page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
5506         page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
5507         page_counter_set_min(&memcg->memory, 0);
5508         page_counter_set_low(&memcg->memory, 0);
5509         page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5510         WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
5511         page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5512         memcg_wb_domain_size_changed(memcg);
5513 }
5514
/*
 * rstat flush callback for one (memcg, cpu) pair.
 *
 * For each counter (memcg state, memcg events, per-node lruvec state) this
 * takes the delta queued by children in *_pending, adds the change of the
 * given CPU's per-cpu counter since the last flush (tracked in *_prev),
 * folds the sum into this memcg's aggregated value, and queues the same
 * delta in the parent's *_pending slot so it propagates upwards.
 */
5515 static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
5516 {
5517         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5518         struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5519         struct memcg_vmstats_percpu *statc;
5520         long delta, v;
5521         int i, nid;
5522
5523         statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
5524
5525         for (i = 0; i < MEMCG_NR_STAT; i++) {
5526                 /*
5527                  * Collect the aggregated propagation counts of groups
5528                  * below us. We're in a per-cpu loop here and this is
5529                  * a global counter, so the first cycle will get them.
5530                  */
5531                 delta = memcg->vmstats->state_pending[i];
5532                 if (delta)
5533                         memcg->vmstats->state_pending[i] = 0;
5534
5535                 /* Add CPU changes on this level since the last flush */
5536                 v = READ_ONCE(statc->state[i]);
5537                 if (v != statc->state_prev[i]) {
5538                         delta += v - statc->state_prev[i];
5539                         statc->state_prev[i] = v;
5540                 }
5541
5542                 if (!delta)
5543                         continue;
5544
5545                 /* Aggregate counts on this level and propagate upwards */
5546                 memcg->vmstats->state[i] += delta;
5547                 if (parent)
5548                         parent->vmstats->state_pending[i] += delta;
5549         }
5550
             /* Same scheme as above, applied to the event counters. */
5551         for (i = 0; i < NR_MEMCG_EVENTS; i++) {
5552                 delta = memcg->vmstats->events_pending[i];
5553                 if (delta)
5554                         memcg->vmstats->events_pending[i] = 0;
5555
5556                 v = READ_ONCE(statc->events[i]);
5557                 if (v != statc->events_prev[i]) {
5558                         delta += v - statc->events_prev[i];
5559                         statc->events_prev[i] = v;
5560                 }
5561
5562                 if (!delta)
5563                         continue;
5564
5565                 memcg->vmstats->events[i] += delta;
5566                 if (parent)
5567                         parent->vmstats->events_pending[i] += delta;
5568         }
5569
             /* Same scheme again for the per-node lruvec stats of N_MEMORY nodes. */
5570         for_each_node_state(nid, N_MEMORY) {
5571                 struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
5572                 struct mem_cgroup_per_node *ppn = NULL;
5573                 struct lruvec_stats_percpu *lstatc;
5574
                     /* ppn stays NULL when there is no parent to propagate to. */
5575                 if (parent)
5576                         ppn = parent->nodeinfo[nid];
5577
5578                 lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
5579
5580                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
5581                         delta = pn->lruvec_stats.state_pending[i];
5582                         if (delta)
5583                                 pn->lruvec_stats.state_pending[i] = 0;
5584
5585                         v = READ_ONCE(lstatc->state[i]);
5586                         if (v != lstatc->state_prev[i]) {
5587                                 delta += v - lstatc->state_prev[i];
5588                                 lstatc->state_prev[i] = v;
5589                         }
5590
5591                         if (!delta)
5592                                 continue;
5593
5594                         pn->lruvec_stats.state[i] += delta;
5595                         if (ppn)
5596                                 ppn->lruvec_stats.state_pending[i] += delta;
5597                 }
5598         }
5599 }
5600
5601 #ifdef CONFIG_MMU
5602 /* Handlers for move charge at task migration. */
5603 static int mem_cgroup_do_precharge(unsigned long count)
5604 {
5605         int ret;
5606
5607         /* Try a single bulk charge without reclaim first, kswapd may wake */
5608         ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
5609         if (!ret) {
5610                 mc.precharge += count;
5611                 return ret;
5612         }
5613
5614         /* Try charges one by one with reclaim, but do not retry */
5615         while (count--) {
5616                 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
5617                 if (ret)
5618                         return ret;
5619                 mc.precharge++;
5620                 cond_resched();
5621         }
5622         return 0;
5623 }
5624
/*
 * Output slot of get_mctgt_type(): the page for a page target, or the
 * swap entry for a swap target.
 */
5625 union mc_target {
5626         struct page     *page;
5627         swp_entry_t     ent;
5628 };
5629
/* Classification of a pte for charge moving, as returned by get_mctgt_type(). */
5630 enum mc_target_type {
5631         MC_TARGET_NONE = 0,     /* pte is not a target for move charge */
5632         MC_TARGET_PAGE,         /* the page behind the pte is a target */
5633         MC_TARGET_SWAP,         /* the swap entry in the pte is a target */
5634         MC_TARGET_DEVICE,       /* like MC_TARGET_PAGE, but device memory (not on the lru) */
5635 };
5636
5637 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5638                                                 unsigned long addr, pte_t ptent)
5639 {
5640         struct page *page = vm_normal_page(vma, addr, ptent);
5641
5642         if (!page || !page_mapped(page))
5643                 return NULL;
5644         if (PageAnon(page)) {
5645                 if (!(mc.flags & MOVE_ANON))
5646                         return NULL;
5647         } else {
5648                 if (!(mc.flags & MOVE_FILE))
5649                         return NULL;
5650         }
5651         if (!get_page_unless_zero(page))
5652                 return NULL;
5653
5654         return page;
5655 }
5656
5657 #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
5658 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5659                         pte_t ptent, swp_entry_t *entry)
5660 {
5661         struct page *page = NULL;
5662         swp_entry_t ent = pte_to_swp_entry(ptent);
5663
5664         if (!(mc.flags & MOVE_ANON))
5665                 return NULL;
5666
5667         /*
5668          * Handle device private pages that are not accessible by the CPU, but
5669          * stored as special swap entries in the page table.
5670          */
5671         if (is_device_private_entry(ent)) {
5672                 page = pfn_swap_entry_to_page(ent);
5673                 if (!get_page_unless_zero(page))
5674                         return NULL;
5675                 return page;
5676         }
5677
5678         if (non_swap_entry(ent))
5679                 return NULL;
5680
5681         /*
5682          * Because swap_cache_get_folio() updates some statistics counter,
5683          * we call find_get_page() with swapper_space directly.
5684          */
5685         page = find_get_page(swap_address_space(ent), swp_offset(ent));
5686         entry->val = ent.val;
5687
5688         return page;
5689 }
5690 #else
5691 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5692                         pte_t ptent, swp_entry_t *entry)
5693 {
5694         return NULL;
5695 }
5696 #endif
5697
5698 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5699                         unsigned long addr, pte_t ptent)
5700 {
5701         unsigned long index;
5702         struct folio *folio;
5703
5704         if (!vma->vm_file) /* anonymous vma */
5705                 return NULL;
5706         if (!(mc.flags & MOVE_FILE))
5707                 return NULL;
5708
5709         /* folio is moved even if it's not RSS of this task(page-faulted). */
5710         /* shmem/tmpfs may report page out on swap: account for that too. */
5711         index = linear_page_index(vma, addr);
5712         folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index);
5713         if (IS_ERR(folio))
5714                 return NULL;
5715         return folio_file_page(folio, index);
5716 }
5717
5718 /**
5719  * mem_cgroup_move_account - move account of the page
5720  * @page: the page
5721  * @compound: charge the page as compound or small page
5722  * @from: mem_cgroup which the page is moved from.
5723  * @to: mem_cgroup which the page is moved to. @from != @to.
5724  *
5725  * The page must be locked and not on the LRU.
5726  *
5727  * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
5728  * from old cgroup.
5729  */
5730 static int mem_cgroup_move_account(struct page *page,
5731                                    bool compound,
5732                                    struct mem_cgroup *from,
5733                                    struct mem_cgroup *to)
5734 {
5735         struct folio *folio = page_folio(page);
5736         struct lruvec *from_vec, *to_vec;
5737         struct pglist_data *pgdat;
             /* Whole folio when moving as compound, otherwise a single page. */
5738         unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
5739         int nid, ret;
5740
5741         VM_BUG_ON(from == to);
5742         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
5743         VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
5744         VM_BUG_ON(compound && !folio_test_large(folio));
5745
             /* Returns -EINVAL if the folio is not (or no longer) charged to @from. */
5746         ret = -EINVAL;
5747         if (folio_memcg(folio) != from)
5748                 goto out;
5749
5750         pgdat = folio_pgdat(folio);
5751         from_vec = mem_cgroup_lruvec(from, pgdat);
5752         to_vec = mem_cgroup_lruvec(to, pgdat);
5753
5754         folio_memcg_lock(folio);
5755
             /*
              * Transfer every per-lruvec stat this folio currently contributes
              * to: subtract from @from's lruvec and add to @to's.
              */
5756         if (folio_test_anon(folio)) {
5757                 if (folio_mapped(folio)) {
5758                         __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
5759                         __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
5760                         if (folio_test_transhuge(folio)) {
5761                                 __mod_lruvec_state(from_vec, NR_ANON_THPS,
5762                                                    -nr_pages);
5763                                 __mod_lruvec_state(to_vec, NR_ANON_THPS,
5764                                                    nr_pages);
5765                         }
5766                 }
5767         } else {
5768                 __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
5769                 __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
5770
5771                 if (folio_test_swapbacked(folio)) {
5772                         __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
5773                         __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
5774                 }
5775
5776                 if (folio_mapped(folio)) {
5777                         __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5778                         __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5779                 }
5780
5781                 if (folio_test_dirty(folio)) {
5782                         struct address_space *mapping = folio_mapping(folio);
5783
                             /* Dirty pages are only moved for mappings that can write back. */
5784                         if (mapping_can_writeback(mapping)) {
5785                                 __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
5786                                                    -nr_pages);
5787                                 __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
5788                                                    nr_pages);
5789                         }
5790                 }
5791         }
5792
5793 #ifdef CONFIG_SWAP
5794         if (folio_test_swapcache(folio)) {
5795                 __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages);
5796                 __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages);
5797         }
5798 #endif
5799         if (folio_test_writeback(folio)) {
5800                 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5801                 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
5802         }
5803
5804         /*
5805          * All state has been migrated, let's switch to the new memcg.
5806          *
5807          * It is safe to change page's memcg here because the page
5808          * is referenced, charged, isolated, and locked: we can't race
5809          * with (un)charging, migration, LRU putback, or anything else
5810          * that would rely on a stable page's memory cgroup.
5811          *
5812          * Note that folio_memcg_lock is a memcg lock, not a page lock,
5813          * to save space. As soon as we switch page's memory cgroup to a
5814          * new memcg that isn't locked, the above state can change
5815          * concurrently again. Make sure we're truly done with it.
5816          */
5817         smp_mb();
5818
             /* Take a css reference on @to and drop one on @from. */
5819         css_get(&to->css);
5820         css_put(&from->css);
5821
5822         folio->memcg_data = (unsigned long)to;
5823
             /* The lock was taken while the folio pointed at @from, so unlock @from. */
5824         __folio_memcg_unlock(from);
5825
5826         ret = 0;
5827         nid = folio_nid(folio);
5828
             /* Charge statistics and event checks run with local IRQs disabled. */
5829         local_irq_disable();
5830         mem_cgroup_charge_statistics(to, nr_pages);
5831         memcg_check_events(to, nid);
5832         mem_cgroup_charge_statistics(from, -nr_pages);
5833         memcg_check_events(from, nid);
5834         local_irq_enable();
5835 out:
5836         return ret;
5837 }
5838
5839 /**
5840  * get_mctgt_type - get target type of moving charge
5841  * @vma: the vma the pte to be checked belongs
5842  * @addr: the address corresponding to the pte to be checked
5843  * @ptent: the pte to be checked
5844  * @target: the pointer the target page or swap ent will be stored(can be NULL)
5845  *
5846  * Returns
5847  *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
5848  *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
5849  *     move charge. if @target is not NULL, the page is stored in target->page
5850  *     with extra refcnt got(Callers should handle it).
5851  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
5852  *     target for charge migration. if @target is not NULL, the entry is stored
5853  *     in target->ent.
5854  *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is device memory and
5855  *   thus not on the lru.
5856  *     For now such a page is charged like a regular page would be, as for all
5857  *     intents and purposes it is just special memory taking the place of a
5858  *     regular page.
5859  *
5860  *     See Documentations/vm/hmm.txt and include/linux/hmm.h
5861  *
5862  * Called with pte lock held.
5863  */
5864
5865 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5866                 unsigned long addr, pte_t ptent, union mc_target *target)
5867 {
5868         struct page *page = NULL;
5869         enum mc_target_type ret = MC_TARGET_NONE;
5870         swp_entry_t ent = { .val = 0 };
5871
5872         if (pte_present(ptent))
5873                 page = mc_handle_present_pte(vma, addr, ptent);
5874         else if (pte_none_mostly(ptent))
5875                 /*
5876                  * PTE markers should be treated as a none pte here, separated
5877                  * from other swap handling below.
5878                  */
5879                 page = mc_handle_file_pte(vma, addr, ptent);
5880         else if (is_swap_pte(ptent))
5881                 page = mc_handle_swap_pte(vma, ptent, &ent);
5882
5883         if (target && page) {
5884                 if (!trylock_page(page)) {
5885                         put_page(page);
5886                         return ret;
5887                 }
5888                 /*
5889                  * page_mapped() must be stable during the move. This
5890                  * pte is locked, so if it's present, the page cannot
5891                  * become unmapped. If it isn't, we have only partial
5892                  * control over the mapped state: the page lock will
5893                  * prevent new faults against pagecache and swapcache,
5894                  * so an unmapped page cannot become mapped. However,
5895                  * if the page is already mapped elsewhere, it can
5896                  * unmap, and there is nothing we can do about it.
5897                  * Alas, skip moving the page in this case.
5898                  */
5899                 if (!pte_present(ptent) && page_mapped(page)) {
5900                         unlock_page(page);
5901                         put_page(page);
5902                         return ret;
5903                 }
5904         }
5905
5906         if (!page && !ent.val)
5907                 return ret;
5908         if (page) {
5909                 /*
5910                  * Do only loose check w/o serialization.
5911                  * mem_cgroup_move_account() checks the page is valid or
5912                  * not under LRU exclusion.
5913                  */
5914                 if (page_memcg(page) == mc.from) {
5915                         ret = MC_TARGET_PAGE;
5916                         if (is_device_private_page(page) ||
5917                             is_device_coherent_page(page))
5918                                 ret = MC_TARGET_DEVICE;
5919                         if (target)
5920                                 target->page = page;
5921                 }
5922                 if (!ret || !target) {
5923                         if (target)
5924                                 unlock_page(page);
5925                         put_page(page);
5926                 }
5927         }
5928         /*
5929          * There is a swap entry and a page doesn't exist or isn't charged.
5930          * But we cannot move a tail-page in a THP.
5931          */
5932         if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
5933             mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
5934                 ret = MC_TARGET_SWAP;
5935                 if (target)
5936                         target->ent = ent;
5937         }
5938         return ret;
5939 }
5940
5941 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5942 /*
5943  * We don't consider PMD mapped swapping or file mapped pages because THP does
5944  * not support them for now.
5945  * Caller should make sure that pmd_trans_huge(pmd) is true.
5946  */
5947 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5948                 unsigned long addr, pmd_t pmd, union mc_target *target)
5949 {
5950         struct page *page = NULL;
5951         enum mc_target_type ret = MC_TARGET_NONE;
5952
5953         if (unlikely(is_swap_pmd(pmd))) {
5954                 VM_BUG_ON(thp_migration_supported() &&
5955                                   !is_pmd_migration_entry(pmd));
5956                 return ret;
5957         }
5958         page = pmd_page(pmd);
5959         VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5960         if (!(mc.flags & MOVE_ANON))
5961                 return ret;
5962         if (page_memcg(page) == mc.from) {
5963                 ret = MC_TARGET_PAGE;
5964                 if (target) {
5965                         get_page(page);
5966                         if (!trylock_page(page)) {
5967                                 put_page(page);
5968                                 return MC_TARGET_NONE;
5969                         }
5970                         target->page = page;
5971                 }
5972         }
5973         return ret;
5974 }
5975 #else
5976 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5977                 unsigned long addr, pmd_t pmd, union mc_target *target)
5978 {
5979         return MC_TARGET_NONE;
5980 }
5981 #endif
5982
5983 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5984                                         unsigned long addr, unsigned long end,
5985                                         struct mm_walk *walk)
5986 {
5987         struct vm_area_struct *vma = walk->vma;
5988         pte_t *pte;
5989         spinlock_t *ptl;
5990
5991         ptl = pmd_trans_huge_lock(pmd, vma);
5992         if (ptl) {
5993                 /*
5994                  * Note their can not be MC_TARGET_DEVICE for now as we do not
5995                  * support transparent huge page with MEMORY_DEVICE_PRIVATE but
5996                  * this might change.
5997                  */
5998                 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5999                         mc.precharge += HPAGE_PMD_NR;
6000                 spin_unlock(ptl);
6001                 return 0;
6002         }
6003
6004         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6005         if (!pte)
6006                 return 0;
6007         for (; addr != end; pte++, addr += PAGE_SIZE)
6008                 if (get_mctgt_type(vma, addr, ptep_get(pte), NULL))
6009                         mc.precharge++; /* increment precharge temporarily */
6010         pte_unmap_unlock(pte - 1, ptl);
6011         cond_resched();
6012
6013         return 0;
6014 }
6015
6016 static const struct mm_walk_ops precharge_walk_ops = {
6017         .pmd_entry      = mem_cgroup_count_precharge_pte_range,
6018 };
6019
6020 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
6021 {
6022         unsigned long precharge;
6023
6024         mmap_read_lock(mm);
6025         walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL);
6026         mmap_read_unlock(mm);
6027
6028         precharge = mc.precharge;
6029         mc.precharge = 0;
6030
6031         return precharge;
6032 }
6033
6034 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
6035 {
6036         unsigned long precharge = mem_cgroup_count_precharge(mm);
6037
6038         VM_BUG_ON(mc.moving_task);
6039         mc.moving_task = current;
6040         return mem_cgroup_do_precharge(precharge);
6041 }
6042
6043 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
6044 static void __mem_cgroup_clear_mc(void)
6045 {
6046         struct mem_cgroup *from = mc.from;
6047         struct mem_cgroup *to = mc.to;
6048
6049         /* we must uncharge all the leftover precharges from mc.to */
6050         if (mc.precharge) {
6051                 cancel_charge(mc.to, mc.precharge);
6052                 mc.precharge = 0;
6053         }
6054         /*
6055          * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
6056          * we must uncharge here.
6057          */
6058         if (mc.moved_charge) {
6059                 cancel_charge(mc.from, mc.moved_charge);
6060                 mc.moved_charge = 0;
6061         }
6062         /* we must fixup refcnts and charges */
6063         if (mc.moved_swap) {
6064                 /* uncharge swap account from the old cgroup */
6065                 if (!mem_cgroup_is_root(mc.from))
6066                         page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
6067
6068                 mem_cgroup_id_put_many(mc.from, mc.moved_swap);
6069
6070                 /*
6071                  * we charged both to->memory and to->memsw, so we
6072                  * should uncharge to->memory.
6073                  */
6074                 if (!mem_cgroup_is_root(mc.to))
6075                         page_counter_uncharge(&mc.to->memory, mc.moved_swap);
6076
6077                 mc.moved_swap = 0;
6078         }
6079         memcg_oom_recover(from);
6080         memcg_oom_recover(to);
6081         wake_up_all(&mc.waitq);
6082 }
6083
6084 static void mem_cgroup_clear_mc(void)
6085 {
6086         struct mm_struct *mm = mc.mm;
6087
6088         /*
6089          * we must clear moving_task before waking up waiters at the end of
6090          * task migration.
6091          */
6092         mc.moving_task = NULL;
6093         __mem_cgroup_clear_mc();
6094         spin_lock(&mc.lock);
6095         mc.from = NULL;
6096         mc.to = NULL;
6097         mc.mm = NULL;
6098         spin_unlock(&mc.lock);
6099
6100         mmput(mm);
6101 }
6102
6103 static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
6104 {
6105         struct cgroup_subsys_state *css;
6106         struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
6107         struct mem_cgroup *from;
6108         struct task_struct *leader, *p;
6109         struct mm_struct *mm;
6110         unsigned long move_flags;
6111         int ret = 0;
6112
6113         /* charge immigration isn't supported on the default hierarchy */
6114         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
6115                 return 0;
6116
6117         /*
6118          * Multi-process migrations only happen on the default hierarchy
6119          * where charge immigration is not used.  Perform charge
6120          * immigration if @tset contains a leader and whine if there are
6121          * multiple.
6122          */
6123         p = NULL;
6124         cgroup_taskset_for_each_leader(leader, css, tset) {
6125                 WARN_ON_ONCE(p);
6126                 p = leader;
6127                 memcg = mem_cgroup_from_css(css);
6128         }
6129         if (!p)
6130                 return 0;
6131
6132         /*
6133          * We are now committed to this value whatever it is. Changes in this
6134          * tunable will only affect upcoming migrations, not the current one.
6135          * So we need to save it, and keep it going.
6136          */
6137         move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
6138         if (!move_flags)
6139                 return 0;
6140
6141         from = mem_cgroup_from_task(p);
6142
6143         VM_BUG_ON(from == memcg);
6144
6145         mm = get_task_mm(p);
6146         if (!mm)
6147                 return 0;
6148         /* We move charges only when we move a owner of the mm */
6149         if (mm->owner == p) {
6150                 VM_BUG_ON(mc.from);
6151                 VM_BUG_ON(mc.to);
6152                 VM_BUG_ON(mc.precharge);
6153                 VM_BUG_ON(mc.moved_charge);
6154                 VM_BUG_ON(mc.moved_swap);
6155
6156                 spin_lock(&mc.lock);
6157                 mc.mm = mm;
6158                 mc.from = from;
6159                 mc.to = memcg;
6160                 mc.flags = move_flags;
6161                 spin_unlock(&mc.lock);
6162                 /* We set mc.moving_task later */
6163
6164                 ret = mem_cgroup_precharge_mc(mm);
6165                 if (ret)
6166                         mem_cgroup_clear_mc();
6167         } else {
6168                 mmput(mm);
6169         }
6170         return ret;
6171 }
6172
6173 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
6174 {
6175         if (mc.to)
6176                 mem_cgroup_clear_mc();
6177 }
6178
6179 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6180                                 unsigned long addr, unsigned long end,
6181                                 struct mm_walk *walk)
6182 {
6183         int ret = 0;
6184         struct vm_area_struct *vma = walk->vma;
6185         pte_t *pte;
6186         spinlock_t *ptl;
6187         enum mc_target_type target_type;
6188         union mc_target target;
6189         struct page *page;
6190
6191         ptl = pmd_trans_huge_lock(pmd, vma);
6192         if (ptl) {
6193                 if (mc.precharge < HPAGE_PMD_NR) {
6194                         spin_unlock(ptl);
6195                         return 0;
6196                 }
6197                 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
6198                 if (target_type == MC_TARGET_PAGE) {
6199                         page = target.page;
6200                         if (isolate_lru_page(page)) {
6201                                 if (!mem_cgroup_move_account(page, true,
6202                                                              mc.from, mc.to)) {
6203                                         mc.precharge -= HPAGE_PMD_NR;
6204                                         mc.moved_charge += HPAGE_PMD_NR;
6205                                 }
6206                                 putback_lru_page(page);
6207                         }
6208                         unlock_page(page);
6209                         put_page(page);
6210                 } else if (target_type == MC_TARGET_DEVICE) {
6211                         page = target.page;
6212                         if (!mem_cgroup_move_account(page, true,
6213                                                      mc.from, mc.to)) {
6214                                 mc.precharge -= HPAGE_PMD_NR;
6215                                 mc.moved_charge += HPAGE_PMD_NR;
6216                         }
6217                         unlock_page(page);
6218                         put_page(page);
6219                 }
6220                 spin_unlock(ptl);
6221                 return 0;
6222         }
6223
6224 retry:
6225         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6226         if (!pte)
6227                 return 0;
6228         for (; addr != end; addr += PAGE_SIZE) {
6229                 pte_t ptent = ptep_get(pte++);
6230                 bool device = false;
6231                 swp_entry_t ent;
6232
6233                 if (!mc.precharge)
6234                         break;
6235
6236                 switch (get_mctgt_type(vma, addr, ptent, &target)) {
6237                 case MC_TARGET_DEVICE:
6238                         device = true;
6239                         fallthrough;
6240                 case MC_TARGET_PAGE:
6241                         page = target.page;
6242                         /*
6243                          * We can have a part of the split pmd here. Moving it
6244                          * can be done but it would be too convoluted so simply
6245                          * ignore such a partial THP and keep it in original
6246                          * memcg. There should be somebody mapping the head.
6247                          */
6248                         if (PageTransCompound(page))
6249                                 goto put;
6250                         if (!device && !isolate_lru_page(page))
6251                                 goto put;
6252                         if (!mem_cgroup_move_account(page, false,
6253                                                 mc.from, mc.to)) {
6254                                 mc.precharge--;
6255                                 /* we uncharge from mc.from later. */
6256                                 mc.moved_charge++;
6257                         }
6258                         if (!device)
6259                                 putback_lru_page(page);
6260 put:                    /* get_mctgt_type() gets & locks the page */
6261                         unlock_page(page);
6262                         put_page(page);
6263                         break;
6264                 case MC_TARGET_SWAP:
6265                         ent = target.ent;
6266                         if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
6267                                 mc.precharge--;
6268                                 mem_cgroup_id_get_many(mc.to, 1);
6269                                 /* we fixup other refcnts and charges later. */
6270                                 mc.moved_swap++;
6271                         }
6272                         break;
6273                 default:
6274                         break;
6275                 }
6276         }
6277         pte_unmap_unlock(pte - 1, ptl);
6278         cond_resched();
6279
6280         if (addr != end) {
6281                 /*
6282                  * We have consumed all precharges we got in can_attach().
6283                  * We try charge one by one, but don't do any additional
6284                  * charges to mc.to if we have failed in charge once in attach()
6285                  * phase.
6286                  */
6287                 ret = mem_cgroup_do_precharge(1);
6288                 if (!ret)
6289                         goto retry;
6290         }
6291
6292         return ret;
6293 }
6294
6295 static const struct mm_walk_ops charge_walk_ops = {
6296         .pmd_entry      = mem_cgroup_move_charge_pte_range,
6297 };
6298
6299 static void mem_cgroup_move_charge(void)
6300 {
6301         lru_add_drain_all();
6302         /*
6303          * Signal folio_memcg_lock() to take the memcg's move_lock
6304          * while we're moving its pages to another memcg. Then wait
6305          * for already started RCU-only updates to finish.
6306          */
6307         atomic_inc(&mc.from->moving_account);
6308         synchronize_rcu();
6309 retry:
6310         if (unlikely(!mmap_read_trylock(mc.mm))) {
6311                 /*
6312                  * Someone who are holding the mmap_lock might be waiting in
6313                  * waitq. So we cancel all extra charges, wake up all waiters,
6314                  * and retry. Because we cancel precharges, we might not be able
6315                  * to move enough charges, but moving charge is a best-effort
6316                  * feature anyway, so it wouldn't be a big problem.
6317                  */
6318                 __mem_cgroup_clear_mc();
6319                 cond_resched();
6320                 goto retry;
6321         }
6322         /*
6323          * When we have consumed all precharges and failed in doing
6324          * additional charge, the page walk just aborts.
6325          */
6326         walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL);
6327         mmap_read_unlock(mc.mm);
6328         atomic_dec(&mc.from->moving_account);
6329 }
6330
6331 static void mem_cgroup_move_task(void)
6332 {
6333         if (mc.to) {
6334                 mem_cgroup_move_charge();
6335                 mem_cgroup_clear_mc();
6336         }
6337 }
6338 #else   /* !CONFIG_MMU */
/* !CONFIG_MMU: charge moving is compiled out, so these are no-ops. */
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
	return 0;
}

static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
}

static void mem_cgroup_move_task(void)
{
}
6349 #endif
6350
6351 #ifdef CONFIG_LRU_GEN
6352 static void mem_cgroup_attach(struct cgroup_taskset *tset)
6353 {
6354         struct task_struct *task;
6355         struct cgroup_subsys_state *css;
6356
6357         /* find the first leader if there is any */
6358         cgroup_taskset_for_each_leader(task, css, tset)
6359                 break;
6360
6361         if (!task)
6362                 return;
6363
6364         task_lock(task);
6365         if (task->mm && READ_ONCE(task->mm->owner) == task)
6366                 lru_gen_migrate_mm(task->mm);
6367         task_unlock(task);
6368 }
6369 #else
/* !CONFIG_LRU_GEN: nothing to do on attach. */
static void mem_cgroup_attach(struct cgroup_taskset *tset)
{
}
6373 #endif /* CONFIG_LRU_GEN */
6374
6375 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6376 {
6377         if (value == PAGE_COUNTER_MAX)
6378                 seq_puts(m, "max\n");
6379         else
6380                 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6381
6382         return 0;
6383 }
6384
6385 static u64 memory_current_read(struct cgroup_subsys_state *css,
6386                                struct cftype *cft)
6387 {
6388         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6389
6390         return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
6391 }
6392
6393 static u64 memory_peak_read(struct cgroup_subsys_state *css,
6394                             struct cftype *cft)
6395 {
6396         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6397
6398         return (u64)memcg->memory.watermark * PAGE_SIZE;
6399 }
6400
6401 static int memory_min_show(struct seq_file *m, void *v)
6402 {
6403         return seq_puts_memcg_tunable(m,
6404                 READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
6405 }
6406
6407 static ssize_t memory_min_write(struct kernfs_open_file *of,
6408                                 char *buf, size_t nbytes, loff_t off)
6409 {
6410         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6411         unsigned long min;
6412         int err;
6413
6414         buf = strstrip(buf);
6415         err = page_counter_memparse(buf, "max", &min);
6416         if (err)
6417                 return err;
6418
6419         page_counter_set_min(&memcg->memory, min);
6420
6421         return nbytes;
6422 }
6423
6424 static int memory_low_show(struct seq_file *m, void *v)
6425 {
6426         return seq_puts_memcg_tunable(m,
6427                 READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
6428 }
6429
6430 static ssize_t memory_low_write(struct kernfs_open_file *of,
6431                                 char *buf, size_t nbytes, loff_t off)
6432 {
6433         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6434         unsigned long low;
6435         int err;
6436
6437         buf = strstrip(buf);
6438         err = page_counter_memparse(buf, "max", &low);
6439         if (err)
6440                 return err;
6441
6442         page_counter_set_low(&memcg->memory, low);
6443
6444         return nbytes;
6445 }
6446
6447 static int memory_high_show(struct seq_file *m, void *v)
6448 {
6449         return seq_puts_memcg_tunable(m,
6450                 READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
6451 }
6452
6453 static ssize_t memory_high_write(struct kernfs_open_file *of,
6454                                  char *buf, size_t nbytes, loff_t off)
6455 {
6456         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6457         unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6458         bool drained = false;
6459         unsigned long high;
6460         int err;
6461
6462         buf = strstrip(buf);
6463         err = page_counter_memparse(buf, "max", &high);
6464         if (err)
6465                 return err;
6466
6467         page_counter_set_high(&memcg->memory, high);
6468
6469         for (;;) {
6470                 unsigned long nr_pages = page_counter_read(&memcg->memory);
6471                 unsigned long reclaimed;
6472
6473                 if (nr_pages <= high)
6474                         break;
6475
6476                 if (signal_pending(current))
6477                         break;
6478
6479                 if (!drained) {
6480                         drain_all_stock(memcg);
6481                         drained = true;
6482                         continue;
6483                 }
6484
6485                 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6486                                         GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
6487
6488                 if (!reclaimed && !nr_retries--)
6489                         break;
6490         }
6491
6492         memcg_wb_domain_size_changed(memcg);
6493         return nbytes;
6494 }
6495
6496 static int memory_max_show(struct seq_file *m, void *v)
6497 {
6498         return seq_puts_memcg_tunable(m,
6499                 READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
6500 }
6501
6502 static ssize_t memory_max_write(struct kernfs_open_file *of,
6503                                 char *buf, size_t nbytes, loff_t off)
6504 {
6505         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6506         unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
6507         bool drained = false;
6508         unsigned long max;
6509         int err;
6510
6511         buf = strstrip(buf);
6512         err = page_counter_memparse(buf, "max", &max);
6513         if (err)
6514                 return err;
6515
6516         xchg(&memcg->memory.max, max);
6517
6518         for (;;) {
6519                 unsigned long nr_pages = page_counter_read(&memcg->memory);
6520
6521                 if (nr_pages <= max)
6522                         break;
6523
6524                 if (signal_pending(current))
6525                         break;
6526
6527                 if (!drained) {
6528                         drain_all_stock(memcg);
6529                         drained = true;
6530                         continue;
6531                 }
6532
6533                 if (nr_reclaims) {
6534                         if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
6535                                         GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
6536                                 nr_reclaims--;
6537                         continue;
6538                 }
6539
6540                 memcg_memory_event(memcg, MEMCG_OOM);
6541                 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
6542                         break;
6543         }
6544
6545         memcg_wb_domain_size_changed(memcg);
6546         return nbytes;
6547 }
6548
6549 static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6550 {
6551         seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6552         seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6553         seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6554         seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6555         seq_printf(m, "oom_kill %lu\n",
6556                    atomic_long_read(&events[MEMCG_OOM_KILL]));
6557         seq_printf(m, "oom_group_kill %lu\n",
6558                    atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
6559 }
6560
6561 static int memory_events_show(struct seq_file *m, void *v)
6562 {
6563         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6564
6565         __memory_events_show(m, memcg->memory_events);
6566         return 0;
6567 }
6568
6569 static int memory_events_local_show(struct seq_file *m, void *v)
6570 {
6571         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6572
6573         __memory_events_show(m, memcg->memory_events_local);
6574         return 0;
6575 }
6576
6577 static int memory_stat_show(struct seq_file *m, void *v)
6578 {
6579         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6580         char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
6581         struct seq_buf s;
6582
6583         if (!buf)
6584                 return -ENOMEM;
6585         seq_buf_init(&s, buf, PAGE_SIZE);
6586         memory_stat_format(memcg, &s);
6587         seq_puts(m, buf);
6588         kfree(buf);
6589         return 0;
6590 }
6591
6592 #ifdef CONFIG_NUMA
/* Return the lruvec's state for @item multiplied by that item's unit. */
static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
						     int item)
{
	unsigned long state = lruvec_page_state(lruvec, item);

	return state * memcg_page_state_unit(item);
}
6598
6599 static int memory_numa_stat_show(struct seq_file *m, void *v)
6600 {
6601         int i;
6602         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6603
6604         mem_cgroup_flush_stats();
6605
6606         for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
6607                 int nid;
6608
6609                 if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
6610                         continue;
6611
6612                 seq_printf(m, "%s", memory_stats[i].name);
6613                 for_each_node_state(nid, N_MEMORY) {
6614                         u64 size;
6615                         struct lruvec *lruvec;
6616
6617                         lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
6618                         size = lruvec_page_state_output(lruvec,
6619                                                         memory_stats[i].idx);
6620                         seq_printf(m, " N%d=%llu", nid, size);
6621                 }
6622                 seq_putc(m, '\n');
6623         }
6624
6625         return 0;
6626 }
6627 #endif
6628
6629 static int memory_oom_group_show(struct seq_file *m, void *v)
6630 {
6631         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6632
6633         seq_printf(m, "%d\n", READ_ONCE(memcg->oom_group));
6634
6635         return 0;
6636 }
6637
6638 static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
6639                                       char *buf, size_t nbytes, loff_t off)
6640 {
6641         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6642         int ret, oom_group;
6643
6644         buf = strstrip(buf);
6645         if (!buf)
6646                 return -EINVAL;
6647
6648         ret = kstrtoint(buf, 0, &oom_group);
6649         if (ret)
6650                 return ret;
6651
6652         if (oom_group != 0 && oom_group != 1)
6653                 return -EINVAL;
6654
6655         WRITE_ONCE(memcg->oom_group, oom_group);
6656
6657         return nbytes;
6658 }
6659
6660 static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
6661                               size_t nbytes, loff_t off)
6662 {
6663         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6664         unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6665         unsigned long nr_to_reclaim, nr_reclaimed = 0;
6666         unsigned int reclaim_options;
6667         int err;
6668
6669         buf = strstrip(buf);
6670         err = page_counter_memparse(buf, "", &nr_to_reclaim);
6671         if (err)
6672                 return err;
6673
6674         reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
6675         while (nr_reclaimed < nr_to_reclaim) {
6676                 unsigned long reclaimed;
6677
6678                 if (signal_pending(current))
6679                         return -EINTR;
6680
6681                 /*
6682                  * This is the final attempt, drain percpu lru caches in the
6683                  * hope of introducing more evictable pages for
6684                  * try_to_free_mem_cgroup_pages().
6685                  */
6686                 if (!nr_retries)
6687                         lru_add_drain_all();
6688
6689                 reclaimed = try_to_free_mem_cgroup_pages(memcg,
6690                                                 nr_to_reclaim - nr_reclaimed,
6691                                                 GFP_KERNEL, reclaim_options);
6692
6693                 if (!reclaimed && !nr_retries--)
6694                         return -EAGAIN;
6695
6696                 nr_reclaimed += reclaimed;
6697         }
6698
6699         return nbytes;
6700 }
6701
/*
 * Control files of the memory controller; installed as
 * memory_cgrp_subsys.dfl_cftypes. Each entry names one file and binds the
 * handlers that serve it. Entries that set neither .read_u64 nor .seq_show
 * (e.g. "reclaim") only provide a write handler.
 */
static struct cftype memory_files[] = {
        {
                .name = "current",
                .flags = CFTYPE_NOT_ON_ROOT,
                .read_u64 = memory_current_read,
        },
        {
                .name = "peak",
                .flags = CFTYPE_NOT_ON_ROOT,
                .read_u64 = memory_peak_read,
        },
        {
                .name = "min",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = memory_min_show,
                .write = memory_min_write,
        },
        {
                .name = "low",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = memory_low_show,
                .write = memory_low_write,
        },
        {
                .name = "high",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = memory_high_show,
                .write = memory_high_write,
        },
        {
                .name = "max",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = memory_max_show,
                .write = memory_max_write,
        },
        {
                .name = "events",
                .flags = CFTYPE_NOT_ON_ROOT,
                /* offset of the per-memcg file handle inside struct mem_cgroup */
                .file_offset = offsetof(struct mem_cgroup, events_file),
                .seq_show = memory_events_show,
        },
        {
                .name = "events.local",
                .flags = CFTYPE_NOT_ON_ROOT,
                .file_offset = offsetof(struct mem_cgroup, events_local_file),
                .seq_show = memory_events_local_show,
        },
        {
                /* no CFTYPE_NOT_ON_ROOT here, unlike most entries above */
                .name = "stat",
                .seq_show = memory_stat_show,
        },
#ifdef CONFIG_NUMA
        {
                .name = "numa_stat",
                .seq_show = memory_numa_stat_show,
        },
#endif
        {
                .name = "oom.group",
                .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
                .seq_show = memory_oom_group_show,
                .write = memory_oom_group_write,
        },
        {
                /* write-only: served by memory_reclaim() */
                .name = "reclaim",
                .flags = CFTYPE_NS_DELEGATABLE,
                .write = memory_reclaim,
        },
        { }     /* terminate */
};
6772
/*
 * Memory controller descriptor: binds the css lifecycle callbacks, the task
 * attach/migration callbacks and the two control-file tables (memory_files
 * as .dfl_cftypes, mem_cgroup_legacy_files as .legacy_cftypes).
 */
struct cgroup_subsys memory_cgrp_subsys = {
        .css_alloc = mem_cgroup_css_alloc,
        .css_online = mem_cgroup_css_online,
        .css_offline = mem_cgroup_css_offline,
        .css_released = mem_cgroup_css_released,
        .css_free = mem_cgroup_css_free,
        .css_reset = mem_cgroup_css_reset,
        .css_rstat_flush = mem_cgroup_css_rstat_flush,
        .can_attach = mem_cgroup_can_attach,
        .attach = mem_cgroup_attach,
        .cancel_attach = mem_cgroup_cancel_attach,
        .post_attach = mem_cgroup_move_task,
        .dfl_cftypes = memory_files,
        .legacy_cftypes = mem_cgroup_legacy_files,
        /* explicitly zero; general setup is done by mem_cgroup_init() */
        .early_init = 0,
};
6789
6790 /*
6791  * This function calculates an individual cgroup's effective
6792  * protection which is derived from its own memory.min/low, its
6793  * parent's and siblings' settings, as well as the actual memory
6794  * distribution in the tree.
6795  *
6796  * The following rules apply to the effective protection values:
6797  *
6798  * 1. At the first level of reclaim, effective protection is equal to
6799  *    the declared protection in memory.min and memory.low.
6800  *
6801  * 2. To enable safe delegation of the protection configuration, at
6802  *    subsequent levels the effective protection is capped to the
6803  *    parent's effective protection.
6804  *
6805  * 3. To make complex and dynamic subtrees easier to configure, the
6806  *    user is allowed to overcommit the declared protection at a given
6807  *    level. If that is the case, the parent's effective protection is
6808  *    distributed to the children in proportion to how much protection
6809  *    they have declared and how much of it they are utilizing.
6810  *
6811  *    This makes distribution proportional, but also work-conserving:
6812  *    if one cgroup claims much more protection than it uses memory,
6813  *    the unused remainder is available to its siblings.
6814  *
6815  * 4. Conversely, when the declared protection is undercommitted at a
6816  *    given level, the distribution of the larger parental protection
6817  *    budget is NOT proportional. A cgroup's protection from a sibling
6818  *    is capped to its own memory.min/low setting.
6819  *
6820  * 5. However, to allow protecting recursive subtrees from each other
6821  *    without having to declare each individual cgroup's fixed share
6822  *    of the ancestor's claim to protection, any unutilized -
6823  *    "floating" - protection from up the tree is distributed in
6824  *    proportion to each cgroup's *usage*. This makes the protection
6825  *    neutral wrt sibling cgroups and lets them compete freely over
6826  *    the shared parental protection budget, but it protects the
6827  *    subtree as a whole from neighboring subtrees.
6828  *
6829  * Note that 4. and 5. are not in conflict: 4. is about protecting
6830  * against immediate siblings whereas 5. is about protecting against
6831  * neighboring subtrees.
6832  */
6833 static unsigned long effective_protection(unsigned long usage,
6834                                           unsigned long parent_usage,
6835                                           unsigned long setting,
6836                                           unsigned long parent_effective,
6837                                           unsigned long siblings_protected)
6838 {
6839         unsigned long protected;
6840         unsigned long ep;
6841
6842         protected = min(usage, setting);
6843         /*
6844          * If all cgroups at this level combined claim and use more
6845          * protection than what the parent affords them, distribute
6846          * shares in proportion to utilization.
6847          *
6848          * We are using actual utilization rather than the statically
6849          * claimed protection in order to be work-conserving: claimed
6850          * but unused protection is available to siblings that would
6851          * otherwise get a smaller chunk than what they claimed.
6852          */
6853         if (siblings_protected > parent_effective)
6854                 return protected * parent_effective / siblings_protected;
6855
6856         /*
6857          * Ok, utilized protection of all children is within what the
6858          * parent affords them, so we know whatever this child claims
6859          * and utilizes is effectively protected.
6860          *
6861          * If there is unprotected usage beyond this value, reclaim
6862          * will apply pressure in proportion to that amount.
6863          *
6864          * If there is unutilized protection, the cgroup will be fully
6865          * shielded from reclaim, but we do return a smaller value for
6866          * protection than what the group could enjoy in theory. This
6867          * is okay. With the overcommit distribution above, effective
6868          * protection is always dependent on how memory is actually
6869          * consumed among the siblings anyway.
6870          */
6871         ep = protected;
6872
6873         /*
6874          * If the children aren't claiming (all of) the protection
6875          * afforded to them by the parent, distribute the remainder in
6876          * proportion to the (unprotected) memory of each cgroup. That
6877          * way, cgroups that aren't explicitly prioritized wrt each
6878          * other compete freely over the allowance, but they are
6879          * collectively protected from neighboring trees.
6880          *
6881          * We're using unprotected memory for the weight so that if
6882          * some cgroups DO claim explicit protection, we don't protect
6883          * the same bytes twice.
6884          *
6885          * Check both usage and parent_usage against the respective
6886          * protected values. One should imply the other, but they
6887          * aren't read atomically - make sure the division is sane.
6888          */
6889         if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6890                 return ep;
6891         if (parent_effective > siblings_protected &&
6892             parent_usage > siblings_protected &&
6893             usage > protected) {
6894                 unsigned long unclaimed;
6895
6896                 unclaimed = parent_effective - siblings_protected;
6897                 unclaimed *= usage - protected;
6898                 unclaimed /= parent_usage - siblings_protected;
6899
6900                 ep += unclaimed;
6901         }
6902
6903         return ep;
6904 }
6905
6906 /**
6907  * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
6908  * @root: the top ancestor of the sub-tree being checked
6909  * @memcg: the memory cgroup to check
6910  *
6911  * WARNING: This function is not stateless! It can only be used as part
6912  *          of a top-down tree iteration, not for isolated queries.
6913  */
6914 void mem_cgroup_calculate_protection(struct mem_cgroup *root,
6915                                      struct mem_cgroup *memcg)
6916 {
6917         unsigned long usage, parent_usage;
6918         struct mem_cgroup *parent;
6919
6920         if (mem_cgroup_disabled())
6921                 return;
6922
6923         if (!root)
6924                 root = root_mem_cgroup;
6925
6926         /*
6927          * Effective values of the reclaim targets are ignored so they
6928          * can be stale. Have a look at mem_cgroup_protection for more
6929          * details.
6930          * TODO: calculation should be more robust so that we do not need
6931          * that special casing.
6932          */
6933         if (memcg == root)
6934                 return;
6935
6936         usage = page_counter_read(&memcg->memory);
6937         if (!usage)
6938                 return;
6939
6940         parent = parent_mem_cgroup(memcg);
6941
6942         if (parent == root) {
6943                 memcg->memory.emin = READ_ONCE(memcg->memory.min);
6944                 memcg->memory.elow = READ_ONCE(memcg->memory.low);
6945                 return;
6946         }
6947
6948         parent_usage = page_counter_read(&parent->memory);
6949
6950         WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6951                         READ_ONCE(memcg->memory.min),
6952                         READ_ONCE(parent->memory.emin),
6953                         atomic_long_read(&parent->memory.children_min_usage)));
6954
6955         WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6956                         READ_ONCE(memcg->memory.low),
6957                         READ_ONCE(parent->memory.elow),
6958                         atomic_long_read(&parent->memory.children_low_usage)));
6959 }
6960
6961 static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
6962                         gfp_t gfp)
6963 {
6964         long nr_pages = folio_nr_pages(folio);
6965         int ret;
6966
6967         ret = try_charge(memcg, gfp, nr_pages);
6968         if (ret)
6969                 goto out;
6970
6971         css_get(&memcg->css);
6972         commit_charge(folio, memcg);
6973
6974         local_irq_disable();
6975         mem_cgroup_charge_statistics(memcg, nr_pages);
6976         memcg_check_events(memcg, folio_nid(folio));
6977         local_irq_enable();
6978 out:
6979         return ret;
6980 }
6981
6982 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
6983 {
6984         struct mem_cgroup *memcg;
6985         int ret;
6986
6987         memcg = get_mem_cgroup_from_mm(mm);
6988         ret = charge_memcg(folio, memcg, gfp);
6989         css_put(&memcg->css);
6990
6991         return ret;
6992 }
6993
6994 /**
6995  * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
6996  * @folio: folio to charge.
6997  * @mm: mm context of the victim
6998  * @gfp: reclaim mode
6999  * @entry: swap entry for which the folio is allocated
7000  *
7001  * This function charges a folio allocated for swapin. Please call this before
7002  * adding the folio to the swapcache.
7003  *
7004  * Returns 0 on success. Otherwise, an error code is returned.
7005  */
7006 int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
7007                                   gfp_t gfp, swp_entry_t entry)
7008 {
7009         struct mem_cgroup *memcg;
7010         unsigned short id;
7011         int ret;
7012
7013         if (mem_cgroup_disabled())
7014                 return 0;
7015
7016         id = lookup_swap_cgroup_id(entry);
7017         rcu_read_lock();
7018         memcg = mem_cgroup_from_id(id);
7019         if (!memcg || !css_tryget_online(&memcg->css))
7020                 memcg = get_mem_cgroup_from_mm(mm);
7021         rcu_read_unlock();
7022
7023         ret = charge_memcg(folio, memcg, gfp);
7024
7025         css_put(&memcg->css);
7026         return ret;
7027 }
7028
7029 /*
7030  * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
7031  * @entry: swap entry for which the page is charged
7032  *
7033  * Call this function after successfully adding the charged page to swapcache.
7034  *
7035  * Note: This function assumes the page for which swap slot is being uncharged
7036  * is order 0 page.
7037  */
7038 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
7039 {
7040         /*
7041          * Cgroup1's unified memory+swap counter has been charged with the
7042          * new swapcache page, finish the transfer by uncharging the swap
7043          * slot. The swap slot would also get uncharged when it dies, but
7044          * it can stick around indefinitely and we'd count the page twice
7045          * the entire time.
7046          *
7047          * Cgroup2 has separate resource counters for memory and swap,
7048          * so this is a non-issue here. Memory and swap charge lifetimes
7049          * correspond 1:1 to page and swap slot lifetimes: we charge the
7050          * page to memory here, and uncharge swap when the slot is freed.
7051          */
7052         if (!mem_cgroup_disabled() && do_memsw_account()) {
7053                 /*
7054                  * The swap entry might not get freed for a long time,
7055                  * let's not wait for it.  The page already received a
7056                  * memory+swap charge, drop the swap entry duplicate.
7057                  */
7058                 mem_cgroup_uncharge_swap(entry, 1);
7059         }
7060 }
7061
/*
 * Accumulator used to batch the uncharging of several folios that belong to
 * the same memcg; filled by uncharge_folio() and flushed by uncharge_batch().
 */
struct uncharge_gather {
        /* memcg the pending counts belong to; NULL while nothing is gathered */
        struct mem_cgroup *memcg;
        /* pages to uncharge from the memory (and memsw) counter */
        unsigned long nr_memory;
        /* number of non-kmem folios gathered; reported as PGPGOUT events */
        unsigned long pgpgout;
        /* part of the gathered pages that is kernel memory */
        unsigned long nr_kmem;
        /* node id of the folio that started the batch */
        int nid;
};
7069
7070 static inline void uncharge_gather_clear(struct uncharge_gather *ug)
7071 {
7072         memset(ug, 0, sizeof(*ug));
7073 }
7074
7075 static void uncharge_batch(const struct uncharge_gather *ug)
7076 {
7077         unsigned long flags;
7078
7079         if (ug->nr_memory) {
7080                 page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
7081                 if (do_memsw_account())
7082                         page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
7083                 if (ug->nr_kmem)
7084                         memcg_account_kmem(ug->memcg, -ug->nr_kmem);
7085                 memcg_oom_recover(ug->memcg);
7086         }
7087
7088         local_irq_save(flags);
7089         __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
7090         __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
7091         memcg_check_events(ug->memcg, ug->nid);
7092         local_irq_restore(flags);
7093
7094         /* drop reference from uncharge_folio */
7095         css_put(&ug->memcg->css);
7096 }
7097
7098 static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
7099 {
7100         long nr_pages;
7101         struct mem_cgroup *memcg;
7102         struct obj_cgroup *objcg;
7103
7104         VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
7105
7106         /*
7107          * Nobody should be changing or seriously looking at
7108          * folio memcg or objcg at this point, we have fully
7109          * exclusive access to the folio.
7110          */
7111         if (folio_memcg_kmem(folio)) {
7112                 objcg = __folio_objcg(folio);
7113                 /*
7114                  * This get matches the put at the end of the function and
7115                  * kmem pages do not hold memcg references anymore.
7116                  */
7117                 memcg = get_mem_cgroup_from_objcg(objcg);
7118         } else {
7119                 memcg = __folio_memcg(folio);
7120         }
7121
7122         if (!memcg)
7123                 return;
7124
7125         if (ug->memcg != memcg) {
7126                 if (ug->memcg) {
7127                         uncharge_batch(ug);
7128                         uncharge_gather_clear(ug);
7129                 }
7130                 ug->memcg = memcg;
7131                 ug->nid = folio_nid(folio);
7132
7133                 /* pairs with css_put in uncharge_batch */
7134                 css_get(&memcg->css);
7135         }
7136
7137         nr_pages = folio_nr_pages(folio);
7138
7139         if (folio_memcg_kmem(folio)) {
7140                 ug->nr_memory += nr_pages;
7141                 ug->nr_kmem += nr_pages;
7142
7143                 folio->memcg_data = 0;
7144                 obj_cgroup_put(objcg);
7145         } else {
7146                 /* LRU pages aren't accounted at the root level */
7147                 if (!mem_cgroup_is_root(memcg))
7148                         ug->nr_memory += nr_pages;
7149                 ug->pgpgout++;
7150
7151                 folio->memcg_data = 0;
7152         }
7153
7154         css_put(&memcg->css);
7155 }
7156
7157 void __mem_cgroup_uncharge(struct folio *folio)
7158 {
7159         struct uncharge_gather ug;
7160
7161         /* Don't touch folio->lru of any random page, pre-check: */
7162         if (!folio_memcg(folio))
7163                 return;
7164
7165         uncharge_gather_clear(&ug);
7166         uncharge_folio(folio, &ug);
7167         uncharge_batch(&ug);
7168 }
7169
7170 /**
7171  * __mem_cgroup_uncharge_list - uncharge a list of page
7172  * @page_list: list of pages to uncharge
7173  *
7174  * Uncharge a list of pages previously charged with
7175  * __mem_cgroup_charge().
7176  */
7177 void __mem_cgroup_uncharge_list(struct list_head *page_list)
7178 {
7179         struct uncharge_gather ug;
7180         struct folio *folio;
7181
7182         uncharge_gather_clear(&ug);
7183         list_for_each_entry(folio, page_list, lru)
7184                 uncharge_folio(folio, &ug);
7185         if (ug.memcg)
7186                 uncharge_batch(&ug);
7187 }
7188
7189 /**
7190  * mem_cgroup_migrate - Charge a folio's replacement.
7191  * @old: Currently circulating folio.
7192  * @new: Replacement folio.
7193  *
7194  * Charge @new as a replacement folio for @old. @old will
7195  * be uncharged upon free.
7196  *
7197  * Both folios must be locked, @new->mapping must be set up.
7198  */
7199 void mem_cgroup_migrate(struct folio *old, struct folio *new)
7200 {
7201         struct mem_cgroup *memcg;
7202         long nr_pages = folio_nr_pages(new);
7203         unsigned long flags;
7204
7205         VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
7206         VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
7207         VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
7208         VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new);
7209
7210         if (mem_cgroup_disabled())
7211                 return;
7212
7213         /* Page cache replacement: new folio already charged? */
7214         if (folio_memcg(new))
7215                 return;
7216
7217         memcg = folio_memcg(old);
7218         VM_WARN_ON_ONCE_FOLIO(!memcg, old);
7219         if (!memcg)
7220                 return;
7221
7222         /* Force-charge the new page. The old one will be freed soon */
7223         if (!mem_cgroup_is_root(memcg)) {
7224                 page_counter_charge(&memcg->memory, nr_pages);
7225                 if (do_memsw_account())
7226                         page_counter_charge(&memcg->memsw, nr_pages);
7227         }
7228
7229         css_get(&memcg->css);
7230         commit_charge(new, memcg);
7231
7232         local_irq_save(flags);
7233         mem_cgroup_charge_statistics(memcg, nr_pages);
7234         memcg_check_events(memcg, folio_nid(new));
7235         local_irq_restore(flags);
7236 }
7237
/*
 * Static key, initially false, exported for use outside this file.
 * NOTE(review): presumably this backs the mem_cgroup_sockets_enabled test
 * used by the socket accounting helpers below - confirm against the header.
 */
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
EXPORT_SYMBOL(memcg_sockets_enabled_key);
7240
7241 void mem_cgroup_sk_alloc(struct sock *sk)
7242 {
7243         struct mem_cgroup *memcg;
7244
7245         if (!mem_cgroup_sockets_enabled)
7246                 return;
7247
7248         /* Do not associate the sock with unrelated interrupted task's memcg. */
7249         if (!in_task())
7250                 return;
7251
7252         rcu_read_lock();
7253         memcg = mem_cgroup_from_task(current);
7254         if (mem_cgroup_is_root(memcg))
7255                 goto out;
7256         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
7257                 goto out;
7258         if (css_tryget(&memcg->css))
7259                 sk->sk_memcg = memcg;
7260 out:
7261         rcu_read_unlock();
7262 }
7263
7264 void mem_cgroup_sk_free(struct sock *sk)
7265 {
7266         if (sk->sk_memcg)
7267                 css_put(&sk->sk_memcg->css);
7268 }
7269
7270 /**
7271  * mem_cgroup_charge_skmem - charge socket memory
7272  * @memcg: memcg to charge
7273  * @nr_pages: number of pages to charge
7274  * @gfp_mask: reclaim mode
7275  *
7276  * Charges @nr_pages to @memcg. Returns %true if the charge fit within
7277  * @memcg's configured limit, %false if it doesn't.
7278  */
7279 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
7280                              gfp_t gfp_mask)
7281 {
7282         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
7283                 struct page_counter *fail;
7284
7285                 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
7286                         memcg->tcpmem_pressure = 0;
7287                         return true;
7288                 }
7289                 memcg->tcpmem_pressure = 1;
7290                 if (gfp_mask & __GFP_NOFAIL) {
7291                         page_counter_charge(&memcg->tcpmem, nr_pages);
7292                         return true;
7293                 }
7294                 return false;
7295         }
7296
7297         if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
7298                 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
7299                 return true;
7300         }
7301
7302         return false;
7303 }
7304
7305 /**
7306  * mem_cgroup_uncharge_skmem - uncharge socket memory
7307  * @memcg: memcg to uncharge
7308  * @nr_pages: number of pages to uncharge
7309  */
7310 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
7311 {
7312         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
7313                 page_counter_uncharge(&memcg->tcpmem, nr_pages);
7314                 return;
7315         }
7316
7317         mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
7318
7319         refill_stock(memcg, nr_pages);
7320 }
7321
7322 static int __init cgroup_memory(char *s)
7323 {
7324         char *token;
7325
7326         while ((token = strsep(&s, ",")) != NULL) {
7327                 if (!*token)
7328                         continue;
7329                 if (!strcmp(token, "nosocket"))
7330                         cgroup_memory_nosocket = true;
7331                 if (!strcmp(token, "nokmem"))
7332                         cgroup_memory_nokmem = true;
7333                 if (!strcmp(token, "nobpf"))
7334                         cgroup_memory_nobpf = true;
7335         }
7336         return 1;
7337 }
7338 __setup("cgroup.memory=", cgroup_memory);
7339
7340 /*
7341  * subsys_initcall() for memory controller.
7342  *
7343  * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
7344  * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
7345  * basically everything that doesn't depend on a specific mem_cgroup structure
7346  * should be initialized from here.
7347  */
7348 static int __init mem_cgroup_init(void)
7349 {
7350         int cpu, node;
7351
7352         /*
7353          * Currently s32 type (can refer to struct batched_lruvec_stat) is
7354          * used for per-memcg-per-cpu caching of per-node statistics. In order
7355          * to work fine, we should make sure that the overfill threshold can't
7356          * exceed S32_MAX / PAGE_SIZE.
7357          */
7358         BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);
7359
7360         cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
7361                                   memcg_hotplug_cpu_dead);
7362
7363         for_each_possible_cpu(cpu)
7364                 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
7365                           drain_local_stock);
7366
7367         for_each_node(node) {
7368                 struct mem_cgroup_tree_per_node *rtpn;
7369
7370                 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
7371
7372                 rtpn->rb_root = RB_ROOT;
7373                 rtpn->rb_rightmost = NULL;
7374                 spin_lock_init(&rtpn->lock);
7375                 soft_limit_tree.rb_tree_per_node[node] = rtpn;
7376         }
7377
7378         return 0;
7379 }
7380 subsys_initcall(mem_cgroup_init);
7381
7382 #ifdef CONFIG_SWAP
7383 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
7384 {
7385         while (!refcount_inc_not_zero(&memcg->id.ref)) {
7386                 /*
7387                  * The root cgroup cannot be destroyed, so it's refcount must
7388                  * always be >= 1.
7389                  */
7390                 if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
7391                         VM_BUG_ON(1);
7392                         break;
7393                 }
7394                 memcg = parent_mem_cgroup(memcg);
7395                 if (!memcg)
7396                         memcg = root_mem_cgroup;
7397         }
7398         return memcg;
7399 }
7400
7401 /**
7402  * mem_cgroup_swapout - transfer a memsw charge to swap
7403  * @folio: folio whose memsw charge to transfer
7404  * @entry: swap entry to move the charge to
7405  *
7406  * Transfer the memsw charge of @folio to @entry.
7407  */
7408 void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
7409 {
7410         struct mem_cgroup *memcg, *swap_memcg;
7411         unsigned int nr_entries;
7412         unsigned short oldid;
7413
7414         VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
7415         VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
7416
7417         if (mem_cgroup_disabled())
7418                 return;
7419
7420         if (!do_memsw_account())
7421                 return;
7422
7423         memcg = folio_memcg(folio);
7424
7425         VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
7426         if (!memcg)
7427                 return;
7428
7429         /*
7430          * In case the memcg owning these pages has been offlined and doesn't
7431          * have an ID allocated to it anymore, charge the closest online
7432          * ancestor for the swap instead and transfer the memory+swap charge.
7433          */
7434         swap_memcg = mem_cgroup_id_get_online(memcg);
7435         nr_entries = folio_nr_pages(folio);
7436         /* Get references for the tail pages, too */
7437         if (nr_entries > 1)
7438                 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
7439         oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
7440                                    nr_entries);
7441         VM_BUG_ON_FOLIO(oldid, folio);
7442         mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
7443
7444         folio->memcg_data = 0;
7445
7446         if (!mem_cgroup_is_root(memcg))
7447                 page_counter_uncharge(&memcg->memory, nr_entries);
7448
7449         if (memcg != swap_memcg) {
7450                 if (!mem_cgroup_is_root(swap_memcg))
7451                         page_counter_charge(&swap_memcg->memsw, nr_entries);
7452                 page_counter_uncharge(&memcg->memsw, nr_entries);
7453         }
7454
7455         /*
7456          * Interrupts should be disabled here because the caller holds the
7457          * i_pages lock which is taken with interrupts-off. It is
7458          * important here to have the interrupts disabled because it is the
7459          * only synchronisation we have for updating the per-CPU variables.
7460          */
7461         memcg_stats_lock();
7462         mem_cgroup_charge_statistics(memcg, -nr_entries);
7463         memcg_stats_unlock();
7464         memcg_check_events(memcg, folio_nid(folio));
7465
7466         css_put(&memcg->css);
7467 }
7468
7469 /**
7470  * __mem_cgroup_try_charge_swap - try charging swap space for a folio
7471  * @folio: folio being added to swap
7472  * @entry: swap entry to charge
7473  *
7474  * Try to charge @folio's memcg for the swap space at @entry.
7475  *
7476  * Returns 0 on success, -ENOMEM on failure.
7477  */
7478 int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
7479 {
7480         unsigned int nr_pages = folio_nr_pages(folio);
7481         struct page_counter *counter;
7482         struct mem_cgroup *memcg;
7483         unsigned short oldid;
7484
7485         if (do_memsw_account())
7486                 return 0;
7487
7488         memcg = folio_memcg(folio);
7489
7490         VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
7491         if (!memcg)
7492                 return 0;
7493
7494         if (!entry.val) {
7495                 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7496                 return 0;
7497         }
7498
7499         memcg = mem_cgroup_id_get_online(memcg);
7500
7501         if (!mem_cgroup_is_root(memcg) &&
7502             !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
7503                 memcg_memory_event(memcg, MEMCG_SWAP_MAX);
7504                 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7505                 mem_cgroup_id_put(memcg);
7506                 return -ENOMEM;
7507         }
7508
7509         /* Get references for the tail pages, too */
7510         if (nr_pages > 1)
7511                 mem_cgroup_id_get_many(memcg, nr_pages - 1);
7512         oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
7513         VM_BUG_ON_FOLIO(oldid, folio);
7514         mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
7515
7516         return 0;
7517 }
7518
7519 /**
7520  * __mem_cgroup_uncharge_swap - uncharge swap space
7521  * @entry: swap entry to uncharge
7522  * @nr_pages: the amount of swap space to uncharge
7523  */
7524 void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
7525 {
7526         struct mem_cgroup *memcg;
7527         unsigned short id;
7528
7529         if (mem_cgroup_disabled())
7530                 return;
7531
7532         id = swap_cgroup_record(entry, 0, nr_pages);
7533         rcu_read_lock();
7534         memcg = mem_cgroup_from_id(id);
7535         if (memcg) {
7536                 if (!mem_cgroup_is_root(memcg)) {
7537                         if (do_memsw_account())
7538                                 page_counter_uncharge(&memcg->memsw, nr_pages);
7539                         else
7540                                 page_counter_uncharge(&memcg->swap, nr_pages);
7541                 }
7542                 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
7543                 mem_cgroup_id_put_many(memcg, nr_pages);
7544         }
7545         rcu_read_unlock();
7546 }
7547
7548 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
7549 {
7550         long nr_swap_pages = get_nr_swap_pages();
7551
7552         if (mem_cgroup_disabled() || do_memsw_account())
7553                 return nr_swap_pages;
7554         for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg))
7555                 nr_swap_pages = min_t(long, nr_swap_pages,
7556                                       READ_ONCE(memcg->swap.max) -
7557                                       page_counter_read(&memcg->swap));
7558         return nr_swap_pages;
7559 }
7560
7561 bool mem_cgroup_swap_full(struct folio *folio)
7562 {
7563         struct mem_cgroup *memcg;
7564
7565         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
7566
7567         if (vm_swap_full())
7568                 return true;
7569         if (do_memsw_account())
7570                 return false;
7571
7572         memcg = folio_memcg(folio);
7573         if (!memcg)
7574                 return false;
7575
7576         for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
7577                 unsigned long usage = page_counter_read(&memcg->swap);
7578
7579                 if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
7580                     usage * 2 >= READ_ONCE(memcg->swap.max))
7581                         return true;
7582         }
7583
7584         return false;
7585 }
7586
7587 static int __init setup_swap_account(char *s)
7588 {
7589         pr_warn_once("The swapaccount= commandline option is deprecated. "
7590                      "Please report your usecase to linux-mm@kvack.org if you "
7591                      "depend on this functionality.\n");
7592         return 1;
7593 }
7594 __setup("swapaccount=", setup_swap_account);
7595
7596 static u64 swap_current_read(struct cgroup_subsys_state *css,
7597                              struct cftype *cft)
7598 {
7599         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7600
7601         return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
7602 }
7603
7604 static u64 swap_peak_read(struct cgroup_subsys_state *css,
7605                           struct cftype *cft)
7606 {
7607         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7608
7609         return (u64)memcg->swap.watermark * PAGE_SIZE;
7610 }
7611
7612 static int swap_high_show(struct seq_file *m, void *v)
7613 {
7614         return seq_puts_memcg_tunable(m,
7615                 READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
7616 }
7617
7618 static ssize_t swap_high_write(struct kernfs_open_file *of,
7619                                char *buf, size_t nbytes, loff_t off)
7620 {
7621         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7622         unsigned long high;
7623         int err;
7624
7625         buf = strstrip(buf);
7626         err = page_counter_memparse(buf, "max", &high);
7627         if (err)
7628                 return err;
7629
7630         page_counter_set_high(&memcg->swap, high);
7631
7632         return nbytes;
7633 }
7634
7635 static int swap_max_show(struct seq_file *m, void *v)
7636 {
7637         return seq_puts_memcg_tunable(m,
7638                 READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
7639 }
7640
7641 static ssize_t swap_max_write(struct kernfs_open_file *of,
7642                               char *buf, size_t nbytes, loff_t off)
7643 {
7644         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7645         unsigned long max;
7646         int err;
7647
7648         buf = strstrip(buf);
7649         err = page_counter_memparse(buf, "max", &max);
7650         if (err)
7651                 return err;
7652
7653         xchg(&memcg->swap.max, max);
7654
7655         return nbytes;
7656 }
7657
7658 static int swap_events_show(struct seq_file *m, void *v)
7659 {
7660         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
7661
7662         seq_printf(m, "high %lu\n",
7663                    atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
7664         seq_printf(m, "max %lu\n",
7665                    atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
7666         seq_printf(m, "fail %lu\n",
7667                    atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
7668
7669         return 0;
7670 }
7671
7672 static struct cftype swap_files[] = {
7673         {
7674                 .name = "swap.current",
7675                 .flags = CFTYPE_NOT_ON_ROOT,
7676                 .read_u64 = swap_current_read,
7677         },
7678         {
7679                 .name = "swap.high",
7680                 .flags = CFTYPE_NOT_ON_ROOT,
7681                 .seq_show = swap_high_show,
7682                 .write = swap_high_write,
7683         },
7684         {
7685                 .name = "swap.max",
7686                 .flags = CFTYPE_NOT_ON_ROOT,
7687                 .seq_show = swap_max_show,
7688                 .write = swap_max_write,
7689         },
7690         {
7691                 .name = "swap.peak",
7692                 .flags = CFTYPE_NOT_ON_ROOT,
7693                 .read_u64 = swap_peak_read,
7694         },
7695         {
7696                 .name = "swap.events",
7697                 .flags = CFTYPE_NOT_ON_ROOT,
7698                 .file_offset = offsetof(struct mem_cgroup, swap_events_file),
7699                 .seq_show = swap_events_show,
7700         },
7701         { }     /* terminate */
7702 };
7703
7704 static struct cftype memsw_files[] = {
7705         {
7706                 .name = "memsw.usage_in_bytes",
7707                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
7708                 .read_u64 = mem_cgroup_read_u64,
7709         },
7710         {
7711                 .name = "memsw.max_usage_in_bytes",
7712                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
7713                 .write = mem_cgroup_reset,
7714                 .read_u64 = mem_cgroup_read_u64,
7715         },
7716         {
7717                 .name = "memsw.limit_in_bytes",
7718                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
7719                 .write = mem_cgroup_write,
7720                 .read_u64 = mem_cgroup_read_u64,
7721         },
7722         {
7723                 .name = "memsw.failcnt",
7724                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
7725                 .write = mem_cgroup_reset,
7726                 .read_u64 = mem_cgroup_read_u64,
7727         },
7728         { },    /* terminate */
7729 };
7730
7731 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
7732 /**
7733  * obj_cgroup_may_zswap - check if this cgroup can zswap
7734  * @objcg: the object cgroup
7735  *
7736  * Check if the hierarchical zswap limit has been reached.
7737  *
7738  * This doesn't check for specific headroom, and it is not atomic
7739  * either. But with zswap, the size of the allocation is only known
7740  * once compression has occured, and this optimistic pre-check avoids
7741  * spending cycles on compression when there is already no room left
7742  * or zswap is disabled altogether somewhere in the hierarchy.
7743  */
7744 bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
7745 {
7746         struct mem_cgroup *memcg, *original_memcg;
7747         bool ret = true;
7748
7749         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
7750                 return true;
7751
7752         original_memcg = get_mem_cgroup_from_objcg(objcg);
7753         for (memcg = original_memcg; !mem_cgroup_is_root(memcg);
7754              memcg = parent_mem_cgroup(memcg)) {
7755                 unsigned long max = READ_ONCE(memcg->zswap_max);
7756                 unsigned long pages;
7757
7758                 if (max == PAGE_COUNTER_MAX)
7759                         continue;
7760                 if (max == 0) {
7761                         ret = false;
7762                         break;
7763                 }
7764
7765                 cgroup_rstat_flush(memcg->css.cgroup);
7766                 pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
7767                 if (pages < max)
7768                         continue;
7769                 ret = false;
7770                 break;
7771         }
7772         mem_cgroup_put(original_memcg);
7773         return ret;
7774 }
7775
7776 /**
7777  * obj_cgroup_charge_zswap - charge compression backend memory
7778  * @objcg: the object cgroup
7779  * @size: size of compressed object
7780  *
7781  * This forces the charge after obj_cgroup_may_swap() allowed
7782  * compression and storage in zwap for this cgroup to go ahead.
7783  */
7784 void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
7785 {
7786         struct mem_cgroup *memcg;
7787
7788         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
7789                 return;
7790
7791         VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
7792
7793         /* PF_MEMALLOC context, charging must succeed */
7794         if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
7795                 VM_WARN_ON_ONCE(1);
7796
7797         rcu_read_lock();
7798         memcg = obj_cgroup_memcg(objcg);
7799         mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
7800         mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
7801         rcu_read_unlock();
7802 }
7803
7804 /**
7805  * obj_cgroup_uncharge_zswap - uncharge compression backend memory
7806  * @objcg: the object cgroup
7807  * @size: size of compressed object
7808  *
7809  * Uncharges zswap memory on page in.
7810  */
7811 void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
7812 {
7813         struct mem_cgroup *memcg;
7814
7815         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
7816                 return;
7817
7818         obj_cgroup_uncharge(objcg, size);
7819
7820         rcu_read_lock();
7821         memcg = obj_cgroup_memcg(objcg);
7822         mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
7823         mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
7824         rcu_read_unlock();
7825 }
7826
7827 static u64 zswap_current_read(struct cgroup_subsys_state *css,
7828                               struct cftype *cft)
7829 {
7830         cgroup_rstat_flush(css->cgroup);
7831         return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B);
7832 }
7833
7834 static int zswap_max_show(struct seq_file *m, void *v)
7835 {
7836         return seq_puts_memcg_tunable(m,
7837                 READ_ONCE(mem_cgroup_from_seq(m)->zswap_max));
7838 }
7839
7840 static ssize_t zswap_max_write(struct kernfs_open_file *of,
7841                                char *buf, size_t nbytes, loff_t off)
7842 {
7843         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7844         unsigned long max;
7845         int err;
7846
7847         buf = strstrip(buf);
7848         err = page_counter_memparse(buf, "max", &max);
7849         if (err)
7850                 return err;
7851
7852         xchg(&memcg->zswap_max, max);
7853
7854         return nbytes;
7855 }
7856
7857 static struct cftype zswap_files[] = {
7858         {
7859                 .name = "zswap.current",
7860                 .flags = CFTYPE_NOT_ON_ROOT,
7861                 .read_u64 = zswap_current_read,
7862         },
7863         {
7864                 .name = "zswap.max",
7865                 .flags = CFTYPE_NOT_ON_ROOT,
7866                 .seq_show = zswap_max_show,
7867                 .write = zswap_max_write,
7868         },
7869         { }     /* terminate */
7870 };
7871 #endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */
7872
7873 static int __init mem_cgroup_swap_init(void)
7874 {
7875         if (mem_cgroup_disabled())
7876                 return 0;
7877
7878         WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
7879         WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
7880 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
7881         WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files));
7882 #endif
7883         return 0;
7884 }
7885 subsys_initcall(mem_cgroup_swap_init);
7886
7887 #endif /* CONFIG_SWAP */