From 38099024e51ee37dee5f0f577ca37175c932e3f7 Mon Sep 17 00:00:00 2001 From: Emeel Hakim Date: Wed, 7 Dec 2022 12:16:18 +0200 Subject: [PATCH 01/16] macsec: add missing attribute validation for offload Add missing attribute validation for IFLA_MACSEC_OFFLOAD to the netlink policy. Fixes: 791bb3fcafce ("net: macsec: add support for specifying offload upon link creation") Signed-off-by: Emeel Hakim Reviewed-by: Jiri Pirko Reviewed-by: Sabrina Dubroca Link: https://lore.kernel.org/r/20221207101618.989-1-ehakim@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index f41f67b..2fbac51 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -3698,6 +3698,7 @@ static const struct nla_policy macsec_rtnl_policy[IFLA_MACSEC_MAX + 1] = { [IFLA_MACSEC_SCB] = { .type = NLA_U8 }, [IFLA_MACSEC_REPLAY_PROTECT] = { .type = NLA_U8 }, [IFLA_MACSEC_VALIDATION] = { .type = NLA_U8 }, + [IFLA_MACSEC_OFFLOAD] = { .type = NLA_U8 }, }; static void macsec_free_netdev(struct net_device *dev) -- 2.7.4 From ebaaadc332cd21e9df4dcf9ce12552d9354bbbe4 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Wed, 7 Dec 2022 11:53:04 +0100 Subject: [PATCH 02/16] s390/qeth: fix use-after-free in hsci KASAN found that addr was dereferenced after br2dev_event_work was freed. ================================================================== BUG: KASAN: use-after-free in qeth_l2_br2dev_worker+0x5ba/0x6b0 Read of size 1 at addr 00000000fdcea440 by task kworker/u760:4/540 CPU: 17 PID: 540 Comm: kworker/u760:4 Tainted: G E 6.1.0-20221128.rc7.git1.5aa3bed4ce83.300.fc36.s390x+kasan #1 Hardware name: IBM 8561 T01 703 (LPAR) Workqueue: 0.0.8000_event qeth_l2_br2dev_worker Call Trace: [<000000016944d4ce>] dump_stack_lvl+0xc6/0xf8 [<000000016942cd9c>] print_address_description.constprop.0+0x34/0x2a0 [<000000016942d118>] print_report+0x110/0x1f8 [<0000000167a7bd04>] kasan_report+0xfc/0x128 [<000000016938d79a>] qeth_l2_br2dev_worker+0x5ba/0x6b0 [<00000001673edd1e>] process_one_work+0x76e/0x1128 [<00000001673ee85c>] worker_thread+0x184/0x1098 [<000000016740718a>] kthread+0x26a/0x310 [<00000001672c606a>] __ret_from_fork+0x8a/0xe8 [<00000001694711da>] ret_from_fork+0xa/0x40 Allocated by task 108338: kasan_save_stack+0x40/0x68 kasan_set_track+0x36/0x48 __kasan_kmalloc+0xa0/0xc0 qeth_l2_switchdev_event+0x25a/0x738 atomic_notifier_call_chain+0x9c/0xf8 br_switchdev_fdb_notify+0xf4/0x110 fdb_notify+0x122/0x180 fdb_add_entry.constprop.0.isra.0+0x312/0x558 br_fdb_add+0x59e/0x858 rtnl_fdb_add+0x58a/0x928 rtnetlink_rcv_msg+0x5f8/0x8d8 netlink_rcv_skb+0x1f2/0x408 netlink_unicast+0x570/0x790 netlink_sendmsg+0x752/0xbe0 sock_sendmsg+0xca/0x110 ____sys_sendmsg+0x510/0x6a8 ___sys_sendmsg+0x12a/0x180 __sys_sendmsg+0xe6/0x168 __do_sys_socketcall+0x3c8/0x468 do_syscall+0x22c/0x328 __do_syscall+0x94/0xf0 system_call+0x82/0xb0 Freed by task 540: kasan_save_stack+0x40/0x68 kasan_set_track+0x36/0x48 kasan_save_free_info+0x4c/0x68 ____kasan_slab_free+0x14e/0x1a8 __kasan_slab_free+0x24/0x30 __kmem_cache_free+0x168/0x338 qeth_l2_br2dev_worker+0x154/0x6b0 process_one_work+0x76e/0x1128 worker_thread+0x184/0x1098 kthread+0x26a/0x310 __ret_from_fork+0x8a/0xe8 ret_from_fork+0xa/0x40 Last potentially related work creation: kasan_save_stack+0x40/0x68 __kasan_record_aux_stack+0xbe/0xd0 insert_work+0x56/0x2e8 __queue_work+0x4ce/0xd10 queue_work_on+0xf4/0x100 qeth_l2_switchdev_event+0x520/0x738 atomic_notifier_call_chain+0x9c/0xf8 br_switchdev_fdb_notify+0xf4/0x110 fdb_notify+0x122/0x180 fdb_add_entry.constprop.0.isra.0+0x312/0x558 br_fdb_add+0x59e/0x858 rtnl_fdb_add+0x58a/0x928 rtnetlink_rcv_msg+0x5f8/0x8d8 netlink_rcv_skb+0x1f2/0x408 netlink_unicast+0x570/0x790 netlink_sendmsg+0x752/0xbe0 sock_sendmsg+0xca/0x110 ____sys_sendmsg+0x510/0x6a8 ___sys_sendmsg+0x12a/0x180 __sys_sendmsg+0xe6/0x168 __do_sys_socketcall+0x3c8/0x468 do_syscall+0x22c/0x328 __do_syscall+0x94/0xf0 system_call+0x82/0xb0 Second to last potentially related work creation: kasan_save_stack+0x40/0x68 __kasan_record_aux_stack+0xbe/0xd0 kvfree_call_rcu+0xb2/0x760 kernfs_unlink_open_file+0x348/0x430 kernfs_fop_release+0xc2/0x320 __fput+0x1ae/0x768 task_work_run+0x1bc/0x298 exit_to_user_mode_prepare+0x1a0/0x1a8 __do_syscall+0x94/0xf0 system_call+0x82/0xb0 The buggy address belongs to the object at 00000000fdcea400 which belongs to the cache kmalloc-96 of size 96 The buggy address is located 64 bytes inside of 96-byte region [00000000fdcea400, 00000000fdcea460) The buggy address belongs to the physical page: page:000000005a9c26e8 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0xfdcea flags: 0x3ffff00000000200(slab|node=0|zone=1|lastcpupid=0x1ffff) raw: 3ffff00000000200 0000000000000000 0000000100000122 000000008008cc00 raw: 0000000000000000 0020004100000000 ffffffff00000001 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: 00000000fdcea300: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc 00000000fdcea380: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc >00000000fdcea400: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ^ 00000000fdcea480: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc 00000000fdcea500: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ================================================================== Fixes: f7936b7b2663 ("s390/qeth: Update MACs of LEARNING_SYNC device") Reported-by: Thorsten Winkler Signed-off-by: Alexandra Winter Reviewed-by: Wenjia Zhang Reviewed-by: Thorsten Winkler Link: https://lore.kernel.org/r/20221207105304.20494-1-wintera@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/s390/net/qeth_l2_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 9dc9358..c6ded3f 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -758,7 +758,6 @@ static void qeth_l2_br2dev_worker(struct work_struct *work) struct list_head *iter; int err = 0; - kfree(br2dev_event_work); QETH_CARD_TEXT_(card, 4, "b2dw%04lx", event); QETH_CARD_TEXT_(card, 4, "ma%012llx", ether_addr_to_u64(addr)); @@ -815,6 +814,7 @@ unlock: dev_put(brdev); dev_put(lsyncdev); dev_put(dstdev); + kfree(br2dev_event_work); } static int qeth_l2_br2dev_queue_work(struct net_device *brdev, -- 2.7.4 From f8bac7f9fdb0017b32157957ffffd490f95faa07 Mon Sep 17 00:00:00 2001 From: "Radu Nicolae Pirea (OSS)" Date: Wed, 7 Dec 2022 15:23:47 +0200 Subject: [PATCH 03/16] net: dsa: sja1105: avoid out of bounds access in sja1105_init_l2_policing() The SJA1105 family has 45 L2 policing table entries (SJA1105_MAX_L2_POLICING_COUNT) and SJA1110 has 110 (SJA1110_MAX_L2_POLICING_COUNT). Keeping the table structure but accounting for the difference in port count (5 in SJA1105 vs 10 in SJA1110) does not fully explain the difference. Rather, the SJA1110 also has L2 ingress policers for multicast traffic. If a packet is classified as multicast, it will be processed by the policer index 99 + SRCPORT. The sja1105_init_l2_policing() function initializes all L2 policers such that they don't interfere with normal packet reception by default. To have a common code between SJA1105 and SJA1110, the index of the multicast policer for the port is calculated because it's an index that is out of bounds for SJA1105 but in bounds for SJA1110, and a bounds check is performed. The code fails to do the proper thing when determining what to do with the multicast policer of port 0 on SJA1105 (ds->num_ports = 5). The "mcast" index will be equal to 45, which is also equal to table->ops->max_entry_count (SJA1105_MAX_L2_POLICING_COUNT). So it passes through the check. But at the same time, SJA1105 doesn't have multicast policers. So the code programs the SHARINDX field of an out-of-bounds element in the L2 Policing table of the static config. The comparison between index 45 and 45 entries should have determined the code to not access this policer index on SJA1105, since its memory wasn't even allocated. With enough bad luck, the out-of-bounds write could even overwrite other valid kernel data, but in this case, the issue was detected using KASAN. Kernel log: sja1105 spi5.0: Probed switch chip: SJA1105Q ================================================================== BUG: KASAN: slab-out-of-bounds in sja1105_setup+0x1cbc/0x2340 Write of size 8 at addr ffffff880bd57708 by task kworker/u8:0/8 ... Workqueue: events_unbound deferred_probe_work_func Call trace: ... sja1105_setup+0x1cbc/0x2340 dsa_register_switch+0x1284/0x18d0 sja1105_probe+0x748/0x840 ... Allocated by task 8: ... sja1105_setup+0x1bcc/0x2340 dsa_register_switch+0x1284/0x18d0 sja1105_probe+0x748/0x840 ... Fixes: 38fbe91f2287 ("net: dsa: sja1105: configure the multicast policers, if present") CC: stable@vger.kernel.org # 5.15+ Signed-off-by: Radu Nicolae Pirea (OSS) Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20221207132347.38698-1-radu-nicolae.pirea@oss.nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/sja1105/sja1105_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 4126661..b70dcf3 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1038,7 +1038,7 @@ static int sja1105_init_l2_policing(struct sja1105_private *priv) policing[bcast].sharindx = port; /* Only SJA1110 has multicast policers */ - if (mcast <= table->ops->max_entry_count) + if (mcast < table->ops->max_entry_count) policing[mcast].sharindx = port; } -- 2.7.4 From fbf8321238bac04368f57af572e05a9c01347a0b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 7 Dec 2022 16:53:15 -1000 Subject: [PATCH 04/16] memcg: Fix possible use-after-free in memcg_write_event_control() memcg_write_event_control() accesses the dentry->d_name of the specified control fd to route the write call. As a cgroup interface file can't be renamed, it's safe to access d_name as long as the specified file is a regular cgroup file. Also, as these cgroup interface files can't be removed before the directory, it's safe to access the parent too. Prior to 347c4a874710 ("memcg: remove cgroup_event->cft"), there was a call to __file_cft() which verified that the specified file is a regular cgroupfs file before further accesses. The cftype pointer returned from __file_cft() was no longer necessary and the commit inadvertently dropped the file type check with it allowing any file to slip through. With the invarients broken, the d_name and parent accesses can now race against renames and removals of arbitrary files and cause use-after-free's. Fix the bug by resurrecting the file type check in __file_cft(). Now that cgroupfs is implemented through kernfs, checking the file operations needs to go through a layer of indirection. Instead, let's check the superblock and dentry type. Signed-off-by: Tejun Heo Fixes: 347c4a874710 ("memcg: remove cgroup_event->cft") Cc: stable@kernel.org # v3.14+ Reported-by: Jann Horn Acked-by: Johannes Weiner Acked-by: Roman Gushchin Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 1 + kernel/cgroup/cgroup-internal.h | 1 - mm/memcontrol.c | 15 +++++++++++++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 528bd44..2b7d077 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -68,6 +68,7 @@ struct css_task_iter { struct list_head iters_node; /* css_set->task_iters */ }; +extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index fd40208..367b0a4 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -167,7 +167,6 @@ struct cgroup_mgctx { extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; -extern struct file_system_type cgroup_fs_type; /* iterate across the hierarchies */ #define for_each_root(root) \ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a1a35c1..266a1ab 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4832,6 +4832,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, unsigned int efd, cfd; struct fd efile; struct fd cfile; + struct dentry *cdentry; const char *name; char *endp; int ret; @@ -4886,6 +4887,16 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, goto out_put_cfile; /* + * The control file must be a regular cgroup1 file. As a regular cgroup + * file can't be renamed, it's safe to access its name afterwards. + */ + cdentry = cfile.file->f_path.dentry; + if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { + ret = -EINVAL; + goto out_put_cfile; + } + + /* * Determine the event callbacks and set them in @event. This used * to be done via struct cftype but cgroup core no longer knows * about these events. The following is crude but the whole thing @@ -4893,7 +4904,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, * * DO NOT ADD NEW FILES. */ - name = cfile.file->f_path.dentry->d_name.name; + name = cdentry->d_name.name; if (!strcmp(name, "memory.usage_in_bytes")) { event->register_event = mem_cgroup_usage_register_event; @@ -4917,7 +4928,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, * automatically removed on cgroup destruction but the removal is * asynchronous, so take an extra ref on @css. */ - cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, + cfile_css = css_tryget_online_from_dir(cdentry->d_parent, &memory_cgrp_subsys); ret = -EINVAL; if (IS_ERR(cfile_css)) -- 2.7.4 From 40f2432b53a01b6d5e3a9057f1d5c406930e1360 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 7 Dec 2022 15:24:32 +0100 Subject: [PATCH 05/16] Revert "HID: logitech-hidpp: Remove special-casing of Bluetooth devices" This reverts commit 8544c812e43ab7bdf40458411b83987b8cba924d. We need to revert commit 532223c8ac57 ("HID: logitech-hidpp: Enable HID++ for all the Logitech Bluetooth devices") because that commit might make hid-logitech-hidpp bind on mice that are not well enough supported by hid-logitech-hidpp, and the end result is that the probe of those mice is now returning -ENODEV, leaving the end user with a dead mouse. Given that commit 8544c812e43a ("HID: logitech-hidpp: Remove special-casing of Bluetooth devices") is a direct dependency of 532223c8ac57, revert it too. Note that this also adapt according to commit 908d325e1665 ("HID: logitech-hidpp: Detect hi-res scrolling support") to re-add support of the devices that were removed from that commit too. I have locally an MX Master and I tested this device with that revert, ensuring we still have high-res scrolling. Reported-by: Rafael J . Wysocki Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-logitech-hidpp.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c index 71a9c25..e9c7aa4 100644 --- a/drivers/hid/hid-logitech-hidpp.c +++ b/drivers/hid/hid-logitech-hidpp.c @@ -4367,6 +4367,15 @@ static const struct hid_device_id hidpp_devices[] = { { /* MX5500 keyboard over Bluetooth */ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb30b), .driver_data = HIDPP_QUIRK_HIDPP_CONSUMER_VENDOR_KEYS }, + { /* M-RCQ142 V470 Cordless Laser Mouse over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb008) }, + { /* MX Master mouse over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb012) }, + { /* MX Ergo trackball over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb01d) }, + { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb01e) }, + { /* MX Master 3 mouse over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb023) }, { /* And try to enable HID++ for all the Logitech Bluetooth devices */ HID_DEVICE(BUS_BLUETOOTH, HID_GROUP_ANY, USB_VENDOR_ID_LOGITECH, HID_ANY_ID) }, -- 2.7.4 From a9d9e46c755a189ccb44d91b8cf737742a975de8 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 7 Dec 2022 15:24:33 +0100 Subject: [PATCH 06/16] Revert "HID: logitech-hidpp: Enable HID++ for all the Logitech Bluetooth devices" This reverts commit 532223c8ac57605a10e46dc0ab23dcf01c9acb43. As reported in [0], hid-logitech-hidpp now binds on all bluetooth mice, but there are corner cases where hid-logitech-hidpp just gives up on the mouse. This leads the end user with a dead mouse. Given that we are at -rc8, we are definitively too late to find a proper fix. We already identified 2 issues less than 24 hours after the bug report. One in that ->match() was never designed to be used anywhere else than in hid-generic, and the other that hid-logitech-hidpp has corner cases where it gives up on devices it is not supposed to. So we have no choice but postpone this patch to the next kernel release. [0] https://lore.kernel.org/linux-input/CAJZ5v0g-_o4AqMgNwihCb0jrwrcJZfRrX=jv8aH54WNKO7QB8A@mail.gmail.com/ Reported-by: Rafael J . Wysocki Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-logitech-hidpp.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c index e9c7aa4..8a2aac1 100644 --- a/drivers/hid/hid-logitech-hidpp.c +++ b/drivers/hid/hid-logitech-hidpp.c @@ -4269,21 +4269,6 @@ static void hidpp_remove(struct hid_device *hdev) mutex_destroy(&hidpp->send_mutex); } -static const struct hid_device_id unhandled_hidpp_devices[] = { - /* Logitech Harmony Adapter for PS3, handled in hid-sony */ - { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_HARMONY_PS3) }, - /* Handled in hid-generic */ - { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_DINOVO_EDGE_KBD) }, - {} -}; - -static bool hidpp_match(struct hid_device *hdev, - bool ignore_special_driver) -{ - /* Refuse to handle devices handled by other HID drivers */ - return !hid_match_id(hdev, unhandled_hidpp_devices); -} - #define LDJ_DEVICE(product) \ HID_DEVICE(BUS_USB, HID_GROUP_LOGITECH_DJ_DEVICE, \ USB_VENDOR_ID_LOGITECH, (product)) @@ -4376,9 +4361,6 @@ static const struct hid_device_id hidpp_devices[] = { { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb01e) }, { /* MX Master 3 mouse over Bluetooth */ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb023) }, - - { /* And try to enable HID++ for all the Logitech Bluetooth devices */ - HID_DEVICE(BUS_BLUETOOTH, HID_GROUP_ANY, USB_VENDOR_ID_LOGITECH, HID_ANY_ID) }, {} }; @@ -4392,7 +4374,6 @@ static const struct hid_usage_id hidpp_usages[] = { static struct hid_driver hidpp_driver = { .name = "logitech-hidpp-device", .id_table = hidpp_devices, - .match = hidpp_match, .report_fixup = hidpp_report_fixup, .probe = hidpp_probe, .remove = hidpp_remove, -- 2.7.4 From 38f1d4aefdac30600698ab07cb1e41fca5851f7a Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Thu, 1 Dec 2022 18:45:40 +0200 Subject: [PATCH 07/16] mailmap: update Matti Vaittinen's email address The email backend used by ROHM keeps labeling patches as spam. This can result in missing the patches. Switch my mail address from a company mail to a personal one. Link: https://lkml.kernel.org/r/8f4498b66fedcbded37b3b87e0c516e659f8f583.1669912977.git.mazziesaccount@gmail.com Signed-off-by: Matti Vaittinen Suggested-by: Krzysztof Kozlowski Cc: Anup Patel Cc: Arnd Bergmann Cc: Atish Patra Cc: Baolin Wang Cc: Ben Widawsky Cc: Bjorn Andersson Cc: Christian Brauner Cc: Colin Ian King Cc: Kirill Tkhai Cc: Qais Yousef Cc: Vasily Averin Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index aec5172..315e220 100644 --- a/.mailmap +++ b/.mailmap @@ -287,6 +287,7 @@ Matthew Wilcox Matthew Wilcox Matthias Fuchs Matthieu CASTET +Matti Vaittinen Matt Ranostay Matt Ranostay Matthew Ranostay Matt Ranostay -- 2.7.4 From f5ad5083404bb56c9de777dccb68c6672ef6487e Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 2 Dec 2022 17:27:24 +0100 Subject: [PATCH 08/16] mm: do not BUG_ON missing brk mapping, because userspace can unmap it The following program will trigger the BUG_ON that this patch removes, because the user can munmap() mm->brk: #include #include #include #include static void *brk_now(void) { return (void *)syscall(SYS_brk, 0); } static void brk_set(void *b) { assert(syscall(SYS_brk, b) != -1); } int main(int argc, char *argv[]) { void *b = brk_now(); brk_set(b + 4096); assert(munmap(b - 4096, 4096 * 2) == 0); brk_set(b); return 0; } Compile that with musl, since glibc actually uses brk(), and then execute it, and it'll hit this splat: kernel BUG at mm/mmap.c:229! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 12 PID: 1379 Comm: a.out Tainted: G S U 6.1.0-rc7+ #419 RIP: 0010:__do_sys_brk+0x2fc/0x340 Code: 00 00 4c 89 ef e8 04 d3 fe ff eb 9a be 01 00 00 00 4c 89 ff e8 35 e0 fe ff e9 6e ff ff ff 4d 89 a7 20> RSP: 0018:ffff888140bc7eb0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 00000000007e7000 RCX: ffff8881020fe000 RDX: ffff8881020fe001 RSI: ffff8881955c9b00 RDI: ffff8881955c9b08 RBP: 0000000000000000 R08: ffff8881955c9b00 R09: 00007ffc77844000 R10: 0000000000000000 R11: 0000000000000001 R12: 00000000007e8000 R13: 00000000007e8000 R14: 00000000007e7000 R15: ffff8881020fe000 FS: 0000000000604298(0000) GS:ffff88901f700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000603fe0 CR3: 000000015ba9a005 CR4: 0000000000770ee0 PKRU: 55555554 Call Trace: do_syscall_64+0x2b/0x50 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x400678 Code: 10 4c 8d 41 08 4c 89 44 24 10 4c 8b 01 8b 4c 24 08 83 f9 2f 77 0a 4c 8d 4c 24 20 4c 01 c9 eb 05 48 8b> RSP: 002b:00007ffc77863890 EFLAGS: 00000212 ORIG_RAX: 000000000000000c RAX: ffffffffffffffda RBX: 000000000040031b RCX: 0000000000400678 RDX: 00000000004006a1 RSI: 00000000007e6000 RDI: 00000000007e7000 RBP: 00007ffc77863900 R08: 0000000000000000 R09: 00000000007e6000 R10: 00007ffc77863930 R11: 0000000000000212 R12: 00007ffc77863978 R13: 00007ffc77863988 R14: 0000000000000000 R15: 0000000000000000 Instead, just return the old brk value if the original mapping has been removed. [akpm@linux-foundation.org: fix changelog, per Liam] Link: https://lkml.kernel.org/r/20221202162724.2009-1-Jason@zx2c4.com Fixes: 2e7ce7d354f2 ("mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()") Signed-off-by: Jason A. Donenfeld Acked-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Reviewed-by: SeongJae Park Cc: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Matthew Wilcox Cc: Sven Schnelle Cc: Will Deacon Cc: Jann Horn Signed-off-by: Andrew Morton --- mm/mmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 74a84eb..0024e61 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -226,8 +226,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* Search one past newbrk */ mas_set(&mas, newbrk); brkvma = mas_find(&mas, oldbrk); - BUG_ON(brkvma == NULL); - if (brkvma->vm_start >= oldbrk) + if (!brkvma || brkvma->vm_start >= oldbrk) goto out; /* mapping intersects with an existing non-brk vma. */ /* * mm->brk must be protected by write mmap_lock. -- 2.7.4 From de16d6e4a9fb13d07c88d0e57754348662f53b23 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 2 Dec 2022 09:50:26 +0100 Subject: [PATCH 09/16] kselftests: cgroup: update kmem test precision tolerance MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit 1813e51eece0 ("memcg: increase MEMCG_CHARGE_BATCH to 64") has changed the batch size while this test case has been left behind. This has led to a test failure reported by test bot: not ok 2 selftests: cgroup: test_kmem # exit=1 Update the tolerance for the pcp charges to reflect the MEMCG_CHARGE_BATCH change to fix this. [akpm@linux-foundation.org: update comments, per Roman] Link: https://lkml.kernel.org/r/Y4m8Unt6FhWKC6IH@dhcp22.suse.cz Fixes: 1813e51eece0a ("memcg: increase MEMCG_CHARGE_BATCH to 64") Signed-off-by: Michal Hocko Reported-by: kernel test robot Link: https://lore.kernel.org/oe-lkp/202212010958.c1053bd3-yujie.liu@intel.com Acked-by: Shakeel Butt Acked-by: Roman Gushchin Tested-by: Yujie Liu Cc: Eric Dumazet Cc: Feng Tang Cc: Johannes Weiner Cc: "Michal Koutný" Cc: Muchun Song Cc: Soheil Hassas Yeganeh Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_kmem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index 22b31eb..258ddc5 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -19,12 +19,12 @@ /* - * Memory cgroup charging is performed using percpu batches 32 pages + * Memory cgroup charging is performed using percpu batches 64 pages * big (look at MEMCG_CHARGE_BATCH), whereas memory.stat is exact. So * the maximum discrepancy between charge and vmstat entries is number - * of cpus multiplied by 32 pages. + * of cpus multiplied by 64 pages. */ -#define MAX_VMSTAT_ERROR (4096 * 32 * get_nprocs()) +#define MAX_VMSTAT_ERROR (4096 * 64 * get_nprocs()) static int alloc_dcache(const char *cgroup, void *arg) -- 2.7.4 From 44bcabd70cf1425b4243e02251c02b01638a8287 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 4 Dec 2022 16:51:50 -0800 Subject: [PATCH 10/16] tmpfs: fix data loss from failed fallocate Fix tmpfs data loss when the fallocate system call is interrupted by a signal, or fails for some other reason. The partial folio handling in shmem_undo_range() forgot to consider this unfalloc case, and was liable to erase or truncate out data which had already been committed earlier. It turns out that none of the partial folio handling there is appropriate for the unfalloc case, which just wants to proceed to removal of whole folios: which find_get_entries() provides, even when partially covered. Original patch by Rui Wang. Link: https://lore.kernel.org/linux-mm/33b85d82.7764.1842e9ab207.Coremail.chenguoqic@163.com/ Link: https://lkml.kernel.org/r/a5dac112-cf4b-7af-a33-f386e347fd38@google.com Fixes: b9a8a4195c7d ("truncate,shmem: Handle truncates that split large folios") Signed-off-by: Hugh Dickins Reported-by: Guoqi Chen Link: https://lore.kernel.org/all/20221101032248.819360-1-kernel@hev.cc/ Cc: Rui Wang Cc: Huacai Chen Cc: Matthew Wilcox Cc: Vishal Moola (Oracle) Cc: [5.17+] Signed-off-by: Andrew Morton --- mm/shmem.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index c1d8b8a..82911fef 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -948,6 +948,15 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, index++; } + /* + * When undoing a failed fallocate, we want none of the partial folio + * zeroing and splitting below, but shall want to truncate the whole + * folio when !uptodate indicates that it was added by this fallocate, + * even when [lstart, lend] covers only a part of the folio. + */ + if (unfalloc) + goto whole_folios; + same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); if (folio) { @@ -973,6 +982,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, folio_put(folio); } +whole_folios: + index = start; while (index < end) { cond_resched(); -- 2.7.4 From 630dc25e43dacfb5af94cd41532e77c47ec1caff Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 5 Dec 2022 16:08:57 +0100 Subject: [PATCH 11/16] mm/swap: fix SWP_PFN_BITS with CONFIG_PHYS_ADDR_T_64BIT on 32bit We use "unsigned long" to store a PFN in the kernel and phys_addr_t to store a physical address. On a 64bit system, both are 64bit wide. However, on a 32bit system, the latter might be 64bit wide. This is, for example, the case on x86 with PAE: phys_addr_t and PTEs are 64bit wide, while "unsigned long" only spans 32bit. The current definition of SWP_PFN_BITS without MAX_PHYSMEM_BITS misses that case, and assumes that the maximum PFN is limited by an 32bit phys_addr_t. This implies, that SWP_PFN_BITS will currently only be able to cover 4 GiB - 1 on any 32bit system with 4k page size, which is wrong. Let's rely on the number of bits in phys_addr_t instead, but make sure to not exceed the maximum swap offset, to not make the BUILD_BUG_ON() in is_pfn_swap_entry() unhappy. Note that swp_entry_t is effectively an unsigned long and the maximum swap offset shares that value with the swap type. For example, on an 8 GiB x86 PAE system with a kernel config based on Debian 11.5 (-> CONFIG_FLATMEM=y, CONFIG_X86_PAE=y), we will currently fail removing migration entries (remove_migration_ptes()), because mm/page_vma_mapped.c:check_pte() will fail to identify a PFN match as swp_offset_pfn() wrongly masks off PFN bits. For example, split_huge_page_to_list()->...->remap_page() will leave migration entries in place and continue to unlock the page. Later, when we stumble over these migration entries (e.g., via /proc/self/pagemap), pfn_swap_entry_to_page() will BUG_ON() because these migration entries shouldn't exist anymore and the page was unlocked. [ 33.067591] kernel BUG at include/linux/swapops.h:497! [ 33.067597] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI [ 33.067602] CPU: 3 PID: 742 Comm: cow Tainted: G E 6.1.0-rc8+ #16 [ 33.067605] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.fc36 04/01/2014 [ 33.067606] EIP: pagemap_pmd_range+0x644/0x650 [ 33.067612] Code: 00 00 00 00 66 90 89 ce b9 00 f0 ff ff e9 ff fb ff ff 89 d8 31 db e8 48 c6 52 00 e9 23 fb ff ff e8 61 83 56 00 e9 b6 fe ff ff <0f> 0b bf 00 f0 ff ff e9 38 fa ff ff 3e 8d 74 26 00 55 89 e5 57 31 [ 33.067615] EAX: ee394000 EBX: 00000002 ECX: ee394000 EDX: 00000000 [ 33.067617] ESI: c1b0ded4 EDI: 00024a00 EBP: c1b0ddb4 ESP: c1b0dd68 [ 33.067619] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 EFLAGS: 00010246 [ 33.067624] CR0: 80050033 CR2: b7a00000 CR3: 01bbbd20 CR4: 00350ef0 [ 33.067625] Call Trace: [ 33.067628] ? madvise_free_pte_range+0x720/0x720 [ 33.067632] ? smaps_pte_range+0x4b0/0x4b0 [ 33.067634] walk_pgd_range+0x325/0x720 [ 33.067637] ? mt_find+0x1d6/0x3a0 [ 33.067641] ? mt_find+0x1d6/0x3a0 [ 33.067643] __walk_page_range+0x164/0x170 [ 33.067646] walk_page_range+0xf9/0x170 [ 33.067648] ? __kmem_cache_alloc_node+0x2a8/0x340 [ 33.067653] pagemap_read+0x124/0x280 [ 33.067658] ? default_llseek+0x101/0x160 [ 33.067662] ? smaps_account+0x1d0/0x1d0 [ 33.067664] vfs_read+0x90/0x290 [ 33.067667] ? do_madvise.part.0+0x24b/0x390 [ 33.067669] ? debug_smp_processor_id+0x12/0x20 [ 33.067673] ksys_pread64+0x58/0x90 [ 33.067675] __ia32_sys_ia32_pread64+0x1b/0x20 [ 33.067680] __do_fast_syscall_32+0x4c/0xc0 [ 33.067683] do_fast_syscall_32+0x29/0x60 [ 33.067686] do_SYSENTER_32+0x15/0x20 [ 33.067689] entry_SYSENTER_32+0x98/0xf1 Decrease the indentation level of SWP_PFN_BITS and SWP_PFN_MASK to keep it readable and consistent. [david@redhat.com: rely on sizeof(phys_addr_t) and min_t() instead] Link: https://lkml.kernel.org/r/20221206105737.69478-1-david@redhat.com [david@redhat.com: use "int" for comparison, as we're only comparing numbers < 64] Link: https://lkml.kernel.org/r/1f157500-2676-7cef-a84e-9224ed64e540@redhat.com Link: https://lkml.kernel.org/r/20221205150857.167583-1-david@redhat.com Fixes: 0d206b5d2e0d ("mm/swap: add swp_offset_pfn() to fetch PFN from swap entry") Signed-off-by: David Hildenbrand Acked-by: Peter Xu Reviewed-by: Yang Shi Cc: Hugh Dickins Cc: Andrea Arcangeli Signed-off-by: Andrew Morton --- include/linux/swapops.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 86b95cc..b07b277 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -33,11 +33,13 @@ * can use the extra bits to store other information besides PFN. */ #ifdef MAX_PHYSMEM_BITS -#define SWP_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) +#define SWP_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) #else /* MAX_PHYSMEM_BITS */ -#define SWP_PFN_BITS (BITS_PER_LONG - PAGE_SHIFT) +#define SWP_PFN_BITS min_t(int, \ + sizeof(phys_addr_t) * 8 - PAGE_SHIFT, \ + SWP_TYPE_SHIFT) #endif /* MAX_PHYSMEM_BITS */ -#define SWP_PFN_MASK (BIT(SWP_PFN_BITS) - 1) +#define SWP_PFN_MASK (BIT(SWP_PFN_BITS) - 1) /** * Migration swap entry specific bitfield definitions. Layout: -- 2.7.4 From 6c28ca6485ddd7c5da171e479e3ebfbe661efc4d Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 5 Dec 2022 19:23:17 +0000 Subject: [PATCH 12/16] mmap: fix do_brk_flags() modifying obviously incorrect VMAs Add more sanity checks to the VMA that do_brk_flags() will expand. Ensure the VMA matches basic merge requirements within the function before calling can_vma_merge_after(). Drop the duplicate checks from vm_brk_flags() since they will be enforced later. The old code would expand file VMAs on brk(), which is functionally wrong and also dangerous in terms of locking because the brk() path isn't designed for file VMAs and therefore doesn't lock the file mapping. Checking can_vma_merge_after() ensures that new anonymous VMAs can't be merged into file VMAs. See https://lore.kernel.org/linux-mm/CAG48ez1tJZTOjS_FjRZhvtDA-STFmdw8PEizPDwMGFd_ui0Nrw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20221205192304.1957418-1-Liam.Howlett@oracle.com Fixes: 2e7ce7d354f2 ("mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()") Signed-off-by: Liam R. Howlett Suggested-by: Jann Horn Cc: Jason A. Donenfeld Cc: Matthew Wilcox Cc: SeongJae Park Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/mmap.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 0024e61..1d9aac3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2948,9 +2948,9 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, * Expand the existing vma if possible; Note that singular lists do not * occur after forking, so the expand will only happen on new VMAs. */ - if (vma && - (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) && - ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) { + if (vma && vma->vm_end == addr && !vma_policy(vma) && + can_vma_merge_after(vma, flags, NULL, NULL, + addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { mas_set_range(mas, vma->vm_start, addr + len - 1); if (mas_preallocate(mas, vma, GFP_KERNEL)) return -ENOMEM; @@ -3037,11 +3037,6 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) goto munmap_failed; vma = mas_prev(&mas, 0); - if (!vma || vma->vm_end != addr || vma_policy(vma) || - !can_vma_merge_after(vma, flags, NULL, NULL, - addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) - vma = NULL; - ret = do_brk_flags(&mas, vma, addr, len, flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); -- 2.7.4 From fcd0ccd836ffad73d98a66f6fea7b16f735ea920 Mon Sep 17 00:00:00 2001 From: John Starks Date: Tue, 6 Dec 2022 22:00:53 -0800 Subject: [PATCH 13/16] mm/gup: fix gup_pud_range() for dax For dax pud, pud_huge() returns true on x86. So the function works as long as hugetlb is configured. However, dax doesn't depend on hugetlb. Commit 414fd080d125 ("mm/gup: fix gup_pmd_range() for dax") fixed devmap-backed huge PMDs, but missed devmap-backed huge PUDs. Fix this as well. This fixes the below kernel panic: general protection fault, probably for non-canonical address 0x69e7c000cc478: 0000 [#1] SMP < snip > Call Trace: get_user_pages_fast+0x1f/0x40 iov_iter_get_pages+0xc6/0x3b0 ? mempool_alloc+0x5d/0x170 bio_iov_iter_get_pages+0x82/0x4e0 ? bvec_alloc+0x91/0xc0 ? bio_alloc_bioset+0x19a/0x2a0 blkdev_direct_IO+0x282/0x480 ? __io_complete_rw_common+0xc0/0xc0 ? filemap_range_has_page+0x82/0xc0 generic_file_direct_write+0x9d/0x1a0 ? inode_update_time+0x24/0x30 __generic_file_write_iter+0xbd/0x1e0 blkdev_write_iter+0xb4/0x150 ? io_import_iovec+0x8d/0x340 io_write+0xf9/0x300 io_issue_sqe+0x3c3/0x1d30 ? sysvec_reschedule_ipi+0x6c/0x80 __io_queue_sqe+0x33/0x240 ? fget+0x76/0xa0 io_submit_sqes+0xe6a/0x18d0 ? __fget_light+0xd1/0x100 __x64_sys_io_uring_enter+0x199/0x880 ? __context_tracking_enter+0x1f/0x70 ? irqentry_exit_to_user_mode+0x24/0x30 ? irqentry_exit+0x1d/0x30 ? __context_tracking_exit+0xe/0x70 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x61/0xcb RIP: 0033:0x7fc97c11a7be < snip > ---[ end trace 48b2e0e67debcaeb ]--- RIP: 0010:internal_get_user_pages_fast+0x340/0x990 < snip > Kernel panic - not syncing: Fatal exception Kernel Offset: disabled Link: https://lkml.kernel.org/r/1670392853-28252-1-git-send-email-ssengar@linux.microsoft.com Fixes: 414fd080d125 ("mm/gup: fix gup_pmd_range() for dax") Signed-off-by: John Starks Signed-off-by: Saurabh Sengar Cc: Jan Kara Cc: Yu Zhao Cc: Jason Gunthorpe Cc: John Hubbard Cc: David Hildenbrand Cc: Dan Williams Cc: Alistair Popple Cc: Signed-off-by: Andrew Morton --- mm/gup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index fe195d4..3b7bc2c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2852,7 +2852,7 @@ static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned lo next = pud_addr_end(addr, end); if (unlikely(!pud_present(pud))) return 0; - if (unlikely(pud_huge(pud))) { + if (unlikely(pud_huge(pud) || pud_devmap(pud))) { if (!gup_huge_pud(pud, pudp, addr, next, flags, pages, nr)) return 0; -- 2.7.4 From a501788ab2603d97c41e8cda59cd74b72c29951f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 8 Dec 2022 19:55:48 +0800 Subject: [PATCH 14/16] MAINTAINERS: update Muchun Song's email I'm moving to the @linux.dev account. Map my old addresses and update it to my new address. Link: https://lkml.kernel.org/r/20221208115548.85244-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Andrew Morton --- .mailmap | 2 ++ MAINTAINERS | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index 315e220..f5bd014 100644 --- a/.mailmap +++ b/.mailmap @@ -373,6 +373,8 @@ Ricardo Ribalda Roman Gushchin Roman Gushchin Roman Gushchin +Muchun Song +Muchun Song Ross Zwisler Rudolf Marek Rui Saraiva diff --git a/MAINTAINERS b/MAINTAINERS index 0e152a7a..93913c3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5299,7 +5299,7 @@ M: Johannes Weiner M: Michal Hocko M: Roman Gushchin M: Shakeel Butt -R: Muchun Song +R: Muchun Song L: cgroups@vger.kernel.org L: linux-mm@kvack.org S: Maintained @@ -9443,7 +9443,7 @@ F: drivers/net/ethernet/huawei/hinic/ HUGETLB SUBSYSTEM M: Mike Kravetz -M: Muchun Song +M: Muchun Song L: linux-mm@kvack.org S: Maintained F: Documentation/ABI/testing/sysfs-kernel-mm-hugepages -- 2.7.4 From 4a7ba45b1a435e7097ca0f79a847d0949d0eb088 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 7 Dec 2022 16:53:15 -1000 Subject: [PATCH 15/16] memcg: fix possible use-after-free in memcg_write_event_control() memcg_write_event_control() accesses the dentry->d_name of the specified control fd to route the write call. As a cgroup interface file can't be renamed, it's safe to access d_name as long as the specified file is a regular cgroup file. Also, as these cgroup interface files can't be removed before the directory, it's safe to access the parent too. Prior to 347c4a874710 ("memcg: remove cgroup_event->cft"), there was a call to __file_cft() which verified that the specified file is a regular cgroupfs file before further accesses. The cftype pointer returned from __file_cft() was no longer necessary and the commit inadvertently dropped the file type check with it allowing any file to slip through. With the invarients broken, the d_name and parent accesses can now race against renames and removals of arbitrary files and cause use-after-free's. Fix the bug by resurrecting the file type check in __file_cft(). Now that cgroupfs is implemented through kernfs, checking the file operations needs to go through a layer of indirection. Instead, let's check the superblock and dentry type. Link: https://lkml.kernel.org/r/Y5FRm/cfcKPGzWwl@slm.duckdns.org Fixes: 347c4a874710 ("memcg: remove cgroup_event->cft") Signed-off-by: Tejun Heo Reported-by: Jann Horn Acked-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Linus Torvalds Cc: Michal Hocko Cc: Muchun Song Cc: Shakeel Butt Cc: [3.14+] Signed-off-by: Andrew Morton --- include/linux/cgroup.h | 1 + kernel/cgroup/cgroup-internal.h | 1 - mm/memcontrol.c | 15 +++++++++++++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 528bd44..2b7d077 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -68,6 +68,7 @@ struct css_task_iter { struct list_head iters_node; /* css_set->task_iters */ }; +extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index fd40208..367b0a4 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -167,7 +167,6 @@ struct cgroup_mgctx { extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; -extern struct file_system_type cgroup_fs_type; /* iterate across the hierarchies */ #define for_each_root(root) \ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a1a35c1..266a1ab 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4832,6 +4832,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, unsigned int efd, cfd; struct fd efile; struct fd cfile; + struct dentry *cdentry; const char *name; char *endp; int ret; @@ -4886,6 +4887,16 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, goto out_put_cfile; /* + * The control file must be a regular cgroup1 file. As a regular cgroup + * file can't be renamed, it's safe to access its name afterwards. + */ + cdentry = cfile.file->f_path.dentry; + if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { + ret = -EINVAL; + goto out_put_cfile; + } + + /* * Determine the event callbacks and set them in @event. This used * to be done via struct cftype but cgroup core no longer knows * about these events. The following is crude but the whole thing @@ -4893,7 +4904,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, * * DO NOT ADD NEW FILES. */ - name = cfile.file->f_path.dentry->d_name.name; + name = cdentry->d_name.name; if (!strcmp(name, "memory.usage_in_bytes")) { event->register_event = mem_cgroup_usage_register_event; @@ -4917,7 +4928,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, * automatically removed on cgroup destruction but the removal is * asynchronous, so take an extra ref on @css. */ - cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, + cfile_css = css_tryget_online_from_dir(cdentry->d_parent, &memory_cgrp_subsys); ret = -EINVAL; if (IS_ERR(cfile_css)) -- 2.7.4 From 830b3c68c1fb1e9176028d02ef86f3cf76aa2476 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 11 Dec 2022 14:15:18 -0800 Subject: [PATCH 16/16] Linux 6.1 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0992f82..997b677 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 1 SUBLEVEL = 0 -EXTRAVERSION = -rc8 +EXTRAVERSION = NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* -- 2.7.4