From 45bec50b0fca601338e2e2673f5264f4cc7dc5b0 Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Tue, 18 May 2021 14:26:51 +0200 Subject: [PATCH 01/16] logger: Suppress SVACE sign extension warnings This patch suppresses the SVACE warnings shown below and makes the code more robust. The actual sign extension issue cannot happen in the current code, as the value of the len variable in logger_set_tag() is limited to LOGGER_ENTRY_MAX_PAYLOAD and create_log() is called only with fixed-size argument values. * SIGNED_TO_BIGGER_UNSIGNED: Assignment of a signed value which has type 'int' to a variable of a bigger integer type 'size_t' Sign extension at linux-rpi3/drivers/staging/android/logger.c:898 * SIGNED_TO_BIGGER_UNSIGNED: Assignment of a signed value which has type 'int' to a variable of a bigger integer type 'size_t' Sign extension at linux-rpi3/drivers/staging/android/logger.c:1045 Change-Id: I8286e1a7fdd4cc051efc8136970a560cc8cde794 Signed-off-by: Sylwester Nawrocki --- drivers/staging/android/logger.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/android/logger.c b/drivers/staging/android/logger.c index a51294a..2e19856 100644 --- a/drivers/staging/android/logger.c +++ b/drivers/staging/android/logger.c @@ -873,7 +873,7 @@ static long logger_set_prio(struct logger_writer *writer, void __user *arg) static long logger_set_tag(struct logger_writer *writer, void __user *arg) { struct logger_set_tag tag; - int len; + size_t len; char *p, *q; if (copy_from_user(&tag, arg, sizeof(struct logger_set_tag))) @@ -1010,7 +1010,7 @@ static const struct file_operations logger_fops = { * Log size must must be a power of two, and greater than * (LOGGER_ENTRY_MAX_PAYLOAD + sizeof(struct logger_entry)). */ -static int __init create_log(char *log_name, int size) +static int __init create_log(char *log_name, size_t size) { int ret = 0; struct logger_log *log; -- 2.7.4
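For context on the warning class suppressed above: when a signed int is assigned to the wider unsigned size_t, a negative value is sign-extended into a huge unsigned one. A minimal stand-alone C sketch of the hazard (illustration only, not part of the patch):

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	int len = -1;	/* e.g. an error code accidentally used as a length */
	size_t n = len;	/* sign-extended: n becomes SIZE_MAX */

	printf("%zu\n", n);	/* prints 18446744073709551615 on 64-bit */

	/* A bounds check performed after the widening never sees a negative
	 * value, which is why declaring the variable size_t up front (as the
	 * patch does) keeps the suspicious conversion out of the code.
	 */
	return 0;
}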
From f6ec41e94cf25277dd66086f70e54cba0db9026a Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Tue, 18 May 2021 14:56:13 +0200 Subject: [PATCH 02/16] kdbus: Remove unreachable code from kdbus_msg_examine This fixes an issue pointed out by the following SVACE warning: * UNREACHABLE_CODE: This statement in the source code might be unreachable during program execution. [unreachable] unreachable at linux-rpi3/ipc/kdbus/message.c:346 [vec_size > vec_size + size (0 > ANY) is always false] vec_size > vec_size + size (0 > ANY) is always false at linux-rpi3/ipc/kdbus/message.c:345 Change-Id: Ia5204bbaad863f88c470e198a081fe58ffb4f208 Signed-off-by: Sylwester Nawrocki --- ipc/kdbus/message.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/ipc/kdbus/message.c b/ipc/kdbus/message.c index 63626c8..d069c52 100644 --- a/ipc/kdbus/message.c +++ b/ipc/kdbus/message.c @@ -284,7 +284,7 @@ static int kdbus_msg_examine(struct kdbus_msg *msg, struct kdbus_bus *bus, size_t *out_n_fds, size_t *out_n_parts) { struct kdbus_item *item, *fds = NULL, *bloom = NULL, *dstname = NULL; - u64 n_parts, n_memfds, n_fds, vec_size; + u64 n_parts, n_memfds, n_fds; /* * Step 1: @@ -334,7 +334,6 @@ static int kdbus_msg_examine(struct kdbus_msg *msg, struct kdbus_bus *bus, n_parts = 0; n_memfds = 0; n_fds = 0; - vec_size = 0; KDBUS_ITEMS_FOREACH(item, msg->items, KDBUS_ITEMS_SIZE(msg, items)) { switch (item->type) { @@ -342,9 +341,7 @@ static int kdbus_msg_examine(struct kdbus_msg *msg, struct kdbus_bus *bus, void __force __user *ptr = KDBUS_PTR(item->vec.address); u64 size = item->vec.size; - if (vec_size + size < vec_size) - return -EMSGSIZE; - if (vec_size + size > KDBUS_MSG_MAX_PAYLOAD_VEC_SIZE) + if (size > KDBUS_MSG_MAX_PAYLOAD_VEC_SIZE) return -EMSGSIZE; if (ptr && unlikely(!access_ok(ptr, size))) return -EFAULT; -- 2.7.4 From 11e330a300a78a30ece7ce4b7be01b366cb5c461 Mon Sep 17 00:00:00 2001 From: Dongwoo Lee Date: Wed, 2 Jun 2021 11:36:45 +0900 Subject: [PATCH 03/16] ARM: tizen_bcm2711_defconfig: Enable WireGuard This enables the WireGuard VPN feature. Change-Id: Ia88f99d110c65120f1e55a1bca2e59b9fe35324f Signed-off-by: Dongwoo Lee --- arch/arm/configs/tizen_bcm2711_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/configs/tizen_bcm2711_defconfig b/arch/arm/configs/tizen_bcm2711_defconfig index 6dbadac..2681228 100644 --- a/arch/arm/configs/tizen_bcm2711_defconfig +++ b/arch/arm/configs/tizen_bcm2711_defconfig @@ -222,6 +222,7 @@ CONFIG_DM_CRYPT=y CONFIG_DM_UEVENT=y CONFIG_DM_VERITY=y CONFIG_NETDEVICES=y +CONFIG_WIREGUARD=y CONFIG_TUN=y # CONFIG_NET_VENDOR_AURORA is not set CONFIG_BCMGENET=y -- 2.7.4 From 481281b1448cc2c4fb140809e27b6f055209af00 Mon Sep 17 00:00:00 2001 From: Dongwoo Lee Date: Wed, 2 Jun 2021 11:48:28 +0900 Subject: [PATCH 04/16] ARM64: tizen_bcm2711_defconfig: Enable WireGuard This enables the WireGuard VPN feature. Change-Id: I2e63427ed5f667858bd3abddb73e5018dc16ac7e Signed-off-by: Dongwoo Lee --- arch/arm64/configs/tizen_bcm2711_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/tizen_bcm2711_defconfig b/arch/arm64/configs/tizen_bcm2711_defconfig index 07faa8a..d27e80e 100644 --- a/arch/arm64/configs/tizen_bcm2711_defconfig +++ b/arch/arm64/configs/tizen_bcm2711_defconfig @@ -220,6 +220,7 @@ CONFIG_DM_CRYPT=y CONFIG_DM_UEVENT=y CONFIG_DM_VERITY=y CONFIG_NETDEVICES=y +CONFIG_WIREGUARD=y CONFIG_TUN=y # CONFIG_NET_VENDOR_AURORA is not set CONFIG_BCMGENET=y -- 2.7.4 From c7544c84fcdec91b8f4d6a8583d77583c6866fa5 Mon Sep 17 00:00:00 2001 From: Sung-hun Kim Date: Tue, 27 Oct 2020 20:48:36 +0900 Subject: [PATCH 05/16] mm: LKSM: bug fix for kernel memory leak For efficiency, LKSM cleans exited processes in a batched manner when it finishes a scanning iteration. When it finds an exited process during a scanning iteration, it simply appends the mm_slot of the exited process to an internal list, as sketched below.
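A minimal user-space sketch of this batched deferred-cleanup pattern (illustrative names only, not LKSM's actual API):

#include <stdlib.h>

/* Hypothetical per-process scan state, standing in for LKSM's mm_slot. */
struct mm_slot {
	struct mm_slot *next;
	/* ... per-process data ... */
};

/* Slots of exited processes are parked here while a scan is running. */
static struct mm_slot *removed_list;

/* The scanner notices an exited process: O(1), no cleanup work yet. */
static void pend_removed_slot(struct mm_slot *slot)
{
	slot->next = removed_list;
	removed_list = slot;
}

/* End of the iteration: flush everything. Releasing the per-slot
 * resources for every entry, the list head included, is exactly the
 * invariant the patch below restores. */
static void flush_removed_list(void)
{
	while (removed_list) {
		struct mm_slot *head = removed_list;

		removed_list = head->next;
		/* release region references here, then free the slot */
		free(head);
	}
}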
On the other hand, when the KSM daemon cleans the mm_slots of exited processes, it should also take care of the regions of those processes to remove unreferenced lksm_region objects. Previously, most regions were maintained properly, but the regions of the "head" of the exited-process list were not cleaned due to a buggy implementation. As a result, the uncleaned objects remained as unreferenced garbage. The following message was detected by kmemleak (reported by sw0312.kim@samsung.com): ========================================================================= unreferenced object 0xffffff80c7083600 (size 128): comm "ksm_crawld", pid 41, jiffies 4294918362 (age 95.632s) hex dump (first 32 bytes): 00 37 08 c7 80 ff ff ff 60 82 19 bd 80 ff ff ff .7......`....... 00 35 08 c7 80 ff ff ff 00 00 00 00 00 00 00 00 .5.............. backtrace: [<0000000048313958>] kmem_cache_alloc_trace+0x1e0/0x348 [<00000000fd246822>] lksm_region_ref_append+0x48/0xf8 [<00000000c5a818a0>] ksm_join+0x3a0/0x498 [<00000000b2c3f36a>] lksm_prepare_full_scan+0xe8/0x390 [<00000000013943b5>] lksm_crawl_thread+0x214/0xbf8 [<00000000b4ce0593>] kthread+0x1b0/0x1b8 [<000000002a3f7216>] ret_from_fork+0x10/0x18 unreferenced object 0xffffff80c7083700 (size 128): comm "ksm_crawld", pid 41, jiffies 4294918362 (age 95.632s) hex dump (first 32 bytes): 00 39 08 c7 80 ff ff ff 00 36 08 c7 80 ff ff ff .9.......6...... 00 35 08 c7 80 ff ff ff 00 00 00 00 00 00 00 00 .5.............. backtrace: [<0000000048313958>] kmem_cache_alloc_trace+0x1e0/0x348 [<00000000fd246822>] lksm_region_ref_append+0x48/0xf8 [<00000000c5a818a0>] ksm_join+0x3a0/0x498 [<00000000b2c3f36a>] lksm_prepare_full_scan+0xe8/0x390 [<00000000013943b5>] lksm_crawl_thread+0x214/0xbf8 [<00000000b4ce0593>] kthread+0x1b0/0x1b8 [<000000002a3f7216>] ret_from_fork+0x10/0x18 ... ========================================================================= This patch fixes this possible kernel memory leak. Change-Id: I3e4b299e02018ece1c19ba53e4f10a68520a807b Signed-off-by: Sung-hun Kim --- mm/lksm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/lksm.c b/mm/lksm.c index b763e63..31d8601 100644 --- a/mm/lksm.c +++ b/mm/lksm.c @@ -2836,6 +2836,9 @@ static void lksm_flush_removed_mm_list(void) cond_resched(); remove_trailing_rmap_items(head, &head->rmap_list); +#ifdef CONFIG_LKSM_FILTER + lksm_region_ref_list_release(head); +#endif clear_bit(MMF_VM_MERGEABLE, &head->mm->flags); mmdrop(head->mm); free_mm_slot(head); -- 2.7.4 From 1de2268e3ed287adce2f1360a13cec11da7499fc Mon Sep 17 00:00:00 2001 From: INSUN PYO Date: Wed, 30 Jun 2021 11:35:05 +0900 Subject: [PATCH 06/16] ARM/ARM64: tizen_bcm2711_defconfig: disable ANDROID_LOGGER config Disable the android logger. The android logger has been moved to the kernel module of the linux-tizen-modules package.
Change-Id: I64c5207ce00a818795e307b79d4bf540a88fe120 Signed-off-by: INSUN PYO --- arch/arm/configs/tizen_bcm2711_defconfig | 2 +- arch/arm64/configs/tizen_bcm2711_defconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/configs/tizen_bcm2711_defconfig b/arch/arm/configs/tizen_bcm2711_defconfig index 2681228..90ed4c4 100644 --- a/arch/arm/configs/tizen_bcm2711_defconfig +++ b/arch/arm/configs/tizen_bcm2711_defconfig @@ -402,7 +402,7 @@ CONFIG_DMA_BCM2708=y CONFIG_SW_SYNC=y CONFIG_STAGING=y CONFIG_STAGING_MEDIA=y -CONFIG_ANDROID_LOGGER=y +# CONFIG_ANDROID_LOGGER is not set CONFIG_SND_BCM2835=y CONFIG_VIDEO_BCM2835=y CONFIG_VIDEO_CODEC_BCM2835=y diff --git a/arch/arm64/configs/tizen_bcm2711_defconfig b/arch/arm64/configs/tizen_bcm2711_defconfig index d27e80e..eb9f9d7 100644 --- a/arch/arm64/configs/tizen_bcm2711_defconfig +++ b/arch/arm64/configs/tizen_bcm2711_defconfig @@ -393,7 +393,7 @@ CONFIG_DMA_BCM2708=y CONFIG_SW_SYNC=y CONFIG_STAGING=y CONFIG_STAGING_MEDIA=y -CONFIG_ANDROID_LOGGER=y +# CONFIG_ANDROID_LOGGER is not set CONFIG_SND_BCM2835=y CONFIG_VIDEO_BCM2835=y CONFIG_VIDEO_CODEC_BCM2835=y -- 2.7.4 From 8be9b6c1a19716e7d7d3caf8ce0f63161cad573c Mon Sep 17 00:00:00 2001 From: Dongwoo Lee Date: Wed, 7 Jul 2021 12:31:01 +0900 Subject: [PATCH 07/16] ARM: tizen_bcm2711_defconfig: Sync with savedefconfig This synchronizes the current defconfig with savedefconfig. Change-Id: Ia99f98f2d675ae424e796ddbbd087175d9ad3cae Signed-off-by: Dongwoo Lee --- arch/arm/configs/tizen_bcm2711_defconfig | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm/configs/tizen_bcm2711_defconfig b/arch/arm/configs/tizen_bcm2711_defconfig index 90ed4c4..fac41b7 100644 --- a/arch/arm/configs/tizen_bcm2711_defconfig +++ b/arch/arm/configs/tizen_bcm2711_defconfig @@ -51,7 +51,6 @@ CONFIG_KERNEL_MODE_NEON=y # CONFIG_SUSPEND is not set CONFIG_PM=y CONFIG_RASPBERRYPI_FIRMWARE=y -CONFIG_ARM_CRYPTO=y CONFIG_CRYPTO_SHA1_ARM_NEON=y CONFIG_CRYPTO_SHA2_ARM_CE=y CONFIG_CRYPTO_AES_ARM_BS=y @@ -402,7 +401,6 @@ CONFIG_DMA_BCM2708=y CONFIG_SW_SYNC=y CONFIG_STAGING=y CONFIG_STAGING_MEDIA=y -# CONFIG_ANDROID_LOGGER is not set CONFIG_SND_BCM2835=y CONFIG_VIDEO_BCM2835=y CONFIG_VIDEO_CODEC_BCM2835=y -- 2.7.4 From bd3c507a3870546cb2decad7c7aa0e6a377064ef Mon Sep 17 00:00:00 2001 From: Dongwoo Lee Date: Wed, 7 Jul 2021 12:33:21 +0900 Subject: [PATCH 08/16] ARM64: tizen_bcm2711_defconfig: Sync with savedefconfig This synchronizes the current defconfig with savedefconfig. Change-Id: I195f05429a15e424834a590df72b73547126d529 Signed-off-by: Dongwoo Lee --- arch/arm64/configs/tizen_bcm2711_defconfig | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/configs/tizen_bcm2711_defconfig b/arch/arm64/configs/tizen_bcm2711_defconfig index eb9f9d7..c6af152 100644 --- a/arch/arm64/configs/tizen_bcm2711_defconfig +++ b/arch/arm64/configs/tizen_bcm2711_defconfig @@ -51,7 +51,6 @@ CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y CONFIG_CPUFREQ_DT=y CONFIG_ARM_RASPBERRYPI_CPUFREQ=y CONFIG_RASPBERRYPI_FIRMWARE=y -CONFIG_ARM64_CRYPTO=y CONFIG_CRYPTO_AES_ARM64_BS=y CONFIG_JUMP_LABEL=y CONFIG_MODULES=y @@ -393,7 +392,6 @@ CONFIG_DMA_BCM2708=y CONFIG_SW_SYNC=y CONFIG_STAGING=y CONFIG_STAGING_MEDIA=y -# CONFIG_ANDROID_LOGGER is not set CONFIG_SND_BCM2835=y CONFIG_VIDEO_BCM2835=y CONFIG_VIDEO_CODEC_BCM2835=y -- 2.7.4 From 21cbcb27f9571ace3ca217b60c5ce2ac8eff95a9 Mon Sep 17 00:00:00 2001 From: Mateusz Moscicki Date: Thu, 8 Jul 2021 15:36:25 +0200 Subject: [PATCH 09/16] device-mapper: Add dm-bow dm-bow is a device mapper driver
that allows creating a checkpoint on a volume so that its state can be restored if necessary. This is needed to protect the ext4 partition during a system upgrade in case of a power failure. Change-Id: Ifb0b9e3cadd84b0e01bc5c7e80dd296be48516c2 Originally-by: Paul Lawrence Origin: https://android.googlesource.com/kernel/common/+/refs/heads/android-mainline/drivers/md/dm-bow.c Signed-off-by: Mateusz Moscicki --- Documentation/device-mapper/dm-bow.txt | 99 +++ drivers/md/Kconfig | 20 + drivers/md/Makefile | 1 + drivers/md/dm-bow.c | 1299 ++++++++++++++++++++++++++++ 4 files changed, 1419 insertions(+) create mode 100644 Documentation/device-mapper/dm-bow.txt create mode 100644 drivers/md/dm-bow.c diff --git a/Documentation/device-mapper/dm-bow.txt b/Documentation/device-mapper/dm-bow.txt new file mode 100644 index 0000000..e3fc4d2 --- /dev/null +++ b/Documentation/device-mapper/dm-bow.txt @@ -0,0 +1,99 @@ +dm_bow (backup on write) +======================== + +dm_bow is a device mapper driver that uses the free space on a device to back up +data that is overwritten. The changes can then be committed by a simple state +change, or rolled back by removing the dm_bow device and running a command line +utility over the underlying device. + +dm_bow has three states, set by writing ‘1’ or ‘2’ to /sys/block/dm-?/bow/state. +It is only possible to go from state 0 (initial state) to state 1, and then from +state 1 to state 2. + +State 0: dm_bow collects all trims to the device and assumes that these mark +free space on the overlying file system that can be safely used. Typically the +mount code would create the dm_bow device, mount the file system, call the +FITRIM ioctl on the file system, then switch to state 1. These trims are not +propagated to the underlying device. + +State 1: All writes to the device cause the underlying data to be backed up to +the free (trimmed) area as needed in such a way that they can be restored. +However, the writes, with one exception, then happen exactly as they would +without dm_bow, so the device is always in a good final state. The exception is +that sector 0 is used to keep a log of the latest changes, both to indicate that +we are in this state and to allow rollback. See below for all details. If there +isn't enough free space, writes are failed with -ENOSPC. + +State 2: The transition to state 2 triggers replacing the special sector 0 with +the normal sector 0, and the freeing of all state information. dm_bow then +becomes a pass-through driver, allowing the device to continue to be used with +minimal performance impact. + +Usage +===== +dm-bow takes one command line parameter, the name of the underlying device. + +dm-bow will typically be used in the following way. dm-bow will be loaded with a +suitable underlying device and the resultant device will be mounted. A file +system trim will be issued via the FITRIM ioctl, then the device will be +switched to state 1. The file system will now be used as normal. At some point, +the changes can either be committed by switching to state 2, or rolled back by +unmounting the file system, removing the dm-bow device and running the command +line utility. Note that rebooting the device will be equivalent to unmounting +and removing, but the command line utility must still be run. +
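The mount-time sequence described in the Usage section above (trim, then switch state) can be driven from user space roughly as in the following stand-alone sketch; it is an illustration only, assuming the dm-bow device happens to be dm-0 and its file system is mounted at /mnt:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range = { .start = 0, .len = (__u64)-1, .minlen = 0 };
	int fd, state;

	/* 1. Trim the mounted file system so dm-bow (state 0) learns which
	 *    blocks are free and safe to use for backups. */
	fd = open("/mnt", O_RDONLY);
	if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	close(fd);

	/* 2. Switch from state 0 (trim collection) to state 1 (backup on
	 *    write). Committing later is the same write with "2". */
	state = open("/sys/block/dm-0/bow/state", O_WRONLY);
	if (state < 0 || write(state, "1", 1) != 1) {
		perror("bow state");
		return 1;
	}
	close(state);
	return 0;
}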
+Details of operation in state 1 +=============================== + +dm_bow maintains a type for all sectors. A sector can be any of: + +SECTOR0 +SECTOR0_CURRENT +UNCHANGED +FREE +CHANGED +BACKUP + +SECTOR0 is the first sector on the device, and is used to hold the log of +changes. This is the one exception. + +SECTOR0_CURRENT is a sector picked from the FREE sectors, and is where reads and +writes from the true sector zero are redirected to. Note that like any backup +sector, if the sector is written to directly, it must be moved again. + +UNCHANGED means that the sector has not been changed since we entered state 1. +Thus if it is written to or trimmed, the contents must first be backed up. + +FREE means that the sector was trimmed in state 0 and has not yet been written +to or used for backup. On being written to, a FREE sector is changed to CHANGED. + +CHANGED means that the sector has been modified, and can be further modified +without further backup. + +BACKUP means that this is a free sector being used as a backup. On being written +to, the contents must first be backed up again. + +All backup operations are logged to the first sector. The log sector has the +format: +-------------------------------------------------------- +| Magic | Count | Sequence | Log entry | Log entry | … +-------------------------------------------------------- + +Magic is a magic number. Count is the number of log entries. Sequence is 0 +initially. A log entry is + +----------------------------------- +| Source | Dest | Size | Checksum | +----------------------------------- + +When SECTOR0 is full, the log sector is backed up and another empty log sector +created with sequence number one higher. The first entry in any log sector with +sequence > 0 therefore must be the log of the backing up of the previous log +sector. Note that sequence is not strictly needed, but is a useful sanity check +and potentially limits the time spent trying to restore a corrupted snapshot. + +On entering state 1, dm_bow has a list of free sectors. All other sectors are +unchanged. Sector0_current is selected from the free sectors and the contents of +sector 0 are copied there. The sector 0 is backed up, which triggers the first +log entry to be written. + diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2cefb07..8e1bd24 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -505,6 +505,26 @@ config DM_FLAKEY help A target that intermittently fails I/O for debugging purposes. +config DM_BOW + tristate "BOW support" + depends on BLK_DEV_DM + select CRYPTO + select CRYPTO_HASH + select DM_BUFIO + help + This device-mapper target uses the free space on a device to + back up data that is overwritten, so that the state of the + device can be restored after an interrupted operation such as + a system upgrade (see Documentation/device-mapper/dm-bow.txt). + + The changes can then be committed by a simple state change, + or rolled back by running a command line utility. + + To compile this code as a module, choose M here: the module + will be called dm-bow. + + If unsure, say N.
+ config DM_VERITY tristate "Verity target support" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 6d3e234..e70b1dc 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -77,6 +77,7 @@ obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o obj-$(CONFIG_DM_ZONED) += dm-zoned.o obj-$(CONFIG_DM_WRITECACHE) += dm-writecache.o +obj-$(CONFIG_DM_BOW) += dm-bow.o ifeq ($(CONFIG_DM_INIT),y) dm-mod-objs += dm-init.o diff --git a/drivers/md/dm-bow.c b/drivers/md/dm-bow.c new file mode 100644 index 0000000..5cd9530 --- /dev/null +++ b/drivers/md/dm-bow.c @@ -0,0 +1,1299 @@ +/* + * Copyright (C) 2018 Google Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-core.h" + +#include <linux/crc32.h> +#include <linux/dm-bufio.h> +#include <linux/module.h> + +#define DM_MSG_PREFIX "bow" + +struct log_entry { + u64 source; + u64 dest; + u32 size; + u32 checksum; +} __packed; + +struct log_sector { + u32 magic; + u16 header_version; + u16 header_size; + u32 block_size; + u32 count; + u32 sequence; + sector_t sector0; + struct log_entry entries[]; +} __packed; + +/* + * MAGIC is BOW in ascii + */ +#define MAGIC 0x00574f42 +#define HEADER_VERSION 0x0100 + +/* + * A sorted set of ranges representing the state of the data on the device. + * Use an rb_tree for fast lookup of a given sector + * Consecutive ranges are always of different type - operations on this + * set must merge matching consecutive ranges. + * + * Top range is always of type TOP + */ +struct bow_range { + struct rb_node node; + sector_t sector; + enum { + INVALID, /* Type not set */ + SECTOR0, /* First sector - holds log record */ + SECTOR0_CURRENT,/* Live contents of sector0 */ + UNCHANGED, /* Original contents */ + TRIMMED, /* Range has been trimmed */ + CHANGED, /* Range has been changed */ + BACKUP, /* Range is being used as a backup */ + TOP, /* Final range - sector is size of device */ + } type; + struct list_head trimmed_list; /* list of TRIMMED ranges */ +}; + +static const char * const readable_type[] = { + "Invalid", + "Sector0", + "Sector0_current", + "Unchanged", + "Free", + "Changed", + "Backup", + "Top", +}; + +enum state { + TRIM, + CHECKPOINT, + COMMITTED, +}; + +struct bow_context { + struct dm_dev *dev; + u32 block_size; + u32 block_shift; + struct workqueue_struct *workqueue; + struct dm_bufio_client *bufio; + struct mutex ranges_lock; /* Hold to access this struct and/or ranges */ + struct rb_root ranges; + struct dm_kobject_holder kobj_holder; /* for sysfs attributes */ + atomic_t state; /* One of the enum state values above */ + u64 trims_total; + struct log_sector *log_sector; + struct list_head trimmed_list; + bool forward_trims; +}; + +sector_t range_top(struct bow_range *br) +{ + return container_of(rb_next(&br->node), struct bow_range, node) + ->sector; +} + +u64 range_size(struct bow_range *br) +{ + return (range_top(br) - br->sector) * SECTOR_SIZE; +} + +static sector_t bvec_top(struct bvec_iter *bi_iter) +{ + return bi_iter->bi_sector + bi_iter->bi_size / SECTOR_SIZE; +} + +/* + * Find the first range that overlaps with bi_iter + * bi_iter is set to the size of the overlapping sub-range + */ +static struct bow_range *find_first_overlapping_range(struct rb_root *ranges, + struct bvec_iter *bi_iter) +{ + struct rb_node *node = ranges->rb_node; + struct bow_range *br; + + while (node) { + br = container_of(node, struct bow_range, node); + + if (br->sector <= bi_iter->bi_sector + && bi_iter->bi_sector < range_top(br)) + break; + + if (bi_iter->bi_sector <
br->sector) + node = node->rb_left; + else + node = node->rb_right; + } + + WARN_ON(!node); + if (!node) + return NULL; + + if (range_top(br) - bi_iter->bi_sector + < bi_iter->bi_size >> SECTOR_SHIFT) + bi_iter->bi_size = (range_top(br) - bi_iter->bi_sector) + << SECTOR_SHIFT; + + return br; +} + +void add_before(struct rb_root *ranges, struct bow_range *new_br, + struct bow_range *existing) +{ + struct rb_node *parent = &(existing->node); + struct rb_node **link = &(parent->rb_left); + + while (*link) { + parent = *link; + link = &((*link)->rb_right); + } + + rb_link_node(&new_br->node, parent, link); + rb_insert_color(&new_br->node, ranges); +} + +/* + * Given a range br returned by find_first_overlapping_range, split br into a + * leading range, a range matching the bi_iter and a trailing range. + * Leading and trailing may end up size 0 and will then be deleted. The + * new range matching the bi_iter is then returned and should have its type + * and type specific fields populated. + * If bi_iter runs off the end of the range, bi_iter is truncated accordingly + */ +static int split_range(struct bow_context *bc, struct bow_range **br, + struct bvec_iter *bi_iter) +{ + struct bow_range *new_br; + + if (bi_iter->bi_sector < (*br)->sector) { + WARN_ON(true); + return BLK_STS_IOERR; + } + + if (bi_iter->bi_sector > (*br)->sector) { + struct bow_range *leading_br = + kzalloc(sizeof(*leading_br), GFP_KERNEL); + + if (!leading_br) + return BLK_STS_RESOURCE; + + *leading_br = **br; + if (leading_br->type == TRIMMED) + list_add(&leading_br->trimmed_list, &bc->trimmed_list); + + add_before(&bc->ranges, leading_br, *br); + (*br)->sector = bi_iter->bi_sector; + } + + if (bvec_top(bi_iter) >= range_top(*br)) { + bi_iter->bi_size = (range_top(*br) - (*br)->sector) + * SECTOR_SIZE; + return BLK_STS_OK; + } + + /* new_br will be the beginning, existing br will be the tail */ + new_br = kzalloc(sizeof(*new_br), GFP_KERNEL); + if (!new_br) + return BLK_STS_RESOURCE; + + new_br->sector = (*br)->sector; + (*br)->sector = bvec_top(bi_iter); + add_before(&bc->ranges, new_br, *br); + *br = new_br; + + return BLK_STS_OK; +} + +/* + * Sets type of a range. 
May merge range into surrounding ranges + * Since br may be invalidated, always sets br to NULL to prevent + * usage after this is called + */ +static void set_type(struct bow_context *bc, struct bow_range **br, int type) +{ + struct bow_range *prev = container_of(rb_prev(&(*br)->node), + struct bow_range, node); + struct bow_range *next = container_of(rb_next(&(*br)->node), + struct bow_range, node); + + if ((*br)->type == TRIMMED) { + bc->trims_total -= range_size(*br); + list_del(&(*br)->trimmed_list); + } + + if (type == TRIMMED) { + bc->trims_total += range_size(*br); + list_add(&(*br)->trimmed_list, &bc->trimmed_list); + } + + (*br)->type = type; + + if (next->type == type) { + if (type == TRIMMED) + list_del(&next->trimmed_list); + rb_erase(&next->node, &bc->ranges); + kfree(next); + } + + if (prev->type == type) { + if (type == TRIMMED) + list_del(&(*br)->trimmed_list); + rb_erase(&(*br)->node, &bc->ranges); + kfree(*br); + } + + *br = NULL; +} + +static struct bow_range *find_free_range(struct bow_context *bc) +{ + if (list_empty(&bc->trimmed_list)) { + DMERR("Unable to find free space to back up to"); + return NULL; + } + + return list_first_entry(&bc->trimmed_list, struct bow_range, + trimmed_list); +} + +static sector_t sector_to_page(struct bow_context const *bc, sector_t sector) +{ + WARN_ON((sector & (((sector_t)1 << (bc->block_shift - SECTOR_SHIFT)) - 1)) + != 0); + return sector >> (bc->block_shift - SECTOR_SHIFT); +} + +static int copy_data(struct bow_context const *bc, + struct bow_range *source, struct bow_range *dest, + u32 *checksum) +{ + int i; + + if (range_size(source) != range_size(dest)) { + WARN_ON(1); + return BLK_STS_IOERR; + } + + if (checksum) + *checksum = sector_to_page(bc, source->sector); + + for (i = 0; i < range_size(source) >> bc->block_shift; ++i) { + struct dm_buffer *read_buffer, *write_buffer; + u8 *read, *write; + sector_t page = sector_to_page(bc, source->sector) + i; + + read = dm_bufio_read(bc->bufio, page, &read_buffer); + if (IS_ERR(read)) { + DMERR("Cannot read page %llu", + (unsigned long long)page); + return PTR_ERR(read); + } + + if (checksum) + *checksum = crc32(*checksum, read, bc->block_size); + + write = dm_bufio_new(bc->bufio, + sector_to_page(bc, dest->sector) + i, + &write_buffer); + if (IS_ERR(write)) { + DMERR("Cannot write sector"); + dm_bufio_release(read_buffer); + return PTR_ERR(write); + } + + memcpy(write, read, bc->block_size); + + dm_bufio_mark_buffer_dirty(write_buffer); + dm_bufio_release(write_buffer); + dm_bufio_release(read_buffer); + } + + dm_bufio_write_dirty_buffers(bc->bufio); + return BLK_STS_OK; +} + +/****** logging functions ******/ + +static int add_log_entry(struct bow_context *bc, sector_t source, sector_t dest, + unsigned int size, u32 checksum); + +static int backup_log_sector(struct bow_context *bc) +{ + struct bow_range *first_br, *free_br; + struct bvec_iter bi_iter; + u32 checksum = 0; + int ret; + + first_br = container_of(rb_first(&bc->ranges), struct bow_range, node); + + if (first_br->type != SECTOR0) { + WARN_ON(1); + return BLK_STS_IOERR; + } + + if (range_size(first_br) != bc->block_size) { + WARN_ON(1); + return BLK_STS_IOERR; + } + + free_br = find_free_range(bc); + /* No space left - return this error to userspace */ + if (!free_br) + return BLK_STS_NOSPC; + bi_iter.bi_sector = free_br->sector; + bi_iter.bi_size = bc->block_size; + ret = split_range(bc, &free_br, &bi_iter); + if (ret) + return ret; + if (bi_iter.bi_size != bc->block_size) { + WARN_ON(1); + return BLK_STS_IOERR; + } + + ret 
= copy_data(bc, first_br, free_br, &checksum); + if (ret) + return ret; + + bc->log_sector->count = 0; + bc->log_sector->sequence++; + ret = add_log_entry(bc, first_br->sector, free_br->sector, + range_size(first_br), checksum); + if (ret) + return ret; + + set_type(bc, &free_br, BACKUP); + return BLK_STS_OK; +} + +static int add_log_entry(struct bow_context *bc, sector_t source, sector_t dest, + unsigned int size, u32 checksum) +{ + struct dm_buffer *sector_buffer; + u8 *sector; + + if (sizeof(struct log_sector) + + sizeof(struct log_entry) * (bc->log_sector->count + 1) + > bc->block_size) { + int ret = backup_log_sector(bc); + + if (ret) + return ret; + } + + sector = dm_bufio_new(bc->bufio, 0, &sector_buffer); + if (IS_ERR(sector)) { + DMERR("Cannot write boot sector"); + dm_bufio_release(sector_buffer); + return BLK_STS_NOSPC; + } + + bc->log_sector->entries[bc->log_sector->count].source = source; + bc->log_sector->entries[bc->log_sector->count].dest = dest; + bc->log_sector->entries[bc->log_sector->count].size = size; + bc->log_sector->entries[bc->log_sector->count].checksum = checksum; + bc->log_sector->count++; + + memcpy(sector, bc->log_sector, bc->block_size); + dm_bufio_mark_buffer_dirty(sector_buffer); + dm_bufio_release(sector_buffer); + dm_bufio_write_dirty_buffers(bc->bufio); + return BLK_STS_OK; +} + +static int prepare_log(struct bow_context *bc) +{ + struct bow_range *free_br, *first_br; + struct bvec_iter bi_iter; + u32 checksum = 0; + int ret; + + /* Carve out first sector as log sector */ + first_br = container_of(rb_first(&bc->ranges), struct bow_range, node); + if (first_br->type != UNCHANGED) { + WARN_ON(1); + return BLK_STS_IOERR; + } + + if (range_size(first_br) < bc->block_size) { + WARN_ON(1); + return BLK_STS_IOERR; + } + bi_iter.bi_sector = 0; + bi_iter.bi_size = bc->block_size; + ret = split_range(bc, &first_br, &bi_iter); + if (ret) + return ret; + first_br->type = SECTOR0; + if (range_size(first_br) != bc->block_size) { + WARN_ON(1); + return BLK_STS_IOERR; + } + + /* Find free sector for active sector0 reads/writes */ + free_br = find_free_range(bc); + if (!free_br) + return BLK_STS_NOSPC; + bi_iter.bi_sector = free_br->sector; + bi_iter.bi_size = bc->block_size; + ret = split_range(bc, &free_br, &bi_iter); + if (ret) + return ret; + + /* Copy data */ + ret = copy_data(bc, first_br, free_br, NULL); + if (ret) + return ret; + + bc->log_sector->sector0 = free_br->sector; + + set_type(bc, &free_br, SECTOR0_CURRENT); + + /* Find free sector to back up original sector zero */ + free_br = find_free_range(bc); + if (!free_br) + return BLK_STS_NOSPC; + bi_iter.bi_sector = free_br->sector; + bi_iter.bi_size = bc->block_size; + ret = split_range(bc, &free_br, &bi_iter); + if (ret) + return ret; + + /* Back up */ + ret = copy_data(bc, first_br, free_br, &checksum); + if (ret) + return ret; + + /* + * Set up our replacement boot sector - it will get written when we + * add the first log entry, which we do immediately + */ + bc->log_sector->magic = MAGIC; + bc->log_sector->header_version = HEADER_VERSION; + bc->log_sector->header_size = sizeof(*bc->log_sector); + bc->log_sector->block_size = bc->block_size; + bc->log_sector->count = 0; + bc->log_sector->sequence = 0; + + /* Add log entry */ + ret = add_log_entry(bc, first_br->sector, free_br->sector, + range_size(first_br), checksum); + if (ret) + return ret; + + set_type(bc, &free_br, BACKUP); + return BLK_STS_OK; +} + +static struct bow_range *find_sector0_current(struct bow_context *bc) +{ + struct bvec_iter bi_iter; +
bi_iter.bi_sector = bc->log_sector->sector0; + bi_iter.bi_size = bc->block_size; + return find_first_overlapping_range(&bc->ranges, &bi_iter); +} + +/****** sysfs interface functions ******/ + +static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct bow_context *bc = container_of(kobj, struct bow_context, + kobj_holder.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", atomic_read(&bc->state)); +} + +static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct bow_context *bc = container_of(kobj, struct bow_context, + kobj_holder.kobj); + enum state state, original_state; + int ret; + + state = buf[0] - '0'; + if (state < TRIM || state > COMMITTED) { + DMERR("State value %d out of range", state); + return -EINVAL; + } + + mutex_lock(&bc->ranges_lock); + original_state = atomic_read(&bc->state); + if (state != original_state + 1) { + DMERR("Invalid state change from %d to %d", + original_state, state); + ret = -EINVAL; + goto bad; + } + + DMINFO("Switching to state %s", state == CHECKPOINT ? "Checkpoint" + : state == COMMITTED ? "Committed" : "Unknown"); + + if (state == CHECKPOINT) { + ret = prepare_log(bc); + if (ret) { + DMERR("Failed to switch to checkpoint state"); + goto bad; + } + } else if (state == COMMITTED) { + struct bow_range *br = find_sector0_current(bc); + struct bow_range *sector0_br = + container_of(rb_first(&bc->ranges), struct bow_range, + node); + + ret = copy_data(bc, br, sector0_br, 0); + if (ret) { + DMERR("Failed to switch to committed state"); + goto bad; + } + } + atomic_inc(&bc->state); + ret = count; + +bad: + mutex_unlock(&bc->ranges_lock); + return ret; +} + +static ssize_t free_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct bow_context *bc = container_of(kobj, struct bow_context, + kobj_holder.kobj); + u64 trims_total; + + mutex_lock(&bc->ranges_lock); + trims_total = bc->trims_total; + mutex_unlock(&bc->ranges_lock); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", trims_total); +} + +static struct kobj_attribute attr_state = __ATTR_RW(state); +static struct kobj_attribute attr_free = __ATTR_RO(free); + +static struct attribute *bow_attrs[] = { + &attr_state.attr, + &attr_free.attr, + NULL +}; + +static struct kobj_type bow_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = bow_attrs, + .release = dm_kobject_release +}; + +/****** constructor/destructor ******/ + +static void dm_bow_dtr(struct dm_target *ti) +{ + struct bow_context *bc = (struct bow_context *) ti->private; + struct kobject *kobj; + + if (bc->workqueue) + destroy_workqueue(bc->workqueue); + if (bc->bufio) + dm_bufio_client_destroy(bc->bufio); + + kobj = &bc->kobj_holder.kobj; + if (kobj->state_initialized) { + kobject_put(kobj); + wait_for_completion(dm_get_completion_from_kobject(kobj)); + } + + while (rb_first(&bc->ranges)) { + struct bow_range *br = container_of(rb_first(&bc->ranges), + struct bow_range, node); + + rb_erase(&br->node, &bc->ranges); + kfree(br); + } + + mutex_destroy(&bc->ranges_lock); + kfree(bc->log_sector); + kfree(bc); +} + +static void dm_bow_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct bow_context *bc = ti->private; + const unsigned int block_size = bc->block_size; + + limits->logical_block_size = + max_t(unsigned int, limits->logical_block_size, block_size); + limits->physical_block_size = + max_t(unsigned int, limits->physical_block_size, block_size); + limits->io_min = max_t(unsigned int, 
limits->io_min, block_size); + + if (limits->max_discard_sectors == 0) { + limits->discard_granularity = 1 << 12; + limits->max_hw_discard_sectors = 1 << 15; + limits->max_discard_sectors = 1 << 15; + bc->forward_trims = false; + } else { + limits->discard_granularity = 1 << 12; + bc->forward_trims = true; + } +} + +static int dm_bow_ctr_optional(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct bow_context *bc = ti->private; + struct dm_arg_set as; + static const struct dm_arg _args[] = { + {0, 1, "Invalid number of feature args"}, + }; + unsigned int opt_params; + const char *opt_string; + int err; + char dummy; + + as.argc = argc; + as.argv = argv; + + err = dm_read_arg_group(_args, &as, &opt_params, &ti->error); + if (err) + return err; + + while (opt_params--) { + opt_string = dm_shift_arg(&as); + if (!opt_string) { + ti->error = "Not enough feature arguments"; + return -EINVAL; + } + + if (sscanf(opt_string, "block_size:%u%c", + &bc->block_size, &dummy) == 1) { + if (bc->block_size < SECTOR_SIZE || + bc->block_size > 4096 || + !is_power_of_2(bc->block_size)) { + ti->error = "Invalid block_size"; + return -EINVAL; + } + } else { + ti->error = "Invalid feature arguments"; + return -EINVAL; + } + } + + return 0; +} + +static int dm_bow_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct bow_context *bc; + struct bow_range *br; + int ret; + struct mapped_device *md = dm_table_get_md(ti->table); + + if (argc < 1) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + bc = kzalloc(sizeof(*bc), GFP_KERNEL); + if (!bc) { + ti->error = "Cannot allocate bow context"; + return -ENOMEM; + } + + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->num_write_same_bios = 1; + ti->private = bc; + + ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), + &bc->dev); + if (ret) { + ti->error = "Device lookup failed"; + goto bad; + } + + bc->block_size = + bdev_get_queue(bc->dev->bdev)->limits.logical_block_size; + if (argc > 1) { + ret = dm_bow_ctr_optional(ti, argc - 1, &argv[1]); + if (ret) + goto bad; + } + + bc->block_shift = ilog2(bc->block_size); + bc->log_sector = kzalloc(bc->block_size, GFP_KERNEL); + if (!bc->log_sector) { + ti->error = "Cannot allocate log sector"; + goto bad; + } + + init_completion(&bc->kobj_holder.completion); + ret = kobject_init_and_add(&bc->kobj_holder.kobj, &bow_ktype, + &disk_to_dev(dm_disk(md))->kobj, "%s", + "bow"); + if (ret) { + ti->error = "Cannot create sysfs node"; + goto bad; + } + + mutex_init(&bc->ranges_lock); + bc->ranges = RB_ROOT; + bc->bufio = dm_bufio_client_create(bc->dev->bdev, bc->block_size, 1, 0, + NULL, NULL); + if (IS_ERR(bc->bufio)) { + ti->error = "Cannot initialize dm-bufio"; + ret = PTR_ERR(bc->bufio); + bc->bufio = NULL; + goto bad; + } + + bc->workqueue = alloc_workqueue("dm-bow", + WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM + | WQ_UNBOUND, num_online_cpus()); + if (!bc->workqueue) { + ti->error = "Cannot allocate workqueue"; + ret = -ENOMEM; + goto bad; + } + + INIT_LIST_HEAD(&bc->trimmed_list); + + br = kzalloc(sizeof(*br), GFP_KERNEL); + if (!br) { + ti->error = "Cannot allocate ranges"; + ret = -ENOMEM; + goto bad; + } + + br->sector = ti->len; + br->type = TOP; + rb_link_node(&br->node, NULL, &bc->ranges.rb_node); + rb_insert_color(&br->node, &bc->ranges); + + br = kzalloc(sizeof(*br), GFP_KERNEL); + if (!br) { + ti->error = "Cannot allocate ranges"; + ret = -ENOMEM; + goto bad; + } + + br->sector = 0; + br->type = UNCHANGED; + rb_link_node(&br->node, bc->ranges.rb_node, + 
&bc->ranges.rb_node->rb_left); + rb_insert_color(&br->node, &bc->ranges); + + ti->discards_supported = true; + + return 0; + +bad: + dm_bow_dtr(ti); + return ret; +} + +/****** Handle writes ******/ + +static int prepare_unchanged_range(struct bow_context *bc, struct bow_range *br, + struct bvec_iter *bi_iter, + bool record_checksum) +{ + struct bow_range *backup_br; + struct bvec_iter backup_bi; + sector_t log_source, log_dest; + unsigned int log_size; + u32 checksum = 0; + int ret; + int original_type; + sector_t sector0; + + /* Find a free range */ + backup_br = find_free_range(bc); + if (!backup_br) + return BLK_STS_NOSPC; + + /* Carve out a backup range. This may be smaller than the br given */ + backup_bi.bi_sector = backup_br->sector; + backup_bi.bi_size = min(range_size(backup_br), (u64) bi_iter->bi_size); + ret = split_range(bc, &backup_br, &backup_bi); + if (ret) + return ret; + + /* + * Carve out a changed range. This will not be smaller than the backup + * br since the backup br is smaller than the source range and iterator + */ + bi_iter->bi_size = backup_bi.bi_size; + ret = split_range(bc, &br, bi_iter); + if (ret) + return ret; + if (range_size(br) != range_size(backup_br)) { + WARN_ON(1); + return BLK_STS_IOERR; + } + + + /* Copy data over */ + ret = copy_data(bc, br, backup_br, record_checksum ? &checksum : NULL); + if (ret) + return ret; + + /* Add an entry to the log */ + log_source = br->sector; + log_dest = backup_br->sector; + log_size = range_size(br); + + /* + * Set the types. Note that since set_type also amalgamates ranges + * we have to set both sectors to their final type before calling + * set_type on either + */ + original_type = br->type; + sector0 = backup_br->sector; + bc->trims_total -= range_size(backup_br); + if (backup_br->type == TRIMMED) + list_del(&backup_br->trimmed_list); + backup_br->type = br->type == SECTOR0_CURRENT ? SECTOR0_CURRENT + : BACKUP; + br->type = CHANGED; + set_type(bc, &backup_br, backup_br->type); + + /* + * Add the log entry after marking the backup sector, since adding a log + * can cause another backup + */ + ret = add_log_entry(bc, log_source, log_dest, log_size, checksum); + if (ret) { + br->type = original_type; + return ret; + } + + /* Now it is safe to mark this backup successful */ + if (original_type == SECTOR0_CURRENT) + bc->log_sector->sector0 = sector0; + + set_type(bc, &br, br->type); + return ret; +} + +static int prepare_free_range(struct bow_context *bc, struct bow_range *br, + struct bvec_iter *bi_iter) +{ + int ret; + + ret = split_range(bc, &br, bi_iter); + if (ret) + return ret; + set_type(bc, &br, CHANGED); + return BLK_STS_OK; +} + +static int prepare_changed_range(struct bow_context *bc, struct bow_range *br, + struct bvec_iter *bi_iter) +{ + /* Nothing to do ... */ + return BLK_STS_OK; +} + +static int prepare_one_range(struct bow_context *bc, + struct bvec_iter *bi_iter) +{ + struct bow_range *br = find_first_overlapping_range(&bc->ranges, + bi_iter); + switch (br->type) { + case CHANGED: + return prepare_changed_range(bc, br, bi_iter); + + case TRIMMED: + return prepare_free_range(bc, br, bi_iter); + + case UNCHANGED: + case BACKUP: + return prepare_unchanged_range(bc, br, bi_iter, true); + + /* + * We cannot track the checksum for the active sector0, since it + * may change at any point. 
+ */ + case SECTOR0_CURRENT: + return prepare_unchanged_range(bc, br, bi_iter, false); + + case SECTOR0: /* Handled in the dm_bow_map */ + case TOP: /* Illegal - top is off the end of the device */ + default: + WARN_ON(1); + return BLK_STS_IOERR; + } +} + +struct write_work { + struct work_struct work; + struct bow_context *bc; + struct bio *bio; +}; + +static void bow_write(struct work_struct *work) +{ + struct write_work *ww = container_of(work, struct write_work, work); + struct bow_context *bc = ww->bc; + struct bio *bio = ww->bio; + struct bvec_iter bi_iter = bio->bi_iter; + int ret = BLK_STS_OK; + + kfree(ww); + + mutex_lock(&bc->ranges_lock); + do { + ret = prepare_one_range(bc, &bi_iter); + bi_iter.bi_sector += bi_iter.bi_size / SECTOR_SIZE; + bi_iter.bi_size = bio->bi_iter.bi_size + - (bi_iter.bi_sector - bio->bi_iter.bi_sector) + * SECTOR_SIZE; + } while (!ret && bi_iter.bi_size); + + mutex_unlock(&bc->ranges_lock); + + if (!ret) { + bio_set_dev(bio, bc->dev->bdev); + submit_bio(bio); + } else { + DMERR("Write failure with error %d", -ret); + bio->bi_status = ret; + bio_endio(bio); + } +} + +static int queue_write(struct bow_context *bc, struct bio *bio) +{ + struct write_work *ww = kmalloc(sizeof(*ww), GFP_NOIO | __GFP_NORETRY + | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (!ww) { + DMERR("Failed to allocate write_work"); + return -ENOMEM; + } + + INIT_WORK(&ww->work, bow_write); + ww->bc = bc; + ww->bio = bio; + queue_work(bc->workqueue, &ww->work); + return DM_MAPIO_SUBMITTED; +} + +static int handle_sector0(struct bow_context *bc, struct bio *bio) +{ + int ret = DM_MAPIO_REMAPPED; + + if (bio->bi_iter.bi_size > bc->block_size) { + struct bio * split = bio_split(bio, + bc->block_size >> SECTOR_SHIFT, + GFP_NOIO, + &fs_bio_set); + if (!split) { + DMERR("Failed to split bio"); + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + + bio_chain(split, bio); + split->bi_iter.bi_sector = bc->log_sector->sector0; + bio_set_dev(split, bc->dev->bdev); + submit_bio(split); + + if (bio_data_dir(bio) == WRITE) + ret = queue_write(bc, bio); + } else { + bio->bi_iter.bi_sector = bc->log_sector->sector0; + } + + return ret; +} + +static int add_trim(struct bow_context *bc, struct bio *bio) +{ + struct bow_range *br; + struct bvec_iter bi_iter = bio->bi_iter; + + DMDEBUG("add_trim: %llu, %u", + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size); + + do { + br = find_first_overlapping_range(&bc->ranges, &bi_iter); + + switch (br->type) { + case UNCHANGED: + if (!split_range(bc, &br, &bi_iter)) + set_type(bc, &br, TRIMMED); + break; + + case TRIMMED: + /* Nothing to do */ + break; + + default: + /* No other case is legal in TRIM state */ + WARN_ON(true); + break; + } + + bi_iter.bi_sector += bi_iter.bi_size / SECTOR_SIZE; + bi_iter.bi_size = bio->bi_iter.bi_size + - (bi_iter.bi_sector - bio->bi_iter.bi_sector) + * SECTOR_SIZE; + + } while (bi_iter.bi_size); + + bio_endio(bio); + return DM_MAPIO_SUBMITTED; +} + +static int remove_trim(struct bow_context *bc, struct bio *bio) +{ + struct bow_range *br; + struct bvec_iter bi_iter = bio->bi_iter; + + DMDEBUG("remove_trim: %llu, %u", + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size); + + do { + br = find_first_overlapping_range(&bc->ranges, &bi_iter); + + switch (br->type) { + case UNCHANGED: + /* Nothing to do */ + break; + + case TRIMMED: + if (!split_range(bc, &br, &bi_iter)) + set_type(bc, &br, UNCHANGED); + break; + + default: + /* No other case is legal in TRIM state */ + 
WARN_ON(true); + break; + } + + bi_iter.bi_sector += bi_iter.bi_size / SECTOR_SIZE; + bi_iter.bi_size = bio->bi_iter.bi_size + - (bi_iter.bi_sector - bio->bi_iter.bi_sector) + * SECTOR_SIZE; + + } while (bi_iter.bi_size); + + return DM_MAPIO_REMAPPED; +} + +int remap_unless_illegal_trim(struct bow_context *bc, struct bio *bio) +{ + if (!bc->forward_trims && bio_op(bio) == REQ_OP_DISCARD) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } else { + bio_set_dev(bio, bc->dev->bdev); + return DM_MAPIO_REMAPPED; + } +} + +/****** dm interface ******/ + +static int dm_bow_map(struct dm_target *ti, struct bio *bio) +{ + int ret = DM_MAPIO_REMAPPED; + struct bow_context *bc = ti->private; + + if (likely(bc->state.counter == COMMITTED)) + return remap_unless_illegal_trim(bc, bio); + + if (bio_data_dir(bio) == READ && bio->bi_iter.bi_sector != 0) + return remap_unless_illegal_trim(bc, bio); + + if (atomic_read(&bc->state) != COMMITTED) { + enum state state; + + mutex_lock(&bc->ranges_lock); + state = atomic_read(&bc->state); + if (state == TRIM) { + if (bio_op(bio) == REQ_OP_DISCARD) + ret = add_trim(bc, bio); + else if (bio_data_dir(bio) == WRITE) + ret = remove_trim(bc, bio); + else + /* pass-through */; + } else if (state == CHECKPOINT) { + if (bio->bi_iter.bi_sector == 0) + ret = handle_sector0(bc, bio); + else if (bio_data_dir(bio) == WRITE) + ret = queue_write(bc, bio); + else + /* pass-through */; + } else { + /* pass-through */ + } + mutex_unlock(&bc->ranges_lock); + } + + if (ret == DM_MAPIO_REMAPPED) + return remap_unless_illegal_trim(bc, bio); + + return ret; +} + +static void dm_bow_tablestatus(struct dm_target *ti, char *result, + unsigned int maxlen) +{ + char *end = result + maxlen; + struct bow_context *bc = ti->private; + struct rb_node *i; + int trimmed_list_length = 0; + int trimmed_range_count = 0; + struct bow_range *br; + + if (maxlen == 0) + return; + result[0] = 0; + + list_for_each_entry(br, &bc->trimmed_list, trimmed_list) + if (br->type == TRIMMED) { + ++trimmed_list_length; + } else { + scnprintf(result, end - result, + "ERROR: non-trimmed entry in trimmed_list"); + return; + } + + if (!rb_first(&bc->ranges)) { + scnprintf(result, end - result, "ERROR: Empty ranges"); + return; + } + + if (container_of(rb_first(&bc->ranges), struct bow_range, node) + ->sector) { + scnprintf(result, end - result, + "ERROR: First range does not start at sector 0"); + return; + } + + for (i = rb_first(&bc->ranges); i; i = rb_next(i)) { + struct bow_range *br = container_of(i, struct bow_range, node); + + result += scnprintf(result, end - result, "%s: %llu", + readable_type[br->type], + (unsigned long long)br->sector); + if (result >= end) + return; + + result += scnprintf(result, end - result, "\n"); + if (result >= end) + return; + + if (br->type == TRIMMED) + ++trimmed_range_count; + + if (br->type == TOP) { + if (br->sector != ti->len) { + scnprintf(result, end - result, + "\nERROR: Top sector is incorrect"); + } + + if (&br->node != rb_last(&bc->ranges)) { + scnprintf(result, end - result, + "\nERROR: Top sector is not last"); + } + + break; + } + + if (!rb_next(i)) { + scnprintf(result, end - result, + "\nERROR: Last range not of type TOP"); + return; + } + + if (br->sector > range_top(br)) { + scnprintf(result, end - result, + "\nERROR: sectors out of order"); + return; + } + } + + if (trimmed_range_count != trimmed_list_length) + scnprintf(result, end - result, + "\nERROR: not all trimmed ranges in trimmed list"); +} + +static void 
dm_bow_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, + unsigned int maxlen) +{ + switch (type) { + case STATUSTYPE_INFO: + if (maxlen) + result[0] = 0; + break; + + case STATUSTYPE_TABLE: + dm_bow_tablestatus(ti, result, maxlen); + break; + } +} + +int dm_bow_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +{ + struct bow_context *bc = ti->private; + struct dm_dev *dev = bc->dev; + + *bdev = dev->bdev; + /* Only pass ioctls through if the device sizes match exactly. */ + return ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; +} + +static int dm_bow_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct bow_context *bc = ti->private; + + return fn(ti, bc->dev, 0, ti->len, data); +} + +static struct target_type bow_target = { + .name = "bow", + .version = {1, 2, 0}, + .module = THIS_MODULE, + .ctr = dm_bow_ctr, + .dtr = dm_bow_dtr, + .map = dm_bow_map, + .status = dm_bow_status, + .prepare_ioctl = dm_bow_prepare_ioctl, + .iterate_devices = dm_bow_iterate_devices, + .io_hints = dm_bow_io_hints, +}; + +int __init dm_bow_init(void) +{ + int r = dm_register_target(&bow_target); + + if (r < 0) + DMERR("registering bow failed %d", r); + return r; +} + +void dm_bow_exit(void) +{ + dm_unregister_target(&bow_target); +} + +MODULE_LICENSE("GPL"); + +module_init(dm_bow_init); +module_exit(dm_bow_exit); -- 2.7.4 From d9b1612efd214fd2112223b86b8cfcfea1bde4ef Mon Sep 17 00:00:00 2001 From: Mateusz Moscicki Date: Thu, 8 Jul 2021 15:40:21 +0200 Subject: [PATCH 10/16] tizen_bcm2711_defconfig: Enable dm-bow This enables the dm-bow device-mapper driver to allow checkpoint & restore on ext4 partitions. Change-Id: Ic88811b4e9b66bc42e12e220e420c75464cf6ea8 Signed-off-by: Mateusz Moscicki [sw0312.kim: sync with savedefconfig] Signed-off-by: Seung-Woo Kim --- arch/arm/configs/tizen_bcm2711_defconfig | 1 + arch/arm64/configs/tizen_bcm2711_defconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm/configs/tizen_bcm2711_defconfig b/arch/arm/configs/tizen_bcm2711_defconfig index fac41b7..d8157e2 100644 --- a/arch/arm/configs/tizen_bcm2711_defconfig +++ b/arch/arm/configs/tizen_bcm2711_defconfig @@ -219,6 +219,7 @@ CONFIG_MD=y CONFIG_BLK_DEV_DM=y CONFIG_DM_CRYPT=y CONFIG_DM_UEVENT=y +CONFIG_DM_BOW=y CONFIG_DM_VERITY=y CONFIG_NETDEVICES=y CONFIG_WIREGUARD=y diff --git a/arch/arm64/configs/tizen_bcm2711_defconfig b/arch/arm64/configs/tizen_bcm2711_defconfig index c6af152..5563402 100644 --- a/arch/arm64/configs/tizen_bcm2711_defconfig +++ b/arch/arm64/configs/tizen_bcm2711_defconfig @@ -217,6 +217,7 @@ CONFIG_MD=y CONFIG_BLK_DEV_DM=y CONFIG_DM_CRYPT=y CONFIG_DM_UEVENT=y +CONFIG_DM_BOW=y CONFIG_DM_VERITY=y CONFIG_NETDEVICES=y CONFIG_WIREGUARD=y -- 2.7.4 From 213af95b4919cfefd37fd3bed53c9b342896462a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 14 Dec 2020 15:26:14 +0100 Subject: [PATCH 11/16] ovl: do not fail because of O_NOATIME In case the file cannot be opened with O_NOATIME because of lack of capabilities, then clear O_NOATIME instead of failing. Remove WARN_ON(), since it would now trigger if O_NOATIME was cleared. Noticed by Amir Goldstein.
Signed-off-by: Miklos Szeredi [sw0312.kim: backport v5.11 mainline commit b6650dab404c to resolve overlayfs file open EPERM fail issue in v5.10] Signed-off-by: Seung-Woo Kim Change-Id: I9c0c74747bb4a208fc68ca561f740281b2b553fe --- fs/overlayfs/file.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 5c5c397..1cb844b 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -53,9 +53,10 @@ static struct file *ovl_open_realfile(const struct file *file, err = inode_permission(realinode, MAY_OPEN | acc_mode); if (err) { realfile = ERR_PTR(err); - } else if (!inode_owner_or_capable(realinode)) { - realfile = ERR_PTR(-EPERM); } else { + if (!inode_owner_or_capable(realinode)) + flags &= ~O_NOATIME; + realfile = open_with_fake_path(&file->f_path, flags, realinode, current_cred()); } @@ -75,12 +76,6 @@ static int ovl_change_flags(struct file *file, unsigned int flags) struct inode *inode = file_inode(file); int err; - flags |= OVL_OPEN_FLAGS; - - /* If some flag changed that cannot be changed then something's amiss */ - if (WARN_ON((file->f_flags ^ flags) & ~OVL_SETFL_MASK)) - return -EIO; - flags &= OVL_SETFL_MASK; if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode)) -- 2.7.4 From 57dba3134e126db09c287597e6ac1e97d81d5ad3 Mon Sep 17 00:00:00 2001 From: Kiwoong Ha Date: Tue, 9 Mar 2021 06:41:17 +0900 Subject: [PATCH 12/16] tizen_bcm2711_defconfig: Enable configs for docker Enable below configs for docker-engine CONFIG_IP_VS=y CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=y CONFIG_NETFILTER_XT_MATCH_IPVS=y CONFIG_VETH=y Change-Id: I49c809b50d40eb653cb3cb6da12c5655788cd0f9 Signed-off-by: Kiwoong Ha --- arch/arm/configs/tizen_bcm2711_defconfig | 4 ++++ arch/arm64/configs/tizen_bcm2711_defconfig | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/arch/arm/configs/tizen_bcm2711_defconfig b/arch/arm/configs/tizen_bcm2711_defconfig index d8157e2..50b4456 100644 --- a/arch/arm/configs/tizen_bcm2711_defconfig +++ b/arch/arm/configs/tizen_bcm2711_defconfig @@ -121,6 +121,7 @@ CONFIG_NETFILTER_XT_TARGET_NETMAP=y CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y CONFIG_NETFILTER_XT_TARGET_SECMARK=y CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=y CONFIG_NETFILTER_XT_MATCH_CGROUP=y CONFIG_NETFILTER_XT_MATCH_COMMENT=y CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y @@ -129,6 +130,7 @@ CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y CONFIG_NETFILTER_XT_MATCH_HELPER=y CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_IPVS=y CONFIG_NETFILTER_XT_MATCH_LENGTH=y CONFIG_NETFILTER_XT_MATCH_LIMIT=y CONFIG_NETFILTER_XT_MATCH_MAC=y @@ -143,6 +145,7 @@ CONFIG_NETFILTER_XT_MATCH_STRING=y CONFIG_NETFILTER_XT_MATCH_TCPMSS=y CONFIG_NETFILTER_XT_MATCH_TIME=y CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_IP_VS=y CONFIG_IP_NF_IPTABLES=y CONFIG_IP_NF_MATCH_AH=y CONFIG_IP_NF_MATCH_ECN=y @@ -224,6 +227,7 @@ CONFIG_DM_VERITY=y CONFIG_NETDEVICES=y CONFIG_WIREGUARD=y CONFIG_TUN=y +CONFIG_VETH=y # CONFIG_NET_VENDOR_AURORA is not set CONFIG_BCMGENET=y CONFIG_PPP=y diff --git a/arch/arm64/configs/tizen_bcm2711_defconfig b/arch/arm64/configs/tizen_bcm2711_defconfig index 5563402..a68bf74 100644 --- a/arch/arm64/configs/tizen_bcm2711_defconfig +++ b/arch/arm64/configs/tizen_bcm2711_defconfig @@ -124,6 +124,7 @@ CONFIG_NETFILTER_XT_TARGET_NETMAP=y CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y CONFIG_NETFILTER_XT_TARGET_SECMARK=y CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=y CONFIG_NETFILTER_XT_MATCH_CGROUP=y 
CONFIG_NETFILTER_XT_MATCH_COMMENT=y CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y @@ -132,6 +133,7 @@ CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y CONFIG_NETFILTER_XT_MATCH_HELPER=y CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_IPVS=y CONFIG_NETFILTER_XT_MATCH_LENGTH=y CONFIG_NETFILTER_XT_MATCH_LIMIT=y CONFIG_NETFILTER_XT_MATCH_MAC=y @@ -146,6 +148,7 @@ CONFIG_NETFILTER_XT_MATCH_STRING=y CONFIG_NETFILTER_XT_MATCH_TCPMSS=y CONFIG_NETFILTER_XT_MATCH_TIME=y CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_IP_VS=y CONFIG_IP_NF_IPTABLES=y CONFIG_IP_NF_MATCH_AH=y CONFIG_IP_NF_MATCH_ECN=y @@ -222,6 +225,7 @@ CONFIG_DM_VERITY=y CONFIG_NETDEVICES=y CONFIG_WIREGUARD=y CONFIG_TUN=y +CONFIG_VETH=y # CONFIG_NET_VENDOR_AURORA is not set CONFIG_BCMGENET=y CONFIG_PPP=y -- 2.7.4 From 99eef1bd4a4f1d9a1351e68d54cebe86a38afe7c Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Mon, 14 Dec 2020 19:13:26 -0800 Subject: [PATCH 13/16] mm: cma: improve pr_debug log in cma_release() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit It is required to print 'count' of pages, along with the pages, passed to cma_release to debug the cases of mismatched count value passed between cma_alloc() and cma_release() from a code path. As an example, consider the below scenario: 1) CMA pool size is 4MB and 2) User doing the erroneous step of allocating 2 pages but freeing 1 page in a loop from this CMA pool. The step 2 causes cma_alloc() to return NULL at one point of time because of -ENOMEM condition. And the current pr_debug logs is not giving the info about these types of allocation patterns because of count value not being printed in cma_release(). We are printing the count value in the trace logs, just extend the same to pr_debug logs too. [akpm@linux-foundation.org: fix printk warning] Link: https://lkml.kernel.org/r/1606318341-29521-1-git-send-email-charante@codeaurora.org Signed-off-by: Charan Teja Reddy Reviewed-by: Souptick Joarder Reviewed-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Vinayak Menon Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Origin: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=b8ca396f984295ba09f25f6982f9abd0bb7f5a29 Signed-off-by: Łukasz Stelmach Change-Id: Ic44cb49df247f690145a2d9dc8279aa94de4205f --- mm/cma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/cma.c b/mm/cma.c index 7f415d7..a751ed4 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -512,7 +512,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) if (!cma || !pages) return false; - pr_debug("%s(page %p)\n", __func__, (void *)pages); + pr_debug("%s(page %p, count %u)\n", __func__, (void *)pages, count); pfn = page_to_pfn(pages); -- 2.7.4 From 48526e75139ceaf3c7ce040a96529273ba434a39 Mon Sep 17 00:00:00 2001 From: Patrick Daly Date: Thu, 25 Feb 2021 17:16:44 -0800 Subject: [PATCH 14/16] mm: cma: print region name on failure MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Print the name of the CMA region for convenience. This is useful information to have when cma_alloc() fails. 
From 48526e75139ceaf3c7ce040a96529273ba434a39 Mon Sep 17 00:00:00 2001
From: Patrick Daly
Date: Thu, 25 Feb 2021 17:16:44 -0800
Subject: [PATCH 14/16] mm: cma: print region name on failure
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Print the name of the CMA region for convenience. This is useful
information to have when cma_alloc() fails.

[pdaly@codeaurora.org: print the "count" variable]
Link: https://lkml.kernel.org/r/20210209142414.12768-1-georgi.djakov@linaro.org
Link: https://lkml.kernel.org/r/20210208115200.20286-1-georgi.djakov@linaro.org
Signed-off-by: Patrick Daly
Signed-off-by: Georgi Djakov
Acked-by: Minchan Kim
Reviewed-by: David Hildenbrand
Reviewed-by: Randy Dunlap
Cc: Minchan Kim
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
Origin: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=a052d4d13d88c2073d1339d9dce02cba7b4dc609
Signed-off-by: Łukasz Stelmach
Change-Id: Id498bfa25916823d7e3d804db01691916fc8fac4
---
 mm/cma.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/cma.c b/mm/cma.c
index a751ed4..ebcf221 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -486,8 +486,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
 	}
 
 	if (ret && !no_warn) {
-		pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n",
-		       __func__, count, ret);
+		pr_err("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n",
+		       __func__, cma->name, count, ret);
 		cma_debug_show_areas(cma);
 	}
--
2.7.4

From 72e99a1c8712c4ac92ddbb3b17c1074ebfcbb49e Mon Sep 17 00:00:00 2001
From: Minchan Kim
Date: Wed, 21 Apr 2021 14:44:11 +1000
Subject: [PATCH 15/16] mm: vmstat: add cma statistics
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Since CMA is used more widely, it is worth having CMA allocation
statistics in vmstat. With them, we can see how aggressively the
system uses CMA allocation and how often it fails.

Link: https://lkml.kernel.org/r/20210302183346.3707237-1-minchan@kernel.org
Signed-off-by: Minchan Kim
Reviewed-by: John Hubbard
Cc: John Dias
Cc: Suren Baghdasaryan
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
Origin: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=bbb269206f3c914d4f23e023de4ec020abea6d1b
Signed-off-by: Łukasz Stelmach
Change-Id: Iadf5fef76fe6dfa41177cfccf022ea39ae1cb19e
---
 include/linux/vm_event_item.h |  4 ++++
 mm/cma.c                      | 12 +++++++++---
 mm/vmstat.c                   |  4 ++++
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 18e7597..21d7c7f 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -71,6 +71,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_HUGETLB_PAGE
 	HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
 #endif
+#ifdef CONFIG_CMA
+	CMA_ALLOC_SUCCESS,
+	CMA_ALLOC_FAIL,
+#endif
 	UNEVICTABLE_PGCULLED,	/* culled to noreclaim list */
 	UNEVICTABLE_PGSCANNED,	/* scanned for reclaimability */
 	UNEVICTABLE_PGRESCUED,	/* rescued from noreclaim list */
diff --git a/mm/cma.c b/mm/cma.c
index ebcf221..9749e95 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -420,13 +420,13 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
 	int ret = -ENOMEM;
 
 	if (!cma || !cma->count || !cma->bitmap)
-		return NULL;
+		goto out;
 
 	pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma,
 		 count, align);
 
 	if (!count)
-		return NULL;
+		goto out;
 
 	mask = cma_bitmap_aligned_mask(cma, align);
 	offset = cma_bitmap_aligned_offset(cma, align);
@@ -434,7 +434,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
 	bitmap_count = cma_bitmap_pages_to_bits(cma, count);
 
 	if (bitmap_count > bitmap_maxno)
-		return NULL;
+		goto out;
 
 	for (;;) {
 		mutex_lock(&cma->lock);
@@ -492,6 +492,12 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
 	}
 
 	pr_debug("%s(): returned %p\n", __func__, page);
+out:
+	if (page)
+		count_vm_event(CMA_ALLOC_SUCCESS);
+	else
+		count_vm_event(CMA_ALLOC_FAIL);
+
 	return page;
 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 698bc0b..2cf6681 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1298,6 +1298,10 @@ const char * const vmstat_text[] = {
 	"htlb_buddy_alloc_success",
 	"htlb_buddy_alloc_fail",
 #endif
+#ifdef CONFIG_CMA
+	"cma_alloc_success",
+	"cma_alloc_fail",
+#endif
 	"unevictable_pgs_culled",
 	"unevictable_pgs_scanned",
 	"unevictable_pgs_rescued",
--
2.7.4
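With PATCH 15 applied and CONFIG_CMA=y, the two new counters appear as the cma_alloc_success and cma_alloc_fail lines of /proc/vmstat, so plain file I/O is enough to read them. A small userspace sketch (nothing here is kernel API):

    /* Print the CMA allocation counters introduced by the patch above. */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char line[128];
            FILE *f = fopen("/proc/vmstat", "r");

            if (!f) {
                    perror("/proc/vmstat");
                    return 1;
            }
            while (fgets(line, sizeof(line), f))
                    if (!strncmp(line, "cma_alloc_", 10))
                            fputs(line, stdout); /* cma_alloc_success / cma_alloc_fail */
            fclose(f);
            return 0;
    }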
From e97da7ed64f52e05db52b50523e05db77970d312 Mon Sep 17 00:00:00 2001
From: Baolin Wang
Date: Wed, 21 Apr 2021 14:44:11 +1000
Subject: [PATCH 16/16] mm: cma: use pr_err_ratelimited for CMA warning
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

If we did not reserve extra CMA memory, the log buffer can easily be
filled up by CMA failure warnings when devices call
dmam_alloc_coherent() to allocate DMA memory. Use pr_err_ratelimited()
instead to reduce the duplicate CMA warnings.

Link: https://lkml.kernel.org/r/ce2251ef49e1727a9a40531d1996660b05462bd2.1615279825.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang
Reviewed-by: David Hildenbrand
Acked-by: Minchan Kim
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
Origin: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=63f83b31f4f36d933e13bd8b9a25d6d9a0cf89dd
Signed-off-by: Łukasz Stelmach
Change-Id: I24e03acb8288fc60e22dddfd409d56b50f3ac12f
---
 mm/cma.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/cma.c b/mm/cma.c
index 9749e95..18046bf 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -486,8 +486,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
 	}
 
 	if (ret && !no_warn) {
-		pr_err("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n",
-		       __func__, cma->name, count, ret);
+		pr_err_ratelimited("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n",
+				   __func__, cma->name, count, ret);
 		cma_debug_show_areas(cma);
 	}
--
2.7.4
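A note on the choice of pr_err_ratelimited(): it keeps a per-call-site ratelimit state which, with the kernel's default settings, passes bursts of up to 10 messages per 5-second interval and then emits a single "callbacks suppressed" summary, so repeated failures stay visible without flooding the ring buffer. A hedged sketch of the same pattern in an arbitrary kernel context; foo_try_alloc() is invented for illustration:

    /* Sketch of the ratelimit pattern adopted above; not code from the tree. */
    #include <linux/printk.h>

    static int foo_try_alloc(void);     /* hypothetical, assumed to keep failing */

    static void foo_alloc_retry_loop(void)
    {
            int i, ret;

            for (i = 0; i < 10000; i++) {
                    ret = foo_try_alloc();
                    if (ret)
                            /* Only ~10 of these per 5s reach the log buffer. */
                            pr_err_ratelimited("foo: alloc failed, ret: %d\n", ret);
            }
    }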