zlogger: map only the actively used buffer to the userspace 54/281154/14
authorMarek Szyprowski <m.szyprowski@samsung.com>
Thu, 15 Sep 2022 11:30:03 +0000 (13:30 +0200)
committerMarek Szyprowski <m.szyprowski@samsung.com>
Wed, 19 Oct 2022 09:17:02 +0000 (11:17 +0200)
Each zlogger client maps only a single 4KB buffer and writes to it. Once
it is filled, the client calls the ALLOC ioctl (like before) and the
kernel remaps a new buffer in place of the old one.

If the kernel needs to free a buffer, the userspace mapping is cleared;
the next access then causes a page fault, which in turn allocates a new
buffer just like the ALLOC ioctl does.

Change-Id: I2d6fe8406e201ef20b6378a7ba37ed5df7790406
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
include/uapi/linux/zlogger.h
kernel/zlogger/zlogger.c

index 1cad482..15784ed 100644 (file)
@@ -27,7 +27,7 @@
 #define ZLOGGER_DEVICE_COUNT (8)
 #define ZLOGGER_MAP_SIZE (4 * ZLOGGER_MB)
 #define ZLOGGER_BUFFER_SIZE (ZLOGGER_DEVICE_COUNT * ZLOGGER_MAP_SIZE)
-#define ZLOGGER_BLOCK_SIZE (2 * ZLOGGER_KB)
+#define ZLOGGER_BLOCK_SIZE (4 * ZLOGGER_KB)
 #define ZLOGGER_BLOCK_MAP_COUNT (ZLOGGER_MAP_SIZE / ZLOGGER_BLOCK_SIZE)
 #define ZLOGGER_BLOCK_COUNT (ZLOGGER_BUFFER_SIZE / ZLOGGER_BLOCK_SIZE)
 #define ZLOGGER_DATA_MAX (ZLOGGER_BLOCK_SIZE - sizeof(struct zlogger_header))
index 02a4fd2..fbc7456 100644 (file)
@@ -43,6 +43,7 @@
 #define NS_PER_SEC (1000000000UL)
 
 #define ZLOGGER_DEVICE_NAME "zlogger"
+#define ZLOGGER_DUMP_DEVICE_NAME "zlogger_dump"
 #define ZLOGGER_SMACK_LABEL "*"
 
 #define BLOCK_RATIO(count) (count*100/ZLOGGER_BLOCK_COUNT)
 
 #define MAX_BUF_LEN 255
 
+#if ZLOGGER_BLOCK_SIZE != PAGE_SIZE
+#error Only ZLOGGER_BLOCK_SIZE == PAGE_SIZE is supported
+#endif
+
 struct queue {
        char name[5];
        uint16_t front;
@@ -65,6 +70,7 @@ struct thread_table_field {
        pid_t tid;
        uint16_t blk;
        bool is_stdout;
+       struct vm_area_struct *vma;
        struct hlist_node next;
 };
 
@@ -101,6 +107,7 @@ struct zlog_file {
 /* --zlogger file channel */
 
 static struct miscdevice zlogger_device;
+static struct miscdevice zlogger_dump_device;
 
 static int g_init;
 static char *g_shm_ptr[ZLOGGER_DEVICE_COUNT];
@@ -123,6 +130,8 @@ static struct completion g_completion;
 static int g_zlog_enable = 1;
 module_param_named(zlog_enable, g_zlog_enable, int, 0644);
 
+static int zlogger_unmap(struct thread_table_field *ptr);
+
 #if (KERNEL_VERSION(3, 17, 0) > LINUX_VERSION_CODE)
 static inline u64 ktime_get_ns(void)
 {
@@ -130,26 +139,26 @@ static inline u64 ktime_get_ns(void)
 }
 #endif
 
-static uint16_t get_thread_table(pid_t tid, bool is_stdout)
+static struct thread_table_field *get_thread_table(pid_t tid, bool is_stdout)
 {
        struct thread_table_field *ptr = NULL;
 
        hash_for_each_possible(g_thread_table->data, ptr, next, tid) {
                if (ptr->tid == tid && ptr->is_stdout == is_stdout)
-                       return ptr->blk;
+                       return ptr;
        }
 
-       return 0;
+       return NULL;
 }
 
-static void set_thread_table(pid_t tid, bool is_stdout, uint16_t blk)
+static struct thread_table_field *set_thread_table(pid_t tid, bool is_stdout, uint16_t blk)
 {
        struct thread_table_field *ptr = NULL;
 
        hash_for_each_possible(g_thread_table->data, ptr, next, tid) {
                if (ptr->tid == tid && ptr->is_stdout == is_stdout) {
                        ptr->blk = blk;
-                       return;
+                       return ptr;
                }
        }
 
@@ -158,6 +167,7 @@ static void set_thread_table(pid_t tid, bool is_stdout, uint16_t blk)
        ptr->is_stdout = is_stdout;
        ptr->blk = blk;
        hash_add(g_thread_table->data, &ptr->next, tid);
+       return ptr;
 }
 
 static inline char *get_shared_memory(int dev_index)
@@ -240,16 +250,16 @@ static int zlog_task(void *user_data)
        int blk;
 
        do {
-
                hash_for_each_safe(g_thread_table->data, tmp_bkt, tmp_iter, ptr, next) {
                        blk = ptr->blk;
                        // TODO: g_start_time should be under some kind of mutex.
                        if (blk && get_block(blk)->head.ts < g_start_time) {
-                               mutex_lock(&g_block_mutex);
                                get_block(blk)->head.tid = 0;
-                               queue_push(&g_free_q, blk);
                                ptr->blk = 0;
                                // TODO: The userspace might very well be using this block right now.
+                               zlogger_unmap(ptr);
+                               mutex_lock(&g_block_mutex);
+                               queue_push(&g_free_q, blk);
                                mutex_unlock(&g_block_mutex);
                        }
                }
@@ -288,29 +298,32 @@ static void run_task(void)
        }
 }
 
-static long alloc_block_for_thread(bool is_stdout)
+static struct thread_table_field *alloc_block_for_thread(bool is_stdout)
 {
+       struct thread_table_field *ptr;
        pid_t pid = current->tgid;
        pid_t tid = current->pid;
        uint16_t blk;
        struct zlogger_block *block;
 
        mutex_lock(&g_block_mutex);
-       blk = get_thread_table(tid, is_stdout);
-       if (blk)
-               queue_push(&g_free_q, blk);
+       ptr = get_thread_table(tid, is_stdout);
+       if (ptr && ptr->blk)
+               queue_push(&g_free_q, ptr->blk);
        blk = queue_pop(&g_free_q);
-       set_thread_table(tid, is_stdout, blk);
-
        if (!blk) {
                if ((g_err_count++ % 10000) < 3)
                        pr_info("[NO MEMORY] tid:%d free:%d err:%d", tid, g_free_q.count, g_err_count);
                mutex_unlock(&g_block_mutex);
-               return -ENOMEM;
+               return NULL;
        }
-
+       ptr = set_thread_table(tid, is_stdout, blk);
        block = get_block(blk);
 
+       /* security: ensure the mmapped block doesn't leak any information */
+       if (!is_stdout)
+               memset(block, 0, ZLOGGER_BLOCK_SIZE);
+
        // TODO: Needs documentation on how the g_start_time value behaves.
        if (g_start_time < block->head.ts)
                g_start_time = block->head.ts;
@@ -321,31 +334,33 @@ static long alloc_block_for_thread(bool is_stdout)
        block->head.ts = g_start_time;
        mutex_unlock(&g_block_mutex);
 
-       return (long)blk;
+       return ptr;
 }
 
 static inline struct zlogger_block *get_valid_block(int tid, size_t len, bool is_stdout)
 {
-       uint16_t blk = 0;
-       long r;
+       struct thread_table_field *ptr;
 
-       blk = get_thread_table(tid, is_stdout);
+       ptr = get_thread_table(tid, is_stdout);
+       if (ptr && ptr->blk) {
+               struct zlogger_block *block = get_block(ptr->blk);
 
-       if (blk != 0) {
-               struct zlogger_block *block = get_block(blk);
-
-               if (!block)
+               if (!block) {
+                       printk("%s %d no block %d allocated\n", __func__, __LINE__, ptr->blk);
                        return NULL;
+               }
 
                if (block->head.offset + len < ZLOGGER_DATA_MAX)
                        return block;
        }
 
-       r = alloc_block_for_thread(is_stdout);
-       if (r <= 0)
+       ptr = alloc_block_for_thread(is_stdout);
+       if (!ptr) {
+               printk("%s %d no block allocated\n", __func__, __LINE__);
                return NULL;
+       }
 
-       return get_block((uint16_t)r);
+       return get_block(ptr->blk);
 }
 
 static int zlogger_open(struct inode *inode, struct file *file)
@@ -373,27 +388,181 @@ static int zlogger_release(struct inode *ignored, struct file *file)
        return 0;
 }
 
-static int zlogger_mmap(struct file *filep, struct vm_area_struct *vma)
+/* called under mmap semaphore */
+static void zlogger_vm_open(struct vm_area_struct *vma)
 {
-       const int PAGES_PER_MAP = ZLOGGER_MAP_SIZE / PAGE_SIZE;
-       int dev_index = (int)vma->vm_pgoff / PAGES_PER_MAP;
-       unsigned long offset = vma->vm_pgoff % PAGES_PER_MAP;
+       /*
+        * Force a page fault on the next access to the given buffer;
+        * the fault handler will then allocate a new block.
+        */
+       zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+
+       /*
+        * This is called on the newly copied vma, so clear the stale
+        * private data to ensure that the copy will always fault.
+        */
+       vma->vm_private_data = NULL;
+}
+
+/* called under mmap semaphore */
+static void zlogger_vm_close(struct vm_area_struct *vma)
+{
+       struct thread_table_field *ptr = vma->vm_private_data;
+
+       if (ptr)
+               ptr->vma = NULL;
+}
+
+/* called under mmap semaphore */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+static vm_fault_t zlogger_fault(struct vm_fault *vmf)
+{
+       struct vm_area_struct *vma = vmf->vma;
+#else
+static int zlogger_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+#endif
+       struct thread_table_field *ptr;
+       void *p = NULL;
+       struct page *page;
+
+       ptr = alloc_block_for_thread(false);
+       if (!ptr)
+               return VM_FAULT_SIGSEGV;
+
+       ptr->vma = vma;
+       vma->vm_private_data = ptr;
+
+       p = get_block(ptr->blk);
+       if (!p)
+               return VM_FAULT_SIGSEGV;
+
+       page = virt_to_page((unsigned long)p);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+       return vmf_insert_pfn(vma, vma->vm_start, page_to_pfn(page));
+#else
+       return vm_insert_pfn(vma, vma->vm_start, page_to_pfn(page));
+#endif
+}
+
+static const struct vm_operations_struct zlogger_vm_ops = {
+       .fault = zlogger_fault,
+       .open = zlogger_vm_open,
+       .close = zlogger_vm_close,
+};
+
+static int zlogger_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+       struct thread_table_field *ptr;
        unsigned long size = vma->vm_end - vma->vm_start;
-       char *p;
        struct page *page;
+       void *p;
 
-       if (dev_index > ZLOGGER_DEVICE_COUNT || offset != 0 || size > ZLOGGER_MAP_SIZE) {
-               pr_err("mmap failed: dev(%d) offset(%lu), size(%lu), pgoff(%lu)\n", dev_index, offset, size, vma->vm_pgoff);
+       if (vma->vm_pgoff != 0 || size != ZLOGGER_BLOCK_SIZE)
                return -EINVAL;
-       }
 
-       p = get_shared_memory(dev_index);
-       if (p)
-               page = virt_to_page((unsigned long)p);
-       else
+       vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY;
+       vma->vm_private_data = filp;
+       vma->vm_ops = &zlogger_vm_ops;
+
+       ptr = get_thread_table(current->pid, false);
+       if (ptr && ptr->vma)
                return -EINVAL;
 
-       return remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), size, vma->vm_page_prot);
+       ptr = alloc_block_for_thread(false);
+       if (!ptr)
+               return -ENOMEM;
+
+       ptr->vma = vma;
+       vma->vm_private_data = ptr;
+
+       p = get_block(ptr->blk);
+       if (!p)
+               return -ENOMEM;
+
+       page = virt_to_page((unsigned long)p);
+
+       return remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), ZLOGGER_BLOCK_SIZE, vma->vm_page_prot);
+}
+
+static int zlogger_unmap(struct thread_table_field *ptr)
+{
+       struct vm_area_struct *vma = ptr->vma;
+
+       if (!ptr->vma)
+               return 0;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
+       if (mmap_write_lock_killable(vma->vm_mm))
+               return -EINTR;
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0)
+       if (down_write_killable(&vma->vm_mm->mmap_sem))
+               return -EINTR;
+#else
+       down_write(&vma->vm_mm->mmap_sem);
+#endif
+
+       zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+       vma->vm_private_data = NULL;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
+       mmap_write_unlock(vma->vm_mm);
+#else
+       up_write(&vma->vm_mm->mmap_sem);
+#endif
+       return 0;
+}
+
+static int zlogger_realloc_mmap(struct file *filp)
+{
+       struct thread_table_field *ptr;
+       struct vm_area_struct *vma;
+       pid_t tid = current->pid;
+       struct page *page;
+       void *p;
+       int ret;
+
+       ptr = get_thread_table(tid, false);
+       if (!ptr || !ptr->vma)
+               return -EINVAL;
+
+       vma = ptr->vma;
+       ptr->vma = NULL;
+
+       ptr = alloc_block_for_thread(false);
+       if (!ptr)
+               return -ENOMEM;
+
+       ptr->vma = vma;
+       vma->vm_private_data = ptr;
+
+       p = get_block(ptr->blk);
+       if (!p)
+               return -ENOMEM;
+
+       page = virt_to_page((unsigned long)p);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
+       if (mmap_write_lock_killable(current->mm))
+               return -EINTR;
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0)
+       if (down_write_killable(&current->mm->mmap_sem))
+               return -EINTR;
+#else
+       down_write(&current->mm->mmap_sem);
+#endif
+
+       zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+       ret = remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), ZLOGGER_BLOCK_SIZE, vma->vm_page_prot);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
+       mmap_write_unlock(current->mm);
+#else
+       up_write(&current->mm->mmap_sem);
+#endif
+
+       return ret;
 }
 
 static ssize_t zlogger_read(struct file *filep, char __user *buffer, size_t len, loff_t *offset)
@@ -636,9 +805,12 @@ static long zlogger_clear(void)
        }
 
        hash_for_each(g_thread_table->data, i, ptr, next) {
-               if (ptr->blk != 0) {
-                       queue_push(&g_free_q, ptr->blk);
+               int blk = ptr->blk;
+
+               if (blk != 0) {
                        ptr->blk = 0;
+                       zlogger_unmap(ptr);
+                       queue_push(&g_free_q, blk);
                }
        }
 
@@ -650,20 +822,14 @@ static long zlogger_ioctl(struct file *filp, unsigned int cmd, unsigned long arg
 {
        switch (cmd) {
                case ZLOGGER_IOCTL_COMMAND_ALLOC:
-                       return alloc_block_for_thread(false);
-               break;
-
-               case ZLOGGER_IOCTL_COMMAND_CLEAR:
-                       return zlogger_clear();
-               break;
+                       return zlogger_realloc_mmap(filp);
 
                case ZLOGGER_IOCTL_COMMAND_SET_DEFAULT_PRIORITY:
                        return zlogger_update_prio(filp, arg);
-               break;
 
                case ZLOGGER_IOCTL_COMMAND_SET_DEFAULT_TAG:
-               return zlogger_set_default_tag(filp, arg);
-               break;
+                       return zlogger_set_default_tag(filp, arg);
+
                default:
                        return -EINVAL;
        }
@@ -692,6 +858,51 @@ static const struct file_operations zlogger_fops = {
        .owner = THIS_MODULE,
 };
 
+static int zlogger_dump_mmap(struct file *filep, struct vm_area_struct *vma)
+{
+       const int PAGES_PER_MAP = ZLOGGER_MAP_SIZE / PAGE_SIZE;
+       int dev_index = (int)vma->vm_pgoff / PAGES_PER_MAP;
+       unsigned long offset = vma->vm_pgoff % PAGES_PER_MAP;
+       unsigned long size = vma->vm_end - vma->vm_start;
+       char *p;
+       struct page *page;
+
+       if (dev_index > ZLOGGER_DEVICE_COUNT || offset != 0 || size > ZLOGGER_MAP_SIZE) {
+               pr_err("mmap failed: dev(%d) offset(%lu), size(%lu), pgoff(%lu)\n", dev_index, offset, size, vma->vm_pgoff);
+               return -EINVAL;
+       }
+
+       p = get_shared_memory(dev_index);
+       if (p)
+               page = virt_to_page((unsigned long)p);
+       else
+               return -EINVAL;
+
+       return remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), size, vma->vm_page_prot);
+}
+
+static long zlogger_dump_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+       switch (cmd) {
+               case ZLOGGER_IOCTL_COMMAND_CLEAR:
+                       return zlogger_clear();
+
+               default:
+                       return -EINVAL;
+       }
+}
+
+static const struct file_operations zlogger_dump_fops = {
+       .open = nonseekable_open,
+       .mmap = zlogger_dump_mmap,
+       .unlocked_ioctl = zlogger_dump_ioctl,
+#ifdef CONFIG_COMPAT
+       /* arg is unused so far, so we can call ioctl directly */
+       .compat_ioctl = zlogger_dump_ioctl,
+#endif
+       .owner = THIS_MODULE,
+};
+
 static int zlogger_init(void)
 {
        int i = 0;
@@ -737,12 +948,26 @@ static int zlogger_init(void)
 #ifdef CONFIG_SECURITY_SMACK_SET_DEV_SMK_LABEL
        zlogger_device.lab_smk64 = ZLOGGER_SMACK_LABEL;
 #endif
+
        r = misc_register(&zlogger_device);
        if (unlikely(r)) {
                pr_err("Failed to register misc device for '%s' (%d)\n", ZLOGGER_DEVICE_NAME, r);
                goto out_free_zlog_task;
        }
 
+       zlogger_dump_device.minor = MISC_DYNAMIC_MINOR;
+       zlogger_dump_device.name = ZLOGGER_DUMP_DEVICE_NAME;
+       zlogger_dump_device.fops = &zlogger_dump_fops;
+       zlogger_dump_device.mode = 0444;
+#ifdef CONFIG_SECURITY_SMACK_SET_DEV_SMK_LABEL
+       zlogger_dump_device.lab_smk64 = ZLOGGER_SMACK_LABEL;
+#endif
+       r = misc_register(&zlogger_dump_device);
+       if (unlikely(r)) {
+               pr_err("Failed to register misc device for '%s' (%d)\n", ZLOGGER_DUMP_DEVICE_NAME, r);
+               goto out_free_zlogger_device;
+       }
+
        g_init = 1;
        pr_info("Init success\n");
 
@@ -775,6 +1000,7 @@ static void zlogger_exit(void)
        struct hlist_node *tmp_iter = NULL;
        int tmp_bkt;
 
+       misc_deregister(&zlogger_dump_device);
        misc_deregister(&zlogger_device);
 
        // TODO: What about the task that is running in the background?
@@ -799,5 +1025,8 @@ module_init(zlogger_init);
 module_exit(zlogger_exit);
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("jh1009.sung <jh1009.sung@samsung.com>, Arkadiusz Nowak <a.nowak3@samsung.com>, Mateusz Majewski <m.majewski2@samsung.com");
+MODULE_AUTHOR("jh1009.sung <jh1009.sung@samsung.com>");
+MODULE_AUTHOR("Arkadiusz Nowak <a.nowak3@samsung.com>");
+MODULE_AUTHOR("Mateusz Majewski <m.majewski2@samsung.com>");
+MODULE_AUTHOR("Marek Szyprowski <m.szyprowski@samsung.com>");
 MODULE_DESCRIPTION("Tizen zero copy logger");