--- /dev/null
+/*
+ * resourced:compaction
+ *
+ * Copyright (c) 2015 Samsung Electronics Co., Ltd. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <glib.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/types.h>
+
+#include <sys/syscall.h>
+
+#include "config-parser.h"
+#include "module.h"
+#include "macro.h"
+#include "memory-common.h"
+#include "notifier.h"
+#include "procfs.h"
+#include "resourced.h"
+#include "trace.h"
+#include "util.h"
+
+/**
+ * State bits for a zone's fragmentation warning.
+ * The ZONE_FRAG_WARN_RAISE bit is set for a zone
+ * when the fragmentation level reaches the specified
+ * value for at least one of the supported page orders.
+ */
+#define ZONE_FRAG_WARN_NONE (0)
+#define ZONE_FRAG_WARN_RAISE (0x1 << 0)
+
+/* Internal compaction module states */
+#define COMPACT_IDLE (0)
+/* The NOTIFIED state is used to eliminate spurious thread wake-ups */
+#define COMPACT_NOTIFIED (1 << 0)
+/* Failed to write to procfs entry */
+#define COMPACT_FAILURE (1 << 1)
+/**
+ * Failed to perform one of the basic operations,
+ * such as reading /proc/zoneinfo or /proc/buddyinfo.
+ * Set to indicate that there is no point in continuing
+ * and that the compaction thread can safely clean
+ * things up and exit.
+ */
+#define COMPACT_WITHDRAW (1 << 2)
+/**
+ * Set when the compaction module has been explicitly
+ * requested to quit.
+ */
+#define COMPACT_CANCEL (1 << 3)
+#define COMPACT_SKIP (COMPACT_WITHDRAW | COMPACT_CANCEL)
+
+#define MAX_PAGE_ORDER 0xa
+#define ZONE_MAX_NR 4
+#define HIGH_ORDER_SHIFT 0x5
+#define LOW_ORDER_MASK 0xfe
+
+#define PROC_COMPACT_ENTRY "/proc/sys/vm/compact_memory"
+#define MEM_CONF_FILE RD_CONFIG_FILE(memory)
+
+enum {
+ PARSE_TAG_ZONE = 1,
+ PARSE_TAG_PAGE_COUNT,
+ PARSE_TAG_WM_MIN,
+ PARSE_TAG_WM_LOW,
+ PARSE_TAG_WM_HIGH,
+ PARSE_TAG_MANAGED,
+ PARSE_TAG_MAX,
+};
+
+#define COMPACT_CONFIG_SECTION "Compaction"
+#define COMPACT_CONFIG_ENABLE "CompactEnable"
+#define COMPACT_CONFIG_FRAG "Fraglevel"
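+
+/*
+ * For illustration, the matching section of the memory config file
+ * is expected to look roughly like this (example values, not taken
+ * from any particular target):
+ *
+ *   [Compaction]
+ *   CompactEnable=1
+ *   Fraglevel=800
+ */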
+
+/**
+ * Default fragmentation level (per-mille, evaluated per page order)
+ * which determines when to trigger compaction.
+ */
+#define COMPACT_DEF_FRAG_LEVEL 800 /* 80% */
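+/*
+ * Example with the default level of 800: a zone counts as fragmented
+ * at order k when
+ *   1000 - (available_pages * 1000 / free_pages) >= 800
+ * (see compact_validate_zone()), i.e. when 20% or less of its free
+ * memory sits in blocks of order k or higher.
+ */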
+
+/*
+ * Note: the order of these names is tightly coupled with the
+ * corresponding zone ids (see get_zone_id()).
+ * Keep them in sync when modifying.
+ */
+static const char *zone_names[] = {"Normal", "DMA", "HighMem", "DMA32"};
+
+struct parser_data {
+ struct memory_info *mem_info;
+ struct zone_info *zone;
+};
+
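+/*
+ * Per-zone snapshot built from /proc/zoneinfo and /proc/buddyinfo:
+ * watermarks, managed/free page counts, free pages per order and a
+ * bitmap (frag_map) of the orders currently considered fragmented.
+ */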
+struct zone_info {
+ unsigned int id;
+ unsigned long pages_per_order[MAX_PAGE_ORDER + 1];
+ unsigned long free_pages;
+ unsigned long wm_min;
+ unsigned long wm_low;
+ unsigned long wm_high;
+ unsigned long managed;
+ unsigned int frag_map:MAX_PAGE_ORDER+1;
+ unsigned int frag_warn:2;
+};
+
+struct memory_info {
+ unsigned int zone_count;
+ struct zone_info zones[ZONE_MAX_NR];
+};
+
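+/*
+ * State of the compaction worker: the tracked memory info, the
+ * tracer thread, its status flags and the configured fragmentation
+ * threshold (per-mille).
+ */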
+struct compact_control {
+ struct memory_info *mem_info;
+ pthread_t compact_thread;
+ pthread_mutex_t lock;
+ unsigned int frag_level;
+ unsigned int status;
+ unsigned int compact_type;
+};
+
+#define PARSE_TAG(exp, fn, id) \
+ { \
+ .re_exp = exp, \
+ .callback = fn, \
+ .tag = PARSE_TAG_##id, \
+ }
+
+#define PARSE_TAG_EMPTY() {0,}
+
+/*
+ * @TODO: This should be attached to module ops
+ */
+struct compact_data {
+ struct compact_control *compact;
+ pthread_mutex_t notify_lock;
+ pthread_cond_t notify;
+ pthread_mutex_t drained_lock;
+ pthread_cond_t drained;
+};
+
+static struct compact_data compact_data = {
+ .notify_lock = PTHREAD_MUTEX_INITIALIZER,
+ .notify = PTHREAD_COND_INITIALIZER,
+ .drained_lock = PTHREAD_MUTEX_INITIALIZER,
+ .drained = PTHREAD_COND_INITIALIZER
+};
+
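+/*
+ * Map a zone name to its id bit (1 << index into zone_names) and
+ * back; get_zone_id() returns 0 for an unknown zone name.
+ */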
+static inline unsigned int get_zone_id(const char *zone_name, size_t len)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(zone_names); ++i) {
+ if (!strncmp(zone_name, zone_names[i], len))
+ return 1 << i;
+ }
+ return 0;
+}
+
+static inline const char *get_zone_name(unsigned int zone_id)
+{
+ unsigned int i = ffs(zone_id) - 1;
+
+ return (i < ARRAY_SIZE(zone_names)) ? zone_names[i] : NULL;
+}
+
+/*
+ * External fragmentation is an issue, but one that mostly the kernel
+ * should be concerned about, not user space.
+ * Still, fragmented *physical* memory may (but does not
+ * have to) lead to the system getting less responsive - as the kernel
+ * might get trapped waiting for allocation of high-order
+ * physically-contiguous pages. The fragmentation issue gets more
+ * significant in the case of huge pages - though that is left aside
+ * here as it is not relevant in this particular case.
+ *
+ * Triggering compaction from user space is a rather
+ * nasty hack. But if this is to be done, then...
+ * 1. There is not much point in triggering compaction if
+ * the system is already under heavy memory pressure
+ * and is struggling to satisfy 0-order allocations
+ * 2. Specifying the overall fragmentation level is quite tricky,
+ * and without some decent background on what is/has been going
+ * on as far as memory allocations are concerned (both from
+ * user space and from the kernel) it is not really reliable, to say
+ * the least. All in all, what's the acceptable (whatever that means)
+ * level of external fragmentation? Having most of the available
+ * memory within the low-order page blocks should raise an alert,
+ * but that's only in theory. Things get more complicated when taking
+ * into consideration the migration types of the available pages.
+ * This might go wrong in so many ways...
+ * Shall this be continued?
+ */
+static void compaction_start(struct compact_control *compact)
+{
+ struct memory_info *mem_info;
+ int current_status = COMPACT_IDLE;
+ _cleanup_close_ int fd = -1;
+ int n = 1;
+
+ pthread_mutex_lock(&compact->lock);
+
+ if (compact->status & COMPACT_SKIP)
+ current_status |= COMPACT_WITHDRAW;
+
+ pthread_mutex_unlock(&compact->lock);
+
+ if (current_status & COMPACT_WITHDRAW)
+ return;
+
+ mem_info = compact->mem_info;
+
+ fd = open(PROC_COMPACT_ENTRY, O_WRONLY);
+ if (fd < 0) {
+ if (errno == EACCES || errno == EFAULT || errno == ENOENT)
+ current_status |= COMPACT_WITHDRAW;
+ _E("Compaction: failed to open procfs entry [%d]\n", errno);
+ goto leave;
+ }
+ /*
+ * It doesn't really matter what gets written,
+ * as long as something gets written.
+ */
+ if (write(fd, &n, sizeof(n)) <= 0)
+ current_status |= COMPACT_FAILURE;
+ /*
+ * Reset the external fragmentation warnings.
+ * Locking is not required here, as all updates stay suspended
+ * until the compaction status indicates that this pass is done.
+ */
+ if (current_status & COMPACT_FAILURE)
+ goto leave;
+
+ for (n = 0; n < mem_info->zone_count; ++n)
+ mem_info->zones[n].frag_warn &= ~ZONE_FRAG_WARN_RAISE;
+leave:
+
+ pthread_mutex_lock(&compact->lock);
+ compact->status |= current_status;
+ pthread_mutex_unlock(&compact->lock);
+}
+
+static void compact_validate_zone(struct zone_info *zone,
+ unsigned int frag_level,
+ struct memory_info *mem_info)
+{
+ int order, req_order;
+ unsigned int current_frag_map = 0;
+ /*
+ * Skip validation if the zone is below the low watermark -
+ * the kernel is going to deal with it either way.
+ */
+ if (zone->free_pages < zone->wm_low) {
+ _I("Skipping validation due to falling below the low watermark\n");
+ _I("Zone %s: number of free pages: %lu low watermark: %lu\n",
+ get_zone_name(zone->id),
+ zone->free_pages, zone->wm_low);
+ return;
+ }
+
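+ /*
+ * available_pages: free pages sitting in blocks of order
+ * req_order or higher, i.e. memory from which a req_order
+ * allocation could be satisfied.
+ */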
+ for (req_order = 1; req_order <= MAX_PAGE_ORDER; ++req_order) {
+ unsigned long available_pages = 0;
+
+ for (order = req_order; order <= MAX_PAGE_ORDER; ++order)
+ available_pages += zone->pages_per_order[order] << order;
+
+ if ((1000 - (available_pages * 1000 / zone->free_pages)) >= frag_level)
+ current_frag_map |= 1 << req_order;
+ }
+
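+ /*
+ * Raise a warning when fragmentation shows up for the first
+ * time or spreads to additional page orders while none of the
+ * previously fragmented orders has cleared.
+ */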
+ if (current_frag_map) {
+ if ((!zone->frag_map && current_frag_map) ||
+ ((zone->frag_map ^ current_frag_map) &&
+ !((zone->frag_map ^ current_frag_map) & zone->frag_map)))
+ zone->frag_warn |= ZONE_FRAG_WARN_RAISE;
+ }
+
+ zone->frag_map = current_frag_map;
+}
+
+static void compaction_verify_zone(struct compact_control *compact,
+ struct zone_info *zone)
+{
+ struct memory_info *mem_info = compact->mem_info;
+
+ /*
+ * Here comes the shady part:
+ * without some decent memory tracing it is
+ * truly difficult to determine whether
+ * compaction is required or not.
+ */
+ compact_validate_zone(zone, compact->frag_level, mem_info);
+}
+
+static void compaction_verify(struct compact_control *compact)
+{
+ /* Get the overall idea of current external fragmentation */
+ struct memory_info *mem_info = compact->mem_info;
+ unsigned int compact_targets = 0;
+ int n;
+
+ /*
+ * Verify each zone, although compaction can only be
+ * triggered per node (or globally).
+ */
+ for (n = 0; n < mem_info->zone_count; ++n) {
+ struct zone_info *zone = &mem_info->zones[n];
+
+ compaction_verify_zone(compact, zone);
+ if (zone->frag_warn & ZONE_FRAG_WARN_RAISE) {
+ /*
+ * As compaction can be triggered either globally
+ * or per node, it's enough to have at least one
+ * zone for which the external fragmentation got
+ * dangerously high. Still, to keep a minimum of control
+ * over the whole process, validate all zones.
+ */
+ ++compact_targets;
+ }
+ }
+
+ if (compact_targets)
+ compaction_start(compact);
+}
+
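+/*
+ * Store value v into the zone_info structure at the given byte
+ * offset, keeping the type of v (used by the zoneinfo callbacks).
+ */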
+#define compact_zoneinfo_set(zone, _off, v) \
+ (*(typeof(v)*)(((char*)(zone)) + _off) = v)
+
+
+static int compact_parse_zone(const char *s, regmatch_t *match,
+ unsigned int parse_tag, void *data)
+{
+ struct parser_data *parser_data = (struct parser_data *)data;
+ unsigned int zone_id;
+ struct zone_info *zone;
+
+ if (parse_tag != PARSE_TAG_ZONE)
+ return -EINVAL;
+
+ zone_id = get_zone_id(s + match[1].rm_so,
+ match[1].rm_eo - match[1].rm_so);
+ zone = parser_data->mem_info->zones;
+
+ if (!zone_id)
+ return -EINVAL;
+
+ while (zone->id && zone->id != zone_id)
+ ++zone;
+
+ if (!zone->id) {
+ ++parser_data->mem_info->zone_count;
+ zone->id = zone_id;
+ }
+
+ parser_data->zone = zone;
+ return 0;
+}
+
+static int compact_parse_zoneinfo(const char *s, regmatch_t *match,
+ unsigned int parse_tag,
+ void *data)
+{
+ struct parser_data *parser_data = (struct parser_data *)data;
+ char *e;
+ unsigned long v;
+
+ v = strtoul(s + match[1].rm_so, &e, 0);
+ if (e == s + match[1].rm_so)
+ return -EINVAL;
+
+ switch (parse_tag) {
+ case PARSE_TAG_WM_MIN:
+ compact_zoneinfo_set(parser_data->zone,
+ offsetof(struct zone_info, wm_min), v);
+ break;
+ case PARSE_TAG_WM_LOW:
+ compact_zoneinfo_set(parser_data->zone,
+ offsetof(struct zone_info, wm_low), v);
+
+ break;
+ case PARSE_TAG_WM_HIGH:
+ compact_zoneinfo_set(parser_data->zone,
+ offsetof(struct zone_info, wm_high), v);
+ break;
+ case PARSE_TAG_MANAGED:
+ compact_zoneinfo_set(parser_data->zone,
+ offsetof(struct zone_info, managed), v);
+ break;
+ }
+ return 0;
+}
+
+static int compact_parse_pages(const char *s, regmatch_t *match,
+ unsigned int parse_tag, void *data)
+{
+ struct parser_data *parser_data = (struct parser_data *)data;
+ char *e;
+ unsigned long v, page_count = 0;
+ int order;
+
+ if (parse_tag != PARSE_TAG_PAGE_COUNT)
+ return -EINVAL;
+
+ for (order = 0; order < MAX_PAGE_ORDER; ++order) {
+
+ v = strtoul(s, &e, 0);
+ if (s == e)
+ return -EINVAL;
+ parser_data->zone->pages_per_order[order] = v;
+ page_count += v << order;
+ s = e;
+ }
+
+ if (parser_data->zone->free_pages != page_count) {
+ /*
+ * The drop in the number of available pages is handled
+ * on a per-order basis, though this might be a good point
+ * to validate the zone's watermarks.
+ */
+ parser_data->zone->free_pages = page_count;
+ }
+ return 0;
+}
+
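+/* Refresh the per-order free page counts from /proc/buddyinfo. */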
+static int compact_get_buddyinfo(struct compact_control *compact)
+{
+ const struct parse_arg args[] = {
+ PARSE_TAG("zone[[:blank:]]+(Normal|DMA|DMA32|HighMem)",
+ compact_parse_zone, ZONE),
+ PARSE_TAG("([[:blank:]]+([0-9]+))+",
+ compact_parse_pages, PAGE_COUNT),
+ PARSE_TAG_EMPTY(),
+ };
+
+ struct parser_data parser_data = {
+ .mem_info = compact->mem_info,
+ .zone = &compact->mem_info->zones[0],
+ };
+
+ return proc_parse_buddyinfo(args, &parser_data);
+}
+
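+/* Read the zone watermarks and managed page counts from /proc/zoneinfo. */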
+static int compact_get_zoneinfo(struct compact_control *compact)
+{
+ const struct parse_arg args[] = {
+ PARSE_TAG("zone[[:blank:]]+(Normal|DMA|DMA32|HighMem)",
+ compact_parse_zone, ZONE),
+ PARSE_TAG("min[[:blank:]]+([0-9]+)\n",
+ compact_parse_zoneinfo, WM_MIN),
+ PARSE_TAG("low[[:blank:]]+([0-9]+)\n",
+ compact_parse_zoneinfo, WM_LOW),
+ PARSE_TAG("high[[:blank:]]+([0-9]+)\n",
+ compact_parse_zoneinfo, WM_HIGH),
+ PARSE_TAG("managed[[:blank:]]+([0-9]+)\n",
+ compact_parse_zoneinfo, MANAGED),
+ PARSE_TAG_EMPTY(),
+ };
+
+ struct parser_data parser_data = {
+ .mem_info = compact->mem_info,
+ .zone = &compact->mem_info->zones[0],
+ };
+ return proc_parse_zoneinfo(args, &parser_data);
+}
+
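+/*
+ * Main tracking loop: on every genuine (non-spurious) wake-up refresh
+ * the per-order page counts and decide whether compaction should be
+ * started; leave once withdrawal or cancellation has been requested.
+ */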
+static void compact_track_frag_level(struct compact_control *compact)
+{
+ int woken = 1;
+
+ do {
+ /* Eliminate updates on spurious wake-ups */
+ if (woken) {
+ compact_get_buddyinfo(compact);
+ compaction_verify(compact);
+ }
+
+ pthread_mutex_lock(&compact_data.notify_lock);
+ pthread_cond_wait(&compact_data.notify,
+ &compact_data.notify_lock);
+ pthread_mutex_unlock(&compact_data.notify_lock);
+
+ pthread_mutex_lock(&compact->lock);
+ woken = compact->status & COMPACT_NOTIFIED ? 1 : 0;
+ compact->status &= ~COMPACT_NOTIFIED;
+ pthread_mutex_unlock(&compact->lock);
+
+ } while (!(compact->status & COMPACT_SKIP));
+
+}
+
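+/*
+ * Notifier callback for RESOURCED_NOTIFIER_MEM_STATE_CHANGED: on a
+ * valid memory-pressure level change wake up the tracer thread so it
+ * can re-evaluate the fragmentation state.
+ */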
+static int compact_mem_state_changed(void *data)
+{
+ struct compact_control *compact;
+ struct memory_info *mem_info;
+ int result = RESOURCED_ERROR_NONE;
+
+ pthread_mutex_lock(&compact_data.drained_lock);
+ compact = compact_data.compact;
+ mem_info = compact ? compact->mem_info : NULL;
+ if (mem_info) {
+ int new_state = *((int *)data);
+
+ if (new_state < LOWMEM_NORMAL || new_state >= LOWMEM_MAX_LEVEL) {
+ result = RESOURCED_ERROR_FAIL;
+ goto leave;
+ }
+
+ pthread_mutex_lock(&compact_data.compact->lock);
+ if (!(compact->status & COMPACT_SKIP)) {
+ compact->status |= COMPACT_NOTIFIED;
+ pthread_cond_signal(&compact_data.notify);
+ }
+ pthread_mutex_unlock(&compact_data.compact->lock);
+ }
+leave:
+ pthread_mutex_unlock(&compact_data.drained_lock);
+ return result;
+}
+
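+/* Tear down the compaction state; called from the tracer thread on exit. */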
+static void compact_cleanup(struct compact_control *compact)
+{
+ struct memory_info *mem_info = compact->mem_info;
+
+ if (!(compact->status & COMPACT_SKIP))
+ _E("Invalid compact thread state [%d]\n", compact->status);
+
+ unregister_notifier(RESOURCED_NOTIFIER_MEM_STATE_CHANGED,
+ compact_mem_state_changed);
+
+ (void) pthread_mutex_destroy(&compact->lock);
+
+ free(mem_info);
+ free(compact);
+}
+
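+/*
+ * Compaction tracer thread: read the zone watermarks once, then keep
+ * re-evaluating fragmentation on every notification until asked to
+ * withdraw or cancel; clean up and signal the drained condition on exit.
+ */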
+static void *compact_tracer(void *arg)
+{
+ struct compact_data *cdata = (struct compact_data *)arg;
+ struct compact_control *compact = cdata->compact;
+
+ if (compact_get_zoneinfo(compact) == RESOURCED_ERROR_NONE)
+ compact_track_frag_level(compact);
+
+ /* Tracking has ended or was withdrawn - so clean up */
+ pthread_mutex_lock(&compact->lock);
+ compact->status |= COMPACT_WITHDRAW;
+ pthread_mutex_unlock(&compact->lock);
+
+ pthread_mutex_lock(&cdata->drained_lock);
+ compact_cleanup(compact);
+ cdata->compact = NULL;
+ pthread_mutex_unlock(&cdata->drained_lock);
+
+ pthread_cond_signal(&cdata->drained);
+
+ pthread_exit(NULL);
+}
+
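+/*
+ * Parse the [Compaction] section of the memory config file (see the
+ * example above): CompactEnable toggles the module, Fraglevel
+ * overrides the default fragmentation threshold.
+ */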
+static int compact_config_parse(struct parse_result *result, void *user_data)
+{
+ struct compact_control *compact = (struct compact_control *)user_data;
+ unsigned long v;
+ char *e = NULL;
+
+ /* Single config section is expected */
+ if (!result->section || strcmp(COMPACT_CONFIG_SECTION, result->section))
+ return RESOURCED_ERROR_NONE;
+
+ if (!result->name || !result->value)
+ return RESOURCED_ERROR_NONE;
+
+ if (!strcmp(COMPACT_CONFIG_ENABLE, result->name)) {
+
+ v = strtol(result->value, &e, 10);
+
+ if (e == result->value || *e != '\0')
+ return RESOURCED_ERROR_FAIL;
+
+ /**
+ * At init time the compact data has not been published
+ * yet, so no locking of the status is strictly needed;
+ * the lock is taken anyway for consistency.
+ */
+ if (!v) {
+ (void) pthread_mutex_lock(&compact->lock);
+ compact->status |= COMPACT_SKIP;
+ (void) pthread_mutex_unlock(&compact->lock);
+ }
+
+ } else if (!strcmp(COMPACT_CONFIG_FRAG, result->name)) {
+
+ v = strtol(result->value, &e, 0);
+
+ if (e == result->value || *e != '\0')
+ return RESOURCED_ERROR_FAIL;
+ compact->frag_level = v;
+ }
+
+ return RESOURCED_ERROR_NONE;
+}
+
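+/*
+ * Module init: allocate the compaction state, load the configuration,
+ * spawn the detached tracer thread and register for memory-pressure
+ * notifications.
+ */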
+static int compact_init(void *data)
+{
+ struct memory_info *mem_info;
+ struct compact_control *compact;
+ int result = RESOURCED_ERROR_OUT_OF_MEMORY;
+
+ pthread_mutex_lock(&compact_data.drained_lock);
+ if (compact_data.compact) {
+ _E("Unbalanced calls to compact module load/unload\n");
+ result = RESOURCED_ERROR_NONE;
+ goto leave;
+ }
+
+ compact = calloc(1, sizeof(*compact));
+ if (!compact)
+ goto leave;
+
+ mem_info = calloc(1, sizeof(*mem_info));
+ if (!mem_info)
+ goto cleanup;
+
+ compact->mem_info = mem_info;
+ compact->frag_level = COMPACT_DEF_FRAG_LEVEL;
+
+ result = pthread_mutex_init(&compact->lock, NULL);
+ if (result) {
+ _E("Failed to init compact lock: %m");
+ goto cleanup_all;
+ }
+
+ /* Load configuration */
+ config_parse(MEM_CONF_FILE, compact_config_parse,
+ compact);
+
+ if (compact->status & COMPACT_SKIP) {
+ _I("Compaction module disabled.");
+ result = RESOURCED_ERROR_FAIL;
+ goto cleanup_all;
+ }
+
+ compact_data.compact = compact;
+
+ result = pthread_create(&compact->compact_thread, NULL,
+ compact_tracer, (void*)&compact_data);
+ if (result) {
+ compact_data.compact = NULL;
+ goto cleanup_all;
+ }
+
+ pthread_detach(compact->compact_thread);
+ pthread_mutex_unlock(&compact_data.drained_lock);
+
+ register_notifier(RESOURCED_NOTIFIER_MEM_STATE_CHANGED,
+ compact_mem_state_changed);
+ return RESOURCED_ERROR_NONE;
+
+cleanup_all:
+ free(mem_info);
+cleanup:
+ free(compact);
+leave:
+ pthread_mutex_unlock(&compact_data.drained_lock);
+ return result;
+}
+
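+/*
+ * Module exit: request cancellation, wake the tracer thread and wait
+ * until it has drained (cleaned everything up) before returning.
+ */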
+static int compact_exit(void *data)
+{
+ struct compact_control *compact;
+
+ pthread_mutex_lock(&compact_data.drained_lock);
+ compact = compact_data.compact;
+ compact_data.compact = NULL;
+
+ if (!compact)
+ goto leave;
+
+ pthread_mutex_lock(&compact->lock);
+ compact->status |= COMPACT_CANCEL;
+ pthread_mutex_unlock(&compact->lock);
+
+ pthread_cond_signal(&compact_data.notify);
+ pthread_cond_wait(&compact_data.drained, &compact_data.drained_lock);
+leave:
+ pthread_mutex_unlock(&compact_data.drained_lock);
+ return 0;
+}
+
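+/*
+ * Compaction is supported only when /proc/sys/vm/compact_memory
+ * can be opened for writing.
+ */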
+static int compact_runtime_support(void *data)
+{
+ _cleanup_close_ int fd = -1;
+
+ fd = open(PROC_COMPACT_ENTRY, O_WRONLY);
+ if (fd < 0) {
+ _E("Unable to open compaction procfs entry\n");
+ return RESOURCED_ERROR_NO_DATA;
+ }
+ return RESOURCED_ERROR_NONE;
+}
+
+static const struct module_ops compact_module_ops = {
+ .priority = MODULE_PRIORITY_LATE,
+ .name = "compact",
+ .init = compact_init,
+ .exit = compact_exit,
+ .check_runtime_support = compact_runtime_support,
+};
+
+MODULE_REGISTER(&compact_module_ops)
+