#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
+#include <linux/nodemask.h>
#include <trace/events/block.h>
#include "md.h"
#include "raid0.h"
#include "bitmap.h"
+#define cpu_to_group(cpu) cpu_to_node(cpu)
+#define ANY_GROUP NUMA_NO_NODE
+
+static struct workqueue_struct *raid5_wq;
/*
* Stripe cache
*/
test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}
+static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
+{
+ struct r5conf *conf = sh->raid_conf;
+ struct r5worker_group *group;
+ int i, cpu = sh->cpu;
+
+ if (!cpu_online(cpu)) {
+ cpu = cpumask_any(cpu_online_mask);
+ sh->cpu = cpu;
+ }
+
+ if (list_empty(&sh->lru)) {
+ struct r5worker_group *group;
+ group = conf->worker_groups + cpu_to_group(cpu);
+ list_add_tail(&sh->lru, &group->handle_list);
+ }
+
+ if (conf->worker_cnt_per_group == 0) {
+ md_wakeup_thread(conf->mddev->thread);
+ return;
+ }
+
+ group = conf->worker_groups + cpu_to_group(sh->cpu);
+
+ for (i = 0; i < conf->worker_cnt_per_group; i++)
+ queue_work_on(sh->cpu, raid5_wq, &group->workers[i].work);
+}
+
static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
{
BUG_ON(!list_empty(&sh->lru));
else {
clear_bit(STRIPE_DELAYED, &sh->state);
clear_bit(STRIPE_BIT_DELAY, &sh->state);
- list_add_tail(&sh->lru, &conf->handle_list);
+ if (conf->worker_cnt_per_group == 0) {
+ list_add_tail(&sh->lru, &conf->handle_list);
+ } else {
+ raid5_wakeup_stripe_thread(sh);
+ return;
+ }
}
md_wakeup_thread(conf->mddev->thread);
} else {
raid5_build_block(sh, i, previous);
}
insert_hash(conf, sh);
+ sh->cpu = smp_processor_id();
}
static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
atomic_inc(&conf->preread_active_stripes);
list_add_tail(&sh->lru, &conf->hold_list);
+ raid5_wakeup_stripe_thread(sh);
}
}
}
* head of the hold_list has changed, i.e. the head was promoted to the
* handle_list.
*/
-static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
+static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
{
- struct stripe_head *sh;
+ struct stripe_head *sh = NULL, *tmp;
+ struct list_head *handle_list = NULL;
+
+ if (conf->worker_cnt_per_group == 0) {
+ handle_list = &conf->handle_list;
+ } else if (group != ANY_GROUP) {
+ handle_list = &conf->worker_groups[group].handle_list;
+ } else {
+ int i;
+ for (i = 0; i < conf->group_cnt; i++) {
+ handle_list = &conf->worker_groups[i].handle_list;
+ if (!list_empty(handle_list))
+ break;
+ }
+ }
pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
__func__,
- list_empty(&conf->handle_list) ? "empty" : "busy",
+ list_empty(handle_list) ? "empty" : "busy",
list_empty(&conf->hold_list) ? "empty" : "busy",
atomic_read(&conf->pending_full_writes), conf->bypass_count);
- if (!list_empty(&conf->handle_list)) {
- sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
+ if (!list_empty(handle_list)) {
+ sh = list_entry(handle_list->next, typeof(*sh), lru);
if (list_empty(&conf->hold_list))
conf->bypass_count = 0;
((conf->bypass_threshold &&
conf->bypass_count > conf->bypass_threshold) ||
atomic_read(&conf->pending_full_writes) == 0)) {
- sh = list_entry(conf->hold_list.next,
- typeof(*sh), lru);
- conf->bypass_count -= conf->bypass_threshold;
- if (conf->bypass_count < 0)
- conf->bypass_count = 0;
- } else
+
+ list_for_each_entry(tmp, &conf->hold_list, lru) {
+ if (conf->worker_cnt_per_group == 0 ||
+ group == ANY_GROUP ||
+ !cpu_online(tmp->cpu) ||
+ cpu_to_group(tmp->cpu) == group) {
+ sh = tmp;
+ break;
+ }
+ }
+
+ if (sh) {
+ conf->bypass_count -= conf->bypass_threshold;
+ if (conf->bypass_count < 0)
+ conf->bypass_count = 0;
+ }
+ }
+
+ if (!sh)
return NULL;
list_del_init(&sh->lru);
}
#define MAX_STRIPE_BATCH 8
-static int handle_active_stripes(struct r5conf *conf)
+static int handle_active_stripes(struct r5conf *conf, int group)
{
struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
int i, batch_size = 0;
while (batch_size < MAX_STRIPE_BATCH &&
- (sh = __get_priority_stripe(conf)) != NULL)
+ (sh = __get_priority_stripe(conf, group)) != NULL)
batch[batch_size++] = sh;
if (batch_size == 0)
return batch_size;
}
+static void raid5_do_work(struct work_struct *work)
+{
+ struct r5worker *worker = container_of(work, struct r5worker, work);
+ struct r5worker_group *group = worker->group;
+ struct r5conf *conf = group->conf;
+ int group_id = group - conf->worker_groups;
+ int handled;
+ struct blk_plug plug;
+
+ pr_debug("+++ raid5worker active\n");
+
+ blk_start_plug(&plug);
+ handled = 0;
+ spin_lock_irq(&conf->device_lock);
+ while (1) {
+ int batch_size, released;
+
+ released = release_stripe_list(conf);
+
+ batch_size = handle_active_stripes(conf, group_id);
+ if (!batch_size && !released)
+ break;
+ handled += batch_size;
+ }
+ pr_debug("%d stripes handled\n", handled);
+
+ spin_unlock_irq(&conf->device_lock);
+ blk_finish_plug(&plug);
+
+ pr_debug("--- raid5worker inactive\n");
+}
+
/*
* This is our raid5 kernel thread.
*
handled++;
}
- batch_size = handle_active_stripes(conf);
+ batch_size = handle_active_stripes(conf, ANY_GROUP);
if (!batch_size && !released)
break;
handled += batch_size;
.attrs = raid5_attrs,
};
+static int alloc_thread_groups(struct r5conf *conf, int cnt)
+{
+ int i, j;
+ ssize_t size;
+ struct r5worker *workers;
+
+ conf->worker_cnt_per_group = cnt;
+ if (cnt == 0) {
+ conf->worker_groups = NULL;
+ return 0;
+ }
+ conf->group_cnt = num_possible_nodes();
+ size = sizeof(struct r5worker) * cnt;
+ workers = kzalloc(size * conf->group_cnt, GFP_NOIO);
+ conf->worker_groups = kzalloc(sizeof(struct r5worker_group) *
+ conf->group_cnt, GFP_NOIO);
+ if (!conf->worker_groups || !workers) {
+ kfree(workers);
+ kfree(conf->worker_groups);
+ conf->worker_groups = NULL;
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < conf->group_cnt; i++) {
+ struct r5worker_group *group;
+
+ group = &conf->worker_groups[i];
+ INIT_LIST_HEAD(&group->handle_list);
+ group->conf = conf;
+ group->workers = workers + i * cnt;
+
+ for (j = 0; j < cnt; j++) {
+ group->workers[j].group = group;
+ INIT_WORK(&group->workers[j].work, raid5_do_work);
+ }
+ }
+
+ return 0;
+}
+
+static void free_thread_groups(struct r5conf *conf)
+{
+ if (conf->worker_groups)
+ kfree(conf->worker_groups[0].workers);
+ kfree(conf->worker_groups);
+ conf->worker_groups = NULL;
+}
+
static sector_t
raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
{
static void free_conf(struct r5conf *conf)
{
+ free_thread_groups(conf);
shrink_stripes(conf);
raid5_free_percpu(conf);
kfree(conf->disks);
conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
if (conf == NULL)
goto abort;
+ /* Don't enable multi-threading by default*/
+ if (alloc_thread_groups(conf, 0))
+ goto abort;
spin_lock_init(&conf->device_lock);
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
static int __init raid5_init(void)
{
+ raid5_wq = alloc_workqueue("raid5wq",
+ WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
+ if (!raid5_wq)
+ return -ENOMEM;
register_md_personality(&raid6_personality);
register_md_personality(&raid5_personality);
register_md_personality(&raid4_personality);
unregister_md_personality(&raid6_personality);
unregister_md_personality(&raid5_personality);
unregister_md_personality(&raid4_personality);
+ destroy_workqueue(raid5_wq);
}
module_init(raid5_init);