Drivers: hv: vmbus: Implement NUMA aware CPU affinity for channels

author K. Y. Srinivasan <kys@microsoft.com>

Sun, 31 May 2015 06:37:48 +0000 (23:37 -0700)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Mon, 1 Jun 2015 01:56:31 +0000 (10:56 +0900)
author K. Y. Srinivasan <kys@microsoft.com>
Sun, 31 May 2015 06:37:48 +0000 (23:37 -0700)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 1 Jun 2015 01:56:31 +0000 (10:56 +0900)
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c

index c3eba37..4506a66 100644 (file)
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -370,25 +370,27 @@ static const struct hv_vmbus_device_id hp_devs[] = {
  /*
   * We use this state to statically distribute the channel interrupt load.
   */
-static u32  next_vp;
+static int next_numa_node_id;
  
  /*
   * Starting with Win8, we can statically distribute the incoming
- * channel interrupt load by binding a channel to VCPU. We
- * implement here a simple round robin scheme for distributing
- * the interrupt load.
- * We will bind channels that are not performance critical to cpu 0 and
- * performance critical channels (IDE, SCSI and Network) will be uniformly
- * distributed across all available CPUs.
+ * channel interrupt load by binding a channel to VCPU.
+ * We do this in a hierarchical fashion:
+ * First distribute the primary channels across available NUMA nodes
+ * and then distribute the subchannels amongst the CPUs in the NUMA
+ * node assigned to the primary channel.
+ *
+ * For pre-win8 hosts or non-performance critical channels we assign the
+ * first CPU in the first NUMA node.
   */
  static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_guid)
  {
         u32 cur_cpu;
         int i;
         bool perf_chn = false;
-       u32 max_cpus = num_online_cpus();
-       struct vmbus_channel *primary = channel->primary_channel, *prev;
-       unsigned long flags;
+       struct vmbus_channel *primary = channel->primary_channel;
+       int next_node;
+       struct cpumask available_mask;
  
         for (i = IDE; i < MAX_PERF_CHN; i++) {
                 if (!memcmp(type_guid->b, hp_devs[i].guid,
@@ -405,36 +407,48 @@ static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_gui
                  * Also if the channel is not a performance critical
                  * channel, bind it to cpu 0.
                  */
+               channel->numa_node = 0;
+               cpumask_set_cpu(0, &channel->alloced_cpus_in_node);
                 channel->target_cpu = 0;
                 channel->target_vp = hv_context.vp_index[0];
                 return;
         }
  
         /*
-        * Primary channels are distributed evenly across all vcpus we have.
-        * When the host asks us to create subchannels it usually makes us
-        * num_cpus-1 offers and we are supposed to distribute the work evenly
-        * among the channel itself and all its subchannels. Make sure they are
-        * all assigned to different vcpus.
+        * We distribute primary channels evenly across all the available
+        * NUMA nodes and within the assigned NUMA node we will assign the
+        * first available CPU to the primary channel.
+        * The sub-channels will be assigned to the CPUs available in the
+        * NUMA node evenly.
          */
-       if (!primary)
-               cur_cpu = (++next_vp % max_cpus);
-       else {
+       if (!primary) {
+               while (true) {
+                       next_node = next_numa_node_id++;
+                       if (next_node == nr_node_ids)
+                               next_node = next_numa_node_id = 0;
+                       if (cpumask_empty(cpumask_of_node(next_node)))
+                               continue;
+                       break;
+               }
+               channel->numa_node = next_node;
+               primary = channel;
+       }
+
+       if (cpumask_weight(&primary->alloced_cpus_in_node) ==
+           cpumask_weight(cpumask_of_node(primary->numa_node))) {
                 /*
-                * Let's assign the first subchannel of a channel to the
-                * primary->target_cpu+1 and all the subsequent channels to
-                * the prev->target_cpu+1.
+                * We have cycled through all the CPUs in the node;
+                * reset the alloced map.
                  */
-               spin_lock_irqsave(&primary->lock, flags);
-               if (primary->num_sc == 1)
-                       cur_cpu = (primary->target_cpu + 1) % max_cpus;
-               else {
-                       prev = list_prev_entry(channel, sc_list);
-                       cur_cpu = (prev->target_cpu + 1) % max_cpus;
-               }
-               spin_unlock_irqrestore(&primary->lock, flags);
+               cpumask_clear(&primary->alloced_cpus_in_node);
         }
  
+       cpumask_xor(&available_mask, &primary->alloced_cpus_in_node,
+                   cpumask_of_node(primary->numa_node));
+
+       cur_cpu = cpumask_next(-1, &available_mask);
+       cpumask_set_cpu(cur_cpu, &primary->alloced_cpus_in_node);
+
         channel->target_cpu = cur_cpu;
         channel->target_vp = hv_context.vp_index[cur_cpu];
  }
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h

index 4317cd1..30d3a1f 100644 (file)
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -697,6 +697,11 @@ struct vmbus_channel {
         /* The corresponding CPUID in the guest */
         u32 target_cpu;
         /*
+        * State to manage the CPU affinity of channels.
+        */
+       struct cpumask alloced_cpus_in_node;
+       int numa_node;
+       /*
          * Support for sub-channels. For high performance devices,
          * it will be useful to have multiple sub-channels to support
          * a scalable communication infrastructure with the host.
author	K. Y. Srinivasan <kys@microsoft.com>
	Sun, 31 May 2015 06:37:48 +0000 (23:37 -0700)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Mon, 1 Jun 2015 01:56:31 +0000 (10:56 +0900)
drivers/hv/channel_mgmt.c		patch | blob | history
include/linux/hyperv.h		patch | blob | history