/*
 * Copyright(c) 2015, 2016 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/topology.h>
#include <linux/cpumask.h>
#include <linux/module.h>

#include "hfi.h"
#include "affinity.h"
#include "sdma.h"
#include "trace.h"

struct hfi1_affinity_node_list node_affinity = {
	.list = LIST_HEAD_INIT(node_affinity.list),
	.lock = __SPIN_LOCK_UNLOCKED(&node_affinity.lock),
};

/* Name of IRQ types, indexed by enum irq_type */
static const char * const irq_type_names[] = {
	"SDMA",
	"RCVCTXT",
	"GENERAL",
	"OTHER",
};

/* Per NUMA node count of HFI devices */
static unsigned int *hfi1_per_node_cntr;

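/* Reset a cpu_mask_set: clear the base and "used" masks and the generation */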
static inline void init_cpu_mask_set(struct cpu_mask_set *set)
{
	cpumask_clear(&set->mask);
	cpumask_clear(&set->used);
	set->gen = 0;
}

/* Initialize non-HT cpu cores mask */
void init_real_cpu_mask(void)
{
	int possible, curr_cpu, i, ht;

	cpumask_clear(&node_affinity.real_cpu_mask);

	/* Start with cpu online mask as the real cpu mask */
	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);

	/*
	 * Remove HT cores from the real cpu mask.  Do this in two steps below.
	 */
	possible = cpumask_weight(&node_affinity.real_cpu_mask);
	ht = cpumask_weight(topology_sibling_cpumask(
				cpumask_first(&node_affinity.real_cpu_mask)));
	/*
	 * Step 1.  Skip over the first N HT siblings and use them as the
	 * "real" cores.  Assumes that HT cores are not enumerated in
	 * succession (except in the single core case).
	 */
	curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
	for (i = 0; i < possible / ht; i++)
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	/*
	 * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
	 * skip any gaps.
	 */
	for (; i < possible; i++) {
		cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	}
}

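/*
 * Set up the global node_affinity state at driver load time: seed the
 * process mask from the online CPUs, record the topology counts, build the
 * non-HT "real" CPU mask, and count how many HFI1 devices sit on each NUMA
 * node by walking the PCI device table.
 */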
int node_affinity_init(void)
{
	int node;
	struct pci_dev *dev = NULL;
	const struct pci_device_id *ids = hfi1_pci_tbl;

	cpumask_clear(&node_affinity.proc.used);
	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);

	node_affinity.proc.gen = 0;
	node_affinity.num_core_siblings =
				cpumask_weight(topology_sibling_cpumask(
					cpumask_first(&node_affinity.proc.mask)
					));
	node_affinity.num_online_nodes = num_online_nodes();
	node_affinity.num_online_cpus = num_online_cpus();

	/*
	 * The real cpu mask is part of the affinity struct but it has to be
	 * initialized early. It is needed to calculate the number of user
	 * contexts in set_up_context_variables().
	 */
	init_real_cpu_mask();

	hfi1_per_node_cntr = kcalloc(num_possible_nodes(),
				     sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
	if (!hfi1_per_node_cntr)
		return -ENOMEM;

	while (ids->vendor) {
		dev = NULL;
		while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
			node = pcibus_to_node(dev->bus);
			if (node < 0)
				node = numa_node_id();

			hfi1_per_node_cntr[node]++;
		}
		ids++;
	}

	return 0;
}

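/*
 * Tear down what node_affinity_init() built: free every per-node affinity
 * entry on the global list and the per-node device counter array.
 */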
void node_affinity_destroy(void)
{
	struct list_head *pos, *q;
	struct hfi1_affinity_node *entry;

	spin_lock(&node_affinity.lock);
	list_for_each_safe(pos, q, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node,
				   list);
		list_del(pos);
		kfree(entry);
	}
	spin_unlock(&node_affinity.lock);
	kfree(hfi1_per_node_cntr);
}

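/*
 * Allocate one hfi1_affinity_node for the given NUMA node.  Returns NULL if
 * the allocation fails; the caller adds the entry to the global list.
 */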
static struct hfi1_affinity_node *node_affinity_allocate(int node)
{
	struct hfi1_affinity_node *entry;

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return NULL;
	entry->node = node;
	INIT_LIST_HEAD(&entry->list);
	return entry;
}

/*
 * It appends an entry to the list.
 * It *must* be called with node_affinity.lock held.
 */
static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
{
	list_add_tail(&entry->list, &node_affinity.list);
}

/* It must be called with node_affinity.lock held */
static struct hfi1_affinity_node *node_affinity_lookup(int node)
{
	struct list_head *pos;
	struct hfi1_affinity_node *entry;

	list_for_each(pos, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node, list);
		if (entry->node == node)
			return entry;
	}
	return NULL;
}

/*
 * Interrupt affinity.
 *
 * non-rcv avail gets a default mask that
 * starts as possible cpus with threads reset
 * and each rcv avail reset.
 *
 * rcv avail gets node relative 1 wrapping back
 * to the node relative 1 as necessary.
 *
 */
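/*
 * Illustration (hypothetical layout): on a node whose non-HT CPUs are 0-13,
 * with n_krcv_queues = 9 and one device on the node, CPU 0 serves the
 * general/control context, CPUs 1-8 serve the kernel receive contexts, and
 * CPUs 9-13 are left for SDMA engines.
 */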
int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
{
	int node = pcibus_to_node(dd->pcidev->bus);
	struct hfi1_affinity_node *entry;
	const struct cpumask *local_mask;
	int curr_cpu, possible, i;

	if (node < 0)
		node = numa_node_id();
	dd->node = node;

	local_mask = cpumask_of_node(dd->node);
	if (cpumask_first(local_mask) >= nr_cpu_ids)
		local_mask = topology_core_cpumask(0);

	spin_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	spin_unlock(&node_affinity.lock);

	/*
	 * If this is the first time this NUMA node's affinity is used,
	 * create an entry in the global affinity structure and initialize it.
	 */
	if (!entry) {
		entry = node_affinity_allocate(node);
		if (!entry) {
			dd_dev_err(dd,
				   "Unable to allocate global affinity node\n");
			return -ENOMEM;
		}
		init_cpu_mask_set(&entry->def_intr);
		init_cpu_mask_set(&entry->rcv_intr);
		cpumask_clear(&entry->general_intr_mask);
		/* Use the "real" cpu mask of this node as the default */
		cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
			    local_mask);

		/* fill in the receive list */
		possible = cpumask_weight(&entry->def_intr.mask);
		curr_cpu = cpumask_first(&entry->def_intr.mask);

		if (possible == 1) {
			/* only one CPU, everyone will use it */
			cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
		} else {
			/*
			 * The general/control context will be the first CPU in
			 * the default list, so it is removed from the default
			 * list and added to the general interrupt list.
			 */
			cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
			curr_cpu = cpumask_next(curr_cpu,
						&entry->def_intr.mask);

			/*
			 * Remove the remaining kernel receive queues from
			 * the default list and add them to the receive list.
			 */
			for (i = 0;
			     i < (dd->n_krcv_queues - 1) *
				  hfi1_per_node_cntr[dd->node];
			     i++) {
				cpumask_clear_cpu(curr_cpu,
						  &entry->def_intr.mask);
				cpumask_set_cpu(curr_cpu,
						&entry->rcv_intr.mask);
				curr_cpu = cpumask_next(curr_cpu,
							&entry->def_intr.mask);
				if (curr_cpu >= nr_cpu_ids)
					break;
			}

			/*
			 * If there ends up being 0 CPU cores leftover for SDMA
			 * engines, use the same CPU cores as general/control
			 * context.
			 */
			if (cpumask_weight(&entry->def_intr.mask) == 0)
				cpumask_copy(&entry->def_intr.mask,
					     &entry->general_intr_mask);
		}

		spin_lock(&node_affinity.lock);
		node_affinity_add_tail(entry);
		spin_unlock(&node_affinity.lock);
	}

	return 0;
}

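/*
 * Pick a CPU for one MSI-X vector based on its type: SDMA engines draw from
 * the node's default interrupt set, receive contexts from the receive set,
 * and the general/control context is pinned to the dedicated general CPU.
 * The chosen CPU is recorded in msix->mask and pushed to the IRQ core as an
 * affinity hint.
 */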
int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
{
	int ret;
	cpumask_var_t diff;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set = NULL;
	struct sdma_engine *sde = NULL;
	struct hfi1_ctxtdata *rcd = NULL;
	char extra[64];
	int cpu = -1;

	extra[0] = '\0';
	cpumask_clear(&msix->mask);

	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
	if (!ret)
		return -ENOMEM;

	spin_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	spin_unlock(&node_affinity.lock);

	switch (msix->type) {
	case IRQ_SDMA:
		sde = (struct sdma_engine *)msix->arg;
		scnprintf(extra, 64, "engine %u", sde->this_idx);
		set = &entry->def_intr;
		break;
	case IRQ_GENERAL:
		cpu = cpumask_first(&entry->general_intr_mask);
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		if (rcd->ctxt == HFI1_CTRL_CTXT)
			cpu = cpumask_first(&entry->general_intr_mask);
		else
			set = &entry->rcv_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	default:
		dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
		return -EINVAL;
	}

	/*
	 * The general and control contexts are placed on a particular
	 * CPU, which is set above.  Skip accounting for it.  Everything else
	 * finds its CPU here.
	 */
	if (cpu == -1 && set) {
		spin_lock(&node_affinity.lock);
		if (cpumask_equal(&set->mask, &set->used)) {
			/*
			 * We've used up all the CPUs, bump up the generation
			 * and reset the 'used' map
			 */
			set->gen++;
			cpumask_clear(&set->used);
		}
		cpumask_andnot(diff, &set->mask, &set->used);
		cpu = cpumask_first(diff);
		cpumask_set_cpu(cpu, &set->used);
		spin_unlock(&node_affinity.lock);
	}

	switch (msix->type) {
	case IRQ_SDMA:
		sde->cpu = cpu;
		break;
	case IRQ_GENERAL:
	case IRQ_RCVCTXT:
	case IRQ_OTHER:
		break;
	}

	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_info(dd, "IRQ vector: %u, type %s %s -> cpu: %d\n",
		    msix->msix.vector, irq_type_names[msix->type],
		    extra, cpu);
	irq_set_affinity_hint(msix->msix.vector, &msix->mask);

	free_cpumask_var(diff);
	return 0;
}

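/*
 * Undo hfi1_get_irq_affinity() for one vector: return its CPU to the owning
 * mask set (general/control vectors are not accounted), drop the affinity
 * hint, and clear msix->mask.
 */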
void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
			   struct hfi1_msix_entry *msix)
{
	struct cpu_mask_set *set = NULL;
	struct hfi1_ctxtdata *rcd;
	struct hfi1_affinity_node *entry;

	spin_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	spin_unlock(&node_affinity.lock);

	switch (msix->type) {
	case IRQ_SDMA:
		set = &entry->def_intr;
		break;
	case IRQ_GENERAL:
		/* Don't do accounting for general contexts */
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		/* Don't do accounting for control contexts */
		if (rcd->ctxt != HFI1_CTRL_CTXT)
			set = &entry->rcv_intr;
		break;
	default:
		return;
	}

	if (set) {
		spin_lock(&node_affinity.lock);
		cpumask_andnot(&set->used, &set->used, &msix->mask);
		if (cpumask_empty(&set->used) && set->gen) {
			set->gen--;
			cpumask_copy(&set->used, &set->mask);
		}
		spin_unlock(&node_affinity.lock);
	}

	irq_set_affinity_hint(msix->msix.vector, NULL);
	cpumask_clear(&msix->mask);
}

/* This should be called with node_affinity.lock held */
static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
				struct hfi1_affinity_node_list *affinity)
{
	int possible, curr_cpu, i;
	uint num_cores_per_socket = node_affinity.num_online_cpus /
					affinity->num_core_siblings /
						node_affinity.num_online_nodes;
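
	/*
	 * Illustration (hypothetical 2-socket, 2-way SMT box with 56 online
	 * CPUs): num_cores_per_socket = 56 / 2 / 2 = 14.  hw_thread_no = 0
	 * keeps the first 28 CPUs (the first HW thread of every core), and
	 * hw_thread_no = 1 shifts that mask left by 28 to select the second
	 * HW threads, assuming all first siblings are enumerated before all
	 * second siblings.
	 */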
	cpumask_copy(hw_thread_mask, &affinity->proc.mask);
	if (affinity->num_core_siblings > 0) {
		/* Removing other siblings not needed for now */
		possible = cpumask_weight(hw_thread_mask);
		curr_cpu = cpumask_first(hw_thread_mask);
		for (i = 0;
		     i < num_cores_per_socket * node_affinity.num_online_nodes;
		     i++)
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);

		for (; i < possible; i++) {
			cpumask_clear_cpu(curr_cpu, hw_thread_mask);
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
		}

		/* Identifying correct HW threads within physical cores */
		cpumask_shift_left(hw_thread_mask, hw_thread_mask,
				   num_cores_per_socket *
				   node_affinity.num_online_nodes *
				   hw_thread_no);
	}
}

int hfi1_get_proc_affinity(int node)
{
	int cpu = -1, ret, i;
	struct hfi1_affinity_node *entry;
	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
	const struct cpumask *node_mask,
		*proc_mask = tsk_cpus_allowed(current);
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	/*
	 * check whether process/context affinity has already
	 * been set
	 */
	if (cpumask_weight(proc_mask) == 1) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		/*
		 * Mark the pre-set CPU as used. This is atomic so we don't
		 * need the lock
		 */
		cpu = cpumask_first(proc_mask);
		cpumask_set_cpu(cpu, &set->used);
		goto done;
	} else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		goto done;
	}

	/*
	 * The process does not have a preset CPU affinity so find one to
	 * recommend using the following algorithm:
	 *
	 * For each user process that is opening a context on HFI Y:
	 *  a) If all cores are filled, reinitialize the bitmask
	 *  b) Fill real cores first, then HT cores (First set of HT
	 *     cores on all physical cores, then second set of HT core,
	 *     and, so on) in the following order:
	 *
	 *     1. Same NUMA node as HFI Y and not running an IRQ
	 *        handler
	 *     2. Same NUMA node as HFI Y and running an IRQ handler
	 *     3. Different NUMA node to HFI Y and not running an IRQ
	 *        handler
	 *     4. Different NUMA node to HFI Y and running an IRQ
	 *        handler
	 *  c) Mark core as filled in the bitmask. As user processes are
	 *     done, clear cores from the bitmask.
	 */
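
	/*
	 * Illustration (hypothetical 2-node box, device on node 0): early
	 * processes land on node-0 cores that do not service this device's
	 * interrupts, later ones on node-0 cores that do, then on node-1
	 * cores, and only once every core is used does the search move on
	 * to the next HW thread of each core.
	 */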
	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
	if (!ret)
		goto done;
	ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
	if (!ret)
		goto free_diff;
	ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
	if (!ret)
		goto free_hw_thread_mask;
	ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
	if (!ret)
		goto free_available_mask;

	spin_lock(&affinity->lock);
	/*
	 * If we've used all available HW threads, clear the mask and start
	 * overloading.
	 */
	if (cpumask_equal(&set->mask, &set->used)) {
		set->gen++;
		cpumask_clear(&set->used);
	}

	/*
	 * If NUMA node has CPUs used by interrupt handlers, include them in the
	 * interrupt handler mask.
	 */
	entry = node_affinity_lookup(node);
	if (entry) {
		cpumask_copy(intrs_mask, (entry->def_intr.gen ?
					  &entry->def_intr.mask :
					  &entry->def_intr.used));
		cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
						    &entry->rcv_intr.mask :
						    &entry->rcv_intr.used));
		cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
	}
	hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
		  cpumask_pr_args(intrs_mask));

	cpumask_copy(hw_thread_mask, &set->mask);

	/*
	 * If HT cores are enabled, identify which HW threads within the
	 * physical cores should be used.
	 */
	if (affinity->num_core_siblings > 0) {
		for (i = 0; i < affinity->num_core_siblings; i++) {
			find_hw_thread_mask(i, hw_thread_mask, affinity);

			/*
			 * If there's at least one available core for this HW
			 * thread number, stop looking for a core.
			 *
			 * diff will always be not empty at least once in this
			 * loop as the used mask gets reset when
			 * (set->mask == set->used) before this loop.
			 */
			cpumask_andnot(diff, hw_thread_mask, &set->used);
			if (!cpumask_empty(diff))
				break;
		}
	}
	hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
		  cpumask_pr_args(hw_thread_mask));

	node_mask = cpumask_of_node(node);
	hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
		  cpumask_pr_args(node_mask));

	/* Get cpumask of available CPUs on preferred NUMA */
	cpumask_and(available_mask, hw_thread_mask, node_mask);
	cpumask_andnot(available_mask, available_mask, &set->used);
	hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
		  cpumask_pr_args(available_mask));

	/*
	 * At first, we don't want to place processes on the same
	 * CPUs as interrupt handlers. Then, CPUs running interrupt
	 * handlers are used.
	 *
	 * 1) If diff is not empty, then there are CPUs not running
	 *    interrupt handlers available, so diff gets copied
	 *    over to available_mask.
	 * 2) If diff is empty, then all CPUs not running interrupt
	 *    handlers are taken, so available_mask contains all
	 *    available CPUs running interrupt handlers.
	 * 3) If available_mask is empty, then all CPUs on the
	 *    preferred NUMA node are taken, so other NUMA nodes are
	 *    used for process assignments using the same method as
	 *    the preferred NUMA node.
	 */
	cpumask_andnot(diff, available_mask, intrs_mask);
	if (!cpumask_empty(diff))
		cpumask_copy(available_mask, diff);

	/* If we don't have CPUs on the preferred node, use other NUMA nodes */
	if (cpumask_empty(available_mask)) {
		cpumask_andnot(available_mask, hw_thread_mask, &set->used);
		/* Excluding preferred NUMA cores */
		cpumask_andnot(available_mask, available_mask, node_mask);
		hfi1_cdbg(PROC,
			  "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
			  cpumask_pr_args(available_mask));

		/*
		 * At first, we don't want to place processes on the same
		 * CPUs as interrupt handlers.
		 */
		cpumask_andnot(diff, available_mask, intrs_mask);
		if (!cpumask_empty(diff))
			cpumask_copy(available_mask, diff);
	}
	hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
		  cpumask_pr_args(available_mask));

	cpu = cpumask_first(available_mask);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -1;
	else
		cpumask_set_cpu(cpu, &set->used);
	spin_unlock(&affinity->lock);
	hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);

	free_cpumask_var(intrs_mask);
free_available_mask:
	free_cpumask_var(available_mask);
free_hw_thread_mask:
	free_cpumask_var(hw_thread_mask);
free_diff:
	free_cpumask_var(diff);
done:
	return cpu;
}

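/*
 * Give back a CPU handed out by hfi1_get_proc_affinity() so it can be
 * recommended to a future process.
 */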
void hfi1_put_proc_affinity(int cpu)
{
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	if (cpu < 0)
		return;
	spin_lock(&affinity->lock);
	cpumask_clear_cpu(cpu, &set->used);
	hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
	if (cpumask_empty(&set->used) && set->gen) {
		set->gen--;
		cpumask_copy(&set->used, &set->mask);
	}
	spin_unlock(&affinity->lock);
}

/* Prevents concurrent reads and writes of the sdma_affinity attrib */
static DEFINE_MUTEX(sdma_affinity_mutex);

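/*
 * Store handler for the sdma_affinity sysfs attribute: parse a CPU list from
 * buf, reject masks that are empty or contain offline CPUs, then rebuild the
 * node's default interrupt mask and re-run affinity assignment for every
 * SDMA MSI-X vector.
 */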
int hfi1_set_sdma_affinity(struct hfi1_devdata *dd, const char *buf,
			   size_t count)
{
	struct hfi1_affinity_node *entry;
	cpumask_var_t mask;
	int ret, i;

	spin_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	spin_unlock(&node_affinity.lock);

	if (!entry)
		return -EINVAL;

	ret = zalloc_cpumask_var(&mask, GFP_KERNEL);
	if (!ret)
		return -ENOMEM;

	ret = cpulist_parse(buf, mask);
	if (ret)
		goto out;

	if (!cpumask_subset(mask, cpu_online_mask) || cpumask_empty(mask)) {
		dd_dev_warn(dd, "Invalid CPU mask\n");
		ret = -EINVAL;
		goto out;
	}

	mutex_lock(&sdma_affinity_mutex);
	/* reset the SDMA interrupt affinity details */
	init_cpu_mask_set(&entry->def_intr);
	cpumask_copy(&entry->def_intr.mask, mask);
	/*
	 * Reassign the affinity for each SDMA interrupt.
	 */
	for (i = 0; i < dd->num_msix_entries; i++) {
		struct hfi1_msix_entry *msix;

		msix = &dd->msix_entries[i];
		if (msix->type != IRQ_SDMA)
			continue;

		ret = hfi1_get_irq_affinity(dd, msix);
		if (ret)
			break;
	}
	mutex_unlock(&sdma_affinity_mutex);

out:
	free_cpumask_var(mask);
	return ret ? ret : strnlen(buf, PAGE_SIZE);
}

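/*
 * Show handler for the sdma_affinity sysfs attribute: print the current SDMA
 * interrupt CPU mask into buf as a CPU list.
 */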
int hfi1_get_sdma_affinity(struct hfi1_devdata *dd, char *buf)
{
	struct hfi1_affinity_node *entry;

	spin_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	spin_unlock(&node_affinity.lock);

	if (!entry)
		return -EINVAL;

	mutex_lock(&sdma_affinity_mutex);
	cpumap_print_to_pagebuf(true, buf, &entry->def_intr.mask);
	mutex_unlock(&sdma_affinity_mutex);
	return strnlen(buf, PAGE_SIZE);
}