staging/rdma/hfi1: Remove query_device function
[platform/kernel/linux-rpi.git] / drivers/staging/rdma/hfi1/verbs.c
1 /*
2  *
3  * This file is provided under a dual BSD/GPLv2 license.  When using or
4  * redistributing this file, you may do so under either license.
5  *
6  * GPL LICENSE SUMMARY
7  *
8  * Copyright(c) 2015 Intel Corporation.
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of version 2 of the GNU General Public License as
12  * published by the Free Software Foundation.
13  *
14  * This program is distributed in the hope that it will be useful, but
15  * WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * General Public License for more details.
18  *
19  * BSD LICENSE
20  *
21  * Copyright(c) 2015 Intel Corporation.
22  *
23  * Redistribution and use in source and binary forms, with or without
24  * modification, are permitted provided that the following conditions
25  * are met:
26  *
27  *  - Redistributions of source code must retain the above copyright
28  *    notice, this list of conditions and the following disclaimer.
29  *  - Redistributions in binary form must reproduce the above copyright
30  *    notice, this list of conditions and the following disclaimer in
31  *    the documentation and/or other materials provided with the
32  *    distribution.
33  *  - Neither the name of Intel Corporation nor the names of its
34  *    contributors may be used to endorse or promote products derived
35  *    from this software without specific prior written permission.
36  *
37  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48  *
49  */
50
51 #include <rdma/ib_mad.h>
52 #include <rdma/ib_user_verbs.h>
53 #include <linux/io.h>
54 #include <linux/module.h>
55 #include <linux/utsname.h>
56 #include <linux/rculist.h>
57 #include <linux/mm.h>
58 #include <linux/random.h>
59 #include <linux/vmalloc.h>
60
61 #include "hfi.h"
62 #include "common.h"
63 #include "device.h"
64 #include "trace.h"
65 #include "qp.h"
66 #include "sdma.h"
67
68 static unsigned int hfi1_lkey_table_size = 16;
69 module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
70                    S_IRUGO);
71 MODULE_PARM_DESC(lkey_table_size,
72                  "LKEY table size in bits (2^n, 1 <= n <= 23)");
73
74 static unsigned int hfi1_max_pds = 0xFFFF;
75 module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO);
76 MODULE_PARM_DESC(max_pds,
77                  "Maximum number of protection domains to support");
78
79 static unsigned int hfi1_max_ahs = 0xFFFF;
80 module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO);
81 MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
82
83 unsigned int hfi1_max_cqes = 0x2FFFF;
84 module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO);
85 MODULE_PARM_DESC(max_cqes,
86                  "Maximum number of completion queue entries to support");
87
88 unsigned int hfi1_max_cqs = 0x1FFFF;
89 module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO);
90 MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
91
92 unsigned int hfi1_max_qp_wrs = 0x3FFF;
93 module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO);
94 MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
95
96 unsigned int hfi1_max_qps = 16384;
97 module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO);
98 MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
99
100 unsigned int hfi1_max_sges = 0x60;
101 module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO);
102 MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
103
104 unsigned int hfi1_max_mcast_grps = 16384;
105 module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO);
106 MODULE_PARM_DESC(max_mcast_grps,
107                  "Maximum number of multicast groups to support");
108
109 unsigned int hfi1_max_mcast_qp_attached = 16;
110 module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached,
111                    uint, S_IRUGO);
112 MODULE_PARM_DESC(max_mcast_qp_attached,
113                  "Maximum number of attached QPs to support");
114
115 unsigned int hfi1_max_srqs = 1024;
116 module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO);
117 MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
118
119 unsigned int hfi1_max_srq_sges = 128;
120 module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO);
121 MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
122
123 unsigned int hfi1_max_srq_wrs = 0x1FFFF;
124 module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO);
125 MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs to support");
126
127 static void verbs_sdma_complete(
128         struct sdma_txreq *cookie,
129         int status,
130         int drained);
131
132 /* Length of buffer to create verbs txreq cache name */
133 #define TXREQ_NAME_LEN 24
134
135 /*
136  * Note that it is OK to post send work requests in the SQE and ERR
137  * states; hfi1_do_send() will process them and generate error
138  * completions as per IB 1.2 C10-96.
139  */
140 const int ib_hfi1_state_ops[IB_QPS_ERR + 1] = {
141         [IB_QPS_RESET] = 0,
142         [IB_QPS_INIT] = HFI1_POST_RECV_OK,
143         [IB_QPS_RTR] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK,
144         [IB_QPS_RTS] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK |
145             HFI1_POST_SEND_OK | HFI1_PROCESS_SEND_OK |
146             HFI1_PROCESS_NEXT_SEND_OK,
147         [IB_QPS_SQD] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK |
148             HFI1_POST_SEND_OK | HFI1_PROCESS_SEND_OK,
149         [IB_QPS_SQE] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK |
150             HFI1_POST_SEND_OK | HFI1_FLUSH_SEND,
151         [IB_QPS_ERR] = HFI1_POST_RECV_OK | HFI1_FLUSH_RECV |
152             HFI1_POST_SEND_OK | HFI1_FLUSH_SEND,
153 };
154
155 static inline void _hfi1_schedule_send(struct rvt_qp *qp);
156
157 /*
158  * Translate ib_wr_opcode into ib_wc_opcode.
159  */
160 const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
161         [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
162         [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
163         [IB_WR_SEND] = IB_WC_SEND,
164         [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
165         [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
166         [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
167         [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
168 };
169
170 /*
171  * Length of header by opcode, 0 --> not supported
172  */
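/*
 * Each entry is the 8 byte LRH plus the 12 byte BTH, plus whatever
 * extended transport headers the opcode carries (e.g. RETH 16, AETH 4,
 * AtomicETH 28, DETH 8, and 4 bytes of immediate data).
 */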
173 const u8 hdr_len_by_opcode[256] = {
174         /* RC */
175         [IB_OPCODE_RC_SEND_FIRST]                     = 12 + 8,
176         [IB_OPCODE_RC_SEND_MIDDLE]                    = 12 + 8,
177         [IB_OPCODE_RC_SEND_LAST]                      = 12 + 8,
178         [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
179         [IB_OPCODE_RC_SEND_ONLY]                      = 12 + 8,
180         [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
181         [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
182         [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = 12 + 8,
183         [IB_OPCODE_RC_RDMA_WRITE_LAST]                = 12 + 8,
184         [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
185         [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
186         [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
187         [IB_OPCODE_RC_RDMA_READ_REQUEST]              = 12 + 8 + 16,
188         [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = 12 + 8 + 4,
189         [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = 12 + 8,
190         [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = 12 + 8 + 4,
191         [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = 12 + 8 + 4,
192         [IB_OPCODE_RC_ACKNOWLEDGE]                    = 12 + 8 + 4,
193         [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = 12 + 8 + 4,
194         [IB_OPCODE_RC_COMPARE_SWAP]                   = 12 + 8 + 28,
195         [IB_OPCODE_RC_FETCH_ADD]                      = 12 + 8 + 28,
196         /* UC */
197         [IB_OPCODE_UC_SEND_FIRST]                     = 12 + 8,
198         [IB_OPCODE_UC_SEND_MIDDLE]                    = 12 + 8,
199         [IB_OPCODE_UC_SEND_LAST]                      = 12 + 8,
200         [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
201         [IB_OPCODE_UC_SEND_ONLY]                      = 12 + 8,
202         [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
203         [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
204         [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = 12 + 8,
205         [IB_OPCODE_UC_RDMA_WRITE_LAST]                = 12 + 8,
206         [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
207         [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
208         [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
209         /* UD */
210         [IB_OPCODE_UD_SEND_ONLY]                      = 12 + 8 + 8,
211         [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 12
212 };
213
214 static const opcode_handler opcode_handler_tbl[256] = {
215         /* RC */
216         [IB_OPCODE_RC_SEND_FIRST]                     = &hfi1_rc_rcv,
217         [IB_OPCODE_RC_SEND_MIDDLE]                    = &hfi1_rc_rcv,
218         [IB_OPCODE_RC_SEND_LAST]                      = &hfi1_rc_rcv,
219         [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
220         [IB_OPCODE_RC_SEND_ONLY]                      = &hfi1_rc_rcv,
221         [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
222         [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = &hfi1_rc_rcv,
223         [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = &hfi1_rc_rcv,
224         [IB_OPCODE_RC_RDMA_WRITE_LAST]                = &hfi1_rc_rcv,
225         [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv,
226         [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = &hfi1_rc_rcv,
227         [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv,
228         [IB_OPCODE_RC_RDMA_READ_REQUEST]              = &hfi1_rc_rcv,
229         [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = &hfi1_rc_rcv,
230         [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = &hfi1_rc_rcv,
231         [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = &hfi1_rc_rcv,
232         [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = &hfi1_rc_rcv,
233         [IB_OPCODE_RC_ACKNOWLEDGE]                    = &hfi1_rc_rcv,
234         [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = &hfi1_rc_rcv,
235         [IB_OPCODE_RC_COMPARE_SWAP]                   = &hfi1_rc_rcv,
236         [IB_OPCODE_RC_FETCH_ADD]                      = &hfi1_rc_rcv,
237         /* UC */
238         [IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
239         [IB_OPCODE_UC_SEND_MIDDLE]                    = &hfi1_uc_rcv,
240         [IB_OPCODE_UC_SEND_LAST]                      = &hfi1_uc_rcv,
241         [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
242         [IB_OPCODE_UC_SEND_ONLY]                      = &hfi1_uc_rcv,
243         [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
244         [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = &hfi1_uc_rcv,
245         [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = &hfi1_uc_rcv,
246         [IB_OPCODE_UC_RDMA_WRITE_LAST]                = &hfi1_uc_rcv,
247         [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv,
248         [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = &hfi1_uc_rcv,
249         [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv,
250         /* UD */
251         [IB_OPCODE_UD_SEND_ONLY]                      = &hfi1_ud_rcv,
252         [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_ud_rcv,
253         /* CNP */
254         [IB_OPCODE_CNP]                               = &hfi1_cnp_rcv
255 };
256
257 /*
258  * System image GUID.
259  */
260 __be64 ib_hfi1_sys_image_guid;
261
262 /**
263  * hfi1_copy_sge - copy data to SGE memory
264  * @ss: the SGE state
265  * @data: the data to copy
266  * @length: the length of the data
267  */
268 void hfi1_copy_sge(
269         struct rvt_sge_state *ss,
270         void *data, u32 length,
271         int release)
272 {
273         struct rvt_sge *sge = &ss->sge;
274
275         while (length) {
276                 u32 len = sge->length;
277
278                 if (len > length)
279                         len = length;
280                 if (len > sge->sge_length)
281                         len = sge->sge_length;
282                 WARN_ON_ONCE(len == 0);
283                 memcpy(sge->vaddr, data, len);
284                 sge->vaddr += len;
285                 sge->length -= len;
286                 sge->sge_length -= len;
287                 if (sge->sge_length == 0) {
288                         if (release)
289                                 rvt_put_mr(sge->mr);
290                         if (--ss->num_sge)
291                                 *sge = *ss->sg_list++;
292                 } else if (sge->length == 0 && sge->mr->lkey) {
293                         if (++sge->n >= RVT_SEGSZ) {
294                                 if (++sge->m >= sge->mr->mapsz)
295                                         break;
296                                 sge->n = 0;
297                         }
298                         sge->vaddr =
299                                 sge->mr->map[sge->m]->segs[sge->n].vaddr;
300                         sge->length =
301                                 sge->mr->map[sge->m]->segs[sge->n].length;
302                 }
303                 data += len;
304                 length -= len;
305         }
306 }
307
308 /**
309  * hfi1_skip_sge - skip over SGE memory
310  * @ss: the SGE state
311  * @length: the number of bytes to skip
312  */
313 void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release)
314 {
315         struct rvt_sge *sge = &ss->sge;
316
317         while (length) {
318                 u32 len = sge->length;
319
320                 if (len > length)
321                         len = length;
322                 if (len > sge->sge_length)
323                         len = sge->sge_length;
324                 WARN_ON_ONCE(len == 0);
325                 sge->vaddr += len;
326                 sge->length -= len;
327                 sge->sge_length -= len;
328                 if (sge->sge_length == 0) {
329                         if (release)
330                                 rvt_put_mr(sge->mr);
331                         if (--ss->num_sge)
332                                 *sge = *ss->sg_list++;
333                 } else if (sge->length == 0 && sge->mr->lkey) {
334                         if (++sge->n >= RVT_SEGSZ) {
335                                 if (++sge->m >= sge->mr->mapsz)
336                                         break;
337                                 sge->n = 0;
338                         }
339                         sge->vaddr =
340                                 sge->mr->map[sge->m]->segs[sge->n].vaddr;
341                         sge->length =
342                                 sge->mr->map[sge->m]->segs[sge->n].length;
343                 }
344                 length -= len;
345         }
346 }
347
348 /**
349  * post_one_send - post one RC, UC, or UD send work request
350  * @qp: the QP to post on
351  * @wr: the work request to send
352  */
353 static int post_one_send(struct rvt_qp *qp, struct ib_send_wr *wr)
354 {
355         struct rvt_swqe *wqe;
356         u32 next;
357         int i;
358         int j;
359         int acc;
360         struct rvt_lkey_table *rkt;
361         struct rvt_pd *pd;
362         struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
363         struct hfi1_pportdata *ppd;
364         struct hfi1_ibport *ibp;
365
366         /* IB spec says that num_sge == 0 is OK. */
367         if (unlikely(wr->num_sge > qp->s_max_sge))
368                 return -EINVAL;
369
370         ppd = &dd->pport[qp->port_num - 1];
371         ibp = &ppd->ibport_data;
372
373         /*
374          * Don't allow RDMA reads or atomic operations on UC QPs,
375          * nor undefined opcodes.
376          * Make sure buffer is large enough to hold the result for atomics.
377          */
378         if (qp->ibqp.qp_type == IB_QPT_UC) {
379                 if ((unsigned) wr->opcode >= IB_WR_RDMA_READ)
380                         return -EINVAL;
381         } else if (qp->ibqp.qp_type != IB_QPT_RC) {
382                 /* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */
383                 if (wr->opcode != IB_WR_SEND &&
384                     wr->opcode != IB_WR_SEND_WITH_IMM)
385                         return -EINVAL;
386                 /* Check UD destination address PD */
387                 if (qp->ibqp.pd != ud_wr(wr)->ah->pd)
388                         return -EINVAL;
389         } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD)
390                 return -EINVAL;
391         else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
392                    (wr->num_sge == 0 ||
393                     wr->sg_list[0].length < sizeof(u64) ||
394                     wr->sg_list[0].addr & (sizeof(u64) - 1)))
395                 return -EINVAL;
396         else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic)
397                 return -EINVAL;
398
399         next = qp->s_head + 1;
400         if (next >= qp->s_size)
401                 next = 0;
402         if (next == qp->s_last)
403                 return -ENOMEM;
404
405         rkt = &to_idev(qp->ibqp.device)->rdi.lkey_table;
406         pd = ibpd_to_rvtpd(qp->ibqp.pd);
407         wqe = get_swqe_ptr(qp, qp->s_head);
408
409
410         if (qp->ibqp.qp_type != IB_QPT_UC &&
411             qp->ibqp.qp_type != IB_QPT_RC)
412                 memcpy(&wqe->ud_wr, ud_wr(wr), sizeof(wqe->ud_wr));
413         else if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM ||
414                  wr->opcode == IB_WR_RDMA_WRITE ||
415                  wr->opcode == IB_WR_RDMA_READ)
416                 memcpy(&wqe->rdma_wr, rdma_wr(wr), sizeof(wqe->rdma_wr));
417         else if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
418                  wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
419                 memcpy(&wqe->atomic_wr, atomic_wr(wr), sizeof(wqe->atomic_wr));
420         else
421                 memcpy(&wqe->wr, wr, sizeof(wqe->wr));
422
423         wqe->length = 0;
424         j = 0;
425         if (wr->num_sge) {
426                 acc = wr->opcode >= IB_WR_RDMA_READ ?
427                         IB_ACCESS_LOCAL_WRITE : 0;
428                 for (i = 0; i < wr->num_sge; i++) {
429                         u32 length = wr->sg_list[i].length;
430                         int ok;
431
432                         if (length == 0)
433                                 continue;
434                         ok = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j],
435                                          &wr->sg_list[i], acc);
436                         if (!ok)
437                                 goto bail_inval_free;
438                         wqe->length += length;
439                         j++;
440                 }
441                 wqe->wr.num_sge = j;
442         }
443         if (qp->ibqp.qp_type == IB_QPT_UC ||
444             qp->ibqp.qp_type == IB_QPT_RC) {
445                 if (wqe->length > 0x80000000U)
446                         goto bail_inval_free;
447         } else {
448                 atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
449         }
450         wqe->ssn = qp->s_ssn++;
451         qp->s_head = next;
452
453         return 0;
454
455 bail_inval_free:
456         /* release mr holds */
457         while (j) {
458                 struct rvt_sge *sge = &wqe->sg_list[--j];
459
460                 rvt_put_mr(sge->mr);
461         }
462         return -EINVAL;
463 }
464
465 /**
466  * post_send - post a send on a QP
467  * @ibqp: the QP to post the send on
468  * @wr: the list of work requests to post
469  * @bad_wr: the first bad WR is put here
470  *
471  * This may be called from interrupt context.
472  */
473 static int post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
474                      struct ib_send_wr **bad_wr)
475 {
476         struct rvt_qp *qp = to_iqp(ibqp);
477         struct hfi1_qp_priv *priv = qp->priv;
478         int err = 0;
479         int call_send;
480         unsigned long flags;
481         unsigned nreq = 0;
482
483         spin_lock_irqsave(&qp->s_lock, flags);
484
485         /* Check that state is OK to post send. */
486         if (unlikely(!(ib_hfi1_state_ops[qp->state] & HFI1_POST_SEND_OK))) {
487                 spin_unlock_irqrestore(&qp->s_lock, flags);
488                 return -EINVAL;
489         }
490
491         /* send queue empty and a single (non-list) WR -> call send directly */
492         call_send = qp->s_head == qp->s_last && !wr->next;
493
494         for (; wr; wr = wr->next) {
495                 err = post_one_send(qp, wr);
496                 if (unlikely(err)) {
497                         *bad_wr = wr;
498                         goto bail;
499                 }
500                 nreq++;
501         }
502 bail:
503         spin_unlock_irqrestore(&qp->s_lock, flags);
504         if (nreq && !call_send)
505                 _hfi1_schedule_send(qp);
506         if (nreq && call_send)
507                 hfi1_do_send(&priv->s_iowait.iowork);
508         return err;
509 }
510
511 /**
512  * post_receive - post a receive on a QP
513  * @ibqp: the QP to post the receive on
514  * @wr: the WR to post
515  * @bad_wr: the first bad WR is put here
516  *
517  * This may be called from interrupt context.
518  */
519 static int post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
520                         struct ib_recv_wr **bad_wr)
521 {
522         struct rvt_qp *qp = to_iqp(ibqp);
523         struct rvt_rwq *wq = qp->r_rq.wq;
524         unsigned long flags;
525         int ret;
526
527         /* Check that state is OK to post receive. */
528         if (!(ib_hfi1_state_ops[qp->state] & HFI1_POST_RECV_OK) || !wq) {
529                 *bad_wr = wr;
530                 ret = -EINVAL;
531                 goto bail;
532         }
533
534         for (; wr; wr = wr->next) {
535                 struct rvt_rwqe *wqe;
536                 u32 next;
537                 int i;
538
539                 if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
540                         *bad_wr = wr;
541                         ret = -EINVAL;
542                         goto bail;
543                 }
544
545                 spin_lock_irqsave(&qp->r_rq.lock, flags);
546                 next = wq->head + 1;
547                 if (next >= qp->r_rq.size)
548                         next = 0;
549                 if (next == wq->tail) {
550                         spin_unlock_irqrestore(&qp->r_rq.lock, flags);
551                         *bad_wr = wr;
552                         ret = -ENOMEM;
553                         goto bail;
554                 }
555
556                 wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
557                 wqe->wr_id = wr->wr_id;
558                 wqe->num_sge = wr->num_sge;
559                 for (i = 0; i < wr->num_sge; i++)
560                         wqe->sg_list[i] = wr->sg_list[i];
561                 /* Make sure queue entry is written before the head index. */
562                 smp_wmb();
563                 wq->head = next;
564                 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
565         }
566         ret = 0;
567
568 bail:
569         return ret;
570 }
571
572 /*
573  * Make sure the QP is ready and able to accept the given opcode.
574  */
575 static inline int qp_ok(int opcode, struct hfi1_packet *packet)
576 {
577         struct hfi1_ibport *ibp;
578
579         if (!(ib_hfi1_state_ops[packet->qp->state] & HFI1_PROCESS_RECV_OK))
580                 goto dropit;
581         if (((opcode & OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
582             (opcode == IB_OPCODE_CNP))
583                 return 1;
584 dropit:
585         ibp = &packet->rcd->ppd->ibport_data;
586         ibp->rvp.n_pkt_drops++;
587         return 0;
588 }
589
590
591 /**
592  * hfi1_ib_rcv - process an incoming packet
593  * @packet: data packet information
594  *
595  * This is called to process an incoming packet at interrupt level.
596  *
597  * Tlen is the length of the header + data + CRC in bytes.
598  */
599 void hfi1_ib_rcv(struct hfi1_packet *packet)
600 {
601         struct hfi1_ctxtdata *rcd = packet->rcd;
602         struct hfi1_ib_header *hdr = packet->hdr;
603         u32 tlen = packet->tlen;
604         struct hfi1_pportdata *ppd = rcd->ppd;
605         struct hfi1_ibport *ibp = &ppd->ibport_data;
606         unsigned long flags;
607         u32 qp_num;
608         int lnh;
609         u8 opcode;
610         u16 lid;
611
612         /* Check for GRH */
613         lnh = be16_to_cpu(hdr->lrh[0]) & 3;
614         if (lnh == HFI1_LRH_BTH)
615                 packet->ohdr = &hdr->u.oth;
616         else if (lnh == HFI1_LRH_GRH) {
617                 u32 vtf;
618
619                 packet->ohdr = &hdr->u.l.oth;
620                 if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
621                         goto drop;
622                 vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
623                 if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
624                         goto drop;
625                 packet->rcv_flags |= HFI1_HAS_GRH;
626         } else
627                 goto drop;
628
629         trace_input_ibhdr(rcd->dd, hdr);
630
631         opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
632         inc_opstats(tlen, &rcd->opstats->stats[opcode]);
633
634         /* Get the destination QP number. */
635         qp_num = be32_to_cpu(packet->ohdr->bth[1]) & HFI1_QPN_MASK;
636         lid = be16_to_cpu(hdr->lrh[1]);
637         if (unlikely((lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
638                      (lid != be16_to_cpu(IB_LID_PERMISSIVE)))) {
639                 struct hfi1_mcast *mcast;
640                 struct hfi1_mcast_qp *p;
641
642                 if (lnh != HFI1_LRH_GRH)
643                         goto drop;
644                 mcast = hfi1_mcast_find(ibp, &hdr->u.l.grh.dgid);
645                 if (mcast == NULL)
646                         goto drop;
647                 list_for_each_entry_rcu(p, &mcast->qp_list, list) {
648                         packet->qp = p->qp;
649                         spin_lock_irqsave(&packet->qp->r_lock, flags);
650                         if (likely((qp_ok(opcode, packet))))
651                                 opcode_handler_tbl[opcode](packet);
652                         spin_unlock_irqrestore(&packet->qp->r_lock, flags);
653                 }
654                 /*
655                  * Notify hfi1_multicast_detach() if it is waiting for us
656                  * to finish.
657                  */
658                 if (atomic_dec_return(&mcast->refcount) <= 1)
659                         wake_up(&mcast->wait);
660         } else {
661                 rcu_read_lock();
662                 packet->qp = hfi1_lookup_qpn(ibp, qp_num);
663                 if (!packet->qp) {
664                         rcu_read_unlock();
665                         goto drop;
666                 }
667                 spin_lock_irqsave(&packet->qp->r_lock, flags);
668                 if (likely((qp_ok(opcode, packet))))
669                         opcode_handler_tbl[opcode](packet);
670                 spin_unlock_irqrestore(&packet->qp->r_lock, flags);
671                 rcu_read_unlock();
672         }
673         return;
674
675 drop:
676         ibp->rvp.n_pkt_drops++;
677 }
678
679 /*
680  * This is called from a timer to check for QPs
681  * which need kernel memory in order to send a packet.
682  */
683 static void mem_timer(unsigned long data)
684 {
685         struct hfi1_ibdev *dev = (struct hfi1_ibdev *)data;
686         struct list_head *list = &dev->memwait;
687         struct rvt_qp *qp = NULL;
688         struct iowait *wait;
689         unsigned long flags;
690         struct hfi1_qp_priv *priv;
691
692         write_seqlock_irqsave(&dev->iowait_lock, flags);
693         if (!list_empty(list)) {
694                 wait = list_first_entry(list, struct iowait, list);
695                 qp = iowait_to_qp(wait);
696                 priv = qp->priv;
697                 list_del_init(&priv->s_iowait.list);
698                 /* refcount held until actual wake up */
699                 if (!list_empty(list))
700                         mod_timer(&dev->mem_timer, jiffies + 1);
701         }
702         write_sequnlock_irqrestore(&dev->iowait_lock, flags);
703
704         if (qp)
705                 hfi1_qp_wakeup(qp, RVT_S_WAIT_KMEM);
706 }
707
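/**
 * update_sge - advance the current SGE by a number of bytes
 * @ss: the SGE state
 * @length: the number of bytes to advance by
 *
 * Moves to the next SGE or the next MR segment as needed; unlike
 * hfi1_skip_sge(), MR references are never released here.
 */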
708 void update_sge(struct rvt_sge_state *ss, u32 length)
709 {
710         struct rvt_sge *sge = &ss->sge;
711
712         sge->vaddr += length;
713         sge->length -= length;
714         sge->sge_length -= length;
715         if (sge->sge_length == 0) {
716                 if (--ss->num_sge)
717                         *sge = *ss->sg_list++;
718         } else if (sge->length == 0 && sge->mr->lkey) {
719                 if (++sge->n >= RVT_SEGSZ) {
720                         if (++sge->m >= sge->mr->mapsz)
721                                 return;
722                         sge->n = 0;
723                 }
724                 sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
725                 sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
726         }
727 }
728
729 static noinline struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
730                                                 struct rvt_qp *qp)
731 {
732         struct hfi1_qp_priv *priv = qp->priv;
733         struct verbs_txreq *tx;
734         unsigned long flags;
735
736         tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
737         if (!tx) {
738                 spin_lock_irqsave(&qp->s_lock, flags);
739                 write_seqlock(&dev->iowait_lock);
740                 if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK &&
741                     list_empty(&priv->s_iowait.list)) {
742                         dev->n_txwait++;
743                         qp->s_flags |= RVT_S_WAIT_TX;
744                         list_add_tail(&priv->s_iowait.list, &dev->txwait);
745                         trace_hfi1_qpsleep(qp, RVT_S_WAIT_TX);
746                         atomic_inc(&qp->refcount);
747                 }
748                 qp->s_flags &= ~RVT_S_BUSY;
749                 write_sequnlock(&dev->iowait_lock);
750                 spin_unlock_irqrestore(&qp->s_lock, flags);
751                 tx = ERR_PTR(-EBUSY);
752         }
753         return tx;
754 }
755
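/*
 * Fast path allocation of a verbs_txreq from the slab cache.  On
 * failure, fall back to __get_txreq(), which retries the allocation
 * and, if it still fails, queues the QP on dev->txwait (clearing
 * RVT_S_BUSY) and returns ERR_PTR(-EBUSY).
 */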
756 static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
757                                             struct rvt_qp *qp)
758 {
759         struct verbs_txreq *tx;
760
761         tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
762         if (!tx) {
763                 /* call slow path to get the lock */
764                 tx =  __get_txreq(dev, qp);
765                 if (IS_ERR(tx))
766                         return tx;
767         }
768         tx->qp = qp;
769         return tx;
770 }
771
772 void hfi1_put_txreq(struct verbs_txreq *tx)
773 {
774         struct hfi1_ibdev *dev;
775         struct rvt_qp *qp;
776         unsigned long flags;
777         unsigned int seq;
778         struct hfi1_qp_priv *priv;
779
780         qp = tx->qp;
781         dev = to_idev(qp->ibqp.device);
782
783         if (tx->mr) {
784                 rvt_put_mr(tx->mr);
785                 tx->mr = NULL;
786         }
787         sdma_txclean(dd_from_dev(dev), &tx->txreq);
788
789         /* Free verbs_txreq and return to slab cache */
790         kmem_cache_free(dev->verbs_txreq_cache, tx);
791
792         do {
793                 seq = read_seqbegin(&dev->iowait_lock);
794                 if (!list_empty(&dev->txwait)) {
795                         struct iowait *wait;
796
797                         write_seqlock_irqsave(&dev->iowait_lock, flags);
798                         /* Wake up first QP wanting a free struct */
799                         wait = list_first_entry(&dev->txwait, struct iowait,
800                                                 list);
801                         qp = iowait_to_qp(wait);
802                         priv = qp->priv;
803                         list_del_init(&priv->s_iowait.list);
804                         /* refcount held until actual wake up */
805                         write_sequnlock_irqrestore(&dev->iowait_lock, flags);
806                         hfi1_qp_wakeup(qp, RVT_S_WAIT_TX);
807                         break;
808                 }
809         } while (read_seqretry(&dev->iowait_lock, seq));
810 }
811
812 /*
813  * This is called with progress side lock held.
814  */
815 /* New API */
816 static void verbs_sdma_complete(
817         struct sdma_txreq *cookie,
818         int status,
819         int drained)
820 {
821         struct verbs_txreq *tx =
822                 container_of(cookie, struct verbs_txreq, txreq);
823         struct rvt_qp *qp = tx->qp;
824
825         spin_lock(&qp->s_lock);
826         if (tx->wqe)
827                 hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
828         else if (qp->ibqp.qp_type == IB_QPT_RC) {
829                 struct hfi1_ib_header *hdr;
830
831                 hdr = &tx->phdr.hdr;
832                 hfi1_rc_send_complete(qp, hdr);
833         }
834         if (drained) {
835                 /*
836                  * This happens when the send engine notes
837                  * a QP in the error state and cannot
838                  * do the flush work until that QP's
839                  * sdma work has finished.
840                  */
841                 if (qp->s_flags & RVT_S_WAIT_DMA) {
842                         qp->s_flags &= ~RVT_S_WAIT_DMA;
843                         hfi1_schedule_send(qp);
844                 }
845         }
846         spin_unlock(&qp->s_lock);
847
848         hfi1_put_txreq(tx);
849 }
850
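/*
 * Called when a kernel memory allocation fails: queue the QP on
 * dev->memwait, arm dev->mem_timer if the list was empty, set
 * RVT_S_WAIT_KMEM, clear RVT_S_BUSY and return -EBUSY so the send is
 * retried later.  Returns 0 when the QP is no longer in a state that
 * allows processing, so the request is flushed instead.
 */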
851 static int wait_kmem(struct hfi1_ibdev *dev, struct rvt_qp *qp)
852 {
853         struct hfi1_qp_priv *priv = qp->priv;
854         unsigned long flags;
855         int ret = 0;
856
857         spin_lock_irqsave(&qp->s_lock, flags);
858         if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) {
859                 write_seqlock(&dev->iowait_lock);
860                 if (list_empty(&priv->s_iowait.list)) {
861                         if (list_empty(&dev->memwait))
862                                 mod_timer(&dev->mem_timer, jiffies + 1);
863                         qp->s_flags |= RVT_S_WAIT_KMEM;
864                         list_add_tail(&priv->s_iowait.list, &dev->memwait);
865                         trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
866                         atomic_inc(&qp->refcount);
867                 }
868                 write_sequnlock(&dev->iowait_lock);
869                 qp->s_flags &= ~RVT_S_BUSY;
870                 ret = -EBUSY;
871         }
872         spin_unlock_irqrestore(&qp->s_lock, flags);
873
874         return ret;
875 }
876
877 /*
878  * This routine calls txadds for each sg entry.
879  *
880  * Add failures will revert the sge cursor
881  */
882 static int build_verbs_ulp_payload(
883         struct sdma_engine *sde,
884         struct rvt_sge_state *ss,
885         u32 length,
886         struct verbs_txreq *tx)
887 {
888         struct rvt_sge *sg_list = ss->sg_list;
889         struct rvt_sge sge = ss->sge;
890         u8 num_sge = ss->num_sge;
891         u32 len;
892         int ret = 0;
893
894         while (length) {
895                 len = ss->sge.length;
896                 if (len > length)
897                         len = length;
898                 if (len > ss->sge.sge_length)
899                         len = ss->sge.sge_length;
900                 WARN_ON_ONCE(len == 0);
901                 ret = sdma_txadd_kvaddr(
902                         sde->dd,
903                         &tx->txreq,
904                         ss->sge.vaddr,
905                         len);
906                 if (ret)
907                         goto bail_txadd;
908                 update_sge(ss, len);
909                 length -= len;
910         }
911         return ret;
912 bail_txadd:
913         /* unwind cursor */
914         ss->sge = sge;
915         ss->num_sge = num_sge;
916         ss->sg_list = sg_list;
917         return ret;
918 }
919
920 /*
921  * Build the number of DMA descriptors needed to send length bytes of data.
922  *
923  * NOTE: DMA mapping is held in the tx until completed in the ring or
924  *       the tx desc is freed without having been submitted to the ring
925  *
926  * This routine ensures that all the helper routine
927  * calls succeed.
928  */
929 /* New API */
930 static int build_verbs_tx_desc(
931         struct sdma_engine *sde,
932         struct rvt_sge_state *ss,
933         u32 length,
934         struct verbs_txreq *tx,
935         struct ahg_ib_header *ahdr,
936         u64 pbc)
937 {
938         int ret = 0;
939         struct hfi1_pio_header *phdr;
940         u16 hdrbytes = tx->hdr_dwords << 2;
941
942         phdr = &tx->phdr;
943         if (!ahdr->ahgcount) {
944                 ret = sdma_txinit_ahg(
945                         &tx->txreq,
946                         ahdr->tx_flags,
947                         hdrbytes + length,
948                         ahdr->ahgidx,
949                         0,
950                         NULL,
951                         0,
952                         verbs_sdma_complete);
953                 if (ret)
954                         goto bail_txadd;
955                 phdr->pbc = cpu_to_le64(pbc);
956                 memcpy(&phdr->hdr, &ahdr->ibh, hdrbytes - sizeof(phdr->pbc));
957                 /* add the header */
958                 ret = sdma_txadd_kvaddr(
959                         sde->dd,
960                         &tx->txreq,
961                         &tx->phdr,
962                         tx->hdr_dwords << 2);
963                 if (ret)
964                         goto bail_txadd;
965         } else {
966                 struct hfi1_other_headers *sohdr = &ahdr->ibh.u.oth;
967                 struct hfi1_other_headers *dohdr = &phdr->hdr.u.oth;
968
969                 /* needed in rc_send_complete() */
970                 phdr->hdr.lrh[0] = ahdr->ibh.lrh[0];
971                 if ((be16_to_cpu(phdr->hdr.lrh[0]) & 3) == HFI1_LRH_GRH) {
972                         sohdr = &ahdr->ibh.u.l.oth;
973                         dohdr = &phdr->hdr.u.l.oth;
974                 }
975                 /* opcode */
976                 dohdr->bth[0] = sohdr->bth[0];
977                 /* PSN/ACK  */
978                 dohdr->bth[2] = sohdr->bth[2];
979                 ret = sdma_txinit_ahg(
980                         &tx->txreq,
981                         ahdr->tx_flags,
982                         length,
983                         ahdr->ahgidx,
984                         ahdr->ahgcount,
985                         ahdr->ahgdesc,
986                         hdrbytes,
987                         verbs_sdma_complete);
988                 if (ret)
989                         goto bail_txadd;
990         }
991
992         /* add the ulp payload - if any.  ss can be NULL for acks */
993         if (ss)
994                 ret = build_verbs_ulp_payload(sde, ss, length, tx);
995 bail_txadd:
996         return ret;
997 }
998
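/**
 * hfi1_verbs_send_dma - send a packet using SDMA
 * @qp: the QP to send on
 * @ps: the packet state
 * @pbc: the PBC to use, or 0 to have one built here
 *
 * Resubmits a txreq already queued on the QP's iowait tx_head if there
 * is one; otherwise allocates a verbs_txreq, builds the SDMA descriptor
 * list and submits it to the QP's SDMA engine.  A descriptor build
 * failure waits for kernel memory via wait_kmem(); -ECOMM from the
 * engine is treated as already sent.
 */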
999 int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
1000                         u64 pbc)
1001 {
1002         struct hfi1_qp_priv *priv = qp->priv;
1003         struct ahg_ib_header *ahdr = priv->s_hdr;
1004         u32 hdrwords = qp->s_hdrwords;
1005         struct rvt_sge_state *ss = qp->s_cur_sge;
1006         u32 len = qp->s_cur_size;
1007         u32 plen = hdrwords + ((len + 3) >> 2) + 2; /* includes pbc */
1008         struct hfi1_ibdev *dev = ps->dev;
1009         struct hfi1_pportdata *ppd = ps->ppd;
1010         struct verbs_txreq *tx;
1011         struct sdma_txreq *stx;
1012         u64 pbc_flags = 0;
1013         u8 sc5 = priv->s_sc;
1014
1015         int ret;
1016
1017         if (!list_empty(&priv->s_iowait.tx_head)) {
1018                 stx = list_first_entry(
1019                         &priv->s_iowait.tx_head,
1020                         struct sdma_txreq,
1021                         list);
1022                 list_del_init(&stx->list);
1023                 tx = container_of(stx, struct verbs_txreq, txreq);
1024                 ret = sdma_send_txreq(tx->sde, &priv->s_iowait, stx);
1025                 if (unlikely(ret == -ECOMM))
1026                         goto bail_ecomm;
1027                 return ret;
1028         }
1029
1030         tx = get_txreq(dev, qp);
1031         if (IS_ERR(tx))
1032                 goto bail_tx;
1033
1034         tx->sde = priv->s_sde;
1035
1036         if (likely(pbc == 0)) {
1037                 u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
1038                 /* No vl15 here */
1039                 /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
1040                 pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
1041
1042                 pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
1043         }
1044         tx->wqe = qp->s_wqe;
1045         tx->mr = qp->s_rdma_mr;
1046         if (qp->s_rdma_mr)
1047                 qp->s_rdma_mr = NULL;
1048         tx->hdr_dwords = hdrwords + 2;
1049         ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahdr, pbc);
1050         if (unlikely(ret))
1051                 goto bail_build;
1052         trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &ahdr->ibh);
1053         ret =  sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq);
1054         if (unlikely(ret == -ECOMM))
1055                 goto bail_ecomm;
1056         return ret;
1057
1058 bail_ecomm:
1059         /* The current one got "sent" */
1060         return 0;
1061 bail_build:
1062         /* kmalloc or mapping fail */
1063         hfi1_put_txreq(tx);
1064         return wait_kmem(dev, qp);
1065 bail_tx:
1066         return PTR_ERR(tx);
1067 }
1068
1069 /*
1070  * If we are now in the error state, return zero to flush the
1071  * send work request.
1072  */
1073 static int no_bufs_available(struct rvt_qp *qp, struct send_context *sc)
1074 {
1075         struct hfi1_qp_priv *priv = qp->priv;
1076         struct hfi1_devdata *dd = sc->dd;
1077         struct hfi1_ibdev *dev = &dd->verbs_dev;
1078         unsigned long flags;
1079         int ret = 0;
1080
1081         /*
1082          * Note that as soon as hfi1_sc_wantpiobuf_intr() is called and
1083          * possibly before it returns, sc_piobufavail()
1084          * could be called. Therefore, put the QP on the I/O wait list before
1085          * enabling the PIO avail interrupt.
1086          */
1087         spin_lock_irqsave(&qp->s_lock, flags);
1088         if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) {
1089                 write_seqlock(&dev->iowait_lock);
1090                 if (list_empty(&priv->s_iowait.list)) {
1091                         struct hfi1_ibdev *dev = &dd->verbs_dev;
1092                         int was_empty;
1093
1094                         dev->n_piowait++;
1095                         qp->s_flags |= RVT_S_WAIT_PIO;
1096                         was_empty = list_empty(&sc->piowait);
1097                         list_add_tail(&priv->s_iowait.list, &sc->piowait);
1098                         trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
1099                         atomic_inc(&qp->refcount);
1100                         /* counting: only call wantpiobuf_intr if first user */
1101                         if (was_empty)
1102                                 hfi1_sc_wantpiobuf_intr(sc, 1);
1103                 }
1104                 write_sequnlock(&dev->iowait_lock);
1105                 qp->s_flags &= ~RVT_S_BUSY;
1106                 ret = -EBUSY;
1107         }
1108         spin_unlock_irqrestore(&qp->s_lock, flags);
1109         return ret;
1110 }
1111
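/*
 * Map a QP's SC to a VL and return the corresponding per-VL send
 * context, or NULL if the VL is not supported (VL15 is always allowed).
 */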
1112 struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5)
1113 {
1114         struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
1115         struct hfi1_pportdata *ppd = dd->pport + (qp->port_num - 1);
1116         u8 vl;
1117
1118         vl = sc_to_vlt(dd, sc5);
1119         if (vl >= ppd->vls_supported && vl != 15)
1120                 return NULL;
1121         return dd->vld[vl].sc;
1122 }
1123
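/**
 * hfi1_verbs_send_pio - send a packet using PIO
 * @qp: the QP to send on
 * @ps: the packet state
 * @pbc: the PBC to use, or 0 to have one built here
 *
 * Copies the header and payload into a PIO buffer for the send context
 * matching the QP's SC.  If no buffer is free, the QP is queued on the
 * context's piowait list while the link is active, or the WQE is
 * completed with an error when it is not.  Any send completion is
 * generated under qp->s_lock before returning.
 */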
1124 int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
1125                         u64 pbc)
1126 {
1127         struct hfi1_qp_priv *priv = qp->priv;
1128         struct ahg_ib_header *ahdr = priv->s_hdr;
1129         u32 hdrwords = qp->s_hdrwords;
1130         struct rvt_sge_state *ss = qp->s_cur_sge;
1131         u32 len = qp->s_cur_size;
1132         u32 dwords = (len + 3) >> 2;
1133         u32 plen = hdrwords + dwords + 2; /* includes pbc */
1134         struct hfi1_pportdata *ppd = ps->ppd;
1135         u32 *hdr = (u32 *)&ahdr->ibh;
1136         u64 pbc_flags = 0;
1137         u32 sc5;
1138         unsigned long flags = 0;
1139         struct send_context *sc;
1140         struct pio_buf *pbuf;
1141         int wc_status = IB_WC_SUCCESS;
1142
1143         /* vl15 special case taken care of in ud.c */
1144         sc5 = priv->s_sc;
1145         sc = qp_to_send_context(qp, sc5);
1146
1147         if (!sc)
1148                 return -EINVAL;
1149         if (likely(pbc == 0)) {
1150                 u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
1151                 /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
1152                 pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
1153                 pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
1154         }
1155         pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
1156         if (unlikely(pbuf == NULL)) {
1157                 if (ppd->host_link_state != HLS_UP_ACTIVE) {
1158                         /*
1159                          * If we have filled the PIO buffers to capacity and are
1160                          * not in an active state, this request is not going to
1161                          * go out, so just complete it with an error or else a
1162                          * ULP or the core may be stuck waiting.
1163                          */
1164                         hfi1_cdbg(
1165                                 PIO,
1166                                 "alloc failed. state not active, completing");
1167                         wc_status = IB_WC_GENERAL_ERR;
1168                         goto pio_bail;
1169                 } else {
1170                         /*
1171                          * This is a normal occurrence. The PIO buffers are
1172                          * full, but we are still happily sending, so let's
1173                          * continue to queue the request.
1174                          */
1175                         hfi1_cdbg(PIO, "alloc failed. state active, queuing");
1176                         return no_bufs_available(qp, sc);
1177                 }
1178         }
1179
1180         if (len == 0) {
1181                 pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords);
1182         } else {
1183                 if (ss) {
1184                         seg_pio_copy_start(pbuf, pbc, hdr, hdrwords * 4);
1185                         while (len) {
1186                                 void *addr = ss->sge.vaddr;
1187                                 u32 slen = ss->sge.length;
1188
1189                                 if (slen > len)
1190                                         slen = len;
1191                                 update_sge(ss, slen);
1192                                 seg_pio_copy_mid(pbuf, addr, slen);
1193                                 len -= slen;
1194                         }
1195                         seg_pio_copy_end(pbuf);
1196                 }
1197         }
1198
1199         trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &ahdr->ibh);
1200
1201         if (qp->s_rdma_mr) {
1202                 rvt_put_mr(qp->s_rdma_mr);
1203                 qp->s_rdma_mr = NULL;
1204         }
1205
1206 pio_bail:
1207         if (qp->s_wqe) {
1208                 spin_lock_irqsave(&qp->s_lock, flags);
1209                 hfi1_send_complete(qp, qp->s_wqe, wc_status);
1210                 spin_unlock_irqrestore(&qp->s_lock, flags);
1211         } else if (qp->ibqp.qp_type == IB_QPT_RC) {
1212                 spin_lock_irqsave(&qp->s_lock, flags);
1213                 hfi1_rc_send_complete(qp, &ahdr->ibh);
1214                 spin_unlock_irqrestore(&qp->s_lock, flags);
1215         }
1216         return 0;
1217 }
1218
1219 /*
1220  * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
1221  * being an entry from the ingress partition key table), return 0
1222  * otherwise. Use the matching criteria for egress partition keys
1223  * specified in the OPAv1 spec., section 9.11.7.
1224  */
1225 static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
1226 {
1227         u16 mkey = pkey & PKEY_LOW_15_MASK;
1228         u16 ment = ent & PKEY_LOW_15_MASK;
1229
1230         if (mkey == ment) {
1231                 /*
1232                  * If pkey[15] is set (full partition member),
1233                  * is bit 15 in the corresponding table element
1234                  * clear (limited member)?
1235                  */
1236                 if (pkey & PKEY_MEMBER_MASK)
1237                         return !!(ent & PKEY_MEMBER_MASK);
1238                 return 1;
1239         }
1240         return 0;
1241 }
1242
1243 /*
1244  * egress_pkey_check - return 0 if hdr's pkey matches according to the
1245  * criteria in the OPAv1 spec., section 9.11.7.
1246  */
1247 static inline int egress_pkey_check(struct hfi1_pportdata *ppd,
1248                                     struct hfi1_ib_header *hdr,
1249                                     struct rvt_qp *qp)
1250 {
1251         struct hfi1_qp_priv *priv = qp->priv;
1252         struct hfi1_other_headers *ohdr;
1253         struct hfi1_devdata *dd;
1254         int i = 0;
1255         u16 pkey;
1256         u8 lnh, sc5 = priv->s_sc;
1257
1258         if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
1259                 return 0;
1260
1261         /* locate the pkey within the headers */
1262         lnh = be16_to_cpu(hdr->lrh[0]) & 3;
1263         if (lnh == HFI1_LRH_GRH)
1264                 ohdr = &hdr->u.l.oth;
1265         else
1266                 ohdr = &hdr->u.oth;
1267
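        /* the P_Key is carried in the low 16 bits of BTH dword 0 */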
1268         pkey = (u16)be32_to_cpu(ohdr->bth[0]);
1269
1270         /* If SC15, pkey[0:14] must be 0x7fff */
1271         if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
1272                 goto bad;
1273
1274
1275         /* Is the pkey = 0x0, or 0x8000? */
1276         if ((pkey & PKEY_LOW_15_MASK) == 0)
1277                 goto bad;
1278
1279         /* The most likely matching pkey has index qp->s_pkey_index */
1280         if (unlikely(!egress_pkey_matches_entry(pkey,
1281                                         ppd->pkeys[qp->s_pkey_index]))) {
1282                 /* no match - try the entire table */
1283                 for (; i < MAX_PKEY_VALUES; i++) {
1284                         if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
1285                                 break;
1286                 }
1287         }
1288
1289         if (i < MAX_PKEY_VALUES)
1290                 return 0;
1291 bad:
1292         incr_cntr64(&ppd->port_xmit_constraint_errors);
1293         dd = ppd->dd;
1294         if (!(dd->err_info_xmit_constraint.status & OPA_EI_STATUS_SMASK)) {
1295                 u16 slid = be16_to_cpu(hdr->lrh[3]);
1296
1297                 dd->err_info_xmit_constraint.status |= OPA_EI_STATUS_SMASK;
1298                 dd->err_info_xmit_constraint.slid = slid;
1299                 dd->err_info_xmit_constraint.pkey = pkey;
1300         }
1301         return 1;
1302 }
1303
1304 /**
1305  * hfi1_verbs_send - send a packet
1306  * @qp: the QP to send on
1307  * @ps: the state of the packet to send
1308  *
1309  * Return zero if packet is sent or queued OK.
1310  * Return non-zero and clear qp->s_flags RVT_S_BUSY otherwise.
1311  */
1312 int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
1313 {
1314         struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
1315         struct hfi1_qp_priv *priv = qp->priv;
1316         struct ahg_ib_header *ahdr = priv->s_hdr;
1317         int ret;
1318         int pio = 0;
1319         unsigned long flags = 0;
1320
1321         /*
1322          * VL15 packets (IB_QPT_SMI) will always use PIO, so we
1323          * can defer SDMA restart until link goes ACTIVE without
1324          * worrying about just how we got there.
1325          */
1326         if ((qp->ibqp.qp_type == IB_QPT_SMI) ||
1327             !(dd->flags & HFI1_HAS_SEND_DMA))
1328                 pio = 1;
1329
1330         ret = egress_pkey_check(dd->pport, &ahdr->ibh, qp);
1331         if (unlikely(ret)) {
1332                 /*
1333                  * The value we are returning here does not get propagated to
1334                  * the verbs caller. Thus we need to complete the request with
1335                  * error otherwise the caller could be sitting waiting on the
1336                  * completion event. Only do this for PIO. SDMA has its own
1337                  * mechanism for handling the errors. So for SDMA we can just
1338                  * return.
1339                  */
1340                 if (pio) {
1341                         hfi1_cdbg(PIO, "%s() Failed. Completing with err",
1342                                   __func__);
1343                         spin_lock_irqsave(&qp->s_lock, flags);
1344                         hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
1345                         spin_unlock_irqrestore(&qp->s_lock, flags);
1346                 }
1347                 return -EINVAL;
1348         }
1349
1350         if (pio) {
1351                 ret = dd->process_pio_send(qp, ps, 0);
1352         } else {
1353 #ifdef CONFIG_SDMA_VERBOSITY
1354                 dd_dev_err(dd, "CONFIG SDMA %s:%d %s()\n",
1355                            slashstrip(__FILE__), __LINE__, __func__);
1356                 dd_dev_err(dd, "SDMA hdrwords = %u, len = %u\n", qp->s_hdrwords,
1357                            qp->s_cur_size);
1358 #endif
1359                 ret = dd->process_dma_send(qp, ps, 0);
1360         }
1361
1362         return ret;
1363 }
1364
1365 /**
1366  * hfi1_fill_device_attr - Fill in rvt dev info device attributes.
1367  * @dd: the device data structure
1368  */
1369 static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
1370 {
1371         struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
1372
1373         memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props));
1374
1375         rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
1376                         IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
1377                         IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
1378                         IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
1379         rdi->dparms.props.page_size_cap = PAGE_SIZE;
1380         rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3;
1381         rdi->dparms.props.vendor_part_id = dd->pcidev->device;
1382         rdi->dparms.props.hw_ver = dd->minrev;
1383         rdi->dparms.props.sys_image_guid = ib_hfi1_sys_image_guid;
1384         rdi->dparms.props.max_mr_size = ~0ULL;
1385         rdi->dparms.props.max_qp = hfi1_max_qps;
1386         rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs;
1387         rdi->dparms.props.max_sge = hfi1_max_sges;
1388         rdi->dparms.props.max_sge_rd = hfi1_max_sges;
1389         rdi->dparms.props.max_cq = hfi1_max_cqs;
1390         rdi->dparms.props.max_ah = hfi1_max_ahs;
1391         rdi->dparms.props.max_cqe = hfi1_max_cqes;
1392         rdi->dparms.props.max_mr = rdi->lkey_table.max;
1393         rdi->dparms.props.max_fmr = rdi->lkey_table.max;
1394         rdi->dparms.props.max_map_per_fmr = 32767;
1395         rdi->dparms.props.max_pd = hfi1_max_pds;
1396         rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC;
1397         rdi->dparms.props.max_qp_init_rd_atom = 255;
1398         rdi->dparms.props.max_srq = hfi1_max_srqs;
1399         rdi->dparms.props.max_srq_wr = hfi1_max_srq_wrs;
1400         rdi->dparms.props.max_srq_sge = hfi1_max_srq_sges;
1401         rdi->dparms.props.atomic_cap = IB_ATOMIC_GLOB;
1402         rdi->dparms.props.max_pkeys = hfi1_get_npkeys(dd);
1403         rdi->dparms.props.max_mcast_grp = hfi1_max_mcast_grps;
1404         rdi->dparms.props.max_mcast_qp_attach = hfi1_max_mcast_qp_attached;
1405         rdi->dparms.props.max_total_mcast_qp_attach =
1406                                         rdi->dparms.props.max_mcast_qp_attach *
1407                                         rdi->dparms.props.max_mcast_grp;
1408 }
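
/*
 * Illustrative sketch (not part of this driver): the limits filled in above
 * are what ULPs and userspace ultimately see.  The fragment below uses the
 * standard libibverbs calls ibv_get_device_list(), ibv_open_device() and
 * ibv_query_device() to read a few of them back; it assumes libibverbs is
 * available and at least one verbs device exists, and keeps error handling
 * minimal.
 */
#include <stdio.h>
#include <infiniband/verbs.h>

static int print_device_limits(void)
{
	struct ibv_device **list = ibv_get_device_list(NULL);
	struct ibv_context *ctx;
	struct ibv_device_attr attr;
	int ret = -1;

	if (!list)
		return -1;
	if (!list[0])
		goto out;

	ctx = ibv_open_device(list[0]);
	if (!ctx)
		goto out;

	ret = ibv_query_device(ctx, &attr);
	if (!ret)
		printf("max_qp=%d max_cqe=%d max_pd=%d max_ah=%d\n",
		       attr.max_qp, attr.max_cqe, attr.max_pd, attr.max_ah);
	ibv_close_device(ctx);
out:
	ibv_free_device_list(list);
	return ret;
}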
1409
1410 static inline u16 opa_speed_to_ib(u16 in)
1411 {
1412         u16 out = 0;
1413
1414         if (in & OPA_LINK_SPEED_25G)
1415                 out |= IB_SPEED_EDR;
1416         if (in & OPA_LINK_SPEED_12_5G)
1417                 out |= IB_SPEED_FDR;
1418
1419         return out;
1420 }
1421
1422 /*
1423  * Convert a single OPA link width (no multiple flags) to an IB value.
1424  * A zero OPA link width means link down, which means the IB width value
1425  * is a don't care.
1426  */
1427 static inline u16 opa_width_to_ib(u16 in)
1428 {
1429         switch (in) {
1430         case OPA_LINK_WIDTH_1X:
1431         /* map 2x and 3x to 1x as they don't exist in IB */
1432         case OPA_LINK_WIDTH_2X:
1433         case OPA_LINK_WIDTH_3X:
1434                 return IB_WIDTH_1X;
1435         default: /* link down or unknown, return our largest width */
1436         case OPA_LINK_WIDTH_4X:
1437                 return IB_WIDTH_4X;
1438         }
1439 }
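
/*
 * Worked example (illustrative, not from this file): with the two helpers
 * above, an active OPA link reporting OPA_LINK_WIDTH_4X and
 * OPA_LINK_SPEED_25G is presented to the IB core as IB_WIDTH_4X and
 * IB_SPEED_EDR.  rate_show() in the IB core sysfs code then reports the
 * nominal rate of roughly 4 lanes * 25 Gb/s = 100 Gb/s.
 */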
1440
1441 static int query_port(struct ib_device *ibdev, u8 port,
1442                       struct ib_port_attr *props)
1443 {
1444         struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
1445         struct hfi1_ibport *ibp = to_iport(ibdev, port);
1446         struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1447         u16 lid = ppd->lid;
1448
1449         memset(props, 0, sizeof(*props));
1450         props->lid = lid ? lid : 0;
1451         props->lmc = ppd->lmc;
1452         props->sm_lid = ibp->rvp.sm_lid;
1453         props->sm_sl = ibp->rvp.sm_sl;
1454         /* OPA logical states match IB logical states */
1455         props->state = driver_lstate(ppd);
1456         props->phys_state = hfi1_ibphys_portstate(ppd);
1457         props->port_cap_flags = ibp->rvp.port_cap_flags;
1458         props->gid_tbl_len = HFI1_GUIDS_PER_PORT;
1459         props->max_msg_sz = 0x80000000;
1460         props->pkey_tbl_len = hfi1_get_npkeys(dd);
1461         props->bad_pkey_cntr = ibp->rvp.pkey_violations;
1462         props->qkey_viol_cntr = ibp->rvp.qkey_violations;
1463         props->active_width = (u8)opa_width_to_ib(ppd->link_width_active);
1464         /* see rate_show() in ib core/sysfs.c */
1465         props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active);
1466         props->max_vl_num = ppd->vls_supported;
1467         props->init_type_reply = 0;
1468
1469         /* Once we are a "first class" citizen and have added the OPA MTUs to
1470          * the core, we can advertise the larger MTU enum to the ULPs; for now,
1471          * advertise only 4K.
1472          *
1473          * Applications which are either OPA aware or which pass the MTU enum
1474          * from the Path Records to us will get the larger 8k MTU.  Those that
1475          * attempt to process the MTU enum themselves may fail in various ways.
1476          */
1477         props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ?
1478                                       4096 : hfi1_max_mtu), IB_MTU_4096);
1479         props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu :
1480                 mtu_to_enum(ppd->ibmtu, IB_MTU_2048);
1481         props->subnet_timeout = ibp->rvp.subnet_timeout;
1482
1483         return 0;
1484 }
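
/*
 * Illustrative sketch (not part of this driver): the attributes assembled
 * by query_port() above reach userspace via ibv_query_port().  The fragment
 * below assumes "ctx" is an already-open ibv_context and that port 1 is
 * being queried; error handling is minimal.
 */
#include <stdio.h>
#include <infiniband/verbs.h>

static int print_port1_attrs(struct ibv_context *ctx)
{
	struct ibv_port_attr pattr;
	int ret = ibv_query_port(ctx, 1, &pattr);

	if (!ret)
		printf("lid=%u state=%d active_mtu=%d width=%u speed=%u\n",
		       pattr.lid, pattr.state, pattr.active_mtu,
		       pattr.active_width, pattr.active_speed);
	return ret;
}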
1485
1486 static int port_immutable(struct ib_device *ibdev, u8 port_num,
1487                           struct ib_port_immutable *immutable)
1488 {
1489         struct ib_port_attr attr;
1490         int err;
1491
1492         err = query_port(ibdev, port_num, &attr);
1493         if (err)
1494                 return err;
1495
1496         memset(immutable, 0, sizeof(*immutable));
1497
1498         immutable->pkey_tbl_len = attr.pkey_tbl_len;
1499         immutable->gid_tbl_len = attr.gid_tbl_len;
1500         immutable->core_cap_flags = RDMA_CORE_PORT_INTEL_OPA;
1501         immutable->max_mad_size = OPA_MGMT_MAD_SIZE;
1502
1503         return 0;
1504 }
1505
1506 static int modify_device(struct ib_device *device,
1507                          int device_modify_mask,
1508                          struct ib_device_modify *device_modify)
1509 {
1510         struct hfi1_devdata *dd = dd_from_ibdev(device);
1511         unsigned i;
1512         int ret;
1513
1514         if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
1515                                    IB_DEVICE_MODIFY_NODE_DESC)) {
1516                 ret = -EOPNOTSUPP;
1517                 goto bail;
1518         }
1519
1520         if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) {
1521                 memcpy(device->node_desc, device_modify->node_desc, 64);
1522                 for (i = 0; i < dd->num_pports; i++) {
1523                         struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
1524
1525                         hfi1_node_desc_chg(ibp);
1526                 }
1527         }
1528
1529         if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) {
1530                 ib_hfi1_sys_image_guid =
1531                         cpu_to_be64(device_modify->sys_image_guid);
1532                 for (i = 0; i < dd->num_pports; i++) {
1533                         struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
1534
1535                         hfi1_sys_guid_chg(ibp);
1536                 }
1537         }
1538
1539         ret = 0;
1540
1541 bail:
1542         return ret;
1543 }
1544
1545 static int modify_port(struct ib_device *ibdev, u8 port,
1546                        int port_modify_mask, struct ib_port_modify *props)
1547 {
1548         struct hfi1_ibport *ibp = to_iport(ibdev, port);
1549         struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1550         int ret = 0;
1551
1552         ibp->rvp.port_cap_flags |= props->set_port_cap_mask;
1553         ibp->rvp.port_cap_flags &= ~props->clr_port_cap_mask;
1554         if (props->set_port_cap_mask || props->clr_port_cap_mask)
1555                 hfi1_cap_mask_chg(ibp);
1556         if (port_modify_mask & IB_PORT_SHUTDOWN) {
1557                 set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0,
1558                                      OPA_LINKDOWN_REASON_UNKNOWN);
1559                 ret = set_link_state(ppd, HLS_DN_DOWNDEF);
1560         }
1561         if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
1562                 ibp->rvp.qkey_violations = 0;
1563         return ret;
1564 }
1565
1566 static int query_gid(struct ib_device *ibdev, u8 port,
1567                      int index, union ib_gid *gid)
1568 {
1569         struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
1570         int ret = 0;
1571
1572         if (!port || port > dd->num_pports)
1573                 ret = -EINVAL;
1574         else {
1575                 struct hfi1_ibport *ibp = to_iport(ibdev, port);
1576                 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1577
1578                 gid->global.subnet_prefix = ibp->rvp.gid_prefix;
1579                 if (index == 0)
1580                         gid->global.interface_id = cpu_to_be64(ppd->guid);
1581                 else if (index < HFI1_GUIDS_PER_PORT)
1582                         gid->global.interface_id = ibp->guids[index - 1];
1583                 else
1584                         ret = -EINVAL;
1585         }
1586
1587         return ret;
1588 }
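
/*
 * Illustrative sketch (not part of this driver): the GID entries built by
 * query_gid() above (subnet prefix plus, at index 0, the port GUID as the
 * interface ID) can be read from userspace with ibv_query_gid().  Assumes
 * "ctx" is an open ibv_context and port 1 is queried.
 */
#include <stdio.h>
#include <endian.h>
#include <infiniband/verbs.h>

static int print_gid0(struct ibv_context *ctx)
{
	union ibv_gid gid;
	int ret = ibv_query_gid(ctx, 1, 0, &gid);

	if (!ret)
		printf("subnet_prefix=%llx interface_id=%llx\n",
		       (unsigned long long)be64toh(gid.global.subnet_prefix),
		       (unsigned long long)be64toh(gid.global.interface_id));
	return ret;
}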
1589
1590 /*
1591  * Convert an AH's port number and SL into the corresponding SC.
1592  */
1593 u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah)
1594 {
1595         struct hfi1_ibport *ibp = to_iport(ibdev, ah->port_num);
1596
1597         return ibp->sl_to_sc[ah->sl];
1598 }
1599
1600 static int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr)
1601 {
1602         struct hfi1_ibport *ibp;
1603         struct hfi1_pportdata *ppd;
1604         struct hfi1_devdata *dd;
1605         u8 sc5;
1606
1607         /* test the mapping for validity */
1608         ibp = to_iport(ibdev, ah_attr->port_num);
1609         ppd = ppd_from_ibp(ibp);
1610         sc5 = ibp->sl_to_sc[ah_attr->sl];
1611         dd = dd_from_ppd(ppd);
1612         if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
1613                 return -EINVAL;
1614         return 0;
1615 }
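
/*
 * Illustrative sketch (not part of this driver): hfi1_check_ah() above
 * validates an address handle by walking SL -> SC -> VL through the
 * per-port tables.  The standalone fragment below mimics that chain with
 * hypothetical tables (sl_to_sc_example[], sc_to_vl_example[]) and applies
 * the same acceptance rule: reject only when the resulting VL is above the
 * supported count and is not VL15 (0xf).
 */
#include <stdbool.h>

#define EXAMPLE_NUM_VLS 8

static const unsigned char sl_to_sc_example[32] = { 0, 1, 2, 3, 4, 5, 6, 7 };
static const unsigned char sc_to_vl_example[32] = { 0, 1, 2, 3, 4, 5, 6, 7 };

static bool example_sl_is_usable(unsigned char sl)
{
	unsigned char sc = sl_to_sc_example[sl & 0x1f];
	unsigned char vl = sc_to_vl_example[sc & 0x1f];

	return !(vl > EXAMPLE_NUM_VLS && vl != 0xf);
}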
1616
1617 static void hfi1_notify_new_ah(struct ib_device *ibdev,
1618                                struct ib_ah_attr *ah_attr,
1619                                struct rvt_ah *ah)
1620 {
1621         struct hfi1_ibport *ibp;
1622         struct hfi1_pportdata *ppd;
1623         struct hfi1_devdata *dd;
1624         u8 sc5;
1625
1626         /*
1627          * Do not trust reading anything from rvt_ah at this point, as it is
1628          * not done being set up. We can, however, modify the fields we need to set.
1629          */
1630
1631         ibp = to_iport(ibdev, ah_attr->port_num);
1632         ppd = ppd_from_ibp(ibp);
1633         sc5 = ibp->sl_to_sc[ah->attr.sl];
1634         dd = dd_from_ppd(ppd);
1635         ah->vl = sc_to_vlt(dd, sc5);
1636         if (ah->vl < num_vls || ah->vl == 15)
1637                 ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu);
1638 }
1639
1640 struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid)
1641 {
1642         struct ib_ah_attr attr;
1643         struct ib_ah *ah = ERR_PTR(-EINVAL);
1644         struct rvt_qp *qp0;
1645
1646         memset(&attr, 0, sizeof(attr));
1647         attr.dlid = dlid;
1648         attr.port_num = ppd_from_ibp(ibp)->port;
1649         rcu_read_lock();
1650         qp0 = rcu_dereference(ibp->rvp.qp[0]);
1651         if (qp0)
1652                 ah = ib_create_ah(qp0->ibqp.pd, &attr);
1653         rcu_read_unlock();
1654         return ah;
1655 }
1656
1657 /**
1658  * hfi1_get_npkeys - return the size of the PKEY table for context 0
1659  * @dd: the hfi1_ib device
1660  */
1661 unsigned hfi1_get_npkeys(struct hfi1_devdata *dd)
1662 {
1663         return ARRAY_SIZE(dd->pport[0].pkeys);
1664 }
1665
1666 static void init_ibport(struct hfi1_pportdata *ppd)
1667 {
1668         struct hfi1_ibport *ibp = &ppd->ibport_data;
1669         size_t sz = ARRAY_SIZE(ibp->sl_to_sc);
1670         int i;
1671
1672         for (i = 0; i < sz; i++) {
1673                 ibp->sl_to_sc[i] = i;
1674                 ibp->sc_to_sl[i] = i;
1675         }
1676
1677         spin_lock_init(&ibp->rvp.lock);
1678         /* Set the prefix to the default value (see ch. 4.1.1) */
1679         ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX;
1680         ibp->rvp.sm_lid = 0;
1681         /* Below should only set bits defined in OPA PortInfo.CapabilityMask */
1682         ibp->rvp.port_cap_flags = IB_PORT_AUTO_MIGR_SUP |
1683                 IB_PORT_CAP_MASK_NOTICE_SUP;
1684         ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
1685         ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
1686         ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
1687         ibp->rvp.pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
1688         ibp->rvp.pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
1689
1690         RCU_INIT_POINTER(ibp->rvp.qp[0], NULL);
1691         RCU_INIT_POINTER(ibp->rvp.qp[1], NULL);
1692 }
1693
1694 static void verbs_txreq_kmem_cache_ctor(void *obj)
1695 {
1696         struct verbs_txreq *tx = obj;
1697
1698         memset(tx, 0, sizeof(*tx));
1699 }
1700
1701 /**
1702  * hfi1_register_ib_device - register our device with the infiniband core
1703  * @dd: the device data structure
1704  * Return 0 if successful, errno if unsuccessful.
1705  */
1706 int hfi1_register_ib_device(struct hfi1_devdata *dd)
1707 {
1708         struct hfi1_ibdev *dev = &dd->verbs_dev;
1709         struct ib_device *ibdev = &dev->rdi.ibdev;
1710         struct hfi1_pportdata *ppd = dd->pport;
1711         unsigned i;
1712         int ret;
1713         size_t lcpysz = IB_DEVICE_NAME_MAX;
1714         u16 descq_cnt;
1715         char buf[TXREQ_NAME_LEN];
1716
1717         for (i = 0; i < dd->num_pports; i++)
1718                 init_ibport(ppd + i);
1719
1720         /* Only need to initialize non-zero fields. */
1721
1722         spin_lock_init(&dev->n_cqs_lock);
1723         spin_lock_init(&dev->n_qps_lock);
1724         spin_lock_init(&dev->n_srqs_lock);
1725         spin_lock_init(&dev->n_mcast_grps_lock);
1726         init_timer(&dev->mem_timer);
1727         dev->mem_timer.function = mem_timer;
1728         dev->mem_timer.data = (unsigned long) dev;
1729
1730         seqlock_init(&dev->iowait_lock);
1731         INIT_LIST_HEAD(&dev->txwait);
1732         INIT_LIST_HEAD(&dev->memwait);
1733
1734         descq_cnt = sdma_get_descq_cnt();
1735
1736         snprintf(buf, sizeof(buf), "hfi1_%u_vtxreq_cache", dd->unit);
1737         /* SLAB_HWCACHE_ALIGN for AHG */
1738         dev->verbs_txreq_cache = kmem_cache_create(buf,
1739                                                    sizeof(struct verbs_txreq),
1740                                                    0, SLAB_HWCACHE_ALIGN,
1741                                                    verbs_txreq_kmem_cache_ctor);
1742         if (!dev->verbs_txreq_cache) {
1743                 ret = -ENOMEM;
1744                 goto err_verbs_txreq;
1745         }
1746
1747         /*
1748          * The system image GUID is supposed to be the same for all
1749          * HFIs in a single system, but since there can be other
1750          * device types in the system, we can't be sure it is unique.
1751          */
1752         if (!ib_hfi1_sys_image_guid)
1753                 ib_hfi1_sys_image_guid = cpu_to_be64(ppd->guid);
1754         lcpysz = strlcpy(ibdev->name, class_name(), lcpysz);
1755         strlcpy(ibdev->name + lcpysz, "_%d", IB_DEVICE_NAME_MAX - lcpysz);
1756         ibdev->owner = THIS_MODULE;
1757         ibdev->node_guid = cpu_to_be64(ppd->guid);
1758         ibdev->uverbs_abi_ver = HFI1_UVERBS_ABI_VERSION;
1759         ibdev->uverbs_cmd_mask =
1760                 (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
1761                 (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
1762                 (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
1763                 (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
1764                 (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
1765                 (1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
1766                 (1ull << IB_USER_VERBS_CMD_MODIFY_AH)           |
1767                 (1ull << IB_USER_VERBS_CMD_QUERY_AH)            |
1768                 (1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
1769                 (1ull << IB_USER_VERBS_CMD_REG_MR)              |
1770                 (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
1771                 (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
1772                 (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
1773                 (1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
1774                 (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
1775                 (1ull << IB_USER_VERBS_CMD_POLL_CQ)             |
1776                 (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ)       |
1777                 (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
1778                 (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
1779                 (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
1780                 (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
1781                 (1ull << IB_USER_VERBS_CMD_POST_SEND)           |
1782                 (1ull << IB_USER_VERBS_CMD_POST_RECV)           |
1783                 (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
1784                 (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
1785                 (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
1786                 (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
1787                 (1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
1788                 (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
1789                 (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
1790         ibdev->node_type = RDMA_NODE_IB_CA;
1791         ibdev->phys_port_cnt = dd->num_pports;
1792         ibdev->num_comp_vectors = 1;
1793         ibdev->dma_device = &dd->pcidev->dev;
1794         ibdev->query_device = NULL;
1795         ibdev->modify_device = modify_device;
1796         ibdev->query_port = query_port;
1797         ibdev->modify_port = modify_port;
1798         ibdev->query_pkey = NULL;
1799         ibdev->query_gid = query_gid;
1800         ibdev->alloc_ucontext = NULL;
1801         ibdev->dealloc_ucontext = NULL;
1802         ibdev->alloc_pd = NULL;
1803         ibdev->dealloc_pd = NULL;
1804         ibdev->create_ah = NULL;
1805         ibdev->destroy_ah = NULL;
1806         ibdev->modify_ah = NULL;
1807         ibdev->query_ah = NULL;
1808         ibdev->create_srq = hfi1_create_srq;
1809         ibdev->modify_srq = hfi1_modify_srq;
1810         ibdev->query_srq = hfi1_query_srq;
1811         ibdev->destroy_srq = hfi1_destroy_srq;
1812         ibdev->create_qp = NULL;
1813         ibdev->modify_qp = hfi1_modify_qp;
1814         ibdev->query_qp = hfi1_query_qp;
1815         ibdev->destroy_qp = hfi1_destroy_qp;
1816         ibdev->post_send = post_send;
1817         ibdev->post_recv = post_receive;
1818         ibdev->post_srq_recv = hfi1_post_srq_receive;
1819         ibdev->create_cq = hfi1_create_cq;
1820         ibdev->destroy_cq = hfi1_destroy_cq;
1821         ibdev->resize_cq = hfi1_resize_cq;
1822         ibdev->poll_cq = hfi1_poll_cq;
1823         ibdev->req_notify_cq = hfi1_req_notify_cq;
1824         ibdev->get_dma_mr = NULL;
1825         ibdev->reg_user_mr = NULL;
1826         ibdev->dereg_mr = NULL;
1827         ibdev->alloc_mr = NULL;
1828         ibdev->map_mr_sg = NULL;
1829         ibdev->alloc_fmr = NULL;
1830         ibdev->map_phys_fmr = NULL;
1831         ibdev->unmap_fmr = NULL;
1832         ibdev->dealloc_fmr = NULL;
1833         ibdev->attach_mcast = hfi1_multicast_attach;
1834         ibdev->detach_mcast = hfi1_multicast_detach;
1835         ibdev->process_mad = hfi1_process_mad;
1836         ibdev->mmap = NULL;
1837         ibdev->dma_ops = NULL;
1838         ibdev->get_port_immutable = port_immutable;
1839
1840         strncpy(ibdev->node_desc, init_utsname()->nodename,
1841                 sizeof(ibdev->node_desc));
1842
1843         /*
1844          * Fill in rvt info object.
1845          */
1846         dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files;
1847         dd->verbs_dev.rdi.driver_f.get_card_name = get_card_name;
1848         dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev;
1849         dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah;
1850         dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah;
1851         /*
1852          * Fill in rvt info device attributes.
1853          */
1854         hfi1_fill_device_attr(dd);
1855
1856         /* queue pair */
1857         dd->verbs_dev.rdi.dparms.qp_table_size = hfi1_qp_table_size;
1858         dd->verbs_dev.rdi.dparms.qpn_start = 0;
1859         dd->verbs_dev.rdi.dparms.qpn_inc = 1;
1860         dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift;
1861         dd->verbs_dev.rdi.dparms.qpn_res_start = kdeth_qp << 16;
1862         dd->verbs_dev.rdi.dparms.qpn_res_end =
1863                 dd->verbs_dev.rdi.dparms.qpn_res_start + 65535;
1864         dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc;
1865         dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free;
1866         dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps;
1867         dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset;
1868
1869         /* misc settings */
1870         dd->verbs_dev.rdi.flags = RVT_FLAG_CQ_INIT_DRIVER;
1871         dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size;
1872         dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
1873         dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
1874
1875         ppd = dd->pport;
1876         for (i = 0; i < dd->num_pports; i++, ppd++)
1877                 rvt_init_port(&dd->verbs_dev.rdi,
1878                               &ppd->ibport_data.rvp,
1879                               i,
1880                               ppd->pkeys);
1881
1882         ret = rvt_register_device(&dd->verbs_dev.rdi);
1883         if (ret)
1884                 goto err_reg;
1885
1886         ret = hfi1_create_agents(dev);
1887         if (ret)
1888                 goto err_agents;
1889
1890         ret = hfi1_verbs_register_sysfs(dd);
1891         if (ret)
1892                 goto err_class;
1893
1894         goto bail;
1895
1896 err_class:
1897         hfi1_free_agents(dev);
1898 err_agents:
1899         rvt_unregister_device(&dd->verbs_dev.rdi);
1900 err_reg:
1901 err_verbs_txreq:
1902         kmem_cache_destroy(dev->verbs_txreq_cache);
1903         dd_dev_err(dd, "cannot register verbs: %d!\n", -ret);
1904 bail:
1905         return ret;
1906 }
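
/*
 * Illustrative sketch (not part of this driver): once rvt_register_device()
 * succeeds above, the HFI shows up to userspace like any other verbs
 * device.  This fragment lists the registered device names (e.g. "hfi1_0")
 * with plain libibverbs calls; it assumes libibverbs is available.
 */
#include <stdio.h>
#include <infiniband/verbs.h>

static void list_rdma_devices(void)
{
	int i, n = 0;
	struct ibv_device **list = ibv_get_device_list(&n);

	if (!list)
		return;
	for (i = 0; i < n; i++)
		printf("%s\n", ibv_get_device_name(list[i]));
	ibv_free_device_list(list);
}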
1907
1908 void hfi1_unregister_ib_device(struct hfi1_devdata *dd)
1909 {
1910         struct hfi1_ibdev *dev = &dd->verbs_dev;
1911
1912         hfi1_verbs_unregister_sysfs(dd);
1913
1914         hfi1_free_agents(dev);
1915
1916         rvt_unregister_device(&dd->verbs_dev.rdi);
1917
1918         if (!list_empty(&dev->txwait))
1919                 dd_dev_err(dd, "txwait list not empty!\n");
1920         if (!list_empty(&dev->memwait))
1921                 dd_dev_err(dd, "memwait list not empty!\n");
1922
1923         del_timer_sync(&dev->mem_timer);
1924         kmem_cache_destroy(dev->verbs_txreq_cache);
1925 }
1926
1927 void hfi1_cnp_rcv(struct hfi1_packet *packet)
1928 {
1929         struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
1930         struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1931         struct hfi1_ib_header *hdr = packet->hdr;
1932         struct rvt_qp *qp = packet->qp;
1933         u32 lqpn, rqpn = 0;
1934         u16 rlid = 0;
1935         u8 sl, sc5, sc4_bit, svc_type;
1936         bool sc4_set = has_sc4_bit(packet);
1937
1938         switch (packet->qp->ibqp.qp_type) {
1939         case IB_QPT_UC:
1940                 rlid = qp->remote_ah_attr.dlid;
1941                 rqpn = qp->remote_qpn;
1942                 svc_type = IB_CC_SVCTYPE_UC;
1943                 break;
1944         case IB_QPT_RC:
1945                 rlid = qp->remote_ah_attr.dlid;
1946                 rqpn = qp->remote_qpn;
1947                 svc_type = IB_CC_SVCTYPE_RC;
1948                 break;
1949         case IB_QPT_SMI:
1950         case IB_QPT_GSI:
1951         case IB_QPT_UD:
1952                 svc_type = IB_CC_SVCTYPE_UD;
1953                 break;
1954         default:
1955                 ibp->rvp.n_pkt_drops++;
1956                 return;
1957         }
1958
1959         sc4_bit = sc4_set << 4;
1960         sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
1961         sc5 |= sc4_bit;
1962         sl = ibp->sc_to_sl[sc5];
1963         lqpn = qp->ibqp.qp_num;
1964
1965         process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
1966 }