drivers/infiniband/hw/hfi1/rc.c
1 /*
2  * Copyright(c) 2015, 2016 Intel Corporation.
3  *
4  * This file is provided under a dual BSD/GPLv2 license.  When using or
5  * redistributing this file, you may do so under either license.
6  *
7  * GPL LICENSE SUMMARY
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of version 2 of the GNU General Public License as
11  * published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful, but
14  * WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * General Public License for more details.
17  *
18  * BSD LICENSE
19  *
20  * Redistribution and use in source and binary forms, with or without
21  * modification, are permitted provided that the following conditions
22  * are met:
23  *
24  *  - Redistributions of source code must retain the above copyright
25  *    notice, this list of conditions and the following disclaimer.
26  *  - Redistributions in binary form must reproduce the above copyright
27  *    notice, this list of conditions and the following disclaimer in
28  *    the documentation and/or other materials provided with the
29  *    distribution.
30  *  - Neither the name of Intel Corporation nor the names of its
31  *    contributors may be used to endorse or promote products derived
32  *    from this software without specific prior written permission.
33  *
34  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
38  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
39  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
40  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
42  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
43  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45  *
46  */
47
48 #include <linux/io.h>
49 #include <rdma/rdma_vt.h>
50 #include <rdma/rdmavt_qp.h>
51
52 #include "hfi.h"
53 #include "qp.h"
54 #include "verbs_txreq.h"
55 #include "trace.h"
56
57 /* cut down ridiculously long IB macro names */
58 #define OP(x) RC_OP(x)
59
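/**
 * restart_sge - rewind the send SGE state to a given PSN within a WQE
 * @ss: the SGE state to (re)initialize from the WQE
 * @wqe: the work request being restarted
 * @psn: the PSN to restart from
 * @pmtu: the path MTU
 *
 * Reload @ss from the WQE's scatter/gather list, then skip the bytes
 * already covered by the PSNs preceding @psn (one pmtu per PSN).
 *
 * Return the number of bytes left to send starting at @psn.
 */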
60 static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
61                        u32 psn, u32 pmtu)
62 {
63         u32 len;
64
65         len = delta_psn(psn, wqe->psn) * pmtu;
66         ss->sge = wqe->sg_list[0];
67         ss->sg_list = wqe->sg_list + 1;
68         ss->num_sge = wqe->wr.num_sge;
69         ss->total_len = wqe->length;
70         rvt_skip_sge(ss, len, false);
71         return wqe->length - len;
72 }
73
74 /**
75  * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
76  * @dev: the device for this QP
77  * @qp: a pointer to the QP
78  * @ohdr: a pointer to the IB header being constructed
79  * @ps: the xmit packet state
80  *
81  * Return 1 if constructed; otherwise, return 0.
82  * Note that we are on the responder side of the QP context.
83  * Note the QP s_lock must be held.
84  */
85 static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
86                        struct ib_other_headers *ohdr,
87                        struct hfi1_pkt_state *ps)
88 {
89         struct rvt_ack_entry *e;
90         u32 hwords;
91         u32 len;
92         u32 bth0;
93         u32 bth2;
94         int middle = 0;
95         u32 pmtu = qp->pmtu;
96         struct hfi1_qp_priv *priv = qp->priv;
97
98         lockdep_assert_held(&qp->s_lock);
99         /* Don't send an ACK if we aren't supposed to. */
100         if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
101                 goto bail;
102
103         if (priv->hdr_type == HFI1_PKT_TYPE_9B)
104                 /* header size in 32-bit words LRH+BTH = (8+12)/4. */
105                 hwords = 5;
106         else
107                 /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */
108                 hwords = 7;
109
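        /*
         * The ACK queue is a small ring of queued responses:
         * r_head_ack_queue is advanced by the receive path when an RDMA
         * read or atomic request is queued, and s_tail_ack_queue is
         * advanced here once the corresponding response has been sent.
         */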
110         switch (qp->s_ack_state) {
111         case OP(RDMA_READ_RESPONSE_LAST):
112         case OP(RDMA_READ_RESPONSE_ONLY):
113                 e = &qp->s_ack_queue[qp->s_tail_ack_queue];
114                 if (e->rdma_sge.mr) {
115                         rvt_put_mr(e->rdma_sge.mr);
116                         e->rdma_sge.mr = NULL;
117                 }
118                 /* FALLTHROUGH */
119         case OP(ATOMIC_ACKNOWLEDGE):
120                 /*
121                  * We can increment the tail pointer now that the last
122                  * response has been sent instead of only being
123                  * constructed.
124                  */
125                 if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
126                         qp->s_tail_ack_queue = 0;
127                 /* FALLTHROUGH */
128         case OP(SEND_ONLY):
129         case OP(ACKNOWLEDGE):
130                 /* Check for no next entry in the queue. */
131                 if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
132                         if (qp->s_flags & RVT_S_ACK_PENDING)
133                                 goto normal;
134                         goto bail;
135                 }
136
137                 e = &qp->s_ack_queue[qp->s_tail_ack_queue];
138                 if (e->opcode == OP(RDMA_READ_REQUEST)) {
139                         /*
140                          * If a RDMA read response is being resent and
141                          * we haven't seen the duplicate request yet,
142                          * then stop sending the remaining responses the
143                          * responder has seen until the requester re-sends it.
144                          */
145                         len = e->rdma_sge.sge_length;
146                         if (len && !e->rdma_sge.mr) {
147                                 qp->s_tail_ack_queue = qp->r_head_ack_queue;
148                                 goto bail;
149                         }
150                         /* Copy SGE state in case we need to resend */
151                         ps->s_txreq->mr = e->rdma_sge.mr;
152                         if (ps->s_txreq->mr)
153                                 rvt_get_mr(ps->s_txreq->mr);
154                         qp->s_ack_rdma_sge.sge = e->rdma_sge;
155                         qp->s_ack_rdma_sge.num_sge = 1;
156                         ps->s_txreq->ss = &qp->s_ack_rdma_sge;
157                         if (len > pmtu) {
158                                 len = pmtu;
159                                 qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
160                         } else {
161                                 qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
162                                 e->sent = 1;
163                         }
164                         ohdr->u.aeth = rvt_compute_aeth(qp);
165                         hwords++;
166                         qp->s_ack_rdma_psn = e->psn;
167                         bth2 = mask_psn(qp->s_ack_rdma_psn++);
168                 } else {
169                         /* COMPARE_SWAP or FETCH_ADD */
170                         ps->s_txreq->ss = NULL;
171                         len = 0;
172                         qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
173                         ohdr->u.at.aeth = rvt_compute_aeth(qp);
174                         ib_u64_put(e->atomic_data, &ohdr->u.at.atomic_ack_eth);
175                         hwords += sizeof(ohdr->u.at) / sizeof(u32);
176                         bth2 = mask_psn(e->psn);
177                         e->sent = 1;
178                 }
179                 bth0 = qp->s_ack_state << 24;
180                 break;
181
182         case OP(RDMA_READ_RESPONSE_FIRST):
183                 qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
184                 /* FALLTHROUGH */
185         case OP(RDMA_READ_RESPONSE_MIDDLE):
186                 ps->s_txreq->ss = &qp->s_ack_rdma_sge;
187                 ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr;
188                 if (ps->s_txreq->mr)
189                         rvt_get_mr(ps->s_txreq->mr);
190                 len = qp->s_ack_rdma_sge.sge.sge_length;
191                 if (len > pmtu) {
192                         len = pmtu;
193                         middle = HFI1_CAP_IS_KSET(SDMA_AHG);
194                 } else {
195                         ohdr->u.aeth = rvt_compute_aeth(qp);
196                         hwords++;
197                         qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
198                         e = &qp->s_ack_queue[qp->s_tail_ack_queue];
199                         e->sent = 1;
200                 }
201                 bth0 = qp->s_ack_state << 24;
202                 bth2 = mask_psn(qp->s_ack_rdma_psn++);
203                 break;
204
205         default:
206 normal:
207                 /*
208                  * Send a regular ACK.
209                  * Set the s_ack_state so we wait until after sending
210                  * the ACK before setting s_ack_state to ACKNOWLEDGE
211                  * (see above).
212                  */
213                 qp->s_ack_state = OP(SEND_ONLY);
214                 qp->s_flags &= ~RVT_S_ACK_PENDING;
215                 ps->s_txreq->ss = NULL;
216                 if (qp->s_nak_state)
217                         ohdr->u.aeth =
218                                 cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
219                                             (qp->s_nak_state <<
220                                              IB_AETH_CREDIT_SHIFT));
221                 else
222                         ohdr->u.aeth = rvt_compute_aeth(qp);
223                 hwords++;
224                 len = 0;
225                 bth0 = OP(ACKNOWLEDGE) << 24;
226                 bth2 = mask_psn(qp->s_ack_psn);
227         }
228         qp->s_rdma_ack_cnt++;
229         qp->s_hdrwords = hwords;
230         ps->s_txreq->sde = priv->s_sde;
231         ps->s_txreq->s_cur_size = len;
232         hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
233         /* pbc */
234         ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
235         return 1;
236
237 bail:
238         qp->s_ack_state = OP(ACKNOWLEDGE);
239         /*
240          * Ensure s_rdma_ack_cnt changes are committed prior to resetting
241          * RVT_S_RESP_PENDING
242          */
243         smp_wmb();
244         qp->s_flags &= ~(RVT_S_RESP_PENDING
245                                 | RVT_S_ACK_PENDING
246                                 | RVT_S_AHG_VALID);
247         return 0;
248 }
249
250 /**
251  * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
252  * @qp: a pointer to the QP
253  *
254  * Assumes s_lock is held.
255  *
256  * Return 1 if constructed; otherwise, return 0.
257  */
258 int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
259 {
260         struct hfi1_qp_priv *priv = qp->priv;
261         struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
262         struct ib_other_headers *ohdr;
263         struct rvt_sge_state *ss;
264         struct rvt_swqe *wqe;
265         u32 hwords;
266         u32 len;
267         u32 bth0 = 0;
268         u32 bth2;
269         u32 pmtu = qp->pmtu;
270         char newreq;
271         int middle = 0;
272         int delta;
273
274         lockdep_assert_held(&qp->s_lock);
275         ps->s_txreq = get_txreq(ps->dev, qp);
276         if (IS_ERR(ps->s_txreq))
277                 goto bail_no_tx;
278
279         ps->s_txreq->phdr.hdr.hdr_type = priv->hdr_type;
280         if (priv->hdr_type == HFI1_PKT_TYPE_9B) {
281                 /* header size in 32-bit words LRH+BTH = (8+12)/4. */
282                 hwords = 5;
283                 if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)
284                         ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth;
285                 else
286                         ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
287         } else {
288                 /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */
289                 hwords = 7;
290                 if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) &&
291                     (hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))))
292                         ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth;
293                 else
294                         ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth;
295         }
296
297         /* Sending responses takes priority over sending requests. */
298         if ((qp->s_flags & RVT_S_RESP_PENDING) &&
299             make_rc_ack(dev, qp, ohdr, ps))
300                 return 1;
301
302         if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
303                 if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
304                         goto bail;
305                 /* We are in the error state, flush the work request. */
306                 smp_read_barrier_depends(); /* see post_one_send() */
307                 if (qp->s_last == READ_ONCE(qp->s_head))
308                         goto bail;
309                 /* If DMAs are in progress, we can't flush immediately. */
310                 if (iowait_sdma_pending(&priv->s_iowait)) {
311                         qp->s_flags |= RVT_S_WAIT_DMA;
312                         goto bail;
313                 }
314                 clear_ahg(qp);
315                 wqe = rvt_get_swqe_ptr(qp, qp->s_last);
316                 hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
317                         IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
318                 /* will get called again */
319                 goto done_free_tx;
320         }
321
322         if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
323                 goto bail;
324
325         if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
326                 if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
327                         qp->s_flags |= RVT_S_WAIT_PSN;
328                         goto bail;
329                 }
330                 qp->s_sending_psn = qp->s_psn;
331                 qp->s_sending_hpsn = qp->s_psn - 1;
332         }
333
334         /* Send a request. */
335         wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
336         switch (qp->s_state) {
337         default:
338                 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
339                         goto bail;
340                 /*
341                  * Resend an old request or start a new one.
342                  *
343                  * We keep track of the current SWQE so that
344                  * we don't reset the "furthest progress" state
345                  * if we need to back up.
346                  */
347                 newreq = 0;
348                 if (qp->s_cur == qp->s_tail) {
349                         /* Check if send work queue is empty. */
350                         smp_read_barrier_depends(); /* see post_one_send() */
351                         if (qp->s_tail == READ_ONCE(qp->s_head)) {
352                                 clear_ahg(qp);
353                                 goto bail;
354                         }
355                         /*
356                          * If a fence is requested, wait for previous
357                          * RDMA read and atomic operations to finish.
358                          */
359                         if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
360                             qp->s_num_rd_atomic) {
361                                 qp->s_flags |= RVT_S_WAIT_FENCE;
362                                 goto bail;
363                         }
364                         /*
365                          * Local operations are processed immediately
366                          * after all prior requests have completed
367                          */
368                         if (wqe->wr.opcode == IB_WR_REG_MR ||
369                             wqe->wr.opcode == IB_WR_LOCAL_INV) {
370                                 int local_ops = 0;
371                                 int err = 0;
372
373                                 if (qp->s_last != qp->s_cur)
374                                         goto bail;
375                                 if (++qp->s_cur == qp->s_size)
376                                         qp->s_cur = 0;
377                                 if (++qp->s_tail == qp->s_size)
378                                         qp->s_tail = 0;
379                                 if (!(wqe->wr.send_flags &
380                                       RVT_SEND_COMPLETION_ONLY)) {
381                                         err = rvt_invalidate_rkey(
382                                                 qp,
383                                                 wqe->wr.ex.invalidate_rkey);
384                                         local_ops = 1;
385                                 }
386                                 hfi1_send_complete(qp, wqe,
387                                                    err ? IB_WC_LOC_PROT_ERR
388                                                        : IB_WC_SUCCESS);
389                                 if (local_ops)
390                                         atomic_dec(&qp->local_ops_pending);
391                                 qp->s_hdrwords = 0;
392                                 goto done_free_tx;
393                         }
394
395                         newreq = 1;
396                         qp->s_psn = wqe->psn;
397                 }
398                 /*
399                  * Note that we have to be careful not to modify the
400                  * original work request since we may need to resend
401                  * it.
402                  */
403                 len = wqe->length;
404                 ss = &qp->s_sge;
405                 bth2 = mask_psn(qp->s_psn);
406                 switch (wqe->wr.opcode) {
407                 case IB_WR_SEND:
408                 case IB_WR_SEND_WITH_IMM:
409                 case IB_WR_SEND_WITH_INV:
410                         /* If no credit, return. */
411                         if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
412                             rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
413                                 qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
414                                 goto bail;
415                         }
416                         if (len > pmtu) {
417                                 qp->s_state = OP(SEND_FIRST);
418                                 len = pmtu;
419                                 break;
420                         }
421                         if (wqe->wr.opcode == IB_WR_SEND) {
422                                 qp->s_state = OP(SEND_ONLY);
423                         } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
424                                 qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
425                                 /* Immediate data comes after the BTH */
426                                 ohdr->u.imm_data = wqe->wr.ex.imm_data;
427                                 hwords += 1;
428                         } else {
429                                 qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE);
430                                 /* Invalidate rkey comes after the BTH */
431                                 ohdr->u.ieth = cpu_to_be32(
432                                                 wqe->wr.ex.invalidate_rkey);
433                                 hwords += 1;
434                         }
435                         if (wqe->wr.send_flags & IB_SEND_SOLICITED)
436                                 bth0 |= IB_BTH_SOLICITED;
437                         bth2 |= IB_BTH_REQ_ACK;
438                         if (++qp->s_cur == qp->s_size)
439                                 qp->s_cur = 0;
440                         break;
441
442                 case IB_WR_RDMA_WRITE:
443                         if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
444                                 qp->s_lsn++;
445                         goto no_flow_control;
446                 case IB_WR_RDMA_WRITE_WITH_IMM:
447                         /* If no credit, return. */
448                         if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
449                             rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
450                                 qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
451                                 goto bail;
452                         }
453 no_flow_control:
454                         put_ib_reth_vaddr(
455                                 wqe->rdma_wr.remote_addr,
456                                 &ohdr->u.rc.reth);
457                         ohdr->u.rc.reth.rkey =
458                                 cpu_to_be32(wqe->rdma_wr.rkey);
459                         ohdr->u.rc.reth.length = cpu_to_be32(len);
460                         hwords += sizeof(struct ib_reth) / sizeof(u32);
461                         if (len > pmtu) {
462                                 qp->s_state = OP(RDMA_WRITE_FIRST);
463                                 len = pmtu;
464                                 break;
465                         }
466                         if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
467                                 qp->s_state = OP(RDMA_WRITE_ONLY);
468                         } else {
469                                 qp->s_state =
470                                         OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
471                                 /* Immediate data comes after RETH */
472                                 ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
473                                 hwords += 1;
474                                 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
475                                         bth0 |= IB_BTH_SOLICITED;
476                         }
477                         bth2 |= IB_BTH_REQ_ACK;
478                         if (++qp->s_cur == qp->s_size)
479                                 qp->s_cur = 0;
480                         break;
481
482                 case IB_WR_RDMA_READ:
483                         /*
484                          * Don't allow more operations to be started
485                          * than the QP limits allow.
486                          */
487                         if (newreq) {
488                                 if (qp->s_num_rd_atomic >=
489                                     qp->s_max_rd_atomic) {
490                                         qp->s_flags |= RVT_S_WAIT_RDMAR;
491                                         goto bail;
492                                 }
493                                 qp->s_num_rd_atomic++;
494                                 if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
495                                         qp->s_lsn++;
496                         }
497                         put_ib_reth_vaddr(
498                                 wqe->rdma_wr.remote_addr,
499                                 &ohdr->u.rc.reth);
500                         ohdr->u.rc.reth.rkey =
501                                 cpu_to_be32(wqe->rdma_wr.rkey);
502                         ohdr->u.rc.reth.length = cpu_to_be32(len);
503                         qp->s_state = OP(RDMA_READ_REQUEST);
504                         hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
505                         ss = NULL;
506                         len = 0;
507                         bth2 |= IB_BTH_REQ_ACK;
508                         if (++qp->s_cur == qp->s_size)
509                                 qp->s_cur = 0;
510                         break;
511
512                 case IB_WR_ATOMIC_CMP_AND_SWP:
513                 case IB_WR_ATOMIC_FETCH_AND_ADD:
514                         /*
515                          * Don't allow more operations to be started
516                          * than the QP limits allow.
517                          */
518                         if (newreq) {
519                                 if (qp->s_num_rd_atomic >=
520                                     qp->s_max_rd_atomic) {
521                                         qp->s_flags |= RVT_S_WAIT_RDMAR;
522                                         goto bail;
523                                 }
524                                 qp->s_num_rd_atomic++;
525                                 if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
526                                         qp->s_lsn++;
527                         }
528                         if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
529                                 qp->s_state = OP(COMPARE_SWAP);
530                                 put_ib_ateth_swap(wqe->atomic_wr.swap,
531                                                   &ohdr->u.atomic_eth);
532                                 put_ib_ateth_compare(wqe->atomic_wr.compare_add,
533                                                      &ohdr->u.atomic_eth);
534                         } else {
535                                 qp->s_state = OP(FETCH_ADD);
536                                 put_ib_ateth_swap(wqe->atomic_wr.compare_add,
537                                                   &ohdr->u.atomic_eth);
538                                 put_ib_ateth_compare(0, &ohdr->u.atomic_eth);
539                         }
540                         put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr,
541                                            &ohdr->u.atomic_eth);
542                         ohdr->u.atomic_eth.rkey = cpu_to_be32(
543                                 wqe->atomic_wr.rkey);
544                         hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
545                         ss = NULL;
546                         len = 0;
547                         bth2 |= IB_BTH_REQ_ACK;
548                         if (++qp->s_cur == qp->s_size)
549                                 qp->s_cur = 0;
550                         break;
551
552                 default:
553                         goto bail;
554                 }
555                 qp->s_sge.sge = wqe->sg_list[0];
556                 qp->s_sge.sg_list = wqe->sg_list + 1;
557                 qp->s_sge.num_sge = wqe->wr.num_sge;
558                 qp->s_sge.total_len = wqe->length;
559                 qp->s_len = wqe->length;
560                 if (newreq) {
561                         qp->s_tail++;
562                         if (qp->s_tail >= qp->s_size)
563                                 qp->s_tail = 0;
564                 }
565                 if (wqe->wr.opcode == IB_WR_RDMA_READ)
566                         qp->s_psn = wqe->lpsn + 1;
567                 else
568                         qp->s_psn++;
569                 break;
570
571         case OP(RDMA_READ_RESPONSE_FIRST):
572                 /*
573                  * qp->s_state is normally set to the opcode of the
574                  * last packet constructed for new requests and therefore
575                  * is never set to RDMA read response.
576                  * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
577                  * thread to indicate a SEND needs to be restarted from an
578                  * earlier PSN without interfering with the sending thread.
579                  * See restart_rc().
580                  */
581                 qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
582                 /* FALLTHROUGH */
583         case OP(SEND_FIRST):
584                 qp->s_state = OP(SEND_MIDDLE);
585                 /* FALLTHROUGH */
586         case OP(SEND_MIDDLE):
587                 bth2 = mask_psn(qp->s_psn++);
588                 ss = &qp->s_sge;
589                 len = qp->s_len;
590                 if (len > pmtu) {
591                         len = pmtu;
592                         middle = HFI1_CAP_IS_KSET(SDMA_AHG);
593                         break;
594                 }
595                 if (wqe->wr.opcode == IB_WR_SEND) {
596                         qp->s_state = OP(SEND_LAST);
597                 } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
598                         qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
599                         /* Immediate data comes after the BTH */
600                         ohdr->u.imm_data = wqe->wr.ex.imm_data;
601                         hwords += 1;
602                 } else {
603                         qp->s_state = OP(SEND_LAST_WITH_INVALIDATE);
604                         /* invalidate data comes after the BTH */
605                         ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey);
606                         hwords += 1;
607                 }
608                 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
609                         bth0 |= IB_BTH_SOLICITED;
610                 bth2 |= IB_BTH_REQ_ACK;
611                 qp->s_cur++;
612                 if (qp->s_cur >= qp->s_size)
613                         qp->s_cur = 0;
614                 break;
615
616         case OP(RDMA_READ_RESPONSE_LAST):
617                 /*
618                  * qp->s_state is normally set to the opcode of the
619                  * last packet constructed for new requests and therefore
620                  * is never set to RDMA read response.
621                  * RDMA_READ_RESPONSE_LAST is used by the ACK processing
622                  * thread to indicate a RDMA write needs to be restarted from
623                  * an earlier PSN without interfering with the sending thread.
624                  * See restart_rc().
625                  */
626                 qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
627                 /* FALLTHROUGH */
628         case OP(RDMA_WRITE_FIRST):
629                 qp->s_state = OP(RDMA_WRITE_MIDDLE);
630                 /* FALLTHROUGH */
631         case OP(RDMA_WRITE_MIDDLE):
632                 bth2 = mask_psn(qp->s_psn++);
633                 ss = &qp->s_sge;
634                 len = qp->s_len;
635                 if (len > pmtu) {
636                         len = pmtu;
637                         middle = HFI1_CAP_IS_KSET(SDMA_AHG);
638                         break;
639                 }
640                 if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
641                         qp->s_state = OP(RDMA_WRITE_LAST);
642                 } else {
643                         qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
644                         /* Immediate data comes after the BTH */
645                         ohdr->u.imm_data = wqe->wr.ex.imm_data;
646                         hwords += 1;
647                         if (wqe->wr.send_flags & IB_SEND_SOLICITED)
648                                 bth0 |= IB_BTH_SOLICITED;
649                 }
650                 bth2 |= IB_BTH_REQ_ACK;
651                 qp->s_cur++;
652                 if (qp->s_cur >= qp->s_size)
653                         qp->s_cur = 0;
654                 break;
655
656         case OP(RDMA_READ_RESPONSE_MIDDLE):
657                 /*
658                  * qp->s_state is normally set to the opcode of the
659                  * last packet constructed for new requests and therefore
660                  * is never set to RDMA read response.
661                  * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
662                  * thread to indicate a RDMA read needs to be restarted from
663                  * an earlier PSN without interfering with the sending thread.
664                  * See restart_rc().
665                  */
666                 len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
667                 put_ib_reth_vaddr(
668                         wqe->rdma_wr.remote_addr + len,
669                         &ohdr->u.rc.reth);
670                 ohdr->u.rc.reth.rkey =
671                         cpu_to_be32(wqe->rdma_wr.rkey);
672                 ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
673                 qp->s_state = OP(RDMA_READ_REQUEST);
674                 hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
675                 bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
676                 qp->s_psn = wqe->lpsn + 1;
677                 ss = NULL;
678                 len = 0;
679                 qp->s_cur++;
680                 if (qp->s_cur == qp->s_size)
681                         qp->s_cur = 0;
682                 break;
683         }
684         qp->s_sending_hpsn = bth2;
685         delta = delta_psn(bth2, wqe->psn);
686         if (delta && delta % HFI1_PSN_CREDIT == 0)
687                 bth2 |= IB_BTH_REQ_ACK;
688         if (qp->s_flags & RVT_S_SEND_ONE) {
689                 qp->s_flags &= ~RVT_S_SEND_ONE;
690                 qp->s_flags |= RVT_S_WAIT_ACK;
691                 bth2 |= IB_BTH_REQ_ACK;
692         }
693         qp->s_len -= len;
694         qp->s_hdrwords = hwords;
695         ps->s_txreq->sde = priv->s_sde;
696         ps->s_txreq->ss = ss;
697         ps->s_txreq->s_cur_size = len;
698         hfi1_make_ruc_header(
699                 qp,
700                 ohdr,
701                 bth0 | (qp->s_state << 24),
702                 bth2,
703                 middle,
704                 ps);
705         /* pbc */
706         ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
707         return 1;
708
709 done_free_tx:
710         hfi1_put_txreq(ps->s_txreq);
711         ps->s_txreq = NULL;
712         return 1;
713
714 bail:
715         hfi1_put_txreq(ps->s_txreq);
716
717 bail_no_tx:
718         ps->s_txreq = NULL;
719         qp->s_flags &= ~RVT_S_BUSY;
720         qp->s_hdrwords = 0;
721         return 0;
722 }
723
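/*
 * Fill in the AETH and BTH words of an ACK/NAK header: the AETH carries
 * either the pending NAK code (r_nak_state) or the credit/MSN value from
 * rvt_compute_aeth(), and bth2 carries the PSN being acknowledged.
 */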
724 static inline void hfi1_make_bth_aeth(struct rvt_qp *qp,
725                                       struct ib_other_headers *ohdr,
726                                       u32 bth0, u32 bth1)
727 {
728         if (qp->r_nak_state)
729                 ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
730                                             (qp->r_nak_state <<
731                                              IB_AETH_CREDIT_SHIFT));
732         else
733                 ohdr->u.aeth = rvt_compute_aeth(qp);
734
735         ohdr->bth[0] = cpu_to_be32(bth0);
736         ohdr->bth[1] = cpu_to_be32(bth1 | qp->remote_qpn);
737         ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));
738 }
739
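/*
 * Hand responsibility for sending the ACK to the send engine: mark the
 * response pending on the QP, latch the NAK state and ACK PSN, and
 * schedule the send tasklet so make_rc_ack() emits it later.
 */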
740 static inline void hfi1_queue_rc_ack(struct rvt_qp *qp, bool is_fecn)
741 {
742         struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
743         unsigned long flags;
744
745         spin_lock_irqsave(&qp->s_lock, flags);
746         if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
747                 goto unlock;
748         this_cpu_inc(*ibp->rvp.rc_qacks);
749         qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING;
750         qp->s_nak_state = qp->r_nak_state;
751         qp->s_ack_psn = qp->r_ack_psn;
752         if (is_fecn)
753                 qp->s_flags |= RVT_S_ECN;
754
755         /* Schedule the send tasklet. */
756         hfi1_schedule_send(qp);
757 unlock:
758         spin_unlock_irqrestore(&qp->s_lock, flags);
759 }
760
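/*
 * Build the 9B (IB LRH) flavor of an RC ACK header: LRH + BTH + AETH,
 * plus a GRH when the remote AH requests one.  *hwords returns the
 * header length in 32-bit words for the PBC/PIO length calculation.
 */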
761 static inline void hfi1_make_rc_ack_9B(struct rvt_qp *qp,
762                                        struct hfi1_opa_header *opa_hdr,
763                                        u8 sc5, bool is_fecn,
764                                        u64 *pbc_flags, u32 *hwords,
765                                        u32 *nwords)
766 {
767         struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
768         struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
769         struct ib_header *hdr = &opa_hdr->ibh;
770         struct ib_other_headers *ohdr;
771         u16 lrh0 = HFI1_LRH_BTH;
772         u16 pkey;
773         u32 bth0, bth1;
774
775         opa_hdr->hdr_type = HFI1_PKT_TYPE_9B;
776         ohdr = &hdr->u.oth;
777         /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */
778         *hwords = 6;
779
780         if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) {
781                 *hwords += hfi1_make_grh(ibp, &hdr->u.l.grh,
782                                          rdma_ah_read_grh(&qp->remote_ah_attr),
783                                          *hwords - 2, SIZE_OF_CRC);
784                 ohdr = &hdr->u.l.oth;
785                 lrh0 = HFI1_LRH_GRH;
786         }
787         /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
788         *pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);
789
790         /* read pkey_index w/o lock (it's atomic) */
791         pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
792
793         lrh0 |= (sc5 & IB_SC_MASK) << IB_SC_SHIFT |
794                 (rdma_ah_get_sl(&qp->remote_ah_attr) & IB_SL_MASK) <<
795                         IB_SL_SHIFT;
796
797         hfi1_make_ib_hdr(hdr, lrh0, *hwords + SIZE_OF_CRC,
798                          opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 9B),
799                          ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr));
800
801         bth0 = pkey | (OP(ACKNOWLEDGE) << 24);
802         if (qp->s_mig_state == IB_MIG_MIGRATED)
803                 bth0 |= IB_BTH_MIG_REQ;
804         bth1 = (!!is_fecn) << IB_BECN_SHIFT;
805         hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
806 }
807
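/*
 * Build the 16B (OPA extended LRH) flavor of an RC ACK header.  Besides
 * *hwords, this also fills *nwords with the trailing pad/LT/ICRC dwords
 * so the caller can size the PIO buffer correctly.
 */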
808 static inline void hfi1_make_rc_ack_16B(struct rvt_qp *qp,
809                                         struct hfi1_opa_header *opa_hdr,
810                                         u8 sc5, bool is_fecn,
811                                         u64 *pbc_flags, u32 *hwords,
812                                         u32 *nwords)
813 {
814         struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
815         struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
816         struct hfi1_16b_header *hdr = &opa_hdr->opah;
817         struct ib_other_headers *ohdr;
818         u32 bth0, bth1;
819         u16 len, pkey;
820         u8 becn = !!is_fecn;
821         u8 l4 = OPA_16B_L4_IB_LOCAL;
822         u8 extra_bytes;
823
824         opa_hdr->hdr_type = HFI1_PKT_TYPE_16B;
825         ohdr = &hdr->u.oth;
826         /* header size in 32-bit words 16B LRH+BTH+AETH = (16+12+4)/4 */
827         *hwords = 8;
828         extra_bytes = hfi1_get_16b_padding(*hwords << 2, 0);
829         *nwords = SIZE_OF_CRC + ((extra_bytes + SIZE_OF_LT) >> 2);
830
831         if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) &&
832             hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))) {
833                 *hwords += hfi1_make_grh(ibp, &hdr->u.l.grh,
834                                          rdma_ah_read_grh(&qp->remote_ah_attr),
835                                          *hwords - 4, *nwords);
836                 ohdr = &hdr->u.l.oth;
837                 l4 = OPA_16B_L4_IB_GLOBAL;
838         }
839         *pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC;
840
841         /* read pkey_index w/o lock (it's atomic) */
842         pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
843
844         /* Convert dwords to flits */
845         len = (*hwords + *nwords) >> 1;
846
847         hfi1_make_16b_hdr(hdr,
848                           ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr),
849                           opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr),
850                                       16B),
851                           len, pkey, becn, 0, l4, sc5);
852
853         bth0 = pkey | (OP(ACKNOWLEDGE) << 24);
854         bth0 |= extra_bytes << 20;
855         if (qp->s_mig_state == IB_MIG_MIGRATED)
856                 bth1 = OPA_BTH_MIG_REQ;
857         hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
858 }
859
860 typedef void (*hfi1_make_rc_ack)(struct rvt_qp *qp,
861                                  struct hfi1_opa_header *opa_hdr,
862                                  u8 sc5, bool is_fecn,
863                                  u64 *pbc_flags, u32 *hwords,
864                                  u32 *nwords);
865
866 /* We support only two types - 9B and 16B for now */
867 static const hfi1_make_rc_ack hfi1_make_rc_ack_tbl[2] = {
868         [HFI1_PKT_TYPE_9B] = &hfi1_make_rc_ack_9B,
869         [HFI1_PKT_TYPE_16B] = &hfi1_make_rc_ack_16B
870 };
871
872 /**
873  * hfi1_send_rc_ack - Construct an ACK packet and send it
874  * @qp: a pointer to the QP
875  *
876  * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
877  * Note that RDMA reads and atomics are handled in the
878  * send side QP state and send engine.
879  */
880 void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd,
881                       struct rvt_qp *qp, bool is_fecn)
882 {
883         struct hfi1_ibport *ibp = rcd_to_iport(rcd);
884         struct hfi1_qp_priv *priv = qp->priv;
885         struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
886         u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)];
887         u64 pbc, pbc_flags = 0;
888         u32 hwords = 0;
889         u32 nwords = 0;
890         u32 plen;
891         struct pio_buf *pbuf;
892         struct hfi1_opa_header opa_hdr;
893
894         /* clear the defer count */
895         qp->r_adefered = 0;
896
897         /* Don't send ACK or NAK if a RDMA read or atomic is pending. */
898         if (qp->s_flags & RVT_S_RESP_PENDING) {
899                 hfi1_queue_rc_ack(qp, is_fecn);
900                 return;
901         }
902
903         /* Ensure s_rdma_ack_cnt changes are committed */
904         smp_read_barrier_depends();
905         if (qp->s_rdma_ack_cnt) {
906                 hfi1_queue_rc_ack(qp, is_fecn);
907                 return;
908         }
909
910         /* Don't try to send ACKs if the link isn't ACTIVE */
911         if (driver_lstate(ppd) != IB_PORT_ACTIVE)
912                 return;
913
914         /* Make the appropriate header */
915         hfi1_make_rc_ack_tbl[priv->hdr_type](qp, &opa_hdr, sc5, is_fecn,
916                                              &pbc_flags, &hwords, &nwords);
917
918         plen = 2 /* PBC */ + hwords + nwords;
919         pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps,
920                          sc_to_vlt(ppd->dd, sc5), plen);
921         pbuf = sc_buffer_alloc(rcd->sc, plen, NULL, NULL);
922         if (!pbuf) {
923                 /*
924                  * We have no room to send at the moment.  Pass
925                  * responsibility for sending the ACK to the send engine
926                  * so that when enough buffer space becomes available,
927                  * the ACK is sent ahead of other outgoing packets.
928                  */
929                 hfi1_queue_rc_ack(qp, is_fecn);
930                 return;
931         }
932         trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
933                                &opa_hdr, ib_is_sc5(sc5));
934
935         /* write the pbc and data */
936         ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
937                                  (priv->hdr_type == HFI1_PKT_TYPE_9B ?
938                                  (void *)&opa_hdr.ibh :
939                                  (void *)&opa_hdr.opah), hwords);
940         return;
941 }
942
943 /**
944  * reset_psn - reset the QP state to send starting from PSN
945  * @qp: the QP
946  * @psn: the packet sequence number to restart at
947  *
948  * This is called from hfi1_rc_rcv() to process an incoming RC ACK
949  * for the given QP.
950  * Called at interrupt level with the QP s_lock held.
951  */
952 static void reset_psn(struct rvt_qp *qp, u32 psn)
953 {
954         u32 n = qp->s_acked;
955         struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
956         u32 opcode;
957
958         lockdep_assert_held(&qp->s_lock);
959         qp->s_cur = n;
960
961         /*
962          * If we are starting the request from the beginning,
963          * let the normal send code handle initialization.
964          */
965         if (cmp_psn(psn, wqe->psn) <= 0) {
966                 qp->s_state = OP(SEND_LAST);
967                 goto done;
968         }
969
970         /* Find the work request opcode corresponding to the given PSN. */
971         opcode = wqe->wr.opcode;
972         for (;;) {
973                 int diff;
974
975                 if (++n == qp->s_size)
976                         n = 0;
977                 if (n == qp->s_tail)
978                         break;
979                 wqe = rvt_get_swqe_ptr(qp, n);
980                 diff = cmp_psn(psn, wqe->psn);
981                 if (diff < 0)
982                         break;
983                 qp->s_cur = n;
984                 /*
985                  * If we are starting the request from the beginning,
986                  * let the normal send code handle initialization.
987                  */
988                 if (diff == 0) {
989                         qp->s_state = OP(SEND_LAST);
990                         goto done;
991                 }
992                 opcode = wqe->wr.opcode;
993         }
994
995         /*
996          * Set the state to restart in the middle of a request.
997          * Don't change the s_sge, s_cur_sge, or s_cur_size.
998          * See hfi1_make_rc_req().
999          */
1000         switch (opcode) {
1001         case IB_WR_SEND:
1002         case IB_WR_SEND_WITH_IMM:
1003                 qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
1004                 break;
1005
1006         case IB_WR_RDMA_WRITE:
1007         case IB_WR_RDMA_WRITE_WITH_IMM:
1008                 qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
1009                 break;
1010
1011         case IB_WR_RDMA_READ:
1012                 qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
1013                 break;
1014
1015         default:
1016                 /*
1017                  * This case shouldn't happen since it's only
1018                  * one PSN per req.
1019                  */
1020                 qp->s_state = OP(SEND_LAST);
1021         }
1022 done:
1023         qp->s_psn = psn;
1024         /*
1025          * Set RVT_S_WAIT_PSN as rc_complete() may start the timer
1026          * asynchronously before the send engine can get scheduled.
1027          * Doing it in hfi1_make_rc_req() is too late.
1028          */
1029         if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
1030             (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
1031                 qp->s_flags |= RVT_S_WAIT_PSN;
1032         qp->s_flags &= ~RVT_S_AHG_VALID;
1033 }
1034
1035 /*
1036  * Back up requester to resend the last un-ACKed request.
1037  * The QP r_lock and s_lock should be held and interrupts disabled.
1038  */
1039 void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
1040 {
1041         struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1042         struct hfi1_ibport *ibp;
1043
1044         lockdep_assert_held(&qp->r_lock);
1045         lockdep_assert_held(&qp->s_lock);
1046         if (qp->s_retry == 0) {
1047                 if (qp->s_mig_state == IB_MIG_ARMED) {
1048                         hfi1_migrate_qp(qp);
1049                         qp->s_retry = qp->s_retry_cnt;
1050                 } else if (qp->s_last == qp->s_acked) {
1051                         hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
1052                         rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1053                         return;
1054                 } else { /* need to handle delayed completion */
1055                         return;
1056                 }
1057         } else {
1058                 qp->s_retry--;
1059         }
1060
1061         ibp = to_iport(qp->ibqp.device, qp->port_num);
1062         if (wqe->wr.opcode == IB_WR_RDMA_READ)
1063                 ibp->rvp.n_rc_resends++;
1064         else
1065                 ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
1066
1067         qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
1068                          RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
1069                          RVT_S_WAIT_ACK);
1070         if (wait)
1071                 qp->s_flags |= RVT_S_SEND_ONE;
1072         reset_psn(qp, psn);
1073 }
1074
1075 /*
1076  * Set qp->s_sending_psn to the next PSN after the given one.
1077  * This would be psn+1 except when RDMA reads are present.
1078  */
1079 static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
1080 {
1081         struct rvt_swqe *wqe;
1082         u32 n = qp->s_last;
1083
1084         lockdep_assert_held(&qp->s_lock);
1085         /* Find the work request corresponding to the given PSN. */
1086         for (;;) {
1087                 wqe = rvt_get_swqe_ptr(qp, n);
1088                 if (cmp_psn(psn, wqe->lpsn) <= 0) {
1089                         if (wqe->wr.opcode == IB_WR_RDMA_READ)
1090                                 qp->s_sending_psn = wqe->lpsn + 1;
1091                         else
1092                                 qp->s_sending_psn = psn + 1;
1093                         break;
1094                 }
1095                 if (++n == qp->s_size)
1096                         n = 0;
1097                 if (n == qp->s_tail)
1098                         break;
1099         }
1100 }
1101
1102 /*
1103  * This should be called with the QP s_lock held and interrupts disabled.
1104  */
1105 void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
1106 {
1107         struct ib_other_headers *ohdr;
1108         struct hfi1_qp_priv *priv = qp->priv;
1109         struct rvt_swqe *wqe;
1110         struct ib_header *hdr = NULL;
1111         struct hfi1_16b_header *hdr_16b = NULL;
1112         u32 opcode;
1113         u32 psn;
1114
1115         lockdep_assert_held(&qp->s_lock);
1116         if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
1117                 return;
1118
1119         /* Find out where the BTH is */
1120         if (priv->hdr_type == HFI1_PKT_TYPE_9B) {
1121                 hdr = &opah->ibh;
1122                 if (ib_get_lnh(hdr) == HFI1_LRH_BTH)
1123                         ohdr = &hdr->u.oth;
1124                 else
1125                         ohdr = &hdr->u.l.oth;
1126         } else {
1127                 u8 l4;
1128
1129                 hdr_16b = &opah->opah;
1130                 l4  = hfi1_16B_get_l4(hdr_16b);
1131                 if (l4 == OPA_16B_L4_IB_LOCAL)
1132                         ohdr = &hdr_16b->u.oth;
1133                 else
1134                         ohdr = &hdr_16b->u.l.oth;
1135         }
1136
1137         opcode = ib_bth_get_opcode(ohdr);
1138         if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1139             opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
1140                 WARN_ON(!qp->s_rdma_ack_cnt);
1141                 qp->s_rdma_ack_cnt--;
1142                 return;
1143         }
1144
1145         psn = ib_bth_get_psn(ohdr);
1146         reset_sending_psn(qp, psn);
1147
1148         /*
1149          * Start timer after a packet requesting an ACK has been sent and
1150          * there are still requests that haven't been acked.
1151          */
1152         if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
1153             !(qp->s_flags &
1154                 (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
1155                 (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
1156                 rvt_add_retry_timer(qp);
1157
1158         while (qp->s_last != qp->s_acked) {
1159                 u32 s_last;
1160
1161                 wqe = rvt_get_swqe_ptr(qp, qp->s_last);
1162                 if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
1163                     cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
1164                         break;
1165                 s_last = qp->s_last;
1166                 trace_hfi1_qp_send_completion(qp, wqe, s_last);
1167                 if (++s_last >= qp->s_size)
1168                         s_last = 0;
1169                 qp->s_last = s_last;
1170                 /* see post_send() */
1171                 barrier();
1172                 rvt_put_swqe(wqe);
1173                 rvt_qp_swqe_complete(qp,
1174                                      wqe,
1175                                      ib_hfi1_wc_opcode[wqe->wr.opcode],
1176                                      IB_WC_SUCCESS);
1177         }
1178         /*
1179          * If we were waiting for sends to complete before re-sending,
1180          * and they are now complete, restart sending.
1181          */
1182         trace_hfi1_sendcomplete(qp, psn);
1183         if (qp->s_flags & RVT_S_WAIT_PSN &&
1184             cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1185                 qp->s_flags &= ~RVT_S_WAIT_PSN;
1186                 qp->s_sending_psn = qp->s_psn;
1187                 qp->s_sending_hpsn = qp->s_psn - 1;
1188                 hfi1_schedule_send(qp);
1189         }
1190 }
1191
1192 static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
1193 {
1194         qp->s_last_psn = psn;
1195 }
1196
1197 /*
1198  * Generate a SWQE completion.
1199  * This is similar to hfi1_send_complete but has to check to be sure
1200  * that the SGEs are not being referenced if the SWQE is being resent.
1201  */
1202 static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
1203                                          struct rvt_swqe *wqe,
1204                                          struct hfi1_ibport *ibp)
1205 {
1206         lockdep_assert_held(&qp->s_lock);
1207         /*
1208          * Don't decrement refcount and don't generate a
1209          * completion if the SWQE is being resent until the send
1210          * is finished.
1211          */
1212         if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
1213             cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1214                 u32 s_last;
1215
1216                 rvt_put_swqe(wqe);
1217                 s_last = qp->s_last;
1218                 trace_hfi1_qp_send_completion(qp, wqe, s_last);
1219                 if (++s_last >= qp->s_size)
1220                         s_last = 0;
1221                 qp->s_last = s_last;
1222                 /* see post_send() */
1223                 barrier();
1224                 rvt_qp_swqe_complete(qp,
1225                                      wqe,
1226                                      ib_hfi1_wc_opcode[wqe->wr.opcode],
1227                                      IB_WC_SUCCESS);
1228         } else {
1229                 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1230
1231                 this_cpu_inc(*ibp->rvp.rc_delayed_comp);
1232                 /*
1233                  * If send progress is not running, attempt to
1234                  * progress the SDMA queue.
1235                  */
1236                 if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
1237                         struct sdma_engine *engine;
1238                         u8 sl = rdma_ah_get_sl(&qp->remote_ah_attr);
1239                         u8 sc5;
1240
1241                         /* For now use sc to find engine */
1242                         sc5 = ibp->sl_to_sc[sl];
1243                         engine = qp_to_sdma_engine(qp, sc5);
1244                         sdma_engine_progress_schedule(engine);
1245                 }
1246         }
1247
1248         qp->s_retry = qp->s_retry_cnt;
1249         update_last_psn(qp, wqe->lpsn);
1250
1251         /*
1252          * If we are completing a request which is in the process of
1253          * being resent, we can stop re-sending it since we know the
1254          * responder has already seen it.
1255          */
1256         if (qp->s_acked == qp->s_cur) {
1257                 if (++qp->s_cur >= qp->s_size)
1258                         qp->s_cur = 0;
1259                 qp->s_acked = qp->s_cur;
1260                 wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
1261                 if (qp->s_acked != qp->s_tail) {
1262                         qp->s_state = OP(SEND_LAST);
1263                         qp->s_psn = wqe->psn;
1264                 }
1265         } else {
1266                 if (++qp->s_acked >= qp->s_size)
1267                         qp->s_acked = 0;
1268                 if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
1269                         qp->s_draining = 0;
1270                 wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1271         }
1272         return wqe;
1273 }
1274
1275 /**
1276  * do_rc_ack - process an incoming RC ACK
1277  * @qp: the QP the ACK came in on
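      * @aeth: the AETH (ACK extended transport header) from the packet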
1278  * @psn: the packet sequence number of the ACK
1279  * @opcode: the opcode of the ACK or response packet
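      * @val: the data returned in an atomic acknowledge, if any
      * @rcd: the receive context the packet arrived on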
1280  *
1281  * This is called from rc_rcv_resp() to process an incoming RC ACK
1282  * for the given QP.
1283  * May be called at interrupt level, with the QP s_lock held.
1284  * Returns 1 if OK, 0 if current operation should be aborted (NAK).
1285  */
1286 static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
1287                      u64 val, struct hfi1_ctxtdata *rcd)
1288 {
1289         struct hfi1_ibport *ibp;
1290         enum ib_wc_status status;
1291         struct rvt_swqe *wqe;
1292         int ret = 0;
1293         u32 ack_psn;
1294         int diff;
1295
1296         lockdep_assert_held(&qp->s_lock);
1297         /*
1298          * Note that NAKs implicitly ACK outstanding SEND and RDMA write
1299          * requests and implicitly NAK RDMA read and atomic requests issued
1300          * before the NAK'ed request.  The MSN won't include the NAK'ed
1301  * request but will include any ACK'ed requests.
1302          */
1303         ack_psn = psn;
1304         if (aeth >> IB_AETH_NAK_SHIFT)
1305                 ack_psn--;
1306         wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1307         ibp = rcd_to_iport(rcd);
1308
1309         /*
1310          * The MSN might be for a later WQE than the PSN indicates so
1311          * only complete WQEs that the PSN finishes.
1312          */
1313         while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
1314                 /*
1315                  * RDMA_READ_RESPONSE_ONLY is a special case since
1316                  * we want to generate completion events for everything
1317                  * before the RDMA read, copy the data, then generate
1318                  * the completion for the read.
1319                  */
1320                 if (wqe->wr.opcode == IB_WR_RDMA_READ &&
1321                     opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
1322                     diff == 0) {
1323                         ret = 1;
1324                         goto bail_stop;
1325                 }
1326                 /*
1327                  * If this request is a RDMA read or atomic, and the ACK is
1328                  * for a later operation, this ACK NAKs the RDMA read or
1329                  * atomic.  In other words, only a RDMA_READ_LAST or ONLY
1330                  * can ACK a RDMA read and likewise for atomic ops.  Note
1331                  * that the NAK case can only happen if relaxed ordering is
1332                  * used and requests are sent after an RDMA read or atomic
1333                  * is sent but before the response is received.
1334                  */
1335                 if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
1336                      (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
1337                     ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1338                       wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
1339                      (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
1340                         /* Retry this request. */
1341                         if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
1342                                 qp->r_flags |= RVT_R_RDMAR_SEQ;
1343                                 hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
1344                                 if (list_empty(&qp->rspwait)) {
1345                                         qp->r_flags |= RVT_R_RSP_SEND;
1346                                         rvt_get_qp(qp);
1347                                         list_add_tail(&qp->rspwait,
1348                                                       &rcd->qp_wait_list);
1349                                 }
1350                         }
1351                         /*
1352                          * No need to process the ACK/NAK since we are
1353                          * restarting an earlier request.
1354                          */
1355                         goto bail_stop;
1356                 }
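                     /*
                      * A completed atomic returns the prior contents of the
                      * remote address; copy it into the buffer given by the
                      * first SGE of the original atomic work request.
                      */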
1357                 if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1358                     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
1359                         u64 *vaddr = wqe->sg_list[0].vaddr;
1360                         *vaddr = val;
1361                 }
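                     /*
                      * A completed RDMA read or atomic frees one of the
                      * limited outstanding read/atomic slots; if the sender
                      * was waiting on a fence or on this response, resume it.
                      */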
1362                 if (qp->s_num_rd_atomic &&
1363                     (wqe->wr.opcode == IB_WR_RDMA_READ ||
1364                      wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1365                      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
1366                         qp->s_num_rd_atomic--;
1367                         /* Restart sending task if fence is complete */
1368                         if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
1369                             !qp->s_num_rd_atomic) {
1370                                 qp->s_flags &= ~(RVT_S_WAIT_FENCE |
1371                                                  RVT_S_WAIT_ACK);
1372                                 hfi1_schedule_send(qp);
1373                         } else if (qp->s_flags & RVT_S_WAIT_RDMAR) {
1374                                 qp->s_flags &= ~(RVT_S_WAIT_RDMAR |
1375                                                  RVT_S_WAIT_ACK);
1376                                 hfi1_schedule_send(qp);
1377                         }
1378                 }
1379                 wqe = do_rc_completion(qp, wqe, ibp);
1380                 if (qp->s_acked == qp->s_tail)
1381                         break;
1382         }
1383
1384         switch (aeth >> IB_AETH_NAK_SHIFT) {
1385         case 0:         /* ACK */
1386                 this_cpu_inc(*ibp->rvp.rc_acks);
1387                 if (qp->s_acked != qp->s_tail) {
1388                         /*
1389                          * We are expecting more ACKs so
1390                          * mod the retry timer.
1391                          */
1392                         rvt_mod_retry_timer(qp);
1393                         /*
1394                          * We can stop re-sending the earlier packets and
1395                          * continue with the next packet the receiver wants.
1396                          */
1397                         if (cmp_psn(qp->s_psn, psn) <= 0)
1398                                 reset_psn(qp, psn + 1);
1399                 } else {
1400                         /* No more acks - kill all timers */
1401                         rvt_stop_rc_timers(qp);
1402                         if (cmp_psn(qp->s_psn, psn) <= 0) {
1403                                 qp->s_state = OP(SEND_LAST);
1404                                 qp->s_psn = psn + 1;
1405                         }
1406                 }
1407                 if (qp->s_flags & RVT_S_WAIT_ACK) {
1408                         qp->s_flags &= ~RVT_S_WAIT_ACK;
1409                         hfi1_schedule_send(qp);
1410                 }
1411                 rvt_get_credit(qp, aeth);
1412                 qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1413                 qp->s_retry = qp->s_retry_cnt;
1414                 update_last_psn(qp, psn);
1415                 return 1;
1416
1417         case 1:         /* RNR NAK */
1418                 ibp->rvp.n_rnr_naks++;
1419                 if (qp->s_acked == qp->s_tail)
1420                         goto bail_stop;
1421                 if (qp->s_flags & RVT_S_WAIT_RNR)
1422                         goto bail_stop;
1423                 if (qp->s_rnr_retry == 0) {
1424                         status = IB_WC_RNR_RETRY_EXC_ERR;
1425                         goto class_b;
1426                 }
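                     /* an RNR retry count of 7 means retry indefinitely */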
1427                 if (qp->s_rnr_retry_cnt < 7)
1428                         qp->s_rnr_retry--;
1429
1430                 /* The last valid PSN is the previous PSN. */
1431                 update_last_psn(qp, psn - 1);
1432
1433                 ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
1434
1435                 reset_psn(qp, psn);
1436
1437                 qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
1438                 rvt_stop_rc_timers(qp);
1439                 rvt_add_rnr_timer(qp, aeth);
1440                 return 0;
1441
1442         case 3:         /* NAK */
1443                 if (qp->s_acked == qp->s_tail)
1444                         goto bail_stop;
1445                 /* The last valid PSN is the previous PSN. */
1446                 update_last_psn(qp, psn - 1);
1447                 switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
1448                         IB_AETH_CREDIT_MASK) {
1449                 case 0: /* PSN sequence error */
1450                         ibp->rvp.n_seq_naks++;
1451                         /*
1452                          * Back up to the responder's expected PSN.
1453                          * Note that we might get a NAK in the middle of an
1454                          * RDMA READ response which terminates the RDMA
1455                          * READ.
1456                          */
1457                         hfi1_restart_rc(qp, psn, 0);
1458                         hfi1_schedule_send(qp);
1459                         break;
1460
1461                 case 1: /* Invalid Request */
1462                         status = IB_WC_REM_INV_REQ_ERR;
1463                         ibp->rvp.n_other_naks++;
1464                         goto class_b;
1465
1466                 case 2: /* Remote Access Error */
1467                         status = IB_WC_REM_ACCESS_ERR;
1468                         ibp->rvp.n_other_naks++;
1469                         goto class_b;
1470
1471                 case 3: /* Remote Operation Error */
1472                         status = IB_WC_REM_OP_ERR;
1473                         ibp->rvp.n_other_naks++;
1474 class_b:
1475                         if (qp->s_last == qp->s_acked) {
1476                                 hfi1_send_complete(qp, wqe, status);
1477                                 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1478                         }
1479                         break;
1480
1481                 default:
1482                         /* Ignore other reserved NAK error codes */
1483                         goto reserved;
1484                 }
1485                 qp->s_retry = qp->s_retry_cnt;
1486                 qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1487                 goto bail_stop;
1488
1489         default:                /* 2: reserved */
1490 reserved:
1491                 /* Ignore reserved NAK codes. */
1492                 goto bail_stop;
1493         }
1494         /* cannot be reached  */
1495 bail_stop:
1496         rvt_stop_rc_timers(qp);
1497         return ret;
1498 }
1499
1500 /*
1501  * We have seen an out of sequence RDMA read middle or last packet.
1502  * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
1503  */
1504 static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
1505                          struct hfi1_ctxtdata *rcd)
1506 {
1507         struct rvt_swqe *wqe;
1508
1509         lockdep_assert_held(&qp->s_lock);
1510         /* Remove QP from retry timer */
1511         rvt_stop_rc_timers(qp);
1512
1513         wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1514
1515         while (cmp_psn(psn, wqe->lpsn) > 0) {
1516                 if (wqe->wr.opcode == IB_WR_RDMA_READ ||
1517                     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1518                     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
1519                         break;
1520                 wqe = do_rc_completion(qp, wqe, ibp);
1521         }
1522
1523         ibp->rvp.n_rdma_seq++;
1524         qp->r_flags |= RVT_R_RDMAR_SEQ;
1525         hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
1526         if (list_empty(&qp->rspwait)) {
1527                 qp->r_flags |= RVT_R_RSP_SEND;
1528                 rvt_get_qp(qp);
1529                 list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1530         }
1531 }
1532
1533 /**
1534  * rc_rcv_resp - process an incoming RC response packet
1535  * @packet: data packet information
1536  *
1537  * This is called from hfi1_rc_rcv() to process an incoming RC response
1538  * packet for the given QP.
1539  * Called at interrupt level.
1540  */
1541 static void rc_rcv_resp(struct hfi1_packet *packet)
1542 {
1543         struct hfi1_ctxtdata *rcd = packet->rcd;
1544         void *data = packet->payload;
1545         u32 tlen = packet->tlen;
1546         struct rvt_qp *qp = packet->qp;
1547         struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1548         struct ib_other_headers *ohdr = packet->ohdr;
1549         struct rvt_swqe *wqe;
1550         enum ib_wc_status status;
1551         unsigned long flags;
1552         int diff;
1553         u64 val;
1554         u32 aeth;
1555         u32 psn = ib_bth_get_psn(packet->ohdr);
1556         u32 pmtu = qp->pmtu;
1557         u16 hdrsize = packet->hlen;
1558         u8 opcode = packet->opcode;
1559         u8 pad = packet->pad;
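             /* pad bytes, the LT byte for 16B packets, and the 4-byte ICRC */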
1560         u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);
1561
1562         spin_lock_irqsave(&qp->s_lock, flags);
1563         trace_hfi1_ack(qp, psn);
1564
1565         /* Ignore invalid responses. */
1566         smp_read_barrier_depends(); /* see post_one_send */
1567         if (cmp_psn(psn, READ_ONCE(qp->s_next_psn)) >= 0)
1568                 goto ack_done;
1569
1570         /* Ignore duplicate responses. */
1571         diff = cmp_psn(psn, qp->s_last_psn);
1572         if (unlikely(diff <= 0)) {
1573                 /* Update credits for "ghost" ACKs */
1574                 if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
1575                         aeth = be32_to_cpu(ohdr->u.aeth);
1576                         if ((aeth >> IB_AETH_NAK_SHIFT) == 0)
1577                                 rvt_get_credit(qp, aeth);
1578                 }
1579                 goto ack_done;
1580         }
1581
1582         /*
1583          * Skip everything other than the PSN we expect, if we are waiting
1584          * for a reply to a restarted RDMA read or atomic op.
1585          */
1586         if (qp->r_flags & RVT_R_RDMAR_SEQ) {
1587                 if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
1588                         goto ack_done;
1589                 qp->r_flags &= ~RVT_R_RDMAR_SEQ;
1590         }
1591
1592         if (unlikely(qp->s_acked == qp->s_tail))
1593                 goto ack_done;
1594         wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1595         status = IB_WC_SUCCESS;
1596
1597         switch (opcode) {
1598         case OP(ACKNOWLEDGE):
1599         case OP(ATOMIC_ACKNOWLEDGE):
1600         case OP(RDMA_READ_RESPONSE_FIRST):
1601                 aeth = be32_to_cpu(ohdr->u.aeth);
1602                 if (opcode == OP(ATOMIC_ACKNOWLEDGE))
1603                         val = ib_u64_get(&ohdr->u.at.atomic_ack_eth);
1604                 else
1605                         val = 0;
1606                 if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
1607                     opcode != OP(RDMA_READ_RESPONSE_FIRST))
1608                         goto ack_done;
1609                 wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1610                 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1611                         goto ack_op_err;
1612                 /*
1613                  * If this is a response to a resent RDMA read, we
1614                  * have to be careful to copy the data to the right
1615                  * location.
1616                  */
1617                 qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1618                                                   wqe, psn, pmtu);
1619                 goto read_middle;
1620
1621         case OP(RDMA_READ_RESPONSE_MIDDLE):
1622                 /* no AETH, no ACK */
1623                 if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
1624                         goto ack_seq_err;
1625                 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1626                         goto ack_op_err;
1627 read_middle:
1628                 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
1629                         goto ack_len_err;
1630                 if (unlikely(pmtu >= qp->s_rdma_read_len))
1631                         goto ack_len_err;
1632
1633                 /*
1634                  * We got a response so update the timeout.
1635                  * 4.096 usec. * (1 << qp->timeout)
1636                  */
1637                 rvt_mod_retry_timer(qp);
1638                 if (qp->s_flags & RVT_S_WAIT_ACK) {
1639                         qp->s_flags &= ~RVT_S_WAIT_ACK;
1640                         hfi1_schedule_send(qp);
1641                 }
1642
1643                 if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
1644                         qp->s_retry = qp->s_retry_cnt;
1645
1646                 /*
1647                  * Update the RDMA receive state but do the copy w/o
1648                  * holding the locks and blocking interrupts.
1649                  */
1650                 qp->s_rdma_read_len -= pmtu;
1651                 update_last_psn(qp, psn);
1652                 spin_unlock_irqrestore(&qp->s_lock, flags);
1653                 hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, false, false);
1654                 goto bail;
1655
1656         case OP(RDMA_READ_RESPONSE_ONLY):
1657                 aeth = be32_to_cpu(ohdr->u.aeth);
1658                 if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
1659                         goto ack_done;
1660                 /*
1661                  * Check that the data size is >= 0 && <= pmtu.
1662                  * Remember to account for ICRC (4).
1663                  */
1664                 if (unlikely(tlen < (hdrsize + extra_bytes)))
1665                         goto ack_len_err;
1666                 /*
1667                  * If this is a response to a resent RDMA read, we
1668                  * have to be careful to copy the data to the right
1669                  * location.
1670                  */
1671                 wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1672                 qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1673                                                   wqe, psn, pmtu);
1674                 goto read_last;
1675
1676         case OP(RDMA_READ_RESPONSE_LAST):
1677                 /* ACKs READ req. */
1678                 if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
1679                         goto ack_seq_err;
1680                 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1681                         goto ack_op_err;
1682                 /*
1683                  * Check that the data size is >= 1 && <= pmtu.
1684                  * Remember to account for ICRC (4).
1685                  */
1686                 if (unlikely(tlen <= (hdrsize + extra_bytes)))
1687                         goto ack_len_err;
1688 read_last:
1689                 tlen -= hdrsize + extra_bytes;
1690                 if (unlikely(tlen != qp->s_rdma_read_len))
1691                         goto ack_len_err;
1692                 aeth = be32_to_cpu(ohdr->u.aeth);
1693                 hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, false, false);
1694                 WARN_ON(qp->s_rdma_read_sge.num_sge);
1695                 (void)do_rc_ack(qp, aeth, psn,
1696                                  OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
1697                 goto ack_done;
1698         }
1699
1700 ack_op_err:
1701         status = IB_WC_LOC_QP_OP_ERR;
1702         goto ack_err;
1703
1704 ack_seq_err:
1705         rdma_seq_err(qp, ibp, psn, rcd);
1706         goto ack_done;
1707
1708 ack_len_err:
1709         status = IB_WC_LOC_LEN_ERR;
1710 ack_err:
1711         if (qp->s_last == qp->s_acked) {
1712                 hfi1_send_complete(qp, wqe, status);
1713                 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1714         }
1715 ack_done:
1716         spin_unlock_irqrestore(&qp->s_lock, flags);
1717 bail:
1718         return;
1719 }
1720
1721 static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
1722                                   struct rvt_qp *qp)
1723 {
1724         if (list_empty(&qp->rspwait)) {
1725                 qp->r_flags |= RVT_R_RSP_NAK;
1726                 rvt_get_qp(qp);
1727                 list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1728         }
1729 }
1730
1731 static inline void rc_cancel_ack(struct rvt_qp *qp)
1732 {
1733         qp->r_adefered = 0;
1734         if (list_empty(&qp->rspwait))
1735                 return;
1736         list_del_init(&qp->rspwait);
1737         qp->r_flags &= ~RVT_R_RSP_NAK;
1738         rvt_put_qp(qp);
1739 }
1740
1741 /**
1742  * rc_rcv_error - process an incoming duplicate or error RC packet
1743  * @ohdr: the other headers for this packet
1744  * @data: the packet data
1745  * @qp: the QP for this packet
1746  * @opcode: the opcode for this packet
1747  * @psn: the packet sequence number for this packet
1748  * @diff: the difference between the PSN and the expected PSN
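      * @rcd: the receive context the packet arrived on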
1749  *
1750  * This is called from hfi1_rc_rcv() to process an unexpected
1751  * incoming RC packet for the given QP.
1752  * Called at interrupt level.
1753  * Return 1 if no more processing is needed; otherwise return 0 to
1754  * schedule a response to be sent.
1755  */
1756 static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
1757                                  struct rvt_qp *qp, u32 opcode, u32 psn,
1758                                  int diff, struct hfi1_ctxtdata *rcd)
1759 {
1760         struct hfi1_ibport *ibp = rcd_to_iport(rcd);
1761         struct rvt_ack_entry *e;
1762         unsigned long flags;
1763         u8 i, prev;
1764         int old_req;
1765
1766         trace_hfi1_rcv_error(qp, psn);
1767         if (diff > 0) {
1768                 /*
1769                  * Packet sequence error.
1770                  * A NAK will ACK earlier sends and RDMA writes.
1771                  * Don't queue the NAK if we already sent one.
1772                  */
1773                 if (!qp->r_nak_state) {
1774                         ibp->rvp.n_rc_seqnak++;
1775                         qp->r_nak_state = IB_NAK_PSN_ERROR;
1776                         /* Use the expected PSN. */
1777                         qp->r_ack_psn = qp->r_psn;
1778                         /*
1779                          * Wait to send the sequence NAK until all packets
1780                          * in the receive queue have been processed.
1781                          * Otherwise, we end up propagating congestion.
1782                          */
1783                         rc_defered_ack(rcd, qp);
1784                 }
1785                 goto done;
1786         }
1787
1788         /*
1789          * Handle a duplicate request.  Don't re-execute SEND, RDMA
1790          * write or atomic op.  Don't NAK errors, just silently drop
1791          * the duplicate request.  Note that r_sge, r_len, and
1792          * r_rcv_len may be in use so don't modify them.
1793          *
1794          * We are supposed to ACK the earliest duplicate PSN but we
1795          * can coalesce an outstanding duplicate ACK.  We have to
1796          * send the earliest so that RDMA reads can be restarted at
1797          * the requester's expected PSN.
1798          *
1799          * First, find where this duplicate PSN falls within the
1800          * ACKs previously sent.
1801          * old_req is true if there is an older response that is scheduled
1802          * to be sent before sending this one.
1803          */
1804         e = NULL;
1805         old_req = 1;
1806         ibp->rvp.n_rc_dupreq++;
1807
1808         spin_lock_irqsave(&qp->s_lock, flags);
1809
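             /*
              * Search backward from the most recently queued response toward
              * the one currently being sent.  The ack queue holds
              * HFI1_MAX_RDMA_ATOMIC + 1 entries, so the previous index wraps
              * at HFI1_MAX_RDMA_ATOMIC.
              */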
1810         for (i = qp->r_head_ack_queue; ; i = prev) {
1811                 if (i == qp->s_tail_ack_queue)
1812                         old_req = 0;
1813                 if (i)
1814                         prev = i - 1;
1815                 else
1816                         prev = HFI1_MAX_RDMA_ATOMIC;
1817                 if (prev == qp->r_head_ack_queue) {
1818                         e = NULL;
1819                         break;
1820                 }
1821                 e = &qp->s_ack_queue[prev];
1822                 if (!e->opcode) {
1823                         e = NULL;
1824                         break;
1825                 }
1826                 if (cmp_psn(psn, e->psn) >= 0) {
1827                         if (prev == qp->s_tail_ack_queue &&
1828                             cmp_psn(psn, e->lpsn) <= 0)
1829                                 old_req = 0;
1830                         break;
1831                 }
1832         }
1833         switch (opcode) {
1834         case OP(RDMA_READ_REQUEST): {
1835                 struct ib_reth *reth;
1836                 u32 offset;
1837                 u32 len;
1838
1839                 /*
1840                  * If we didn't find the RDMA read request in the ack queue,
1841                  * we can ignore this request.
1842                  */
1843                 if (!e || e->opcode != OP(RDMA_READ_REQUEST))
1844                         goto unlock_done;
1845                 /* RETH comes after BTH */
1846                 reth = &ohdr->u.rc.reth;
1847                 /*
1848                  * Address range must be a subset of the original
1849                  * request and start on pmtu boundaries.
1850                  * We reuse the old ack_queue slot since the requester
1851                  * should not back up and request an earlier PSN for the
1852                  * same request.
1853                  */
1854                 offset = delta_psn(psn, e->psn) * qp->pmtu;
1855                 len = be32_to_cpu(reth->length);
1856                 if (unlikely(offset + len != e->rdma_sge.sge_length))
1857                         goto unlock_done;
1858                 if (e->rdma_sge.mr) {
1859                         rvt_put_mr(e->rdma_sge.mr);
1860                         e->rdma_sge.mr = NULL;
1861                 }
1862                 if (len != 0) {
1863                         u32 rkey = be32_to_cpu(reth->rkey);
1864                         u64 vaddr = get_ib_reth_vaddr(reth);
1865                         int ok;
1866
1867                         ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
1868                                          IB_ACCESS_REMOTE_READ);
1869                         if (unlikely(!ok))
1870                                 goto unlock_done;
1871                 } else {
1872                         e->rdma_sge.vaddr = NULL;
1873                         e->rdma_sge.length = 0;
1874                         e->rdma_sge.sge_length = 0;
1875                 }
1876                 e->psn = psn;
1877                 if (old_req)
1878                         goto unlock_done;
1879                 qp->s_tail_ack_queue = prev;
1880                 break;
1881         }
1882
1883         case OP(COMPARE_SWAP):
1884         case OP(FETCH_ADD): {
1885                 /*
1886                  * If we didn't find the atomic request in the ack queue
1887                  * or the send engine is already backed up to send an
1888                  * earlier entry, we can ignore this request.
1889                  */
1890                 if (!e || e->opcode != (u8)opcode || old_req)
1891                         goto unlock_done;
1892                 qp->s_tail_ack_queue = prev;
1893                 break;
1894         }
1895
1896         default:
1897                 /*
1898                  * Ignore this operation if it doesn't request an ACK
1899                  * or an earlier RDMA read or atomic is going to be resent.
1900                  */
1901                 if (!(psn & IB_BTH_REQ_ACK) || old_req)
1902                         goto unlock_done;
1903                 /*
1904                  * Resend the most recent ACK if this request is
1905                  * after all the previous RDMA reads and atomics.
1906                  */
1907                 if (i == qp->r_head_ack_queue) {
1908                         spin_unlock_irqrestore(&qp->s_lock, flags);
1909                         qp->r_nak_state = 0;
1910                         qp->r_ack_psn = qp->r_psn - 1;
1911                         goto send_ack;
1912                 }
1913
1914                 /*
1915                  * Resend the RDMA read or atomic op which
1916                  * ACKs this duplicate request.
1917                  */
1918                 qp->s_tail_ack_queue = i;
1919                 break;
1920         }
1921         qp->s_ack_state = OP(ACKNOWLEDGE);
1922         qp->s_flags |= RVT_S_RESP_PENDING;
1923         qp->r_nak_state = 0;
1924         hfi1_schedule_send(qp);
1925
1926 unlock_done:
1927         spin_unlock_irqrestore(&qp->s_lock, flags);
1928 done:
1929         return 1;
1930
1931 send_ack:
1932         return 0;
1933 }
1934
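     /*
      * Free ack queue entry n (whose response has already been sent) by
      * advancing s_tail_ack_queue past it and resetting the ack state machine.
      */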
1935 static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
1936 {
1937         unsigned next;
1938
1939         next = n + 1;
1940         if (next > HFI1_MAX_RDMA_ATOMIC)
1941                 next = 0;
1942         qp->s_tail_ack_queue = next;
1943         qp->s_ack_state = OP(ACKNOWLEDGE);
1944 }
1945
1946 static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
1947                           u32 lqpn, u32 rqpn, u8 svc_type)
1948 {
1949         struct opa_hfi1_cong_log_event_internal *cc_event;
1950         unsigned long flags;
1951
1952         if (sl >= OPA_MAX_SLS)
1953                 return;
1954
1955         spin_lock_irqsave(&ppd->cc_log_lock, flags);
1956
1957         ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8);
1958         ppd->threshold_event_counter++;
1959
1960         cc_event = &ppd->cc_events[ppd->cc_log_idx++];
1961         if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS)
1962                 ppd->cc_log_idx = 0;
1963         cc_event->lqpn = lqpn & RVT_QPN_MASK;
1964         cc_event->rqpn = rqpn & RVT_QPN_MASK;
1965         cc_event->sl = sl;
1966         cc_event->svc_type = svc_type;
1967         cc_event->rlid = rlid;
1968         /* keep timestamp in units of 1.024 usec */
1969         cc_event->timestamp = ktime_to_ns(ktime_get()) / 1024;
1970
1971         spin_unlock_irqrestore(&ppd->cc_log_lock, flags);
1972 }
1973
1974 void process_becn(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn,
1975                   u32 rqpn, u8 svc_type)
1976 {
1977         struct cca_timer *cca_timer;
1978         u16 ccti, ccti_incr, ccti_timer, ccti_limit;
1979         u8 trigger_threshold;
1980         struct cc_state *cc_state;
1981         unsigned long flags;
1982
1983         if (sl >= OPA_MAX_SLS)
1984                 return;
1985
1986         cc_state = get_cc_state(ppd);
1987
1988         if (!cc_state)
1989                 return;
1990
1991         /*
1992          * 1) increase CCTI (for this SL)
1993          * 2) select IPG (i.e., call set_link_ipg())
1994          * 3) start timer
1995          */
1996         ccti_limit = cc_state->cct.ccti_limit;
1997         ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase;
1998         ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
1999         trigger_threshold =
2000                 cc_state->cong_setting.entries[sl].trigger_threshold;
2001
2002         spin_lock_irqsave(&ppd->cca_timer_lock, flags);
2003
2004         cca_timer = &ppd->cca_timer[sl];
2005         if (cca_timer->ccti < ccti_limit) {
2006                 if (cca_timer->ccti + ccti_incr <= ccti_limit)
2007                         cca_timer->ccti += ccti_incr;
2008                 else
2009                         cca_timer->ccti = ccti_limit;
2010                 set_link_ipg(ppd);
2011         }
2012
2013         ccti = cca_timer->ccti;
2014
2015         if (!hrtimer_active(&cca_timer->hrtimer)) {
2016                 /* ccti_timer is in units of 1.024 usec */
2017                 unsigned long nsec = 1024 * ccti_timer;
2018
2019                 hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec),
2020                               HRTIMER_MODE_REL);
2021         }
2022
2023         spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
2024
2025         if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
2026                 log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
2027 }
2028
2029 /**
2030  * hfi1_rc_rcv - process an incoming RC packet
2031  * @packet: data packet information
2032  *
2033  * This is called from qp_rcv() to process an incoming RC packet
2034  * for the given QP.
2035  * May be called at interrupt level.
2036  */
2037 void hfi1_rc_rcv(struct hfi1_packet *packet)
2038 {
2039         struct hfi1_ctxtdata *rcd = packet->rcd;
2040         void *data = packet->payload;
2041         u32 tlen = packet->tlen;
2042         struct rvt_qp *qp = packet->qp;
2043         struct hfi1_ibport *ibp = rcd_to_iport(rcd);
2044         struct ib_other_headers *ohdr = packet->ohdr;
2045         u32 bth0 = be32_to_cpu(ohdr->bth[0]);
2046         u32 opcode = packet->opcode;
2047         u32 hdrsize = packet->hlen;
2048         u32 psn = ib_bth_get_psn(packet->ohdr);
2049         u32 pad = packet->pad;
2050         struct ib_wc wc;
2051         u32 pmtu = qp->pmtu;
2052         int diff;
2053         struct ib_reth *reth;
2054         unsigned long flags;
2055         int ret;
2056         bool is_fecn = false;
2057         bool copy_last = false;
2058         u32 rkey;
2059         u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);
2060
2061         lockdep_assert_held(&qp->r_lock);
2062
2063         if (hfi1_ruc_check_hdr(ibp, packet))
2064                 return;
2065
2066         is_fecn = process_ecn(qp, packet, false);
2067
2068         /*
2069          * Process responses (ACKs) before anything else.  Note that the
2070          * packet sequence number will be for something in the send work
2071          * queue rather than the expected receive packet sequence number.
2072          * In other words, this QP is the requester.
2073          */
2074         if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
2075             opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
2076                 rc_rcv_resp(packet);
2077                 if (is_fecn)
2078                         goto send_ack;
2079                 return;
2080         }
2081
2082         /* Compute 24 bits worth of difference. */
2083         diff = delta_psn(psn, qp->r_psn);
2084         if (unlikely(diff)) {
2085                 if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
2086                         return;
2087                 goto send_ack;
2088         }
2089
2090         /* Check for opcode sequence errors. */
2091         switch (qp->r_state) {
2092         case OP(SEND_FIRST):
2093         case OP(SEND_MIDDLE):
2094                 if (opcode == OP(SEND_MIDDLE) ||
2095                     opcode == OP(SEND_LAST) ||
2096                     opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
2097                     opcode == OP(SEND_LAST_WITH_INVALIDATE))
2098                         break;
2099                 goto nack_inv;
2100
2101         case OP(RDMA_WRITE_FIRST):
2102         case OP(RDMA_WRITE_MIDDLE):
2103                 if (opcode == OP(RDMA_WRITE_MIDDLE) ||
2104                     opcode == OP(RDMA_WRITE_LAST) ||
2105                     opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
2106                         break;
2107                 goto nack_inv;
2108
2109         default:
2110                 if (opcode == OP(SEND_MIDDLE) ||
2111                     opcode == OP(SEND_LAST) ||
2112                     opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
2113                     opcode == OP(SEND_LAST_WITH_INVALIDATE) ||
2114                     opcode == OP(RDMA_WRITE_MIDDLE) ||
2115                     opcode == OP(RDMA_WRITE_LAST) ||
2116                     opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
2117                         goto nack_inv;
2118                 /*
2119                  * Note that it is up to the requester to not send a new
2120                  * RDMA read or atomic operation before receiving an ACK
2121                  * for the previous operation.
2122                  */
2123                 break;
2124         }
2125
2126         if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
2127                 rvt_comm_est(qp);
2128
2129         /* OK, process the packet. */
2130         switch (opcode) {
2131         case OP(SEND_FIRST):
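                     /* < 0 is an error; 0 means no receive WQE is available */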
2132                 ret = hfi1_rvt_get_rwqe(qp, 0);
2133                 if (ret < 0)
2134                         goto nack_op_err;
2135                 if (!ret)
2136                         goto rnr_nak;
2137                 qp->r_rcv_len = 0;
2138                 /* FALLTHROUGH */
2139         case OP(SEND_MIDDLE):
2140         case OP(RDMA_WRITE_MIDDLE):
2141 send_middle:
2142                 /* Check for invalid length PMTU or posted rwqe len. */
2143                 /*
2144                  * There will be no padding for 9B packets, but 16B packets
2145                  * will come in with some padding since we always add
2146                  * CRC and LT bytes, which need to be flit aligned.
2147                  */
2148                 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
2149                         goto nack_inv;
2150                 qp->r_rcv_len += pmtu;
2151                 if (unlikely(qp->r_rcv_len > qp->r_len))
2152                         goto nack_inv;
2153                 hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false);
2154                 break;
2155
2156         case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
2157                 /* consume RWQE */
2158                 ret = hfi1_rvt_get_rwqe(qp, 1);
2159                 if (ret < 0)
2160                         goto nack_op_err;
2161                 if (!ret)
2162                         goto rnr_nak;
2163                 goto send_last_imm;
2164
2165         case OP(SEND_ONLY):
2166         case OP(SEND_ONLY_WITH_IMMEDIATE):
2167         case OP(SEND_ONLY_WITH_INVALIDATE):
2168                 ret = hfi1_rvt_get_rwqe(qp, 0);
2169                 if (ret < 0)
2170                         goto nack_op_err;
2171                 if (!ret)
2172                         goto rnr_nak;
2173                 qp->r_rcv_len = 0;
2174                 if (opcode == OP(SEND_ONLY))
2175                         goto no_immediate_data;
2176                 if (opcode == OP(SEND_ONLY_WITH_INVALIDATE))
2177                         goto send_last_inv;
2178                 /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
2179         case OP(SEND_LAST_WITH_IMMEDIATE):
2180 send_last_imm:
2181                 wc.ex.imm_data = ohdr->u.imm_data;
2182                 wc.wc_flags = IB_WC_WITH_IMM;
2183                 goto send_last;
2184         case OP(SEND_LAST_WITH_INVALIDATE):
2185 send_last_inv:
2186                 rkey = be32_to_cpu(ohdr->u.ieth);
2187                 if (rvt_invalidate_rkey(qp, rkey))
2188                         goto no_immediate_data;
2189                 wc.ex.invalidate_rkey = rkey;
2190                 wc.wc_flags = IB_WC_WITH_INVALIDATE;
2191                 goto send_last;
2192         case OP(RDMA_WRITE_LAST):
2193                 copy_last = rvt_is_user_qp(qp);
2194                 /* fall through */
2195         case OP(SEND_LAST):
2196 no_immediate_data:
2197                 wc.wc_flags = 0;
2198                 wc.ex.imm_data = 0;
2199 send_last:
2200                 /* Check for invalid length. */
2201                 /* LAST len should be >= 1 */
2202                 if (unlikely(tlen < (hdrsize + extra_bytes)))
2203                         goto nack_inv;
2204                 /* Don't count the CRC (and padding and LT byte for 16B). */
2205                 tlen -= (hdrsize + extra_bytes);
2206                 wc.byte_len = tlen + qp->r_rcv_len;
2207                 if (unlikely(wc.byte_len > qp->r_len))
2208                         goto nack_inv;
2209                 hfi1_copy_sge(&qp->r_sge, data, tlen, true, copy_last);
2210                 rvt_put_ss(&qp->r_sge);
2211                 qp->r_msn++;
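                     /*
                      * Only generate a receive completion if a receive WQE
                      * was actually consumed (RDMA writes without immediate
                      * data do not consume one).
                      */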
2212                 if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
2213                         break;
2214                 wc.wr_id = qp->r_wr_id;
2215                 wc.status = IB_WC_SUCCESS;
2216                 if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
2217                     opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
2218                         wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
2219                 else
2220                         wc.opcode = IB_WC_RECV;
2221                 wc.qp = &qp->ibqp;
2222                 wc.src_qp = qp->remote_qpn;
2223                 wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr);
2224                 /*
2225                  * It seems that IB mandates the presence of an SL in a
2226                  * work completion only for the UD transport (see section
2227                  * 11.4.2 of IBTA Vol. 1).
2228                  *
2229                  * However, the way the SL is chosen below is consistent
2230          * with the way that IB/qib works and is trying to avoid
2231                  * introducing incompatibilities.
2232                  *
2233                  * See also OPA Vol. 1, section 9.7.6, and table 9-17.
2234                  */
2235                 wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
2236                 /* zero fields that are N/A */
2237                 wc.vendor_err = 0;
2238                 wc.pkey_index = 0;
2239                 wc.dlid_path_bits = 0;
2240                 wc.port_num = 0;
2241                 /* Signal completion event if the solicited bit is set. */
2242                 rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
2243                              (bth0 & IB_BTH_SOLICITED) != 0);
2244                 break;
2245
2246         case OP(RDMA_WRITE_ONLY):
2247                 copy_last = rvt_is_user_qp(qp);
2248                 /* fall through */
2249         case OP(RDMA_WRITE_FIRST):
2250         case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
2251                 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
2252                         goto nack_inv;
2253                 /* consume RWQE */
2254                 reth = &ohdr->u.rc.reth;
2255                 qp->r_len = be32_to_cpu(reth->length);
2256                 qp->r_rcv_len = 0;
2257                 qp->r_sge.sg_list = NULL;
2258                 if (qp->r_len != 0) {
2259                         u32 rkey = be32_to_cpu(reth->rkey);
2260                         u64 vaddr = get_ib_reth_vaddr(reth);
2261                         int ok;
2262
2263                         /* Check rkey & NAK */
2264                         ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
2265                                          rkey, IB_ACCESS_REMOTE_WRITE);
2266                         if (unlikely(!ok))
2267                                 goto nack_acc;
2268                         qp->r_sge.num_sge = 1;
2269                 } else {
2270                         qp->r_sge.num_sge = 0;
2271                         qp->r_sge.sge.mr = NULL;
2272                         qp->r_sge.sge.vaddr = NULL;
2273                         qp->r_sge.sge.length = 0;
2274                         qp->r_sge.sge.sge_length = 0;
2275                 }
2276                 if (opcode == OP(RDMA_WRITE_FIRST))
2277                         goto send_middle;
2278                 else if (opcode == OP(RDMA_WRITE_ONLY))
2279                         goto no_immediate_data;
2280                 ret = hfi1_rvt_get_rwqe(qp, 1);
2281                 if (ret < 0)
2282                         goto nack_op_err;
2283                 if (!ret) {
2284                         /* peer will send again */
2285                         rvt_put_ss(&qp->r_sge);
2286                         goto rnr_nak;
2287                 }
2288                 wc.ex.imm_data = ohdr->u.rc.imm_data;
2289                 wc.wc_flags = IB_WC_WITH_IMM;
2290                 goto send_last;
2291
2292         case OP(RDMA_READ_REQUEST): {
2293                 struct rvt_ack_entry *e;
2294                 u32 len;
2295                 u8 next;
2296
2297                 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
2298                         goto nack_inv;
2299                 next = qp->r_head_ack_queue + 1;
2300                 /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
2301                 if (next > HFI1_MAX_RDMA_ATOMIC)
2302                         next = 0;
2303                 spin_lock_irqsave(&qp->s_lock, flags);
2304                 if (unlikely(next == qp->s_tail_ack_queue)) {
2305                         if (!qp->s_ack_queue[next].sent)
2306                                 goto nack_inv_unlck;
2307                         update_ack_queue(qp, next);
2308                 }
2309                 e = &qp->s_ack_queue[qp->r_head_ack_queue];
2310                 if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
2311                         rvt_put_mr(e->rdma_sge.mr);
2312                         e->rdma_sge.mr = NULL;
2313                 }
2314                 reth = &ohdr->u.rc.reth;
2315                 len = be32_to_cpu(reth->length);
2316                 if (len) {
2317                         u32 rkey = be32_to_cpu(reth->rkey);
2318                         u64 vaddr = get_ib_reth_vaddr(reth);
2319                         int ok;
2320
2321                         /* Check rkey & NAK */
2322                         ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr,
2323                                          rkey, IB_ACCESS_REMOTE_READ);
2324                         if (unlikely(!ok))
2325                                 goto nack_acc_unlck;
2326                         /*
2327                          * Update the next expected PSN.  We add 1 later
2328                          * below, so only add the remainder here.
2329                          */
2330                         qp->r_psn += rvt_div_mtu(qp, len - 1);
2331                 } else {
2332                         e->rdma_sge.mr = NULL;
2333                         e->rdma_sge.vaddr = NULL;
2334                         e->rdma_sge.length = 0;
2335                         e->rdma_sge.sge_length = 0;
2336                 }
2337                 e->opcode = opcode;
2338                 e->sent = 0;
2339                 e->psn = psn;
2340                 e->lpsn = qp->r_psn;
2341                 /*
2342                  * We need to increment the MSN here instead of when we
2343                  * finish sending the result since a duplicate request would
2344                  * increment it more than once.
2345                  */
2346                 qp->r_msn++;
2347                 qp->r_psn++;
2348                 qp->r_state = opcode;
2349                 qp->r_nak_state = 0;
2350                 qp->r_head_ack_queue = next;
2351
2352                 /* Schedule the send engine. */
2353                 qp->s_flags |= RVT_S_RESP_PENDING;
2354                 hfi1_schedule_send(qp);
2355
2356                 spin_unlock_irqrestore(&qp->s_lock, flags);
2357                 if (is_fecn)
2358                         goto send_ack;
2359                 return;
2360         }
2361
2362         case OP(COMPARE_SWAP):
2363         case OP(FETCH_ADD): {
2364                 struct ib_atomic_eth *ateth;
2365                 struct rvt_ack_entry *e;
2366                 u64 vaddr;
2367                 atomic64_t *maddr;
2368                 u64 sdata;
2369                 u32 rkey;
2370                 u8 next;
2371
2372                 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
2373                         goto nack_inv;
2374                 next = qp->r_head_ack_queue + 1;
2375                 if (next > HFI1_MAX_RDMA_ATOMIC)
2376                         next = 0;
2377                 spin_lock_irqsave(&qp->s_lock, flags);
2378                 if (unlikely(next == qp->s_tail_ack_queue)) {
2379                         if (!qp->s_ack_queue[next].sent)
2380                                 goto nack_inv_unlck;
2381                         update_ack_queue(qp, next);
2382                 }
2383                 e = &qp->s_ack_queue[qp->r_head_ack_queue];
2384                 if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
2385                         rvt_put_mr(e->rdma_sge.mr);
2386                         e->rdma_sge.mr = NULL;
2387                 }
2388                 ateth = &ohdr->u.atomic_eth;
2389                 vaddr = get_ib_ateth_vaddr(ateth);
2390                 if (unlikely(vaddr & (sizeof(u64) - 1)))
2391                         goto nack_inv_unlck;
2392                 rkey = be32_to_cpu(ateth->rkey);
2393                 /* Check rkey & NAK */
2394                 if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
2395                                           vaddr, rkey,
2396                                           IB_ACCESS_REMOTE_ATOMIC)))
2397                         goto nack_acc_unlck;
2398                 /* Perform atomic OP and save result. */
2399                 maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
2400                 sdata = get_ib_ateth_swap(ateth);
2401                 e->atomic_data = (opcode == OP(FETCH_ADD)) ?
2402                         (u64)atomic64_add_return(sdata, maddr) - sdata :
2403                         (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
2404                                       get_ib_ateth_compare(ateth),
2405                                       sdata);
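                     /*
                      * Either way, atomic_data now holds the original value
                      * of the target location, which is returned to the
                      * requester in the atomic acknowledge.
                      */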
2406                 rvt_put_mr(qp->r_sge.sge.mr);
2407                 qp->r_sge.num_sge = 0;
2408                 e->opcode = opcode;
2409                 e->sent = 0;
2410                 e->psn = psn;
2411                 e->lpsn = psn;
2412                 qp->r_msn++;
2413                 qp->r_psn++;
2414                 qp->r_state = opcode;
2415                 qp->r_nak_state = 0;
2416                 qp->r_head_ack_queue = next;
2417
2418                 /* Schedule the send engine. */
2419                 qp->s_flags |= RVT_S_RESP_PENDING;
2420                 hfi1_schedule_send(qp);
2421
2422                 spin_unlock_irqrestore(&qp->s_lock, flags);
2423                 if (is_fecn)
2424                         goto send_ack;
2425                 return;
2426         }
2427
2428         default:
2429                 /* NAK unknown opcodes. */
2430                 goto nack_inv;
2431         }
2432         qp->r_psn++;
2433         qp->r_state = opcode;
2434         qp->r_ack_psn = psn;
2435         qp->r_nak_state = 0;
2436         /* Send an ACK if requested or required. */
2437         if (psn & IB_BTH_REQ_ACK) {
2438                 if (packet->numpkt == 0) {
2439                         rc_cancel_ack(qp);
2440                         goto send_ack;
2441                 }
2442                 if (qp->r_adefered >= HFI1_PSN_CREDIT) {
2443                         rc_cancel_ack(qp);
2444                         goto send_ack;
2445                 }
2446                 if (unlikely(is_fecn)) {
2447                         rc_cancel_ack(qp);
2448                         goto send_ack;
2449                 }
2450                 qp->r_adefered++;
2451                 rc_defered_ack(rcd, qp);
2452         }
2453         return;
2454
2455 rnr_nak:
2456         qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK;
2457         qp->r_ack_psn = qp->r_psn;
2458         /* Queue RNR NAK for later */
2459         rc_defered_ack(rcd, qp);
2460         return;
2461
2462 nack_op_err:
2463         rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2464         qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
2465         qp->r_ack_psn = qp->r_psn;
2466         /* Queue NAK for later */
2467         rc_defered_ack(rcd, qp);
2468         return;
2469
2470 nack_inv_unlck:
2471         spin_unlock_irqrestore(&qp->s_lock, flags);
2472 nack_inv:
2473         rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2474         qp->r_nak_state = IB_NAK_INVALID_REQUEST;
2475         qp->r_ack_psn = qp->r_psn;
2476         /* Queue NAK for later */
2477         rc_defered_ack(rcd, qp);
2478         return;
2479
2480 nack_acc_unlck:
2481         spin_unlock_irqrestore(&qp->s_lock, flags);
2482 nack_acc:
2483         rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
2484         qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
2485         qp->r_ack_psn = qp->r_psn;
2486 send_ack:
2487         hfi1_send_rc_ack(rcd, qp, is_fecn);
2488 }
2489
2490 void hfi1_rc_hdrerr(
2491         struct hfi1_ctxtdata *rcd,
2492         struct hfi1_packet *packet,
2493         struct rvt_qp *qp)
2494 {
2495         struct hfi1_ibport *ibp = rcd_to_iport(rcd);
2496         int diff;
2497         u32 opcode;
2498         u32 psn;
2499
2500         if (hfi1_ruc_check_hdr(ibp, packet))
2501                 return;
2502
2503         psn = ib_bth_get_psn(packet->ohdr);
2504         opcode = ib_bth_get_opcode(packet->ohdr);
2505
2506         /* Only deal with RDMA Writes for now */
2507         if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
2508                 diff = delta_psn(psn, qp->r_psn);
2509                 if (!qp->r_nak_state && diff >= 0) {
2510                         ibp->rvp.n_rc_seqnak++;
2511                         qp->r_nak_state = IB_NAK_PSN_ERROR;
2512                         /* Use the expected PSN. */
2513                         qp->r_ack_psn = qp->r_psn;
2514                         /*
2515                          * Wait to send the sequence NAK until all packets
2516                          * in the receive queue have been processed.
2517                          * Otherwise, we end up propagating congestion.
2518                          */
2522                         rc_defered_ack(rcd, qp);
2523                 } /* Out of sequence NAK */
2524         } /* QP Request NAKs */
2525 }