vt_ioctl: fix GIO_UNIMAP regression
[platform/kernel/linux-rpi.git] / net / rds / ib_cm.c
1 /*
2  * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  *
32  */
33 #include <linux/dmapool.h>
34 #include <linux/kernel.h>
35 #include <linux/in.h>
36 #include <linux/slab.h>
37 #include <linux/vmalloc.h>
38 #include <linux/ratelimit.h>
39 #include <net/addrconf.h>
40 #include <rdma/ib_cm.h>
41
42 #include "rds_single_path.h"
43 #include "rds.h"
44 #include "ib.h"
45 #include "ib_mr.h"
46
47 /*
48  * Set the selected protocol version
49  */
50 static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version)
51 {
52         conn->c_version = version;
53 }
54
55 /*
56  * Set up flow control
57  */
58 static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits)
59 {
60         struct rds_ib_connection *ic = conn->c_transport_data;
61
62         if (rds_ib_sysctl_flow_control && credits != 0) {
63                 /* We're doing flow control */
64                 ic->i_flowctl = 1;
65                 rds_ib_send_add_credits(conn, credits);
66         } else {
67                 ic->i_flowctl = 0;
68         }
69 }
70
71 /*
72  * Tune RNR behavior. Without flow control, we use a rather
73  * low timeout, but not the absolute minimum - this should
74  * be tunable.
75  *
76  * We already set the RNR retry count to 7 (which is the
77  * smallest infinite number :-) above.
78  * If flow control is off, we want to change this back to 0
79  * so that we learn quickly when our credit accounting is
80  * buggy.
81  *
82  * Caller passes in a qp_attr pointer - don't waste stack spacv
83  * by allocation this twice.
84  */
85 static void
86 rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
87 {
88         int ret;
89
90         attr->min_rnr_timer = IB_RNR_TIMER_000_32;
91         ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
92         if (ret)
93                 printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret);
94 }
95
96 /*
97  * Connection established.
98  * We get here for both outgoing and incoming connection.
99  */
100 void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
101 {
102         struct rds_ib_connection *ic = conn->c_transport_data;
103         const union rds_ib_conn_priv *dp = NULL;
104         struct ib_qp_attr qp_attr;
105         __be64 ack_seq = 0;
106         __be32 credit = 0;
107         u8 major = 0;
108         u8 minor = 0;
109         int err;
110
111         dp = event->param.conn.private_data;
112         if (conn->c_isv6) {
113                 if (event->param.conn.private_data_len >=
114                     sizeof(struct rds6_ib_connect_private)) {
115                         major = dp->ricp_v6.dp_protocol_major;
116                         minor = dp->ricp_v6.dp_protocol_minor;
117                         credit = dp->ricp_v6.dp_credit;
118                         /* dp structure start is not guaranteed to be 8 bytes
119                          * aligned.  Since dp_ack_seq is 64-bit extended load
120                          * operations can be used so go through get_unaligned
121                          * to avoid unaligned errors.
122                          */
123                         ack_seq = get_unaligned(&dp->ricp_v6.dp_ack_seq);
124                 }
125         } else if (event->param.conn.private_data_len >=
126                    sizeof(struct rds_ib_connect_private)) {
127                 major = dp->ricp_v4.dp_protocol_major;
128                 minor = dp->ricp_v4.dp_protocol_minor;
129                 credit = dp->ricp_v4.dp_credit;
130                 ack_seq = get_unaligned(&dp->ricp_v4.dp_ack_seq);
131         }
132
133         /* make sure it isn't empty data */
134         if (major) {
135                 rds_ib_set_protocol(conn, RDS_PROTOCOL(major, minor));
136                 rds_ib_set_flow_control(conn, be32_to_cpu(credit));
137         }
138
139         if (conn->c_version < RDS_PROTOCOL_VERSION) {
140                 if (conn->c_version != RDS_PROTOCOL_COMPAT_VERSION) {
141                         pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n",
142                                   &conn->c_laddr, &conn->c_faddr,
143                                   RDS_PROTOCOL_MAJOR(conn->c_version),
144                                   RDS_PROTOCOL_MINOR(conn->c_version));
145                         rds_conn_destroy(conn);
146                         return;
147                 }
148         }
149
150         pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c,%d> version %u.%u%s\n",
151                   ic->i_active_side ? "Active" : "Passive",
152                   &conn->c_laddr, &conn->c_faddr, conn->c_tos,
153                   RDS_PROTOCOL_MAJOR(conn->c_version),
154                   RDS_PROTOCOL_MINOR(conn->c_version),
155                   ic->i_flowctl ? ", flow control" : "");
156
157         /* receive sl from the peer */
158         ic->i_sl = ic->i_cm_id->route.path_rec->sl;
159
160         atomic_set(&ic->i_cq_quiesce, 0);
161
162         /* Init rings and fill recv. this needs to wait until protocol
163          * negotiation is complete, since ring layout is different
164          * from 3.1 to 4.1.
165          */
166         rds_ib_send_init_ring(ic);
167         rds_ib_recv_init_ring(ic);
168         /* Post receive buffers - as a side effect, this will update
169          * the posted credit count. */
170         rds_ib_recv_refill(conn, 1, GFP_KERNEL);
171
172         /* Tune RNR behavior */
173         rds_ib_tune_rnr(ic, &qp_attr);
174
175         qp_attr.qp_state = IB_QPS_RTS;
176         err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
177         if (err)
178                 printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
179
180         /* update ib_device with this local ipaddr */
181         err = rds_ib_update_ipaddr(ic->rds_ibdev, &conn->c_laddr);
182         if (err)
183                 printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
184                         err);
185
186         /* If the peer gave us the last packet it saw, process this as if
187          * we had received a regular ACK. */
188         if (dp) {
189                 if (ack_seq)
190                         rds_send_drop_acked(conn, be64_to_cpu(ack_seq),
191                                             NULL);
192         }
193
194         conn->c_proposed_version = conn->c_version;
195         rds_connect_complete(conn);
196 }
197
198 static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
199                                       struct rdma_conn_param *conn_param,
200                                       union rds_ib_conn_priv *dp,
201                                       u32 protocol_version,
202                                       u32 max_responder_resources,
203                                       u32 max_initiator_depth,
204                                       bool isv6)
205 {
206         struct rds_ib_connection *ic = conn->c_transport_data;
207         struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
208
209         memset(conn_param, 0, sizeof(struct rdma_conn_param));
210
211         conn_param->responder_resources =
212                 min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
213         conn_param->initiator_depth =
214                 min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
215         conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
216         conn_param->rnr_retry_count = 7;
217
218         if (dp) {
219                 memset(dp, 0, sizeof(*dp));
220                 if (isv6) {
221                         dp->ricp_v6.dp_saddr = conn->c_laddr;
222                         dp->ricp_v6.dp_daddr = conn->c_faddr;
223                         dp->ricp_v6.dp_protocol_major =
224                             RDS_PROTOCOL_MAJOR(protocol_version);
225                         dp->ricp_v6.dp_protocol_minor =
226                             RDS_PROTOCOL_MINOR(protocol_version);
227                         dp->ricp_v6.dp_protocol_minor_mask =
228                             cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
229                         dp->ricp_v6.dp_ack_seq =
230                             cpu_to_be64(rds_ib_piggyb_ack(ic));
231                         dp->ricp_v6.dp_cmn.ricpc_dp_toss = conn->c_tos;
232
233                         conn_param->private_data = &dp->ricp_v6;
234                         conn_param->private_data_len = sizeof(dp->ricp_v6);
235                 } else {
236                         dp->ricp_v4.dp_saddr = conn->c_laddr.s6_addr32[3];
237                         dp->ricp_v4.dp_daddr = conn->c_faddr.s6_addr32[3];
238                         dp->ricp_v4.dp_protocol_major =
239                             RDS_PROTOCOL_MAJOR(protocol_version);
240                         dp->ricp_v4.dp_protocol_minor =
241                             RDS_PROTOCOL_MINOR(protocol_version);
242                         dp->ricp_v4.dp_protocol_minor_mask =
243                             cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
244                         dp->ricp_v4.dp_ack_seq =
245                             cpu_to_be64(rds_ib_piggyb_ack(ic));
246                         dp->ricp_v4.dp_cmn.ricpc_dp_toss = conn->c_tos;
247
248                         conn_param->private_data = &dp->ricp_v4;
249                         conn_param->private_data_len = sizeof(dp->ricp_v4);
250                 }
251
252                 /* Advertise flow control */
253                 if (ic->i_flowctl) {
254                         unsigned int credits;
255
256                         credits = IB_GET_POST_CREDITS
257                                 (atomic_read(&ic->i_credits));
258                         if (isv6)
259                                 dp->ricp_v6.dp_credit = cpu_to_be32(credits);
260                         else
261                                 dp->ricp_v4.dp_credit = cpu_to_be32(credits);
262                         atomic_sub(IB_SET_POST_CREDITS(credits),
263                                    &ic->i_credits);
264                 }
265         }
266 }
267
268 static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
269 {
270         rdsdebug("event %u (%s) data %p\n",
271                  event->event, ib_event_msg(event->event), data);
272 }
273
274 /* Plucking the oldest entry from the ring can be done concurrently with
275  * the thread refilling the ring.  Each ring operation is protected by
276  * spinlocks and the transient state of refilling doesn't change the
277  * recording of which entry is oldest.
278  *
279  * This relies on IB only calling one cq comp_handler for each cq so that
280  * there will only be one caller of rds_recv_incoming() per RDS connection.
281  */
282 static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context)
283 {
284         struct rds_connection *conn = context;
285         struct rds_ib_connection *ic = conn->c_transport_data;
286
287         rdsdebug("conn %p cq %p\n", conn, cq);
288
289         rds_ib_stats_inc(s_ib_evt_handler_call);
290
291         tasklet_schedule(&ic->i_recv_tasklet);
292 }
293
294 static void poll_scq(struct rds_ib_connection *ic, struct ib_cq *cq,
295                      struct ib_wc *wcs)
296 {
297         int nr, i;
298         struct ib_wc *wc;
299
300         while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
301                 for (i = 0; i < nr; i++) {
302                         wc = wcs + i;
303                         rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
304                                  (unsigned long long)wc->wr_id, wc->status,
305                                  wc->byte_len, be32_to_cpu(wc->ex.imm_data));
306
307                         if (wc->wr_id <= ic->i_send_ring.w_nr ||
308                             wc->wr_id == RDS_IB_ACK_WR_ID)
309                                 rds_ib_send_cqe_handler(ic, wc);
310                         else
311                                 rds_ib_mr_cqe_handler(ic, wc);
312
313                 }
314         }
315 }
316
317 static void rds_ib_tasklet_fn_send(unsigned long data)
318 {
319         struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
320         struct rds_connection *conn = ic->conn;
321
322         rds_ib_stats_inc(s_ib_tasklet_call);
323
324         /* if cq has been already reaped, ignore incoming cq event */
325         if (atomic_read(&ic->i_cq_quiesce))
326                 return;
327
328         poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
329         ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
330         poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
331
332         if (rds_conn_up(conn) &&
333             (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
334             test_bit(0, &conn->c_map_queued)))
335                 rds_send_xmit(&ic->conn->c_path[0]);
336 }
337
338 static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq,
339                      struct ib_wc *wcs,
340                      struct rds_ib_ack_state *ack_state)
341 {
342         int nr, i;
343         struct ib_wc *wc;
344
345         while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
346                 for (i = 0; i < nr; i++) {
347                         wc = wcs + i;
348                         rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
349                                  (unsigned long long)wc->wr_id, wc->status,
350                                  wc->byte_len, be32_to_cpu(wc->ex.imm_data));
351
352                         rds_ib_recv_cqe_handler(ic, wc, ack_state);
353                 }
354         }
355 }
356
357 static void rds_ib_tasklet_fn_recv(unsigned long data)
358 {
359         struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
360         struct rds_connection *conn = ic->conn;
361         struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
362         struct rds_ib_ack_state state;
363
364         if (!rds_ibdev)
365                 rds_conn_drop(conn);
366
367         rds_ib_stats_inc(s_ib_tasklet_call);
368
369         /* if cq has been already reaped, ignore incoming cq event */
370         if (atomic_read(&ic->i_cq_quiesce))
371                 return;
372
373         memset(&state, 0, sizeof(state));
374         poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
375         ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
376         poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
377
378         if (state.ack_next_valid)
379                 rds_ib_set_ack(ic, state.ack_next, state.ack_required);
380         if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
381                 rds_send_drop_acked(conn, state.ack_recv, NULL);
382                 ic->i_ack_recv = state.ack_recv;
383         }
384
385         if (rds_conn_up(conn))
386                 rds_ib_attempt_ack(ic);
387 }
388
389 static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
390 {
391         struct rds_connection *conn = data;
392         struct rds_ib_connection *ic = conn->c_transport_data;
393
394         rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
395                  ib_event_msg(event->event));
396
397         switch (event->event) {
398         case IB_EVENT_COMM_EST:
399                 rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
400                 break;
401         default:
402                 rdsdebug("Fatal QP Event %u (%s) - connection %pI6c->%pI6c, reconnecting\n",
403                          event->event, ib_event_msg(event->event),
404                          &conn->c_laddr, &conn->c_faddr);
405                 rds_conn_drop(conn);
406                 break;
407         }
408 }
409
410 static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context)
411 {
412         struct rds_connection *conn = context;
413         struct rds_ib_connection *ic = conn->c_transport_data;
414
415         rdsdebug("conn %p cq %p\n", conn, cq);
416
417         rds_ib_stats_inc(s_ib_evt_handler_call);
418
419         tasklet_schedule(&ic->i_send_tasklet);
420 }
421
422 static inline int ibdev_get_unused_vector(struct rds_ib_device *rds_ibdev)
423 {
424         int min = rds_ibdev->vector_load[rds_ibdev->dev->num_comp_vectors - 1];
425         int index = rds_ibdev->dev->num_comp_vectors - 1;
426         int i;
427
428         for (i = rds_ibdev->dev->num_comp_vectors - 1; i >= 0; i--) {
429                 if (rds_ibdev->vector_load[i] < min) {
430                         index = i;
431                         min = rds_ibdev->vector_load[i];
432                 }
433         }
434
435         rds_ibdev->vector_load[index]++;
436         return index;
437 }
438
439 static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index)
440 {
441         rds_ibdev->vector_load[index]--;
442 }
443
444 /* Allocate DMA coherent memory to be used to store struct rds_header for
445  * sending/receiving packets.  The pointers to the DMA memory and the
446  * associated DMA addresses are stored in two arrays.
447  *
448  * @ibdev: the IB device
449  * @pool: the DMA memory pool
450  * @dma_addrs: pointer to the array for storing DMA addresses
451  * @num_hdrs: number of headers to allocate
452  *
453  * It returns the pointer to the array storing the DMA memory pointers.  On
454  * error, NULL pointer is returned.
455  */
456 struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev,
457                                        struct dma_pool *pool,
458                                        dma_addr_t **dma_addrs, u32 num_hdrs)
459 {
460         struct rds_header **hdrs;
461         dma_addr_t *hdr_daddrs;
462         u32 i;
463
464         hdrs = kvmalloc_node(sizeof(*hdrs) * num_hdrs, GFP_KERNEL,
465                              ibdev_to_node(ibdev));
466         if (!hdrs)
467                 return NULL;
468
469         hdr_daddrs = kvmalloc_node(sizeof(*hdr_daddrs) * num_hdrs, GFP_KERNEL,
470                                    ibdev_to_node(ibdev));
471         if (!hdr_daddrs) {
472                 kvfree(hdrs);
473                 return NULL;
474         }
475
476         for (i = 0; i < num_hdrs; i++) {
477                 hdrs[i] = dma_pool_zalloc(pool, GFP_KERNEL, &hdr_daddrs[i]);
478                 if (!hdrs[i]) {
479                         rds_dma_hdrs_free(pool, hdrs, hdr_daddrs, i);
480                         return NULL;
481                 }
482         }
483
484         *dma_addrs = hdr_daddrs;
485         return hdrs;
486 }
487
488 /* Free the DMA memory used to store struct rds_header.
489  *
490  * @pool: the DMA memory pool
491  * @hdrs: pointer to the array storing DMA memory pointers
492  * @dma_addrs: pointer to the array storing DMA addresses
493  * @num_hdars: number of headers to free.
494  */
495 void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs,
496                        dma_addr_t *dma_addrs, u32 num_hdrs)
497 {
498         u32 i;
499
500         for (i = 0; i < num_hdrs; i++)
501                 dma_pool_free(pool, hdrs[i], dma_addrs[i]);
502         kvfree(hdrs);
503         kvfree(dma_addrs);
504 }
505
506 /*
507  * This needs to be very careful to not leave IS_ERR pointers around for
508  * cleanup to trip over.
509  */
510 static int rds_ib_setup_qp(struct rds_connection *conn)
511 {
512         struct rds_ib_connection *ic = conn->c_transport_data;
513         struct ib_device *dev = ic->i_cm_id->device;
514         struct ib_qp_init_attr attr;
515         struct ib_cq_init_attr cq_attr = {};
516         struct rds_ib_device *rds_ibdev;
517         unsigned long max_wrs;
518         int ret, fr_queue_space;
519         struct dma_pool *pool;
520
521         /*
522          * It's normal to see a null device if an incoming connection races
523          * with device removal, so we don't print a warning.
524          */
525         rds_ibdev = rds_ib_get_client_data(dev);
526         if (!rds_ibdev)
527                 return -EOPNOTSUPP;
528
529         /* The fr_queue_space is currently set to 512, to add extra space on
530          * completion queue and send queue. This extra space is used for FRWR
531          * registration and invalidation work requests
532          */
533         fr_queue_space = RDS_IB_DEFAULT_FR_WR;
534
535         /* add the conn now so that connection establishment has the dev */
536         rds_ib_add_conn(rds_ibdev, conn);
537
538         max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_send_wr + 1 ?
539                 rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_send_wr;
540         if (ic->i_send_ring.w_nr != max_wrs)
541                 rds_ib_ring_resize(&ic->i_send_ring, max_wrs);
542
543         max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_recv_wr + 1 ?
544                 rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_recv_wr;
545         if (ic->i_recv_ring.w_nr != max_wrs)
546                 rds_ib_ring_resize(&ic->i_recv_ring, max_wrs);
547
548         /* Protection domain and memory range */
549         ic->i_pd = rds_ibdev->pd;
550
551         ic->i_scq_vector = ibdev_get_unused_vector(rds_ibdev);
552         cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1;
553         cq_attr.comp_vector = ic->i_scq_vector;
554         ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send,
555                                      rds_ib_cq_event_handler, conn,
556                                      &cq_attr);
557         if (IS_ERR(ic->i_send_cq)) {
558                 ret = PTR_ERR(ic->i_send_cq);
559                 ic->i_send_cq = NULL;
560                 ibdev_put_vector(rds_ibdev, ic->i_scq_vector);
561                 rdsdebug("ib_create_cq send failed: %d\n", ret);
562                 goto rds_ibdev_out;
563         }
564
565         ic->i_rcq_vector = ibdev_get_unused_vector(rds_ibdev);
566         cq_attr.cqe = ic->i_recv_ring.w_nr;
567         cq_attr.comp_vector = ic->i_rcq_vector;
568         ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv,
569                                      rds_ib_cq_event_handler, conn,
570                                      &cq_attr);
571         if (IS_ERR(ic->i_recv_cq)) {
572                 ret = PTR_ERR(ic->i_recv_cq);
573                 ic->i_recv_cq = NULL;
574                 ibdev_put_vector(rds_ibdev, ic->i_rcq_vector);
575                 rdsdebug("ib_create_cq recv failed: %d\n", ret);
576                 goto send_cq_out;
577         }
578
579         ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
580         if (ret) {
581                 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
582                 goto recv_cq_out;
583         }
584
585         ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
586         if (ret) {
587                 rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
588                 goto recv_cq_out;
589         }
590
591         /* XXX negotiate max send/recv with remote? */
592         memset(&attr, 0, sizeof(attr));
593         attr.event_handler = rds_ib_qp_event_handler;
594         attr.qp_context = conn;
595         /* + 1 to allow for the single ack message */
596         attr.cap.max_send_wr = ic->i_send_ring.w_nr + fr_queue_space + 1;
597         attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
598         attr.cap.max_send_sge = rds_ibdev->max_sge;
599         attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
600         attr.sq_sig_type = IB_SIGNAL_REQ_WR;
601         attr.qp_type = IB_QPT_RC;
602         attr.send_cq = ic->i_send_cq;
603         attr.recv_cq = ic->i_recv_cq;
604
605         /*
606          * XXX this can fail if max_*_wr is too large?  Are we supposed
607          * to back off until we get a value that the hardware can support?
608          */
609         ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
610         if (ret) {
611                 rdsdebug("rdma_create_qp failed: %d\n", ret);
612                 goto recv_cq_out;
613         }
614
615         pool = rds_ibdev->rid_hdrs_pool;
616         ic->i_send_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_send_hdrs_dma,
617                                              ic->i_send_ring.w_nr);
618         if (!ic->i_send_hdrs) {
619                 ret = -ENOMEM;
620                 rdsdebug("DMA send hdrs alloc failed\n");
621                 goto qp_out;
622         }
623
624         ic->i_recv_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_recv_hdrs_dma,
625                                              ic->i_recv_ring.w_nr);
626         if (!ic->i_recv_hdrs) {
627                 ret = -ENOMEM;
628                 rdsdebug("DMA recv hdrs alloc failed\n");
629                 goto send_hdrs_dma_out;
630         }
631
632         ic->i_ack = dma_pool_zalloc(pool, GFP_KERNEL,
633                                     &ic->i_ack_dma);
634         if (!ic->i_ack) {
635                 ret = -ENOMEM;
636                 rdsdebug("DMA ack header alloc failed\n");
637                 goto recv_hdrs_dma_out;
638         }
639
640         ic->i_sends = vzalloc_node(array_size(sizeof(struct rds_ib_send_work),
641                                               ic->i_send_ring.w_nr),
642                                    ibdev_to_node(dev));
643         if (!ic->i_sends) {
644                 ret = -ENOMEM;
645                 rdsdebug("send allocation failed\n");
646                 goto ack_dma_out;
647         }
648
649         ic->i_recvs = vzalloc_node(array_size(sizeof(struct rds_ib_recv_work),
650                                               ic->i_recv_ring.w_nr),
651                                    ibdev_to_node(dev));
652         if (!ic->i_recvs) {
653                 ret = -ENOMEM;
654                 rdsdebug("recv allocation failed\n");
655                 goto sends_out;
656         }
657
658         rds_ib_recv_init_ack(ic);
659
660         rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd,
661                  ic->i_send_cq, ic->i_recv_cq);
662
663         goto out;
664
665 sends_out:
666         vfree(ic->i_sends);
667
668 ack_dma_out:
669         dma_pool_free(pool, ic->i_ack, ic->i_ack_dma);
670         ic->i_ack = NULL;
671
672 recv_hdrs_dma_out:
673         rds_dma_hdrs_free(pool, ic->i_recv_hdrs, ic->i_recv_hdrs_dma,
674                           ic->i_recv_ring.w_nr);
675         ic->i_recv_hdrs = NULL;
676         ic->i_recv_hdrs_dma = NULL;
677
678 send_hdrs_dma_out:
679         rds_dma_hdrs_free(pool, ic->i_send_hdrs, ic->i_send_hdrs_dma,
680                           ic->i_send_ring.w_nr);
681         ic->i_send_hdrs = NULL;
682         ic->i_send_hdrs_dma = NULL;
683
684 qp_out:
685         rdma_destroy_qp(ic->i_cm_id);
686 recv_cq_out:
687         ib_destroy_cq(ic->i_recv_cq);
688         ic->i_recv_cq = NULL;
689 send_cq_out:
690         ib_destroy_cq(ic->i_send_cq);
691         ic->i_send_cq = NULL;
692 rds_ibdev_out:
693         rds_ib_remove_conn(rds_ibdev, conn);
694 out:
695         rds_ib_dev_put(rds_ibdev);
696
697         return ret;
698 }
699
700 static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
701 {
702         const union rds_ib_conn_priv *dp = event->param.conn.private_data;
703         u8 data_len, major, minor;
704         u32 version = 0;
705         __be16 mask;
706         u16 common;
707
708         /*
709          * rdma_cm private data is odd - when there is any private data in the
710          * request, we will be given a pretty large buffer without telling us the
711          * original size. The only way to tell the difference is by looking at
712          * the contents, which are initialized to zero.
713          * If the protocol version fields aren't set, this is a connection attempt
714          * from an older version. This could be 3.0 or 2.0 - we can't tell.
715          * We really should have changed this for OFED 1.3 :-(
716          */
717
718         /* Be paranoid. RDS always has privdata */
719         if (!event->param.conn.private_data_len) {
720                 printk(KERN_NOTICE "RDS incoming connection has no private data, "
721                         "rejecting\n");
722                 return 0;
723         }
724
725         if (isv6) {
726                 data_len = sizeof(struct rds6_ib_connect_private);
727                 major = dp->ricp_v6.dp_protocol_major;
728                 minor = dp->ricp_v6.dp_protocol_minor;
729                 mask = dp->ricp_v6.dp_protocol_minor_mask;
730         } else {
731                 data_len = sizeof(struct rds_ib_connect_private);
732                 major = dp->ricp_v4.dp_protocol_major;
733                 minor = dp->ricp_v4.dp_protocol_minor;
734                 mask = dp->ricp_v4.dp_protocol_minor_mask;
735         }
736
737         /* Even if len is crap *now* I still want to check it. -ASG */
738         if (event->param.conn.private_data_len < data_len || major == 0)
739                 return RDS_PROTOCOL_4_0;
740
741         common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS;
742         if (major == 4 && common) {
743                 version = RDS_PROTOCOL_4_0;
744                 while ((common >>= 1) != 0)
745                         version++;
746         } else if (RDS_PROTOCOL_COMPAT_VERSION ==
747                    RDS_PROTOCOL(major, minor)) {
748                 version = RDS_PROTOCOL_COMPAT_VERSION;
749         } else {
750                 if (isv6)
751                         printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n",
752                                            &dp->ricp_v6.dp_saddr, major, minor);
753                 else
754                         printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
755                                            &dp->ricp_v4.dp_saddr, major, minor);
756         }
757         return version;
758 }
759
760 #if IS_ENABLED(CONFIG_IPV6)
761 /* Given an IPv6 address, find the net_device which hosts that address and
762  * return its index.  This is used by the rds_ib_cm_handle_connect() code to
763  * find the interface index of where an incoming request comes from when
764  * the request is using a link local address.
765  *
766  * Note one problem in this search.  It is possible that two interfaces have
767  * the same link local address.  Unfortunately, this cannot be solved unless
768  * the underlying layer gives us the interface which an incoming RDMA connect
769  * request comes from.
770  */
771 static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr)
772 {
773         struct net_device *dev;
774         int idx = 0;
775
776         rcu_read_lock();
777         for_each_netdev_rcu(net, dev) {
778                 if (ipv6_chk_addr(net, addr, dev, 1)) {
779                         idx = dev->ifindex;
780                         break;
781                 }
782         }
783         rcu_read_unlock();
784
785         return idx;
786 }
787 #endif
788
/*
 * Handle an incoming RDMA CM connect request for RDS/IB.
 *
 * @cm_id: the cm_id the request arrived on
 * @event: the connect event; its private data carries the peer's
 *         rds_ib_conn_priv (addresses, protocol version, credits, ack seq)
 * @isv6:  true when the private data uses the IPv6 layout (ricp_v6)
 *
 * Steps, in order:
 *  - check protocol compatibility with rds_ib_protocol_compatible()
 *  - pull source/destination addresses out of the private data (IPv4
 *    addresses are turned into v4-mapped IPv6 addresses)
 *  - look up or create the rds_connection with rds_conn_create()
 *  - under c_cm_lock, move the connection from DOWN to CONNECTING
 *  - attach cm_id to the connection, set up the QP and call rdma_accept()
 *
 * Any path that reaches "out" with a non-zero err calls rdma_reject(),
 * passing the err value as the reject private data.
 *
 * Returns 1 ("destroy") until cm_id has been attached to the connection and
 * 0 afterwards.  NOTE(review): presumably a non-zero return makes the caller
 * have the rdma_cm destroy the cm_id -- confirm against the event handler.
 */
789 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
790                              struct rdma_cm_event *event, bool isv6)
791 {
792         __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
793         __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
794         const struct rds_ib_conn_priv_cmn *dp_cmn;
795         struct rds_connection *conn = NULL;
796         struct rds_ib_connection *ic = NULL;
797         struct rdma_conn_param conn_param;
798         const union rds_ib_conn_priv *dp;
799         union rds_ib_conn_priv dp_rep;
800         struct in6_addr s_mapped_addr;
801         struct in6_addr d_mapped_addr;
802         const struct in6_addr *saddr6;
803         const struct in6_addr *daddr6;
804         int destroy = 1;
805         u32 ifindex = 0;
806         u32 version;
            /* Non-zero by default, so an early "goto out" rejects the request. */
807         int err = 1;
808
809         /* Check whether the remote protocol version matches ours. */
810         version = rds_ib_protocol_compatible(event, isv6);
811         if (!version) {
812                 err = RDS_RDMA_REJ_INCOMPAT;
813                 goto out;
814         }
815
816         dp = event->param.conn.private_data;
817         if (isv6) {
818 #if IS_ENABLED(CONFIG_IPV6)
819                 dp_cmn = &dp->ricp_v6.dp_cmn;
820                 saddr6 = &dp->ricp_v6.dp_saddr;
821                 daddr6 = &dp->ricp_v6.dp_daddr;
822                 /* If either address is link local, need to find the
823                  * interface index in order to create a proper RDS
824                  * connection.
825                  */
826                 if (ipv6_addr_type(daddr6) & IPV6_ADDR_LINKLOCAL) {
827                         /* Using init_net for now ..  */
828                         ifindex = __rds_find_ifindex(&init_net, daddr6);
829                         /* No index found...  Need to bail out. */
830                         if (ifindex == 0) {
831                                 err = -EOPNOTSUPP;
832                                 goto out;
833                         }
834                 } else if (ipv6_addr_type(saddr6) & IPV6_ADDR_LINKLOCAL) {
835                         /* Use our address to find the correct index. */
                            /* Note: the lookup is done on daddr6 (the request's
                             * destination, i.e. "our" address), not on saddr6.
                             */
836                         ifindex = __rds_find_ifindex(&init_net, daddr6);
837                         /* No index found...  Need to bail out. */
838                         if (ifindex == 0) {
839                                 err = -EOPNOTSUPP;
840                                 goto out;
841                         }
842                 }
843 #else
844                 err = -EOPNOTSUPP;
845                 goto out;
846 #endif
847         } else {
848                 dp_cmn = &dp->ricp_v4.dp_cmn;
849                 ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr);
850                 ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr);
851                 saddr6 = &s_mapped_addr;
852                 daddr6 = &d_mapped_addr;
853         }
854
855         rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid 0x%llx, tos:%d\n",
856                  saddr6, daddr6, RDS_PROTOCOL_MAJOR(version),
857                  RDS_PROTOCOL_MINOR(version),
858                  (unsigned long long)be64_to_cpu(lguid),
859                  (unsigned long long)be64_to_cpu(fguid), dp_cmn->ricpc_dp_toss);
860
861         /* RDS/IB is not currently netns aware, thus init_net */
862         conn = rds_conn_create(&init_net, daddr6, saddr6,
863                                &rds_ib_transport, dp_cmn->ricpc_dp_toss,
864                                GFP_KERNEL, ifindex);
865         if (IS_ERR(conn)) {
                    /* PTR_ERR(conn) is only logged; err keeps its initial value 1.
                     * conn is reset to NULL so the out path skips the unlock.
                     */
866                 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
867                 conn = NULL;
868                 goto out;
869         }
870
871         /*
872          * The connection request may occur while the
873          * previous connection exist, e.g. in case of failover.
874          * But as connections may be initiated simultaneously
875          * by both hosts, we have a random backoff mechanism -
876          * see the comment above rds_queue_reconnect()
877          */
878         mutex_lock(&conn->c_cm_lock);
879         if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
880                 if (rds_conn_state(conn) == RDS_CONN_UP) {
881                         rdsdebug("incoming connect while connecting\n");
882                         rds_conn_drop(conn);
883                         rds_ib_stats_inc(s_ib_listen_closed_stale);
884                 } else
885                 if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
886                         /* Wait and see - our connect may still be succeeding */
887                         rds_ib_stats_inc(s_ib_connect_raced);
888                 }
                    /* err is still 1 here, so this request is rejected below. */
889                 goto out;
890         }
891
892         ic = conn->c_transport_data;
893
894         rds_ib_set_protocol(conn, version);
895         rds_ib_set_flow_control(conn, be32_to_cpu(dp_cmn->ricpc_credit));
896
897         /* If the peer gave us the last packet it saw, process this as if
898          * we had received a regular ACK. */
899         if (dp_cmn->ricpc_ack_seq)
900                 rds_send_drop_acked(conn, be64_to_cpu(dp_cmn->ricpc_ack_seq),
901                                     NULL);
902
903         BUG_ON(cm_id->context);
904         BUG_ON(ic->i_cm_id);
905
906         ic->i_cm_id = cm_id;
907         cm_id->context = conn;
908
909         /* We got halfway through setting up the ib_connection, if we
910          * fail now, we have to take the long route out of this mess. */
911         destroy = 0;
912
913         err = rds_ib_setup_qp(conn);
914         if (err) {
915                 rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
916                 goto out;
917         }
918
919         rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
920                                   event->param.conn.responder_resources,
921                                   event->param.conn.initiator_depth, isv6);
922
923         /* rdma_accept() calls rdma_reject() internally if it fails */
            /* err is 0 at this point, so the out path below does not issue a
             * second rdma_reject() when rdma_accept() fails.
             */
924         if (rdma_accept(cm_id, &conn_param))
925                 rds_ib_conn_error(conn, "rdma_accept failed\n");
926
927 out:
            /* conn is only non-NULL here once c_cm_lock has been taken above. */
928         if (conn)
929                 mutex_unlock(&conn->c_cm_lock);
930         if (err)
931                 rdma_reject(cm_id, &err, sizeof(int),
932                             IB_CM_REJ_CONSUMER_DEFINED);
933         return destroy;
934 }
935
936
937 int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
938 {
939         struct rds_connection *conn = cm_id->context;
940         struct rds_ib_connection *ic = conn->c_transport_data;
941         struct rdma_conn_param conn_param;
942         union rds_ib_conn_priv dp;
943         int ret;
944
945         /* If the peer doesn't do protocol negotiation, we must
946          * default to RDSv3.0 */
947         rds_ib_set_protocol(conn, RDS_PROTOCOL_4_1);
948         ic->i_flowctl = rds_ib_sysctl_flow_control;     /* advertise flow control */
949
950         ret = rds_ib_setup_qp(conn);
951         if (ret) {
952                 rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret);
953                 goto out;
954         }
955
956         rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
957                                   conn->c_proposed_version,
958                                   UINT_MAX, UINT_MAX, isv6);
959         ret = rdma_connect(cm_id, &conn_param);
960         if (ret)
961                 rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
962
963 out:
964         /* Beware - returning non-zero tells the rdma_cm to destroy
965          * the cm_id. We should certainly not do it as long as we still
966          * "own" the cm_id. */
967         if (ret) {
968                 if (ic->i_cm_id == cm_id)
969                         ret = 0;
970         }
971         ic->i_active_side = true;
972         return ret;
973 }
974
975 int rds_ib_conn_path_connect(struct rds_conn_path *cp)
976 {
977         struct rds_connection *conn = cp->cp_conn;
978         struct sockaddr_storage src, dest;
979         rdma_cm_event_handler handler;
980         struct rds_ib_connection *ic;
981         int ret;
982
983         ic = conn->c_transport_data;
984
985         /* XXX I wonder what affect the port space has */
986         /* delegate cm event handler to rdma_transport */
987 #if IS_ENABLED(CONFIG_IPV6)
988         if (conn->c_isv6)
989                 handler = rds6_rdma_cm_event_handler;
990         else
991 #endif
992                 handler = rds_rdma_cm_event_handler;
993         ic->i_cm_id = rdma_create_id(&init_net, handler, conn,
994                                      RDMA_PS_TCP, IB_QPT_RC);
995         if (IS_ERR(ic->i_cm_id)) {
996                 ret = PTR_ERR(ic->i_cm_id);
997                 ic->i_cm_id = NULL;
998                 rdsdebug("rdma_create_id() failed: %d\n", ret);
999                 goto out;
1000         }
1001
1002         rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
1003
1004         if (ipv6_addr_v4mapped(&conn->c_faddr)) {
1005                 struct sockaddr_in *sin;
1006
1007                 sin = (struct sockaddr_in *)&src;
1008                 sin->sin_family = AF_INET;
1009                 sin->sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
1010                 sin->sin_port = 0;
1011
1012                 sin = (struct sockaddr_in *)&dest;
1013                 sin->sin_family = AF_INET;
1014                 sin->sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
1015                 sin->sin_port = htons(RDS_PORT);
1016         } else {
1017                 struct sockaddr_in6 *sin6;
1018
1019                 sin6 = (struct sockaddr_in6 *)&src;
1020                 sin6->sin6_family = AF_INET6;
1021                 sin6->sin6_addr = conn->c_laddr;
1022                 sin6->sin6_port = 0;
1023                 sin6->sin6_scope_id = conn->c_dev_if;
1024
1025                 sin6 = (struct sockaddr_in6 *)&dest;
1026                 sin6->sin6_family = AF_INET6;
1027                 sin6->sin6_addr = conn->c_faddr;
1028                 sin6->sin6_port = htons(RDS_CM_PORT);
1029                 sin6->sin6_scope_id = conn->c_dev_if;
1030         }
1031
1032         ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
1033                                 (struct sockaddr *)&dest,
1034                                 RDS_RDMA_RESOLVE_TIMEOUT_MS);
1035         if (ret) {
1036                 rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
1037                          ret);
1038                 rdma_destroy_id(ic->i_cm_id);
1039                 ic->i_cm_id = NULL;
1040         }
1041
1042 out:
1043         return ret;
1044 }
1045
1046 /*
1047  * This is so careful about only cleaning up resources that were built up
1048  * so that it can be called at any point during startup.  In fact it
1049  * can be called multiple times for a given connection.
      *
      * When a cm_id is attached, the order is: disconnect, flush MRs, wait
      * for the recv ring and the signaled-send / fastreg counters to settle,
      * kill the tasklets, destroy the QP and CQs, free the header and ack DMA
      * buffers, clear the work rings, destroy the cm_id and remove the
      * connection from its device.
      *
      * Whether or not a cm_id was attached, the function then drops any
      * pending data op and partial incoming message, resets the ACK and flow
      * control state, re-initializes both rings (keeping their sizes) and
      * frees the send/recv work arrays.
1050  */
1051 void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
1052 {
1053         struct rds_connection *conn = cp->cp_conn;
1054         struct rds_ib_connection *ic = conn->c_transport_data;
             /* Only used to report a failed rdma_disconnect() via rdsdebug. */
1055         int err = 0;
1056
1057         rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
1058                  ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
1059                  ic->i_cm_id ? ic->i_cm_id->qp : NULL);
1060
1061         if (ic->i_cm_id) {
1062                 rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
1063                 err = rdma_disconnect(ic->i_cm_id);
1064                 if (err) {
1065                         /* Actually this may happen quite frequently, when
1066                          * an outgoing connect raced with an incoming connect.
1067                          */
1068                         rdsdebug("failed to disconnect, cm: %p err %d\n",
1069                                 ic->i_cm_id, err);
1070                 }
1071
1072                 /* kick off "flush_worker" for all pools in order to reap
1073                  * all FRMR registrations that are still marked "FRMR_IS_INUSE"
1074                  */
1075                 rds_ib_flush_mrs();
1076
1077                 /*
1078                  * We want to wait for tx and rx completion to finish
1079                  * before we tear down the connection, but we have to be
1080                  * careful not to get stuck waiting on a send ring that
1081                  * only has unsignaled sends in it.  We've shutdown new
1082                  * sends before getting here so by waiting for signaled
1083                  * sends to complete we're ensured that there will be no
1084                  * more tx processing.
1085                  */
1086                 wait_event(rds_ib_ring_empty_wait,
1087                            rds_ib_ring_empty(&ic->i_recv_ring) &&
1088                            (atomic_read(&ic->i_signaled_sends) == 0) &&
1089                            (atomic_read(&ic->i_fastreg_inuse_count) == 0) &&
1090                            (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
1091                 tasklet_kill(&ic->i_send_tasklet);
1092                 tasklet_kill(&ic->i_recv_tasklet);
1093
1094                 atomic_set(&ic->i_cq_quiesce, 1);
1095
1096                 /* first destroy the ib state that generates callbacks */
1097                 if (ic->i_cm_id->qp)
1098                         rdma_destroy_qp(ic->i_cm_id);
1099                 if (ic->i_send_cq) {
1100                         if (ic->rds_ibdev)
1101                                 ibdev_put_vector(ic->rds_ibdev, ic->i_scq_vector);
1102                         ib_destroy_cq(ic->i_send_cq);
1103                 }
1104
1105                 if (ic->i_recv_cq) {
1106                         if (ic->rds_ibdev)
1107                                 ibdev_put_vector(ic->rds_ibdev, ic->i_rcq_vector);
1108                         ib_destroy_cq(ic->i_recv_cq);
1109                 }
1110
1111                 if (ic->rds_ibdev) {
1112                         struct dma_pool *pool;
1113
1114                         pool = ic->rds_ibdev->rid_hdrs_pool;
1115
1116                         /* then free the resources that ib callbacks use */
1117                         if (ic->i_send_hdrs) {
1118                                 rds_dma_hdrs_free(pool, ic->i_send_hdrs,
1119                                                   ic->i_send_hdrs_dma,
1120                                                   ic->i_send_ring.w_nr);
1121                                 ic->i_send_hdrs = NULL;
1122                                 ic->i_send_hdrs_dma = NULL;
1123                         }
1124
1125                         if (ic->i_recv_hdrs) {
1126                                 rds_dma_hdrs_free(pool, ic->i_recv_hdrs,
1127                                                   ic->i_recv_hdrs_dma,
1128                                                   ic->i_recv_ring.w_nr);
1129                                 ic->i_recv_hdrs = NULL;
1130                                 ic->i_recv_hdrs_dma = NULL;
1131                         }
1132
1133                         if (ic->i_ack) {
1134                                 dma_pool_free(pool, ic->i_ack, ic->i_ack_dma);
1135                                 ic->i_ack = NULL;
1136                         }
1137                 } else {
                             /* Without a device there is no pool to free into, so
                              * none of these buffers should exist; warn if they do.
                              */
1138                         WARN_ON(ic->i_send_hdrs);
1139                         WARN_ON(ic->i_send_hdrs_dma);
1140                         WARN_ON(ic->i_recv_hdrs);
1141                         WARN_ON(ic->i_recv_hdrs_dma);
1142                         WARN_ON(ic->i_ack);
1143                 }
1144
1145                 if (ic->i_sends)
1146                         rds_ib_send_clear_ring(ic);
1147                 if (ic->i_recvs)
1148                         rds_ib_recv_clear_ring(ic);
1149
1150                 rdma_destroy_id(ic->i_cm_id);
1151
1152                 /*
1153                  * Move connection back to the nodev list.
1154                  */
1155                 if (ic->rds_ibdev)
1156                         rds_ib_remove_conn(ic->rds_ibdev, conn);
1157
1158                 ic->i_cm_id = NULL;
1159                 ic->i_pd = NULL;
1160                 ic->i_send_cq = NULL;
1161                 ic->i_recv_cq = NULL;
1162         }
             /* NOTE(review): relies on rds_ib_remove_conn() (defined elsewhere)
              * having cleared ic->rds_ibdev -- confirm.
              */
1163         BUG_ON(ic->rds_ibdev);
1164
1165         /* Clear pending transmit */
1166         if (ic->i_data_op) {
1167                 struct rds_message *rm;
1168
1169                 rm = container_of(ic->i_data_op, struct rds_message, data);
1170                 rds_message_put(rm);
1171                 ic->i_data_op = NULL;
1172         }
1173
1174         /* Clear the ACK state */
1175         clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
1176 #ifdef KERNEL_HAS_ATOMIC64
1177         atomic64_set(&ic->i_ack_next, 0);
1178 #else
1179         ic->i_ack_next = 0;
1180 #endif
1181         ic->i_ack_recv = 0;
1182
1183         /* Clear flow control state */
1184         ic->i_flowctl = 0;
1185         atomic_set(&ic->i_credits, 0);
1186
1187         /* Re-init rings, but retain sizes. */
1188         rds_ib_ring_init(&ic->i_send_ring, ic->i_send_ring.w_nr);
1189         rds_ib_ring_init(&ic->i_recv_ring, ic->i_recv_ring.w_nr);
1190
             /* Drop the reference on a partially received incoming message. */
1191         if (ic->i_ibinc) {
1192                 rds_inc_put(&ic->i_ibinc->ii_inc);
1193                 ic->i_ibinc = NULL;
1194         }
1195
1196         vfree(ic->i_sends);
1197         ic->i_sends = NULL;
1198         vfree(ic->i_recvs);
1199         ic->i_recvs = NULL;
1200         ic->i_active_side = false;
1201 }
1202
1203 int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
1204 {
1205         struct rds_ib_connection *ic;
1206         unsigned long flags;
1207         int ret;
1208
1209         /* XXX too lazy? */
1210         ic = kzalloc(sizeof(struct rds_ib_connection), gfp);
1211         if (!ic)
1212                 return -ENOMEM;
1213
1214         ret = rds_ib_recv_alloc_caches(ic, gfp);
1215         if (ret) {
1216                 kfree(ic);
1217                 return ret;
1218         }
1219
1220         INIT_LIST_HEAD(&ic->ib_node);
1221         tasklet_init(&ic->i_send_tasklet, rds_ib_tasklet_fn_send,
1222                      (unsigned long)ic);
1223         tasklet_init(&ic->i_recv_tasklet, rds_ib_tasklet_fn_recv,
1224                      (unsigned long)ic);
1225         mutex_init(&ic->i_recv_mutex);
1226 #ifndef KERNEL_HAS_ATOMIC64
1227         spin_lock_init(&ic->i_ack_lock);
1228 #endif
1229         atomic_set(&ic->i_signaled_sends, 0);
1230         atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
1231
1232         /*
1233          * rds_ib_conn_shutdown() waits for these to be emptied so they
1234          * must be initialized before it can be called.
1235          */
1236         rds_ib_ring_init(&ic->i_send_ring, 0);
1237         rds_ib_ring_init(&ic->i_recv_ring, 0);
1238
1239         ic->conn = conn;
1240         conn->c_transport_data = ic;
1241
1242         spin_lock_irqsave(&ib_nodev_conns_lock, flags);
1243         list_add_tail(&ic->ib_node, &ib_nodev_conns);
1244         spin_unlock_irqrestore(&ib_nodev_conns_lock, flags);
1245
1246
1247         rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
1248         return 0;
1249 }
1250
1251 /*
1252  * Free a connection. Connection must be shut down and not set for reconnect.
1253  */
1254 void rds_ib_conn_free(void *arg)
1255 {
1256         struct rds_ib_connection *ic = arg;
1257         spinlock_t      *lock_ptr;
1258
1259         rdsdebug("ic %p\n", ic);
1260
1261         /*
1262          * Conn is either on a dev's list or on the nodev list.
1263          * A race with shutdown() or connect() would cause problems
1264          * (since rds_ibdev would change) but that should never happen.
1265          */
1266         lock_ptr = ic->rds_ibdev ? &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock;
1267
1268         spin_lock_irq(lock_ptr);
1269         list_del(&ic->ib_node);
1270         spin_unlock_irq(lock_ptr);
1271
1272         rds_ib_recv_free_caches(ic);
1273
1274         kfree(ic);
1275 }
1276
1277
/*
 * An error occurred on the connection: drop it first, then print the
 * caller's printf-style message.
 */
void __rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...)
{
        va_list args;

        rds_conn_drop(conn);

        va_start(args, fmt);
        vprintk(fmt, args);
        va_end(args);
}