1 /*
2  * Network block device - make block devices work over TCP
3  *
4  * Note that you cannot swap over this device yet; it seems to work, but
5  * it deadlocks sometimes - you cannot swap over TCP in general.
6  *
7  * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
8  * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
9  *
10  * This file is released under GPLv2 or later.
11  *
12  * (part of code stolen from loop.c)
13  */
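/*
 * Rough illustration of how userspace typically drives this driver, pieced
 * together from the ioctls handled in __nbd_ioctl() below (the variable
 * names are made up, error handling is omitted, and the real nbd-client may
 * order things differently):
 *
 *	int dev = open("/dev/nbd0", O_RDWR);
 *	int sock = ...;                          - a connected, negotiated TCP socket
 *
 *	ioctl(dev, NBD_SET_SOCK, sock);          - hand the socket to the kernel
 *	ioctl(dev, NBD_SET_BLKSIZE, 4096);
 *	ioctl(dev, NBD_SET_SIZE_BLOCKS, nr_blocks);
 *	ioctl(dev, NBD_SET_FLAGS, server_flags);
 *	ioctl(dev, NBD_DO_IT, 0);                - blocks until disconnect or error
 *	ioctl(dev, NBD_CLEAR_SOCK, 0);
 */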
14
15 #include <linux/major.h>
16
17 #include <linux/blkdev.h>
18 #include <linux/module.h>
19 #include <linux/init.h>
20 #include <linux/sched.h>
21 #include <linux/fs.h>
22 #include <linux/bio.h>
23 #include <linux/stat.h>
24 #include <linux/errno.h>
25 #include <linux/file.h>
26 #include <linux/ioctl.h>
27 #include <linux/mutex.h>
28 #include <linux/compiler.h>
29 #include <linux/err.h>
30 #include <linux/kernel.h>
31 #include <linux/slab.h>
32 #include <net/sock.h>
33 #include <linux/net.h>
34 #include <linux/kthread.h>
35 #include <linux/types.h>
36 #include <linux/debugfs.h>
37 #include <linux/blk-mq.h>
38
39 #include <linux/uaccess.h>
40 #include <asm/types.h>
41
42 #include <linux/nbd.h>
43
44 struct nbd_sock {
45         struct socket *sock;
46         struct mutex tx_lock;
47 };
48
49 #define NBD_TIMEDOUT                    0
50 #define NBD_DISCONNECT_REQUESTED        1
51 #define NBD_DISCONNECTED                2
52 #define NBD_RUNNING                     3
53
54 struct nbd_device {
55         u32 flags;
56         unsigned long runtime_flags;
57         struct nbd_sock **socks;
58         int magic;
59
60         struct blk_mq_tag_set tag_set;
61
62         struct mutex config_lock;
63         struct gendisk *disk;
64         int num_connections;
65         atomic_t recv_threads;
66         wait_queue_head_t recv_wq;
67         loff_t blksize;
68         loff_t bytesize;
69
70         struct task_struct *task_recv;
71         struct task_struct *task_setup;
72
73 #if IS_ENABLED(CONFIG_DEBUG_FS)
74         struct dentry *dbg_dir;
75 #endif
76 };
77
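/*
 * Per-request private data, allocated by blk-mq next to each request (see
 * tag_set.cmd_size in nbd_init()).  send_complete makes the receive path
 * wait until nbd_queue_rq() has finished walking the request's bios; see
 * the comment in nbd_queue_rq() for the race this closes.
 */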
78 struct nbd_cmd {
79         struct nbd_device *nbd;
80         struct completion send_complete;
81 };
82
83 #if IS_ENABLED(CONFIG_DEBUG_FS)
84 static struct dentry *nbd_dbg_dir;
85 #endif
86
87 #define nbd_name(nbd) ((nbd)->disk->disk_name)
88
89 #define NBD_MAGIC 0x68797548
90
91 static int nbds_max = 16;
92 static struct nbd_device *nbd_dev;
93 static int max_part;
94 static struct workqueue_struct *recv_workqueue;
95
96 static inline struct device *nbd_to_dev(struct nbd_device *nbd)
97 {
98         return disk_to_dev(nbd->disk);
99 }
100
101 static bool nbd_is_connected(struct nbd_device *nbd)
102 {
103         return !!nbd->task_recv;
104 }
105
106 static const char *nbdcmd_to_ascii(int cmd)
107 {
108         switch (cmd) {
109         case  NBD_CMD_READ: return "read";
110         case NBD_CMD_WRITE: return "write";
111         case  NBD_CMD_DISC: return "disconnect";
112         case NBD_CMD_FLUSH: return "flush";
113         case  NBD_CMD_TRIM: return "trim/discard";
114         }
115         return "invalid";
116 }
117
118 static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev)
119 {
120         bdev->bd_inode->i_size = 0;
121         set_capacity(nbd->disk, 0);
122         kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
123
124         return 0;
125 }
126
127 static void nbd_size_update(struct nbd_device *nbd, struct block_device *bdev)
128 {
129         if (!nbd_is_connected(nbd))
130                 return;
131
132         bdev->bd_inode->i_size = nbd->bytesize;
133         set_capacity(nbd->disk, nbd->bytesize >> 9);
134         kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
135 }
136
137 static int nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
138                         loff_t blocksize, loff_t nr_blocks)
139 {
140         int ret;
141
142         ret = set_blocksize(bdev, blocksize);
143         if (ret)
144                 return ret;
145
146         nbd->blksize = blocksize;
147         nbd->bytesize = blocksize * nr_blocks;
148
149         nbd_size_update(nbd, bdev);
150
151         return 0;
152 }
153
154 static void nbd_end_request(struct nbd_cmd *cmd)
155 {
156         struct nbd_device *nbd = cmd->nbd;
157         struct request *req = blk_mq_rq_from_pdu(cmd);
158         int error = req->errors ? -EIO : 0;
159
160         dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd,
161                 error ? "failed" : "done");
162
163         blk_mq_complete_request(req, error);
164 }
165
166 /*
167  * Forcibly shut down all sockets, causing any pending receivers to error out.
168  */
169 static void sock_shutdown(struct nbd_device *nbd)
170 {
171         int i;
172
173         if (nbd->num_connections == 0)
174                 return;
175         if (test_and_set_bit(NBD_DISCONNECTED, &nbd->runtime_flags))
176                 return;
177
178         for (i = 0; i < nbd->num_connections; i++) {
179                 struct nbd_sock *nsock = nbd->socks[i];
180                 mutex_lock(&nsock->tx_lock);
181                 kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
182                 mutex_unlock(&nsock->tx_lock);
183         }
184         dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
185 }
186
187 static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
188                                                  bool reserved)
189 {
190         struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
191         struct nbd_device *nbd = cmd->nbd;
192
193         dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
194         set_bit(NBD_TIMEDOUT, &nbd->runtime_flags);
195         req->errors++;
196
197         mutex_lock(&nbd->config_lock);
198         sock_shutdown(nbd);
199         mutex_unlock(&nbd->config_lock);
200         return BLK_EH_HANDLED;
201 }
202
203 /*
204  *  Send or receive packet.
205  */
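/*
 * Note on allocation context: the transfer below runs with PF_MEMALLOC set
 * and sk_allocation = GFP_NOIO | __GFP_MEMALLOC, so that socket allocations
 * made while the block layer is writing out dirty pages cannot recurse back
 * into filesystem/IO reclaim and deadlock.
 */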
206 static int sock_xmit(struct nbd_device *nbd, int index, int send, void *buf,
207                      int size, int msg_flags)
208 {
209         struct socket *sock = nbd->socks[index]->sock;
210         int result;
211         struct msghdr msg;
212         struct kvec iov;
213         unsigned long pflags = current->flags;
214
215         if (unlikely(!sock)) {
216                 dev_err_ratelimited(disk_to_dev(nbd->disk),
217                         "Attempted %s on closed socket in sock_xmit\n",
218                         (send ? "send" : "recv"));
219                 return -EINVAL;
220         }
221
222         current->flags |= PF_MEMALLOC;
223         do {
224                 sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
225                 iov.iov_base = buf;
226                 iov.iov_len = size;
227                 msg.msg_name = NULL;
228                 msg.msg_namelen = 0;
229                 msg.msg_control = NULL;
230                 msg.msg_controllen = 0;
231                 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
232
233                 if (send)
234                         result = kernel_sendmsg(sock, &msg, &iov, 1, size);
235                 else
236                         result = kernel_recvmsg(sock, &msg, &iov, 1, size,
237                                                 msg.msg_flags);
238
239                 if (result <= 0) {
240                         if (result == 0)
241                                 result = -EPIPE; /* short read */
242                         break;
243                 }
244                 size -= result;
245                 buf += result;
246         } while (size > 0);
247
248         tsk_restore_flags(current, pflags, PF_MEMALLOC);
249
250         return result;
251 }
252
253 static inline int sock_send_bvec(struct nbd_device *nbd, int index,
254                                  struct bio_vec *bvec, int flags)
255 {
256         int result;
257         void *kaddr = kmap(bvec->bv_page);
258         result = sock_xmit(nbd, index, 1, kaddr + bvec->bv_offset,
259                            bvec->bv_len, flags);
260         kunmap(bvec->bv_page);
261         return result;
262 }
263
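/*
 * Wire format of a request (struct nbd_request from <linux/nbd.h>): magic,
 * command type, an opaque handle in which we stash the 32-bit blk-mq unique
 * tag, then the byte offset and length, all in network byte order.  For
 * writes, the payload is streamed right after the header.
 */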
264 /* always call with the tx_lock held */
265 static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
266 {
267         struct request *req = blk_mq_rq_from_pdu(cmd);
268         int result;
269         struct nbd_request request;
270         unsigned long size = blk_rq_bytes(req);
271         struct bio *bio;
272         u32 type;
273         u32 tag = blk_mq_unique_tag(req);
274
275         switch (req_op(req)) {
276         case REQ_OP_DISCARD:
277                 type = NBD_CMD_TRIM;
278                 break;
279         case REQ_OP_FLUSH:
280                 type = NBD_CMD_FLUSH;
281                 break;
282         case REQ_OP_WRITE:
283                 type = NBD_CMD_WRITE;
284                 break;
285         case REQ_OP_READ:
286                 type = NBD_CMD_READ;
287                 break;
288         default:
289                 return -EIO;
290         }
291
292         if (rq_data_dir(req) == WRITE &&
293             (nbd->flags & NBD_FLAG_READ_ONLY)) {
294                 dev_err_ratelimited(disk_to_dev(nbd->disk),
295                                     "Write on read-only\n");
296                 return -EIO;
297         }
298
299         memset(&request, 0, sizeof(request));
300         request.magic = htonl(NBD_REQUEST_MAGIC);
301         request.type = htonl(type);
302         if (type != NBD_CMD_FLUSH) {
303                 request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
304                 request.len = htonl(size);
305         }
306         memcpy(request.handle, &tag, sizeof(tag));
307
308         dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
309                 cmd, nbdcmd_to_ascii(type),
310                 (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
311         result = sock_xmit(nbd, index, 1, &request, sizeof(request),
312                         (type == NBD_CMD_WRITE) ? MSG_MORE : 0);
313         if (result <= 0) {
314                 dev_err_ratelimited(disk_to_dev(nbd->disk),
315                         "Send control failed (result %d)\n", result);
316                 return -EIO;
317         }
318
319         if (type != NBD_CMD_WRITE)
320                 return 0;
321
322         bio = req->bio;
323         while (bio) {
324                 struct bio *next = bio->bi_next;
325                 struct bvec_iter iter;
326                 struct bio_vec bvec;
327
328                 bio_for_each_segment(bvec, bio, iter) {
329                         bool is_last = !next && bio_iter_last(bvec, iter);
330                         int flags = is_last ? 0 : MSG_MORE;
331
332                         dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
333                                 cmd, bvec.bv_len);
334                         result = sock_send_bvec(nbd, index, &bvec, flags);
335                         if (result <= 0) {
336                                 dev_err(disk_to_dev(nbd->disk),
337                                         "Send data failed (result %d)\n",
338                                         result);
339                                 return -EIO;
340                         }
341                         /*
342                          * The completion might already have come in,
343                          * so break for the last one instead of letting
344                          * the iterator do it. This prevents use-after-free
345                          * of the bio.
346                          */
347                         if (is_last)
348                                 break;
349                 }
350                 bio = next;
351         }
352         return 0;
353 }
354
355 static inline int sock_recv_bvec(struct nbd_device *nbd, int index,
356                                  struct bio_vec *bvec)
357 {
358         int result;
359         void *kaddr = kmap(bvec->bv_page);
360         result = sock_xmit(nbd, index, 0, kaddr + bvec->bv_offset,
361                            bvec->bv_len, MSG_WAITALL);
362         kunmap(bvec->bv_page);
363         return result;
364 }
365
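/*
 * Wire format of a reply (struct nbd_reply): magic, a 32-bit error code and
 * the handle we sent with the request, which is used below to look the
 * request back up by its blk-mq tag.  Read payload, if any, follows the
 * header.
 */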
366 /* Returns ERR_PTR on failure, otherwise the command whose reply was read */
367 static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
368 {
369         int result;
370         struct nbd_reply reply;
371         struct nbd_cmd *cmd;
372         struct request *req = NULL;
373         u16 hwq;
374         u32 tag;
375
376         reply.magic = 0;
377         result = sock_xmit(nbd, index, 0, &reply, sizeof(reply), MSG_WAITALL);
378         if (result <= 0) {
379                 if (!test_bit(NBD_DISCONNECTED, &nbd->runtime_flags) &&
380                     !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
381                         dev_err(disk_to_dev(nbd->disk),
382                                 "Receive control failed (result %d)\n", result);
383                 return ERR_PTR(result);
384         }
385
386         if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
387                 dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
388                                 (unsigned long)ntohl(reply.magic));
389                 return ERR_PTR(-EPROTO);
390         }
391
392         memcpy(&tag, reply.handle, sizeof(u32));
393
394         hwq = blk_mq_unique_tag_to_hwq(tag);
395         if (hwq < nbd->tag_set.nr_hw_queues)
396                 req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
397                                        blk_mq_unique_tag_to_tag(tag));
398         if (!req || !blk_mq_request_started(req)) {
399                 dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
400                         tag, req);
401                 return ERR_PTR(-ENOENT);
402         }
403         cmd = blk_mq_rq_to_pdu(req);
404         if (ntohl(reply.error)) {
405                 dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
406                         ntohl(reply.error));
407                 req->errors++;
408                 return cmd;
409         }
410
411         dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", cmd);
412         if (rq_data_dir(req) != WRITE) {
413                 struct req_iterator iter;
414                 struct bio_vec bvec;
415
416                 rq_for_each_segment(bvec, req, iter) {
417                         result = sock_recv_bvec(nbd, index, &bvec);
418                         if (result <= 0) {
419                                 dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
420                                         result);
421                                 req->errors++;
422                                 return cmd;
423                         }
424                         dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
425                                 cmd, bvec.bv_len);
426                 }
427         } else {
428                 /* See the comment in nbd_queue_rq. */
429                 wait_for_completion(&cmd->send_complete);
430         }
431         return cmd;
432 }
433
434 static ssize_t pid_show(struct device *dev,
435                         struct device_attribute *attr, char *buf)
436 {
437         struct gendisk *disk = dev_to_disk(dev);
438         struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
439
440         return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
441 }
442
443 static struct device_attribute pid_attr = {
444         .attr = { .name = "pid", .mode = S_IRUGO},
445         .show = pid_show,
446 };
447
448 struct recv_thread_args {
449         struct work_struct work;
450         struct nbd_device *nbd;
451         int index;
452 };
453
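/*
 * One recv_work instance runs per connection, queued on recv_workqueue from
 * NBD_DO_IT.  It keeps reading replies off its socket and completing the
 * matching requests until the socket errors out or is shut down, then drops
 * its recv_threads reference and wakes the NBD_DO_IT thread sleeping on
 * recv_wq.
 */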
454 static void recv_work(struct work_struct *work)
455 {
456         struct recv_thread_args *args = container_of(work,
457                                                      struct recv_thread_args,
458                                                      work);
459         struct nbd_device *nbd = args->nbd;
460         struct nbd_cmd *cmd;
461         int ret = 0;
462
463         BUG_ON(nbd->magic != NBD_MAGIC);
464         while (1) {
465                 cmd = nbd_read_stat(nbd, args->index);
466                 if (IS_ERR(cmd)) {
467                         ret = PTR_ERR(cmd);
468                         break;
469                 }
470
471                 nbd_end_request(cmd);
472         }
473
474         /*
475          * We got an error, shut everybody down if this wasn't the result of a
476          * disconnect request.
477          */
478         if (ret && !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
479                 sock_shutdown(nbd);
480         atomic_dec(&nbd->recv_threads);
481         wake_up(&nbd->recv_wq);
482 }
483
484 static void nbd_clear_req(struct request *req, void *data, bool reserved)
485 {
486         struct nbd_cmd *cmd;
487
488         if (!blk_mq_request_started(req))
489                 return;
490         cmd = blk_mq_rq_to_pdu(req);
491         req->errors++;
492         nbd_end_request(cmd);
493 }
494
495 static void nbd_clear_que(struct nbd_device *nbd)
496 {
497         BUG_ON(nbd->magic != NBD_MAGIC);
498
499         blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
500         dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
501 }
502
503
504 static void nbd_handle_cmd(struct nbd_cmd *cmd, int index)
505 {
506         struct request *req = blk_mq_rq_from_pdu(cmd);
507         struct nbd_device *nbd = cmd->nbd;
508         struct nbd_sock *nsock;
509
510         if (index >= nbd->num_connections) {
511                 dev_err_ratelimited(disk_to_dev(nbd->disk),
512                                     "Attempted send on invalid socket\n");
513                 goto error_out;
514         }
515
516         if (test_bit(NBD_DISCONNECTED, &nbd->runtime_flags)) {
517                 dev_err_ratelimited(disk_to_dev(nbd->disk),
518                                     "Attempted send on closed socket\n");
519                 goto error_out;
520         }
521
522         req->errors = 0;
523
524         nsock = nbd->socks[index];
525         mutex_lock(&nsock->tx_lock);
526         if (unlikely(!nsock->sock)) {
527                 mutex_unlock(&nsock->tx_lock);
528                 dev_err_ratelimited(disk_to_dev(nbd->disk),
529                                     "Attempted send on closed socket\n");
530                 goto error_out;
531         }
532
533         if (nbd_send_cmd(nbd, cmd, index) != 0) {
534                 dev_err_ratelimited(disk_to_dev(nbd->disk),
535                                     "Request send failed\n");
536                 req->errors++;
537                 nbd_end_request(cmd);
538         }
539
540         mutex_unlock(&nsock->tx_lock);
541
542         return;
543
544 error_out:
545         req->errors++;
546         nbd_end_request(cmd);
547 }
548
549 static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
550                         const struct blk_mq_queue_data *bd)
551 {
552         struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
553
554         /*
555          * Since we look at the bio's to send the request over the network we
556          * need to make sure the completion work doesn't mark this request done
557          * before we are done doing our send.  This keeps us from dereferencing
558          * freed data if we have particularly fast completions (ie we get the
559          * completion before we exit sock_xmit on the last bvec) or in the case
560          * that the server is misbehaving (or there was an error) before we're
561          * done sending everything over the wire.
562          */
563         init_completion(&cmd->send_complete);
564         blk_mq_start_request(bd->rq);
565         nbd_handle_cmd(cmd, hctx->queue_num);
566         complete(&cmd->send_complete);
567
568         return BLK_MQ_RQ_QUEUE_OK;
569 }
570
571 static int nbd_add_socket(struct nbd_device *nbd, struct socket *sock)
572 {
573         struct nbd_sock **socks;
574         struct nbd_sock *nsock;
575
576         if (!nbd->task_setup)
577                 nbd->task_setup = current;
578         if (nbd->task_setup != current) {
579                 dev_err(disk_to_dev(nbd->disk),
580                         "Device being setup by another task");
581                 return -EINVAL;
582         }
583
584         socks = krealloc(nbd->socks, (nbd->num_connections + 1) *
585                          sizeof(struct nbd_sock *), GFP_KERNEL);
586         if (!socks)
587                 return -ENOMEM;
588         nbd->socks = socks; /* krealloc() may have freed the old array */
589
590         nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL);
591         if (!nsock)
592                 return -ENOMEM;
593
594         mutex_init(&nsock->tx_lock);
595         nsock->sock = sock;
596         socks[nbd->num_connections++] = nsock;
597
598         return 0;
599 }
600
601 /* Reset all properties of an NBD device */
602 static void nbd_reset(struct nbd_device *nbd)
603 {
604         int i;
605
606         for (i = 0; i < nbd->num_connections; i++)
607                 kfree(nbd->socks[i]);
608         kfree(nbd->socks);
609         nbd->socks = NULL;
610         nbd->runtime_flags = 0;
611         nbd->blksize = 1024;
612         nbd->bytesize = 0;
613         set_capacity(nbd->disk, 0);
614         nbd->flags = 0;
615         nbd->tag_set.timeout = 0;
616         nbd->num_connections = 0;
617         nbd->task_setup = NULL;
618         queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
619 }
620
621 static void nbd_bdev_reset(struct block_device *bdev)
622 {
623         set_device_ro(bdev, false);
624         bdev->bd_inode->i_size = 0;
625         if (max_part > 0) {
626                 blkdev_reread_part(bdev);
627                 bdev->bd_invalidated = 1;
628         }
629 }
630
631 static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev)
632 {
633         if (nbd->flags & NBD_FLAG_READ_ONLY)
634                 set_device_ro(bdev, true);
635         if (nbd->flags & NBD_FLAG_SEND_TRIM)
636                 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
637         if (nbd->flags & NBD_FLAG_SEND_FLUSH)
638                 blk_queue_write_cache(nbd->disk->queue, true, false);
639         else
640                 blk_queue_write_cache(nbd->disk->queue, false, false);
641 }
642
643 static void send_disconnects(struct nbd_device *nbd)
644 {
645         struct nbd_request request = {};
646         int i, ret;
647
648         request.magic = htonl(NBD_REQUEST_MAGIC);
649         request.type = htonl(NBD_CMD_DISC);
650
651         for (i = 0; i < nbd->num_connections; i++) {
652                 ret = sock_xmit(nbd, i, 1, &request, sizeof(request), 0);
653                 if (ret <= 0)
654                         dev_err(disk_to_dev(nbd->disk),
655                                 "Send disconnect failed %d\n", ret);
656         }
657 }
658
659 static int nbd_dev_dbg_init(struct nbd_device *nbd);
660 static void nbd_dev_dbg_close(struct nbd_device *nbd);
661
662 /* Must be called with config_lock held */
663 static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
664                        unsigned int cmd, unsigned long arg)
665 {
666         switch (cmd) {
667         case NBD_DISCONNECT: {
668                 dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
669                 if (!nbd->socks)
670                         return -EINVAL;
671
672                 mutex_unlock(&nbd->config_lock);
673                 fsync_bdev(bdev);
674                 mutex_lock(&nbd->config_lock);
675
676                 /* Check again after getting mutex back.  */
677                 if (!nbd->socks)
678                         return -EINVAL;
679
680                 if (!test_and_set_bit(NBD_DISCONNECT_REQUESTED,
681                                       &nbd->runtime_flags))
682                         send_disconnects(nbd);
683                 return 0;
684         }
685
686         case NBD_CLEAR_SOCK:
687                 sock_shutdown(nbd);
688                 nbd_clear_que(nbd);
689                 kill_bdev(bdev);
690                 nbd_bdev_reset(bdev);
691                 /*
692                  * We want to give the run thread a chance to wait for everybody
693                  * to clean up and then do its own cleanup.
694                  */
695                 if (!test_bit(NBD_RUNNING, &nbd->runtime_flags)) {
696                         int i;
697
698                         for (i = 0; i < nbd->num_connections; i++)
699                                 kfree(nbd->socks[i]);
700                         kfree(nbd->socks);
701                         nbd->socks = NULL;
702                         nbd->num_connections = 0;
703                         nbd->task_setup = NULL;
704                 }
705                 return 0;
706
707         case NBD_SET_SOCK: {
708                 int err;
709                 struct socket *sock = sockfd_lookup(arg, &err);
710
711                 if (!sock)
712                         return err;
713
714                 err = nbd_add_socket(nbd, sock);
715                 if (!err && max_part)
716                         bdev->bd_invalidated = 1;
717
718                 return err;
719         }
720
721         case NBD_SET_BLKSIZE: {
722                 loff_t bsize = div_s64(nbd->bytesize, arg);
723
724                 return nbd_size_set(nbd, bdev, arg, bsize);
725         }
726
727         case NBD_SET_SIZE:
728                 return nbd_size_set(nbd, bdev, nbd->blksize,
729                                         div_s64(arg, nbd->blksize));
730
731         case NBD_SET_SIZE_BLOCKS:
732                 return nbd_size_set(nbd, bdev, nbd->blksize, arg);
733
734         case NBD_SET_TIMEOUT:
735                 nbd->tag_set.timeout = arg * HZ;
736                 return 0;
737
738         case NBD_SET_FLAGS:
739                 nbd->flags = arg;
740                 return 0;
741
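	/*
	 * NBD_DO_IT is the main loop: it spreads the hardware queues over the
	 * configured connections, queues one recv_work per socket, and then
	 * sleeps until every receiver has exited before tearing everything
	 * down again.  The calling process stays blocked in this ioctl for
	 * the lifetime of the device.
	 */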
742         case NBD_DO_IT: {
743                 struct recv_thread_args *args;
744                 int num_connections = nbd->num_connections;
745                 int error = 0, i;
746
747                 if (nbd->task_recv)
748                         return -EBUSY;
749                 if (!nbd->socks)
750                         return -EINVAL;
751                 if (num_connections > 1 &&
752                     !(nbd->flags & NBD_FLAG_CAN_MULTI_CONN)) {
753                         dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
754                         error = -EINVAL;
755                         goto out_err;
756                 }
757
758                 set_bit(NBD_RUNNING, &nbd->runtime_flags);
759                 blk_mq_update_nr_hw_queues(&nbd->tag_set, nbd->num_connections);
760                 args = kcalloc(num_connections, sizeof(*args), GFP_KERNEL);
761                 if (!args) {
762                         error = -ENOMEM;
763                         goto out_err;
764                 }
765                 nbd->task_recv = current;
766                 mutex_unlock(&nbd->config_lock);
767
768                 nbd_parse_flags(nbd, bdev);
769
770                 error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
771                 if (error) {
772                         dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
773                         goto out_recv;
774                 }
775
776                 nbd_size_update(nbd, bdev);
777
778                 nbd_dev_dbg_init(nbd);
779                 for (i = 0; i < num_connections; i++) {
780                         sk_set_memalloc(nbd->socks[i]->sock->sk);
781                         atomic_inc(&nbd->recv_threads);
782                         INIT_WORK(&args[i].work, recv_work);
783                         args[i].nbd = nbd;
784                         args[i].index = i;
785                         queue_work(recv_workqueue, &args[i].work);
786                 }
787                 wait_event_interruptible(nbd->recv_wq,
788                                          atomic_read(&nbd->recv_threads) == 0);
789                 for (i = 0; i < num_connections; i++)
790                         flush_work(&args[i].work);
791                 nbd_dev_dbg_close(nbd);
792                 nbd_size_clear(nbd, bdev);
793                 device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
794 out_recv:
795                 mutex_lock(&nbd->config_lock);
796                 nbd->task_recv = NULL;
797 out_err:
798                 sock_shutdown(nbd);
799                 nbd_clear_que(nbd);
800                 kill_bdev(bdev);
801                 nbd_bdev_reset(bdev);
802
803                 /* user requested, ignore socket errors */
804                 if (test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
805                         error = 0;
806                 if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags))
807                         error = -ETIMEDOUT;
808
809                 nbd_reset(nbd);
810                 return error;
811         }
812
813         case NBD_CLEAR_QUE:
814                 /*
815                  * This is for compatibility only.  The queue is always cleared
816                  * by NBD_DO_IT or NBD_CLEAR_SOCK.
817                  */
818                 return 0;
819
820         case NBD_PRINT_DEBUG:
821                 /*
822                  * For compatibility only, we no longer keep a list of
823                  * outstanding requests.
824                  */
825                 return 0;
826         }
827         return -ENOTTY;
828 }
829
830 static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
831                      unsigned int cmd, unsigned long arg)
832 {
833         struct nbd_device *nbd = bdev->bd_disk->private_data;
834         int error;
835
836         if (!capable(CAP_SYS_ADMIN))
837                 return -EPERM;
838
839         BUG_ON(nbd->magic != NBD_MAGIC);
840
841         mutex_lock(&nbd->config_lock);
842         error = __nbd_ioctl(bdev, nbd, cmd, arg);
843         mutex_unlock(&nbd->config_lock);
844
845         return error;
846 }
847
848 static const struct block_device_operations nbd_fops =
849 {
850         .owner =        THIS_MODULE,
851         .ioctl =        nbd_ioctl,
852         .compat_ioctl = nbd_ioctl,
853 };
854
855 #if IS_ENABLED(CONFIG_DEBUG_FS)
856
857 static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
858 {
859         struct nbd_device *nbd = s->private;
860
861         if (nbd->task_recv)
862                 seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
863
864         return 0;
865 }
866
867 static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
868 {
869         return single_open(file, nbd_dbg_tasks_show, inode->i_private);
870 }
871
872 static const struct file_operations nbd_dbg_tasks_ops = {
873         .open = nbd_dbg_tasks_open,
874         .read = seq_read,
875         .llseek = seq_lseek,
876         .release = single_release,
877 };
878
879 static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
880 {
881         struct nbd_device *nbd = s->private;
882         u32 flags = nbd->flags;
883
884         seq_printf(s, "Hex: 0x%08x\n\n", flags);
885
886         seq_puts(s, "Known flags:\n");
887
888         if (flags & NBD_FLAG_HAS_FLAGS)
889                 seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
890         if (flags & NBD_FLAG_READ_ONLY)
891                 seq_puts(s, "NBD_FLAG_READ_ONLY\n");
892         if (flags & NBD_FLAG_SEND_FLUSH)
893                 seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
894         if (flags & NBD_FLAG_SEND_TRIM)
895                 seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
896
897         return 0;
898 }
899
900 static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
901 {
902         return single_open(file, nbd_dbg_flags_show, inode->i_private);
903 }
904
905 static const struct file_operations nbd_dbg_flags_ops = {
906         .open = nbd_dbg_flags_open,
907         .read = seq_read,
908         .llseek = seq_lseek,
909         .release = single_release,
910 };
911
912 static int nbd_dev_dbg_init(struct nbd_device *nbd)
913 {
914         struct dentry *dir;
915
916         if (!nbd_dbg_dir)
917                 return -EIO;
918
919         dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
920         if (!dir) {
921                 dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
922                         nbd_name(nbd));
923                 return -EIO;
924         }
925         nbd->dbg_dir = dir;
926
927         debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
928         debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize);
929         debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
930         debugfs_create_u64("blocksize", 0444, dir, &nbd->blksize);
931         debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
932
933         return 0;
934 }
935
936 static void nbd_dev_dbg_close(struct nbd_device *nbd)
937 {
938         debugfs_remove_recursive(nbd->dbg_dir);
939 }
940
941 static int nbd_dbg_init(void)
942 {
943         struct dentry *dbg_dir;
944
945         dbg_dir = debugfs_create_dir("nbd", NULL);
946         if (!dbg_dir)
947                 return -EIO;
948
949         nbd_dbg_dir = dbg_dir;
950
951         return 0;
952 }
953
954 static void nbd_dbg_close(void)
955 {
956         debugfs_remove_recursive(nbd_dbg_dir);
957 }
958
959 #else  /* IS_ENABLED(CONFIG_DEBUG_FS) */
960
961 static int nbd_dev_dbg_init(struct nbd_device *nbd)
962 {
963         return 0;
964 }
965
966 static void nbd_dev_dbg_close(struct nbd_device *nbd)
967 {
968 }
969
970 static int nbd_dbg_init(void)
971 {
972         return 0;
973 }
974
975 static void nbd_dbg_close(void)
976 {
977 }
978
979 #endif
980
981 static int nbd_init_request(void *data, struct request *rq,
982                             unsigned int hctx_idx, unsigned int request_idx,
983                             unsigned int numa_node)
984 {
985         struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
986         cmd->nbd = data;
987         return 0;
988 }
989
990 static struct blk_mq_ops nbd_mq_ops = {
991         .queue_rq       = nbd_queue_rq,
992         .init_request   = nbd_init_request,
993         .timeout        = nbd_xmit_timeout,
994 };
995
996 /*
997  * And here should be modules and kernel interface 
998  *  (Just smiley confuses emacs :-)
999  */
1000
1001 static int __init nbd_init(void)
1002 {
1003         int err = -ENOMEM;
1004         int i;
1005         int part_shift;
1006
1007         BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
1008
1009         if (max_part < 0) {
1010                 printk(KERN_ERR "nbd: max_part must be >= 0\n");
1011                 return -EINVAL;
1012         }
1013
1014         part_shift = 0;
1015         if (max_part > 0) {
1016                 part_shift = fls(max_part);
1017
1018                 /*
1019                  * Adjust max_part according to part_shift as it is exported to
1020                  * user space, so userspace knows the maximum number of partitions
1021                  * the kernel can manage: e.g. max_part=10 rounds up to
1022                  * part_shift=fls(10)=4, hence max_part=15 (16 minors per disk).
1023                  * Note that -1 is required because partition 0 is reserved
1024                  * for the whole disk.
1025                  */
1026                 max_part = (1UL << part_shift) - 1;
1027         }
1028
1029         if ((1UL << part_shift) > DISK_MAX_PARTS)
1030                 return -EINVAL;
1031
1032         if (nbds_max > 1UL << (MINORBITS - part_shift))
1033                 return -EINVAL;
1034         recv_workqueue = alloc_workqueue("knbd-recv",
1035                                          WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
1036         if (!recv_workqueue)
1037                 return -ENOMEM;
1038
1039         nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL);
1040         if (!nbd_dev) {
1041                 destroy_workqueue(recv_workqueue);
1042                 return -ENOMEM;
1043         }
1044
1045         for (i = 0; i < nbds_max; i++) {
1046                 struct request_queue *q;
1047                 struct gendisk *disk = alloc_disk(1 << part_shift);
1048                 if (!disk)
1049                         goto out;
1050                 nbd_dev[i].disk = disk;
1051
1052                 nbd_dev[i].tag_set.ops = &nbd_mq_ops;
1053                 nbd_dev[i].tag_set.nr_hw_queues = 1;
1054                 nbd_dev[i].tag_set.queue_depth = 128;
1055                 nbd_dev[i].tag_set.numa_node = NUMA_NO_NODE;
1056                 nbd_dev[i].tag_set.cmd_size = sizeof(struct nbd_cmd);
1057                 nbd_dev[i].tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
1058                         BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
1059                 nbd_dev[i].tag_set.driver_data = &nbd_dev[i];
1060
1061                 err = blk_mq_alloc_tag_set(&nbd_dev[i].tag_set);
1062                 if (err) {
1063                         put_disk(disk);
1064                         goto out;
1065                 }
1066
1067                 /*
1068                  * Every gendisk needs its very own request_queue struct;
1069                  * these structs are big, so we allocate them dynamically
1070                  * here via blk_mq_init_queue().
1071                  */
1072                 q = blk_mq_init_queue(&nbd_dev[i].tag_set);
1073                 if (IS_ERR(q)) {
1074                         blk_mq_free_tag_set(&nbd_dev[i].tag_set);
1075                         put_disk(disk);
1076                         goto out;
1077                 }
1078                 disk->queue = q;
1079
1080                 /*
1081                  * Tell the block layer that we are not a rotational device
1082                  */
1083                 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
1084                 queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
1085                 disk->queue->limits.discard_granularity = 512;
1086                 blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
1087                 disk->queue->limits.discard_zeroes_data = 0;
1088                 blk_queue_max_hw_sectors(disk->queue, 65536);
1089                 disk->queue->limits.max_sectors = 256;
1090         }
1091
1092         if (register_blkdev(NBD_MAJOR, "nbd")) {
1093                 err = -EIO;
1094                 goto out;
1095         }
1096
1097         printk(KERN_INFO "nbd: registered device at major %d\n", NBD_MAJOR);
1098
1099         nbd_dbg_init();
1100
1101         for (i = 0; i < nbds_max; i++) {
1102                 struct gendisk *disk = nbd_dev[i].disk;
1103                 nbd_dev[i].magic = NBD_MAGIC;
1104                 mutex_init(&nbd_dev[i].config_lock);
1105                 disk->major = NBD_MAJOR;
1106                 disk->first_minor = i << part_shift;
1107                 disk->fops = &nbd_fops;
1108                 disk->private_data = &nbd_dev[i];
1109                 sprintf(disk->disk_name, "nbd%d", i);
1110                 init_waitqueue_head(&nbd_dev[i].recv_wq);
1111                 nbd_reset(&nbd_dev[i]);
1112                 add_disk(disk);
1113         }
1114
1115         return 0;
1116 out:
1117         while (i--) {
1118                 blk_mq_free_tag_set(&nbd_dev[i].tag_set);
1119                 blk_cleanup_queue(nbd_dev[i].disk->queue);
1120                 put_disk(nbd_dev[i].disk);
1121         }
1122         kfree(nbd_dev);
1123         destroy_workqueue(recv_workqueue);
1124         return err;
1125 }
1126
1127 static void __exit nbd_cleanup(void)
1128 {
1129         int i;
1130
1131         nbd_dbg_close();
1132
1133         for (i = 0; i < nbds_max; i++) {
1134                 struct gendisk *disk = nbd_dev[i].disk;
1135                 nbd_dev[i].magic = 0;
1136                 if (disk) {
1137                         del_gendisk(disk);
1138                         blk_cleanup_queue(disk->queue);
1139                         blk_mq_free_tag_set(&nbd_dev[i].tag_set);
1140                         put_disk(disk);
1141                 }
1142         }
1143         destroy_workqueue(recv_workqueue);
1144         unregister_blkdev(NBD_MAJOR, "nbd");
1145         kfree(nbd_dev);
1146         printk(KERN_INFO "nbd: unregistered device at major %d\n", NBD_MAJOR);
1147 }
1148
1149 module_init(nbd_init);
1150 module_exit(nbd_cleanup);
1151
1152 MODULE_DESCRIPTION("Network Block Device");
1153 MODULE_LICENSE("GPL");
1154
1155 module_param(nbds_max, int, 0444);
1156 MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
1157 module_param(max_part, int, 0444);
1158 MODULE_PARM_DESC(max_part, "number of partitions per device (default: 0)");