splice, net: Use sendmsg(MSG_SPLICE_PAGES) rather than ->sendpage()
[platform/kernel/linux-starfive.git] / fs / splice.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * "splice": joining two ropes together by interweaving their strands.
4  *
5  * This is the "extended pipe" functionality, where a pipe is used as
6  * an arbitrary in-memory buffer. Think of a pipe as a small kernel
7  * buffer that you can use to transfer data from one end to the other.
8  *
9  * The traditional unix read/write is extended with a "splice()" operation
10  * that transfers data buffers to or from a pipe buffer.
11  *
12  * Named by Larry McVoy, original implementation from Linus, extended by
13  * Jens to support splicing to files, network, direct splicing, etc and
14  * fixing lots of bugs.
15  *
16  * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
17  * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
18  * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
19  *
20  */
21 #include <linux/bvec.h>
22 #include <linux/fs.h>
23 #include <linux/file.h>
24 #include <linux/pagemap.h>
25 #include <linux/splice.h>
26 #include <linux/memcontrol.h>
27 #include <linux/mm_inline.h>
28 #include <linux/swap.h>
29 #include <linux/writeback.h>
30 #include <linux/export.h>
31 #include <linux/syscalls.h>
32 #include <linux/uio.h>
33 #include <linux/fsnotify.h>
34 #include <linux/security.h>
35 #include <linux/gfp.h>
36 #include <linux/net.h>
37 #include <linux/socket.h>
38 #include <linux/sched/signal.h>
39
40 #include "internal.h"
41
42 /*
43  * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to
44  * indicate they support non-blocking reads or writes, we must clear it
45  * here if set to avoid blocking other users of this pipe if splice is
46  * being done on it.
47  */
48 static noinline void noinline pipe_clear_nowait(struct file *file)
49 {
50         fmode_t fmode = READ_ONCE(file->f_mode);
51
52         do {
53                 if (!(fmode & FMODE_NOWAIT))
54                         break;
55         } while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT));
56 }
57
58 /*
59  * Attempt to steal a page from a pipe buffer. This should perhaps go into
60  * a vm helper function, it's already simplified quite a bit by the
61  * addition of remove_mapping(). If success is returned, the caller may
62  * attempt to reuse this page for another destination.
63  */
64 static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
65                 struct pipe_buffer *buf)
66 {
67         struct folio *folio = page_folio(buf->page);
68         struct address_space *mapping;
69
70         folio_lock(folio);
71
72         mapping = folio_mapping(folio);
73         if (mapping) {
74                 WARN_ON(!folio_test_uptodate(folio));
75
76                 /*
77                  * At least for ext2 with nobh option, we need to wait on
78                  * writeback completing on this folio, since we'll remove it
79                  * from the pagecache.  Otherwise truncate wont wait on the
80                  * folio, allowing the disk blocks to be reused by someone else
81                  * before we actually wrote our data to them. fs corruption
82                  * ensues.
83                  */
84                 folio_wait_writeback(folio);
85
86                 if (folio_has_private(folio) &&
87                     !filemap_release_folio(folio, GFP_KERNEL))
88                         goto out_unlock;
89
90                 /*
91                  * If we succeeded in removing the mapping, set LRU flag
92                  * and return good.
93                  */
94                 if (remove_mapping(mapping, folio)) {
95                         buf->flags |= PIPE_BUF_FLAG_LRU;
96                         return true;
97                 }
98         }
99
100         /*
101          * Raced with truncate or failed to remove folio from current
102          * address space, unlock and return failure.
103          */
104 out_unlock:
105         folio_unlock(folio);
106         return false;
107 }
108
109 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
110                                         struct pipe_buffer *buf)
111 {
112         put_page(buf->page);
113         buf->flags &= ~PIPE_BUF_FLAG_LRU;
114 }
115
116 /*
117  * Check whether the contents of buf is OK to access. Since the content
118  * is a page cache page, IO may be in flight.
119  */
120 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
121                                        struct pipe_buffer *buf)
122 {
123         struct page *page = buf->page;
124         int err;
125
126         if (!PageUptodate(page)) {
127                 lock_page(page);
128
129                 /*
130                  * Page got truncated/unhashed. This will cause a 0-byte
131                  * splice, if this is the first page.
132                  */
133                 if (!page->mapping) {
134                         err = -ENODATA;
135                         goto error;
136                 }
137
138                 /*
139                  * Uh oh, read-error from disk.
140                  */
141                 if (!PageUptodate(page)) {
142                         err = -EIO;
143                         goto error;
144                 }
145
146                 /*
147                  * Page is ok afterall, we are done.
148                  */
149                 unlock_page(page);
150         }
151
152         return 0;
153 error:
154         unlock_page(page);
155         return err;
156 }
157
158 const struct pipe_buf_operations page_cache_pipe_buf_ops = {
159         .confirm        = page_cache_pipe_buf_confirm,
160         .release        = page_cache_pipe_buf_release,
161         .try_steal      = page_cache_pipe_buf_try_steal,
162         .get            = generic_pipe_buf_get,
163 };
164
165 static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
166                 struct pipe_buffer *buf)
167 {
168         if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
169                 return false;
170
171         buf->flags |= PIPE_BUF_FLAG_LRU;
172         return generic_pipe_buf_try_steal(pipe, buf);
173 }
174
175 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
176         .release        = page_cache_pipe_buf_release,
177         .try_steal      = user_page_pipe_buf_try_steal,
178         .get            = generic_pipe_buf_get,
179 };
180
181 static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
182 {
183         smp_mb();
184         if (waitqueue_active(&pipe->rd_wait))
185                 wake_up_interruptible(&pipe->rd_wait);
186         kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
187 }
188
189 /**
190  * splice_to_pipe - fill passed data into a pipe
191  * @pipe:       pipe to fill
192  * @spd:        data to fill
193  *
194  * Description:
195  *    @spd contains a map of pages and len/offset tuples, along with
196  *    the struct pipe_buf_operations associated with these pages. This
197  *    function will link that data to the pipe.
198  *
199  */
200 ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
201                        struct splice_pipe_desc *spd)
202 {
203         unsigned int spd_pages = spd->nr_pages;
204         unsigned int tail = pipe->tail;
205         unsigned int head = pipe->head;
206         unsigned int mask = pipe->ring_size - 1;
207         int ret = 0, page_nr = 0;
208
209         if (!spd_pages)
210                 return 0;
211
212         if (unlikely(!pipe->readers)) {
213                 send_sig(SIGPIPE, current, 0);
214                 ret = -EPIPE;
215                 goto out;
216         }
217
218         while (!pipe_full(head, tail, pipe->max_usage)) {
219                 struct pipe_buffer *buf = &pipe->bufs[head & mask];
220
221                 buf->page = spd->pages[page_nr];
222                 buf->offset = spd->partial[page_nr].offset;
223                 buf->len = spd->partial[page_nr].len;
224                 buf->private = spd->partial[page_nr].private;
225                 buf->ops = spd->ops;
226                 buf->flags = 0;
227
228                 head++;
229                 pipe->head = head;
230                 page_nr++;
231                 ret += buf->len;
232
233                 if (!--spd->nr_pages)
234                         break;
235         }
236
237         if (!ret)
238                 ret = -EAGAIN;
239
240 out:
241         while (page_nr < spd_pages)
242                 spd->spd_release(spd, page_nr++);
243
244         return ret;
245 }
246 EXPORT_SYMBOL_GPL(splice_to_pipe);
247
248 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
249 {
250         unsigned int head = pipe->head;
251         unsigned int tail = pipe->tail;
252         unsigned int mask = pipe->ring_size - 1;
253         int ret;
254
255         if (unlikely(!pipe->readers)) {
256                 send_sig(SIGPIPE, current, 0);
257                 ret = -EPIPE;
258         } else if (pipe_full(head, tail, pipe->max_usage)) {
259                 ret = -EAGAIN;
260         } else {
261                 pipe->bufs[head & mask] = *buf;
262                 pipe->head = head + 1;
263                 return buf->len;
264         }
265         pipe_buf_release(pipe, buf);
266         return ret;
267 }
268 EXPORT_SYMBOL(add_to_pipe);
269
270 /*
271  * Check if we need to grow the arrays holding pages and partial page
272  * descriptions.
273  */
274 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
275 {
276         unsigned int max_usage = READ_ONCE(pipe->max_usage);
277
278         spd->nr_pages_max = max_usage;
279         if (max_usage <= PIPE_DEF_BUFFERS)
280                 return 0;
281
282         spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
283         spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
284                                      GFP_KERNEL);
285
286         if (spd->pages && spd->partial)
287                 return 0;
288
289         kfree(spd->pages);
290         kfree(spd->partial);
291         return -ENOMEM;
292 }
293
294 void splice_shrink_spd(struct splice_pipe_desc *spd)
295 {
296         if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
297                 return;
298
299         kfree(spd->pages);
300         kfree(spd->partial);
301 }
302
303 /*
304  * Splice data from an O_DIRECT file into pages and then add them to the output
305  * pipe.
306  */
307 ssize_t direct_splice_read(struct file *in, loff_t *ppos,
308                            struct pipe_inode_info *pipe,
309                            size_t len, unsigned int flags)
310 {
311         struct iov_iter to;
312         struct bio_vec *bv;
313         struct kiocb kiocb;
314         struct page **pages;
315         ssize_t ret;
316         size_t used, npages, chunk, remain, reclaim;
317         int i;
318
319         /* Work out how much data we can actually add into the pipe */
320         used = pipe_occupancy(pipe->head, pipe->tail);
321         npages = max_t(ssize_t, pipe->max_usage - used, 0);
322         len = min_t(size_t, len, npages * PAGE_SIZE);
323         npages = DIV_ROUND_UP(len, PAGE_SIZE);
324
325         bv = kzalloc(array_size(npages, sizeof(bv[0])) +
326                      array_size(npages, sizeof(struct page *)), GFP_KERNEL);
327         if (!bv)
328                 return -ENOMEM;
329
330         pages = (void *)(bv + npages);
331         npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
332         if (!npages) {
333                 kfree(bv);
334                 return -ENOMEM;
335         }
336
337         remain = len = min_t(size_t, len, npages * PAGE_SIZE);
338
339         for (i = 0; i < npages; i++) {
340                 chunk = min_t(size_t, PAGE_SIZE, remain);
341                 bv[i].bv_page = pages[i];
342                 bv[i].bv_offset = 0;
343                 bv[i].bv_len = chunk;
344                 remain -= chunk;
345         }
346
347         /* Do the I/O */
348         iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
349         init_sync_kiocb(&kiocb, in);
350         kiocb.ki_pos = *ppos;
351         ret = call_read_iter(in, &kiocb, &to);
352
353         reclaim = npages * PAGE_SIZE;
354         remain = 0;
355         if (ret > 0) {
356                 reclaim -= ret;
357                 remain = ret;
358                 *ppos = kiocb.ki_pos;
359                 file_accessed(in);
360         } else if (ret < 0) {
361                 /*
362                  * callers of ->splice_read() expect -EAGAIN on
363                  * "can't put anything in there", rather than -EFAULT.
364                  */
365                 if (ret == -EFAULT)
366                         ret = -EAGAIN;
367         }
368
369         /* Free any pages that didn't get touched at all. */
370         reclaim /= PAGE_SIZE;
371         if (reclaim) {
372                 npages -= reclaim;
373                 release_pages(pages + npages, reclaim);
374         }
375
376         /* Push the remaining pages into the pipe. */
377         for (i = 0; i < npages; i++) {
378                 struct pipe_buffer *buf = pipe_head_buf(pipe);
379
380                 chunk = min_t(size_t, remain, PAGE_SIZE);
381                 *buf = (struct pipe_buffer) {
382                         .ops    = &default_pipe_buf_ops,
383                         .page   = bv[i].bv_page,
384                         .offset = 0,
385                         .len    = chunk,
386                 };
387                 pipe->head++;
388                 remain -= chunk;
389         }
390
391         kfree(bv);
392         return ret;
393 }
394 EXPORT_SYMBOL(direct_splice_read);
395
396 /**
397  * generic_file_splice_read - splice data from file to a pipe
398  * @in:         file to splice from
399  * @ppos:       position in @in
400  * @pipe:       pipe to splice to
401  * @len:        number of bytes to splice
402  * @flags:      splice modifier flags
403  *
404  * Description:
405  *    Will read pages from given file and fill them into a pipe. Can be
406  *    used as long as it has more or less sane ->read_iter().
407  *
408  */
409 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
410                                  struct pipe_inode_info *pipe, size_t len,
411                                  unsigned int flags)
412 {
413         struct iov_iter to;
414         struct kiocb kiocb;
415         int ret;
416
417         iov_iter_pipe(&to, ITER_DEST, pipe, len);
418         init_sync_kiocb(&kiocb, in);
419         kiocb.ki_pos = *ppos;
420         ret = call_read_iter(in, &kiocb, &to);
421         if (ret > 0) {
422                 *ppos = kiocb.ki_pos;
423                 file_accessed(in);
424         } else if (ret < 0) {
425                 /* free what was emitted */
426                 pipe_discard_from(pipe, to.start_head);
427                 /*
428                  * callers of ->splice_read() expect -EAGAIN on
429                  * "can't put anything in there", rather than -EFAULT.
430                  */
431                 if (ret == -EFAULT)
432                         ret = -EAGAIN;
433         }
434
435         return ret;
436 }
437 EXPORT_SYMBOL(generic_file_splice_read);
438
439 const struct pipe_buf_operations default_pipe_buf_ops = {
440         .release        = generic_pipe_buf_release,
441         .try_steal      = generic_pipe_buf_try_steal,
442         .get            = generic_pipe_buf_get,
443 };
444
445 /* Pipe buffer operations for a socket and similar. */
446 const struct pipe_buf_operations nosteal_pipe_buf_ops = {
447         .release        = generic_pipe_buf_release,
448         .get            = generic_pipe_buf_get,
449 };
450 EXPORT_SYMBOL(nosteal_pipe_buf_ops);
451
452 static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
453 {
454         smp_mb();
455         if (waitqueue_active(&pipe->wr_wait))
456                 wake_up_interruptible(&pipe->wr_wait);
457         kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
458 }
459
460 /**
461  * splice_from_pipe_feed - feed available data from a pipe to a file
462  * @pipe:       pipe to splice from
463  * @sd:         information to @actor
464  * @actor:      handler that splices the data
465  *
466  * Description:
467  *    This function loops over the pipe and calls @actor to do the
468  *    actual moving of a single struct pipe_buffer to the desired
469  *    destination.  It returns when there's no more buffers left in
470  *    the pipe or if the requested number of bytes (@sd->total_len)
471  *    have been copied.  It returns a positive number (one) if the
472  *    pipe needs to be filled with more data, zero if the required
473  *    number of bytes have been copied and -errno on error.
474  *
475  *    This, together with splice_from_pipe_{begin,end,next}, may be
476  *    used to implement the functionality of __splice_from_pipe() when
477  *    locking is required around copying the pipe buffers to the
478  *    destination.
479  */
480 static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
481                           splice_actor *actor)
482 {
483         unsigned int head = pipe->head;
484         unsigned int tail = pipe->tail;
485         unsigned int mask = pipe->ring_size - 1;
486         int ret;
487
488         while (!pipe_empty(head, tail)) {
489                 struct pipe_buffer *buf = &pipe->bufs[tail & mask];
490
491                 sd->len = buf->len;
492                 if (sd->len > sd->total_len)
493                         sd->len = sd->total_len;
494
495                 ret = pipe_buf_confirm(pipe, buf);
496                 if (unlikely(ret)) {
497                         if (ret == -ENODATA)
498                                 ret = 0;
499                         return ret;
500                 }
501
502                 ret = actor(pipe, buf, sd);
503                 if (ret <= 0)
504                         return ret;
505
506                 buf->offset += ret;
507                 buf->len -= ret;
508
509                 sd->num_spliced += ret;
510                 sd->len -= ret;
511                 sd->pos += ret;
512                 sd->total_len -= ret;
513
514                 if (!buf->len) {
515                         pipe_buf_release(pipe, buf);
516                         tail++;
517                         pipe->tail = tail;
518                         if (pipe->files)
519                                 sd->need_wakeup = true;
520                 }
521
522                 if (!sd->total_len)
523                         return 0;
524         }
525
526         return 1;
527 }
528
529 /* We know we have a pipe buffer, but maybe it's empty? */
530 static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
531 {
532         unsigned int tail = pipe->tail;
533         unsigned int mask = pipe->ring_size - 1;
534         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
535
536         if (unlikely(!buf->len)) {
537                 pipe_buf_release(pipe, buf);
538                 pipe->tail = tail+1;
539                 return true;
540         }
541
542         return false;
543 }
544
545 /**
546  * splice_from_pipe_next - wait for some data to splice from
547  * @pipe:       pipe to splice from
548  * @sd:         information about the splice operation
549  *
550  * Description:
551  *    This function will wait for some data and return a positive
552  *    value (one) if pipe buffers are available.  It will return zero
553  *    or -errno if no more data needs to be spliced.
554  */
555 static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
556 {
557         /*
558          * Check for signal early to make process killable when there are
559          * always buffers available
560          */
561         if (signal_pending(current))
562                 return -ERESTARTSYS;
563
564 repeat:
565         while (pipe_empty(pipe->head, pipe->tail)) {
566                 if (!pipe->writers)
567                         return 0;
568
569                 if (sd->num_spliced)
570                         return 0;
571
572                 if (sd->flags & SPLICE_F_NONBLOCK)
573                         return -EAGAIN;
574
575                 if (signal_pending(current))
576                         return -ERESTARTSYS;
577
578                 if (sd->need_wakeup) {
579                         wakeup_pipe_writers(pipe);
580                         sd->need_wakeup = false;
581                 }
582
583                 pipe_wait_readable(pipe);
584         }
585
586         if (eat_empty_buffer(pipe))
587                 goto repeat;
588
589         return 1;
590 }
591
592 /**
593  * splice_from_pipe_begin - start splicing from pipe
594  * @sd:         information about the splice operation
595  *
596  * Description:
597  *    This function should be called before a loop containing
598  *    splice_from_pipe_next() and splice_from_pipe_feed() to
599  *    initialize the necessary fields of @sd.
600  */
601 static void splice_from_pipe_begin(struct splice_desc *sd)
602 {
603         sd->num_spliced = 0;
604         sd->need_wakeup = false;
605 }
606
607 /**
608  * splice_from_pipe_end - finish splicing from pipe
609  * @pipe:       pipe to splice from
610  * @sd:         information about the splice operation
611  *
612  * Description:
613  *    This function will wake up pipe writers if necessary.  It should
614  *    be called after a loop containing splice_from_pipe_next() and
615  *    splice_from_pipe_feed().
616  */
617 static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
618 {
619         if (sd->need_wakeup)
620                 wakeup_pipe_writers(pipe);
621 }
622
623 /**
624  * __splice_from_pipe - splice data from a pipe to given actor
625  * @pipe:       pipe to splice from
626  * @sd:         information to @actor
627  * @actor:      handler that splices the data
628  *
629  * Description:
630  *    This function does little more than loop over the pipe and call
631  *    @actor to do the actual moving of a single struct pipe_buffer to
632  *    the desired destination. See pipe_to_file, pipe_to_sendmsg, or
633  *    pipe_to_user.
634  *
635  */
636 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
637                            splice_actor *actor)
638 {
639         int ret;
640
641         splice_from_pipe_begin(sd);
642         do {
643                 cond_resched();
644                 ret = splice_from_pipe_next(pipe, sd);
645                 if (ret > 0)
646                         ret = splice_from_pipe_feed(pipe, sd, actor);
647         } while (ret > 0);
648         splice_from_pipe_end(pipe, sd);
649
650         return sd->num_spliced ? sd->num_spliced : ret;
651 }
652 EXPORT_SYMBOL(__splice_from_pipe);
653
654 /**
655  * splice_from_pipe - splice data from a pipe to a file
656  * @pipe:       pipe to splice from
657  * @out:        file to splice to
658  * @ppos:       position in @out
659  * @len:        how many bytes to splice
660  * @flags:      splice modifier flags
661  * @actor:      handler that splices the data
662  *
663  * Description:
664  *    See __splice_from_pipe. This function locks the pipe inode,
665  *    otherwise it's identical to __splice_from_pipe().
666  *
667  */
668 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
669                          loff_t *ppos, size_t len, unsigned int flags,
670                          splice_actor *actor)
671 {
672         ssize_t ret;
673         struct splice_desc sd = {
674                 .total_len = len,
675                 .flags = flags,
676                 .pos = *ppos,
677                 .u.file = out,
678         };
679
680         pipe_lock(pipe);
681         ret = __splice_from_pipe(pipe, &sd, actor);
682         pipe_unlock(pipe);
683
684         return ret;
685 }
686
687 /**
688  * iter_file_splice_write - splice data from a pipe to a file
689  * @pipe:       pipe info
690  * @out:        file to write to
691  * @ppos:       position in @out
692  * @len:        number of bytes to splice
693  * @flags:      splice modifier flags
694  *
695  * Description:
696  *    Will either move or copy pages (determined by @flags options) from
697  *    the given pipe inode to the given file.
698  *    This one is ->write_iter-based.
699  *
700  */
701 ssize_t
702 iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
703                           loff_t *ppos, size_t len, unsigned int flags)
704 {
705         struct splice_desc sd = {
706                 .total_len = len,
707                 .flags = flags,
708                 .pos = *ppos,
709                 .u.file = out,
710         };
711         int nbufs = pipe->max_usage;
712         struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
713                                         GFP_KERNEL);
714         ssize_t ret;
715
716         if (unlikely(!array))
717                 return -ENOMEM;
718
719         pipe_lock(pipe);
720
721         splice_from_pipe_begin(&sd);
722         while (sd.total_len) {
723                 struct iov_iter from;
724                 unsigned int head, tail, mask;
725                 size_t left;
726                 int n;
727
728                 ret = splice_from_pipe_next(pipe, &sd);
729                 if (ret <= 0)
730                         break;
731
732                 if (unlikely(nbufs < pipe->max_usage)) {
733                         kfree(array);
734                         nbufs = pipe->max_usage;
735                         array = kcalloc(nbufs, sizeof(struct bio_vec),
736                                         GFP_KERNEL);
737                         if (!array) {
738                                 ret = -ENOMEM;
739                                 break;
740                         }
741                 }
742
743                 head = pipe->head;
744                 tail = pipe->tail;
745                 mask = pipe->ring_size - 1;
746
747                 /* build the vector */
748                 left = sd.total_len;
749                 for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
750                         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
751                         size_t this_len = buf->len;
752
753                         /* zero-length bvecs are not supported, skip them */
754                         if (!this_len)
755                                 continue;
756                         this_len = min(this_len, left);
757
758                         ret = pipe_buf_confirm(pipe, buf);
759                         if (unlikely(ret)) {
760                                 if (ret == -ENODATA)
761                                         ret = 0;
762                                 goto done;
763                         }
764
765                         bvec_set_page(&array[n], buf->page, this_len,
766                                       buf->offset);
767                         left -= this_len;
768                         n++;
769                 }
770
771                 iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
772                 ret = vfs_iter_write(out, &from, &sd.pos, 0);
773                 if (ret <= 0)
774                         break;
775
776                 sd.num_spliced += ret;
777                 sd.total_len -= ret;
778                 *ppos = sd.pos;
779
780                 /* dismiss the fully eaten buffers, adjust the partial one */
781                 tail = pipe->tail;
782                 while (ret) {
783                         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
784                         if (ret >= buf->len) {
785                                 ret -= buf->len;
786                                 buf->len = 0;
787                                 pipe_buf_release(pipe, buf);
788                                 tail++;
789                                 pipe->tail = tail;
790                                 if (pipe->files)
791                                         sd.need_wakeup = true;
792                         } else {
793                                 buf->offset += ret;
794                                 buf->len -= ret;
795                                 ret = 0;
796                         }
797                 }
798         }
799 done:
800         kfree(array);
801         splice_from_pipe_end(pipe, &sd);
802
803         pipe_unlock(pipe);
804
805         if (sd.num_spliced)
806                 ret = sd.num_spliced;
807
808         return ret;
809 }
810
811 EXPORT_SYMBOL(iter_file_splice_write);
812
813 #ifdef CONFIG_NET
814 /**
815  * splice_to_socket - splice data from a pipe to a socket
816  * @pipe:       pipe to splice from
817  * @out:        socket to write to
818  * @ppos:       position in @out
819  * @len:        number of bytes to splice
820  * @flags:      splice modifier flags
821  *
822  * Description:
823  *    Will send @len bytes from the pipe to a network socket. No data copying
824  *    is involved.
825  *
826  */
827 ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
828                          loff_t *ppos, size_t len, unsigned int flags)
829 {
830         struct socket *sock = sock_from_file(out);
831         struct bio_vec bvec[16];
832         struct msghdr msg = {};
833         ssize_t ret = 0;
834         size_t spliced = 0;
835         bool need_wakeup = false;
836
837         pipe_lock(pipe);
838
839         while (len > 0) {
840                 unsigned int head, tail, mask, bc = 0;
841                 size_t remain = len;
842
843                 /*
844                  * Check for signal early to make process killable when there
845                  * are always buffers available
846                  */
847                 ret = -ERESTARTSYS;
848                 if (signal_pending(current))
849                         break;
850
851                 while (pipe_empty(pipe->head, pipe->tail)) {
852                         ret = 0;
853                         if (!pipe->writers)
854                                 goto out;
855
856                         if (spliced)
857                                 goto out;
858
859                         ret = -EAGAIN;
860                         if (flags & SPLICE_F_NONBLOCK)
861                                 goto out;
862
863                         ret = -ERESTARTSYS;
864                         if (signal_pending(current))
865                                 goto out;
866
867                         if (need_wakeup) {
868                                 wakeup_pipe_writers(pipe);
869                                 need_wakeup = false;
870                         }
871
872                         pipe_wait_readable(pipe);
873                 }
874
875                 head = pipe->head;
876                 tail = pipe->tail;
877                 mask = pipe->ring_size - 1;
878
879                 while (!pipe_empty(head, tail)) {
880                         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
881                         size_t seg;
882
883                         if (!buf->len) {
884                                 tail++;
885                                 continue;
886                         }
887
888                         seg = min_t(size_t, remain, buf->len);
889                         seg = min_t(size_t, seg, PAGE_SIZE);
890
891                         ret = pipe_buf_confirm(pipe, buf);
892                         if (unlikely(ret)) {
893                                 if (ret == -ENODATA)
894                                         ret = 0;
895                                 break;
896                         }
897
898                         bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset);
899                         remain -= seg;
900                         if (seg >= buf->len)
901                                 tail++;
902                         if (bc >= ARRAY_SIZE(bvec))
903                                 break;
904                 }
905
906                 if (!bc)
907                         break;
908
909                 msg.msg_flags = MSG_SPLICE_PAGES;
910                 if (flags & SPLICE_F_MORE)
911                         msg.msg_flags |= MSG_MORE;
912                 if (remain && pipe_occupancy(pipe->head, tail) > 0)
913                         msg.msg_flags |= MSG_MORE;
914
915                 iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc,
916                               len - remain);
917                 ret = sock_sendmsg(sock, &msg);
918                 if (ret <= 0)
919                         break;
920
921                 spliced += ret;
922                 len -= ret;
923                 tail = pipe->tail;
924                 while (ret > 0) {
925                         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
926                         size_t seg = min_t(size_t, ret, buf->len);
927
928                         buf->offset += seg;
929                         buf->len -= seg;
930                         ret -= seg;
931
932                         if (!buf->len) {
933                                 pipe_buf_release(pipe, buf);
934                                 tail++;
935                         }
936                 }
937
938                 if (tail != pipe->tail) {
939                         pipe->tail = tail;
940                         if (pipe->files)
941                                 need_wakeup = true;
942                 }
943         }
944
945 out:
946         pipe_unlock(pipe);
947         if (need_wakeup)
948                 wakeup_pipe_writers(pipe);
949         return spliced ?: ret;
950 }
951 #endif
952
953 static int warn_unsupported(struct file *file, const char *op)
954 {
955         pr_debug_ratelimited(
956                 "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
957                 op, file, current->pid, current->comm);
958         return -EINVAL;
959 }
960
961 /*
962  * Attempt to initiate a splice from pipe to file.
963  */
964 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
965                            loff_t *ppos, size_t len, unsigned int flags)
966 {
967         if (unlikely(!out->f_op->splice_write))
968                 return warn_unsupported(out, "write");
969         return out->f_op->splice_write(pipe, out, ppos, len, flags);
970 }
971
972 /*
973  * Attempt to initiate a splice from a file to a pipe.
974  */
975 static long do_splice_to(struct file *in, loff_t *ppos,
976                          struct pipe_inode_info *pipe, size_t len,
977                          unsigned int flags)
978 {
979         unsigned int p_space;
980         int ret;
981
982         if (unlikely(!(in->f_mode & FMODE_READ)))
983                 return -EBADF;
984
985         /* Don't try to read more the pipe has space for. */
986         p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
987         len = min_t(size_t, len, p_space << PAGE_SHIFT);
988
989         ret = rw_verify_area(READ, in, ppos, len);
990         if (unlikely(ret < 0))
991                 return ret;
992
993         if (unlikely(len > MAX_RW_COUNT))
994                 len = MAX_RW_COUNT;
995
996         if (unlikely(!in->f_op->splice_read))
997                 return warn_unsupported(in, "read");
998         return in->f_op->splice_read(in, ppos, pipe, len, flags);
999 }
1000
1001 /**
1002  * splice_direct_to_actor - splices data directly between two non-pipes
1003  * @in:         file to splice from
1004  * @sd:         actor information on where to splice to
1005  * @actor:      handles the data splicing
1006  *
1007  * Description:
1008  *    This is a special case helper to splice directly between two
1009  *    points, without requiring an explicit pipe. Internally an allocated
1010  *    pipe is cached in the process, and reused during the lifetime of
1011  *    that process.
1012  *
1013  */
1014 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1015                                splice_direct_actor *actor)
1016 {
1017         struct pipe_inode_info *pipe;
1018         long ret, bytes;
1019         size_t len;
1020         int i, flags, more;
1021
1022         /*
1023          * We require the input to be seekable, as we don't want to randomly
1024          * drop data for eg socket -> socket splicing. Use the piped splicing
1025          * for that!
1026          */
1027         if (unlikely(!(in->f_mode & FMODE_LSEEK)))
1028                 return -EINVAL;
1029
1030         /*
1031          * neither in nor out is a pipe, setup an internal pipe attached to
1032          * 'out' and transfer the wanted data from 'in' to 'out' through that
1033          */
1034         pipe = current->splice_pipe;
1035         if (unlikely(!pipe)) {
1036                 pipe = alloc_pipe_info();
1037                 if (!pipe)
1038                         return -ENOMEM;
1039
1040                 /*
1041                  * We don't have an immediate reader, but we'll read the stuff
1042                  * out of the pipe right after the splice_to_pipe(). So set
1043                  * PIPE_READERS appropriately.
1044                  */
1045                 pipe->readers = 1;
1046
1047                 current->splice_pipe = pipe;
1048         }
1049
1050         /*
1051          * Do the splice.
1052          */
1053         bytes = 0;
1054         len = sd->total_len;
1055         flags = sd->flags;
1056
1057         /*
1058          * Don't block on output, we have to drain the direct pipe.
1059          */
1060         sd->flags &= ~SPLICE_F_NONBLOCK;
1061         more = sd->flags & SPLICE_F_MORE;
1062
1063         WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
1064
1065         while (len) {
1066                 size_t read_len;
1067                 loff_t pos = sd->pos, prev_pos = pos;
1068
1069                 ret = do_splice_to(in, &pos, pipe, len, flags);
1070                 if (unlikely(ret <= 0))
1071                         goto out_release;
1072
1073                 read_len = ret;
1074                 sd->total_len = read_len;
1075
1076                 /*
1077                  * If more data is pending, set SPLICE_F_MORE
1078                  * If this is the last data and SPLICE_F_MORE was not set
1079                  * initially, clears it.
1080                  */
1081                 if (read_len < len)
1082                         sd->flags |= SPLICE_F_MORE;
1083                 else if (!more)
1084                         sd->flags &= ~SPLICE_F_MORE;
1085                 /*
1086                  * NOTE: nonblocking mode only applies to the input. We
1087                  * must not do the output in nonblocking mode as then we
1088                  * could get stuck data in the internal pipe:
1089                  */
1090                 ret = actor(pipe, sd);
1091                 if (unlikely(ret <= 0)) {
1092                         sd->pos = prev_pos;
1093                         goto out_release;
1094                 }
1095
1096                 bytes += ret;
1097                 len -= ret;
1098                 sd->pos = pos;
1099
1100                 if (ret < read_len) {
1101                         sd->pos = prev_pos + ret;
1102                         goto out_release;
1103                 }
1104         }
1105
1106 done:
1107         pipe->tail = pipe->head = 0;
1108         file_accessed(in);
1109         return bytes;
1110
1111 out_release:
1112         /*
1113          * If we did an incomplete transfer we must release
1114          * the pipe buffers in question:
1115          */
1116         for (i = 0; i < pipe->ring_size; i++) {
1117                 struct pipe_buffer *buf = &pipe->bufs[i];
1118
1119                 if (buf->ops)
1120                         pipe_buf_release(pipe, buf);
1121         }
1122
1123         if (!bytes)
1124                 bytes = ret;
1125
1126         goto done;
1127 }
1128 EXPORT_SYMBOL(splice_direct_to_actor);
1129
1130 static int direct_splice_actor(struct pipe_inode_info *pipe,
1131                                struct splice_desc *sd)
1132 {
1133         struct file *file = sd->u.file;
1134
1135         return do_splice_from(pipe, file, sd->opos, sd->total_len,
1136                               sd->flags);
1137 }
1138
1139 /**
1140  * do_splice_direct - splices data directly between two files
1141  * @in:         file to splice from
1142  * @ppos:       input file offset
1143  * @out:        file to splice to
1144  * @opos:       output file offset
1145  * @len:        number of bytes to splice
1146  * @flags:      splice modifier flags
1147  *
1148  * Description:
1149  *    For use by do_sendfile(). splice can easily emulate sendfile, but
1150  *    doing it in the application would incur an extra system call
1151  *    (splice in + splice out, as compared to just sendfile()). So this helper
1152  *    can splice directly through a process-private pipe.
1153  *
1154  */
1155 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1156                       loff_t *opos, size_t len, unsigned int flags)
1157 {
1158         struct splice_desc sd = {
1159                 .len            = len,
1160                 .total_len      = len,
1161                 .flags          = flags,
1162                 .pos            = *ppos,
1163                 .u.file         = out,
1164                 .opos           = opos,
1165         };
1166         long ret;
1167
1168         if (unlikely(!(out->f_mode & FMODE_WRITE)))
1169                 return -EBADF;
1170
1171         if (unlikely(out->f_flags & O_APPEND))
1172                 return -EINVAL;
1173
1174         ret = rw_verify_area(WRITE, out, opos, len);
1175         if (unlikely(ret < 0))
1176                 return ret;
1177
1178         ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1179         if (ret > 0)
1180                 *ppos = sd.pos;
1181
1182         return ret;
1183 }
1184 EXPORT_SYMBOL(do_splice_direct);
1185
1186 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1187 {
1188         for (;;) {
1189                 if (unlikely(!pipe->readers)) {
1190                         send_sig(SIGPIPE, current, 0);
1191                         return -EPIPE;
1192                 }
1193                 if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1194                         return 0;
1195                 if (flags & SPLICE_F_NONBLOCK)
1196                         return -EAGAIN;
1197                 if (signal_pending(current))
1198                         return -ERESTARTSYS;
1199                 pipe_wait_writable(pipe);
1200         }
1201 }
1202
1203 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1204                                struct pipe_inode_info *opipe,
1205                                size_t len, unsigned int flags);
1206
1207 long splice_file_to_pipe(struct file *in,
1208                          struct pipe_inode_info *opipe,
1209                          loff_t *offset,
1210                          size_t len, unsigned int flags)
1211 {
1212         long ret;
1213
1214         pipe_lock(opipe);
1215         ret = wait_for_space(opipe, flags);
1216         if (!ret)
1217                 ret = do_splice_to(in, offset, opipe, len, flags);
1218         pipe_unlock(opipe);
1219         if (ret > 0)
1220                 wakeup_pipe_readers(opipe);
1221         return ret;
1222 }
1223
1224 /*
1225  * Determine where to splice to/from.
1226  */
1227 long do_splice(struct file *in, loff_t *off_in, struct file *out,
1228                loff_t *off_out, size_t len, unsigned int flags)
1229 {
1230         struct pipe_inode_info *ipipe;
1231         struct pipe_inode_info *opipe;
1232         loff_t offset;
1233         long ret;
1234
1235         if (unlikely(!(in->f_mode & FMODE_READ) ||
1236                      !(out->f_mode & FMODE_WRITE)))
1237                 return -EBADF;
1238
1239         ipipe = get_pipe_info(in, true);
1240         opipe = get_pipe_info(out, true);
1241
1242         if (ipipe && opipe) {
1243                 if (off_in || off_out)
1244                         return -ESPIPE;
1245
1246                 /* Splicing to self would be fun, but... */
1247                 if (ipipe == opipe)
1248                         return -EINVAL;
1249
1250                 if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1251                         flags |= SPLICE_F_NONBLOCK;
1252
1253                 return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1254         }
1255
1256         if (ipipe) {
1257                 if (off_in)
1258                         return -ESPIPE;
1259                 if (off_out) {
1260                         if (!(out->f_mode & FMODE_PWRITE))
1261                                 return -EINVAL;
1262                         offset = *off_out;
1263                 } else {
1264                         offset = out->f_pos;
1265                 }
1266
1267                 if (unlikely(out->f_flags & O_APPEND))
1268                         return -EINVAL;
1269
1270                 ret = rw_verify_area(WRITE, out, &offset, len);
1271                 if (unlikely(ret < 0))
1272                         return ret;
1273
1274                 if (in->f_flags & O_NONBLOCK)
1275                         flags |= SPLICE_F_NONBLOCK;
1276
1277                 file_start_write(out);
1278                 ret = do_splice_from(ipipe, out, &offset, len, flags);
1279                 file_end_write(out);
1280
1281                 if (ret > 0)
1282                         fsnotify_modify(out);
1283
1284                 if (!off_out)
1285                         out->f_pos = offset;
1286                 else
1287                         *off_out = offset;
1288
1289                 return ret;
1290         }
1291
1292         if (opipe) {
1293                 if (off_out)
1294                         return -ESPIPE;
1295                 if (off_in) {
1296                         if (!(in->f_mode & FMODE_PREAD))
1297                                 return -EINVAL;
1298                         offset = *off_in;
1299                 } else {
1300                         offset = in->f_pos;
1301                 }
1302
1303                 if (out->f_flags & O_NONBLOCK)
1304                         flags |= SPLICE_F_NONBLOCK;
1305
1306                 ret = splice_file_to_pipe(in, opipe, &offset, len, flags);
1307
1308                 if (ret > 0)
1309                         fsnotify_access(in);
1310
1311                 if (!off_in)
1312                         in->f_pos = offset;
1313                 else
1314                         *off_in = offset;
1315
1316                 return ret;
1317         }
1318
1319         return -EINVAL;
1320 }
1321
1322 static long __do_splice(struct file *in, loff_t __user *off_in,
1323                         struct file *out, loff_t __user *off_out,
1324                         size_t len, unsigned int flags)
1325 {
1326         struct pipe_inode_info *ipipe;
1327         struct pipe_inode_info *opipe;
1328         loff_t offset, *__off_in = NULL, *__off_out = NULL;
1329         long ret;
1330
1331         ipipe = get_pipe_info(in, true);
1332         opipe = get_pipe_info(out, true);
1333
1334         if (ipipe) {
1335                 if (off_in)
1336                         return -ESPIPE;
1337                 pipe_clear_nowait(in);
1338         }
1339         if (opipe) {
1340                 if (off_out)
1341                         return -ESPIPE;
1342                 pipe_clear_nowait(out);
1343         }
1344
1345         if (off_out) {
1346                 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1347                         return -EFAULT;
1348                 __off_out = &offset;
1349         }
1350         if (off_in) {
1351                 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1352                         return -EFAULT;
1353                 __off_in = &offset;
1354         }
1355
1356         ret = do_splice(in, __off_in, out, __off_out, len, flags);
1357         if (ret < 0)
1358                 return ret;
1359
1360         if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
1361                 return -EFAULT;
1362         if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
1363                 return -EFAULT;
1364
1365         return ret;
1366 }
1367
1368 static int iter_to_pipe(struct iov_iter *from,
1369                         struct pipe_inode_info *pipe,
1370                         unsigned flags)
1371 {
1372         struct pipe_buffer buf = {
1373                 .ops = &user_page_pipe_buf_ops,
1374                 .flags = flags
1375         };
1376         size_t total = 0;
1377         int ret = 0;
1378
1379         while (iov_iter_count(from)) {
1380                 struct page *pages[16];
1381                 ssize_t left;
1382                 size_t start;
1383                 int i, n;
1384
1385                 left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
1386                 if (left <= 0) {
1387                         ret = left;
1388                         break;
1389                 }
1390
1391                 n = DIV_ROUND_UP(left + start, PAGE_SIZE);
1392                 for (i = 0; i < n; i++) {
1393                         int size = min_t(int, left, PAGE_SIZE - start);
1394
1395                         buf.page = pages[i];
1396                         buf.offset = start;
1397                         buf.len = size;
1398                         ret = add_to_pipe(pipe, &buf);
1399                         if (unlikely(ret < 0)) {
1400                                 iov_iter_revert(from, left);
1401                                 // this one got dropped by add_to_pipe()
1402                                 while (++i < n)
1403                                         put_page(pages[i]);
1404                                 goto out;
1405                         }
1406                         total += ret;
1407                         left -= size;
1408                         start = 0;
1409                 }
1410         }
1411 out:
1412         return total ? total : ret;
1413 }
1414
1415 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1416                         struct splice_desc *sd)
1417 {
1418         int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1419         return n == sd->len ? n : -EFAULT;
1420 }
1421
1422 /*
1423  * For lack of a better implementation, implement vmsplice() to userspace
1424  * as a simple copy of the pipes pages to the user iov.
1425  */
1426 static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
1427                              unsigned int flags)
1428 {
1429         struct pipe_inode_info *pipe = get_pipe_info(file, true);
1430         struct splice_desc sd = {
1431                 .total_len = iov_iter_count(iter),
1432                 .flags = flags,
1433                 .u.data = iter
1434         };
1435         long ret = 0;
1436
1437         if (!pipe)
1438                 return -EBADF;
1439
1440         pipe_clear_nowait(file);
1441
1442         if (sd.total_len) {
1443                 pipe_lock(pipe);
1444                 ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1445                 pipe_unlock(pipe);
1446         }
1447
1448         return ret;
1449 }
1450
1451 /*
1452  * vmsplice splices a user address range into a pipe. It can be thought of
1453  * as splice-from-memory, where the regular splice is splice-from-file (or
1454  * to file). In both cases the output is a pipe, naturally.
1455  */
1456 static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
1457                              unsigned int flags)
1458 {
1459         struct pipe_inode_info *pipe;
1460         long ret = 0;
1461         unsigned buf_flag = 0;
1462
1463         if (flags & SPLICE_F_GIFT)
1464                 buf_flag = PIPE_BUF_FLAG_GIFT;
1465
1466         pipe = get_pipe_info(file, true);
1467         if (!pipe)
1468                 return -EBADF;
1469
1470         pipe_clear_nowait(file);
1471
1472         pipe_lock(pipe);
1473         ret = wait_for_space(pipe, flags);
1474         if (!ret)
1475                 ret = iter_to_pipe(iter, pipe, buf_flag);
1476         pipe_unlock(pipe);
1477         if (ret > 0)
1478                 wakeup_pipe_readers(pipe);
1479         return ret;
1480 }
1481
1482 static int vmsplice_type(struct fd f, int *type)
1483 {
1484         if (!f.file)
1485                 return -EBADF;
1486         if (f.file->f_mode & FMODE_WRITE) {
1487                 *type = ITER_SOURCE;
1488         } else if (f.file->f_mode & FMODE_READ) {
1489                 *type = ITER_DEST;
1490         } else {
1491                 fdput(f);
1492                 return -EBADF;
1493         }
1494         return 0;
1495 }
1496
1497 /*
1498  * Note that vmsplice only really supports true splicing _from_ user memory
1499  * to a pipe, not the other way around. Splicing from user memory is a simple
1500  * operation that can be supported without any funky alignment restrictions
1501  * or nasty vm tricks. We simply map in the user memory and fill them into
1502  * a pipe. The reverse isn't quite as easy, though. There are two possible
1503  * solutions for that:
1504  *
1505  *      - memcpy() the data internally, at which point we might as well just
1506  *        do a regular read() on the buffer anyway.
1507  *      - Lots of nasty vm tricks, that are neither fast nor flexible (it
1508  *        has restriction limitations on both ends of the pipe).
1509  *
1510  * Currently we punt and implement it as a normal copy, see pipe_to_user().
1511  *
1512  */
1513 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
1514                 unsigned long, nr_segs, unsigned int, flags)
1515 {
1516         struct iovec iovstack[UIO_FASTIOV];
1517         struct iovec *iov = iovstack;
1518         struct iov_iter iter;
1519         ssize_t error;
1520         struct fd f;
1521         int type;
1522
1523         if (unlikely(flags & ~SPLICE_F_ALL))
1524                 return -EINVAL;
1525
1526         f = fdget(fd);
1527         error = vmsplice_type(f, &type);
1528         if (error)
1529                 return error;
1530
1531         error = import_iovec(type, uiov, nr_segs,
1532                              ARRAY_SIZE(iovstack), &iov, &iter);
1533         if (error < 0)
1534                 goto out_fdput;
1535
1536         if (!iov_iter_count(&iter))
1537                 error = 0;
1538         else if (type == ITER_SOURCE)
1539                 error = vmsplice_to_pipe(f.file, &iter, flags);
1540         else
1541                 error = vmsplice_to_user(f.file, &iter, flags);
1542
1543         kfree(iov);
1544 out_fdput:
1545         fdput(f);
1546         return error;
1547 }
1548
1549 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1550                 int, fd_out, loff_t __user *, off_out,
1551                 size_t, len, unsigned int, flags)
1552 {
1553         struct fd in, out;
1554         long error;
1555
1556         if (unlikely(!len))
1557                 return 0;
1558
1559         if (unlikely(flags & ~SPLICE_F_ALL))
1560                 return -EINVAL;
1561
1562         error = -EBADF;
1563         in = fdget(fd_in);
1564         if (in.file) {
1565                 out = fdget(fd_out);
1566                 if (out.file) {
1567                         error = __do_splice(in.file, off_in, out.file, off_out,
1568                                                 len, flags);
1569                         fdput(out);
1570                 }
1571                 fdput(in);
1572         }
1573         return error;
1574 }
1575
1576 /*
1577  * Make sure there's data to read. Wait for input if we can, otherwise
1578  * return an appropriate error.
1579  */
1580 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1581 {
1582         int ret;
1583
1584         /*
1585          * Check the pipe occupancy without the inode lock first. This function
1586          * is speculative anyways, so missing one is ok.
1587          */
1588         if (!pipe_empty(pipe->head, pipe->tail))
1589                 return 0;
1590
1591         ret = 0;
1592         pipe_lock(pipe);
1593
1594         while (pipe_empty(pipe->head, pipe->tail)) {
1595                 if (signal_pending(current)) {
1596                         ret = -ERESTARTSYS;
1597                         break;
1598                 }
1599                 if (!pipe->writers)
1600                         break;
1601                 if (flags & SPLICE_F_NONBLOCK) {
1602                         ret = -EAGAIN;
1603                         break;
1604                 }
1605                 pipe_wait_readable(pipe);
1606         }
1607
1608         pipe_unlock(pipe);
1609         return ret;
1610 }
1611
1612 /*
1613  * Make sure there's writeable room. Wait for room if we can, otherwise
1614  * return an appropriate error.
1615  */
1616 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1617 {
1618         int ret;
1619
1620         /*
1621          * Check pipe occupancy without the inode lock first. This function
1622          * is speculative anyways, so missing one is ok.
1623          */
1624         if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1625                 return 0;
1626
1627         ret = 0;
1628         pipe_lock(pipe);
1629
1630         while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
1631                 if (!pipe->readers) {
1632                         send_sig(SIGPIPE, current, 0);
1633                         ret = -EPIPE;
1634                         break;
1635                 }
1636                 if (flags & SPLICE_F_NONBLOCK) {
1637                         ret = -EAGAIN;
1638                         break;
1639                 }
1640                 if (signal_pending(current)) {
1641                         ret = -ERESTARTSYS;
1642                         break;
1643                 }
1644                 pipe_wait_writable(pipe);
1645         }
1646
1647         pipe_unlock(pipe);
1648         return ret;
1649 }
1650
1651 /*
1652  * Splice contents of ipipe to opipe.
1653  */
1654 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1655                                struct pipe_inode_info *opipe,
1656                                size_t len, unsigned int flags)
1657 {
1658         struct pipe_buffer *ibuf, *obuf;
1659         unsigned int i_head, o_head;
1660         unsigned int i_tail, o_tail;
1661         unsigned int i_mask, o_mask;
1662         int ret = 0;
1663         bool input_wakeup = false;
1664
1665
1666 retry:
1667         ret = ipipe_prep(ipipe, flags);
1668         if (ret)
1669                 return ret;
1670
1671         ret = opipe_prep(opipe, flags);
1672         if (ret)
1673                 return ret;
1674
1675         /*
1676          * Potential ABBA deadlock, work around it by ordering lock
1677          * grabbing by pipe info address. Otherwise two different processes
1678          * could deadlock (one doing tee from A -> B, the other from B -> A).
1679          */
1680         pipe_double_lock(ipipe, opipe);
1681
1682         i_tail = ipipe->tail;
1683         i_mask = ipipe->ring_size - 1;
1684         o_head = opipe->head;
1685         o_mask = opipe->ring_size - 1;
1686
1687         do {
1688                 size_t o_len;
1689
1690                 if (!opipe->readers) {
1691                         send_sig(SIGPIPE, current, 0);
1692                         if (!ret)
1693                                 ret = -EPIPE;
1694                         break;
1695                 }
1696
1697                 i_head = ipipe->head;
1698                 o_tail = opipe->tail;
1699
1700                 if (pipe_empty(i_head, i_tail) && !ipipe->writers)
1701                         break;
1702
1703                 /*
1704                  * Cannot make any progress, because either the input
1705                  * pipe is empty or the output pipe is full.
1706                  */
1707                 if (pipe_empty(i_head, i_tail) ||
1708                     pipe_full(o_head, o_tail, opipe->max_usage)) {
1709                         /* Already processed some buffers, break */
1710                         if (ret)
1711                                 break;
1712
1713                         if (flags & SPLICE_F_NONBLOCK) {
1714                                 ret = -EAGAIN;
1715                                 break;
1716                         }
1717
1718                         /*
1719                          * We raced with another reader/writer and haven't
1720                          * managed to process any buffers.  A zero return
1721                          * value means EOF, so retry instead.
1722                          */
1723                         pipe_unlock(ipipe);
1724                         pipe_unlock(opipe);
1725                         goto retry;
1726                 }
1727
1728                 ibuf = &ipipe->bufs[i_tail & i_mask];
1729                 obuf = &opipe->bufs[o_head & o_mask];
1730
1731                 if (len >= ibuf->len) {
1732                         /*
1733                          * Simply move the whole buffer from ipipe to opipe
1734                          */
1735                         *obuf = *ibuf;
1736                         ibuf->ops = NULL;
1737                         i_tail++;
1738                         ipipe->tail = i_tail;
1739                         input_wakeup = true;
1740                         o_len = obuf->len;
1741                         o_head++;
1742                         opipe->head = o_head;
1743                 } else {
1744                         /*
1745                          * Get a reference to this pipe buffer,
1746                          * so we can copy the contents over.
1747                          */
1748                         if (!pipe_buf_get(ipipe, ibuf)) {
1749                                 if (ret == 0)
1750                                         ret = -EFAULT;
1751                                 break;
1752                         }
1753                         *obuf = *ibuf;
1754
1755                         /*
1756                          * Don't inherit the gift and merge flags, we need to
1757                          * prevent multiple steals of this page.
1758                          */
1759                         obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1760                         obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1761
1762                         obuf->len = len;
1763                         ibuf->offset += len;
1764                         ibuf->len -= len;
1765                         o_len = len;
1766                         o_head++;
1767                         opipe->head = o_head;
1768                 }
1769                 ret += o_len;
1770                 len -= o_len;
1771         } while (len);
1772
1773         pipe_unlock(ipipe);
1774         pipe_unlock(opipe);
1775
1776         /*
1777          * If we put data in the output pipe, wakeup any potential readers.
1778          */
1779         if (ret > 0)
1780                 wakeup_pipe_readers(opipe);
1781
1782         if (input_wakeup)
1783                 wakeup_pipe_writers(ipipe);
1784
1785         return ret;
1786 }
1787
1788 /*
1789  * Link contents of ipipe to opipe.
1790  */
1791 static int link_pipe(struct pipe_inode_info *ipipe,
1792                      struct pipe_inode_info *opipe,
1793                      size_t len, unsigned int flags)
1794 {
1795         struct pipe_buffer *ibuf, *obuf;
1796         unsigned int i_head, o_head;
1797         unsigned int i_tail, o_tail;
1798         unsigned int i_mask, o_mask;
1799         int ret = 0;
1800
1801         /*
1802          * Potential ABBA deadlock, work around it by ordering lock
1803          * grabbing by pipe info address. Otherwise two different processes
1804          * could deadlock (one doing tee from A -> B, the other from B -> A).
1805          */
1806         pipe_double_lock(ipipe, opipe);
1807
1808         i_tail = ipipe->tail;
1809         i_mask = ipipe->ring_size - 1;
1810         o_head = opipe->head;
1811         o_mask = opipe->ring_size - 1;
1812
1813         do {
1814                 if (!opipe->readers) {
1815                         send_sig(SIGPIPE, current, 0);
1816                         if (!ret)
1817                                 ret = -EPIPE;
1818                         break;
1819                 }
1820
1821                 i_head = ipipe->head;
1822                 o_tail = opipe->tail;
1823
1824                 /*
1825                  * If we have iterated all input buffers or run out of
1826                  * output room, break.
1827                  */
1828                 if (pipe_empty(i_head, i_tail) ||
1829                     pipe_full(o_head, o_tail, opipe->max_usage))
1830                         break;
1831
1832                 ibuf = &ipipe->bufs[i_tail & i_mask];
1833                 obuf = &opipe->bufs[o_head & o_mask];
1834
1835                 /*
1836                  * Get a reference to this pipe buffer,
1837                  * so we can copy the contents over.
1838                  */
1839                 if (!pipe_buf_get(ipipe, ibuf)) {
1840                         if (ret == 0)
1841                                 ret = -EFAULT;
1842                         break;
1843                 }
1844
1845                 *obuf = *ibuf;
1846
1847                 /*
1848                  * Don't inherit the gift and merge flag, we need to prevent
1849                  * multiple steals of this page.
1850                  */
1851                 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1852                 obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1853
1854                 if (obuf->len > len)
1855                         obuf->len = len;
1856                 ret += obuf->len;
1857                 len -= obuf->len;
1858
1859                 o_head++;
1860                 opipe->head = o_head;
1861                 i_tail++;
1862         } while (len);
1863
1864         pipe_unlock(ipipe);
1865         pipe_unlock(opipe);
1866
1867         /*
1868          * If we put data in the output pipe, wakeup any potential readers.
1869          */
1870         if (ret > 0)
1871                 wakeup_pipe_readers(opipe);
1872
1873         return ret;
1874 }
1875
1876 /*
1877  * This is a tee(1) implementation that works on pipes. It doesn't copy
1878  * any data, it simply references the 'in' pages on the 'out' pipe.
1879  * The 'flags' used are the SPLICE_F_* variants, currently the only
1880  * applicable one is SPLICE_F_NONBLOCK.
1881  */
1882 long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
1883 {
1884         struct pipe_inode_info *ipipe = get_pipe_info(in, true);
1885         struct pipe_inode_info *opipe = get_pipe_info(out, true);
1886         int ret = -EINVAL;
1887
1888         if (unlikely(!(in->f_mode & FMODE_READ) ||
1889                      !(out->f_mode & FMODE_WRITE)))
1890                 return -EBADF;
1891
1892         /*
1893          * Duplicate the contents of ipipe to opipe without actually
1894          * copying the data.
1895          */
1896         if (ipipe && opipe && ipipe != opipe) {
1897                 if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1898                         flags |= SPLICE_F_NONBLOCK;
1899
1900                 /*
1901                  * Keep going, unless we encounter an error. The ipipe/opipe
1902                  * ordering doesn't really matter.
1903                  */
1904                 ret = ipipe_prep(ipipe, flags);
1905                 if (!ret) {
1906                         ret = opipe_prep(opipe, flags);
1907                         if (!ret)
1908                                 ret = link_pipe(ipipe, opipe, len, flags);
1909                 }
1910         }
1911
1912         return ret;
1913 }
1914
1915 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1916 {
1917         struct fd in, out;
1918         int error;
1919
1920         if (unlikely(flags & ~SPLICE_F_ALL))
1921                 return -EINVAL;
1922
1923         if (unlikely(!len))
1924                 return 0;
1925
1926         error = -EBADF;
1927         in = fdget(fdin);
1928         if (in.file) {
1929                 out = fdget(fdout);
1930                 if (out.file) {
1931                         error = do_tee(in.file, out.file, len, flags);
1932                         fdput(out);
1933                 }
1934                 fdput(in);
1935         }
1936
1937         return error;
1938 }