crypto: Fix af_alg_sendmsg(MSG_SPLICE_PAGES) sglist limit
[platform/kernel/linux-rpi.git] / fs / splice.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * "splice": joining two ropes together by interweaving their strands.
4  *
5  * This is the "extended pipe" functionality, where a pipe is used as
6  * an arbitrary in-memory buffer. Think of a pipe as a small kernel
7  * buffer that you can use to transfer data from one end to the other.
8  *
9  * The traditional unix read/write is extended with a "splice()" operation
10  * that transfers data buffers to or from a pipe buffer.
11  *
12  * Named by Larry McVoy, original implementation from Linus, extended by
13  * Jens to support splicing to files, network, direct splicing, etc and
14  * fixing lots of bugs.
15  *
16  * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
17  * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
18  * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
19  *
20  */
21 #include <linux/bvec.h>
22 #include <linux/fs.h>
23 #include <linux/file.h>
24 #include <linux/pagemap.h>
25 #include <linux/splice.h>
26 #include <linux/memcontrol.h>
27 #include <linux/mm_inline.h>
28 #include <linux/swap.h>
29 #include <linux/writeback.h>
30 #include <linux/export.h>
31 #include <linux/syscalls.h>
32 #include <linux/uio.h>
33 #include <linux/fsnotify.h>
34 #include <linux/security.h>
35 #include <linux/gfp.h>
36 #include <linux/net.h>
37 #include <linux/socket.h>
38 #include <linux/sched/signal.h>
39
40 #include "internal.h"
41
42 /*
43  * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to
44  * indicate they support non-blocking reads or writes, we must clear it
45  * here if set to avoid blocking other users of this pipe if splice is
46  * being done on it.
47  */
48 static noinline void noinline pipe_clear_nowait(struct file *file)
49 {
50         fmode_t fmode = READ_ONCE(file->f_mode);
51
52         do {
53                 if (!(fmode & FMODE_NOWAIT))
54                         break;
55         } while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT));
56 }
57
58 /*
59  * Attempt to steal a page from a pipe buffer. This should perhaps go into
60  * a vm helper function, it's already simplified quite a bit by the
61  * addition of remove_mapping(). If success is returned, the caller may
62  * attempt to reuse this page for another destination.
63  */
64 static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
65                 struct pipe_buffer *buf)
66 {
67         struct folio *folio = page_folio(buf->page);
68         struct address_space *mapping;
69
70         folio_lock(folio);
71
72         mapping = folio_mapping(folio);
73         if (mapping) {
74                 WARN_ON(!folio_test_uptodate(folio));
75
76                 /*
77                  * At least for ext2 with nobh option, we need to wait on
78                  * writeback completing on this folio, since we'll remove it
79                  * from the pagecache.  Otherwise truncate wont wait on the
80                  * folio, allowing the disk blocks to be reused by someone else
81                  * before we actually wrote our data to them. fs corruption
82                  * ensues.
83                  */
84                 folio_wait_writeback(folio);
85
86                 if (folio_has_private(folio) &&
87                     !filemap_release_folio(folio, GFP_KERNEL))
88                         goto out_unlock;
89
90                 /*
91                  * If we succeeded in removing the mapping, set LRU flag
92                  * and return good.
93                  */
94                 if (remove_mapping(mapping, folio)) {
95                         buf->flags |= PIPE_BUF_FLAG_LRU;
96                         return true;
97                 }
98         }
99
100         /*
101          * Raced with truncate or failed to remove folio from current
102          * address space, unlock and return failure.
103          */
104 out_unlock:
105         folio_unlock(folio);
106         return false;
107 }
108
109 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
110                                         struct pipe_buffer *buf)
111 {
112         put_page(buf->page);
113         buf->flags &= ~PIPE_BUF_FLAG_LRU;
114 }
115
116 /*
117  * Check whether the contents of buf is OK to access. Since the content
118  * is a page cache page, IO may be in flight.
119  */
120 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
121                                        struct pipe_buffer *buf)
122 {
123         struct page *page = buf->page;
124         int err;
125
126         if (!PageUptodate(page)) {
127                 lock_page(page);
128
129                 /*
130                  * Page got truncated/unhashed. This will cause a 0-byte
131                  * splice, if this is the first page.
132                  */
133                 if (!page->mapping) {
134                         err = -ENODATA;
135                         goto error;
136                 }
137
138                 /*
139                  * Uh oh, read-error from disk.
140                  */
141                 if (!PageUptodate(page)) {
142                         err = -EIO;
143                         goto error;
144                 }
145
146                 /*
147                  * Page is ok afterall, we are done.
148                  */
149                 unlock_page(page);
150         }
151
152         return 0;
153 error:
154         unlock_page(page);
155         return err;
156 }
157
158 const struct pipe_buf_operations page_cache_pipe_buf_ops = {
159         .confirm        = page_cache_pipe_buf_confirm,
160         .release        = page_cache_pipe_buf_release,
161         .try_steal      = page_cache_pipe_buf_try_steal,
162         .get            = generic_pipe_buf_get,
163 };
164
165 static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
166                 struct pipe_buffer *buf)
167 {
168         if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
169                 return false;
170
171         buf->flags |= PIPE_BUF_FLAG_LRU;
172         return generic_pipe_buf_try_steal(pipe, buf);
173 }
174
175 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
176         .release        = page_cache_pipe_buf_release,
177         .try_steal      = user_page_pipe_buf_try_steal,
178         .get            = generic_pipe_buf_get,
179 };
180
/*
 * Notify readers of @pipe: wake tasks sleeping on pipe->rd_wait (if any)
 * and send SIGIO/POLL_IN to the pipe's fasync readers.
 */
static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
        /*
         * Full barrier ahead of the lockless waitqueue_active() check.
         * NOTE(review): presumably this orders the caller's earlier pipe
         * updates against the check - confirm against the waiter side.
         */
        smp_mb();
        if (waitqueue_active(&pipe->rd_wait))
                wake_up_interruptible(&pipe->rd_wait);
        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
188
189 /**
190  * splice_to_pipe - fill passed data into a pipe
191  * @pipe:       pipe to fill
192  * @spd:        data to fill
193  *
194  * Description:
195  *    @spd contains a map of pages and len/offset tuples, along with
196  *    the struct pipe_buf_operations associated with these pages. This
197  *    function will link that data to the pipe.
198  *
199  */
200 ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
201                        struct splice_pipe_desc *spd)
202 {
203         unsigned int spd_pages = spd->nr_pages;
204         unsigned int tail = pipe->tail;
205         unsigned int head = pipe->head;
206         unsigned int mask = pipe->ring_size - 1;
207         int ret = 0, page_nr = 0;
208
209         if (!spd_pages)
210                 return 0;
211
212         if (unlikely(!pipe->readers)) {
213                 send_sig(SIGPIPE, current, 0);
214                 ret = -EPIPE;
215                 goto out;
216         }
217
218         while (!pipe_full(head, tail, pipe->max_usage)) {
219                 struct pipe_buffer *buf = &pipe->bufs[head & mask];
220
221                 buf->page = spd->pages[page_nr];
222                 buf->offset = spd->partial[page_nr].offset;
223                 buf->len = spd->partial[page_nr].len;
224                 buf->private = spd->partial[page_nr].private;
225                 buf->ops = spd->ops;
226                 buf->flags = 0;
227
228                 head++;
229                 pipe->head = head;
230                 page_nr++;
231                 ret += buf->len;
232
233                 if (!--spd->nr_pages)
234                         break;
235         }
236
237         if (!ret)
238                 ret = -EAGAIN;
239
240 out:
241         while (page_nr < spd_pages)
242                 spd->spd_release(spd, page_nr++);
243
244         return ret;
245 }
246 EXPORT_SYMBOL_GPL(splice_to_pipe);
247
248 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
249 {
250         unsigned int head = pipe->head;
251         unsigned int tail = pipe->tail;
252         unsigned int mask = pipe->ring_size - 1;
253         int ret;
254
255         if (unlikely(!pipe->readers)) {
256                 send_sig(SIGPIPE, current, 0);
257                 ret = -EPIPE;
258         } else if (pipe_full(head, tail, pipe->max_usage)) {
259                 ret = -EAGAIN;
260         } else {
261                 pipe->bufs[head & mask] = *buf;
262                 pipe->head = head + 1;
263                 return buf->len;
264         }
265         pipe_buf_release(pipe, buf);
266         return ret;
267 }
268 EXPORT_SYMBOL(add_to_pipe);
269
270 /*
271  * Check if we need to grow the arrays holding pages and partial page
272  * descriptions.
273  */
274 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
275 {
276         unsigned int max_usage = READ_ONCE(pipe->max_usage);
277
278         spd->nr_pages_max = max_usage;
279         if (max_usage <= PIPE_DEF_BUFFERS)
280                 return 0;
281
282         spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
283         spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
284                                      GFP_KERNEL);
285
286         if (spd->pages && spd->partial)
287                 return 0;
288
289         kfree(spd->pages);
290         kfree(spd->partial);
291         return -ENOMEM;
292 }
293
294 void splice_shrink_spd(struct splice_pipe_desc *spd)
295 {
296         if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
297                 return;
298
299         kfree(spd->pages);
300         kfree(spd->partial);
301 }
302
/*
 * Splice data from an O_DIRECT file into pages and then add them to the output
 * pipe.
 *
 * Freshly allocated pages are filled by the file's ->read_iter() through a
 * bvec iterator, and the pages that received data are then appended to
 * @pipe using default_pipe_buf_ops. @flags is not used here.
 *
 * Returns whatever the read returned (bytes read, or 0), a negative errno
 * with -EFAULT translated to -EAGAIN, or -ENOMEM if the bookkeeping array
 * or the pages could not be allocated.
 *
 * NOTE(review): the pipe is modified without any locking in this function;
 * presumably the caller holds the pipe lock - confirm.
 */
ssize_t direct_splice_read(struct file *in, loff_t *ppos,
                           struct pipe_inode_info *pipe,
                           size_t len, unsigned int flags)
{
        struct iov_iter to;
        struct bio_vec *bv;
        struct kiocb kiocb;
        struct page **pages;
        ssize_t ret;
        size_t used, npages, chunk, remain, reclaim;
        int i;

        /* Work out how much data we can actually add into the pipe */
        used = pipe_occupancy(pipe->head, pipe->tail);
        npages = max_t(ssize_t, pipe->max_usage - used, 0);
        len = min_t(size_t, len, npages * PAGE_SIZE);
        npages = DIV_ROUND_UP(len, PAGE_SIZE);

        /*
         * One allocation holds both arrays: npages bio_vecs followed by
         * npages page pointers.
         */
        bv = kzalloc(array_size(npages, sizeof(bv[0])) +
                     array_size(npages, sizeof(struct page *)), GFP_KERNEL);
        if (!bv)
                return -ENOMEM;

        pages = (void *)(bv + npages);
        /* The bulk allocator's return value becomes the new page count. */
        npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
        if (!npages) {
                kfree(bv);
                return -ENOMEM;
        }

        /* Clamp the request again to the pages we actually obtained. */
        remain = len = min_t(size_t, len, npages * PAGE_SIZE);

        /* Describe each page as one bvec segment; the last may be short. */
        for (i = 0; i < npages; i++) {
                chunk = min_t(size_t, PAGE_SIZE, remain);
                bv[i].bv_page = pages[i];
                bv[i].bv_offset = 0;
                bv[i].bv_len = chunk;
                remain -= chunk;
        }

        /* Do the I/O */
        iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
        init_sync_kiocb(&kiocb, in);
        kiocb.ki_pos = *ppos;
        ret = call_read_iter(in, &kiocb, &to);

        /*
         * reclaim starts as the byte capacity of all pages and is reduced
         * by the bytes read; remain becomes the bytes to publish. When
         * nothing was read (ret <= 0) every page is reclaimed.
         */
        reclaim = npages * PAGE_SIZE;
        remain = 0;
        if (ret > 0) {
                reclaim -= ret;
                remain = ret;
                *ppos = kiocb.ki_pos;
                file_accessed(in);
        } else if (ret < 0) {
                /*
                 * callers of ->splice_read() expect -EAGAIN on
                 * "can't put anything in there", rather than -EFAULT.
                 */
                if (ret == -EFAULT)
                        ret = -EAGAIN;
        }

        /* Free any pages that didn't get touched at all. */
        /* Integer division keeps a partially filled page out of the count. */
        reclaim /= PAGE_SIZE;
        if (reclaim) {
                npages -= reclaim;
                release_pages(pages + npages, reclaim);
        }

        /* Push the remaining pages into the pipe. */
        /* No free-slot check here: len was clamped to the free space above. */
        for (i = 0; i < npages; i++) {
                struct pipe_buffer *buf = pipe_head_buf(pipe);

                chunk = min_t(size_t, remain, PAGE_SIZE);
                *buf = (struct pipe_buffer) {
                        .ops    = &default_pipe_buf_ops,
                        .page   = bv[i].bv_page,
                        .offset = 0,
                        .len    = chunk,
                };
                pipe->head++;
                remain -= chunk;
        }

        /* pages[] lives inside the bv allocation, so this frees both. */
        kfree(bv);
        return ret;
}
EXPORT_SYMBOL(direct_splice_read);
395
396 /**
397  * generic_file_splice_read - splice data from file to a pipe
398  * @in:         file to splice from
399  * @ppos:       position in @in
400  * @pipe:       pipe to splice to
401  * @len:        number of bytes to splice
402  * @flags:      splice modifier flags
403  *
404  * Description:
405  *    Will read pages from given file and fill them into a pipe. Can be
406  *    used as long as it has more or less sane ->read_iter().
407  *
408  */
409 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
410                                  struct pipe_inode_info *pipe, size_t len,
411                                  unsigned int flags)
412 {
413         struct iov_iter to;
414         struct kiocb kiocb;
415         int ret;
416
417         iov_iter_pipe(&to, ITER_DEST, pipe, len);
418         init_sync_kiocb(&kiocb, in);
419         kiocb.ki_pos = *ppos;
420         ret = call_read_iter(in, &kiocb, &to);
421         if (ret > 0) {
422                 *ppos = kiocb.ki_pos;
423                 file_accessed(in);
424         } else if (ret < 0) {
425                 /* free what was emitted */
426                 pipe_discard_from(pipe, to.start_head);
427                 /*
428                  * callers of ->splice_read() expect -EAGAIN on
429                  * "can't put anything in there", rather than -EFAULT.
430                  */
431                 if (ret == -EFAULT)
432                         ret = -EAGAIN;
433         }
434
435         return ret;
436 }
437 EXPORT_SYMBOL(generic_file_splice_read);
438
439 const struct pipe_buf_operations default_pipe_buf_ops = {
440         .release        = generic_pipe_buf_release,
441         .try_steal      = generic_pipe_buf_try_steal,
442         .get            = generic_pipe_buf_get,
443 };
444
445 /* Pipe buffer operations for a socket and similar. */
446 const struct pipe_buf_operations nosteal_pipe_buf_ops = {
447         .release        = generic_pipe_buf_release,
448         .get            = generic_pipe_buf_get,
449 };
450 EXPORT_SYMBOL(nosteal_pipe_buf_ops);
451
/*
 * Notify writers of @pipe: wake tasks sleeping on pipe->wr_wait (if any)
 * and send SIGIO/POLL_OUT to the pipe's fasync writers.
 */
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
        /*
         * Full barrier ahead of the lockless waitqueue_active() check.
         * NOTE(review): presumably this orders the caller's earlier pipe
         * updates against the check - confirm against the waiter side.
         */
        smp_mb();
        if (waitqueue_active(&pipe->wr_wait))
                wake_up_interruptible(&pipe->wr_wait);
        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
459
460 /**
461  * splice_from_pipe_feed - feed available data from a pipe to a file
462  * @pipe:       pipe to splice from
463  * @sd:         information to @actor
464  * @actor:      handler that splices the data
465  *
466  * Description:
467  *    This function loops over the pipe and calls @actor to do the
468  *    actual moving of a single struct pipe_buffer to the desired
469  *    destination.  It returns when there's no more buffers left in
470  *    the pipe or if the requested number of bytes (@sd->total_len)
471  *    have been copied.  It returns a positive number (one) if the
472  *    pipe needs to be filled with more data, zero if the required
473  *    number of bytes have been copied and -errno on error.
474  *
475  *    This, together with splice_from_pipe_{begin,end,next}, may be
476  *    used to implement the functionality of __splice_from_pipe() when
477  *    locking is required around copying the pipe buffers to the
478  *    destination.
479  */
480 static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
481                           splice_actor *actor)
482 {
483         unsigned int head = pipe->head;
484         unsigned int tail = pipe->tail;
485         unsigned int mask = pipe->ring_size - 1;
486         int ret;
487
488         while (!pipe_empty(head, tail)) {
489                 struct pipe_buffer *buf = &pipe->bufs[tail & mask];
490
491                 sd->len = buf->len;
492                 if (sd->len > sd->total_len)
493                         sd->len = sd->total_len;
494
495                 ret = pipe_buf_confirm(pipe, buf);
496                 if (unlikely(ret)) {
497                         if (ret == -ENODATA)
498                                 ret = 0;
499                         return ret;
500                 }
501
502                 ret = actor(pipe, buf, sd);
503                 if (ret <= 0)
504                         return ret;
505
506                 buf->offset += ret;
507                 buf->len -= ret;
508
509                 sd->num_spliced += ret;
510                 sd->len -= ret;
511                 sd->pos += ret;
512                 sd->total_len -= ret;
513
514                 if (!buf->len) {
515                         pipe_buf_release(pipe, buf);
516                         tail++;
517                         pipe->tail = tail;
518                         if (pipe->files)
519                                 sd->need_wakeup = true;
520                 }
521
522                 if (!sd->total_len)
523                         return 0;
524         }
525
526         return 1;
527 }
528
529 /* We know we have a pipe buffer, but maybe it's empty? */
530 static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
531 {
532         unsigned int tail = pipe->tail;
533         unsigned int mask = pipe->ring_size - 1;
534         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
535
536         if (unlikely(!buf->len)) {
537                 pipe_buf_release(pipe, buf);
538                 pipe->tail = tail+1;
539                 return true;
540         }
541
542         return false;
543 }
544
545 /**
546  * splice_from_pipe_next - wait for some data to splice from
547  * @pipe:       pipe to splice from
548  * @sd:         information about the splice operation
549  *
550  * Description:
551  *    This function will wait for some data and return a positive
552  *    value (one) if pipe buffers are available.  It will return zero
553  *    or -errno if no more data needs to be spliced.
554  */
555 static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
556 {
557         /*
558          * Check for signal early to make process killable when there are
559          * always buffers available
560          */
561         if (signal_pending(current))
562                 return -ERESTARTSYS;
563
564 repeat:
565         while (pipe_empty(pipe->head, pipe->tail)) {
566                 if (!pipe->writers)
567                         return 0;
568
569                 if (sd->num_spliced)
570                         return 0;
571
572                 if (sd->flags & SPLICE_F_NONBLOCK)
573                         return -EAGAIN;
574
575                 if (signal_pending(current))
576                         return -ERESTARTSYS;
577
578                 if (sd->need_wakeup) {
579                         wakeup_pipe_writers(pipe);
580                         sd->need_wakeup = false;
581                 }
582
583                 pipe_wait_readable(pipe);
584         }
585
586         if (eat_empty_buffer(pipe))
587                 goto repeat;
588
589         return 1;
590 }
591
592 /**
593  * splice_from_pipe_begin - start splicing from pipe
594  * @sd:         information about the splice operation
595  *
596  * Description:
597  *    This function should be called before a loop containing
598  *    splice_from_pipe_next() and splice_from_pipe_feed() to
599  *    initialize the necessary fields of @sd.
600  */
601 static void splice_from_pipe_begin(struct splice_desc *sd)
602 {
603         sd->num_spliced = 0;
604         sd->need_wakeup = false;
605 }
606
607 /**
608  * splice_from_pipe_end - finish splicing from pipe
609  * @pipe:       pipe to splice from
610  * @sd:         information about the splice operation
611  *
612  * Description:
613  *    This function will wake up pipe writers if necessary.  It should
614  *    be called after a loop containing splice_from_pipe_next() and
615  *    splice_from_pipe_feed().
616  */
617 static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
618 {
619         if (sd->need_wakeup)
620                 wakeup_pipe_writers(pipe);
621 }
622
623 /**
624  * __splice_from_pipe - splice data from a pipe to given actor
625  * @pipe:       pipe to splice from
626  * @sd:         information to @actor
627  * @actor:      handler that splices the data
628  *
629  * Description:
630  *    This function does little more than loop over the pipe and call
631  *    @actor to do the actual moving of a single struct pipe_buffer to
632  *    the desired destination. See pipe_to_file, pipe_to_sendmsg, or
633  *    pipe_to_user.
634  *
635  */
636 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
637                            splice_actor *actor)
638 {
639         int ret;
640
641         splice_from_pipe_begin(sd);
642         do {
643                 cond_resched();
644                 ret = splice_from_pipe_next(pipe, sd);
645                 if (ret > 0)
646                         ret = splice_from_pipe_feed(pipe, sd, actor);
647         } while (ret > 0);
648         splice_from_pipe_end(pipe, sd);
649
650         return sd->num_spliced ? sd->num_spliced : ret;
651 }
652 EXPORT_SYMBOL(__splice_from_pipe);
653
654 /**
655  * splice_from_pipe - splice data from a pipe to a file
656  * @pipe:       pipe to splice from
657  * @out:        file to splice to
658  * @ppos:       position in @out
659  * @len:        how many bytes to splice
660  * @flags:      splice modifier flags
661  * @actor:      handler that splices the data
662  *
663  * Description:
664  *    See __splice_from_pipe. This function locks the pipe inode,
665  *    otherwise it's identical to __splice_from_pipe().
666  *
667  */
668 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
669                          loff_t *ppos, size_t len, unsigned int flags,
670                          splice_actor *actor)
671 {
672         ssize_t ret;
673         struct splice_desc sd = {
674                 .total_len = len,
675                 .flags = flags,
676                 .pos = *ppos,
677                 .u.file = out,
678         };
679
680         pipe_lock(pipe);
681         ret = __splice_from_pipe(pipe, &sd, actor);
682         pipe_unlock(pipe);
683
684         return ret;
685 }
686
/**
 * iter_file_splice_write - splice data from a pipe to a file
 * @pipe:       pipe info
 * @out:        file to write to
 * @ppos:       position in @out
 * @len:        number of bytes to splice
 * @flags:      splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *    This one is ->write_iter-based.
 *
 *    Each round gathers the pipe's buffers into a bio_vec array, writes
 *    them with vfs_iter_write(), and then retires from the pipe exactly
 *    the bytes that were written.  Returns the number of bytes spliced
 *    if any were, otherwise the last status (0 or -errno, including
 *    -ENOMEM if the bio_vec array cannot be allocated).
 *
 */
ssize_t
iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
                          loff_t *ppos, size_t len, unsigned int flags)
{
        struct splice_desc sd = {
                .total_len = len,
                .flags = flags,
                .pos = *ppos,
                .u.file = out,
        };
        /* Sized from max_usage before the pipe is locked; re-checked below. */
        int nbufs = pipe->max_usage;
        struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
                                        GFP_KERNEL);
        ssize_t ret;

        if (unlikely(!array))
                return -ENOMEM;

        pipe_lock(pipe);

        splice_from_pipe_begin(&sd);
        while (sd.total_len) {
                struct iov_iter from;
                unsigned int head, tail, mask;
                size_t left;
                int n;

                ret = splice_from_pipe_next(pipe, &sd);
                if (ret <= 0)
                        break;

                /* The pipe's max_usage grew: reallocate the array to match. */
                if (unlikely(nbufs < pipe->max_usage)) {
                        kfree(array);
                        nbufs = pipe->max_usage;
                        array = kcalloc(nbufs, sizeof(struct bio_vec),
                                        GFP_KERNEL);
                        if (!array) {
                                ret = -ENOMEM;
                                break;
                        }
                }

                head = pipe->head;
                tail = pipe->tail;
                mask = pipe->ring_size - 1;

                /* build the vector */
                /* Only the local tail advances here; pipe->tail is untouched. */
                left = sd.total_len;
                for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        size_t this_len = buf->len;

                        /* zero-length bvecs are not supported, skip them */
                        if (!this_len)
                                continue;
                        this_len = min(this_len, left);

                        ret = pipe_buf_confirm(pipe, buf);
                        if (unlikely(ret)) {
                                if (ret == -ENODATA)
                                        ret = 0;
                                goto done;
                        }

                        bvec_set_page(&array[n], buf->page, this_len,
                                      buf->offset);
                        left -= this_len;
                        n++;
                }

                /* sd.total_len - left is the byte count gathered above. */
                iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
                ret = vfs_iter_write(out, &from, &sd.pos, 0);
                if (ret <= 0)
                        break;

                sd.num_spliced += ret;
                sd.total_len -= ret;
                *ppos = sd.pos;

                /* dismiss the fully eaten buffers, adjust the partial one */
                /* Restart from the real tail; ret counts bytes left to retire. */
                tail = pipe->tail;
                while (ret) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        if (ret >= buf->len) {
                                ret -= buf->len;
                                buf->len = 0;
                                pipe_buf_release(pipe, buf);
                                tail++;
                                pipe->tail = tail;
                                if (pipe->files)
                                        sd.need_wakeup = true;
                        } else {
                                buf->offset += ret;
                                buf->len -= ret;
                                ret = 0;
                        }
                }
        }
done:
        /* array is NULL here after a failed reallocation. */
        kfree(array);
        splice_from_pipe_end(pipe, &sd);

        pipe_unlock(pipe);

        /* Progress takes precedence over the status of the last round. */
        if (sd.num_spliced)
                ret = sd.num_spliced;

        return ret;
}

EXPORT_SYMBOL(iter_file_splice_write);
812
813 #ifdef CONFIG_NET
814 /**
815  * splice_to_socket - splice data from a pipe to a socket
816  * @pipe:       pipe to splice from
817  * @out:        socket to write to
818  * @ppos:       position in @out
819  * @len:        number of bytes to splice
820  * @flags:      splice modifier flags
821  *
822  * Description:
823  *    Will send @len bytes from the pipe to a network socket. No data copying
824  *    is involved.
825  *
826  */
827 ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
828                          loff_t *ppos, size_t len, unsigned int flags)
829 {
830         struct socket *sock = sock_from_file(out);
831         struct bio_vec bvec[16];
832         struct msghdr msg = {};
833         ssize_t ret = 0;
834         size_t spliced = 0;
835         bool need_wakeup = false;
836
837         pipe_lock(pipe);
838
839         while (len > 0) {
840                 unsigned int head, tail, mask, bc = 0;
841                 size_t remain = len;
842
843                 /*
844                  * Check for signal early to make process killable when there
845                  * are always buffers available
846                  */
847                 ret = -ERESTARTSYS;
848                 if (signal_pending(current))
849                         break;
850
851                 while (pipe_empty(pipe->head, pipe->tail)) {
852                         ret = 0;
853                         if (!pipe->writers)
854                                 goto out;
855
856                         if (spliced)
857                                 goto out;
858
859                         ret = -EAGAIN;
860                         if (flags & SPLICE_F_NONBLOCK)
861                                 goto out;
862
863                         ret = -ERESTARTSYS;
864                         if (signal_pending(current))
865                                 goto out;
866
867                         if (need_wakeup) {
868                                 wakeup_pipe_writers(pipe);
869                                 need_wakeup = false;
870                         }
871
872                         pipe_wait_readable(pipe);
873                 }
874
875                 head = pipe->head;
876                 tail = pipe->tail;
877                 mask = pipe->ring_size - 1;
878
879                 while (!pipe_empty(head, tail)) {
880                         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
881                         size_t seg;
882
883                         if (!buf->len) {
884                                 tail++;
885                                 continue;
886                         }
887
888                         seg = min_t(size_t, remain, buf->len);
889
890                         ret = pipe_buf_confirm(pipe, buf);
891                         if (unlikely(ret)) {
892                                 if (ret == -ENODATA)
893                                         ret = 0;
894                                 break;
895                         }
896
897                         bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset);
898                         remain -= seg;
899                         if (remain == 0 || bc >= ARRAY_SIZE(bvec))
900                                 break;
901                         tail++;
902                 }
903
904                 if (!bc)
905                         break;
906
907                 msg.msg_flags = MSG_SPLICE_PAGES;
908                 if (flags & SPLICE_F_MORE)
909                         msg.msg_flags |= MSG_MORE;
910                 if (remain && pipe_occupancy(pipe->head, tail) > 0)
911                         msg.msg_flags |= MSG_MORE;
912
913                 iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc,
914                               len - remain);
915                 ret = sock_sendmsg(sock, &msg);
916                 if (ret <= 0)
917                         break;
918
919                 spliced += ret;
920                 len -= ret;
921                 tail = pipe->tail;
922                 while (ret > 0) {
923                         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
924                         size_t seg = min_t(size_t, ret, buf->len);
925
926                         buf->offset += seg;
927                         buf->len -= seg;
928                         ret -= seg;
929
930                         if (!buf->len) {
931                                 pipe_buf_release(pipe, buf);
932                                 tail++;
933                         }
934                 }
935
936                 if (tail != pipe->tail) {
937                         pipe->tail = tail;
938                         if (pipe->files)
939                                 need_wakeup = true;
940                 }
941         }
942
943 out:
944         pipe_unlock(pipe);
945         if (need_wakeup)
946                 wakeup_pipe_writers(pipe);
947         return spliced ?: ret;
948 }
949 #endif
950
951 static int warn_unsupported(struct file *file, const char *op)
952 {
953         pr_debug_ratelimited(
954                 "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
955                 op, file, current->pid, current->comm);
956         return -EINVAL;
957 }
958
959 /*
960  * Attempt to initiate a splice from pipe to file.
961  */
962 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
963                            loff_t *ppos, size_t len, unsigned int flags)
964 {
965         if (unlikely(!out->f_op->splice_write))
966                 return warn_unsupported(out, "write");
967         return out->f_op->splice_write(pipe, out, ppos, len, flags);
968 }
969
970 /*
971  * Indicate to the caller that there was a premature EOF when reading from the
972  * source and the caller didn't indicate they would be sending more data after
973  * this.
974  */
975 static void do_splice_eof(struct splice_desc *sd)
976 {
977         if (sd->splice_eof)
978                 sd->splice_eof(sd);
979 }
980
981 /*
982  * Attempt to initiate a splice from a file to a pipe.
983  */
984 static long do_splice_to(struct file *in, loff_t *ppos,
985                          struct pipe_inode_info *pipe, size_t len,
986                          unsigned int flags)
987 {
988         unsigned int p_space;
989         int ret;
990
991         if (unlikely(!(in->f_mode & FMODE_READ)))
992                 return -EBADF;
993
994         /* Don't try to read more the pipe has space for. */
995         p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
996         len = min_t(size_t, len, p_space << PAGE_SHIFT);
997
998         ret = rw_verify_area(READ, in, ppos, len);
999         if (unlikely(ret < 0))
1000                 return ret;
1001
1002         if (unlikely(len > MAX_RW_COUNT))
1003                 len = MAX_RW_COUNT;
1004
1005         if (unlikely(!in->f_op->splice_read))
1006                 return warn_unsupported(in, "read");
1007         return in->f_op->splice_read(in, ppos, pipe, len, flags);
1008 }
1009
1010 /**
1011  * splice_direct_to_actor - splices data directly between two non-pipes
1012  * @in:         file to splice from
1013  * @sd:         actor information on where to splice to
1014  * @actor:      handles the data splicing
1015  *
1016  * Description:
1017  *    This is a special case helper to splice directly between two
1018  *    points, without requiring an explicit pipe. Internally an allocated
1019  *    pipe is cached in the process, and reused during the lifetime of
1020  *    that process.
1021  *
1022  */
1023 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1024                                splice_direct_actor *actor)
1025 {
1026         struct pipe_inode_info *pipe;
1027         long ret, bytes;
1028         size_t len;
1029         int i, flags, more;
1030
1031         /*
1032          * We require the input to be seekable, as we don't want to randomly
1033          * drop data for eg socket -> socket splicing. Use the piped splicing
1034          * for that!
1035          */
1036         if (unlikely(!(in->f_mode & FMODE_LSEEK)))
1037                 return -EINVAL;
1038
1039         /*
1040          * neither in nor out is a pipe, setup an internal pipe attached to
1041          * 'out' and transfer the wanted data from 'in' to 'out' through that
1042          */
1043         pipe = current->splice_pipe;
1044         if (unlikely(!pipe)) {
1045                 pipe = alloc_pipe_info();
1046                 if (!pipe)
1047                         return -ENOMEM;
1048
1049                 /*
1050                  * We don't have an immediate reader, but we'll read the stuff
1051                  * out of the pipe right after the splice_to_pipe(). So set
1052                  * PIPE_READERS appropriately.
1053                  */
1054                 pipe->readers = 1;
1055
1056                 current->splice_pipe = pipe;
1057         }
1058
1059         /*
1060          * Do the splice.
1061          */
1062         bytes = 0;
1063         len = sd->total_len;
1064
1065         /* Don't block on output, we have to drain the direct pipe. */
1066         flags = sd->flags;
1067         sd->flags &= ~SPLICE_F_NONBLOCK;
1068
1069         /*
1070          * We signal MORE until we've read sufficient data to fulfill the
1071          * request and we keep signalling it if the caller set it.
1072          */
1073         more = sd->flags & SPLICE_F_MORE;
1074         sd->flags |= SPLICE_F_MORE;
1075
1076         WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
1077
1078         while (len) {
1079                 size_t read_len;
1080                 loff_t pos = sd->pos, prev_pos = pos;
1081
1082                 ret = do_splice_to(in, &pos, pipe, len, flags);
1083                 if (unlikely(ret <= 0))
1084                         goto read_failure;
1085
1086                 read_len = ret;
1087                 sd->total_len = read_len;
1088
1089                 /*
1090                  * If we now have sufficient data to fulfill the request then
1091                  * we clear SPLICE_F_MORE if it was not set initially.
1092                  */
1093                 if (read_len >= len && !more)
1094                         sd->flags &= ~SPLICE_F_MORE;
1095
1096                 /*
1097                  * NOTE: nonblocking mode only applies to the input. We
1098                  * must not do the output in nonblocking mode as then we
1099                  * could get stuck data in the internal pipe:
1100                  */
1101                 ret = actor(pipe, sd);
1102                 if (unlikely(ret <= 0)) {
1103                         sd->pos = prev_pos;
1104                         goto out_release;
1105                 }
1106
1107                 bytes += ret;
1108                 len -= ret;
1109                 sd->pos = pos;
1110
1111                 if (ret < read_len) {
1112                         sd->pos = prev_pos + ret;
1113                         goto out_release;
1114                 }
1115         }
1116
1117 done:
1118         pipe->tail = pipe->head = 0;
1119         file_accessed(in);
1120         return bytes;
1121
1122 read_failure:
1123         /*
1124          * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
1125          * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
1126          * "->splice_in()" that returned EOF (ie zero) *and* we have sent at
1127          * least 1 byte *then* we will also do the ->splice_eof() call.
1128          */
1129         if (ret == 0 && !more && len > 0 && bytes)
1130                 do_splice_eof(sd);
1131 out_release:
1132         /*
1133          * If we did an incomplete transfer we must release
1134          * the pipe buffers in question:
1135          */
1136         for (i = 0; i < pipe->ring_size; i++) {
1137                 struct pipe_buffer *buf = &pipe->bufs[i];
1138
1139                 if (buf->ops)
1140                         pipe_buf_release(pipe, buf);
1141         }
1142
1143         if (!bytes)
1144                 bytes = ret;
1145
1146         goto done;
1147 }
1148 EXPORT_SYMBOL(splice_direct_to_actor);
1149
1150 static int direct_splice_actor(struct pipe_inode_info *pipe,
1151                                struct splice_desc *sd)
1152 {
1153         struct file *file = sd->u.file;
1154
1155         return do_splice_from(pipe, file, sd->opos, sd->total_len,
1156                               sd->flags);
1157 }
1158
1159 static void direct_file_splice_eof(struct splice_desc *sd)
1160 {
1161         struct file *file = sd->u.file;
1162
1163         if (file->f_op->splice_eof)
1164                 file->f_op->splice_eof(file);
1165 }
1166
1167 /**
1168  * do_splice_direct - splices data directly between two files
1169  * @in:         file to splice from
1170  * @ppos:       input file offset
1171  * @out:        file to splice to
1172  * @opos:       output file offset
1173  * @len:        number of bytes to splice
1174  * @flags:      splice modifier flags
1175  *
1176  * Description:
1177  *    For use by do_sendfile(). splice can easily emulate sendfile, but
1178  *    doing it in the application would incur an extra system call
1179  *    (splice in + splice out, as compared to just sendfile()). So this helper
1180  *    can splice directly through a process-private pipe.
1181  *
1182  */
1183 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1184                       loff_t *opos, size_t len, unsigned int flags)
1185 {
1186         struct splice_desc sd = {
1187                 .len            = len,
1188                 .total_len      = len,
1189                 .flags          = flags,
1190                 .pos            = *ppos,
1191                 .u.file         = out,
1192                 .splice_eof     = direct_file_splice_eof,
1193                 .opos           = opos,
1194         };
1195         long ret;
1196
1197         if (unlikely(!(out->f_mode & FMODE_WRITE)))
1198                 return -EBADF;
1199
1200         if (unlikely(out->f_flags & O_APPEND))
1201                 return -EINVAL;
1202
1203         ret = rw_verify_area(WRITE, out, opos, len);
1204         if (unlikely(ret < 0))
1205                 return ret;
1206
1207         ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1208         if (ret > 0)
1209                 *ppos = sd.pos;
1210
1211         return ret;
1212 }
1213 EXPORT_SYMBOL(do_splice_direct);
1214
1215 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1216 {
1217         for (;;) {
1218                 if (unlikely(!pipe->readers)) {
1219                         send_sig(SIGPIPE, current, 0);
1220                         return -EPIPE;
1221                 }
1222                 if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1223                         return 0;
1224                 if (flags & SPLICE_F_NONBLOCK)
1225                         return -EAGAIN;
1226                 if (signal_pending(current))
1227                         return -ERESTARTSYS;
1228                 pipe_wait_writable(pipe);
1229         }
1230 }
1231
1232 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1233                                struct pipe_inode_info *opipe,
1234                                size_t len, unsigned int flags);
1235
1236 long splice_file_to_pipe(struct file *in,
1237                          struct pipe_inode_info *opipe,
1238                          loff_t *offset,
1239                          size_t len, unsigned int flags)
1240 {
1241         long ret;
1242
1243         pipe_lock(opipe);
1244         ret = wait_for_space(opipe, flags);
1245         if (!ret)
1246                 ret = do_splice_to(in, offset, opipe, len, flags);
1247         pipe_unlock(opipe);
1248         if (ret > 0)
1249                 wakeup_pipe_readers(opipe);
1250         return ret;
1251 }
1252
1253 /*
1254  * Determine where to splice to/from.
1255  */
1256 long do_splice(struct file *in, loff_t *off_in, struct file *out,
1257                loff_t *off_out, size_t len, unsigned int flags)
1258 {
1259         struct pipe_inode_info *ipipe;
1260         struct pipe_inode_info *opipe;
1261         loff_t offset;
1262         long ret;
1263
1264         if (unlikely(!(in->f_mode & FMODE_READ) ||
1265                      !(out->f_mode & FMODE_WRITE)))
1266                 return -EBADF;
1267
1268         ipipe = get_pipe_info(in, true);
1269         opipe = get_pipe_info(out, true);
1270
1271         if (ipipe && opipe) {
1272                 if (off_in || off_out)
1273                         return -ESPIPE;
1274
1275                 /* Splicing to self would be fun, but... */
1276                 if (ipipe == opipe)
1277                         return -EINVAL;
1278
1279                 if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1280                         flags |= SPLICE_F_NONBLOCK;
1281
1282                 return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1283         }
1284
1285         if (ipipe) {
1286                 if (off_in)
1287                         return -ESPIPE;
1288                 if (off_out) {
1289                         if (!(out->f_mode & FMODE_PWRITE))
1290                                 return -EINVAL;
1291                         offset = *off_out;
1292                 } else {
1293                         offset = out->f_pos;
1294                 }
1295
1296                 if (unlikely(out->f_flags & O_APPEND))
1297                         return -EINVAL;
1298
1299                 ret = rw_verify_area(WRITE, out, &offset, len);
1300                 if (unlikely(ret < 0))
1301                         return ret;
1302
1303                 if (in->f_flags & O_NONBLOCK)
1304                         flags |= SPLICE_F_NONBLOCK;
1305
1306                 file_start_write(out);
1307                 ret = do_splice_from(ipipe, out, &offset, len, flags);
1308                 file_end_write(out);
1309
1310                 if (ret > 0)
1311                         fsnotify_modify(out);
1312
1313                 if (!off_out)
1314                         out->f_pos = offset;
1315                 else
1316                         *off_out = offset;
1317
1318                 return ret;
1319         }
1320
1321         if (opipe) {
1322                 if (off_out)
1323                         return -ESPIPE;
1324                 if (off_in) {
1325                         if (!(in->f_mode & FMODE_PREAD))
1326                                 return -EINVAL;
1327                         offset = *off_in;
1328                 } else {
1329                         offset = in->f_pos;
1330                 }
1331
1332                 if (out->f_flags & O_NONBLOCK)
1333                         flags |= SPLICE_F_NONBLOCK;
1334
1335                 ret = splice_file_to_pipe(in, opipe, &offset, len, flags);
1336
1337                 if (ret > 0)
1338                         fsnotify_access(in);
1339
1340                 if (!off_in)
1341                         in->f_pos = offset;
1342                 else
1343                         *off_in = offset;
1344
1345                 return ret;
1346         }
1347
1348         return -EINVAL;
1349 }
1350
1351 static long __do_splice(struct file *in, loff_t __user *off_in,
1352                         struct file *out, loff_t __user *off_out,
1353                         size_t len, unsigned int flags)
1354 {
1355         struct pipe_inode_info *ipipe;
1356         struct pipe_inode_info *opipe;
1357         loff_t offset, *__off_in = NULL, *__off_out = NULL;
1358         long ret;
1359
1360         ipipe = get_pipe_info(in, true);
1361         opipe = get_pipe_info(out, true);
1362
1363         if (ipipe) {
1364                 if (off_in)
1365                         return -ESPIPE;
1366                 pipe_clear_nowait(in);
1367         }
1368         if (opipe) {
1369                 if (off_out)
1370                         return -ESPIPE;
1371                 pipe_clear_nowait(out);
1372         }
1373
1374         if (off_out) {
1375                 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1376                         return -EFAULT;
1377                 __off_out = &offset;
1378         }
1379         if (off_in) {
1380                 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1381                         return -EFAULT;
1382                 __off_in = &offset;
1383         }
1384
1385         ret = do_splice(in, __off_in, out, __off_out, len, flags);
1386         if (ret < 0)
1387                 return ret;
1388
1389         if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
1390                 return -EFAULT;
1391         if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
1392                 return -EFAULT;
1393
1394         return ret;
1395 }
1396
1397 static int iter_to_pipe(struct iov_iter *from,
1398                         struct pipe_inode_info *pipe,
1399                         unsigned flags)
1400 {
1401         struct pipe_buffer buf = {
1402                 .ops = &user_page_pipe_buf_ops,
1403                 .flags = flags
1404         };
1405         size_t total = 0;
1406         int ret = 0;
1407
1408         while (iov_iter_count(from)) {
1409                 struct page *pages[16];
1410                 ssize_t left;
1411                 size_t start;
1412                 int i, n;
1413
1414                 left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
1415                 if (left <= 0) {
1416                         ret = left;
1417                         break;
1418                 }
1419
1420                 n = DIV_ROUND_UP(left + start, PAGE_SIZE);
1421                 for (i = 0; i < n; i++) {
1422                         int size = min_t(int, left, PAGE_SIZE - start);
1423
1424                         buf.page = pages[i];
1425                         buf.offset = start;
1426                         buf.len = size;
1427                         ret = add_to_pipe(pipe, &buf);
1428                         if (unlikely(ret < 0)) {
1429                                 iov_iter_revert(from, left);
1430                                 // this one got dropped by add_to_pipe()
1431                                 while (++i < n)
1432                                         put_page(pages[i]);
1433                                 goto out;
1434                         }
1435                         total += ret;
1436                         left -= size;
1437                         start = 0;
1438                 }
1439         }
1440 out:
1441         return total ? total : ret;
1442 }
1443
1444 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1445                         struct splice_desc *sd)
1446 {
1447         int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1448         return n == sd->len ? n : -EFAULT;
1449 }
1450
1451 /*
1452  * For lack of a better implementation, implement vmsplice() to userspace
1453  * as a simple copy of the pipes pages to the user iov.
1454  */
1455 static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
1456                              unsigned int flags)
1457 {
1458         struct pipe_inode_info *pipe = get_pipe_info(file, true);
1459         struct splice_desc sd = {
1460                 .total_len = iov_iter_count(iter),
1461                 .flags = flags,
1462                 .u.data = iter
1463         };
1464         long ret = 0;
1465
1466         if (!pipe)
1467                 return -EBADF;
1468
1469         pipe_clear_nowait(file);
1470
1471         if (sd.total_len) {
1472                 pipe_lock(pipe);
1473                 ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1474                 pipe_unlock(pipe);
1475         }
1476
1477         return ret;
1478 }
1479
1480 /*
1481  * vmsplice splices a user address range into a pipe. It can be thought of
1482  * as splice-from-memory, where the regular splice is splice-from-file (or
1483  * to file). In both cases the output is a pipe, naturally.
1484  */
1485 static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
1486                              unsigned int flags)
1487 {
1488         struct pipe_inode_info *pipe;
1489         long ret = 0;
1490         unsigned buf_flag = 0;
1491
1492         if (flags & SPLICE_F_GIFT)
1493                 buf_flag = PIPE_BUF_FLAG_GIFT;
1494
1495         pipe = get_pipe_info(file, true);
1496         if (!pipe)
1497                 return -EBADF;
1498
1499         pipe_clear_nowait(file);
1500
1501         pipe_lock(pipe);
1502         ret = wait_for_space(pipe, flags);
1503         if (!ret)
1504                 ret = iter_to_pipe(iter, pipe, buf_flag);
1505         pipe_unlock(pipe);
1506         if (ret > 0)
1507                 wakeup_pipe_readers(pipe);
1508         return ret;
1509 }
1510
1511 static int vmsplice_type(struct fd f, int *type)
1512 {
1513         if (!f.file)
1514                 return -EBADF;
1515         if (f.file->f_mode & FMODE_WRITE) {
1516                 *type = ITER_SOURCE;
1517         } else if (f.file->f_mode & FMODE_READ) {
1518                 *type = ITER_DEST;
1519         } else {
1520                 fdput(f);
1521                 return -EBADF;
1522         }
1523         return 0;
1524 }
1525
1526 /*
1527  * Note that vmsplice only really supports true splicing _from_ user memory
1528  * to a pipe, not the other way around. Splicing from user memory is a simple
1529  * operation that can be supported without any funky alignment restrictions
1530  * or nasty vm tricks. We simply map in the user memory and fill them into
1531  * a pipe. The reverse isn't quite as easy, though. There are two possible
1532  * solutions for that:
1533  *
1534  *      - memcpy() the data internally, at which point we might as well just
1535  *        do a regular read() on the buffer anyway.
1536  *      - Lots of nasty vm tricks, that are neither fast nor flexible (it
1537  *        has restriction limitations on both ends of the pipe).
1538  *
1539  * Currently we punt and implement it as a normal copy, see pipe_to_user().
1540  *
1541  */
1542 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
1543                 unsigned long, nr_segs, unsigned int, flags)
1544 {
1545         struct iovec iovstack[UIO_FASTIOV];
1546         struct iovec *iov = iovstack;
1547         struct iov_iter iter;
1548         ssize_t error;
1549         struct fd f;
1550         int type;
1551
1552         if (unlikely(flags & ~SPLICE_F_ALL))
1553                 return -EINVAL;
1554
1555         f = fdget(fd);
1556         error = vmsplice_type(f, &type);
1557         if (error)
1558                 return error;
1559
1560         error = import_iovec(type, uiov, nr_segs,
1561                              ARRAY_SIZE(iovstack), &iov, &iter);
1562         if (error < 0)
1563                 goto out_fdput;
1564
1565         if (!iov_iter_count(&iter))
1566                 error = 0;
1567         else if (type == ITER_SOURCE)
1568                 error = vmsplice_to_pipe(f.file, &iter, flags);
1569         else
1570                 error = vmsplice_to_user(f.file, &iter, flags);
1571
1572         kfree(iov);
1573 out_fdput:
1574         fdput(f);
1575         return error;
1576 }
1577
1578 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1579                 int, fd_out, loff_t __user *, off_out,
1580                 size_t, len, unsigned int, flags)
1581 {
1582         struct fd in, out;
1583         long error;
1584
1585         if (unlikely(!len))
1586                 return 0;
1587
1588         if (unlikely(flags & ~SPLICE_F_ALL))
1589                 return -EINVAL;
1590
1591         error = -EBADF;
1592         in = fdget(fd_in);
1593         if (in.file) {
1594                 out = fdget(fd_out);
1595                 if (out.file) {
1596                         error = __do_splice(in.file, off_in, out.file, off_out,
1597                                                 len, flags);
1598                         fdput(out);
1599                 }
1600                 fdput(in);
1601         }
1602         return error;
1603 }
1604
1605 /*
1606  * Make sure there's data to read. Wait for input if we can, otherwise
1607  * return an appropriate error.
1608  */
1609 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1610 {
1611         int ret;
1612
1613         /*
1614          * Check the pipe occupancy without the inode lock first. This function
1615          * is speculative anyways, so missing one is ok.
1616          */
1617         if (!pipe_empty(pipe->head, pipe->tail))
1618                 return 0;
1619
1620         ret = 0;
1621         pipe_lock(pipe);
1622
1623         while (pipe_empty(pipe->head, pipe->tail)) {
1624                 if (signal_pending(current)) {
1625                         ret = -ERESTARTSYS;
1626                         break;
1627                 }
1628                 if (!pipe->writers)
1629                         break;
1630                 if (flags & SPLICE_F_NONBLOCK) {
1631                         ret = -EAGAIN;
1632                         break;
1633                 }
1634                 pipe_wait_readable(pipe);
1635         }
1636
1637         pipe_unlock(pipe);
1638         return ret;
1639 }
1640
1641 /*
1642  * Make sure there's writeable room. Wait for room if we can, otherwise
1643  * return an appropriate error.
1644  */
1645 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1646 {
1647         int ret;
1648
1649         /*
1650          * Check pipe occupancy without the inode lock first. This function
1651          * is speculative anyways, so missing one is ok.
1652          */
1653         if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1654                 return 0;
1655
1656         ret = 0;
1657         pipe_lock(pipe);
1658
1659         while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
1660                 if (!pipe->readers) {
1661                         send_sig(SIGPIPE, current, 0);
1662                         ret = -EPIPE;
1663                         break;
1664                 }
1665                 if (flags & SPLICE_F_NONBLOCK) {
1666                         ret = -EAGAIN;
1667                         break;
1668                 }
1669                 if (signal_pending(current)) {
1670                         ret = -ERESTARTSYS;
1671                         break;
1672                 }
1673                 pipe_wait_writable(pipe);
1674         }
1675
1676         pipe_unlock(pipe);
1677         return ret;
1678 }
1679
1680 /*
1681  * Splice contents of ipipe to opipe.
1682  */
1683 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1684                                struct pipe_inode_info *opipe,
1685                                size_t len, unsigned int flags)
1686 {
1687         struct pipe_buffer *ibuf, *obuf;
1688         unsigned int i_head, o_head;
1689         unsigned int i_tail, o_tail;
1690         unsigned int i_mask, o_mask;
1691         int ret = 0;
1692         bool input_wakeup = false;
1693
1694
1695 retry:
1696         ret = ipipe_prep(ipipe, flags);
1697         if (ret)
1698                 return ret;
1699
1700         ret = opipe_prep(opipe, flags);
1701         if (ret)
1702                 return ret;
1703
1704         /*
1705          * Potential ABBA deadlock, work around it by ordering lock
1706          * grabbing by pipe info address. Otherwise two different processes
1707          * could deadlock (one doing tee from A -> B, the other from B -> A).
1708          */
1709         pipe_double_lock(ipipe, opipe);
1710
1711         i_tail = ipipe->tail;
1712         i_mask = ipipe->ring_size - 1;
1713         o_head = opipe->head;
1714         o_mask = opipe->ring_size - 1;
1715
1716         do {
1717                 size_t o_len;
1718
1719                 if (!opipe->readers) {
1720                         send_sig(SIGPIPE, current, 0);
1721                         if (!ret)
1722                                 ret = -EPIPE;
1723                         break;
1724                 }
1725
1726                 i_head = ipipe->head;
1727                 o_tail = opipe->tail;
1728
1729                 if (pipe_empty(i_head, i_tail) && !ipipe->writers)
1730                         break;
1731
1732                 /*
1733                  * Cannot make any progress, because either the input
1734                  * pipe is empty or the output pipe is full.
1735                  */
1736                 if (pipe_empty(i_head, i_tail) ||
1737                     pipe_full(o_head, o_tail, opipe->max_usage)) {
1738                         /* Already processed some buffers, break */
1739                         if (ret)
1740                                 break;
1741
1742                         if (flags & SPLICE_F_NONBLOCK) {
1743                                 ret = -EAGAIN;
1744                                 break;
1745                         }
1746
1747                         /*
1748                          * We raced with another reader/writer and haven't
1749                          * managed to process any buffers.  A zero return
1750                          * value means EOF, so retry instead.
1751                          */
1752                         pipe_unlock(ipipe);
1753                         pipe_unlock(opipe);
1754                         goto retry;
1755                 }
1756
1757                 ibuf = &ipipe->bufs[i_tail & i_mask];
1758                 obuf = &opipe->bufs[o_head & o_mask];
1759
1760                 if (len >= ibuf->len) {
1761                         /*
1762                          * Simply move the whole buffer from ipipe to opipe
1763                          */
1764                         *obuf = *ibuf;
1765                         ibuf->ops = NULL;
1766                         i_tail++;
1767                         ipipe->tail = i_tail;
1768                         input_wakeup = true;
1769                         o_len = obuf->len;
1770                         o_head++;
1771                         opipe->head = o_head;
1772                 } else {
1773                         /*
1774                          * Get a reference to this pipe buffer,
1775                          * so we can copy the contents over.
1776                          */
1777                         if (!pipe_buf_get(ipipe, ibuf)) {
1778                                 if (ret == 0)
1779                                         ret = -EFAULT;
1780                                 break;
1781                         }
1782                         *obuf = *ibuf;
1783
1784                         /*
1785                          * Don't inherit the gift and merge flags, we need to
1786                          * prevent multiple steals of this page.
1787                          */
1788                         obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1789                         obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1790
1791                         obuf->len = len;
1792                         ibuf->offset += len;
1793                         ibuf->len -= len;
1794                         o_len = len;
1795                         o_head++;
1796                         opipe->head = o_head;
1797                 }
1798                 ret += o_len;
1799                 len -= o_len;
1800         } while (len);
1801
1802         pipe_unlock(ipipe);
1803         pipe_unlock(opipe);
1804
1805         /*
1806          * If we put data in the output pipe, wakeup any potential readers.
1807          */
1808         if (ret > 0)
1809                 wakeup_pipe_readers(opipe);
1810
1811         if (input_wakeup)
1812                 wakeup_pipe_writers(ipipe);
1813
1814         return ret;
1815 }
1816
1817 /*
1818  * Link contents of ipipe to opipe.
1819  */
1820 static int link_pipe(struct pipe_inode_info *ipipe,
1821                      struct pipe_inode_info *opipe,
1822                      size_t len, unsigned int flags)
1823 {
1824         struct pipe_buffer *ibuf, *obuf;
1825         unsigned int i_head, o_head;
1826         unsigned int i_tail, o_tail;
1827         unsigned int i_mask, o_mask;
1828         int ret = 0;
1829
1830         /*
1831          * Potential ABBA deadlock, work around it by ordering lock
1832          * grabbing by pipe info address. Otherwise two different processes
1833          * could deadlock (one doing tee from A -> B, the other from B -> A).
1834          */
1835         pipe_double_lock(ipipe, opipe);
1836
1837         i_tail = ipipe->tail;
1838         i_mask = ipipe->ring_size - 1;
1839         o_head = opipe->head;
1840         o_mask = opipe->ring_size - 1;
1841
1842         do {
1843                 if (!opipe->readers) {
1844                         send_sig(SIGPIPE, current, 0);
1845                         if (!ret)
1846                                 ret = -EPIPE;
1847                         break;
1848                 }
1849
1850                 i_head = ipipe->head;
1851                 o_tail = opipe->tail;
1852
1853                 /*
1854                  * If we have iterated all input buffers or run out of
1855                  * output room, break.
1856                  */
1857                 if (pipe_empty(i_head, i_tail) ||
1858                     pipe_full(o_head, o_tail, opipe->max_usage))
1859                         break;
1860
1861                 ibuf = &ipipe->bufs[i_tail & i_mask];
1862                 obuf = &opipe->bufs[o_head & o_mask];
1863
1864                 /*
1865                  * Get a reference to this pipe buffer,
1866                  * so we can copy the contents over.
1867                  */
1868                 if (!pipe_buf_get(ipipe, ibuf)) {
1869                         if (ret == 0)
1870                                 ret = -EFAULT;
1871                         break;
1872                 }
1873
1874                 *obuf = *ibuf;
1875
1876                 /*
1877                  * Don't inherit the gift and merge flag, we need to prevent
1878                  * multiple steals of this page.
1879                  */
1880                 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1881                 obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1882
1883                 if (obuf->len > len)
1884                         obuf->len = len;
1885                 ret += obuf->len;
1886                 len -= obuf->len;
1887
1888                 o_head++;
1889                 opipe->head = o_head;
1890                 i_tail++;
1891         } while (len);
1892
1893         pipe_unlock(ipipe);
1894         pipe_unlock(opipe);
1895
1896         /*
1897          * If we put data in the output pipe, wakeup any potential readers.
1898          */
1899         if (ret > 0)
1900                 wakeup_pipe_readers(opipe);
1901
1902         return ret;
1903 }
1904
1905 /*
1906  * This is a tee(1) implementation that works on pipes. It doesn't copy
1907  * any data, it simply references the 'in' pages on the 'out' pipe.
1908  * The 'flags' used are the SPLICE_F_* variants, currently the only
1909  * applicable one is SPLICE_F_NONBLOCK.
1910  */
1911 long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
1912 {
1913         struct pipe_inode_info *ipipe = get_pipe_info(in, true);
1914         struct pipe_inode_info *opipe = get_pipe_info(out, true);
1915         int ret = -EINVAL;
1916
1917         if (unlikely(!(in->f_mode & FMODE_READ) ||
1918                      !(out->f_mode & FMODE_WRITE)))
1919                 return -EBADF;
1920
1921         /*
1922          * Duplicate the contents of ipipe to opipe without actually
1923          * copying the data.
1924          */
1925         if (ipipe && opipe && ipipe != opipe) {
1926                 if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1927                         flags |= SPLICE_F_NONBLOCK;
1928
1929                 /*
1930                  * Keep going, unless we encounter an error. The ipipe/opipe
1931                  * ordering doesn't really matter.
1932                  */
1933                 ret = ipipe_prep(ipipe, flags);
1934                 if (!ret) {
1935                         ret = opipe_prep(opipe, flags);
1936                         if (!ret)
1937                                 ret = link_pipe(ipipe, opipe, len, flags);
1938                 }
1939         }
1940
1941         return ret;
1942 }
1943
1944 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1945 {
1946         struct fd in, out;
1947         int error;
1948
1949         if (unlikely(flags & ~SPLICE_F_ALL))
1950                 return -EINVAL;
1951
1952         if (unlikely(!len))
1953                 return 0;
1954
1955         error = -EBADF;
1956         in = fdget(fdin);
1957         if (in.file) {
1958                 out = fdget(fdout);
1959                 if (out.file) {
1960                         error = do_tee(in.file, out.file, len, flags);
1961                         fdput(out);
1962                 }
1963                 fdput(in);
1964         }
1965
1966         return error;
1967 }