splice, net: Fix SPLICE_F_MORE signalling in splice_direct_to_actor()
[platform/kernel/linux-starfive.git] / fs / splice.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * "splice": joining two ropes together by interweaving their strands.
4  *
5  * This is the "extended pipe" functionality, where a pipe is used as
6  * an arbitrary in-memory buffer. Think of a pipe as a small kernel
7  * buffer that you can use to transfer data from one end to the other.
8  *
9  * The traditional unix read/write is extended with a "splice()" operation
10  * that transfers data buffers to or from a pipe buffer.
11  *
12  * Named by Larry McVoy, original implementation from Linus, extended by
13  * Jens to support splicing to files, network, direct splicing, etc and
14  * fixing lots of bugs.
15  *
16  * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
17  * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
18  * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
19  *
20  */
21 #include <linux/bvec.h>
22 #include <linux/fs.h>
23 #include <linux/file.h>
24 #include <linux/pagemap.h>
25 #include <linux/splice.h>
26 #include <linux/memcontrol.h>
27 #include <linux/mm_inline.h>
28 #include <linux/swap.h>
29 #include <linux/writeback.h>
30 #include <linux/export.h>
31 #include <linux/syscalls.h>
32 #include <linux/uio.h>
33 #include <linux/fsnotify.h>
34 #include <linux/security.h>
35 #include <linux/gfp.h>
36 #include <linux/net.h>
37 #include <linux/socket.h>
38 #include <linux/sched/signal.h>
39
40 #include "internal.h"
41
42 /*
43  * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to
44  * indicate they support non-blocking reads or writes, we must clear it
45  * here if set to avoid blocking other users of this pipe if splice is
46  * being done on it.
47  */
48 static noinline void noinline pipe_clear_nowait(struct file *file)
49 {
50         fmode_t fmode = READ_ONCE(file->f_mode);
51
52         do {
53                 if (!(fmode & FMODE_NOWAIT))
54                         break;
55         } while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT));
56 }
57
58 /*
59  * Attempt to steal a page from a pipe buffer. This should perhaps go into
60  * a vm helper function, it's already simplified quite a bit by the
61  * addition of remove_mapping(). If success is returned, the caller may
62  * attempt to reuse this page for another destination.
63  */
64 static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
65                 struct pipe_buffer *buf)
66 {
67         struct folio *folio = page_folio(buf->page);
68         struct address_space *mapping;
69
70         folio_lock(folio);
71
72         mapping = folio_mapping(folio);
73         if (mapping) {
74                 WARN_ON(!folio_test_uptodate(folio));
75
76                 /*
77                  * At least for ext2 with nobh option, we need to wait on
78                  * writeback completing on this folio, since we'll remove it
79                  * from the pagecache.  Otherwise truncate wont wait on the
80                  * folio, allowing the disk blocks to be reused by someone else
81                  * before we actually wrote our data to them. fs corruption
82                  * ensues.
83                  */
84                 folio_wait_writeback(folio);
85
86                 if (folio_has_private(folio) &&
87                     !filemap_release_folio(folio, GFP_KERNEL))
88                         goto out_unlock;
89
90                 /*
91                  * If we succeeded in removing the mapping, set LRU flag
92                  * and return good.
93                  */
94                 if (remove_mapping(mapping, folio)) {
95                         buf->flags |= PIPE_BUF_FLAG_LRU;
96                         return true;
97                 }
98         }
99
100         /*
101          * Raced with truncate or failed to remove folio from current
102          * address space, unlock and return failure.
103          */
104 out_unlock:
105         folio_unlock(folio);
106         return false;
107 }
108
109 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
110                                         struct pipe_buffer *buf)
111 {
112         put_page(buf->page);
113         buf->flags &= ~PIPE_BUF_FLAG_LRU;
114 }
115
116 /*
117  * Check whether the contents of buf is OK to access. Since the content
118  * is a page cache page, IO may be in flight.
119  */
120 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
121                                        struct pipe_buffer *buf)
122 {
123         struct page *page = buf->page;
124         int err;
125
126         if (!PageUptodate(page)) {
127                 lock_page(page);
128
129                 /*
130                  * Page got truncated/unhashed. This will cause a 0-byte
131                  * splice, if this is the first page.
132                  */
133                 if (!page->mapping) {
134                         err = -ENODATA;
135                         goto error;
136                 }
137
138                 /*
139                  * Uh oh, read-error from disk.
140                  */
141                 if (!PageUptodate(page)) {
142                         err = -EIO;
143                         goto error;
144                 }
145
146                 /*
147                  * Page is ok afterall, we are done.
148                  */
149                 unlock_page(page);
150         }
151
152         return 0;
153 error:
154         unlock_page(page);
155         return err;
156 }
157
158 const struct pipe_buf_operations page_cache_pipe_buf_ops = {
159         .confirm        = page_cache_pipe_buf_confirm,
160         .release        = page_cache_pipe_buf_release,
161         .try_steal      = page_cache_pipe_buf_try_steal,
162         .get            = generic_pipe_buf_get,
163 };
164
165 static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
166                 struct pipe_buffer *buf)
167 {
168         if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
169                 return false;
170
171         buf->flags |= PIPE_BUF_FLAG_LRU;
172         return generic_pipe_buf_try_steal(pipe, buf);
173 }
174
175 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
176         .release        = page_cache_pipe_buf_release,
177         .try_steal      = user_page_pipe_buf_try_steal,
178         .get            = generic_pipe_buf_get,
179 };
180
181 static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
182 {
183         smp_mb();
184         if (waitqueue_active(&pipe->rd_wait))
185                 wake_up_interruptible(&pipe->rd_wait);
186         kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
187 }
188
189 /**
190  * splice_to_pipe - fill passed data into a pipe
191  * @pipe:       pipe to fill
192  * @spd:        data to fill
193  *
194  * Description:
195  *    @spd contains a map of pages and len/offset tuples, along with
196  *    the struct pipe_buf_operations associated with these pages. This
197  *    function will link that data to the pipe.
198  *
199  */
200 ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
201                        struct splice_pipe_desc *spd)
202 {
203         unsigned int spd_pages = spd->nr_pages;
204         unsigned int tail = pipe->tail;
205         unsigned int head = pipe->head;
206         unsigned int mask = pipe->ring_size - 1;
207         int ret = 0, page_nr = 0;
208
209         if (!spd_pages)
210                 return 0;
211
212         if (unlikely(!pipe->readers)) {
213                 send_sig(SIGPIPE, current, 0);
214                 ret = -EPIPE;
215                 goto out;
216         }
217
218         while (!pipe_full(head, tail, pipe->max_usage)) {
219                 struct pipe_buffer *buf = &pipe->bufs[head & mask];
220
221                 buf->page = spd->pages[page_nr];
222                 buf->offset = spd->partial[page_nr].offset;
223                 buf->len = spd->partial[page_nr].len;
224                 buf->private = spd->partial[page_nr].private;
225                 buf->ops = spd->ops;
226                 buf->flags = 0;
227
228                 head++;
229                 pipe->head = head;
230                 page_nr++;
231                 ret += buf->len;
232
233                 if (!--spd->nr_pages)
234                         break;
235         }
236
237         if (!ret)
238                 ret = -EAGAIN;
239
240 out:
241         while (page_nr < spd_pages)
242                 spd->spd_release(spd, page_nr++);
243
244         return ret;
245 }
246 EXPORT_SYMBOL_GPL(splice_to_pipe);
247
248 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
249 {
250         unsigned int head = pipe->head;
251         unsigned int tail = pipe->tail;
252         unsigned int mask = pipe->ring_size - 1;
253         int ret;
254
255         if (unlikely(!pipe->readers)) {
256                 send_sig(SIGPIPE, current, 0);
257                 ret = -EPIPE;
258         } else if (pipe_full(head, tail, pipe->max_usage)) {
259                 ret = -EAGAIN;
260         } else {
261                 pipe->bufs[head & mask] = *buf;
262                 pipe->head = head + 1;
263                 return buf->len;
264         }
265         pipe_buf_release(pipe, buf);
266         return ret;
267 }
268 EXPORT_SYMBOL(add_to_pipe);
269
270 /*
271  * Check if we need to grow the arrays holding pages and partial page
272  * descriptions.
273  */
274 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
275 {
276         unsigned int max_usage = READ_ONCE(pipe->max_usage);
277
278         spd->nr_pages_max = max_usage;
279         if (max_usage <= PIPE_DEF_BUFFERS)
280                 return 0;
281
282         spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
283         spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
284                                      GFP_KERNEL);
285
286         if (spd->pages && spd->partial)
287                 return 0;
288
289         kfree(spd->pages);
290         kfree(spd->partial);
291         return -ENOMEM;
292 }
293
294 void splice_shrink_spd(struct splice_pipe_desc *spd)
295 {
296         if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
297                 return;
298
299         kfree(spd->pages);
300         kfree(spd->partial);
301 }
302
303 /*
304  * Splice data from an O_DIRECT file into pages and then add them to the output
305  * pipe.
306  */
307 ssize_t direct_splice_read(struct file *in, loff_t *ppos,
308                            struct pipe_inode_info *pipe,
309                            size_t len, unsigned int flags)
310 {
311         struct iov_iter to;
312         struct bio_vec *bv;
313         struct kiocb kiocb;
314         struct page **pages;
315         ssize_t ret;
316         size_t used, npages, chunk, remain, reclaim;
317         int i;
318
319         /* Work out how much data we can actually add into the pipe */
320         used = pipe_occupancy(pipe->head, pipe->tail);
321         npages = max_t(ssize_t, pipe->max_usage - used, 0);
322         len = min_t(size_t, len, npages * PAGE_SIZE);
323         npages = DIV_ROUND_UP(len, PAGE_SIZE);
324
325         bv = kzalloc(array_size(npages, sizeof(bv[0])) +
326                      array_size(npages, sizeof(struct page *)), GFP_KERNEL);
327         if (!bv)
328                 return -ENOMEM;
329
330         pages = (void *)(bv + npages);
331         npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
332         if (!npages) {
333                 kfree(bv);
334                 return -ENOMEM;
335         }
336
337         remain = len = min_t(size_t, len, npages * PAGE_SIZE);
338
339         for (i = 0; i < npages; i++) {
340                 chunk = min_t(size_t, PAGE_SIZE, remain);
341                 bv[i].bv_page = pages[i];
342                 bv[i].bv_offset = 0;
343                 bv[i].bv_len = chunk;
344                 remain -= chunk;
345         }
346
347         /* Do the I/O */
348         iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
349         init_sync_kiocb(&kiocb, in);
350         kiocb.ki_pos = *ppos;
351         ret = call_read_iter(in, &kiocb, &to);
352
353         reclaim = npages * PAGE_SIZE;
354         remain = 0;
355         if (ret > 0) {
356                 reclaim -= ret;
357                 remain = ret;
358                 *ppos = kiocb.ki_pos;
359                 file_accessed(in);
360         } else if (ret < 0) {
361                 /*
362                  * callers of ->splice_read() expect -EAGAIN on
363                  * "can't put anything in there", rather than -EFAULT.
364                  */
365                 if (ret == -EFAULT)
366                         ret = -EAGAIN;
367         }
368
369         /* Free any pages that didn't get touched at all. */
370         reclaim /= PAGE_SIZE;
371         if (reclaim) {
372                 npages -= reclaim;
373                 release_pages(pages + npages, reclaim);
374         }
375
376         /* Push the remaining pages into the pipe. */
377         for (i = 0; i < npages; i++) {
378                 struct pipe_buffer *buf = pipe_head_buf(pipe);
379
380                 chunk = min_t(size_t, remain, PAGE_SIZE);
381                 *buf = (struct pipe_buffer) {
382                         .ops    = &default_pipe_buf_ops,
383                         .page   = bv[i].bv_page,
384                         .offset = 0,
385                         .len    = chunk,
386                 };
387                 pipe->head++;
388                 remain -= chunk;
389         }
390
391         kfree(bv);
392         return ret;
393 }
394 EXPORT_SYMBOL(direct_splice_read);
395
396 /**
397  * generic_file_splice_read - splice data from file to a pipe
398  * @in:         file to splice from
399  * @ppos:       position in @in
400  * @pipe:       pipe to splice to
401  * @len:        number of bytes to splice
402  * @flags:      splice modifier flags
403  *
404  * Description:
405  *    Will read pages from given file and fill them into a pipe. Can be
406  *    used as long as it has more or less sane ->read_iter().
407  *
408  */
409 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
410                                  struct pipe_inode_info *pipe, size_t len,
411                                  unsigned int flags)
412 {
413         struct iov_iter to;
414         struct kiocb kiocb;
415         int ret;
416
417         iov_iter_pipe(&to, ITER_DEST, pipe, len);
418         init_sync_kiocb(&kiocb, in);
419         kiocb.ki_pos = *ppos;
420         ret = call_read_iter(in, &kiocb, &to);
421         if (ret > 0) {
422                 *ppos = kiocb.ki_pos;
423                 file_accessed(in);
424         } else if (ret < 0) {
425                 /* free what was emitted */
426                 pipe_discard_from(pipe, to.start_head);
427                 /*
428                  * callers of ->splice_read() expect -EAGAIN on
429                  * "can't put anything in there", rather than -EFAULT.
430                  */
431                 if (ret == -EFAULT)
432                         ret = -EAGAIN;
433         }
434
435         return ret;
436 }
437 EXPORT_SYMBOL(generic_file_splice_read);
438
439 const struct pipe_buf_operations default_pipe_buf_ops = {
440         .release        = generic_pipe_buf_release,
441         .try_steal      = generic_pipe_buf_try_steal,
442         .get            = generic_pipe_buf_get,
443 };
444
445 /* Pipe buffer operations for a socket and similar. */
446 const struct pipe_buf_operations nosteal_pipe_buf_ops = {
447         .release        = generic_pipe_buf_release,
448         .get            = generic_pipe_buf_get,
449 };
450 EXPORT_SYMBOL(nosteal_pipe_buf_ops);
451
452 static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
453 {
454         smp_mb();
455         if (waitqueue_active(&pipe->wr_wait))
456                 wake_up_interruptible(&pipe->wr_wait);
457         kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
458 }
459
460 /**
461  * splice_from_pipe_feed - feed available data from a pipe to a file
462  * @pipe:       pipe to splice from
463  * @sd:         information to @actor
464  * @actor:      handler that splices the data
465  *
466  * Description:
467  *    This function loops over the pipe and calls @actor to do the
468  *    actual moving of a single struct pipe_buffer to the desired
469  *    destination.  It returns when there's no more buffers left in
470  *    the pipe or if the requested number of bytes (@sd->total_len)
471  *    have been copied.  It returns a positive number (one) if the
472  *    pipe needs to be filled with more data, zero if the required
473  *    number of bytes have been copied and -errno on error.
474  *
475  *    This, together with splice_from_pipe_{begin,end,next}, may be
476  *    used to implement the functionality of __splice_from_pipe() when
477  *    locking is required around copying the pipe buffers to the
478  *    destination.
479  */
480 static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
481                           splice_actor *actor)
482 {
483         unsigned int head = pipe->head;
484         unsigned int tail = pipe->tail;
485         unsigned int mask = pipe->ring_size - 1;
486         int ret;
487
488         while (!pipe_empty(head, tail)) {
489                 struct pipe_buffer *buf = &pipe->bufs[tail & mask];
490
491                 sd->len = buf->len;
492                 if (sd->len > sd->total_len)
493                         sd->len = sd->total_len;
494
495                 ret = pipe_buf_confirm(pipe, buf);
496                 if (unlikely(ret)) {
497                         if (ret == -ENODATA)
498                                 ret = 0;
499                         return ret;
500                 }
501
502                 ret = actor(pipe, buf, sd);
503                 if (ret <= 0)
504                         return ret;
505
506                 buf->offset += ret;
507                 buf->len -= ret;
508
509                 sd->num_spliced += ret;
510                 sd->len -= ret;
511                 sd->pos += ret;
512                 sd->total_len -= ret;
513
514                 if (!buf->len) {
515                         pipe_buf_release(pipe, buf);
516                         tail++;
517                         pipe->tail = tail;
518                         if (pipe->files)
519                                 sd->need_wakeup = true;
520                 }
521
522                 if (!sd->total_len)
523                         return 0;
524         }
525
526         return 1;
527 }
528
529 /* We know we have a pipe buffer, but maybe it's empty? */
530 static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
531 {
532         unsigned int tail = pipe->tail;
533         unsigned int mask = pipe->ring_size - 1;
534         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
535
536         if (unlikely(!buf->len)) {
537                 pipe_buf_release(pipe, buf);
538                 pipe->tail = tail+1;
539                 return true;
540         }
541
542         return false;
543 }
544
545 /**
546  * splice_from_pipe_next - wait for some data to splice from
547  * @pipe:       pipe to splice from
548  * @sd:         information about the splice operation
549  *
550  * Description:
551  *    This function will wait for some data and return a positive
552  *    value (one) if pipe buffers are available.  It will return zero
553  *    or -errno if no more data needs to be spliced.
554  */
555 static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
556 {
557         /*
558          * Check for signal early to make process killable when there are
559          * always buffers available
560          */
561         if (signal_pending(current))
562                 return -ERESTARTSYS;
563
564 repeat:
565         while (pipe_empty(pipe->head, pipe->tail)) {
566                 if (!pipe->writers)
567                         return 0;
568
569                 if (sd->num_spliced)
570                         return 0;
571
572                 if (sd->flags & SPLICE_F_NONBLOCK)
573                         return -EAGAIN;
574
575                 if (signal_pending(current))
576                         return -ERESTARTSYS;
577
578                 if (sd->need_wakeup) {
579                         wakeup_pipe_writers(pipe);
580                         sd->need_wakeup = false;
581                 }
582
583                 pipe_wait_readable(pipe);
584         }
585
586         if (eat_empty_buffer(pipe))
587                 goto repeat;
588
589         return 1;
590 }
591
592 /**
593  * splice_from_pipe_begin - start splicing from pipe
594  * @sd:         information about the splice operation
595  *
596  * Description:
597  *    This function should be called before a loop containing
598  *    splice_from_pipe_next() and splice_from_pipe_feed() to
599  *    initialize the necessary fields of @sd.
600  */
601 static void splice_from_pipe_begin(struct splice_desc *sd)
602 {
603         sd->num_spliced = 0;
604         sd->need_wakeup = false;
605 }
606
607 /**
608  * splice_from_pipe_end - finish splicing from pipe
609  * @pipe:       pipe to splice from
610  * @sd:         information about the splice operation
611  *
612  * Description:
613  *    This function will wake up pipe writers if necessary.  It should
614  *    be called after a loop containing splice_from_pipe_next() and
615  *    splice_from_pipe_feed().
616  */
617 static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
618 {
619         if (sd->need_wakeup)
620                 wakeup_pipe_writers(pipe);
621 }
622
623 /**
624  * __splice_from_pipe - splice data from a pipe to given actor
625  * @pipe:       pipe to splice from
626  * @sd:         information to @actor
627  * @actor:      handler that splices the data
628  *
629  * Description:
630  *    This function does little more than loop over the pipe and call
631  *    @actor to do the actual moving of a single struct pipe_buffer to
632  *    the desired destination. See pipe_to_file, pipe_to_sendmsg, or
633  *    pipe_to_user.
634  *
635  */
636 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
637                            splice_actor *actor)
638 {
639         int ret;
640
641         splice_from_pipe_begin(sd);
642         do {
643                 cond_resched();
644                 ret = splice_from_pipe_next(pipe, sd);
645                 if (ret > 0)
646                         ret = splice_from_pipe_feed(pipe, sd, actor);
647         } while (ret > 0);
648         splice_from_pipe_end(pipe, sd);
649
650         return sd->num_spliced ? sd->num_spliced : ret;
651 }
652 EXPORT_SYMBOL(__splice_from_pipe);
653
654 /**
655  * splice_from_pipe - splice data from a pipe to a file
656  * @pipe:       pipe to splice from
657  * @out:        file to splice to
658  * @ppos:       position in @out
659  * @len:        how many bytes to splice
660  * @flags:      splice modifier flags
661  * @actor:      handler that splices the data
662  *
663  * Description:
664  *    See __splice_from_pipe. This function locks the pipe inode,
665  *    otherwise it's identical to __splice_from_pipe().
666  *
667  */
668 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
669                          loff_t *ppos, size_t len, unsigned int flags,
670                          splice_actor *actor)
671 {
672         ssize_t ret;
673         struct splice_desc sd = {
674                 .total_len = len,
675                 .flags = flags,
676                 .pos = *ppos,
677                 .u.file = out,
678         };
679
680         pipe_lock(pipe);
681         ret = __splice_from_pipe(pipe, &sd, actor);
682         pipe_unlock(pipe);
683
684         return ret;
685 }
686
687 /**
688  * iter_file_splice_write - splice data from a pipe to a file
689  * @pipe:       pipe info
690  * @out:        file to write to
691  * @ppos:       position in @out
692  * @len:        number of bytes to splice
693  * @flags:      splice modifier flags
694  *
695  * Description:
696  *    Will either move or copy pages (determined by @flags options) from
697  *    the given pipe inode to the given file.
698  *    This one is ->write_iter-based.
699  *
700  */
701 ssize_t
702 iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
703                           loff_t *ppos, size_t len, unsigned int flags)
704 {
705         struct splice_desc sd = {
706                 .total_len = len,
707                 .flags = flags,
708                 .pos = *ppos,
709                 .u.file = out,
710         };
711         int nbufs = pipe->max_usage;
712         struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
713                                         GFP_KERNEL);
714         ssize_t ret;
715
716         if (unlikely(!array))
717                 return -ENOMEM;
718
719         pipe_lock(pipe);
720
721         splice_from_pipe_begin(&sd);
722         while (sd.total_len) {
723                 struct iov_iter from;
724                 unsigned int head, tail, mask;
725                 size_t left;
726                 int n;
727
728                 ret = splice_from_pipe_next(pipe, &sd);
729                 if (ret <= 0)
730                         break;
731
732                 if (unlikely(nbufs < pipe->max_usage)) {
733                         kfree(array);
734                         nbufs = pipe->max_usage;
735                         array = kcalloc(nbufs, sizeof(struct bio_vec),
736                                         GFP_KERNEL);
737                         if (!array) {
738                                 ret = -ENOMEM;
739                                 break;
740                         }
741                 }
742
743                 head = pipe->head;
744                 tail = pipe->tail;
745                 mask = pipe->ring_size - 1;
746
747                 /* build the vector */
748                 left = sd.total_len;
749                 for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
750                         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
751                         size_t this_len = buf->len;
752
753                         /* zero-length bvecs are not supported, skip them */
754                         if (!this_len)
755                                 continue;
756                         this_len = min(this_len, left);
757
758                         ret = pipe_buf_confirm(pipe, buf);
759                         if (unlikely(ret)) {
760                                 if (ret == -ENODATA)
761                                         ret = 0;
762                                 goto done;
763                         }
764
765                         bvec_set_page(&array[n], buf->page, this_len,
766                                       buf->offset);
767                         left -= this_len;
768                         n++;
769                 }
770
771                 iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
772                 ret = vfs_iter_write(out, &from, &sd.pos, 0);
773                 if (ret <= 0)
774                         break;
775
776                 sd.num_spliced += ret;
777                 sd.total_len -= ret;
778                 *ppos = sd.pos;
779
780                 /* dismiss the fully eaten buffers, adjust the partial one */
781                 tail = pipe->tail;
782                 while (ret) {
783                         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
784                         if (ret >= buf->len) {
785                                 ret -= buf->len;
786                                 buf->len = 0;
787                                 pipe_buf_release(pipe, buf);
788                                 tail++;
789                                 pipe->tail = tail;
790                                 if (pipe->files)
791                                         sd.need_wakeup = true;
792                         } else {
793                                 buf->offset += ret;
794                                 buf->len -= ret;
795                                 ret = 0;
796                         }
797                 }
798         }
799 done:
800         kfree(array);
801         splice_from_pipe_end(pipe, &sd);
802
803         pipe_unlock(pipe);
804
805         if (sd.num_spliced)
806                 ret = sd.num_spliced;
807
808         return ret;
809 }
810
811 EXPORT_SYMBOL(iter_file_splice_write);
812
#ifdef CONFIG_NET
/**
 * splice_to_socket - splice data from a pipe to a socket
 * @pipe:       pipe to splice from
 * @out:        socket to write to
 * @ppos:       position in @out
 * @len:        number of bytes to splice
 * @flags:      splice modifier flags
 *
 * Description:
 *    Will send @len bytes from the pipe to a network socket. No data copying
 *    is involved.
 *
 *    Pipe buffers are gathered into a small on-stack bio_vec array and
 *    passed to sock_sendmsg() with MSG_SPLICE_PAGES; buffers the socket
 *    consumed are then retired from the pipe.  Returns the number of
 *    bytes spliced if any were, otherwise zero or -errno (-EAGAIN for an
 *    empty pipe with SPLICE_F_NONBLOCK, -ERESTARTSYS on a pending signal).
 *
 */
ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
                         loff_t *ppos, size_t len, unsigned int flags)
{
        struct socket *sock = sock_from_file(out);
        struct bio_vec bvec[16];
        struct msghdr msg = {};
        ssize_t ret = 0;
        size_t spliced = 0;
        bool need_wakeup = false;

        pipe_lock(pipe);

        while (len > 0) {
                unsigned int head, tail, mask, bc = 0;
                size_t remain = len;

                /*
                 * Check for signal early to make process killable when there
                 * are always buffers available
                 */
                ret = -ERESTARTSYS;
                if (signal_pending(current))
                        break;

                while (pipe_empty(pipe->head, pipe->tail)) {
                        ret = 0;
                        if (!pipe->writers)
                                goto out;

                        if (spliced)
                                goto out;

                        ret = -EAGAIN;
                        if (flags & SPLICE_F_NONBLOCK)
                                goto out;

                        ret = -ERESTARTSYS;
                        if (signal_pending(current))
                                goto out;

                        if (need_wakeup) {
                                wakeup_pipe_writers(pipe);
                                need_wakeup = false;
                        }

                        pipe_wait_readable(pipe);
                }

                head = pipe->head;
                tail = pipe->tail;
                mask = pipe->ring_size - 1;

                /* Gather up to ARRAY_SIZE(bvec) segments, at most @remain bytes. */
                while (!pipe_empty(head, tail)) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        size_t seg;

                        if (!buf->len) {
                                tail++;
                                continue;
                        }

                        /*
                         * Take as much of this buffer as is still wanted.
                         * The segment is deliberately not capped at
                         * PAGE_SIZE: a buffer longer than a page must go
                         * out as one segment, otherwise the same buffer
                         * would be queued again from the same offset.
                         */
                        seg = min_t(size_t, remain, buf->len);

                        ret = pipe_buf_confirm(pipe, buf);
                        if (unlikely(ret)) {
                                if (ret == -ENODATA)
                                        ret = 0;
                                break;
                        }

                        bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset);
                        remain -= seg;

                        /*
                         * Stop once the request is satisfied (avoids adding
                         * zero-length segments) or the array is full.
                         */
                        if (remain == 0 || bc >= ARRAY_SIZE(bvec))
                                break;

                        /* remain != 0 means this buffer was taken whole. */
                        tail++;
                }

                if (!bc)
                        break;

                msg.msg_flags = MSG_SPLICE_PAGES;
                if (flags & SPLICE_F_MORE)
                        msg.msg_flags |= MSG_MORE;
                /* More wanted and more already queued in the pipe. */
                if (remain && pipe_occupancy(pipe->head, tail) > 0)
                        msg.msg_flags |= MSG_MORE;

                iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc,
                              len - remain);
                ret = sock_sendmsg(sock, &msg);
                if (ret <= 0)
                        break;

                spliced += ret;
                len -= ret;

                /* Retire what the socket consumed, starting at the real tail. */
                tail = pipe->tail;
                while (ret > 0) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        size_t seg = min_t(size_t, ret, buf->len);

                        buf->offset += seg;
                        buf->len -= seg;
                        ret -= seg;

                        if (!buf->len) {
                                pipe_buf_release(pipe, buf);
                                tail++;
                        }
                }

                if (tail != pipe->tail) {
                        pipe->tail = tail;
                        if (pipe->files)
                                need_wakeup = true;
                }
        }

out:
        pipe_unlock(pipe);
        if (need_wakeup)
                wakeup_pipe_writers(pipe);
        return spliced ?: ret;
}
#endif
952
953 static int warn_unsupported(struct file *file, const char *op)
954 {
955         pr_debug_ratelimited(
956                 "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
957                 op, file, current->pid, current->comm);
958         return -EINVAL;
959 }
960
961 /*
962  * Attempt to initiate a splice from pipe to file.
963  */
964 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
965                            loff_t *ppos, size_t len, unsigned int flags)
966 {
967         if (unlikely(!out->f_op->splice_write))
968                 return warn_unsupported(out, "write");
969         return out->f_op->splice_write(pipe, out, ppos, len, flags);
970 }
971
972 /*
973  * Indicate to the caller that there was a premature EOF when reading from the
974  * source and the caller didn't indicate they would be sending more data after
975  * this.
976  */
977 static void do_splice_eof(struct splice_desc *sd)
978 {
979         if (sd->splice_eof)
980                 sd->splice_eof(sd);
981 }
982
983 /*
984  * Attempt to initiate a splice from a file to a pipe.
985  */
986 static long do_splice_to(struct file *in, loff_t *ppos,
987                          struct pipe_inode_info *pipe, size_t len,
988                          unsigned int flags)
989 {
990         unsigned int p_space;
991         int ret;
992
993         if (unlikely(!(in->f_mode & FMODE_READ)))
994                 return -EBADF;
995
996         /* Don't try to read more the pipe has space for. */
997         p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
998         len = min_t(size_t, len, p_space << PAGE_SHIFT);
999
1000         ret = rw_verify_area(READ, in, ppos, len);
1001         if (unlikely(ret < 0))
1002                 return ret;
1003
1004         if (unlikely(len > MAX_RW_COUNT))
1005                 len = MAX_RW_COUNT;
1006
1007         if (unlikely(!in->f_op->splice_read))
1008                 return warn_unsupported(in, "read");
1009         return in->f_op->splice_read(in, ppos, pipe, len, flags);
1010 }
1011
1012 /**
1013  * splice_direct_to_actor - splices data directly between two non-pipes
1014  * @in:         file to splice from
1015  * @sd:         actor information on where to splice to
1016  * @actor:      handles the data splicing
1017  *
1018  * Description:
1019  *    This is a special case helper to splice directly between two
1020  *    points, without requiring an explicit pipe. Internally an allocated
1021  *    pipe is cached in the process, and reused during the lifetime of
1022  *    that process.
1023  *
1024  */
1025 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1026                                splice_direct_actor *actor)
1027 {
1028         struct pipe_inode_info *pipe;
1029         long ret, bytes;
1030         size_t len;
1031         int i, flags, more;
1032
1033         /*
1034          * We require the input to be seekable, as we don't want to randomly
1035          * drop data for eg socket -> socket splicing. Use the piped splicing
1036          * for that!
1037          */
1038         if (unlikely(!(in->f_mode & FMODE_LSEEK)))
1039                 return -EINVAL;
1040
1041         /*
1042          * neither in nor out is a pipe, setup an internal pipe attached to
1043          * 'out' and transfer the wanted data from 'in' to 'out' through that
1044          */
1045         pipe = current->splice_pipe;
1046         if (unlikely(!pipe)) {
1047                 pipe = alloc_pipe_info();
1048                 if (!pipe)
1049                         return -ENOMEM;
1050
1051                 /*
1052                  * We don't have an immediate reader, but we'll read the stuff
1053                  * out of the pipe right after the splice_to_pipe(). So set
1054                  * PIPE_READERS appropriately.
1055                  */
1056                 pipe->readers = 1;
1057
1058                 current->splice_pipe = pipe;
1059         }
1060
1061         /*
1062          * Do the splice.
1063          */
1064         bytes = 0;
1065         len = sd->total_len;
1066
1067         /* Don't block on output, we have to drain the direct pipe. */
1068         flags = sd->flags;
1069         sd->flags &= ~SPLICE_F_NONBLOCK;
1070
1071         /*
1072          * We signal MORE until we've read sufficient data to fulfill the
1073          * request and we keep signalling it if the caller set it.
1074          */
1075         more = sd->flags & SPLICE_F_MORE;
1076         sd->flags |= SPLICE_F_MORE;
1077
1078         WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
1079
1080         while (len) {
1081                 size_t read_len;
1082                 loff_t pos = sd->pos, prev_pos = pos;
1083
1084                 ret = do_splice_to(in, &pos, pipe, len, flags);
1085                 if (unlikely(ret <= 0))
1086                         goto read_failure;
1087
1088                 read_len = ret;
1089                 sd->total_len = read_len;
1090
1091                 /*
1092                  * If we now have sufficient data to fulfill the request then
1093                  * we clear SPLICE_F_MORE if it was not set initially.
1094                  */
1095                 if (read_len >= len && !more)
1096                         sd->flags &= ~SPLICE_F_MORE;
1097
1098                 /*
1099                  * NOTE: nonblocking mode only applies to the input. We
1100                  * must not do the output in nonblocking mode as then we
1101                  * could get stuck data in the internal pipe:
1102                  */
1103                 ret = actor(pipe, sd);
1104                 if (unlikely(ret <= 0)) {
1105                         sd->pos = prev_pos;
1106                         goto out_release;
1107                 }
1108
1109                 bytes += ret;
1110                 len -= ret;
1111                 sd->pos = pos;
1112
1113                 if (ret < read_len) {
1114                         sd->pos = prev_pos + ret;
1115                         goto out_release;
1116                 }
1117         }
1118
1119 done:
1120         pipe->tail = pipe->head = 0;
1121         file_accessed(in);
1122         return bytes;
1123
1124 read_failure:
1125         /*
1126          * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
1127          * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
1128          * "->splice_in()" that returned EOF (ie zero) *and* we have sent at
1129          * least 1 byte *then* we will also do the ->splice_eof() call.
1130          */
1131         if (ret == 0 && !more && len > 0 && bytes)
1132                 do_splice_eof(sd);
1133 out_release:
1134         /*
1135          * If we did an incomplete transfer we must release
1136          * the pipe buffers in question:
1137          */
1138         for (i = 0; i < pipe->ring_size; i++) {
1139                 struct pipe_buffer *buf = &pipe->bufs[i];
1140
1141                 if (buf->ops)
1142                         pipe_buf_release(pipe, buf);
1143         }
1144
1145         if (!bytes)
1146                 bytes = ret;
1147
1148         goto done;
1149 }
1150 EXPORT_SYMBOL(splice_direct_to_actor);
1151
1152 static int direct_splice_actor(struct pipe_inode_info *pipe,
1153                                struct splice_desc *sd)
1154 {
1155         struct file *file = sd->u.file;
1156
1157         return do_splice_from(pipe, file, sd->opos, sd->total_len,
1158                               sd->flags);
1159 }
1160
1161 static void direct_file_splice_eof(struct splice_desc *sd)
1162 {
1163         struct file *file = sd->u.file;
1164
1165         if (file->f_op->splice_eof)
1166                 file->f_op->splice_eof(file);
1167 }
1168
1169 /**
1170  * do_splice_direct - splices data directly between two files
1171  * @in:         file to splice from
1172  * @ppos:       input file offset
1173  * @out:        file to splice to
1174  * @opos:       output file offset
1175  * @len:        number of bytes to splice
1176  * @flags:      splice modifier flags
1177  *
1178  * Description:
1179  *    For use by do_sendfile(). splice can easily emulate sendfile, but
1180  *    doing it in the application would incur an extra system call
1181  *    (splice in + splice out, as compared to just sendfile()). So this helper
1182  *    can splice directly through a process-private pipe.
1183  *
1184  */
1185 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1186                       loff_t *opos, size_t len, unsigned int flags)
1187 {
1188         struct splice_desc sd = {
1189                 .len            = len,
1190                 .total_len      = len,
1191                 .flags          = flags,
1192                 .pos            = *ppos,
1193                 .u.file         = out,
1194                 .splice_eof     = direct_file_splice_eof,
1195                 .opos           = opos,
1196         };
1197         long ret;
1198
1199         if (unlikely(!(out->f_mode & FMODE_WRITE)))
1200                 return -EBADF;
1201
1202         if (unlikely(out->f_flags & O_APPEND))
1203                 return -EINVAL;
1204
1205         ret = rw_verify_area(WRITE, out, opos, len);
1206         if (unlikely(ret < 0))
1207                 return ret;
1208
1209         ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1210         if (ret > 0)
1211                 *ppos = sd.pos;
1212
1213         return ret;
1214 }
1215 EXPORT_SYMBOL(do_splice_direct);
1216
1217 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1218 {
1219         for (;;) {
1220                 if (unlikely(!pipe->readers)) {
1221                         send_sig(SIGPIPE, current, 0);
1222                         return -EPIPE;
1223                 }
1224                 if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1225                         return 0;
1226                 if (flags & SPLICE_F_NONBLOCK)
1227                         return -EAGAIN;
1228                 if (signal_pending(current))
1229                         return -ERESTARTSYS;
1230                 pipe_wait_writable(pipe);
1231         }
1232 }
1233
1234 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1235                                struct pipe_inode_info *opipe,
1236                                size_t len, unsigned int flags);
1237
1238 long splice_file_to_pipe(struct file *in,
1239                          struct pipe_inode_info *opipe,
1240                          loff_t *offset,
1241                          size_t len, unsigned int flags)
1242 {
1243         long ret;
1244
1245         pipe_lock(opipe);
1246         ret = wait_for_space(opipe, flags);
1247         if (!ret)
1248                 ret = do_splice_to(in, offset, opipe, len, flags);
1249         pipe_unlock(opipe);
1250         if (ret > 0)
1251                 wakeup_pipe_readers(opipe);
1252         return ret;
1253 }
1254
1255 /*
1256  * Determine where to splice to/from.
1257  */
1258 long do_splice(struct file *in, loff_t *off_in, struct file *out,
1259                loff_t *off_out, size_t len, unsigned int flags)
1260 {
1261         struct pipe_inode_info *ipipe;
1262         struct pipe_inode_info *opipe;
1263         loff_t offset;
1264         long ret;
1265
1266         if (unlikely(!(in->f_mode & FMODE_READ) ||
1267                      !(out->f_mode & FMODE_WRITE)))
1268                 return -EBADF;
1269
1270         ipipe = get_pipe_info(in, true);
1271         opipe = get_pipe_info(out, true);
1272
1273         if (ipipe && opipe) {
1274                 if (off_in || off_out)
1275                         return -ESPIPE;
1276
1277                 /* Splicing to self would be fun, but... */
1278                 if (ipipe == opipe)
1279                         return -EINVAL;
1280
1281                 if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1282                         flags |= SPLICE_F_NONBLOCK;
1283
1284                 return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1285         }
1286
1287         if (ipipe) {
1288                 if (off_in)
1289                         return -ESPIPE;
1290                 if (off_out) {
1291                         if (!(out->f_mode & FMODE_PWRITE))
1292                                 return -EINVAL;
1293                         offset = *off_out;
1294                 } else {
1295                         offset = out->f_pos;
1296                 }
1297
1298                 if (unlikely(out->f_flags & O_APPEND))
1299                         return -EINVAL;
1300
1301                 ret = rw_verify_area(WRITE, out, &offset, len);
1302                 if (unlikely(ret < 0))
1303                         return ret;
1304
1305                 if (in->f_flags & O_NONBLOCK)
1306                         flags |= SPLICE_F_NONBLOCK;
1307
1308                 file_start_write(out);
1309                 ret = do_splice_from(ipipe, out, &offset, len, flags);
1310                 file_end_write(out);
1311
1312                 if (ret > 0)
1313                         fsnotify_modify(out);
1314
1315                 if (!off_out)
1316                         out->f_pos = offset;
1317                 else
1318                         *off_out = offset;
1319
1320                 return ret;
1321         }
1322
1323         if (opipe) {
1324                 if (off_out)
1325                         return -ESPIPE;
1326                 if (off_in) {
1327                         if (!(in->f_mode & FMODE_PREAD))
1328                                 return -EINVAL;
1329                         offset = *off_in;
1330                 } else {
1331                         offset = in->f_pos;
1332                 }
1333
1334                 if (out->f_flags & O_NONBLOCK)
1335                         flags |= SPLICE_F_NONBLOCK;
1336
1337                 ret = splice_file_to_pipe(in, opipe, &offset, len, flags);
1338
1339                 if (ret > 0)
1340                         fsnotify_access(in);
1341
1342                 if (!off_in)
1343                         in->f_pos = offset;
1344                 else
1345                         *off_in = offset;
1346
1347                 return ret;
1348         }
1349
1350         return -EINVAL;
1351 }
1352
1353 static long __do_splice(struct file *in, loff_t __user *off_in,
1354                         struct file *out, loff_t __user *off_out,
1355                         size_t len, unsigned int flags)
1356 {
1357         struct pipe_inode_info *ipipe;
1358         struct pipe_inode_info *opipe;
1359         loff_t offset, *__off_in = NULL, *__off_out = NULL;
1360         long ret;
1361
1362         ipipe = get_pipe_info(in, true);
1363         opipe = get_pipe_info(out, true);
1364
1365         if (ipipe) {
1366                 if (off_in)
1367                         return -ESPIPE;
1368                 pipe_clear_nowait(in);
1369         }
1370         if (opipe) {
1371                 if (off_out)
1372                         return -ESPIPE;
1373                 pipe_clear_nowait(out);
1374         }
1375
1376         if (off_out) {
1377                 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1378                         return -EFAULT;
1379                 __off_out = &offset;
1380         }
1381         if (off_in) {
1382                 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1383                         return -EFAULT;
1384                 __off_in = &offset;
1385         }
1386
1387         ret = do_splice(in, __off_in, out, __off_out, len, flags);
1388         if (ret < 0)
1389                 return ret;
1390
1391         if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
1392                 return -EFAULT;
1393         if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
1394                 return -EFAULT;
1395
1396         return ret;
1397 }
1398
1399 static int iter_to_pipe(struct iov_iter *from,
1400                         struct pipe_inode_info *pipe,
1401                         unsigned flags)
1402 {
1403         struct pipe_buffer buf = {
1404                 .ops = &user_page_pipe_buf_ops,
1405                 .flags = flags
1406         };
1407         size_t total = 0;
1408         int ret = 0;
1409
1410         while (iov_iter_count(from)) {
1411                 struct page *pages[16];
1412                 ssize_t left;
1413                 size_t start;
1414                 int i, n;
1415
1416                 left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
1417                 if (left <= 0) {
1418                         ret = left;
1419                         break;
1420                 }
1421
1422                 n = DIV_ROUND_UP(left + start, PAGE_SIZE);
1423                 for (i = 0; i < n; i++) {
1424                         int size = min_t(int, left, PAGE_SIZE - start);
1425
1426                         buf.page = pages[i];
1427                         buf.offset = start;
1428                         buf.len = size;
1429                         ret = add_to_pipe(pipe, &buf);
1430                         if (unlikely(ret < 0)) {
1431                                 iov_iter_revert(from, left);
1432                                 // this one got dropped by add_to_pipe()
1433                                 while (++i < n)
1434                                         put_page(pages[i]);
1435                                 goto out;
1436                         }
1437                         total += ret;
1438                         left -= size;
1439                         start = 0;
1440                 }
1441         }
1442 out:
1443         return total ? total : ret;
1444 }
1445
1446 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1447                         struct splice_desc *sd)
1448 {
1449         int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1450         return n == sd->len ? n : -EFAULT;
1451 }
1452
1453 /*
1454  * For lack of a better implementation, implement vmsplice() to userspace
1455  * as a simple copy of the pipes pages to the user iov.
1456  */
1457 static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
1458                              unsigned int flags)
1459 {
1460         struct pipe_inode_info *pipe = get_pipe_info(file, true);
1461         struct splice_desc sd = {
1462                 .total_len = iov_iter_count(iter),
1463                 .flags = flags,
1464                 .u.data = iter
1465         };
1466         long ret = 0;
1467
1468         if (!pipe)
1469                 return -EBADF;
1470
1471         pipe_clear_nowait(file);
1472
1473         if (sd.total_len) {
1474                 pipe_lock(pipe);
1475                 ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1476                 pipe_unlock(pipe);
1477         }
1478
1479         return ret;
1480 }
1481
1482 /*
1483  * vmsplice splices a user address range into a pipe. It can be thought of
1484  * as splice-from-memory, where the regular splice is splice-from-file (or
1485  * to file). In both cases the output is a pipe, naturally.
1486  */
1487 static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
1488                              unsigned int flags)
1489 {
1490         struct pipe_inode_info *pipe;
1491         long ret = 0;
1492         unsigned buf_flag = 0;
1493
1494         if (flags & SPLICE_F_GIFT)
1495                 buf_flag = PIPE_BUF_FLAG_GIFT;
1496
1497         pipe = get_pipe_info(file, true);
1498         if (!pipe)
1499                 return -EBADF;
1500
1501         pipe_clear_nowait(file);
1502
1503         pipe_lock(pipe);
1504         ret = wait_for_space(pipe, flags);
1505         if (!ret)
1506                 ret = iter_to_pipe(iter, pipe, buf_flag);
1507         pipe_unlock(pipe);
1508         if (ret > 0)
1509                 wakeup_pipe_readers(pipe);
1510         return ret;
1511 }
1512
1513 static int vmsplice_type(struct fd f, int *type)
1514 {
1515         if (!f.file)
1516                 return -EBADF;
1517         if (f.file->f_mode & FMODE_WRITE) {
1518                 *type = ITER_SOURCE;
1519         } else if (f.file->f_mode & FMODE_READ) {
1520                 *type = ITER_DEST;
1521         } else {
1522                 fdput(f);
1523                 return -EBADF;
1524         }
1525         return 0;
1526 }
1527
1528 /*
1529  * Note that vmsplice only really supports true splicing _from_ user memory
1530  * to a pipe, not the other way around. Splicing from user memory is a simple
1531  * operation that can be supported without any funky alignment restrictions
1532  * or nasty vm tricks. We simply map in the user memory and fill them into
1533  * a pipe. The reverse isn't quite as easy, though. There are two possible
1534  * solutions for that:
1535  *
1536  *      - memcpy() the data internally, at which point we might as well just
1537  *        do a regular read() on the buffer anyway.
1538  *      - Lots of nasty vm tricks, that are neither fast nor flexible (it
1539  *        has restriction limitations on both ends of the pipe).
1540  *
1541  * Currently we punt and implement it as a normal copy, see pipe_to_user().
1542  *
1543  */
1544 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
1545                 unsigned long, nr_segs, unsigned int, flags)
1546 {
1547         struct iovec iovstack[UIO_FASTIOV];
1548         struct iovec *iov = iovstack;
1549         struct iov_iter iter;
1550         ssize_t error;
1551         struct fd f;
1552         int type;
1553
1554         if (unlikely(flags & ~SPLICE_F_ALL))
1555                 return -EINVAL;
1556
1557         f = fdget(fd);
1558         error = vmsplice_type(f, &type);
1559         if (error)
1560                 return error;
1561
1562         error = import_iovec(type, uiov, nr_segs,
1563                              ARRAY_SIZE(iovstack), &iov, &iter);
1564         if (error < 0)
1565                 goto out_fdput;
1566
1567         if (!iov_iter_count(&iter))
1568                 error = 0;
1569         else if (type == ITER_SOURCE)
1570                 error = vmsplice_to_pipe(f.file, &iter, flags);
1571         else
1572                 error = vmsplice_to_user(f.file, &iter, flags);
1573
1574         kfree(iov);
1575 out_fdput:
1576         fdput(f);
1577         return error;
1578 }
1579
1580 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1581                 int, fd_out, loff_t __user *, off_out,
1582                 size_t, len, unsigned int, flags)
1583 {
1584         struct fd in, out;
1585         long error;
1586
1587         if (unlikely(!len))
1588                 return 0;
1589
1590         if (unlikely(flags & ~SPLICE_F_ALL))
1591                 return -EINVAL;
1592
1593         error = -EBADF;
1594         in = fdget(fd_in);
1595         if (in.file) {
1596                 out = fdget(fd_out);
1597                 if (out.file) {
1598                         error = __do_splice(in.file, off_in, out.file, off_out,
1599                                                 len, flags);
1600                         fdput(out);
1601                 }
1602                 fdput(in);
1603         }
1604         return error;
1605 }
1606
1607 /*
1608  * Make sure there's data to read. Wait for input if we can, otherwise
1609  * return an appropriate error.
1610  */
1611 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1612 {
1613         int ret;
1614
1615         /*
1616          * Check the pipe occupancy without the inode lock first. This function
1617          * is speculative anyways, so missing one is ok.
1618          */
1619         if (!pipe_empty(pipe->head, pipe->tail))
1620                 return 0;
1621
1622         ret = 0;
1623         pipe_lock(pipe);
1624
1625         while (pipe_empty(pipe->head, pipe->tail)) {
1626                 if (signal_pending(current)) {
1627                         ret = -ERESTARTSYS;
1628                         break;
1629                 }
1630                 if (!pipe->writers)
1631                         break;
1632                 if (flags & SPLICE_F_NONBLOCK) {
1633                         ret = -EAGAIN;
1634                         break;
1635                 }
1636                 pipe_wait_readable(pipe);
1637         }
1638
1639         pipe_unlock(pipe);
1640         return ret;
1641 }
1642
1643 /*
1644  * Make sure there's writeable room. Wait for room if we can, otherwise
1645  * return an appropriate error.
1646  */
1647 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1648 {
1649         int ret;
1650
1651         /*
1652          * Check pipe occupancy without the inode lock first. This function
1653          * is speculative anyways, so missing one is ok.
1654          */
1655         if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1656                 return 0;
1657
1658         ret = 0;
1659         pipe_lock(pipe);
1660
1661         while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
1662                 if (!pipe->readers) {
1663                         send_sig(SIGPIPE, current, 0);
1664                         ret = -EPIPE;
1665                         break;
1666                 }
1667                 if (flags & SPLICE_F_NONBLOCK) {
1668                         ret = -EAGAIN;
1669                         break;
1670                 }
1671                 if (signal_pending(current)) {
1672                         ret = -ERESTARTSYS;
1673                         break;
1674                 }
1675                 pipe_wait_writable(pipe);
1676         }
1677
1678         pipe_unlock(pipe);
1679         return ret;
1680 }
1681
1682 /*
1683  * Splice contents of ipipe to opipe.
1684  */
1685 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1686                                struct pipe_inode_info *opipe,
1687                                size_t len, unsigned int flags)
1688 {
1689         struct pipe_buffer *ibuf, *obuf;
1690         unsigned int i_head, o_head;
1691         unsigned int i_tail, o_tail;
1692         unsigned int i_mask, o_mask;
1693         int ret = 0;
1694         bool input_wakeup = false;
1695
1696
1697 retry:
1698         ret = ipipe_prep(ipipe, flags);
1699         if (ret)
1700                 return ret;
1701
1702         ret = opipe_prep(opipe, flags);
1703         if (ret)
1704                 return ret;
1705
1706         /*
1707          * Potential ABBA deadlock, work around it by ordering lock
1708          * grabbing by pipe info address. Otherwise two different processes
1709          * could deadlock (one doing tee from A -> B, the other from B -> A).
1710          */
1711         pipe_double_lock(ipipe, opipe);
1712
1713         i_tail = ipipe->tail;
1714         i_mask = ipipe->ring_size - 1;
1715         o_head = opipe->head;
1716         o_mask = opipe->ring_size - 1;
1717
1718         do {
1719                 size_t o_len;
1720
1721                 if (!opipe->readers) {
1722                         send_sig(SIGPIPE, current, 0);
1723                         if (!ret)
1724                                 ret = -EPIPE;
1725                         break;
1726                 }
1727
1728                 i_head = ipipe->head;
1729                 o_tail = opipe->tail;
1730
1731                 if (pipe_empty(i_head, i_tail) && !ipipe->writers)
1732                         break;
1733
1734                 /*
1735                  * Cannot make any progress, because either the input
1736                  * pipe is empty or the output pipe is full.
1737                  */
1738                 if (pipe_empty(i_head, i_tail) ||
1739                     pipe_full(o_head, o_tail, opipe->max_usage)) {
1740                         /* Already processed some buffers, break */
1741                         if (ret)
1742                                 break;
1743
1744                         if (flags & SPLICE_F_NONBLOCK) {
1745                                 ret = -EAGAIN;
1746                                 break;
1747                         }
1748
1749                         /*
1750                          * We raced with another reader/writer and haven't
1751                          * managed to process any buffers.  A zero return
1752                          * value means EOF, so retry instead.
1753                          */
1754                         pipe_unlock(ipipe);
1755                         pipe_unlock(opipe);
1756                         goto retry;
1757                 }
1758
1759                 ibuf = &ipipe->bufs[i_tail & i_mask];
1760                 obuf = &opipe->bufs[o_head & o_mask];
1761
1762                 if (len >= ibuf->len) {
1763                         /*
1764                          * Simply move the whole buffer from ipipe to opipe
1765                          */
1766                         *obuf = *ibuf;
1767                         ibuf->ops = NULL;
1768                         i_tail++;
1769                         ipipe->tail = i_tail;
1770                         input_wakeup = true;
1771                         o_len = obuf->len;
1772                         o_head++;
1773                         opipe->head = o_head;
1774                 } else {
1775                         /*
1776                          * Get a reference to this pipe buffer,
1777                          * so we can copy the contents over.
1778                          */
1779                         if (!pipe_buf_get(ipipe, ibuf)) {
1780                                 if (ret == 0)
1781                                         ret = -EFAULT;
1782                                 break;
1783                         }
1784                         *obuf = *ibuf;
1785
1786                         /*
1787                          * Don't inherit the gift and merge flags, we need to
1788                          * prevent multiple steals of this page.
1789                          */
1790                         obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1791                         obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1792
1793                         obuf->len = len;
1794                         ibuf->offset += len;
1795                         ibuf->len -= len;
1796                         o_len = len;
1797                         o_head++;
1798                         opipe->head = o_head;
1799                 }
1800                 ret += o_len;
1801                 len -= o_len;
1802         } while (len);
1803
1804         pipe_unlock(ipipe);
1805         pipe_unlock(opipe);
1806
1807         /*
1808          * If we put data in the output pipe, wakeup any potential readers.
1809          */
1810         if (ret > 0)
1811                 wakeup_pipe_readers(opipe);
1812
1813         if (input_wakeup)
1814                 wakeup_pipe_writers(ipipe);
1815
1816         return ret;
1817 }
1818
1819 /*
1820  * Link contents of ipipe to opipe.
1821  */
1822 static int link_pipe(struct pipe_inode_info *ipipe,
1823                      struct pipe_inode_info *opipe,
1824                      size_t len, unsigned int flags)
1825 {
1826         struct pipe_buffer *ibuf, *obuf;
1827         unsigned int i_head, o_head;
1828         unsigned int i_tail, o_tail;
1829         unsigned int i_mask, o_mask;
1830         int ret = 0;
1831
1832         /*
1833          * Potential ABBA deadlock, work around it by ordering lock
1834          * grabbing by pipe info address. Otherwise two different processes
1835          * could deadlock (one doing tee from A -> B, the other from B -> A).
1836          */
1837         pipe_double_lock(ipipe, opipe);
1838
1839         i_tail = ipipe->tail;
1840         i_mask = ipipe->ring_size - 1;
1841         o_head = opipe->head;
1842         o_mask = opipe->ring_size - 1;
1843
1844         do {
1845                 if (!opipe->readers) {
1846                         send_sig(SIGPIPE, current, 0);
1847                         if (!ret)
1848                                 ret = -EPIPE;
1849                         break;
1850                 }
1851
1852                 i_head = ipipe->head;
1853                 o_tail = opipe->tail;
1854
1855                 /*
1856                  * If we have iterated all input buffers or run out of
1857                  * output room, break.
1858                  */
1859                 if (pipe_empty(i_head, i_tail) ||
1860                     pipe_full(o_head, o_tail, opipe->max_usage))
1861                         break;
1862
1863                 ibuf = &ipipe->bufs[i_tail & i_mask];
1864                 obuf = &opipe->bufs[o_head & o_mask];
1865
1866                 /*
1867                  * Get a reference to this pipe buffer,
1868                  * so we can copy the contents over.
1869                  */
1870                 if (!pipe_buf_get(ipipe, ibuf)) {
1871                         if (ret == 0)
1872                                 ret = -EFAULT;
1873                         break;
1874                 }
1875
1876                 *obuf = *ibuf;
1877
1878                 /*
1879                  * Don't inherit the gift and merge flag, we need to prevent
1880                  * multiple steals of this page.
1881                  */
1882                 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1883                 obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1884
1885                 if (obuf->len > len)
1886                         obuf->len = len;
1887                 ret += obuf->len;
1888                 len -= obuf->len;
1889
1890                 o_head++;
1891                 opipe->head = o_head;
1892                 i_tail++;
1893         } while (len);
1894
1895         pipe_unlock(ipipe);
1896         pipe_unlock(opipe);
1897
1898         /*
1899          * If we put data in the output pipe, wakeup any potential readers.
1900          */
1901         if (ret > 0)
1902                 wakeup_pipe_readers(opipe);
1903
1904         return ret;
1905 }
1906
1907 /*
1908  * This is a tee(1) implementation that works on pipes. It doesn't copy
1909  * any data, it simply references the 'in' pages on the 'out' pipe.
1910  * The 'flags' used are the SPLICE_F_* variants, currently the only
1911  * applicable one is SPLICE_F_NONBLOCK.
1912  */
1913 long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
1914 {
1915         struct pipe_inode_info *ipipe = get_pipe_info(in, true);
1916         struct pipe_inode_info *opipe = get_pipe_info(out, true);
1917         int ret = -EINVAL;
1918
1919         if (unlikely(!(in->f_mode & FMODE_READ) ||
1920                      !(out->f_mode & FMODE_WRITE)))
1921                 return -EBADF;
1922
1923         /*
1924          * Duplicate the contents of ipipe to opipe without actually
1925          * copying the data.
1926          */
1927         if (ipipe && opipe && ipipe != opipe) {
1928                 if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1929                         flags |= SPLICE_F_NONBLOCK;
1930
1931                 /*
1932                  * Keep going, unless we encounter an error. The ipipe/opipe
1933                  * ordering doesn't really matter.
1934                  */
1935                 ret = ipipe_prep(ipipe, flags);
1936                 if (!ret) {
1937                         ret = opipe_prep(opipe, flags);
1938                         if (!ret)
1939                                 ret = link_pipe(ipipe, opipe, len, flags);
1940                 }
1941         }
1942
1943         return ret;
1944 }
1945
1946 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1947 {
1948         struct fd in, out;
1949         int error;
1950
1951         if (unlikely(flags & ~SPLICE_F_ALL))
1952                 return -EINVAL;
1953
1954         if (unlikely(!len))
1955                 return 0;
1956
1957         error = -EBADF;
1958         in = fdget(fdin);
1959         if (in.file) {
1960                 out = fdget(fdout);
1961                 if (out.file) {
1962                         error = do_tee(in.file, out.file, len, flags);
1963                         fdput(out);
1964                 }
1965                 fdput(in);
1966         }
1967
1968         return error;
1969 }