fs/netfs/read_helper.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /* Network filesystem high-level read support.
   3  *
   4  * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
   5  * Written by David Howells (dhowells@redhat.com)
   6  */
   7
   8 #include <linux/module.h>
   9 #include <linux/export.h>
  10 #include <linux/fs.h>
  11 #include <linux/mm.h>
  12 #include <linux/pagemap.h>
  13 #include <linux/slab.h>
  14 #include <linux/uio.h>
  15 #include <linux/sched/mm.h>
  16 #include <linux/task_io_accounting_ops.h>
  17 #include "internal.h"
  18 #define CREATE_TRACE_POINTS
  19 #include <trace/events/netfs.h>
  20
  21 MODULE_DESCRIPTION("Network fs support");
  22 MODULE_AUTHOR("Red Hat, Inc.");
  23 MODULE_LICENSE("GPL");
  24
  25 unsigned netfs_debug;
  26 module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
  27 MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
  28
  29 /*
  30  * Clear the unread part of an I/O request.
  31  */
  32 static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
  33 {
  34         struct iov_iter iter;
  35
  36         iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages,
  37                         subreq->start + subreq->transferred,
  38                         subreq->len   - subreq->transferred);
  39         iov_iter_zero(iov_iter_count(&iter), &iter);
  40 }
  41
  42 static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
  43                                         bool was_async)
  44 {
  45         struct netfs_io_subrequest *subreq = priv;
  46
  47         netfs_subreq_terminated(subreq, transferred_or_error, was_async);
  48 }
  49
  50 /*
  51  * Issue a read against the cache.
  52  * - Eats the caller's ref on subreq.
  53  */
  54 static void netfs_read_from_cache(struct netfs_io_request *rreq,
  55                                   struct netfs_io_subrequest *subreq,
  56                                   enum netfs_read_from_hole read_hole)
  57 {
  58         struct netfs_cache_resources *cres = &rreq->cache_resources;
  59         struct iov_iter iter;
  60
  61         netfs_stat(&netfs_n_rh_read);
  62         iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
  63                         subreq->start + subreq->transferred,
  64                         subreq->len   - subreq->transferred);
  65
  66         cres->ops->read(cres, subreq->start, &iter, read_hole,
  67                         netfs_cache_read_terminated, subreq);
  68 }
  69
  70 /*
  71  * Fill a subrequest region with zeroes.
  72  */
  73 static void netfs_fill_with_zeroes(struct netfs_io_request *rreq,
  74                                    struct netfs_io_subrequest *subreq)
  75 {
  76         netfs_stat(&netfs_n_rh_zero);
  77         __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
  78         netfs_subreq_terminated(subreq, 0, false);
  79 }
  80
  81 /*
  82  * Ask the netfs to issue a read request to the server for us.
  83  *
  84  * The netfs is expected to read from subreq->pos + subreq->transferred to
  85  * subreq->pos + subreq->len - 1.  It may not backtrack and write data into the
  86  * buffer prior to the transferred point as it might clobber dirty data
  87  * obtained from the cache.
  88  *
  89  * Alternatively, the netfs is allowed to indicate one of two things:
  90  *
  91  * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and
  92  *   make progress.
  93  *
  94  * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be
  95  *   cleared.
  96  */
  97 static void netfs_read_from_server(struct netfs_io_request *rreq,
  98                                    struct netfs_io_subrequest *subreq)
  99 {
 100         netfs_stat(&netfs_n_rh_download);
 101         rreq->netfs_ops->issue_read(subreq);
 102 }
 103
 104 /*
 105  * Release those waiting.
 106  */
 107 static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async)
 108 {
 109         trace_netfs_rreq(rreq, netfs_rreq_trace_done);
 110         netfs_clear_subrequests(rreq, was_async);
 111         netfs_put_request(rreq, was_async, netfs_rreq_trace_put_complete);
 112 }
 113
 114 /*
 115  * Deal with the completion of writing the data to the cache.  We have to clear
 116  * the PG_fscache bits on the folios involved and release the caller's ref.
 117  *
 118  * May be called in softirq mode and we inherit a ref from the caller.
 119  */
 120 static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq,
 121                                           bool was_async)
 122 {
 123         struct netfs_io_subrequest *subreq;
 124         struct folio *folio;
 125         pgoff_t unlocked = 0;
 126         bool have_unlocked = false;
 127
 128         rcu_read_lock();
 129
 130         list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
 131                 XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE);
 132
 133                 xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
 134                         /* We might have multiple writes from the same huge
 135                          * folio, but we mustn't unlock a folio more than once.
 136                          */
 137                         if (have_unlocked && folio_index(folio) <= unlocked)
 138                                 continue;
 139                         unlocked = folio_index(folio);
 140                         folio_end_fscache(folio);
 141                         have_unlocked = true;
 142                 }
 143         }
 144
 145         rcu_read_unlock();
 146         netfs_rreq_completed(rreq, was_async);
 147 }
 148
 149 static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error,
 150                                        bool was_async)
 151 {
 152         struct netfs_io_subrequest *subreq = priv;
 153         struct netfs_io_request *rreq = subreq->rreq;
 154
 155         if (IS_ERR_VALUE(transferred_or_error)) {
 156                 netfs_stat(&netfs_n_rh_write_failed);
 157                 trace_netfs_failure(rreq, subreq, transferred_or_error,
 158                                     netfs_fail_copy_to_cache);
 159         } else {
 160                 netfs_stat(&netfs_n_rh_write_done);
 161         }
 162
 163         trace_netfs_sreq(subreq, netfs_sreq_trace_write_term);
 164
 165         /* If we decrement nr_copy_ops to 0, the ref belongs to us. */
 166         if (atomic_dec_and_test(&rreq->nr_copy_ops))
 167                 netfs_rreq_unmark_after_write(rreq, was_async);
 168
 169         netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
 170 }
 171
 172 /*
 173  * Perform any outstanding writes to the cache.  We inherit a ref from the
 174  * caller.
 175  */
 176 static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
 177 {
 178         struct netfs_cache_resources *cres = &rreq->cache_resources;
 179         struct netfs_io_subrequest *subreq, *next, *p;
 180         struct iov_iter iter;
 181         int ret;
 182
 183         trace_netfs_rreq(rreq, netfs_rreq_trace_copy);
 184
 185         /* We don't want terminating writes trying to wake us up whilst we're
 186          * still going through the list.
 187          */
 188         atomic_inc(&rreq->nr_copy_ops);
 189
 190         list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) {
 191                 if (!test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
 192                         list_del_init(&subreq->rreq_link);
 193                         netfs_put_subrequest(subreq, false,
 194                                              netfs_sreq_trace_put_no_copy);
 195                 }
 196         }
 197
 198         list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
 199                 /* Amalgamate adjacent writes */
 200                 while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
 201                         next = list_next_entry(subreq, rreq_link);
 202                         if (next->start != subreq->start + subreq->len)
 203                                 break;
 204                         subreq->len += next->len;
 205                         list_del_init(&next->rreq_link);
 206                         netfs_put_subrequest(next, false,
 207                                              netfs_sreq_trace_put_merged);
 208                 }
 209
 210                 ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
 211                                                rreq->i_size, true);
 212                 if (ret < 0) {
 213                         trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
 214                         trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
 215                         continue;
 216                 }
 217
 218                 iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages,
 219                                 subreq->start, subreq->len);
 220
 221                 atomic_inc(&rreq->nr_copy_ops);
 222                 netfs_stat(&netfs_n_rh_write);
 223                 netfs_get_subrequest(subreq, netfs_sreq_trace_get_copy_to_cache);
 224                 trace_netfs_sreq(subreq, netfs_sreq_trace_write);
 225                 cres->ops->write(cres, subreq->start, &iter,
 226                                  netfs_rreq_copy_terminated, subreq);
 227         }
 228
 229         /* If we decrement nr_copy_ops to 0, the usage ref belongs to us. */
 230         if (atomic_dec_and_test(&rreq->nr_copy_ops))
 231                 netfs_rreq_unmark_after_write(rreq, false);
 232 }
 233
 234 static void netfs_rreq_write_to_cache_work(struct work_struct *work)
 235 {
 236         struct netfs_io_request *rreq =
 237                 container_of(work, struct netfs_io_request, work);
 238
 239         netfs_rreq_do_write_to_cache(rreq);
 240 }
 241
 242 static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq)
 243 {
 244         rreq->work.func = netfs_rreq_write_to_cache_work;
 245         if (!queue_work(system_unbound_wq, &rreq->work))
 246                 BUG();
 247 }
 248
 249 /*
 250  * Unlock the folios in a read operation.  We need to set PG_fscache on any
 251  * folios we're going to write back before we unlock them.
 252  */
 253 void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
 254 {
 255         struct netfs_io_subrequest *subreq;
 256         struct folio *folio;
 257         unsigned int iopos, account = 0;
 258         pgoff_t start_page = rreq->start / PAGE_SIZE;
 259         pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
 260         bool subreq_failed = false;
 261
 262         XA_STATE(xas, &rreq->mapping->i_pages, start_page);
 263
 264         if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
 265                 __clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
 266                 list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
 267                         __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
 268                 }
 269         }
 270
 271         /* Walk through the pagecache and the I/O request lists simultaneously.
 272          * We may have a mixture of cached and uncached sections and we only
 273          * really want to write out the uncached sections.  This is slightly
 274          * complicated by the possibility that we might have huge pages with a
 275          * mixture inside.
 276          */
 277         subreq = list_first_entry(&rreq->subrequests,
 278                                   struct netfs_io_subrequest, rreq_link);
 279         iopos = 0;
 280         subreq_failed = (subreq->error < 0);
 281
 282         trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
 283
 284         rcu_read_lock();
 285         xas_for_each(&xas, folio, last_page) {
 286                 unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE;
 287                 unsigned int pgend = pgpos + folio_size(folio);
 288                 bool pg_failed = false;
 289
 290                 for (;;) {
 291                         if (!subreq) {
 292                                 pg_failed = true;
 293                                 break;
 294                         }
 295                         if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
 296                                 folio_start_fscache(folio);
 297                         pg_failed |= subreq_failed;
 298                         if (pgend < iopos + subreq->len)
 299                                 break;
 300
 301                         account += subreq->transferred;
 302                         iopos += subreq->len;
 303                         if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
 304                                 subreq = list_next_entry(subreq, rreq_link);
 305                                 subreq_failed = (subreq->error < 0);
 306                         } else {
 307                                 subreq = NULL;
 308                                 subreq_failed = false;
 309                         }
 310                         if (pgend == iopos)
 311                                 break;
 312                 }
 313
 314                 if (!pg_failed) {
 315                         flush_dcache_folio(folio);
 316                         folio_mark_uptodate(folio);
 317                 }
 318
 319                 if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
 320                         if (folio_index(folio) == rreq->no_unlock_folio &&
 321                             test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
 322                                 _debug("no unlock");
 323                         else
 324                                 folio_unlock(folio);
 325                 }
 326         }
 327         rcu_read_unlock();
 328
 329         task_io_account_read(account);
 330         if (rreq->netfs_ops->done)
 331                 rreq->netfs_ops->done(rreq);
 332 }
 333
 334 /*
 335  * Handle a short read.
 336  */
 337 static void netfs_rreq_short_read(struct netfs_io_request *rreq,
 338                                   struct netfs_io_subrequest *subreq)
 339 {
 340         __clear_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
 341         __set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags);
 342
 343         netfs_stat(&netfs_n_rh_short_read);
 344         trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short);
 345
 346         netfs_get_subrequest(subreq, netfs_sreq_trace_get_short_read);
 347         atomic_inc(&rreq->nr_outstanding);
 348         if (subreq->source == NETFS_READ_FROM_CACHE)
 349                 netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_CLEAR);
 350         else
 351                 netfs_read_from_server(rreq, subreq);
 352 }
 353
 354 /*
 355  * Resubmit any short or failed operations.  Returns true if we got the rreq
 356  * ref back.
 357  */
 358 static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq)
 359 {
 360         struct netfs_io_subrequest *subreq;
 361
 362         WARN_ON(in_interrupt());
 363
 364         trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
 365
 366         /* We don't want terminating submissions trying to wake us up whilst
 367          * we're still going through the list.
 368          */
 369         atomic_inc(&rreq->nr_outstanding);
 370
 371         __clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
 372         list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
 373                 if (subreq->error) {
 374                         if (subreq->source != NETFS_READ_FROM_CACHE)
 375                                 break;
 376                         subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
 377                         subreq->error = 0;
 378                         netfs_stat(&netfs_n_rh_download_instead);
 379                         trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
 380                         netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
 381                         atomic_inc(&rreq->nr_outstanding);
 382                         netfs_read_from_server(rreq, subreq);
 383                 } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) {
 384                         netfs_rreq_short_read(rreq, subreq);
 385                 }
 386         }
 387
 388         /* If we decrement nr_outstanding to 0, the usage ref belongs to us. */
 389         if (atomic_dec_and_test(&rreq->nr_outstanding))
 390                 return true;
 391
 392         wake_up_var(&rreq->nr_outstanding);
 393         return false;
 394 }
 395
 396 /*
 397  * Check to see if the data read is still valid.
 398  */
 399 static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq)
 400 {
 401         struct netfs_io_subrequest *subreq;
 402
 403         if (!rreq->netfs_ops->is_still_valid ||
 404             rreq->netfs_ops->is_still_valid(rreq))
 405                 return;
 406
 407         list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
 408                 if (subreq->source == NETFS_READ_FROM_CACHE) {
 409                         subreq->error = -ESTALE;
 410                         __set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
 411                 }
 412         }
 413 }
 414
 415 /*
 416  * Assess the state of a read request and decide what to do next.
 417  *
 418  * Note that we could be in an ordinary kernel thread, on a workqueue or in
 419  * softirq context at this point.  We inherit a ref from the caller.
 420  */
 421 static void netfs_rreq_assess(struct netfs_io_request *rreq, bool was_async)
 422 {
 423         trace_netfs_rreq(rreq, netfs_rreq_trace_assess);
 424
 425 again:
 426         netfs_rreq_is_still_valid(rreq);
 427
 428         if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) &&
 429             test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) {
 430                 if (netfs_rreq_perform_resubmissions(rreq))
 431                         goto again;
 432                 return;
 433         }
 434
 435         netfs_rreq_unlock_folios(rreq);
 436
 437         clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
 438         wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
 439
 440         if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags))
 441                 return netfs_rreq_write_to_cache(rreq);
 442
 443         netfs_rreq_completed(rreq, was_async);
 444 }
 445
 446 static void netfs_rreq_work(struct work_struct *work)
 447 {
 448         struct netfs_io_request *rreq =
 449                 container_of(work, struct netfs_io_request, work);
 450         netfs_rreq_assess(rreq, false);
 451 }
 452
 453 /*
 454  * Handle the completion of all outstanding I/O operations on a read request.
 455  * We inherit a ref from the caller.
 456  */
 457 static void netfs_rreq_terminated(struct netfs_io_request *rreq,
 458                                   bool was_async)
 459 {
 460         if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) &&
 461             was_async) {
 462                 if (!queue_work(system_unbound_wq, &rreq->work))
 463                         BUG();
 464         } else {
 465                 netfs_rreq_assess(rreq, was_async);
 466         }
 467 }
 468
 469 /**
 470  * netfs_subreq_terminated - Note the termination of an I/O operation.
 471  * @subreq: The I/O request that has terminated.
 472  * @transferred_or_error: The amount of data transferred or an error code.
 473  * @was_async: The termination was asynchronous
 474  *
 475  * This tells the read helper that a contributory I/O operation has terminated,
 476  * one way or another, and that it should integrate the results.
 477  *
 478  * The caller indicates in @transferred_or_error the outcome of the operation,
 479  * supplying a positive value to indicate the number of bytes transferred, 0 to
 480  * indicate a failure to transfer anything that should be retried or a negative
 481  * error code.  The helper will look after reissuing I/O operations as
 482  * appropriate and writing downloaded data to the cache.
 483  *
 484  * If @was_async is true, the caller might be running in softirq or interrupt
 485  * context and we can't sleep.
 486  */
 487 void netfs_subreq_terminated(struct netfs_io_subrequest *subreq,
 488                              ssize_t transferred_or_error,
 489                              bool was_async)
 490 {
 491         struct netfs_io_request *rreq = subreq->rreq;
 492         int u;
 493
 494         _enter("[%u]{%llx,%lx},%zd",
 495                subreq->debug_index, subreq->start, subreq->flags,
 496                transferred_or_error);
 497
 498         switch (subreq->source) {
 499         case NETFS_READ_FROM_CACHE:
 500                 netfs_stat(&netfs_n_rh_read_done);
 501                 break;
 502         case NETFS_DOWNLOAD_FROM_SERVER:
 503                 netfs_stat(&netfs_n_rh_download_done);
 504                 break;
 505         default:
 506                 break;
 507         }
 508
 509         if (IS_ERR_VALUE(transferred_or_error)) {
 510                 subreq->error = transferred_or_error;
 511                 trace_netfs_failure(rreq, subreq, transferred_or_error,
 512                                     netfs_fail_read);
 513                 goto failed;
 514         }
 515
 516         if (WARN(transferred_or_error > subreq->len - subreq->transferred,
 517                  "Subreq overread: R%x[%x] %zd > %zu - %zu",
 518                  rreq->debug_id, subreq->debug_index,
 519                  transferred_or_error, subreq->len, subreq->transferred))
 520                 transferred_or_error = subreq->len - subreq->transferred;
 521
 522         subreq->error = 0;
 523         subreq->transferred += transferred_or_error;
 524         if (subreq->transferred < subreq->len)
 525                 goto incomplete;
 526
 527 complete:
 528         __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
 529         if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
 530                 set_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
 531
 532 out:
 533         trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
 534
 535         /* If we decrement nr_outstanding to 0, the ref belongs to us. */
 536         u = atomic_dec_return(&rreq->nr_outstanding);
 537         if (u == 0)
 538                 netfs_rreq_terminated(rreq, was_async);
 539         else if (u == 1)
 540                 wake_up_var(&rreq->nr_outstanding);
 541
 542         netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
 543         return;
 544
 545 incomplete:
 546         if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) {
 547                 netfs_clear_unread(subreq);
 548                 subreq->transferred = subreq->len;
 549                 goto complete;
 550         }
 551
 552         if (transferred_or_error == 0) {
 553                 if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
 554                         subreq->error = -ENODATA;
 555                         goto failed;
 556                 }
 557         } else {
 558                 __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
 559         }
 560
 561         __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
 562         set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
 563         goto out;
 564
 565 failed:
 566         if (subreq->source == NETFS_READ_FROM_CACHE) {
 567                 netfs_stat(&netfs_n_rh_read_failed);
 568                 set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
 569         } else {
 570                 netfs_stat(&netfs_n_rh_download_failed);
 571                 set_bit(NETFS_RREQ_FAILED, &rreq->flags);
 572                 rreq->error = subreq->error;
 573         }
 574         goto out;
 575 }
 576 EXPORT_SYMBOL(netfs_subreq_terminated);
 577
 578 static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest *subreq,
 579                                                        loff_t i_size)
 580 {
 581         struct netfs_io_request *rreq = subreq->rreq;
 582         struct netfs_cache_resources *cres = &rreq->cache_resources;
 583
 584         if (cres->ops)
 585                 return cres->ops->prepare_read(subreq, i_size);
 586         if (subreq->start >= rreq->i_size)
 587                 return NETFS_FILL_WITH_ZEROES;
 588         return NETFS_DOWNLOAD_FROM_SERVER;
 589 }
 590
 591 /*
 592  * Work out what sort of subrequest the next one will be.
 593  */
 594 static enum netfs_io_source
 595 netfs_rreq_prepare_read(struct netfs_io_request *rreq,
 596                         struct netfs_io_subrequest *subreq)
 597 {
 598         enum netfs_io_source source;
 599
 600         _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
 601
 602         source = netfs_cache_prepare_read(subreq, rreq->i_size);
 603         if (source == NETFS_INVALID_READ)
 604                 goto out;
 605
 606         if (source == NETFS_DOWNLOAD_FROM_SERVER) {
 607                 /* Call out to the netfs to let it shrink the request to fit
 608                  * its own I/O sizes and boundaries.  If it shinks it here, it
 609                  * will be called again to make simultaneous calls; if it wants
 610                  * to make serial calls, it can indicate a short read and then
 611                  * we will call it again.
 612                  */
 613                 if (subreq->len > rreq->i_size - subreq->start)
 614                         subreq->len = rreq->i_size - subreq->start;
 615
 616                 if (rreq->netfs_ops->clamp_length &&
 617                     !rreq->netfs_ops->clamp_length(subreq)) {
 618                         source = NETFS_INVALID_READ;
 619                         goto out;
 620                 }
 621         }
 622
 623         if (WARN_ON(subreq->len == 0))
 624                 source = NETFS_INVALID_READ;
 625
 626 out:
 627         subreq->source = source;
 628         trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
 629         return source;
 630 }
 631
 632 /*
 633  * Slice off a piece of a read request and submit an I/O request for it.
 634  */
 635 static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
 636                                     unsigned int *_debug_index)
 637 {
 638         struct netfs_io_subrequest *subreq;
 639         enum netfs_io_source source;
 640
 641         subreq = netfs_alloc_subrequest(rreq);
 642         if (!subreq)
 643                 return false;
 644
 645         subreq->debug_index     = (*_debug_index)++;
 646         subreq->start           = rreq->start + rreq->submitted;
 647         subreq->len             = rreq->len   - rreq->submitted;
 648
 649         _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
 650         list_add_tail(&subreq->rreq_link, &rreq->subrequests);
 651
 652         /* Call out to the cache to find out what it can do with the remaining
 653          * subset.  It tells us in subreq->flags what it decided should be done
 654          * and adjusts subreq->len down if the subset crosses a cache boundary.
 655          *
 656          * Then when we hand the subset, it can choose to take a subset of that
 657          * (the starts must coincide), in which case, we go around the loop
 658          * again and ask it to download the next piece.
 659          */
 660         source = netfs_rreq_prepare_read(rreq, subreq);
 661         if (source == NETFS_INVALID_READ)
 662                 goto subreq_failed;
 663
 664         atomic_inc(&rreq->nr_outstanding);
 665
 666         rreq->submitted += subreq->len;
 667
 668         trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
 669         switch (source) {
 670         case NETFS_FILL_WITH_ZEROES:
 671                 netfs_fill_with_zeroes(rreq, subreq);
 672                 break;
 673         case NETFS_DOWNLOAD_FROM_SERVER:
 674                 netfs_read_from_server(rreq, subreq);
 675                 break;
 676         case NETFS_READ_FROM_CACHE:
 677                 netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_IGNORE);
 678                 break;
 679         default:
 680                 BUG();
 681         }
 682
 683         return true;
 684
 685 subreq_failed:
 686         rreq->error = subreq->error;
 687         netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_failed);
 688         return false;
 689 }
 690
 691 /*
 692  * Begin the process of reading in a chunk of data, where that data may be
 693  * stitched together from multiple sources, including multiple servers and the
 694  * local cache.
 695  */
 696 int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
 697 {
 698         unsigned int debug_index = 0;
 699         int ret;
 700
 701         _enter("R=%x %llx-%llx",
 702                rreq->debug_id, rreq->start, rreq->start + rreq->len - 1);
 703
 704         if (rreq->len == 0) {
 705                 pr_err("Zero-sized read [R=%x]\n", rreq->debug_id);
 706                 netfs_put_request(rreq, false, netfs_rreq_trace_put_zero_len);
 707                 return -EIO;
 708         }
 709
 710         INIT_WORK(&rreq->work, netfs_rreq_work);
 711
 712         if (sync)
 713                 netfs_get_request(rreq, netfs_rreq_trace_get_hold);
 714
 715         /* Chop the read into slices according to what the cache and the netfs
 716          * want and submit each one.
 717          */
 718         atomic_set(&rreq->nr_outstanding, 1);
 719         do {
 720                 if (!netfs_rreq_submit_slice(rreq, &debug_index))
 721                         break;
 722
 723         } while (rreq->submitted < rreq->len);
 724
 725         if (sync) {
 726                 /* Keep nr_outstanding incremented so that the ref always belongs to
 727                  * us, and the service code isn't punted off to a random thread pool to
 728                  * process.
 729                  */
 730                 for (;;) {
 731                         wait_var_event(&rreq->nr_outstanding,
 732                                        atomic_read(&rreq->nr_outstanding) == 1);
 733                         netfs_rreq_assess(rreq, false);
 734                         if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
 735                                 break;
 736                         cond_resched();
 737                 }
 738
 739                 ret = rreq->error;
 740                 if (ret == 0 && rreq->submitted < rreq->len) {
 741                         trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
 742                         ret = -EIO;
 743                 }
 744                 netfs_put_request(rreq, false, netfs_rreq_trace_put_hold);
 745         } else {
 746                 /* If we decrement nr_outstanding to 0, the ref belongs to us. */
 747                 if (atomic_dec_and_test(&rreq->nr_outstanding))
 748                         netfs_rreq_assess(rreq, false);
 749                 ret = 0;
 750         }
 751         return ret;
 752 }
 753
 754 static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
 755                                          loff_t *_start, size_t *_len, loff_t i_size)
 756 {
 757         struct netfs_cache_resources *cres = &rreq->cache_resources;
 758
 759         if (cres->ops && cres->ops->expand_readahead)
 760                 cres->ops->expand_readahead(cres, _start, _len, i_size);
 761 }
 762
 763 static void netfs_rreq_expand(struct netfs_io_request *rreq,
 764                               struct readahead_control *ractl)
 765 {
 766         /* Give the cache a chance to change the request parameters.  The
 767          * resultant request must contain the original region.
 768          */
 769         netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);
 770
 771         /* Give the netfs a chance to change the request parameters.  The
 772          * resultant request must contain the original region.
 773          */
 774         if (rreq->netfs_ops->expand_readahead)
 775                 rreq->netfs_ops->expand_readahead(rreq);
 776
 777         /* Expand the request if the cache wants it to start earlier.  Note
 778          * that the expansion may get further extended if the VM wishes to
 779          * insert THPs and the preferred start and/or end wind up in the middle
 780          * of THPs.
 781          *
 782          * If this is the case, however, the THP size should be an integer
 783          * multiple of the cache granule size, so we get a whole number of
 784          * granules to deal with.
 785          */
 786         if (rreq->start  != readahead_pos(ractl) ||
 787             rreq->len != readahead_length(ractl)) {
 788                 readahead_expand(ractl, rreq->start, rreq->len);
 789                 rreq->start  = readahead_pos(ractl);
 790                 rreq->len = readahead_length(ractl);
 791
 792                 trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
 793                                  netfs_read_trace_expanded);
 794         }
 795 }
 796
 797 /**
 798  * netfs_readahead - Helper to manage a read request
 799  * @ractl: The description of the readahead request
 800  *
 801  * Fulfil a readahead request by drawing data from the cache if possible, or
 802  * the netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O
 803  * requests from different sources will get munged together.  If necessary, the
 804  * readahead window can be expanded in either direction to a more convenient
 805  * alighment for RPC efficiency or to make storage in the cache feasible.
 806  *
 807  * The calling netfs must initialise a netfs context contiguous to the vfs
 808  * inode before calling this.
 809  *
 810  * This is usable whether or not caching is enabled.
 811  */
 812 void netfs_readahead(struct readahead_control *ractl)
 813 {
 814         struct netfs_io_request *rreq;
 815         struct netfs_i_context *ctx = netfs_i_context(ractl->mapping->host);
 816         int ret;
 817
 818         _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
 819
 820         if (readahead_count(ractl) == 0)
 821                 return;
 822
 823         rreq = netfs_alloc_request(ractl->mapping, ractl->file,
 824                                    readahead_pos(ractl),
 825                                    readahead_length(ractl),
 826                                    NETFS_READAHEAD);
 827         if (IS_ERR(rreq))
 828                 return;
 829
 830         if (ctx->ops->begin_cache_operation) {
 831                 ret = ctx->ops->begin_cache_operation(rreq);
 832                 if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
 833                         goto cleanup_free;
 834         }
 835
 836         netfs_stat(&netfs_n_rh_readahead);
 837         trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
 838                          netfs_read_trace_readahead);
 839
 840         netfs_rreq_expand(rreq, ractl);
 841
 842         /* Drop the refs on the folios here rather than in the cache or
 843          * filesystem.  The locks will be dropped in netfs_rreq_unlock().
 844          */
 845         while (readahead_folio(ractl))
 846                 ;
 847
 848         netfs_begin_read(rreq, false);
 849         return;
 850
 851 cleanup_free:
 852         netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
 853         return;
 854 }
 855 EXPORT_SYMBOL(netfs_readahead);
 856
 857 /**
 858  * netfs_readpage - Helper to manage a readpage request
 859  * @file: The file to read from
 860  * @subpage: A subpage of the folio to read
 861  *
 862  * Fulfil a readpage request by drawing data from the cache if possible, or the
 863  * netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O requests
 864  * from different sources will get munged together.
 865  *
 866  * The calling netfs must initialise a netfs context contiguous to the vfs
 867  * inode before calling this.
 868  *
 869  * This is usable whether or not caching is enabled.
 870  */
 871 int netfs_readpage(struct file *file, struct page *subpage)
 872 {
 873         struct folio *folio = page_folio(subpage);
 874         struct address_space *mapping = folio->mapping;
 875         struct netfs_io_request *rreq;
 876         struct netfs_i_context *ctx = netfs_i_context(mapping->host);
 877         int ret;
 878
 879         _enter("%lx", folio_index(folio));
 880
 881         rreq = netfs_alloc_request(mapping, file,
 882                                    folio_file_pos(folio), folio_size(folio),
 883                                    NETFS_READPAGE);
 884         if (IS_ERR(rreq)) {
 885                 ret = PTR_ERR(rreq);
 886                 goto alloc_error;
 887         }
 888
 889         if (ctx->ops->begin_cache_operation) {
 890                 ret = ctx->ops->begin_cache_operation(rreq);
 891                 if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
 892                         goto discard;
 893         }
 894
 895         netfs_stat(&netfs_n_rh_readpage);
 896         trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
 897         return netfs_begin_read(rreq, true);
 898
 899 discard:
 900         netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
 901 alloc_error:
 902         folio_unlock(folio);
 903         return ret;
 904 }
 905 EXPORT_SYMBOL(netfs_readpage);
 906
 907 /*
 908  * Prepare a folio for writing without reading first
 909  * @folio: The folio being prepared
 910  * @pos: starting position for the write
 911  * @len: length of write
 912  * @always_fill: T if the folio should always be completely filled/cleared
 913  *
 914  * In some cases, write_begin doesn't need to read at all:
 915  * - full folio write
 916  * - write that lies in a folio that is completely beyond EOF
 917  * - write that covers the folio from start to EOF or beyond it
 918  *
 919  * If any of these criteria are met, then zero out the unwritten parts
 920  * of the folio and return true. Otherwise, return false.
 921  */
 922 static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
 923                                  bool always_fill)
 924 {
 925         struct inode *inode = folio_inode(folio);
 926         loff_t i_size = i_size_read(inode);
 927         size_t offset = offset_in_folio(folio, pos);
 928         size_t plen = folio_size(folio);
 929
 930         if (unlikely(always_fill)) {
 931                 if (pos - offset + len <= i_size)
 932                         return false; /* Page entirely before EOF */
 933                 zero_user_segment(&folio->page, 0, plen);
 934                 folio_mark_uptodate(folio);
 935                 return true;
 936         }
 937
 938         /* Full folio write */
 939         if (offset == 0 && len >= plen)
 940                 return true;
 941
 942         /* Page entirely beyond the end of the file */
 943         if (pos - offset >= i_size)
 944                 goto zero_out;
 945
 946         /* Write that covers from the start of the folio to EOF or beyond */
 947         if (offset == 0 && (pos + len) >= i_size)
 948                 goto zero_out;
 949
 950         return false;
 951 zero_out:
 952         zero_user_segments(&folio->page, 0, offset, offset + len, plen);
 953         return true;
 954 }
 955
/**
 * netfs_write_begin - Helper to prepare for writing
 * @file: The file to read from
 * @mapping: The mapping to read from
 * @pos: File position at which the write will begin
 * @len: The length of the write (may extend beyond the end of the folio chosen)
 * @aop_flags: AOP_* flags
 * @_folio: Where to put the resultant folio
 * @_fsdata: Place for the netfs to store a cookie
 *
 * Pre-read data for a write-begin request by drawing data from the cache if
 * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
 * Multiple I/O requests from different sources will get munged together.  If
 * necessary, the readahead window can be expanded in either direction to a
 * more convenient alignment for RPC efficiency or to make storage in the cache
 * feasible.
 *
 * The calling netfs must provide a table of operations, only one of which,
 * issue_op, is mandatory.
 *
 * The check_write_begin() operation can be provided to check for and flush
 * conflicting writes once the folio is grabbed and locked.  It is passed a
 * pointer to the fsdata cookie that gets returned to the VM to be passed to
 * write_end.  It is permitted to sleep.  It should return 0 if the request
 * should go ahead; unlock the folio and return -EAGAIN to cause the folio to
 * be regot; or return an error.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 *
 * Return: 0 on success, with the locked folio stored in *@_folio; -ENOMEM if
 * the folio could not be obtained; otherwise a negative error code passed on
 * from ->check_write_begin(), netfs_alloc_request(), ->begin_cache_operation()
 * (-ENOMEM, -EINTR and -ERESTARTSYS only), netfs_begin_read() or
 * folio_wait_fscache_killable().  Except in the -ENOMEM case, the folio is
 * unlocked and put before an error is returned.
 */
int netfs_write_begin(struct file *file, struct address_space *mapping,
                      loff_t pos, unsigned int len, unsigned int aop_flags,
                      struct folio **_folio, void **_fsdata)
{
        struct netfs_io_request *rreq;
        struct netfs_i_context *ctx = netfs_i_context(file_inode(file ));
        struct folio *folio;
        unsigned int fgp_flags;
        pgoff_t index = pos >> PAGE_SHIFT;
        int ret;

        /* Readahead descriptor starting at the target index; it is used below
         * to let the request be expanded around the folio being written.
         */
        DEFINE_READAHEAD(ractl, file, NULL, mapping, index);

retry:
        /* Get the folio containing @pos, locked, creating it if needed. */
        fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
        if (aop_flags & AOP_FLAG_NOFS)
                fgp_flags |= FGP_NOFS;
        folio = __filemap_get_folio(mapping, index, fgp_flags,
                                    mapping_gfp_mask(mapping));
        if (!folio)
                return -ENOMEM;

        if (ctx->ops->check_write_begin) {
                /* Allow the netfs (eg. ceph) to flush conflicts. */
                ret = ctx->ops->check_write_begin(file, pos, len, folio, _fsdata);
                if (ret < 0) {
                        trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
                        /* -EAGAIN means the netfs unlocked the folio and wants
                         * it to be got again (see the function description).
                         *
                         * NOTE(review): the ref taken by __filemap_get_folio()
                         * is not dropped here before retrying, and the error
                         * path below unlocks and puts the folio whatever the
                         * op did to it - confirm who owns the folio lock and
                         * ref when ->check_write_begin() fails.
                         */
                        if (ret == -EAGAIN)
                                goto retry;
                        goto error;
                }
        }

        /* No pre-read is needed if the folio already holds valid data. */
        if (folio_test_uptodate(folio))
                goto have_folio;

        /* If the page is beyond the EOF, we want to clear it - unless it's
         * within the cache granule containing the EOF, in which case we need
         * to preload the granule.
         *
         * The read is therefore only skipped when caching is not enabled for
         * this inode.
         */
        if (!netfs_is_cache_enabled(ctx) &&
            netfs_skip_folio_read(folio, pos, len, false)) {
                netfs_stat(&netfs_n_rh_write_zskip);
                goto have_folio_no_wait;
        }

        /* Set up a read covering the whole of the folio. */
        rreq = netfs_alloc_request(mapping, file,
                                   folio_file_pos(folio), folio_size(folio),
                                   NETFS_READ_FOR_WRITE);
        if (IS_ERR(rreq)) {
                ret = PTR_ERR(rreq);
                goto error;
        }
        /* Record the index of our folio and flag the request so that this
         * one folio can be left locked for the caller - presumably honoured by
         * the completion code; verify against netfs_rreq_unlock().
         */
        rreq->no_unlock_folio   = folio_index(folio);
        __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);

        if (ctx->ops->begin_cache_operation) {
                ret = ctx->ops->begin_cache_operation(rreq);
                /* Only these three errors abort the operation; any other
                 * result is ignored and the read goes ahead.
                 */
                if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
                        goto error_put;
        }

        netfs_stat(&netfs_n_rh_write_begin);
        trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);

        /* Expand the request to meet caching requirements and download
         * preferences.
         */
        ractl._nr_pages = folio_nr_pages(folio);
        netfs_rreq_expand(rreq, &ractl);

        /* We hold the folio locks, so we can drop the references.  An extra
         * ref is taken on our own folio first so that we still hold one once
         * the loop has dropped a ref on each folio in the window.
         */
        folio_get(folio);
        while (readahead_folio(&ractl))
                ;

        /* Perform the read synchronously.
         *
         * NOTE(review): a failure skips error_put, so the request is not put
         * here - presumably netfs_begin_read() has already disposed of our
         * ref on it by the time it returns; confirm.
         */
        ret = netfs_begin_read(rreq, true);
        if (ret < 0)
                goto error;

have_folio:
        /* Killable wait on the folio's fscache state before handing the folio
         * over for modification.
         */
        ret = folio_wait_fscache_killable(folio);
        if (ret < 0)
                goto error;
have_folio_no_wait:
        /* Success: the folio is handed back still locked and with a ref. */
        *_folio = folio;
        _leave(" = 0");
        return 0;

error_put:
        netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
error:
        folio_unlock(folio);
        folio_put(folio);
        _leave(" = %d", ret);
        return ret;
}
EXPORT_SYMBOL(netfs_write_begin);