fs/netfs/read_helper.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /* Network filesystem high-level read support.
   3  *
   4  * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
   5  * Written by David Howells (dhowells@redhat.com)
   6  */
   7
   8 #include <linux/module.h>
   9 #include <linux/export.h>
  10 #include <linux/fs.h>
  11 #include <linux/mm.h>
  12 #include <linux/pagemap.h>
  13 #include <linux/slab.h>
  14 #include <linux/uio.h>
  15 #include <linux/sched/mm.h>
  16 #include <linux/task_io_accounting_ops.h>
  17 #include <linux/netfs.h>
  18 #include "internal.h"
  19 #define CREATE_TRACE_POINTS
  20 #include <trace/events/netfs.h>
  21
  22 MODULE_DESCRIPTION("Network fs support");
  23 MODULE_AUTHOR("Red Hat, Inc.");
  24 MODULE_LICENSE("GPL");
  25
  26 unsigned netfs_debug;
  27 module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
  28 MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
  29
  30 static void netfs_rreq_work(struct work_struct *);
  31 static void __netfs_put_subrequest(struct netfs_read_subrequest *, bool);
  32
  33 static void netfs_put_subrequest(struct netfs_read_subrequest *subreq,
  34                                  bool was_async)
  35 {
  36         if (refcount_dec_and_test(&subreq->usage))
  37                 __netfs_put_subrequest(subreq, was_async);
  38 }
  39
  40 static struct netfs_read_request *netfs_alloc_read_request(
  41         const struct netfs_read_request_ops *ops, void *netfs_priv,
  42         struct file *file)
  43 {
  44         static atomic_t debug_ids;
  45         struct netfs_read_request *rreq;
  46
  47         rreq = kzalloc(sizeof(struct netfs_read_request), GFP_KERNEL);
  48         if (rreq) {
  49                 rreq->netfs_ops = ops;
  50                 rreq->netfs_priv = netfs_priv;
  51                 rreq->inode     = file_inode(file);
  52                 rreq->i_size    = i_size_read(rreq->inode);
  53                 rreq->debug_id  = atomic_inc_return(&debug_ids);
  54                 INIT_LIST_HEAD(&rreq->subrequests);
  55                 INIT_WORK(&rreq->work, netfs_rreq_work);
  56                 refcount_set(&rreq->usage, 1);
  57                 __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
  58                 if (ops->init_rreq)
  59                         ops->init_rreq(rreq, file);
  60                 netfs_stat(&netfs_n_rh_rreq);
  61         }
  62
  63         return rreq;
  64 }
  65
  66 static void netfs_get_read_request(struct netfs_read_request *rreq)
  67 {
  68         refcount_inc(&rreq->usage);
  69 }
  70
  71 static void netfs_rreq_clear_subreqs(struct netfs_read_request *rreq,
  72                                      bool was_async)
  73 {
  74         struct netfs_read_subrequest *subreq;
  75
  76         while (!list_empty(&rreq->subrequests)) {
  77                 subreq = list_first_entry(&rreq->subrequests,
  78                                           struct netfs_read_subrequest, rreq_link);
  79                 list_del(&subreq->rreq_link);
  80                 netfs_put_subrequest(subreq, was_async);
  81         }
  82 }
  83
  84 static void netfs_free_read_request(struct work_struct *work)
  85 {
  86         struct netfs_read_request *rreq =
  87                 container_of(work, struct netfs_read_request, work);
  88         netfs_rreq_clear_subreqs(rreq, false);
  89         if (rreq->netfs_priv)
  90                 rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
  91         trace_netfs_rreq(rreq, netfs_rreq_trace_free);
  92         if (rreq->cache_resources.ops)
  93                 rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
  94         kfree(rreq);
  95         netfs_stat_d(&netfs_n_rh_rreq);
  96 }
  97
  98 static void netfs_put_read_request(struct netfs_read_request *rreq, bool was_async)
  99 {
 100         if (refcount_dec_and_test(&rreq->usage)) {
 101                 if (was_async) {
 102                         rreq->work.func = netfs_free_read_request;
 103                         if (!queue_work(system_unbound_wq, &rreq->work))
 104                                 BUG();
 105                 } else {
 106                         netfs_free_read_request(&rreq->work);
 107                 }
 108         }
 109 }
 110
 111 /*
 112  * Allocate and partially initialise an I/O request structure.
 113  */
 114 static struct netfs_read_subrequest *netfs_alloc_subrequest(
 115         struct netfs_read_request *rreq)
 116 {
 117         struct netfs_read_subrequest *subreq;
 118
 119         subreq = kzalloc(sizeof(struct netfs_read_subrequest), GFP_KERNEL);
 120         if (subreq) {
 121                 INIT_LIST_HEAD(&subreq->rreq_link);
 122                 refcount_set(&subreq->usage, 2);
 123                 subreq->rreq = rreq;
 124                 netfs_get_read_request(rreq);
 125                 netfs_stat(&netfs_n_rh_sreq);
 126         }
 127
 128         return subreq;
 129 }
 130
 131 static void netfs_get_read_subrequest(struct netfs_read_subrequest *subreq)
 132 {
 133         refcount_inc(&subreq->usage);
 134 }
 135
 136 static void __netfs_put_subrequest(struct netfs_read_subrequest *subreq,
 137                                    bool was_async)
 138 {
 139         struct netfs_read_request *rreq = subreq->rreq;
 140
 141         trace_netfs_sreq(subreq, netfs_sreq_trace_free);
 142         kfree(subreq);
 143         netfs_stat_d(&netfs_n_rh_sreq);
 144         netfs_put_read_request(rreq, was_async);
 145 }
 146
 147 /*
 148  * Clear the unread part of an I/O request.
 149  */
 150 static void netfs_clear_unread(struct netfs_read_subrequest *subreq)
 151 {
 152         struct iov_iter iter;
 153
 154         iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages,
 155                         subreq->start + subreq->transferred,
 156                         subreq->len   - subreq->transferred);
 157         iov_iter_zero(iov_iter_count(&iter), &iter);
 158 }
 159
 160 static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
 161                                         bool was_async)
 162 {
 163         struct netfs_read_subrequest *subreq = priv;
 164
 165         netfs_subreq_terminated(subreq, transferred_or_error, was_async);
 166 }
 167
 168 /*
 169  * Issue a read against the cache.
 170  * - Eats the caller's ref on subreq.
 171  */
 172 static void netfs_read_from_cache(struct netfs_read_request *rreq,
 173                                   struct netfs_read_subrequest *subreq,
 174                                   enum netfs_read_from_hole read_hole)
 175 {
 176         struct netfs_cache_resources *cres = &rreq->cache_resources;
 177         struct iov_iter iter;
 178
 179         netfs_stat(&netfs_n_rh_read);
 180         iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
 181                         subreq->start + subreq->transferred,
 182                         subreq->len   - subreq->transferred);
 183
 184         cres->ops->read(cres, subreq->start, &iter, read_hole,
 185                         netfs_cache_read_terminated, subreq);
 186 }
 187
 188 /*
 189  * Fill a subrequest region with zeroes.
 190  */
 191 static void netfs_fill_with_zeroes(struct netfs_read_request *rreq,
 192                                    struct netfs_read_subrequest *subreq)
 193 {
 194         netfs_stat(&netfs_n_rh_zero);
 195         __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
 196         netfs_subreq_terminated(subreq, 0, false);
 197 }
 198
 199 /*
 200  * Ask the netfs to issue a read request to the server for us.
 201  *
 202  * The netfs is expected to read from subreq->pos + subreq->transferred to
 203  * subreq->pos + subreq->len - 1.  It may not backtrack and write data into the
 204  * buffer prior to the transferred point as it might clobber dirty data
 205  * obtained from the cache.
 206  *
 207  * Alternatively, the netfs is allowed to indicate one of two things:
 208  *
 209  * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and
 210  *   make progress.
 211  *
 212  * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be
 213  *   cleared.
 214  */
 215 static void netfs_read_from_server(struct netfs_read_request *rreq,
 216                                    struct netfs_read_subrequest *subreq)
 217 {
 218         netfs_stat(&netfs_n_rh_download);
 219         rreq->netfs_ops->issue_op(subreq);
 220 }
 221
 222 /*
 223  * Release those waiting.
 224  */
 225 static void netfs_rreq_completed(struct netfs_read_request *rreq, bool was_async)
 226 {
 227         trace_netfs_rreq(rreq, netfs_rreq_trace_done);
 228         netfs_rreq_clear_subreqs(rreq, was_async);
 229         netfs_put_read_request(rreq, was_async);
 230 }
 231
 232 /*
 233  * Deal with the completion of writing the data to the cache.  We have to clear
 234  * the PG_fscache bits on the folios involved and release the caller's ref.
 235  *
 236  * May be called in softirq mode and we inherit a ref from the caller.
 237  */
 238 static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq,
 239                                           bool was_async)
 240 {
 241         struct netfs_read_subrequest *subreq;
 242         struct folio *folio;
 243         pgoff_t unlocked = 0;
 244         bool have_unlocked = false;
 245
 246         rcu_read_lock();
 247
 248         list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
 249                 XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE);
 250
 251                 xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
 252                         /* We might have multiple writes from the same huge
 253                          * folio, but we mustn't unlock a folio more than once.
 254                          */
 255                         if (have_unlocked && folio_index(folio) <= unlocked)
 256                                 continue;
 257                         unlocked = folio_index(folio);
 258                         folio_end_fscache(folio);
 259                         have_unlocked = true;
 260                 }
 261         }
 262
 263         rcu_read_unlock();
 264         netfs_rreq_completed(rreq, was_async);
 265 }
 266
 267 static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error,
 268                                        bool was_async)
 269 {
 270         struct netfs_read_subrequest *subreq = priv;
 271         struct netfs_read_request *rreq = subreq->rreq;
 272
 273         if (IS_ERR_VALUE(transferred_or_error)) {
 274                 netfs_stat(&netfs_n_rh_write_failed);
 275                 trace_netfs_failure(rreq, subreq, transferred_or_error,
 276                                     netfs_fail_copy_to_cache);
 277         } else {
 278                 netfs_stat(&netfs_n_rh_write_done);
 279         }
 280
 281         trace_netfs_sreq(subreq, netfs_sreq_trace_write_term);
 282
 283         /* If we decrement nr_wr_ops to 0, the ref belongs to us. */
 284         if (atomic_dec_and_test(&rreq->nr_wr_ops))
 285                 netfs_rreq_unmark_after_write(rreq, was_async);
 286
 287         netfs_put_subrequest(subreq, was_async);
 288 }
 289
 290 /*
 291  * Perform any outstanding writes to the cache.  We inherit a ref from the
 292  * caller.
 293  */
 294 static void netfs_rreq_do_write_to_cache(struct netfs_read_request *rreq)
 295 {
 296         struct netfs_cache_resources *cres = &rreq->cache_resources;
 297         struct netfs_read_subrequest *subreq, *next, *p;
 298         struct iov_iter iter;
 299         int ret;
 300
 301         trace_netfs_rreq(rreq, netfs_rreq_trace_write);
 302
 303         /* We don't want terminating writes trying to wake us up whilst we're
 304          * still going through the list.
 305          */
 306         atomic_inc(&rreq->nr_wr_ops);
 307
 308         list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) {
 309                 if (!test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) {
 310                         list_del_init(&subreq->rreq_link);
 311                         netfs_put_subrequest(subreq, false);
 312                 }
 313         }
 314
 315         list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
 316                 /* Amalgamate adjacent writes */
 317                 while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
 318                         next = list_next_entry(subreq, rreq_link);
 319                         if (next->start != subreq->start + subreq->len)
 320                                 break;
 321                         subreq->len += next->len;
 322                         list_del_init(&next->rreq_link);
 323                         netfs_put_subrequest(next, false);
 324                 }
 325
 326                 ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
 327                                                rreq->i_size, true);
 328                 if (ret < 0) {
 329                         trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
 330                         trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
 331                         continue;
 332                 }
 333
 334                 iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages,
 335                                 subreq->start, subreq->len);
 336
 337                 atomic_inc(&rreq->nr_wr_ops);
 338                 netfs_stat(&netfs_n_rh_write);
 339                 netfs_get_read_subrequest(subreq);
 340                 trace_netfs_sreq(subreq, netfs_sreq_trace_write);
 341                 cres->ops->write(cres, subreq->start, &iter,
 342                                  netfs_rreq_copy_terminated, subreq);
 343         }
 344
 345         /* If we decrement nr_wr_ops to 0, the usage ref belongs to us. */
 346         if (atomic_dec_and_test(&rreq->nr_wr_ops))
 347                 netfs_rreq_unmark_after_write(rreq, false);
 348 }
 349
 350 static void netfs_rreq_write_to_cache_work(struct work_struct *work)
 351 {
 352         struct netfs_read_request *rreq =
 353                 container_of(work, struct netfs_read_request, work);
 354
 355         netfs_rreq_do_write_to_cache(rreq);
 356 }
 357
 358 static void netfs_rreq_write_to_cache(struct netfs_read_request *rreq)
 359 {
 360         rreq->work.func = netfs_rreq_write_to_cache_work;
 361         if (!queue_work(system_unbound_wq, &rreq->work))
 362                 BUG();
 363 }
 364
 365 /*
 366  * Unlock the folios in a read operation.  We need to set PG_fscache on any
 367  * folios we're going to write back before we unlock them.
 368  */
 369 static void netfs_rreq_unlock(struct netfs_read_request *rreq)
 370 {
 371         struct netfs_read_subrequest *subreq;
 372         struct folio *folio;
 373         unsigned int iopos, account = 0;
 374         pgoff_t start_page = rreq->start / PAGE_SIZE;
 375         pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
 376         bool subreq_failed = false;
 377
 378         XA_STATE(xas, &rreq->mapping->i_pages, start_page);
 379
 380         if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
 381                 __clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
 382                 list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
 383                         __clear_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);
 384                 }
 385         }
 386
 387         /* Walk through the pagecache and the I/O request lists simultaneously.
 388          * We may have a mixture of cached and uncached sections and we only
 389          * really want to write out the uncached sections.  This is slightly
 390          * complicated by the possibility that we might have huge pages with a
 391          * mixture inside.
 392          */
 393         subreq = list_first_entry(&rreq->subrequests,
 394                                   struct netfs_read_subrequest, rreq_link);
 395         iopos = 0;
 396         subreq_failed = (subreq->error < 0);
 397
 398         trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
 399
 400         rcu_read_lock();
 401         xas_for_each(&xas, folio, last_page) {
 402                 unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE;
 403                 unsigned int pgend = pgpos + folio_size(folio);
 404                 bool pg_failed = false;
 405
 406                 for (;;) {
 407                         if (!subreq) {
 408                                 pg_failed = true;
 409                                 break;
 410                         }
 411                         if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags))
 412                                 folio_start_fscache(folio);
 413                         pg_failed |= subreq_failed;
 414                         if (pgend < iopos + subreq->len)
 415                                 break;
 416
 417                         account += subreq->transferred;
 418                         iopos += subreq->len;
 419                         if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
 420                                 subreq = list_next_entry(subreq, rreq_link);
 421                                 subreq_failed = (subreq->error < 0);
 422                         } else {
 423                                 subreq = NULL;
 424                                 subreq_failed = false;
 425                         }
 426                         if (pgend == iopos)
 427                                 break;
 428                 }
 429
 430                 if (!pg_failed) {
 431                         flush_dcache_folio(folio);
 432                         folio_mark_uptodate(folio);
 433                 }
 434
 435                 if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
 436                         if (folio_index(folio) == rreq->no_unlock_folio &&
 437                             test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
 438                                 _debug("no unlock");
 439                         else
 440                                 folio_unlock(folio);
 441                 }
 442         }
 443         rcu_read_unlock();
 444
 445         task_io_account_read(account);
 446         if (rreq->netfs_ops->done)
 447                 rreq->netfs_ops->done(rreq);
 448 }
 449
 450 /*
 451  * Handle a short read.
 452  */
 453 static void netfs_rreq_short_read(struct netfs_read_request *rreq,
 454                                   struct netfs_read_subrequest *subreq)
 455 {
 456         __clear_bit(NETFS_SREQ_SHORT_READ, &subreq->flags);
 457         __set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags);
 458
 459         netfs_stat(&netfs_n_rh_short_read);
 460         trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short);
 461
 462         netfs_get_read_subrequest(subreq);
 463         atomic_inc(&rreq->nr_rd_ops);
 464         if (subreq->source == NETFS_READ_FROM_CACHE)
 465                 netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_CLEAR);
 466         else
 467                 netfs_read_from_server(rreq, subreq);
 468 }
 469
 470 /*
 471  * Resubmit any short or failed operations.  Returns true if we got the rreq
 472  * ref back.
 473  */
 474 static bool netfs_rreq_perform_resubmissions(struct netfs_read_request *rreq)
 475 {
 476         struct netfs_read_subrequest *subreq;
 477
 478         WARN_ON(in_interrupt());
 479
 480         trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
 481
 482         /* We don't want terminating submissions trying to wake us up whilst
 483          * we're still going through the list.
 484          */
 485         atomic_inc(&rreq->nr_rd_ops);
 486
 487         __clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
 488         list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
 489                 if (subreq->error) {
 490                         if (subreq->source != NETFS_READ_FROM_CACHE)
 491                                 break;
 492                         subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
 493                         subreq->error = 0;
 494                         netfs_stat(&netfs_n_rh_download_instead);
 495                         trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
 496                         netfs_get_read_subrequest(subreq);
 497                         atomic_inc(&rreq->nr_rd_ops);
 498                         netfs_read_from_server(rreq, subreq);
 499                 } else if (test_bit(NETFS_SREQ_SHORT_READ, &subreq->flags)) {
 500                         netfs_rreq_short_read(rreq, subreq);
 501                 }
 502         }
 503
 504         /* If we decrement nr_rd_ops to 0, the usage ref belongs to us. */
 505         if (atomic_dec_and_test(&rreq->nr_rd_ops))
 506                 return true;
 507
 508         wake_up_var(&rreq->nr_rd_ops);
 509         return false;
 510 }
 511
 512 /*
 513  * Check to see if the data read is still valid.
 514  */
 515 static void netfs_rreq_is_still_valid(struct netfs_read_request *rreq)
 516 {
 517         struct netfs_read_subrequest *subreq;
 518
 519         if (!rreq->netfs_ops->is_still_valid ||
 520             rreq->netfs_ops->is_still_valid(rreq))
 521                 return;
 522
 523         list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
 524                 if (subreq->source == NETFS_READ_FROM_CACHE) {
 525                         subreq->error = -ESTALE;
 526                         __set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
 527                 }
 528         }
 529 }
 530
 531 /*
 532  * Assess the state of a read request and decide what to do next.
 533  *
 534  * Note that we could be in an ordinary kernel thread, on a workqueue or in
 535  * softirq context at this point.  We inherit a ref from the caller.
 536  */
 537 static void netfs_rreq_assess(struct netfs_read_request *rreq, bool was_async)
 538 {
 539         trace_netfs_rreq(rreq, netfs_rreq_trace_assess);
 540
 541 again:
 542         netfs_rreq_is_still_valid(rreq);
 543
 544         if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) &&
 545             test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) {
 546                 if (netfs_rreq_perform_resubmissions(rreq))
 547                         goto again;
 548                 return;
 549         }
 550
 551         netfs_rreq_unlock(rreq);
 552
 553         clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
 554         wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
 555
 556         if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags))
 557                 return netfs_rreq_write_to_cache(rreq);
 558
 559         netfs_rreq_completed(rreq, was_async);
 560 }
 561
 562 static void netfs_rreq_work(struct work_struct *work)
 563 {
 564         struct netfs_read_request *rreq =
 565                 container_of(work, struct netfs_read_request, work);
 566         netfs_rreq_assess(rreq, false);
 567 }
 568
 569 /*
 570  * Handle the completion of all outstanding I/O operations on a read request.
 571  * We inherit a ref from the caller.
 572  */
 573 static void netfs_rreq_terminated(struct netfs_read_request *rreq,
 574                                   bool was_async)
 575 {
 576         if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) &&
 577             was_async) {
 578                 if (!queue_work(system_unbound_wq, &rreq->work))
 579                         BUG();
 580         } else {
 581                 netfs_rreq_assess(rreq, was_async);
 582         }
 583 }
 584
 585 /**
 586  * netfs_subreq_terminated - Note the termination of an I/O operation.
 587  * @subreq: The I/O request that has terminated.
 588  * @transferred_or_error: The amount of data transferred or an error code.
 589  * @was_async: The termination was asynchronous
 590  *
 591  * This tells the read helper that a contributory I/O operation has terminated,
 592  * one way or another, and that it should integrate the results.
 593  *
 594  * The caller indicates in @transferred_or_error the outcome of the operation,
 595  * supplying a positive value to indicate the number of bytes transferred, 0 to
 596  * indicate a failure to transfer anything that should be retried or a negative
 597  * error code.  The helper will look after reissuing I/O operations as
 598  * appropriate and writing downloaded data to the cache.
 599  *
 600  * If @was_async is true, the caller might be running in softirq or interrupt
 601  * context and we can't sleep.
 602  */
 603 void netfs_subreq_terminated(struct netfs_read_subrequest *subreq,
 604                              ssize_t transferred_or_error,
 605                              bool was_async)
 606 {
 607         struct netfs_read_request *rreq = subreq->rreq;
 608         int u;
 609
 610         _enter("[%u]{%llx,%lx},%zd",
 611                subreq->debug_index, subreq->start, subreq->flags,
 612                transferred_or_error);
 613
 614         switch (subreq->source) {
 615         case NETFS_READ_FROM_CACHE:
 616                 netfs_stat(&netfs_n_rh_read_done);
 617                 break;
 618         case NETFS_DOWNLOAD_FROM_SERVER:
 619                 netfs_stat(&netfs_n_rh_download_done);
 620                 break;
 621         default:
 622                 break;
 623         }
 624
 625         if (IS_ERR_VALUE(transferred_or_error)) {
 626                 subreq->error = transferred_or_error;
 627                 trace_netfs_failure(rreq, subreq, transferred_or_error,
 628                                     netfs_fail_read);
 629                 goto failed;
 630         }
 631
 632         if (WARN(transferred_or_error > subreq->len - subreq->transferred,
 633                  "Subreq overread: R%x[%x] %zd > %zu - %zu",
 634                  rreq->debug_id, subreq->debug_index,
 635                  transferred_or_error, subreq->len, subreq->transferred))
 636                 transferred_or_error = subreq->len - subreq->transferred;
 637
 638         subreq->error = 0;
 639         subreq->transferred += transferred_or_error;
 640         if (subreq->transferred < subreq->len)
 641                 goto incomplete;
 642
 643 complete:
 644         __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
 645         if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags))
 646                 set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
 647
 648 out:
 649         trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
 650
 651         /* If we decrement nr_rd_ops to 0, the ref belongs to us. */
 652         u = atomic_dec_return(&rreq->nr_rd_ops);
 653         if (u == 0)
 654                 netfs_rreq_terminated(rreq, was_async);
 655         else if (u == 1)
 656                 wake_up_var(&rreq->nr_rd_ops);
 657
 658         netfs_put_subrequest(subreq, was_async);
 659         return;
 660
 661 incomplete:
 662         if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) {
 663                 netfs_clear_unread(subreq);
 664                 subreq->transferred = subreq->len;
 665                 goto complete;
 666         }
 667
 668         if (transferred_or_error == 0) {
 669                 if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
 670                         subreq->error = -ENODATA;
 671                         goto failed;
 672                 }
 673         } else {
 674                 __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
 675         }
 676
 677         __set_bit(NETFS_SREQ_SHORT_READ, &subreq->flags);
 678         set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
 679         goto out;
 680
 681 failed:
 682         if (subreq->source == NETFS_READ_FROM_CACHE) {
 683                 netfs_stat(&netfs_n_rh_read_failed);
 684                 set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
 685         } else {
 686                 netfs_stat(&netfs_n_rh_download_failed);
 687                 set_bit(NETFS_RREQ_FAILED, &rreq->flags);
 688                 rreq->error = subreq->error;
 689         }
 690         goto out;
 691 }
 692 EXPORT_SYMBOL(netfs_subreq_terminated);
 693
 694 static enum netfs_read_source netfs_cache_prepare_read(struct netfs_read_subrequest *subreq,
 695                                                        loff_t i_size)
 696 {
 697         struct netfs_read_request *rreq = subreq->rreq;
 698         struct netfs_cache_resources *cres = &rreq->cache_resources;
 699
 700         if (cres->ops)
 701                 return cres->ops->prepare_read(subreq, i_size);
 702         if (subreq->start >= rreq->i_size)
 703                 return NETFS_FILL_WITH_ZEROES;
 704         return NETFS_DOWNLOAD_FROM_SERVER;
 705 }
 706
 707 /*
 708  * Work out what sort of subrequest the next one will be.
 709  */
 710 static enum netfs_read_source
 711 netfs_rreq_prepare_read(struct netfs_read_request *rreq,
 712                         struct netfs_read_subrequest *subreq)
 713 {
 714         enum netfs_read_source source;
 715
 716         _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
 717
 718         source = netfs_cache_prepare_read(subreq, rreq->i_size);
 719         if (source == NETFS_INVALID_READ)
 720                 goto out;
 721
 722         if (source == NETFS_DOWNLOAD_FROM_SERVER) {
 723                 /* Call out to the netfs to let it shrink the request to fit
 724                  * its own I/O sizes and boundaries.  If it shinks it here, it
 725                  * will be called again to make simultaneous calls; if it wants
 726                  * to make serial calls, it can indicate a short read and then
 727                  * we will call it again.
 728                  */
 729                 if (subreq->len > rreq->i_size - subreq->start)
 730                         subreq->len = rreq->i_size - subreq->start;
 731
 732                 if (rreq->netfs_ops->clamp_length &&
 733                     !rreq->netfs_ops->clamp_length(subreq)) {
 734                         source = NETFS_INVALID_READ;
 735                         goto out;
 736                 }
 737         }
 738
 739         if (WARN_ON(subreq->len == 0))
 740                 source = NETFS_INVALID_READ;
 741
 742 out:
 743         subreq->source = source;
 744         trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
 745         return source;
 746 }
 747
 748 /*
 749  * Slice off a piece of a read request and submit an I/O request for it.
 750  */
 751 static bool netfs_rreq_submit_slice(struct netfs_read_request *rreq,
 752                                     unsigned int *_debug_index)
 753 {
 754         struct netfs_read_subrequest *subreq;
 755         enum netfs_read_source source;
 756
 757         subreq = netfs_alloc_subrequest(rreq);
 758         if (!subreq)
 759                 return false;
 760
 761         subreq->debug_index     = (*_debug_index)++;
 762         subreq->start           = rreq->start + rreq->submitted;
 763         subreq->len             = rreq->len   - rreq->submitted;
 764
 765         _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
 766         list_add_tail(&subreq->rreq_link, &rreq->subrequests);
 767
 768         /* Call out to the cache to find out what it can do with the remaining
 769          * subset.  It tells us in subreq->flags what it decided should be done
 770          * and adjusts subreq->len down if the subset crosses a cache boundary.
 771          *
 772          * Then when we hand the subset, it can choose to take a subset of that
 773          * (the starts must coincide), in which case, we go around the loop
 774          * again and ask it to download the next piece.
 775          */
 776         source = netfs_rreq_prepare_read(rreq, subreq);
 777         if (source == NETFS_INVALID_READ)
 778                 goto subreq_failed;
 779
 780         atomic_inc(&rreq->nr_rd_ops);
 781
 782         rreq->submitted += subreq->len;
 783
 784         trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
 785         switch (source) {
 786         case NETFS_FILL_WITH_ZEROES:
 787                 netfs_fill_with_zeroes(rreq, subreq);
 788                 break;
 789         case NETFS_DOWNLOAD_FROM_SERVER:
 790                 netfs_read_from_server(rreq, subreq);
 791                 break;
 792         case NETFS_READ_FROM_CACHE:
 793                 netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_IGNORE);
 794                 break;
 795         default:
 796                 BUG();
 797         }
 798
 799         return true;
 800
 801 subreq_failed:
 802         rreq->error = subreq->error;
 803         netfs_put_subrequest(subreq, false);
 804         return false;
 805 }
 806
 807 static void netfs_cache_expand_readahead(struct netfs_read_request *rreq,
 808                                          loff_t *_start, size_t *_len, loff_t i_size)
 809 {
 810         struct netfs_cache_resources *cres = &rreq->cache_resources;
 811
 812         if (cres->ops && cres->ops->expand_readahead)
 813                 cres->ops->expand_readahead(cres, _start, _len, i_size);
 814 }
 815
 816 static void netfs_rreq_expand(struct netfs_read_request *rreq,
 817                               struct readahead_control *ractl)
 818 {
 819         /* Give the cache a chance to change the request parameters.  The
 820          * resultant request must contain the original region.
 821          */
 822         netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);
 823
 824         /* Give the netfs a chance to change the request parameters.  The
 825          * resultant request must contain the original region.
 826          */
 827         if (rreq->netfs_ops->expand_readahead)
 828                 rreq->netfs_ops->expand_readahead(rreq);
 829
 830         /* Expand the request if the cache wants it to start earlier.  Note
 831          * that the expansion may get further extended if the VM wishes to
 832          * insert THPs and the preferred start and/or end wind up in the middle
 833          * of THPs.
 834          *
 835          * If this is the case, however, the THP size should be an integer
 836          * multiple of the cache granule size, so we get a whole number of
 837          * granules to deal with.
 838          */
 839         if (rreq->start  != readahead_pos(ractl) ||
 840             rreq->len != readahead_length(ractl)) {
 841                 readahead_expand(ractl, rreq->start, rreq->len);
 842                 rreq->start  = readahead_pos(ractl);
 843                 rreq->len = readahead_length(ractl);
 844
 845                 trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
 846                                  netfs_read_trace_expanded);
 847         }
 848 }
 849
 850 /**
 851  * netfs_readahead - Helper to manage a read request
 852  * @ractl: The description of the readahead request
 853  * @ops: The network filesystem's operations for the helper to use
 854  * @netfs_priv: Private netfs data to be retained in the request
 855  *
 856  * Fulfil a readahead request by drawing data from the cache if possible, or
 857  * the netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O
 858  * requests from different sources will get munged together.  If necessary, the
 859  * readahead window can be expanded in either direction to a more convenient
 860  * alighment for RPC efficiency or to make storage in the cache feasible.
 861  *
 862  * The calling netfs must provide a table of operations, only one of which,
 863  * issue_op, is mandatory.  It may also be passed a private token, which will
 864  * be retained in rreq->netfs_priv and will be cleaned up by ops->cleanup().
 865  *
 866  * This is usable whether or not caching is enabled.
 867  */
 868 void netfs_readahead(struct readahead_control *ractl,
 869                      const struct netfs_read_request_ops *ops,
 870                      void *netfs_priv)
 871 {
 872         struct netfs_read_request *rreq;
 873         unsigned int debug_index = 0;
 874         int ret;
 875
 876         _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
 877
 878         if (readahead_count(ractl) == 0)
 879                 goto cleanup;
 880
 881         rreq = netfs_alloc_read_request(ops, netfs_priv, ractl->file);
 882         if (!rreq)
 883                 goto cleanup;
 884         rreq->mapping   = ractl->mapping;
 885         rreq->start     = readahead_pos(ractl);
 886         rreq->len       = readahead_length(ractl);
 887
 888         if (ops->begin_cache_operation) {
 889                 ret = ops->begin_cache_operation(rreq);
 890                 if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
 891                         goto cleanup_free;
 892         }
 893
 894         netfs_stat(&netfs_n_rh_readahead);
 895         trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
 896                          netfs_read_trace_readahead);
 897
 898         netfs_rreq_expand(rreq, ractl);
 899
 900         atomic_set(&rreq->nr_rd_ops, 1);
 901         do {
 902                 if (!netfs_rreq_submit_slice(rreq, &debug_index))
 903                         break;
 904
 905         } while (rreq->submitted < rreq->len);
 906
 907         /* Drop the refs on the folios here rather than in the cache or
 908          * filesystem.  The locks will be dropped in netfs_rreq_unlock().
 909          */
 910         while (readahead_folio(ractl))
 911                 ;
 912
 913         /* If we decrement nr_rd_ops to 0, the ref belongs to us. */
 914         if (atomic_dec_and_test(&rreq->nr_rd_ops))
 915                 netfs_rreq_assess(rreq, false);
 916         return;
 917
 918 cleanup_free:
 919         netfs_put_read_request(rreq, false);
 920         return;
 921 cleanup:
 922         if (netfs_priv)
 923                 ops->cleanup(ractl->mapping, netfs_priv);
 924         return;
 925 }
 926 EXPORT_SYMBOL(netfs_readahead);
 927
 928 /**
 929  * netfs_readpage - Helper to manage a readpage request
 930  * @file: The file to read from
 931  * @folio: The folio to read
 932  * @ops: The network filesystem's operations for the helper to use
 933  * @netfs_priv: Private netfs data to be retained in the request
 934  *
 935  * Fulfil a readpage request by drawing data from the cache if possible, or the
 936  * netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O requests
 937  * from different sources will get munged together.
 938  *
 939  * The calling netfs must provide a table of operations, only one of which,
 940  * issue_op, is mandatory.  It may also be passed a private token, which will
 941  * be retained in rreq->netfs_priv and will be cleaned up by ops->cleanup().
 942  *
 943  * This is usable whether or not caching is enabled.
 944  */
 945 int netfs_readpage(struct file *file,
 946                    struct folio *folio,
 947                    const struct netfs_read_request_ops *ops,
 948                    void *netfs_priv)
 949 {
 950         struct netfs_read_request *rreq;
 951         unsigned int debug_index = 0;
 952         int ret;
 953
 954         _enter("%lx", folio_index(folio));
 955
 956         rreq = netfs_alloc_read_request(ops, netfs_priv, file);
 957         if (!rreq) {
 958                 if (netfs_priv)
 959                         ops->cleanup(folio_file_mapping(folio), netfs_priv);
 960                 folio_unlock(folio);
 961                 return -ENOMEM;
 962         }
 963         rreq->mapping   = folio_file_mapping(folio);
 964         rreq->start     = folio_file_pos(folio);
 965         rreq->len       = folio_size(folio);
 966
 967         if (ops->begin_cache_operation) {
 968                 ret = ops->begin_cache_operation(rreq);
 969                 if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) {
 970                         folio_unlock(folio);
 971                         goto out;
 972                 }
 973         }
 974
 975         netfs_stat(&netfs_n_rh_readpage);
 976         trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
 977
 978         netfs_get_read_request(rreq);
 979
 980         atomic_set(&rreq->nr_rd_ops, 1);
 981         do {
 982                 if (!netfs_rreq_submit_slice(rreq, &debug_index))
 983                         break;
 984
 985         } while (rreq->submitted < rreq->len);
 986
 987         /* Keep nr_rd_ops incremented so that the ref always belongs to us, and
 988          * the service code isn't punted off to a random thread pool to
 989          * process.
 990          */
 991         do {
 992                 wait_var_event(&rreq->nr_rd_ops, atomic_read(&rreq->nr_rd_ops) == 1);
 993                 netfs_rreq_assess(rreq, false);
 994         } while (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags));
 995
 996         ret = rreq->error;
 997         if (ret == 0 && rreq->submitted < rreq->len) {
 998                 trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_readpage);
 999                 ret = -EIO;
1000         }
1001 out:
1002         netfs_put_read_request(rreq, false);
1003         return ret;
1004 }
1005 EXPORT_SYMBOL(netfs_readpage);
1006
1007 /*
1008  * Prepare a folio for writing without reading first
1009  * @folio: The folio being prepared
1010  * @pos: starting position for the write
1011  * @len: length of write
1012  *
1013  * In some cases, write_begin doesn't need to read at all:
1014  * - full folio write
1015  * - write that lies in a folio that is completely beyond EOF
1016  * - write that covers the folio from start to EOF or beyond it
1017  *
1018  * If any of these criteria are met, then zero out the unwritten parts
1019  * of the folio and return true. Otherwise, return false.
1020  */
1021 static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len)
1022 {
1023         struct inode *inode = folio_inode(folio);
1024         loff_t i_size = i_size_read(inode);
1025         size_t offset = offset_in_folio(folio, pos);
1026
1027         /* Full folio write */
1028         if (offset == 0 && len >= folio_size(folio))
1029                 return true;
1030
1031         /* pos beyond last folio in the file */
1032         if (pos - offset >= i_size)
1033                 goto zero_out;
1034
1035         /* Write that covers from the start of the folio to EOF or beyond */
1036         if (offset == 0 && (pos + len) >= i_size)
1037                 goto zero_out;
1038
1039         return false;
1040 zero_out:
1041         zero_user_segments(&folio->page, 0, offset, offset + len, folio_size(folio));
1042         return true;
1043 }
1044
1045 /**
1046  * netfs_write_begin - Helper to prepare for writing
1047  * @file: The file to read from
1048  * @mapping: The mapping to read from
1049  * @pos: File position at which the write will begin
1050  * @len: The length of the write (may extend beyond the end of the folio chosen)
1051  * @aop_flags: AOP_* flags
1052  * @_folio: Where to put the resultant folio
1053  * @_fsdata: Place for the netfs to store a cookie
1054  * @ops: The network filesystem's operations for the helper to use
1055  * @netfs_priv: Private netfs data to be retained in the request
1056  *
1057  * Pre-read data for a write-begin request by drawing data from the cache if
1058  * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
1059  * Multiple I/O requests from different sources will get munged together.  If
1060  * necessary, the readahead window can be expanded in either direction to a
1061  * more convenient alighment for RPC efficiency or to make storage in the cache
1062  * feasible.
1063  *
1064  * The calling netfs must provide a table of operations, only one of which,
1065  * issue_op, is mandatory.
1066  *
1067  * The check_write_begin() operation can be provided to check for and flush
1068  * conflicting writes once the folio is grabbed and locked.  It is passed a
1069  * pointer to the fsdata cookie that gets returned to the VM to be passed to
1070  * write_end.  It is permitted to sleep.  It should return 0 if the request
1071  * should go ahead; unlock the folio and return -EAGAIN to cause the folio to
1072  * be regot; or return an error.
1073  *
1074  * This is usable whether or not caching is enabled.
1075  */
1076 int netfs_write_begin(struct file *file, struct address_space *mapping,
1077                       loff_t pos, unsigned int len, unsigned int aop_flags,
1078                       struct folio **_folio, void **_fsdata,
1079                       const struct netfs_read_request_ops *ops,
1080                       void *netfs_priv)
1081 {
1082         struct netfs_read_request *rreq;
1083         struct folio *folio;
1084         struct inode *inode = file_inode(file);
1085         unsigned int debug_index = 0, fgp_flags;
1086         pgoff_t index = pos >> PAGE_SHIFT;
1087         int ret;
1088
1089         DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
1090
1091 retry:
1092         fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
1093         if (aop_flags & AOP_FLAG_NOFS)
1094                 fgp_flags |= FGP_NOFS;
1095         folio = __filemap_get_folio(mapping, index, fgp_flags,
1096                                     mapping_gfp_mask(mapping));
1097         if (!folio)
1098                 return -ENOMEM;
1099
1100         if (ops->check_write_begin) {
1101                 /* Allow the netfs (eg. ceph) to flush conflicts. */
1102                 ret = ops->check_write_begin(file, pos, len, folio, _fsdata);
1103                 if (ret < 0) {
1104                         trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
1105                         if (ret == -EAGAIN)
1106                                 goto retry;
1107                         goto error;
1108                 }
1109         }
1110
1111         if (folio_test_uptodate(folio))
1112                 goto have_folio;
1113
1114         /* If the page is beyond the EOF, we want to clear it - unless it's
1115          * within the cache granule containing the EOF, in which case we need
1116          * to preload the granule.
1117          */
1118         if (!ops->is_cache_enabled(inode) &&
1119             netfs_skip_folio_read(folio, pos, len)) {
1120                 netfs_stat(&netfs_n_rh_write_zskip);
1121                 goto have_folio_no_wait;
1122         }
1123
1124         ret = -ENOMEM;
1125         rreq = netfs_alloc_read_request(ops, netfs_priv, file);
1126         if (!rreq)
1127                 goto error;
1128         rreq->mapping           = folio_file_mapping(folio);
1129         rreq->start             = folio_file_pos(folio);
1130         rreq->len               = folio_size(folio);
1131         rreq->no_unlock_folio   = folio_index(folio);
1132         __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
1133         netfs_priv = NULL;
1134
1135         if (ops->begin_cache_operation) {
1136                 ret = ops->begin_cache_operation(rreq);
1137                 if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
1138                         goto error_put;
1139         }
1140
1141         netfs_stat(&netfs_n_rh_write_begin);
1142         trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
1143
1144         /* Expand the request to meet caching requirements and download
1145          * preferences.
1146          */
1147         ractl._nr_pages = folio_nr_pages(folio);
1148         netfs_rreq_expand(rreq, &ractl);
1149         netfs_get_read_request(rreq);
1150
1151         /* We hold the folio locks, so we can drop the references */
1152         folio_get(folio);
1153         while (readahead_folio(&ractl))
1154                 ;
1155
1156         atomic_set(&rreq->nr_rd_ops, 1);
1157         do {
1158                 if (!netfs_rreq_submit_slice(rreq, &debug_index))
1159                         break;
1160
1161         } while (rreq->submitted < rreq->len);
1162
1163         /* Keep nr_rd_ops incremented so that the ref always belongs to us, and
1164          * the service code isn't punted off to a random thread pool to
1165          * process.
1166          */
1167         for (;;) {
1168                 wait_var_event(&rreq->nr_rd_ops, atomic_read(&rreq->nr_rd_ops) == 1);
1169                 netfs_rreq_assess(rreq, false);
1170                 if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
1171                         break;
1172                 cond_resched();
1173         }
1174
1175         ret = rreq->error;
1176         if (ret == 0 && rreq->submitted < rreq->len) {
1177                 trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_write_begin);
1178                 ret = -EIO;
1179         }
1180         netfs_put_read_request(rreq, false);
1181         if (ret < 0)
1182                 goto error;
1183
1184 have_folio:
1185         ret = folio_wait_fscache_killable(folio);
1186         if (ret < 0)
1187                 goto error;
1188 have_folio_no_wait:
1189         if (netfs_priv)
1190                 ops->cleanup(mapping, netfs_priv);
1191         *_folio = folio;
1192         _leave(" = 0");
1193         return 0;
1194
1195 error_put:
1196         netfs_put_read_request(rreq, false);
1197 error:
1198         folio_unlock(folio);
1199         folio_put(folio);
1200         if (netfs_priv)
1201                 ops->cleanup(mapping, netfs_priv);
1202         _leave(" = %d", ret);
1203         return ret;
1204 }
1205 EXPORT_SYMBOL(netfs_write_begin);