fs/ceph/caps.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include <linux/ceph/ceph_debug.h>
   3
   4 #include <linux/fs.h>
   5 #include <linux/kernel.h>
   6 #include <linux/sched/signal.h>
   7 #include <linux/slab.h>
   8 #include <linux/vmalloc.h>
   9 #include <linux/wait.h>
  10 #include <linux/writeback.h>
  11 #include <linux/iversion.h>
  12
  13 #include "super.h"
  14 #include "mds_client.h"
  15 #include "cache.h"
  16 #include <linux/ceph/decode.h>
  17 #include <linux/ceph/messenger.h>
  18
  19 /*
  20  * Capability management
  21  *
  22  * The Ceph metadata servers control client access to inode metadata
  23  * and file data by issuing capabilities, granting clients permission
  24  * to read and/or write both inode fields and file data to OSDs
  25  * (storage nodes).  Each capability consists of a set of bits
  26  * indicating which operations are allowed.
  27  *
  28  * If the client holds a *_SHARED cap, the client has a coherent value
  29  * that can be safely read from the cached inode.
  30  *
  31  * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
  32  * client is allowed to change inode attributes (e.g., file size,
  33  * mtime), note its dirty state in the ceph_cap, and asynchronously
  34  * flush that metadata change to the MDS.
  35  *
  36  * In the event of a conflicting operation (perhaps by another
  37  * client), the MDS will revoke the conflicting client capabilities.
  38  *
  39  * In order for a client to cache an inode, it must hold a capability
  40  * with at least one MDS server.  When inodes are released, release
  41  * notifications are batched and periodically sent en masse to the MDS
  42  * cluster to release server state.
  43  */
  44
  45 static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
  46 static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
  47                                  struct ceph_mds_session *session,
  48                                  struct ceph_inode_info *ci,
  49                                  u64 oldest_flush_tid);
  50
  51 /*
  52  * Generate readable cap strings for debugging output.
  53  */
  54 #define MAX_CAP_STR 20
  55 static char cap_str[MAX_CAP_STR][40];
  56 static DEFINE_SPINLOCK(cap_str_lock);
  57 static int last_cap_str;
  58
  59 static char *gcap_string(char *s, int c)
  60 {
  61         if (c & CEPH_CAP_GSHARED)
  62                 *s++ = 's';
  63         if (c & CEPH_CAP_GEXCL)
  64                 *s++ = 'x';
  65         if (c & CEPH_CAP_GCACHE)
  66                 *s++ = 'c';
  67         if (c & CEPH_CAP_GRD)
  68                 *s++ = 'r';
  69         if (c & CEPH_CAP_GWR)
  70                 *s++ = 'w';
  71         if (c & CEPH_CAP_GBUFFER)
  72                 *s++ = 'b';
  73         if (c & CEPH_CAP_GWREXTEND)
  74                 *s++ = 'a';
  75         if (c & CEPH_CAP_GLAZYIO)
  76                 *s++ = 'l';
  77         return s;
  78 }
  79
  80 const char *ceph_cap_string(int caps)
  81 {
  82         int i;
  83         char *s;
  84         int c;
  85
  86         spin_lock(&cap_str_lock);
  87         i = last_cap_str++;
  88         if (last_cap_str == MAX_CAP_STR)
  89                 last_cap_str = 0;
  90         spin_unlock(&cap_str_lock);
  91
  92         s = cap_str[i];
  93
  94         if (caps & CEPH_CAP_PIN)
  95                 *s++ = 'p';
  96
  97         c = (caps >> CEPH_CAP_SAUTH) & 3;
  98         if (c) {
  99                 *s++ = 'A';
 100                 s = gcap_string(s, c);
 101         }
 102
 103         c = (caps >> CEPH_CAP_SLINK) & 3;
 104         if (c) {
 105                 *s++ = 'L';
 106                 s = gcap_string(s, c);
 107         }
 108
 109         c = (caps >> CEPH_CAP_SXATTR) & 3;
 110         if (c) {
 111                 *s++ = 'X';
 112                 s = gcap_string(s, c);
 113         }
 114
 115         c = caps >> CEPH_CAP_SFILE;
 116         if (c) {
 117                 *s++ = 'F';
 118                 s = gcap_string(s, c);
 119         }
 120
 121         if (s == cap_str[i])
 122                 *s++ = '-';
 123         *s = 0;
 124         return cap_str[i];
 125 }
 126
 127 void ceph_caps_init(struct ceph_mds_client *mdsc)
 128 {
 129         INIT_LIST_HEAD(&mdsc->caps_list);
 130         spin_lock_init(&mdsc->caps_list_lock);
 131 }
 132
 133 void ceph_caps_finalize(struct ceph_mds_client *mdsc)
 134 {
 135         struct ceph_cap *cap;
 136
 137         spin_lock(&mdsc->caps_list_lock);
 138         while (!list_empty(&mdsc->caps_list)) {
 139                 cap = list_first_entry(&mdsc->caps_list,
 140                                        struct ceph_cap, caps_item);
 141                 list_del(&cap->caps_item);
 142                 kmem_cache_free(ceph_cap_cachep, cap);
 143         }
 144         mdsc->caps_total_count = 0;
 145         mdsc->caps_avail_count = 0;
 146         mdsc->caps_use_count = 0;
 147         mdsc->caps_reserve_count = 0;
 148         mdsc->caps_min_count = 0;
 149         spin_unlock(&mdsc->caps_list_lock);
 150 }
 151
 152 void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
 153                               struct ceph_mount_options *fsopt)
 154 {
 155         spin_lock(&mdsc->caps_list_lock);
 156         mdsc->caps_min_count = fsopt->max_readdir;
 157         if (mdsc->caps_min_count < 1024)
 158                 mdsc->caps_min_count = 1024;
 159         mdsc->caps_use_max = fsopt->caps_max;
 160         if (mdsc->caps_use_max > 0 &&
 161             mdsc->caps_use_max < mdsc->caps_min_count)
 162                 mdsc->caps_use_max = mdsc->caps_min_count;
 163         spin_unlock(&mdsc->caps_list_lock);
 164 }
 165
 166 static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps)
 167 {
 168         struct ceph_cap *cap;
 169         int i;
 170
 171         if (nr_caps) {
 172                 BUG_ON(mdsc->caps_reserve_count < nr_caps);
 173                 mdsc->caps_reserve_count -= nr_caps;
 174                 if (mdsc->caps_avail_count >=
 175                     mdsc->caps_reserve_count + mdsc->caps_min_count) {
 176                         mdsc->caps_total_count -= nr_caps;
 177                         for (i = 0; i < nr_caps; i++) {
 178                                 cap = list_first_entry(&mdsc->caps_list,
 179                                         struct ceph_cap, caps_item);
 180                                 list_del(&cap->caps_item);
 181                                 kmem_cache_free(ceph_cap_cachep, cap);
 182                         }
 183                 } else {
 184                         mdsc->caps_avail_count += nr_caps;
 185                 }
 186
 187                 dout("%s: caps %d = %d used + %d resv + %d avail\n",
 188                      __func__,
 189                      mdsc->caps_total_count, mdsc->caps_use_count,
 190                      mdsc->caps_reserve_count, mdsc->caps_avail_count);
 191                 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
 192                                                  mdsc->caps_reserve_count +
 193                                                  mdsc->caps_avail_count);
 194         }
 195 }
 196
 197 /*
 198  * Called under mdsc->mutex.
 199  */
 200 int ceph_reserve_caps(struct ceph_mds_client *mdsc,
 201                       struct ceph_cap_reservation *ctx, int need)
 202 {
 203         int i, j;
 204         struct ceph_cap *cap;
 205         int have;
 206         int alloc = 0;
 207         int max_caps;
 208         int err = 0;
 209         bool trimmed = false;
 210         struct ceph_mds_session *s;
 211         LIST_HEAD(newcaps);
 212
 213         dout("reserve caps ctx=%p need=%d\n", ctx, need);
 214
 215         /* first reserve any caps that are already allocated */
 216         spin_lock(&mdsc->caps_list_lock);
 217         if (mdsc->caps_avail_count >= need)
 218                 have = need;
 219         else
 220                 have = mdsc->caps_avail_count;
 221         mdsc->caps_avail_count -= have;
 222         mdsc->caps_reserve_count += have;
 223         BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
 224                                          mdsc->caps_reserve_count +
 225                                          mdsc->caps_avail_count);
 226         spin_unlock(&mdsc->caps_list_lock);
 227
 228         for (i = have; i < need; ) {
 229                 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
 230                 if (cap) {
 231                         list_add(&cap->caps_item, &newcaps);
 232                         alloc++;
 233                         i++;
 234                         continue;
 235                 }
 236
 237                 if (!trimmed) {
 238                         for (j = 0; j < mdsc->max_sessions; j++) {
 239                                 s = __ceph_lookup_mds_session(mdsc, j);
 240                                 if (!s)
 241                                         continue;
 242                                 mutex_unlock(&mdsc->mutex);
 243
 244                                 mutex_lock(&s->s_mutex);
 245                                 max_caps = s->s_nr_caps - (need - i);
 246                                 ceph_trim_caps(mdsc, s, max_caps);
 247                                 mutex_unlock(&s->s_mutex);
 248
 249                                 ceph_put_mds_session(s);
 250                                 mutex_lock(&mdsc->mutex);
 251                         }
 252                         trimmed = true;
 253
 254                         spin_lock(&mdsc->caps_list_lock);
 255                         if (mdsc->caps_avail_count) {
 256                                 int more_have;
 257                                 if (mdsc->caps_avail_count >= need - i)
 258                                         more_have = need - i;
 259                                 else
 260                                         more_have = mdsc->caps_avail_count;
 261
 262                                 i += more_have;
 263                                 have += more_have;
 264                                 mdsc->caps_avail_count -= more_have;
 265                                 mdsc->caps_reserve_count += more_have;
 266
 267                         }
 268                         spin_unlock(&mdsc->caps_list_lock);
 269
 270                         continue;
 271                 }
 272
 273                 pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
 274                         ctx, need, have + alloc);
 275                 err = -ENOMEM;
 276                 break;
 277         }
 278
 279         if (!err) {
 280                 BUG_ON(have + alloc != need);
 281                 ctx->count = need;
 282                 ctx->used = 0;
 283         }
 284
 285         spin_lock(&mdsc->caps_list_lock);
 286         mdsc->caps_total_count += alloc;
 287         mdsc->caps_reserve_count += alloc;
 288         list_splice(&newcaps, &mdsc->caps_list);
 289
 290         BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
 291                                          mdsc->caps_reserve_count +
 292                                          mdsc->caps_avail_count);
 293
 294         if (err)
 295                 __ceph_unreserve_caps(mdsc, have + alloc);
 296
 297         spin_unlock(&mdsc->caps_list_lock);
 298
 299         dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
 300              ctx, mdsc->caps_total_count, mdsc->caps_use_count,
 301              mdsc->caps_reserve_count, mdsc->caps_avail_count);
 302         return err;
 303 }
 304
 305 void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
 306                          struct ceph_cap_reservation *ctx)
 307 {
 308         bool reclaim = false;
 309         if (!ctx->count)
 310                 return;
 311
 312         dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
 313         spin_lock(&mdsc->caps_list_lock);
 314         __ceph_unreserve_caps(mdsc, ctx->count);
 315         ctx->count = 0;
 316
 317         if (mdsc->caps_use_max > 0 &&
 318             mdsc->caps_use_count > mdsc->caps_use_max)
 319                 reclaim = true;
 320         spin_unlock(&mdsc->caps_list_lock);
 321
 322         if (reclaim)
 323                 ceph_reclaim_caps_nr(mdsc, ctx->used);
 324 }
 325
 326 struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
 327                               struct ceph_cap_reservation *ctx)
 328 {
 329         struct ceph_cap *cap = NULL;
 330
 331         /* temporary, until we do something about cap import/export */
 332         if (!ctx) {
 333                 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
 334                 if (cap) {
 335                         spin_lock(&mdsc->caps_list_lock);
 336                         mdsc->caps_use_count++;
 337                         mdsc->caps_total_count++;
 338                         spin_unlock(&mdsc->caps_list_lock);
 339                 } else {
 340                         spin_lock(&mdsc->caps_list_lock);
 341                         if (mdsc->caps_avail_count) {
 342                                 BUG_ON(list_empty(&mdsc->caps_list));
 343
 344                                 mdsc->caps_avail_count--;
 345                                 mdsc->caps_use_count++;
 346                                 cap = list_first_entry(&mdsc->caps_list,
 347                                                 struct ceph_cap, caps_item);
 348                                 list_del(&cap->caps_item);
 349
 350                                 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
 351                                        mdsc->caps_reserve_count + mdsc->caps_avail_count);
 352                         }
 353                         spin_unlock(&mdsc->caps_list_lock);
 354                 }
 355
 356                 return cap;
 357         }
 358
 359         spin_lock(&mdsc->caps_list_lock);
 360         dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
 361              ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
 362              mdsc->caps_reserve_count, mdsc->caps_avail_count);
 363         BUG_ON(!ctx->count);
 364         BUG_ON(ctx->count > mdsc->caps_reserve_count);
 365         BUG_ON(list_empty(&mdsc->caps_list));
 366
 367         ctx->count--;
 368         ctx->used++;
 369         mdsc->caps_reserve_count--;
 370         mdsc->caps_use_count++;
 371
 372         cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
 373         list_del(&cap->caps_item);
 374
 375         BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
 376                mdsc->caps_reserve_count + mdsc->caps_avail_count);
 377         spin_unlock(&mdsc->caps_list_lock);
 378         return cap;
 379 }
 380
 381 void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
 382 {
 383         spin_lock(&mdsc->caps_list_lock);
 384         dout("put_cap %p %d = %d used + %d resv + %d avail\n",
 385              cap, mdsc->caps_total_count, mdsc->caps_use_count,
 386              mdsc->caps_reserve_count, mdsc->caps_avail_count);
 387         mdsc->caps_use_count--;
 388         /*
 389          * Keep some preallocated caps around (ceph_min_count), to
 390          * avoid lots of free/alloc churn.
 391          */
 392         if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
 393                                       mdsc->caps_min_count) {
 394                 mdsc->caps_total_count--;
 395                 kmem_cache_free(ceph_cap_cachep, cap);
 396         } else {
 397                 mdsc->caps_avail_count++;
 398                 list_add(&cap->caps_item, &mdsc->caps_list);
 399         }
 400
 401         BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
 402                mdsc->caps_reserve_count + mdsc->caps_avail_count);
 403         spin_unlock(&mdsc->caps_list_lock);
 404 }
 405
 406 void ceph_reservation_status(struct ceph_fs_client *fsc,
 407                              int *total, int *avail, int *used, int *reserved,
 408                              int *min)
 409 {
 410         struct ceph_mds_client *mdsc = fsc->mdsc;
 411
 412         spin_lock(&mdsc->caps_list_lock);
 413
 414         if (total)
 415                 *total = mdsc->caps_total_count;
 416         if (avail)
 417                 *avail = mdsc->caps_avail_count;
 418         if (used)
 419                 *used = mdsc->caps_use_count;
 420         if (reserved)
 421                 *reserved = mdsc->caps_reserve_count;
 422         if (min)
 423                 *min = mdsc->caps_min_count;
 424
 425         spin_unlock(&mdsc->caps_list_lock);
 426 }
 427
 428 /*
 429  * Find ceph_cap for given mds, if any.
 430  *
 431  * Called with i_ceph_lock held.
 432  */
 433 static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
 434 {
 435         struct ceph_cap *cap;
 436         struct rb_node *n = ci->i_caps.rb_node;
 437
 438         while (n) {
 439                 cap = rb_entry(n, struct ceph_cap, ci_node);
 440                 if (mds < cap->mds)
 441                         n = n->rb_left;
 442                 else if (mds > cap->mds)
 443                         n = n->rb_right;
 444                 else
 445                         return cap;
 446         }
 447         return NULL;
 448 }
 449
 450 struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
 451 {
 452         struct ceph_cap *cap;
 453
 454         spin_lock(&ci->i_ceph_lock);
 455         cap = __get_cap_for_mds(ci, mds);
 456         spin_unlock(&ci->i_ceph_lock);
 457         return cap;
 458 }
 459
 460 /*
 461  * Called under i_ceph_lock.
 462  */
 463 static void __insert_cap_node(struct ceph_inode_info *ci,
 464                               struct ceph_cap *new)
 465 {
 466         struct rb_node **p = &ci->i_caps.rb_node;
 467         struct rb_node *parent = NULL;
 468         struct ceph_cap *cap = NULL;
 469
 470         while (*p) {
 471                 parent = *p;
 472                 cap = rb_entry(parent, struct ceph_cap, ci_node);
 473                 if (new->mds < cap->mds)
 474                         p = &(*p)->rb_left;
 475                 else if (new->mds > cap->mds)
 476                         p = &(*p)->rb_right;
 477                 else
 478                         BUG();
 479         }
 480
 481         rb_link_node(&new->ci_node, parent, p);
 482         rb_insert_color(&new->ci_node, &ci->i_caps);
 483 }
 484
 485 /*
 486  * (re)set cap hold timeouts, which control the delayed release
 487  * of unused caps back to the MDS.  Should be called on cap use.
 488  */
 489 static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
 490                                struct ceph_inode_info *ci)
 491 {
 492         struct ceph_mount_options *opt = mdsc->fsc->mount_options;
 493         ci->i_hold_caps_max = round_jiffies(jiffies +
 494                                             opt->caps_wanted_delay_max * HZ);
 495         dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode,
 496              ci->i_hold_caps_max - jiffies);
 497 }
 498
 499 /*
 500  * (Re)queue cap at the end of the delayed cap release list.
 501  *
 502  * If I_FLUSH is set, leave the inode at the front of the list.
 503  *
 504  * Caller holds i_ceph_lock
 505  *    -> we take mdsc->cap_delay_lock
 506  */
 507 static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
 508                                 struct ceph_inode_info *ci)
 509 {
 510         dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
 511              ci->i_ceph_flags, ci->i_hold_caps_max);
 512         if (!mdsc->stopping) {
 513                 spin_lock(&mdsc->cap_delay_lock);
 514                 if (!list_empty(&ci->i_cap_delay_list)) {
 515                         if (ci->i_ceph_flags & CEPH_I_FLUSH)
 516                                 goto no_change;
 517                         list_del_init(&ci->i_cap_delay_list);
 518                 }
 519                 __cap_set_timeouts(mdsc, ci);
 520                 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
 521 no_change:
 522                 spin_unlock(&mdsc->cap_delay_lock);
 523         }
 524 }
 525
 526 /*
 527  * Queue an inode for immediate writeback.  Mark inode with I_FLUSH,
 528  * indicating we should send a cap message to flush dirty metadata
 529  * asap, and move to the front of the delayed cap list.
 530  */
 531 static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
 532                                       struct ceph_inode_info *ci)
 533 {
 534         dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
 535         spin_lock(&mdsc->cap_delay_lock);
 536         ci->i_ceph_flags |= CEPH_I_FLUSH;
 537         if (!list_empty(&ci->i_cap_delay_list))
 538                 list_del_init(&ci->i_cap_delay_list);
 539         list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
 540         spin_unlock(&mdsc->cap_delay_lock);
 541 }
 542
 543 /*
 544  * Cancel delayed work on cap.
 545  *
 546  * Caller must hold i_ceph_lock.
 547  */
 548 static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
 549                                struct ceph_inode_info *ci)
 550 {
 551         dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
 552         if (list_empty(&ci->i_cap_delay_list))
 553                 return;
 554         spin_lock(&mdsc->cap_delay_lock);
 555         list_del_init(&ci->i_cap_delay_list);
 556         spin_unlock(&mdsc->cap_delay_lock);
 557 }
 558
 559 /* Common issue checks for add_cap, handle_cap_grant. */
 560 static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
 561                               unsigned issued)
 562 {
 563         unsigned had = __ceph_caps_issued(ci, NULL);
 564
 565         lockdep_assert_held(&ci->i_ceph_lock);
 566
 567         /*
 568          * Each time we receive FILE_CACHE anew, we increment
 569          * i_rdcache_gen.
 570          */
 571         if (S_ISREG(ci->vfs_inode.i_mode) &&
 572             (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
 573             (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
 574                 ci->i_rdcache_gen++;
 575         }
 576
 577         /*
 578          * If FILE_SHARED is newly issued, mark dir not complete. We don't
 579          * know what happened to this directory while we didn't have the cap.
 580          * If FILE_SHARED is being revoked, also mark dir not complete. It
 581          * stops on-going cached readdir.
 582          */
 583         if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
 584                 if (issued & CEPH_CAP_FILE_SHARED)
 585                         atomic_inc(&ci->i_shared_gen);
 586                 if (S_ISDIR(ci->vfs_inode.i_mode)) {
 587                         dout(" marking %p NOT complete\n", &ci->vfs_inode);
 588                         __ceph_dir_clear_complete(ci);
 589                 }
 590         }
 591
 592         /* Wipe saved layout if we're losing DIR_CREATE caps */
 593         if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
 594                 !(issued & CEPH_CAP_DIR_CREATE)) {
 595              ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
 596              memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
 597         }
 598 }
 599
 600 /**
 601  * change_auth_cap_ses - move inode to appropriate lists when auth caps change
 602  * @ci: inode to be moved
 603  * @session: new auth caps session
 604  */
 605 static void change_auth_cap_ses(struct ceph_inode_info *ci,
 606                                 struct ceph_mds_session *session)
 607 {
 608         lockdep_assert_held(&ci->i_ceph_lock);
 609
 610         if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item))
 611                 return;
 612
 613         spin_lock(&session->s_mdsc->cap_dirty_lock);
 614         if (!list_empty(&ci->i_dirty_item))
 615                 list_move(&ci->i_dirty_item, &session->s_cap_dirty);
 616         if (!list_empty(&ci->i_flushing_item))
 617                 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
 618         spin_unlock(&session->s_mdsc->cap_dirty_lock);
 619 }
 620
 621 /*
 622  * Add a capability under the given MDS session.
 623  *
 624  * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock
 625  *
 626  * @fmode is the open file mode, if we are opening a file, otherwise
 627  * it is < 0.  (This is so we can atomically add the cap and add an
 628  * open file reference to it.)
 629  */
 630 void ceph_add_cap(struct inode *inode,
 631                   struct ceph_mds_session *session, u64 cap_id,
 632                   unsigned issued, unsigned wanted,
 633                   unsigned seq, unsigned mseq, u64 realmino, int flags,
 634                   struct ceph_cap **new_cap)
 635 {
 636         struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 637         struct ceph_inode_info *ci = ceph_inode(inode);
 638         struct ceph_cap *cap;
 639         int mds = session->s_mds;
 640         int actual_wanted;
 641         u32 gen;
 642
 643         lockdep_assert_held(&ci->i_ceph_lock);
 644
 645         dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
 646              session->s_mds, cap_id, ceph_cap_string(issued), seq);
 647
 648         gen = atomic_read(&session->s_cap_gen);
 649
 650         cap = __get_cap_for_mds(ci, mds);
 651         if (!cap) {
 652                 cap = *new_cap;
 653                 *new_cap = NULL;
 654
 655                 cap->issued = 0;
 656                 cap->implemented = 0;
 657                 cap->mds = mds;
 658                 cap->mds_wanted = 0;
 659                 cap->mseq = 0;
 660
 661                 cap->ci = ci;
 662                 __insert_cap_node(ci, cap);
 663
 664                 /* add to session cap list */
 665                 cap->session = session;
 666                 spin_lock(&session->s_cap_lock);
 667                 list_add_tail(&cap->session_caps, &session->s_caps);
 668                 session->s_nr_caps++;
 669                 atomic64_inc(&mdsc->metric.total_caps);
 670                 spin_unlock(&session->s_cap_lock);
 671         } else {
 672                 spin_lock(&session->s_cap_lock);
 673                 list_move_tail(&cap->session_caps, &session->s_caps);
 674                 spin_unlock(&session->s_cap_lock);
 675
 676                 if (cap->cap_gen < gen)
 677                         cap->issued = cap->implemented = CEPH_CAP_PIN;
 678
 679                 /*
 680                  * auth mds of the inode changed. we received the cap export
 681                  * message, but still haven't received the cap import message.
 682                  * handle_cap_export() updated the new auth MDS' cap.
 683                  *
 684                  * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
 685                  * a message that was send before the cap import message. So
 686                  * don't remove caps.
 687                  */
 688                 if (ceph_seq_cmp(seq, cap->seq) <= 0) {
 689                         WARN_ON(cap != ci->i_auth_cap);
 690                         WARN_ON(cap->cap_id != cap_id);
 691                         seq = cap->seq;
 692                         mseq = cap->mseq;
 693                         issued |= cap->issued;
 694                         flags |= CEPH_CAP_FLAG_AUTH;
 695                 }
 696         }
 697
 698         if (!ci->i_snap_realm ||
 699             ((flags & CEPH_CAP_FLAG_AUTH) &&
 700              realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) {
 701                 /*
 702                  * add this inode to the appropriate snap realm
 703                  */
 704                 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
 705                                                                realmino);
 706                 if (realm) {
 707                         struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
 708                         if (oldrealm) {
 709                                 spin_lock(&oldrealm->inodes_with_caps_lock);
 710                                 list_del_init(&ci->i_snap_realm_item);
 711                                 spin_unlock(&oldrealm->inodes_with_caps_lock);
 712                         }
 713
 714                         spin_lock(&realm->inodes_with_caps_lock);
 715                         list_add(&ci->i_snap_realm_item,
 716                                  &realm->inodes_with_caps);
 717                         ci->i_snap_realm = realm;
 718                         if (realm->ino == ci->i_vino.ino)
 719                                 realm->inode = inode;
 720                         spin_unlock(&realm->inodes_with_caps_lock);
 721
 722                         if (oldrealm)
 723                                 ceph_put_snap_realm(mdsc, oldrealm);
 724                 } else {
 725                         pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
 726                                realmino);
 727                         WARN_ON(!realm);
 728                 }
 729         }
 730
 731         __check_cap_issue(ci, cap, issued);
 732
 733         /*
 734          * If we are issued caps we don't want, or the mds' wanted
 735          * value appears to be off, queue a check so we'll release
 736          * later and/or update the mds wanted value.
 737          */
 738         actual_wanted = __ceph_caps_wanted(ci);
 739         if ((wanted & ~actual_wanted) ||
 740             (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
 741                 dout(" issued %s, mds wanted %s, actual %s, queueing\n",
 742                      ceph_cap_string(issued), ceph_cap_string(wanted),
 743                      ceph_cap_string(actual_wanted));
 744                 __cap_delay_requeue(mdsc, ci);
 745         }
 746
 747         if (flags & CEPH_CAP_FLAG_AUTH) {
 748                 if (!ci->i_auth_cap ||
 749                     ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
 750                         if (ci->i_auth_cap &&
 751                             ci->i_auth_cap->session != cap->session)
 752                                 change_auth_cap_ses(ci, cap->session);
 753                         ci->i_auth_cap = cap;
 754                         cap->mds_wanted = wanted;
 755                 }
 756         } else {
 757                 WARN_ON(ci->i_auth_cap == cap);
 758         }
 759
 760         dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
 761              inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
 762              ceph_cap_string(issued|cap->issued), seq, mds);
 763         cap->cap_id = cap_id;
 764         cap->issued = issued;
 765         cap->implemented |= issued;
 766         if (ceph_seq_cmp(mseq, cap->mseq) > 0)
 767                 cap->mds_wanted = wanted;
 768         else
 769                 cap->mds_wanted |= wanted;
 770         cap->seq = seq;
 771         cap->issue_seq = seq;
 772         cap->mseq = mseq;
 773         cap->cap_gen = gen;
 774 }
 775
 776 /*
 777  * Return true if cap has not timed out and belongs to the current
 778  * generation of the MDS session (i.e. has not gone 'stale' due to
 779  * us losing touch with the mds).
 780  */
 781 static int __cap_is_valid(struct ceph_cap *cap)
 782 {
 783         unsigned long ttl;
 784         u32 gen;
 785
 786         gen = atomic_read(&cap->session->s_cap_gen);
 787         ttl = cap->session->s_cap_ttl;
 788
 789         if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
 790                 dout("__cap_is_valid %p cap %p issued %s "
 791                      "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
 792                      cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
 793                 return 0;
 794         }
 795
 796         return 1;
 797 }
 798
 799 /*
 800  * Return set of valid cap bits issued to us.  Note that caps time
 801  * out, and may be invalidated in bulk if the client session times out
 802  * and session->s_cap_gen is bumped.
 803  */
 804 int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
 805 {
 806         int have = ci->i_snap_caps;
 807         struct ceph_cap *cap;
 808         struct rb_node *p;
 809
 810         if (implemented)
 811                 *implemented = 0;
 812         for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
 813                 cap = rb_entry(p, struct ceph_cap, ci_node);
 814                 if (!__cap_is_valid(cap))
 815                         continue;
 816                 dout("__ceph_caps_issued %p cap %p issued %s\n",
 817                      &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
 818                 have |= cap->issued;
 819                 if (implemented)
 820                         *implemented |= cap->implemented;
 821         }
 822         /*
 823          * exclude caps issued by non-auth MDS, but are been revoking
 824          * by the auth MDS. The non-auth MDS should be revoking/exporting
 825          * these caps, but the message is delayed.
 826          */
 827         if (ci->i_auth_cap) {
 828                 cap = ci->i_auth_cap;
 829                 have &= ~cap->implemented | cap->issued;
 830         }
 831         return have;
 832 }
 833
 834 /*
 835  * Get cap bits issued by caps other than @ocap
 836  */
 837 int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
 838 {
 839         int have = ci->i_snap_caps;
 840         struct ceph_cap *cap;
 841         struct rb_node *p;
 842
 843         for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
 844                 cap = rb_entry(p, struct ceph_cap, ci_node);
 845                 if (cap == ocap)
 846                         continue;
 847                 if (!__cap_is_valid(cap))
 848                         continue;
 849                 have |= cap->issued;
 850         }
 851         return have;
 852 }
 853
 854 /*
 855  * Move a cap to the end of the LRU (oldest caps at list head, newest
 856  * at list tail).
 857  */
 858 static void __touch_cap(struct ceph_cap *cap)
 859 {
 860         struct ceph_mds_session *s = cap->session;
 861
 862         spin_lock(&s->s_cap_lock);
 863         if (!s->s_cap_iterator) {
 864                 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
 865                      s->s_mds);
 866                 list_move_tail(&cap->session_caps, &s->s_caps);
 867         } else {
 868                 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
 869                      &cap->ci->vfs_inode, cap, s->s_mds);
 870         }
 871         spin_unlock(&s->s_cap_lock);
 872 }
 873
 874 /*
 875  * Check if we hold the given mask.  If so, move the cap(s) to the
 876  * front of their respective LRUs.  (This is the preferred way for
 877  * callers to check for caps they want.)
 878  */
 879 int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 880 {
 881         struct ceph_cap *cap;
 882         struct rb_node *p;
 883         int have = ci->i_snap_caps;
 884
 885         if ((have & mask) == mask) {
 886                 dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
 887                      " (mask %s)\n", ceph_ino(&ci->vfs_inode),
 888                      ceph_cap_string(have),
 889                      ceph_cap_string(mask));
 890                 return 1;
 891         }
 892
 893         for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
 894                 cap = rb_entry(p, struct ceph_cap, ci_node);
 895                 if (!__cap_is_valid(cap))
 896                         continue;
 897                 if ((cap->issued & mask) == mask) {
 898                         dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
 899                              " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap,
 900                              ceph_cap_string(cap->issued),
 901                              ceph_cap_string(mask));
 902                         if (touch)
 903                                 __touch_cap(cap);
 904                         return 1;
 905                 }
 906
 907                 /* does a combination of caps satisfy mask? */
 908                 have |= cap->issued;
 909                 if ((have & mask) == mask) {
 910                         dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
 911                              " (mask %s)\n", ceph_ino(&ci->vfs_inode),
 912                              ceph_cap_string(cap->issued),
 913                              ceph_cap_string(mask));
 914                         if (touch) {
 915                                 struct rb_node *q;
 916
 917                                 /* touch this + preceding caps */
 918                                 __touch_cap(cap);
 919                                 for (q = rb_first(&ci->i_caps); q != p;
 920                                      q = rb_next(q)) {
 921                                         cap = rb_entry(q, struct ceph_cap,
 922                                                        ci_node);
 923                                         if (!__cap_is_valid(cap))
 924                                                 continue;
 925                                         if (cap->issued & mask)
 926                                                 __touch_cap(cap);
 927                                 }
 928                         }
 929                         return 1;
 930                 }
 931         }
 932
 933         return 0;
 934 }
 935
 936 int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,
 937                                    int touch)
 938 {
 939         struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
 940         int r;
 941
 942         r = __ceph_caps_issued_mask(ci, mask, touch);
 943         if (r)
 944                 ceph_update_cap_hit(&fsc->mdsc->metric);
 945         else
 946                 ceph_update_cap_mis(&fsc->mdsc->metric);
 947         return r;
 948 }
 949
 950 /*
 951  * Return true if mask caps are currently being revoked by an MDS.
 952  */
 953 int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
 954                                struct ceph_cap *ocap, int mask)
 955 {
 956         struct ceph_cap *cap;
 957         struct rb_node *p;
 958
 959         for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
 960                 cap = rb_entry(p, struct ceph_cap, ci_node);
 961                 if (cap != ocap &&
 962                     (cap->implemented & ~cap->issued & mask))
 963                         return 1;
 964         }
 965         return 0;
 966 }
 967
 968 int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
 969 {
 970         struct inode *inode = &ci->vfs_inode;
 971         int ret;
 972
 973         spin_lock(&ci->i_ceph_lock);
 974         ret = __ceph_caps_revoking_other(ci, NULL, mask);
 975         spin_unlock(&ci->i_ceph_lock);
 976         dout("ceph_caps_revoking %p %s = %d\n", inode,
 977              ceph_cap_string(mask), ret);
 978         return ret;
 979 }
 980
 981 int __ceph_caps_used(struct ceph_inode_info *ci)
 982 {
 983         int used = 0;
 984         if (ci->i_pin_ref)
 985                 used |= CEPH_CAP_PIN;
 986         if (ci->i_rd_ref)
 987                 used |= CEPH_CAP_FILE_RD;
 988         if (ci->i_rdcache_ref ||
 989             (S_ISREG(ci->vfs_inode.i_mode) &&
 990              ci->vfs_inode.i_data.nrpages))
 991                 used |= CEPH_CAP_FILE_CACHE;
 992         if (ci->i_wr_ref)
 993                 used |= CEPH_CAP_FILE_WR;
 994         if (ci->i_wb_ref || ci->i_wrbuffer_ref)
 995                 used |= CEPH_CAP_FILE_BUFFER;
 996         if (ci->i_fx_ref)
 997                 used |= CEPH_CAP_FILE_EXCL;
 998         return used;
 999 }
1000
1001 #define FMODE_WAIT_BIAS 1000
1002
1003 /*
1004  * wanted, by virtue of open file modes
1005  */
1006 int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
1007 {
1008         const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN);
1009         const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD);
1010         const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
1011         const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
1012         struct ceph_mount_options *opt =
1013                 ceph_inode_to_client(&ci->vfs_inode)->mount_options;
1014         unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
1015         unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
1016
1017         if (S_ISDIR(ci->vfs_inode.i_mode)) {
1018                 int want = 0;
1019
1020                 /* use used_cutoff here, to keep dir's wanted caps longer */
1021                 if (ci->i_nr_by_mode[RD_SHIFT] > 0 ||
1022                     time_after(ci->i_last_rd, used_cutoff))
1023                         want |= CEPH_CAP_ANY_SHARED;
1024
1025                 if (ci->i_nr_by_mode[WR_SHIFT] > 0 ||
1026                     time_after(ci->i_last_wr, used_cutoff)) {
1027                         want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
1028                         if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
1029                                 want |= CEPH_CAP_ANY_DIR_OPS;
1030                 }
1031
1032                 if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0)
1033                         want |= CEPH_CAP_PIN;
1034
1035                 return want;
1036         } else {
1037                 int bits = 0;
1038
1039                 if (ci->i_nr_by_mode[RD_SHIFT] > 0) {
1040                         if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS ||
1041                             time_after(ci->i_last_rd, used_cutoff))
1042                                 bits |= 1 << RD_SHIFT;
1043                 } else if (time_after(ci->i_last_rd, idle_cutoff)) {
1044                         bits |= 1 << RD_SHIFT;
1045                 }
1046
1047                 if (ci->i_nr_by_mode[WR_SHIFT] > 0) {
1048                         if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS ||
1049                             time_after(ci->i_last_wr, used_cutoff))
1050                                 bits |= 1 << WR_SHIFT;
1051                 } else if (time_after(ci->i_last_wr, idle_cutoff)) {
1052                         bits |= 1 << WR_SHIFT;
1053                 }
1054
1055                 /* check lazyio only when read/write is wanted */
1056                 if ((bits & (CEPH_FILE_MODE_RDWR << 1)) &&
1057                     ci->i_nr_by_mode[LAZY_SHIFT] > 0)
1058                         bits |= 1 << LAZY_SHIFT;
1059
1060                 return bits ? ceph_caps_for_mode(bits >> 1) : 0;
1061         }
1062 }
1063
1064 /*
1065  * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
1066  */
1067 int __ceph_caps_wanted(struct ceph_inode_info *ci)
1068 {
1069         int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
1070         if (S_ISDIR(ci->vfs_inode.i_mode)) {
1071                 /* we want EXCL if holding caps of dir ops */
1072                 if (w & CEPH_CAP_ANY_DIR_OPS)
1073                         w |= CEPH_CAP_FILE_EXCL;
1074         } else {
1075                 /* we want EXCL if dirty data */
1076                 if (w & CEPH_CAP_FILE_BUFFER)
1077                         w |= CEPH_CAP_FILE_EXCL;
1078         }
1079         return w;
1080 }
1081
1082 /*
1083  * Return caps we have registered with the MDS(s) as 'wanted'.
1084  */
1085 int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
1086 {
1087         struct ceph_cap *cap;
1088         struct rb_node *p;
1089         int mds_wanted = 0;
1090
1091         for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1092                 cap = rb_entry(p, struct ceph_cap, ci_node);
1093                 if (check && !__cap_is_valid(cap))
1094                         continue;
1095                 if (cap == ci->i_auth_cap)
1096                         mds_wanted |= cap->mds_wanted;
1097                 else
1098                         mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
1099         }
1100         return mds_wanted;
1101 }
1102
1103 int ceph_is_any_caps(struct inode *inode)
1104 {
1105         struct ceph_inode_info *ci = ceph_inode(inode);
1106         int ret;
1107
1108         spin_lock(&ci->i_ceph_lock);
1109         ret = __ceph_is_any_real_caps(ci);
1110         spin_unlock(&ci->i_ceph_lock);
1111
1112         return ret;
1113 }
1114
1115 static void drop_inode_snap_realm(struct ceph_inode_info *ci)
1116 {
1117         struct ceph_snap_realm *realm = ci->i_snap_realm;
1118         spin_lock(&realm->inodes_with_caps_lock);
1119         list_del_init(&ci->i_snap_realm_item);
1120         ci->i_snap_realm_counter++;
1121         ci->i_snap_realm = NULL;
1122         if (realm->ino == ci->i_vino.ino)
1123                 realm->inode = NULL;
1124         spin_unlock(&realm->inodes_with_caps_lock);
1125         ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
1126                             realm);
1127 }
1128
/*
 * Remove a cap.  Take steps to deal with a racing iterate_session_caps.
 *
 * @cap:           cap to detach from its inode and session
 * @queue_release: if true, ask for a cap release to be queued for the
 *                 MDS (subject to the reconnect check below)
 *
 * The cap is erased from the inode's rbtree and, unless the session is
 * currently iterating over this very cap, unlinked from the session's
 * cap list.  If this was the inode's last real cap, the inode's snap
 * realm is dropped (when no writes are in progress) and any pending
 * delayed cap work is cancelled.
 *
 * caller should hold i_ceph_lock.
 * caller will not hold session s_mutex if called from destroy_inode.
 */
void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
{
        struct ceph_mds_session *session = cap->session;
        struct ceph_inode_info *ci = cap->ci;
        struct ceph_mds_client *mdsc;
        int removed = 0;        /* set once the cap is off the session list */

        /* 'ci' being NULL means the removal has already occurred */
        if (!ci) {
                dout("%s: cap inode is NULL\n", __func__);
                return;
        }

        dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);

        mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc;

        /* remove from inode's cap rbtree, and clear auth cap */
        rb_erase(&cap->ci_node, &ci->i_caps);
        if (ci->i_auth_cap == cap) {
                /*
                 * Losing the auth cap while the inode is still on a dirty
                 * list is unexpected unless the client is blocklisted.
                 */
                WARN_ON_ONCE(!list_empty(&ci->i_dirty_item) &&
                             !mdsc->fsc->blocklisted);
                ci->i_auth_cap = NULL;
        }

        /* remove from session list */
        spin_lock(&session->s_cap_lock);
        if (session->s_cap_iterator == cap) {
                /* not yet, we are iterating over this very cap */
                dout("__ceph_remove_cap  delaying %p removal from session %p\n",
                     cap, cap->session);
        } else {
                list_del_init(&cap->session_caps);
                session->s_nr_caps--;
                atomic64_dec(&mdsc->metric.total_caps);
                cap->session = NULL;
                removed = 1;
        }
        /* protect backpointer with s_cap_lock: see iterate_session_caps */
        cap->ci = NULL;

        /*
         * s_cap_reconnect is protected by s_cap_lock. no one changes
         * s_cap_gen while session is in the reconnect state.
         *
         * A release is only requested when the session is not
         * reconnecting, or the cap belongs to the session's current
         * generation.
         */
        if (queue_release &&
            (!session->s_cap_reconnect ||
             cap->cap_gen == atomic_read(&session->s_cap_gen))) {
                cap->queue_release = 1;
                if (removed) {
                        /*
                         * The cap is handed to the release queue here, so
                         * clear 'removed' to skip the ceph_put_cap() below.
                         */
                        __ceph_queue_cap_release(session, cap);
                        removed = 0;
                }
        } else {
                cap->queue_release = 0;
        }
        /* cap->ci is already NULL, so remember the inode number in the cap */
        cap->cap_ino = ci->i_vino.ino;

        spin_unlock(&session->s_cap_lock);

        /* unlinked from the session but not queued for release: drop it */
        if (removed)
                ceph_put_cap(mdsc, cap);

        if (!__ceph_is_any_real_caps(ci)) {
                /* when reconnect denied, we remove session caps forcibly,
                 * i_wr_ref can be non-zero. If there are ongoing write,
                 * keep i_snap_realm.
                 */
                if (ci->i_wr_ref == 0 && ci->i_snap_realm)
                        drop_inode_snap_realm(ci);

                __cap_delay_cancel(mdsc, ci);
        }
}
1209
1210 struct cap_msg_args {
1211         struct ceph_mds_session *session;
1212         u64                     ino, cid, follows;
1213         u64                     flush_tid, oldest_flush_tid, size, max_size;
1214         u64                     xattr_version;
1215         u64                     change_attr;
1216         struct ceph_buffer      *xattr_buf;
1217         struct ceph_buffer      *old_xattr_buf;
1218         struct timespec64       atime, mtime, ctime, btime;
1219         int                     op, caps, wanted, dirty;
1220         u32                     seq, issue_seq, mseq, time_warp_seq;
1221         u32                     flags;
1222         kuid_t                  uid;
1223         kgid_t                  gid;
1224         umode_t                 mode;
1225         bool                    inline_data;
1226         bool                    wake;
1227 };
1228
/*
 * Size of the front of a cap message: the fixed struct ceph_mds_caps
 * followed by the trailing fields written by encode_cap_msg(), in order:
 *
 *   4  flock buffer size
 *   8  inline version
 *   4  inline data size
 *   4  osd_epoch_barrier
 *   8  oldest_flush_tid
 *   4  caller_uid
 *   4  caller_gid
 *   4  pool namespace length
 *   8  btime (struct ceph_timespec)
 *   8  change_attr
 *   4  advisory flags
 *
 * Keep this in sync with encode_cap_msg().
 */
#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \
                      4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4)
1235
1236 /* Marshal up the cap msg to the MDS */
1237 static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
1238 {
1239         struct ceph_mds_caps *fc;
1240         void *p;
1241         struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
1242
1243         dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n",
1244              __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino,
1245              ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
1246              ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
1247              arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
1248              arg->size, arg->max_size, arg->xattr_version,
1249              arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
1250
1251         msg->hdr.version = cpu_to_le16(10);
1252         msg->hdr.tid = cpu_to_le64(arg->flush_tid);
1253
1254         fc = msg->front.iov_base;
1255         memset(fc, 0, sizeof(*fc));
1256
1257         fc->cap_id = cpu_to_le64(arg->cid);
1258         fc->op = cpu_to_le32(arg->op);
1259         fc->seq = cpu_to_le32(arg->seq);
1260         fc->issue_seq = cpu_to_le32(arg->issue_seq);
1261         fc->migrate_seq = cpu_to_le32(arg->mseq);
1262         fc->caps = cpu_to_le32(arg->caps);
1263         fc->wanted = cpu_to_le32(arg->wanted);
1264         fc->dirty = cpu_to_le32(arg->dirty);
1265         fc->ino = cpu_to_le64(arg->ino);
1266         fc->snap_follows = cpu_to_le64(arg->follows);
1267
1268         fc->size = cpu_to_le64(arg->size);
1269         fc->max_size = cpu_to_le64(arg->max_size);
1270         ceph_encode_timespec64(&fc->mtime, &arg->mtime);
1271         ceph_encode_timespec64(&fc->atime, &arg->atime);
1272         ceph_encode_timespec64(&fc->ctime, &arg->ctime);
1273         fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq);
1274
1275         fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid));
1276         fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid));
1277         fc->mode = cpu_to_le32(arg->mode);
1278
1279         fc->xattr_version = cpu_to_le64(arg->xattr_version);
1280         if (arg->xattr_buf) {
1281                 msg->middle = ceph_buffer_get(arg->xattr_buf);
1282                 fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
1283                 msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
1284         }
1285
1286         p = fc + 1;
1287         /* flock buffer size (version 2) */
1288         ceph_encode_32(&p, 0);
1289         /* inline version (version 4) */
1290         ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE);
1291         /* inline data size */
1292         ceph_encode_32(&p, 0);
1293         /*
1294          * osd_epoch_barrier (version 5)
1295          * The epoch_barrier is protected osdc->lock, so READ_ONCE here in
1296          * case it was recently changed
1297          */
1298         ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier));
1299         /* oldest_flush_tid (version 6) */
1300         ceph_encode_64(&p, arg->oldest_flush_tid);
1301
1302         /*
1303          * caller_uid/caller_gid (version 7)
1304          *
1305          * Currently, we don't properly track which caller dirtied the caps
1306          * last, and force a flush of them when there is a conflict. For now,
1307          * just set this to 0:0, to emulate how the MDS has worked up to now.
1308          */
1309         ceph_encode_32(&p, 0);
1310         ceph_encode_32(&p, 0);
1311
1312         /* pool namespace (version 8) (mds always ignores this) */
1313         ceph_encode_32(&p, 0);
1314
1315         /* btime and change_attr (version 9) */
1316         ceph_encode_timespec64(p, &arg->btime);
1317         p += sizeof(struct ceph_timespec);
1318         ceph_encode_64(&p, arg->change_attr);
1319
1320         /* Advisory flags (version 10) */
1321         ceph_encode_32(&p, arg->flags);
1322 }
1323
1324 /*
1325  * Queue cap releases when an inode is dropped from our cache.
1326  */
1327 void __ceph_remove_caps(struct ceph_inode_info *ci)
1328 {
1329         struct rb_node *p;
1330
1331         /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
1332          * may call __ceph_caps_issued_mask() on a freeing inode. */
1333         spin_lock(&ci->i_ceph_lock);
1334         p = rb_first(&ci->i_caps);
1335         while (p) {
1336                 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
1337                 p = rb_next(p);
1338                 __ceph_remove_cap(cap, true);
1339         }
1340         spin_unlock(&ci->i_ceph_lock);
1341 }
1342
1343 /*
1344  * Prepare to send a cap message to an MDS. Update the cap state, and populate
1345  * the arg struct with the parameters that will need to be sent. This should
1346  * be done under the i_ceph_lock to guard against changes to cap state.
1347  *
1348  * Make note of max_size reported/requested from mds, revoked caps
1349  * that have now been implemented.
1350  */
1351 static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
1352                        int op, int flags, int used, int want, int retain,
1353                        int flushing, u64 flush_tid, u64 oldest_flush_tid)
1354 {
1355         struct ceph_inode_info *ci = cap->ci;
1356         struct inode *inode = &ci->vfs_inode;
1357         int held, revoking;
1358
1359         lockdep_assert_held(&ci->i_ceph_lock);
1360
1361         held = cap->issued | cap->implemented;
1362         revoking = cap->implemented & ~cap->issued;
1363         retain &= ~revoking;
1364
1365         dout("%s %p cap %p session %p %s -> %s (revoking %s)\n",
1366              __func__, inode, cap, cap->session,
1367              ceph_cap_string(held), ceph_cap_string(held & retain),
1368              ceph_cap_string(revoking));
1369         BUG_ON((retain & CEPH_CAP_PIN) == 0);
1370
1371         ci->i_ceph_flags &= ~CEPH_I_FLUSH;
1372
1373         cap->issued &= retain;  /* drop bits we don't want */
1374         /*
1375          * Wake up any waiters on wanted -> needed transition. This is due to
1376          * the weird transition from buffered to sync IO... we need to flush
1377          * dirty pages _before_ allowing sync writes to avoid reordering.
1378          */
1379         arg->wake = cap->implemented & ~cap->issued;
1380         cap->implemented &= cap->issued | used;
1381         cap->mds_wanted = want;
1382
1383         arg->session = cap->session;
1384         arg->ino = ceph_vino(inode).ino;
1385         arg->cid = cap->cap_id;
1386         arg->follows = flushing ? ci->i_head_snapc->seq : 0;
1387         arg->flush_tid = flush_tid;
1388         arg->oldest_flush_tid = oldest_flush_tid;
1389
1390         arg->size = i_size_read(inode);
1391         ci->i_reported_size = arg->size;
1392         arg->max_size = ci->i_wanted_max_size;
1393         if (cap == ci->i_auth_cap) {
1394                 if (want & CEPH_CAP_ANY_FILE_WR)
1395                         ci->i_requested_max_size = arg->max_size;
1396                 else
1397                         ci->i_requested_max_size = 0;
1398         }
1399
1400         if (flushing & CEPH_CAP_XATTR_EXCL) {
1401                 arg->old_xattr_buf = __ceph_build_xattrs_blob(ci);
1402                 arg->xattr_version = ci->i_xattrs.version;
1403                 arg->xattr_buf = ci->i_xattrs.blob;
1404         } else {
1405                 arg->xattr_buf = NULL;
1406                 arg->old_xattr_buf = NULL;
1407         }
1408
1409         arg->mtime = inode->i_mtime;
1410         arg->atime = inode->i_atime;
1411         arg->ctime = inode->i_ctime;
1412         arg->btime = ci->i_btime;
1413         arg->change_attr = inode_peek_iversion_raw(inode);
1414
1415         arg->op = op;
1416         arg->caps = cap->implemented;
1417         arg->wanted = want;
1418         arg->dirty = flushing;
1419
1420         arg->seq = cap->seq;
1421         arg->issue_seq = cap->issue_seq;
1422         arg->mseq = cap->mseq;
1423         arg->time_warp_seq = ci->i_time_warp_seq;
1424
1425         arg->uid = inode->i_uid;
1426         arg->gid = inode->i_gid;
1427         arg->mode = inode->i_mode;
1428
1429         arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
1430         if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) &&
1431             !list_empty(&ci->i_cap_snaps)) {
1432                 struct ceph_cap_snap *capsnap;
1433                 list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) {
1434                         if (capsnap->cap_flush.tid)
1435                                 break;
1436                         if (capsnap->need_flush) {
1437                                 flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
1438                                 break;
1439                         }
1440                 }
1441         }
1442         arg->flags = flags;
1443 }
1444
1445 /*
1446  * Send a cap msg on the given inode.
1447  *
1448  * Caller should hold snap_rwsem (read), s_mutex.
1449  */
1450 static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
1451 {
1452         struct ceph_msg *msg;
1453         struct inode *inode = &ci->vfs_inode;
1454
1455         msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
1456         if (!msg) {
1457                 pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
1458                        ceph_vinop(inode), ceph_cap_string(arg->dirty),
1459                        arg->flush_tid);
1460                 spin_lock(&ci->i_ceph_lock);
1461                 __cap_delay_requeue(arg->session->s_mdsc, ci);
1462                 spin_unlock(&ci->i_ceph_lock);
1463                 return;
1464         }
1465
1466         encode_cap_msg(msg, arg);
1467         ceph_con_send(&arg->session->s_con, msg);
1468         ceph_buffer_put(arg->old_xattr_buf);
1469         if (arg->wake)
1470                 wake_up_all(&ci->i_cap_wq);
1471 }
1472
1473 static inline int __send_flush_snap(struct inode *inode,
1474                                     struct ceph_mds_session *session,
1475                                     struct ceph_cap_snap *capsnap,
1476                                     u32 mseq, u64 oldest_flush_tid)
1477 {
1478         struct cap_msg_args     arg;
1479         struct ceph_msg         *msg;
1480
1481         msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
1482         if (!msg)
1483                 return -ENOMEM;
1484
1485         arg.session = session;
1486         arg.ino = ceph_vino(inode).ino;
1487         arg.cid = 0;
1488         arg.follows = capsnap->follows;
1489         arg.flush_tid = capsnap->cap_flush.tid;
1490         arg.oldest_flush_tid = oldest_flush_tid;
1491
1492         arg.size = capsnap->size;
1493         arg.max_size = 0;
1494         arg.xattr_version = capsnap->xattr_version;
1495         arg.xattr_buf = capsnap->xattr_blob;
1496         arg.old_xattr_buf = NULL;
1497
1498         arg.atime = capsnap->atime;
1499         arg.mtime = capsnap->mtime;
1500         arg.ctime = capsnap->ctime;
1501         arg.btime = capsnap->btime;
1502         arg.change_attr = capsnap->change_attr;
1503
1504         arg.op = CEPH_CAP_OP_FLUSHSNAP;
1505         arg.caps = capsnap->issued;
1506         arg.wanted = 0;
1507         arg.dirty = capsnap->dirty;
1508
1509         arg.seq = 0;
1510         arg.issue_seq = 0;
1511         arg.mseq = mseq;
1512         arg.time_warp_seq = capsnap->time_warp_seq;
1513
1514         arg.uid = capsnap->uid;
1515         arg.gid = capsnap->gid;
1516         arg.mode = capsnap->mode;
1517
1518         arg.inline_data = capsnap->inline_data;
1519         arg.flags = 0;
1520         arg.wake = false;
1521
1522         encode_cap_msg(msg, &arg);
1523         ceph_con_send(&arg.session->s_con, msg);
1524         return 0;
1525 }
1526
1527 /*
1528  * When a snapshot is taken, clients accumulate dirty metadata on
1529  * inodes with capabilities in ceph_cap_snaps to describe the file
1530  * state at the time the snapshot was taken.  This must be flushed
1531  * asynchronously back to the MDS once sync writes complete and dirty
1532  * data is written out.
1533  *
1534  * Called under i_ceph_lock.
1535  */
1536 static void __ceph_flush_snaps(struct ceph_inode_info *ci,
1537                                struct ceph_mds_session *session)
1538                 __releases(ci->i_ceph_lock)
1539                 __acquires(ci->i_ceph_lock)
1540 {
             /*
              * Works in two phases:
              *
              *  1. Walk ci->i_cap_snaps and give every cap snap that is ready a
              *     new flush tid, queueing it on mdsc->cap_flush_list and on
              *     ci->i_cap_flush_list (and the inode on
              *     session->s_cap_flushing if it is not queued there yet).
              *  2. Send one flushsnap message for each tid assigned in phase 1.
              *     i_ceph_lock is dropped around every send and retaken
              *     afterwards, so the function returns with the lock held.
              */
1541         struct inode *inode = &ci->vfs_inode;
1542         struct ceph_mds_client *mdsc = session->s_mdsc;
1543         struct ceph_cap_snap *capsnap;
1544         u64 oldest_flush_tid = 0;
             /* first_tid > last_tid initially, so phase 2 is skipped if
              * phase 1 queues nothing */
1545         u64 first_tid = 1, last_tid = 0;
1546
1547         dout("__flush_snaps %p session %p\n", inode, session);
1548
1549         list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1550                 /*
1551                  * we need to wait for sync writes to complete and for dirty
1552                  * pages to be written out.
1553                  */
1554                 if (capsnap->dirty_pages || capsnap->writing)
1555                         break;
1556
1557                 /* should be removed by ceph_try_drop_cap_snap() */
1558                 BUG_ON(!capsnap->need_flush);
1559
1560                 /* only flush each capsnap once */
1561                 if (capsnap->cap_flush.tid > 0) {
1562                         dout(" already flushed %p, skipping\n", capsnap);
1563                         continue;
1564                 }
1565
                     /*
                      * tid allocation and the global/session list updates are
                      * done under cap_dirty_lock; the oldest pending tid is
                      * sampled only once, on the first capsnap queued here.
                      */
1566                 spin_lock(&mdsc->cap_dirty_lock);
1567                 capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
1568                 list_add_tail(&capsnap->cap_flush.g_list,
1569                               &mdsc->cap_flush_list);
1570                 if (oldest_flush_tid == 0)
1571                         oldest_flush_tid = __get_oldest_flush_tid(mdsc);
1572                 if (list_empty(&ci->i_flushing_item)) {
1573                         list_add_tail(&ci->i_flushing_item,
1574                                       &session->s_cap_flushing);
1575                 }
1576                 spin_unlock(&mdsc->cap_dirty_lock);
1577
1578                 list_add_tail(&capsnap->cap_flush.i_list,
1579                               &ci->i_cap_flush_list);
1580
                     /* remember the range of tids queued by this call */
1581                 if (first_tid == 1)
1582                         first_tid = capsnap->cap_flush.tid;
1583                 last_tid = capsnap->cap_flush.tid;
1584         }
1585
1586         ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS;
1587
1588         while (first_tid <= last_tid) {
                     /*
                      * i_ceph_lock was dropped during the previous iteration,
                      * so the auth cap is re-read and re-checked every time.
                      */
1589                 struct ceph_cap *cap = ci->i_auth_cap;
1590                 struct ceph_cap_flush *cf;
1591                 int ret;
1592
1593                 if (!(cap && cap->session == session)) {
1594                         dout("__flush_snaps %p auth cap %p not mds%d, "
1595                              "stop\n", inode, cap, session->s_mds);
1596                         break;
1597                 }
1598
                     /* find the first queued flush with tid >= first_tid */
1599                 ret = -ENOENT;
1600                 list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
1601                         if (cf->tid >= first_tid) {
1602                                 ret = 0;
1603                                 break;
1604                         }
1605                 }
1606                 if (ret < 0)
1607                         break;
1608
1609                 first_tid = cf->tid + 1;
1610
                     /*
                      * cf is embedded in the capsnap; take a capsnap reference
                      * before i_ceph_lock is released for the send.
                      */
1611                 capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
1612                 refcount_inc(&capsnap->nref);
1613                 spin_unlock(&ci->i_ceph_lock);
1614
1615                 dout("__flush_snaps %p capsnap %p tid %llu %s\n",
1616                      inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty));
1617
                     /*
                      * NOTE(review): cap->mseq is read here after i_ceph_lock
                      * has been dropped and no reference on cap is taken in
                      * this function -- confirm the cap cannot go away.
                      */
1618                 ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
1619                                         oldest_flush_tid);
1620                 if (ret < 0) {
                             /* a send failure is only logged; the loop goes on */
1621                         pr_err("__flush_snaps: error sending cap flushsnap, "
1622                                "ino (%llx.%llx) tid %llu follows %llu\n",
1623                                 ceph_vinop(inode), cf->tid, capsnap->follows);
1624                 }
1625
1626                 ceph_put_cap_snap(capsnap);
1627                 spin_lock(&ci->i_ceph_lock);
1628         }
1629 }
1630
1631 void ceph_flush_snaps(struct ceph_inode_info *ci,
1632                       struct ceph_mds_session **psession)
1633 {
             /*
              * Flush the pending cap snaps of @ci to the MDS holding its auth
              * cap, then remove the inode from the snap flush queue.
              *
              * @psession is optional.  When non-NULL, *psession supplies the
              * session to start with and receives the session used on return.
              * When NULL, the session reference obtained here is dropped
              * before returning.
              */
1634         struct inode *inode = &ci->vfs_inode;
1635         struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1636         struct ceph_mds_session *session = NULL;
1637         int mds;
1638
1639         dout("ceph_flush_snaps %p\n", inode);
1640         if (psession)
1641                 session = *psession;
1642 retry:
1643         spin_lock(&ci->i_ceph_lock);
1644         if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
1645                 dout(" no capsnap needs flush, doing nothing\n");
1646                 goto out;
1647         }
1648         if (!ci->i_auth_cap) {
1649                 dout(" no auth cap (migrating?), doing nothing\n");
1650                 goto out;
1651         }
1652
1653         mds = ci->i_auth_cap->session->s_mds;
             /* the session we hold is for a different mds: drop our reference */
1654         if (session && session->s_mds != mds) {
1655                 dout(" oops, wrong session %p mutex\n", session);
1656                 ceph_put_mds_session(session);
1657                 session = NULL;
1658         }
1659         if (!session) {
                     /*
                      * The lookup is done under mdsc->mutex, so i_ceph_lock is
                      * released first and every check is redone from "retry".
                      * NOTE(review): if the lookup returns NULL this retries
                      * with no visible bound -- confirm that is intended.
                      */
1660                 spin_unlock(&ci->i_ceph_lock);
1661                 mutex_lock(&mdsc->mutex);
1662                 session = __ceph_lookup_mds_session(mdsc, mds);
1663                 mutex_unlock(&mdsc->mutex);
1664                 goto retry;
1665         }
1666
1667         // make sure flushsnap messages are sent in proper order.
1668         if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
1669                 __kick_flushing_caps(mdsc, session, ci, 0);
1670
1671         __ceph_flush_snaps(ci, session);
1672 out:
1673         spin_unlock(&ci->i_ceph_lock);
1674
             /* hand the session back to the caller, or drop our reference */
1675         if (psession)
1676                 *psession = session;
1677         else
1678                 ceph_put_mds_session(session);
             /*
              * NOTE(review): this removal also runs on the two early "goto out"
              * paths above, where nothing was flushed.
              */
1679         /* we flushed them all; remove this inode from the queue */
1680         spin_lock(&mdsc->snap_flush_lock);
1681         list_del_init(&ci->i_snap_flush_item);
1682         spin_unlock(&mdsc->snap_flush_lock);
1683 }
1684
1685 /*
1686  * Mark caps dirty.  If inode is newly dirty, return the dirty flags.
1687  * Caller is then responsible for calling __mark_inode_dirty with the
1688  * returned flags value.
      *
      * @ci:   inode whose caps become dirty; i_ceph_lock must be held.
      * @mask: cap bits to add to ci->i_dirty_caps.
      * @pcf:  when the inode had no dirty caps before, *pcf is swapped with
      *        ci->i_prealloc_cap_flush.
      *
      * Returns 0 (after a warning) if the inode has no auth cap, otherwise a
      * combination of I_DIRTY_SYNC and I_DIRTY_DATASYNC, possibly 0.
1689  */
1690 int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
1691                            struct ceph_cap_flush **pcf)
1692 {
1693         struct ceph_mds_client *mdsc =
1694                 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
1695         struct inode *inode = &ci->vfs_inode;
1696         int was = ci->i_dirty_caps;
1697         int dirty = 0;
1698
1699         lockdep_assert_held(&ci->i_ceph_lock);
1700
1701         if (!ci->i_auth_cap) {
1702                 pr_warn("__mark_dirty_caps %p %llx mask %s, "
1703                         "but no auth cap (session was closed?)\n",
1704                         inode, ceph_ino(inode), ceph_cap_string(mask));
1705                 return 0;
1706         }
1707
1708         dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
1709              ceph_cap_string(mask), ceph_cap_string(was),
1710              ceph_cap_string(was | mask));
1711         ci->i_dirty_caps |= mask;
1712         if (was == 0) {
                     /* clean -> dirty transition */
1713                 struct ceph_mds_session *session = ci->i_auth_cap->session;
1714
                     /* take over the caller's preallocated cap_flush */
1715                 WARN_ON_ONCE(ci->i_prealloc_cap_flush);
1716                 swap(ci->i_prealloc_cap_flush, *pcf);
1717
1718                 if (!ci->i_head_snapc) {
1719                         WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem));
1720                         ci->i_head_snapc = ceph_get_snap_context(
1721                                 ci->i_snap_realm->cached_context);
1722                 }
1723                 dout(" inode %p now dirty snapc %p auth cap %p\n",
1724                      &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
1725                 BUG_ON(!list_empty(&ci->i_dirty_item));
1726                 spin_lock(&mdsc->cap_dirty_lock);
1727                 list_add(&ci->i_dirty_item, &session->s_cap_dirty);
1728                 spin_unlock(&mdsc->cap_dirty_lock);
                     /*
                      * An inode reference is taken only when nothing is being
                      * flushed either.
                      */
1729                 if (ci->i_flushing_caps == 0) {
1730                         ihold(inode);
1731                         dirty |= I_DIRTY_SYNC;
1732                 }
1733         } else {
                     /* already dirty: a cap_flush must have been stashed before */
1734                 WARN_ON_ONCE(!ci->i_prealloc_cap_flush);
1735         }
1736         BUG_ON(list_empty(&ci->i_dirty_item));
             /*
              * FILE_BUFFER is being dirtied while it was already dirty or is
              * being flushed.
              */
1737         if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1738             (mask & CEPH_CAP_FILE_BUFFER))
1739                 dirty |= I_DIRTY_DATASYNC;
1740         __cap_delay_requeue(mdsc, ci);
1741         return dirty;
1742 }
1743
1744 struct ceph_cap_flush *ceph_alloc_cap_flush(void)
1745 {
             /*
              * Allocate one ceph_cap_flush object from the
              * ceph_cap_flush_cachep slab cache with GFP_KERNEL.  The result
              * of kmem_cache_alloc() is handed back to the caller unchanged.
              */
             struct ceph_cap_flush *cf;

             cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
1746         return cf;
1747 }
1748
1749 void ceph_free_cap_flush(struct ceph_cap_flush *cf)
1750 {
             /*
              * Return @cf to the ceph_cap_flush_cachep slab cache.
              * A NULL @cf is accepted and ignored.
              */
1751         if (!cf)
                     return;
1752         kmem_cache_free(ceph_cap_flush_cachep, cf);
1753 }
1754
1755 static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
1756 {
             /*
              * Report the tid of the first entry on mdsc->cap_flush_list, or 0
              * when that list is empty.  The callers in this file take
              * mdsc->cap_dirty_lock around the call.
              */
1757         struct ceph_cap_flush *oldest;
1758
1759         if (list_empty(&mdsc->cap_flush_list))
1760                 return 0;
1761
1762         oldest = list_first_entry(&mdsc->cap_flush_list,
1763                                   struct ceph_cap_flush, g_list);
             return oldest->tid;
1764 }
1765
1766 /*
1767  * Unlink @cf from the mdsc-wide list of cap flushes (mdsc->cap_flush_list).
      *
      * A wake request pending on @cf is handed to the entry queued just
      * before it when such an entry exists; false is returned in that case.
1768  * Returns true only when @cf requested a wakeup and was the first entry
      * on the list, meaning the caller has to wake the flush waiters itself.
1769  */
1770 static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc,
1771                                          struct ceph_cap_flush *cf)
1772 {
1773         struct ceph_cap_flush *older;
1774         bool wake_caller = false;
1775
1776         if (cf->wake) {
                     if (cf->g_list.prev == &mdsc->cap_flush_list) {
                             /* first on the list: nobody to defer to */
                             wake_caller = true;
                     } else {
1777                         older = list_prev_entry(cf, g_list);
1778                         older->wake = true;
                     }
1780         }
1781         list_del(&cf->g_list);
1782         return wake_caller;
1783 }
1784
1785 static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci,
1786                                        struct ceph_cap_flush *cf)
1787 {
             /*
              * Unlink @cf from the per-inode list ci->i_cap_flush_list.
              *
              * If @cf requested a wakeup and another entry is queued before
              * it, the request moves to that entry and false is returned.
              * True is returned only when @cf requested a wakeup and was the
              * first entry on the list.
              */
1788         struct ceph_cap_flush *older;
1789         bool wake_caller = false;
1790
1791         if (cf->wake) {
                     if (cf->i_list.prev == &ci->i_cap_flush_list) {
                             /* first on the list: nobody to defer to */
                             wake_caller = true;
                     } else {
1792                         older = list_prev_entry(cf, i_list);
1793                         older->wake = true;
                     }
1795         }
1796         list_del(&cf->i_list);
1797         return wake_caller;
1798 }
1799
1800 /*
1801  * Add dirty inode to the flushing list.  Assigned a seq number so we
1802  * can wait for caps to flush without starving.
1803  *
1804  * Called under i_ceph_lock. Returns the flush tid.
      *
      * @inode:            inode whose dirty caps move to the flushing state.
      * @session:          session whose s_cap_flushing list gets the inode.
      * @wake:             stored in the new cap_flush's ->wake field.
      * @oldest_flush_tid: out; oldest tid on mdsc->cap_flush_list once the new
      *                    flush has been queued.
      *
      * The inode must have dirty caps, be on a dirty list and have a
      * preallocated cap_flush; each of these is enforced with BUG_ON().
1805  */
1806 static u64 __mark_caps_flushing(struct inode *inode,
1807                                 struct ceph_mds_session *session, bool wake,
1808                                 u64 *oldest_flush_tid)
1809 {
1810         struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1811         struct ceph_inode_info *ci = ceph_inode(inode);
1812         struct ceph_cap_flush *cf = NULL;
1813         int flushing;
1814
1815         lockdep_assert_held(&ci->i_ceph_lock);
1816         BUG_ON(ci->i_dirty_caps == 0);
1817         BUG_ON(list_empty(&ci->i_dirty_item));
1818         BUG_ON(!ci->i_prealloc_cap_flush);
1819
             /* every dirty cap bit moves from i_dirty_caps to i_flushing_caps */
1820         flushing = ci->i_dirty_caps;
1821         dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
1822              ceph_cap_string(flushing),
1823              ceph_cap_string(ci->i_flushing_caps),
1824              ceph_cap_string(ci->i_flushing_caps | flushing));
1825         ci->i_flushing_caps |= flushing;
1826         ci->i_dirty_caps = 0;
1827         dout(" inode %p now !dirty\n", inode);
1828
             /* consume the preallocated cap_flush; i_prealloc_cap_flush becomes NULL */
1829         swap(cf, ci->i_prealloc_cap_flush);
1830         cf->caps = flushing;
1831         cf->wake = wake;
1832
1833         spin_lock(&mdsc->cap_dirty_lock);
1834         list_del_init(&ci->i_dirty_item);
1835
1836         cf->tid = ++mdsc->last_cap_flush_tid;
1837         list_add_tail(&cf->g_list, &mdsc->cap_flush_list);
1838         *oldest_flush_tid = __get_oldest_flush_tid(mdsc);
1839
             /* num_cap_flushing is bumped only when the inode is newly queued */
1840         if (list_empty(&ci->i_flushing_item)) {
1841                 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1842                 mdsc->num_cap_flushing++;
1843         }
1844         spin_unlock(&mdsc->cap_dirty_lock);
1845
1846         list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
1847
1848         return cf->tid;
1849 }
1850
1851 /*
1852  * try to invalidate mapping pages without blocking.
      *
      * Entered and left with ci->i_ceph_lock held; the lock is dropped while
      * the page cache is being invalidated.
      *
      * Returns 0 if no pages remain in the mapping and i_rdcache_gen did not
      * change while the lock was dropped, -1 otherwise.
1853  */
1854 static int try_nonblocking_invalidate(struct inode *inode)
1855 {
1856         struct ceph_inode_info *ci = ceph_inode(inode);
             /* generation sampled before dropping the lock, compared afterwards */
1857         u32 invalidating_gen = ci->i_rdcache_gen;
1858
1859         spin_unlock(&ci->i_ceph_lock);
1860         ceph_fscache_invalidate(inode);
1861         invalidate_mapping_pages(&inode->i_data, 0, -1);
1862         spin_lock(&ci->i_ceph_lock);
1863
1864         if (inode->i_data.nrpages == 0 &&
1865             invalidating_gen == ci->i_rdcache_gen) {
1866                 /* success. */
1867                 dout("try_nonblocking_invalidate %p success\n", inode);
1868                 /* save any racing async invalidate some trouble */
1869                 ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
1870                 return 0;
1871         }
1872         dout("try_nonblocking_invalidate %p failed\n", inode);
1873         return -1;
1874 }
1875
1876 bool __ceph_should_report_size(struct ceph_inode_info *ci)
1877 {
             /*
              * Decide whether the current i_size should be reported to the
              * MDS.  Returns false while FILE_WR caps are being flushed.
              * Otherwise returns true when the size has reached i_max_size,
              * or when at least half of the gap between i_reported_size and
              * i_max_size has been used up.
              */
1878         loff_t size = i_size_read(&ci->vfs_inode);
1879
1880         /* mds will adjust max size according to the reported size */
1881         if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
1882                 return false;
1883         if (size >= ci->i_max_size)
1884                 return true;
1885
1886         /* half of previous max_size increment has been used */
1887         return ci->i_max_size > ci->i_reported_size &&
1888                (size << 1) >= ci->i_max_size + ci->i_reported_size;
1889 }
1890
1891 /*
1892  * Swiss army knife function to examine currently used and wanted
1893  * versus held caps.  Release, flush, ack revoked caps to mds as
1894  * appropriate.
1895  *
1896  *  CHECK_CAPS_AUTHONLY - we should only check the auth cap
1897  *  CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
1898  *    further delay.
      *
      * @session may be NULL.  If it is not, a reference is taken on entry.
      * Whatever session reference is held at the end (the caller's session or
      * the session of the last cap a message was sent for) is dropped before
      * returning.
      *
      * i_ceph_lock is taken and released here; it is dropped around every
      * message sent, after which the whole evaluation restarts at "retry".
1899  */
1900 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1901                      struct ceph_mds_session *session)
1902 {
1903         struct inode *inode = &ci->vfs_inode;
1904         struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
1905         struct ceph_cap *cap;
1906         u64 flush_tid, oldest_flush_tid;
1907         int file_wanted, used, cap_used;
1908         int issued, implemented, want, retain, revoking, flushing = 0;
1909         int mds = -1;   /* keep track of how far we've gone through i_caps list
1910                            to avoid an infinite loop on retry */
1911         struct rb_node *p;
1912         bool queue_invalidate = false;
1913         bool tried_invalidate = false;
1914
1915         if (session)
1916                 ceph_get_mds_session(session);
1917
1918         spin_lock(&ci->i_ceph_lock);
1919         if (ci->i_ceph_flags & CEPH_I_FLUSH)
1920                 flags |= CHECK_CAPS_FLUSH;
1921 retry:
1922         /* Caps wanted by virtue of active open files. */
1923         file_wanted = __ceph_caps_file_wanted(ci);
1924
1925         /* Caps which have active references against them */
1926         used = __ceph_caps_used(ci);
1927
1928         /*
1929          * "issued" represents the current caps that the MDS wants us to have.
1930          * "implemented" is the set that we have been granted, and includes the
1931          * ones that have not yet been returned to the MDS (the "revoking" set,
1932          * usually because they have outstanding references).
1933          */
1934         issued = __ceph_caps_issued(ci, &implemented);
1935         revoking = implemented & ~issued;
1936
1937         want = file_wanted;
1938
1939         /* The ones we currently want to retain (may be adjusted below) */
1940         retain = file_wanted | used | CEPH_CAP_PIN;
1941         if (!mdsc->stopping && inode->i_nlink > 0) {
1942                 if (file_wanted) {
1943                         retain |= CEPH_CAP_ANY;       /* be greedy */
1944                 } else if (S_ISDIR(inode->i_mode) &&
1945                            (issued & CEPH_CAP_FILE_SHARED) &&
1946                            __ceph_dir_is_complete(ci)) {
1947                         /*
1948                          * If a directory is complete, we want to keep
1949                          * the exclusive cap. So that MDS does not end up
1950                          * revoking the shared cap on every create/unlink
1951                          * operation.
1952                          */
1953                         if (IS_RDONLY(inode)) {
1954                                 want = CEPH_CAP_ANY_SHARED;
1955                         } else {
1956                                 want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
1957                         }
1958                         retain |= want;
1959                 } else {
1960
1961                         retain |= CEPH_CAP_ANY_SHARED;
1962                         /*
1963                          * keep RD only if we didn't have the file open RW,
1964                          * because then the mds would revoke it anyway to
1965                          * journal max_size=0.
1966                          */
1967                         if (ci->i_max_size == 0)
1968                                 retain |= CEPH_CAP_ANY_RD;
1969                 }
1970         }
1971
1972         dout("check_caps %p file_want %s used %s dirty %s flushing %s"
1973              " issued %s revoking %s retain %s %s%s\n", inode,
1974              ceph_cap_string(file_wanted),
1975              ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
1976              ceph_cap_string(ci->i_flushing_caps),
1977              ceph_cap_string(issued), ceph_cap_string(revoking),
1978              ceph_cap_string(retain),
1979              (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
1980              (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
1981
1982         /*
1983          * If we no longer need to hold onto our old caps, and we may
1984          * have cached pages, but don't want them, then try to invalidate.
1985          * If we fail, it's because pages are locked.... try again later.
              *
              * This is attempted at most once per call (tried_invalidate).
              * On failure an asynchronous invalidate is queued at the end.
1986          */
1987         if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) &&
1988             S_ISREG(inode->i_mode) &&
1989             !(ci->i_wb_ref || ci->i_wrbuffer_ref) &&   /* no dirty pages... */
1990             inode->i_data.nrpages &&            /* have cached pages */
1991             (revoking & (CEPH_CAP_FILE_CACHE|
1992                          CEPH_CAP_FILE_LAZYIO)) && /*  or revoking cache */
1993             !tried_invalidate) {
1994                 dout("check_caps trying to invalidate on %p\n", inode);
1995                 if (try_nonblocking_invalidate(inode) < 0) {
1996                         dout("check_caps queuing invalidate\n");
1997                         queue_invalidate = true;
1998                         ci->i_rdcache_revoking = ci->i_rdcache_gen;
1999                 }
2000                 tried_invalidate = true;
                     /* the lock was dropped in there: recompute everything */
2001                 goto retry;
2002         }
2003
2004         for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2005                 int mflags = 0;
2006                 struct cap_msg_args arg;
2007
2008                 cap = rb_entry(p, struct ceph_cap, ci_node);
2009
2010                 /* avoid looping forever */
2011                 if (mds >= cap->mds ||
2012                     ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
2013                         continue;
2014
2015                 /*
2016                  * If we have an auth cap, we don't need to consider any
2017                  * overlapping caps as used.
2018                  */
2019                 cap_used = used;
2020                 if (ci->i_auth_cap && cap != ci->i_auth_cap)
2021                         cap_used &= ~ci->i_auth_cap->issued;
2022
                     /* from here on "revoking" is per-cap, not the inode-wide value */
2023                 revoking = cap->implemented & ~cap->issued;
2024                 dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
2025                      cap->mds, cap, ceph_cap_string(cap_used),
2026                      ceph_cap_string(cap->issued),
2027                      ceph_cap_string(cap->implemented),
2028                      ceph_cap_string(revoking));
2029
2030                 if (cap == ci->i_auth_cap &&
2031                     (cap->issued & CEPH_CAP_FILE_WR)) {
2032                         /* request larger max_size from MDS? */
2033                         if (ci->i_wanted_max_size > ci->i_max_size &&
2034                             ci->i_wanted_max_size > ci->i_requested_max_size) {
2035                                 dout("requesting new max_size\n");
2036                                 goto ack;
2037                         }
2038
2039                         /* approaching file_max? */
2040                         if (__ceph_should_report_size(ci)) {
2041                                 dout("i_size approaching max_size\n");
2042                                 goto ack;
2043                         }
2044                 }
2045                 /* flush anything dirty? */
2046                 if (cap == ci->i_auth_cap) {
2047                         if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
2048                                 dout("flushing dirty caps\n");
2049                                 goto ack;
2050                         }
2051                         if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
2052                                 dout("flushing snap caps\n");
2053                                 goto ack;
2054                         }
2055                 }
2056
2057                 /* completed revocation? going down and there are no caps? */
2058                 if (revoking && (revoking & cap_used) == 0) {
2059                         dout("completed revocation of %s\n",
2060                              ceph_cap_string(cap->implemented & ~cap->issued));
2061                         goto ack;
2062                 }
2063
2064                 /* want more caps from mds? */
2065                 if (want & ~cap->mds_wanted) {
2066                         if (want & ~(cap->mds_wanted | cap->issued))
2067                                 goto ack;
2068                         if (!__cap_is_valid(cap))
2069                                 goto ack;
2070                 }
2071
2072                 /* things we might delay */
2073                 if ((cap->issued & ~retain) == 0)
2074                         continue;     /* nope, all good */
2075
2076 ack:
                     /* swap our session reference for one on this cap's session */
2077                 ceph_put_mds_session(session);
2078                 session = ceph_get_mds_session(cap->session);
2079
2080                 /* kick flushing and flush snaps before sending normal
2081                  * cap message */
2082                 if (cap == ci->i_auth_cap &&
2083                     (ci->i_ceph_flags &
2084                      (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
2085                         if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
2086                                 __kick_flushing_caps(mdsc, session, ci, 0);
2087                         if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
2088                                 __ceph_flush_snaps(ci, session);
2089
2090                         goto retry;
2091                 }
2092
2093                 if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
2094                         flushing = ci->i_dirty_caps;
2095                         flush_tid = __mark_caps_flushing(inode, session, false,
2096                                                          &oldest_flush_tid);
                             /*
                              * ask for a sync flush when flushing was requested
                              * and this session has no other dirty inode left
                              */
2097                         if (flags & CHECK_CAPS_FLUSH &&
2098                             list_empty(&session->s_cap_dirty))
2099                                 mflags |= CEPH_CLIENT_CAPS_SYNC;
2100                 } else {
                             /* nothing to flush: only the oldest tid is needed */
2101                         flushing = 0;
2102                         flush_tid = 0;
2103                         spin_lock(&mdsc->cap_dirty_lock);
2104                         oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2105                         spin_unlock(&mdsc->cap_dirty_lock);
2106                 }
2107
2108                 mds = cap->mds;  /* remember mds, so we don't repeat */
2109
                     /*
                      * The message arguments are prepared under i_ceph_lock;
                      * the lock is released only for the send itself.
                      */
2110                 __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used,
2111                            want, retain, flushing, flush_tid, oldest_flush_tid);
2112
2113                 spin_unlock(&ci->i_ceph_lock);
2114                 __send_cap(&arg, ci);
2115                 spin_lock(&ci->i_ceph_lock);
2116
2117                 goto retry; /* retake i_ceph_lock and restart our cap scan. */
2118         }
2119
2120         /* periodically re-calculate caps wanted by open files */
2121         if (__ceph_is_any_real_caps(ci) &&
2122             list_empty(&ci->i_cap_delay_list) &&
2123             (file_wanted & ~CEPH_CAP_PIN) &&
2124             !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
2125                 __cap_delay_requeue(mdsc, ci);
2126         }
2127
2128         spin_unlock(&ci->i_ceph_lock);
2129
2130         ceph_put_mds_session(session);
2131         if (queue_invalidate)
2132                 ceph_queue_invalidate(inode);
2133 }
2134
2135 /*
2136  * Try to flush dirty caps back to the auth mds.
      *
      * @inode: inode to flush.
      * @ptid:  out; tid of the flush the caller may wait for, or 0 if none.
      *
      * Returns the cap bits being flushed: the caps just sent if a flush was
      * started here, otherwise ci->i_flushing_caps.  Returns 0 (and *ptid 0)
      * when the auth session's state is below CEPH_MDS_SESSION_OPEN.
2137  */
2138 static int try_flush_caps(struct inode *inode, u64 *ptid)
2139 {
2140         struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
2141         struct ceph_inode_info *ci = ceph_inode(inode);
2142         int flushing = 0;
2143         u64 flush_tid = 0, oldest_flush_tid = 0;
2144
2145         spin_lock(&ci->i_ceph_lock);
2146 retry_locked:
2147         if (ci->i_dirty_caps && ci->i_auth_cap) {
2148                 struct ceph_cap *cap = ci->i_auth_cap;
2149                 struct cap_msg_args arg;
2150                 struct ceph_mds_session *session = cap->session;
2151
2152                 if (session->s_state < CEPH_MDS_SESSION_OPEN) {
2153                         spin_unlock(&ci->i_ceph_lock);
2154                         goto out;
2155                 }
2156
                     /*
                      * Pending kicks and snap flushes are sent first.  Both
                      * helpers are annotated as releasing and re-acquiring
                      * i_ceph_lock, so the checks restart afterwards.
                      */
2157                 if (ci->i_ceph_flags &
2158                     (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) {
2159                         if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
2160                                 __kick_flushing_caps(mdsc, session, ci, 0);
2161                         if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
2162                                 __ceph_flush_snaps(ci, session);
2163                         goto retry_locked;
2164                 }
2165
2166                 flushing = ci->i_dirty_caps;
                     /* wake=true: the caller waits for this flush to complete */
2167                 flush_tid = __mark_caps_flushing(inode, session, true,
2168                                                  &oldest_flush_tid);
2169
2170                 __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
2171                            __ceph_caps_used(ci), __ceph_caps_wanted(ci),
2172                            (cap->issued | cap->implemented),
2173                            flushing, flush_tid, oldest_flush_tid);
2174                 spin_unlock(&ci->i_ceph_lock);
2175
2176                 __send_cap(&arg, ci);
2177         } else {
                     /*
                      * No new flush is started here (nothing dirty, or no auth
                      * cap).  If flushes are already in flight, request a
                      * wakeup on the newest one and report its tid.
                      */
2178                 if (!list_empty(&ci->i_cap_flush_list)) {
2179                         struct ceph_cap_flush *cf =
2180                                 list_last_entry(&ci->i_cap_flush_list,
2181                                                 struct ceph_cap_flush, i_list);
2182                         cf->wake = true;
2183                         flush_tid = cf->tid;
2184                 }
2185                 flushing = ci->i_flushing_caps;
2186                 spin_unlock(&ci->i_ceph_lock);
2187         }
2188 out:
2189         *ptid = flush_tid;
2190         return flushing;
2191 }
2192
2193 /*
2194  * Check whether every cap flush up to and including @flush_tid is done.
      *
      * Returns 1 if ci->i_cap_flush_list is empty or its first entry has a
      * tid greater than @flush_tid, 0 otherwise.  Takes i_ceph_lock itself.
2195  */
2196 static int caps_are_flushed(struct inode *inode, u64 flush_tid)
2197 {
2198         struct ceph_inode_info *ci = ceph_inode(inode);
2199         struct ceph_cap_flush *oldest;
2200         int flushed = 1;
2201
2202         spin_lock(&ci->i_ceph_lock);
2203         if (!list_empty(&ci->i_cap_flush_list)) {
2204                 oldest = list_first_entry(&ci->i_cap_flush_list,
2205                                           struct ceph_cap_flush, i_list);
2206                 /* still pending if the oldest flush is at or below flush_tid */
2207                 flushed = oldest->tid > flush_tid;
2208         }
2209         spin_unlock(&ci->i_ceph_lock);
2210
2211         return flushed;
     }
2212
2213 /*
2214  * Wait for the newest unsafe MDS requests on @inode to become safe.
      *
      * Under i_unsafe_lock a reference is taken on the last entry of
      * i_unsafe_dirops (directories only) and on the last entry of
      * i_unsafe_iops.  Each is then waited on with that request's own timeout.
      *
      * Returns 0, or -EIO if either wait timed out.
2215  */
2216 static int unsafe_request_wait(struct inode *inode)
2217 {
2218         struct ceph_inode_info *ci = ceph_inode(inode);
2219         struct ceph_mds_request *dir_req = NULL;
             struct ceph_mds_request *ino_req = NULL;
2220         int err = 0;
2221
2222         spin_lock(&ci->i_unsafe_lock);
2223         if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
2224                 dir_req = list_last_entry(&ci->i_unsafe_dirops,
2225                                           struct ceph_mds_request,
2226                                           r_unsafe_dir_item);
2227                 ceph_mdsc_get_request(dir_req);
2228         }
2229         if (!list_empty(&ci->i_unsafe_iops)) {
2230                 ino_req = list_last_entry(&ci->i_unsafe_iops,
2231                                           struct ceph_mds_request,
2232                                           r_unsafe_target_item);
2233                 ceph_mdsc_get_request(ino_req);
2234         }
2235         spin_unlock(&ci->i_unsafe_lock);
2236
2237         dout("unsafe_request_wait %p wait on tid %llu %llu\n",
2238              inode, dir_req ? dir_req->r_tid : 0ULL,
                  ino_req ? ino_req->r_tid : 0ULL);
2239
             /* a zero return from the wait means the timeout expired */
2240         if (dir_req) {
2241                 if (!wait_for_completion_timeout(&dir_req->r_safe_completion,
2242                                 ceph_timeout_jiffies(dir_req->r_timeout)))
2243                         err = -EIO;
2244                 ceph_mdsc_put_request(dir_req);
2245         }
2246         if (ino_req) {
2247                 if (!wait_for_completion_timeout(&ino_req->r_safe_completion,
2248                                 ceph_timeout_jiffies(ino_req->r_timeout)))
2249                         err = -EIO;
2250                 ceph_mdsc_put_request(ino_req);
2251         }
2252
2253         return err;
2254 }
2255
2256 int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2257 {
             /*
              * fsync()/fdatasync() implementation.
              *
              * Data in [start, end] is written back and waited for first.  For
              * datasync that is all.  Otherwise this also waits for a pending
              * async create, flushes dirty caps, waits for unsafe MDS
              * requests and, if non-file caps were dirty, waits for the cap
              * flush to complete.
              *
              * "ret" carries the data writeback result and "err" the result of
              * the metadata steps.  A metadata error overrides ret; a
              * metadata success never clears a writeback error.
              *
              * Returns 0 on success or a negative errno.
              */
2258         struct ceph_file_info *fi = file->private_data;
2259         struct inode *inode = file->f_mapping->host;
2260         struct ceph_inode_info *ci = ceph_inode(inode);
2261         u64 flush_tid;
2262         int ret, err;
2263         int dirty;
2264
2265         dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
2266
2267         ret = file_write_and_wait_range(file, start, end);
2268         if (datasync)
2269                 goto out;
2270
             /*
              * Store the result in err, not ret: assigning it to ret would
              * discard a writeback error reported just above whenever the
              * async-create wait succeeds.
              */
2271         err = ceph_wait_on_async_create(inode);
2272         if (err) {
2273                 ret = err;
                     goto out;
             }
2274
2275         dirty = try_flush_caps(inode, &flush_tid);
2276         dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
2277
2278         err = unsafe_request_wait(inode);
2279
2280         /*
2281          * only wait on non-file metadata writeback (the mds
2282          * can recover size and mtime, so we don't need to
2283          * wait for that)
2284          */
2285         if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
2286                 err = wait_event_interruptible(ci->i_cap_wq,
2287                                         caps_are_flushed(inode, flush_tid));
2288         }
2289
2290         if (err < 0)
2291                 ret = err;
2292
             /* report a metadata error recorded on the inode since this file
              * last observed one */
2293         if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) {
2294                 spin_lock(&file->f_lock);
2295                 err = errseq_check_and_advance(&ci->i_meta_err,
2296                                                &fi->meta_err);
2297                 spin_unlock(&file->f_lock);
2298                 if (err < 0)
2299                         ret = err;
2300         }
2301 out:
2302         dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
2303         return ret;
2304 }
2305
2306 /*
2307  * Write dirty caps back to the mds on behalf of inode writeback.
      *
      * When wbc asks for WB_SYNC_ALL and is not part of a sync
      * (!wbc->for_sync), the caps are flushed now and the flush is waited
      * for.  Otherwise the inode is only moved to the front of the cap delay
2308  * queue, because we can get by with fewer MDS messages if we wait for
2309  * data writeback to complete first.
      *
      * Returns 0, or the result of an interrupted wait.
2310  */
2311 int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
2312 {
2313         struct ceph_inode_info *ci = ceph_inode(inode);
2314         struct ceph_mds_client *mdsc;
2315         u64 flush_tid;
2316         int err;
2317         int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync);
2318
2319         dout("write_inode %p wait=%d\n", inode, wait);
2320
2321         if (!wait) {
2322                 /* just queue the inode for an early flush */
2323                 mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
2324
2325                 spin_lock(&ci->i_ceph_lock);
2326                 if (__ceph_caps_dirty(ci))
2327                         __cap_delay_requeue_front(mdsc, ci);
2328                 spin_unlock(&ci->i_ceph_lock);
2329                 return 0;
2330         }
2331
2332         /* nothing dirty or flushing: nothing to wait for */
2333         if (!try_flush_caps(inode, &flush_tid))
2334                 return 0;
2335
             err = wait_event_interruptible(ci->i_cap_wq,
                                            caps_are_flushed(inode, flush_tid));
             return err;
2336 }
2337
/*
 * Re-send the cap flushes queued on ci->i_cap_flush_list to @session.
 *
 * Entered and left with ci->i_ceph_lock held (see the __releases /
 * __acquires annotations below); the lock is dropped around every send
 * and retaken before the next list entry is examined.
 *
 * Entries with cf->caps set are sent as CEPH_CAP_OP_FLUSH messages via
 * __prep_cap()/__send_cap().  Entries with cf->caps == 0 are the
 * cap_flush member of a ceph_cap_snap and are sent with
 * __send_flush_snap().  @oldest_flush_tid is forwarded unchanged to both.
 *
 * The walk stops with a pr_err() as soon as the inode's auth cap is
 * missing or is not held through @session.
 */
2338 static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
2339                                  struct ceph_mds_session *session,
2340                                  struct ceph_inode_info *ci,
2341                                  u64 oldest_flush_tid)
2342         __releases(ci->i_ceph_lock)
2343         __acquires(ci->i_ceph_lock)
2344 {
2345         struct inode *inode = &ci->vfs_inode;
2346         struct ceph_cap *cap;
2347         struct ceph_cap_flush *cf;
2348         int ret;
2349         u64 first_tid = 0;
2350         u64 last_snap_flush = 0;
2351
             /* we are kicking now, so the "kick later" marker can go */
2352         ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
2353
             /*
              * Walk backwards to find the tid of the last snap flush
              * (an entry with no caps) on the list; stays 0 if none.
              */
2354         list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
2355                 if (!cf->caps) {
2356                         last_snap_flush = cf->tid;
2357                         break;
2358                 }
2359         }
2360
2361         list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
                     /* skip anything below the next tid we intend to send */
2362                 if (cf->tid < first_tid)
2363                         continue;
2364
                     /*
                      * Re-read the auth cap on every pass: i_ceph_lock was
                      * released while the previous entry was being sent.
                      */
2365                 cap = ci->i_auth_cap;
2366                 if (!(cap && cap->session == session)) {
2367                         pr_err("%p auth cap %p not mds%d ???\n",
2368                                inode, cap, session->s_mds);
2369                         break;
2370                 }
2371
2372                 first_tid = cf->tid + 1;
2373
2374                 if (cf->caps) {
2375                         struct cap_msg_args arg;
2376
2377                         dout("kick_flushing_caps %p cap %p tid %llu %s\n",
2378                              inode, cap, cf->tid, ceph_cap_string(cf->caps));
                             /*
                              * CEPH_CLIENT_CAPS_PENDING_CAPSNAP is set when a
                              * snap flush with a higher tid is still queued
                              * behind this cap flush.
                              */
2379                         __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH,
2380                                          (cf->tid < last_snap_flush ?
2381                                           CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
2382                                           __ceph_caps_used(ci),
2383                                           __ceph_caps_wanted(ci),
2384                                           (cap->issued | cap->implemented),
2385                                           cf->caps, cf->tid, oldest_flush_tid);
                             /* message args are prepared under the lock, sent without it */
2386                         spin_unlock(&ci->i_ceph_lock);
2387                         __send_cap(&arg, ci);
2388                 } else {
2389                         struct ceph_cap_snap *capsnap =
2390                                         container_of(cf, struct ceph_cap_snap,
2391                                                     cap_flush);
2392                         dout("kick_flushing_caps %p capsnap %p tid %llu %s\n",
2393                              inode, capsnap, cf->tid,
2394                              ceph_cap_string(capsnap->dirty));
2395
                             /* hold a capsnap ref across the unlocked send; dropped below */
2396                         refcount_inc(&capsnap->nref);
2397                         spin_unlock(&ci->i_ceph_lock);
2398
2399                         ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
2400                                                 oldest_flush_tid);
                             /* a failed send is only logged; the walk carries on */
2401                         if (ret < 0) {
2402                                 pr_err("kick_flushing_caps: error sending "
2403                                         "cap flushsnap, ino (%llx.%llx) "
2404                                         "tid %llu follows %llu\n",
2405                                         ceph_vinop(inode), cf->tid,
2406                                         capsnap->follows);
2407                         }
2408
2409                         ceph_put_cap_snap(capsnap);
2410                 }
2411
                     /* both branches dropped the lock; retake it before moving on */
2412                 spin_lock(&ci->i_ceph_lock);
2413         }
2414 }
2415
2416 void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
2417                                    struct ceph_mds_session *session)
2418 {
2419         struct ceph_inode_info *ci;
2420         struct ceph_cap *cap;
2421         u64 oldest_flush_tid;
2422
2423         dout("early_kick_flushing_caps mds%d\n", session->s_mds);
2424
2425         spin_lock(&mdsc->cap_dirty_lock);
2426         oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2427         spin_unlock(&mdsc->cap_dirty_lock);
2428
2429         list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
2430                 spin_lock(&ci->i_ceph_lock);
2431                 cap = ci->i_auth_cap;
2432                 if (!(cap && cap->session == session)) {
2433                         pr_err("%p auth cap %p not mds%d ???\n",
2434                                 &ci->vfs_inode, cap, session->s_mds);
2435                         spin_unlock(&ci->i_ceph_lock);
2436                         continue;
2437                 }
2438
2439
2440                 /*
2441                  * if flushing caps were revoked, we re-send the cap flush
2442                  * in client reconnect stage. This guarantees MDS * processes
2443                  * the cap flush message before issuing the flushing caps to
2444                  * other client.
2445                  */
2446                 if ((cap->issued & ci->i_flushing_caps) !=
2447                     ci->i_flushing_caps) {
2448                         /* encode_caps_cb() also will reset these sequence
2449                          * numbers. make sure sequence numbers in cap flush
2450                          * message match later reconnect message */
2451                         cap->seq = 0;
2452                         cap->issue_seq = 0;
2453                         cap->mseq = 0;
2454                         __kick_flushing_caps(mdsc, session, ci,
2455                                              oldest_flush_tid);
2456                 } else {
2457                         ci->i_ceph_flags |= CEPH_I_KICK_FLUSH;
2458                 }
2459
2460                 spin_unlock(&ci->i_ceph_lock);
2461         }
2462 }
2463
2464 void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
2465                              struct ceph_mds_session *session)
2466 {
2467         struct ceph_inode_info *ci;
2468         struct ceph_cap *cap;
2469         u64 oldest_flush_tid;
2470
2471         lockdep_assert_held(&session->s_mutex);
2472
2473         dout("kick_flushing_caps mds%d\n", session->s_mds);
2474
2475         spin_lock(&mdsc->cap_dirty_lock);
2476         oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2477         spin_unlock(&mdsc->cap_dirty_lock);
2478
2479         list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
2480                 spin_lock(&ci->i_ceph_lock);
2481                 cap = ci->i_auth_cap;
2482                 if (!(cap && cap->session == session)) {
2483                         pr_err("%p auth cap %p not mds%d ???\n",
2484                                 &ci->vfs_inode, cap, session->s_mds);
2485                         spin_unlock(&ci->i_ceph_lock);
2486                         continue;
2487                 }
2488                 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
2489                         __kick_flushing_caps(mdsc, session, ci,
2490                                              oldest_flush_tid);
2491                 }
2492                 spin_unlock(&ci->i_ceph_lock);
2493         }
2494 }
2495
2496 void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
2497                                    struct ceph_inode_info *ci)
2498 {
2499         struct ceph_mds_client *mdsc = session->s_mdsc;
2500         struct ceph_cap *cap = ci->i_auth_cap;
2501
2502         lockdep_assert_held(&ci->i_ceph_lock);
2503
2504         dout("%s %p flushing %s\n", __func__, &ci->vfs_inode,
2505              ceph_cap_string(ci->i_flushing_caps));
2506
2507         if (!list_empty(&ci->i_cap_flush_list)) {
2508                 u64 oldest_flush_tid;
2509                 spin_lock(&mdsc->cap_dirty_lock);
2510                 list_move_tail(&ci->i_flushing_item,
2511                                &cap->session->s_cap_flushing);
2512                 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2513                 spin_unlock(&mdsc->cap_dirty_lock);
2514
2515                 __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
2516         }
2517 }
2518
2519
2520 /*
2521  * Take references to capabilities we hold, so that we don't release
2522  * them to the MDS prematurely.
2523  */
2524 void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
2525                             bool snap_rwsem_locked)
2526 {
2527         lockdep_assert_held(&ci->i_ceph_lock);
2528
2529         if (got & CEPH_CAP_PIN)
2530                 ci->i_pin_ref++;
2531         if (got & CEPH_CAP_FILE_RD)
2532                 ci->i_rd_ref++;
2533         if (got & CEPH_CAP_FILE_CACHE)
2534                 ci->i_rdcache_ref++;
2535         if (got & CEPH_CAP_FILE_EXCL)
2536                 ci->i_fx_ref++;
2537         if (got & CEPH_CAP_FILE_WR) {
2538                 if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
2539                         BUG_ON(!snap_rwsem_locked);
2540                         ci->i_head_snapc = ceph_get_snap_context(
2541                                         ci->i_snap_realm->cached_context);
2542                 }
2543                 ci->i_wr_ref++;
2544         }
2545         if (got & CEPH_CAP_FILE_BUFFER) {
2546                 if (ci->i_wb_ref == 0)
2547                         ihold(&ci->vfs_inode);
2548                 ci->i_wb_ref++;
2549                 dout("%s %p wb %d -> %d (?)\n", __func__,
2550                      &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
2551         }
2552 }
2553
2554 /*
2555  * Try to grab cap references.  Specify those refs we @want, and the
2556  * minimal set we @need.  Also include the larger offset we are writing
2557  * to (when applicable), and check against max_size here as well.
2558  * Note that caller is responsible for ensuring max_size increases are
2559  * requested from the MDS.
2560  *
2561  * Returns 0 if caps were not able to be acquired (yet), 1 if succeed,
2562  * or a negative error code. There are 3 special error codes:
2563  *  -EAGAIN: need to sleep but non-blocking is specified
2564  *  -EFBIG:  ask caller to call check_max_size() and try again.
2565  *  -ESTALE: ask caller to call ceph_renew_caps() and try again.
2566  */
/*
 * Extra bits carried in the "flags" argument of try_get_cap_refs(),
 * above the file-mode bits.
 */
enum {
        /* bits 0-7 are reserved for CEPH_FILE_MODE_FOO */
        NON_BLOCKING    = 0x100,        /* bit 8 */
        CHECK_FILELOCK  = 0x200,        /* bit 9 */
};
2572
/*
 * Make one attempt to take references on the @need caps (plus the
 * @want caps when all of them are currently held as well).
 *
 * @endoff is checked against i_max_size when FILE_WR is both held and
 * needed.  @flags carries the file-mode bits handed to
 * __ceph_touch_fmode() plus NON_BLOCKING / CHECK_FILELOCK.
 *
 * Return values, as set in the body:
 *   1        refs taken, *got holds the caps that were referenced
 *   0        caps not available right now
 *   -EIO     CHECK_FILELOCK set and the inode has CEPH_I_ERROR_FILELOCK,
 *            or the mount state is CEPH_MOUNT_SHUTDOWN or beyond
 *   -EFBIG   @endoff exceeds both i_max_size and i_requested_max_size
 *            and there is an auth cap
 *   -ESTALE  same size condition without an auth cap, or @need is not
 *            covered by what the MDS thinks we want
 *   -EAGAIN  NON_BLOCKING set and snap_rwsem could not be taken
 *            without sleeping
 *   -EROFS   WR/EXCL needed but the auth cap's session is read-only
 *
 * A 0 result is counted as a cap miss and a 1 result as a cap hit in
 * mdsc->metric.
 */
2573 static int try_get_cap_refs(struct inode *inode, int need, int want,
2574                             loff_t endoff, int flags, int *got)
2575 {
2576         struct ceph_inode_info *ci = ceph_inode(inode);
2577         struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
2578         int ret = 0;
2579         int have, implemented;
2580         bool snap_rwsem_locked = false;
2581
2582         dout("get_cap_refs %p need %s want %s\n", inode,
2583              ceph_cap_string(need), ceph_cap_string(want));
2584
     /* re-entered after sleeping in down_read(); snap_rwsem is then held */
2585 again:
2586         spin_lock(&ci->i_ceph_lock);
2587
2588         if ((flags & CHECK_FILELOCK) &&
2589             (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) {
2590                 dout("try_get_cap_refs %p error filelock\n", inode);
2591                 ret = -EIO;
2592                 goto out_unlock;
2593         }
2594
2595         /* finish pending truncate */
2596         while (ci->i_truncate_pending) {
                     /* both locks are dropped before doing the truncate */
2597                 spin_unlock(&ci->i_ceph_lock);
2598                 if (snap_rwsem_locked) {
2599                         up_read(&mdsc->snap_rwsem);
2600                         snap_rwsem_locked = false;
2601                 }
2602                 __ceph_do_pending_vmtruncate(inode);
2603                 spin_lock(&ci->i_ceph_lock);
2604         }
2605
2606         have = __ceph_caps_issued(ci, &implemented);
2607
2608         if (have & need & CEPH_CAP_FILE_WR) {
2609                 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
2610                         dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
2611                              inode, endoff, ci->i_max_size);
                             /*
                              * ret stays 0 when a large enough max_size has
                              * already been requested.
                              */
2612                         if (endoff > ci->i_requested_max_size)
2613                                 ret = ci->i_auth_cap ? -EFBIG : -ESTALE;
2614                         goto out_unlock;
2615                 }
2616                 /*
2617                  * If a sync write is in progress, we must wait, so that we
2618                  * can get a final snapshot value for size+mtime.
2619                  */
2620                 if (__ceph_have_pending_cap_snap(ci)) {
2621                         dout("get_cap_refs %p cap_snap_pending\n", inode);
2622                         goto out_unlock;
2623                 }
2624         }
2625
2626         if ((have & need) == need) {
2627                 /*
2628                  * Look at (implemented & ~have & not) so that we keep waiting
2629                  * on transition from wanted -> needed caps.  This is needed
2630                  * for WRBUFFER|WR -> WR to avoid a new WR sync write from
2631                  * going before a prior buffered writeback happens.
2632                  */
2633                 int not = want & ~(have & need);
2634                 int revoking = implemented & ~have;
2635                 dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
2636                      inode, ceph_cap_string(have), ceph_cap_string(not),
2637                      ceph_cap_string(revoking));
2638                 if ((revoking & not) == 0) {
                             /*
                              * snap_rwsem is only needed when FILE_WR is needed
                              * and there is no head snap context yet.
                              */
2639                         if (!snap_rwsem_locked &&
2640                             !ci->i_head_snapc &&
2641                             (need & CEPH_CAP_FILE_WR)) {
2642                                 if (!down_read_trylock(&mdsc->snap_rwsem)) {
2643                                         /*
2644                                          * we can not call down_read() when
2645                                          * task isn't in TASK_RUNNING state
2646                                          */
2647                                         if (flags & NON_BLOCKING) {
2648                                                 ret = -EAGAIN;
2649                                                 goto out_unlock;
2650                                         }
2651
                                             /*
                                              * Drop the spinlock before sleeping on
                                              * the rwsem, then redo all checks.
                                              */
2652                                         spin_unlock(&ci->i_ceph_lock);
2653                                         down_read(&mdsc->snap_rwsem);
2654                                         snap_rwsem_locked = true;
2655                                         goto again;
2656                                 }
2657                                 snap_rwsem_locked = true;
2658                         }
                             /* hand out @want too only when all of it is held */
2659                         if ((have & want) == want)
2660                                 *got = need | want;
2661                         else
2662                                 *got = need;
2663                         ceph_take_cap_refs(ci, *got, true);
2664                         ret = 1;
2665                 }
2666         } else {
2667                 int session_readonly = false;
2668                 int mds_wanted;
2669                 if (ci->i_auth_cap &&
2670                     (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
2671                         struct ceph_mds_session *s = ci->i_auth_cap->session;
2672                         spin_lock(&s->s_cap_lock);
2673                         session_readonly = s->s_readonly;
2674                         spin_unlock(&s->s_cap_lock);
2675                 }
2676                 if (session_readonly) {
2677                         dout("get_cap_refs %p need %s but mds%d readonly\n",
2678                              inode, ceph_cap_string(need), ci->i_auth_cap->mds);
2679                         ret = -EROFS;
2680                         goto out_unlock;
2681                 }
2682
2683                 if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
2684                         dout("get_cap_refs %p forced umount\n", inode);
2685                         ret = -EIO;
2686                         goto out_unlock;
2687                 }
2688                 mds_wanted = __ceph_caps_mds_wanted(ci, false);
2689                 if (need & ~mds_wanted) {
2690                         dout("get_cap_refs %p need %s > mds_wanted %s\n",
2691                              inode, ceph_cap_string(need),
2692                              ceph_cap_string(mds_wanted));
2693                         ret = -ESTALE;
2694                         goto out_unlock;
2695                 }
2696
2697                 dout("get_cap_refs %p have %s need %s\n", inode,
2698                      ceph_cap_string(have), ceph_cap_string(need));
2699         }
     /* common exit: i_ceph_lock is held here on every path */
2700 out_unlock:
2701
2702         __ceph_touch_fmode(ci, mdsc, flags);
2703
2704         spin_unlock(&ci->i_ceph_lock);
2705         if (snap_rwsem_locked)
2706                 up_read(&mdsc->snap_rwsem);
2707
2708         if (!ret)
2709                 ceph_update_cap_mis(&mdsc->metric);
2710         else if (ret == 1)
2711                 ceph_update_cap_hit(&mdsc->metric);
2712
             /* NOTE(review): *got is read here even when this call did not set it */
2713         dout("get_cap_refs %p ret %d got %s\n", inode,
2714              ret, ceph_cap_string(*got));
2715         return ret;
2716 }
2717
2718 /*
2719  * Check the offset we are writing up to against our current
2720  * max_size.  If necessary, tell the MDS we want to write to
2721  * a larger offset.
2722  */
2723 static void check_max_size(struct inode *inode, loff_t endoff)
2724 {
2725         struct ceph_inode_info *ci = ceph_inode(inode);
2726         int check = 0;
2727
2728         /* do we need to explicitly request a larger max_size? */
2729         spin_lock(&ci->i_ceph_lock);
2730         if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {
2731                 dout("write %p at large endoff %llu, req max_size\n",
2732                      inode, endoff);
2733                 ci->i_wanted_max_size = endoff;
2734         }
2735         /* duplicate ceph_check_caps()'s logic */
2736         if (ci->i_auth_cap &&
2737             (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) &&
2738             ci->i_wanted_max_size > ci->i_max_size &&
2739             ci->i_wanted_max_size > ci->i_requested_max_size)
2740                 check = 1;
2741         spin_unlock(&ci->i_ceph_lock);
2742         if (check)
2743                 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2744 }
2745
2746 static inline int get_used_fmode(int caps)
2747 {
2748         int fmode = 0;
2749         if (caps & CEPH_CAP_FILE_RD)
2750                 fmode |= CEPH_FILE_MODE_RD;
2751         if (caps & CEPH_CAP_FILE_WR)
2752                 fmode |= CEPH_FILE_MODE_WR;
2753         return fmode;
2754 }
2755
2756 int ceph_try_get_caps(struct inode *inode, int need, int want,
2757                       bool nonblock, int *got)
2758 {
2759         int ret, flags;
2760
2761         BUG_ON(need & ~CEPH_CAP_FILE_RD);
2762         BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO |
2763                         CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
2764                         CEPH_CAP_ANY_DIR_OPS));
2765         if (need) {
2766                 ret = ceph_pool_perm_check(inode, need);
2767                 if (ret < 0)
2768                         return ret;
2769         }
2770
2771         flags = get_used_fmode(need | want);
2772         if (nonblock)
2773                 flags |= NON_BLOCKING;
2774
2775         ret = try_get_cap_refs(inode, need, want, 0, flags, got);
2776         /* three special error codes */
2777         if (ret == -EAGAIN || ret == -EFBIG || ret == -ESTALE)
2778                 ret = 0;
2779         return ret;
2780 }
2781
2782 /*
2783  * Wait for caps, and take cap references.  If we can't get a WR cap
2784  * due to a small max_size, make sure we check_max_size (and possibly
2785  * ask the mds) so we don't get hung up indefinitely.
      *
      * On success returns 0 and stores the referenced caps in *got.
      *
      * Error returns visible in the body:
      *   -EBADF        the file is open for write and its filp_gen no
      *                 longer matches fsc->filp_gen (checked on entry and
      *                 again after each attempt)
      *   -ERESTARTSYS  a signal became pending while waiting for caps
      *   any negative result of ceph_pool_perm_check(),
      *   ceph_wait_on_async_create(), ceph_renew_caps(),
      *   __ceph_do_getattr(), or other errors from try_get_cap_refs()
2786  */
2787 int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got)
2788 {
2789         struct ceph_file_info *fi = filp->private_data;
2790         struct inode *inode = file_inode(filp);
2791         struct ceph_inode_info *ci = ceph_inode(inode);
2792         struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
2793         int ret, _got, flags;
2794
2795         ret = ceph_pool_perm_check(inode, need);
2796         if (ret < 0)
2797                 return ret;
2798
2799         if ((fi->fmode & CEPH_FILE_MODE_WR) &&
2800             fi->filp_gen != READ_ONCE(fsc->filp_gen))
2801                 return -EBADF;
2802
2803         flags = get_used_fmode(need | want);
2804
2805         while (true) {
                     /*
                      * Start each round from the plain file-mode bits; this
                      * drops NON_BLOCKING / CHECK_FILELOCK left over from a
                      * previous round.
                      */
2806                 flags &= CEPH_FILE_MODE_MASK;
2807                 if (atomic_read(&fi->num_locks))
2808                         flags |= CHECK_FILELOCK;
2809                 _got = 0;
2810                 ret = try_get_cap_refs(inode, need, want, endoff,
2811                                        flags, &_got);
                     /* NON_BLOCKING is not set here, so -EAGAIN is unexpected */
2812                 WARN_ON_ONCE(ret == -EAGAIN);
2813                 if (!ret) {
2814                         struct ceph_mds_client *mdsc = fsc->mdsc;
2815                         struct cap_wait cw;
2816                         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2817
                             /* describe this waiter on mdsc->cap_wait_list */
2818                         cw.ino = ceph_ino(inode);
2819                         cw.tgid = current->tgid;
2820                         cw.need = need;
2821                         cw.want = want;
2822
2823                         spin_lock(&mdsc->caps_list_lock);
2824                         list_add(&cw.list, &mdsc->cap_wait_list);
2825                         spin_unlock(&mdsc->caps_list_lock);
2826
2827                         /* make sure used fmode not timeout */
2828                         ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
2829                         add_wait_queue(&ci->i_cap_wq, &wait);
2830
                             /*
                              * Retry in non-blocking mode while queued on
                              * i_cap_wq, until we get a non-zero result or a
                              * signal is pending.
                              */
2831                         flags |= NON_BLOCKING;
2832                         while (!(ret = try_get_cap_refs(inode, need, want,
2833                                                         endoff, flags, &_got))) {
2834                                 if (signal_pending(current)) {
2835                                         ret = -ERESTARTSYS;
2836                                         break;
2837                                 }
2838                                 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
2839                         }
2840
2841                         remove_wait_queue(&ci->i_cap_wq, &wait);
2842                         ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
2843
2844                         spin_lock(&mdsc->caps_list_lock);
2845                         list_del(&cw.list);
2846                         spin_unlock(&mdsc->caps_list_lock);
2847
                             /*
                              * -EAGAIN from the non-blocking retry: go around
                              * the outer loop and try again in blocking mode.
                              */
2848                         if (ret == -EAGAIN)
2849                                 continue;
2850                 }
2851
                     /* re-check filp_gen; drop any refs we just took */
2852                 if ((fi->fmode & CEPH_FILE_MODE_WR) &&
2853                     fi->filp_gen != READ_ONCE(fsc->filp_gen)) {
2854                         if (ret >= 0 && _got)
2855                                 ceph_put_cap_refs(ci, _got);
2856                         return -EBADF;
2857                 }
2858
2859                 if (ret < 0) {
2860                         if (ret == -EFBIG || ret == -ESTALE) {
2861                                 int ret2 = ceph_wait_on_async_create(inode);
2862                                 if (ret2 < 0)
2863                                         return ret2;
2864                         }
2865                         if (ret == -EFBIG) {
2866                                 check_max_size(inode, endoff);
2867                                 continue;
2868                         }
2869                         if (ret == -ESTALE) {
2870                                 /* session was killed, try renew caps */
2871                                 ret = ceph_renew_caps(inode, flags);
2872                                 if (ret == 0)
2873                                         continue;
2874                         }
2875                         return ret;
2876                 }
2877
                     /*
                      * Regular file with inline data, non-zero size, and we
                      * got FILE_CACHE or FILE_LAZYIO: page 0 must be in the
                      * page cache and uptodate before we return.
                      */
2878                 if (S_ISREG(ci->vfs_inode.i_mode) &&
2879                     ci->i_inline_version != CEPH_INLINE_NONE &&
2880                     (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
2881                     i_size_read(inode) > 0) {
2882                         struct page *page =
2883                                 find_get_page(inode->i_mapping, 0);
2884                         if (page) {
2885                                 bool uptodate = PageUptodate(page);
2886
2887                                 put_page(page);
2888                                 if (uptodate)
2889                                         break;
2890                         }
2891                         /*
2892                          * drop cap refs first because getattr while
2893                          * holding caps refs can cause deadlock.
2894                          */
2895                         ceph_put_cap_refs(ci, _got);
2896                         _got = 0;
2897
2898                         /*
2899                          * getattr request will bring inline data into
2900                          * page cache
2901                          */
2902                         ret = __ceph_do_getattr(inode, NULL,
2903                                                 CEPH_STAT_CAP_INLINE_DATA,
2904                                                 true);
2905                         if (ret < 0)
2906                                 return ret;
2907                         continue;
2908                 }
2909                 break;
2910         }
2911         *got = _got;
2912         return 0;
2913 }
2914
2915 /*
2916  * Take cap refs.  Caller must already know we hold at least one ref
2917  * on the caps in question or we don't know this is safe.
      *
      * Takes ci->i_ceph_lock itself and calls ceph_take_cap_refs() with
      * snap_rwsem_locked == false.
2918  */
2919 void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
2920 {
2921         spin_lock(&ci->i_ceph_lock);
2922         ceph_take_cap_refs(ci, caps, false);
2923         spin_unlock(&ci->i_ceph_lock);
2924 }
2925
2926
2927 /*
2928  * drop cap_snap that is not associated with any snapshot.
2929  * we don't need to send FLUSHSNAP message for it.
2930  */
2931 static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
2932                                   struct ceph_cap_snap *capsnap)
2933 {
2934         if (!capsnap->need_flush &&
2935             !capsnap->writing && !capsnap->dirty_pages) {
2936                 dout("dropping cap_snap %p follows %llu\n",
2937                      capsnap, capsnap->follows);
2938                 BUG_ON(capsnap->cap_flush.tid > 0);
2939                 ceph_put_snap_context(capsnap->context);
2940                 if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps))
2941                         ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
2942
2943                 list_del(&capsnap->ci_item);
2944                 ceph_put_cap_snap(capsnap);
2945                 return 1;
2946         }
2947         return 0;
2948 }
2949
/*
 * How __ceph_put_cap_refs() follows up after dropping references.
 */
enum put_cap_refs_mode {
        PUT_CAP_REFS_SYNC     = 0,      /* check caps / flush snaps directly */
        PUT_CAP_REFS_NO_CHECK = 1,      /* no follow-up work */
        PUT_CAP_REFS_ASYNC    = 2,      /* queue the follow-up work */
};
2955
2956 /*
2957  * Release cap refs.
2958  *
2959  * If we released the last ref on any given cap, call ceph_check_caps
2960  * to release (or schedule a release).
2961  *
2962  * If we are releasing a WR cap (from a sync write), finalize any affected
2963  * cap_snap, and wake up any waiters.
      *
      * @had is the set of cap bits whose references are dropped.  @mode
      * selects the follow-up once i_ceph_lock has been released:
      *   PUT_CAP_REFS_SYNC      call ceph_check_caps() / ceph_flush_snaps()
      *   PUT_CAP_REFS_ASYNC     call ceph_queue_check_caps() /
      *                          ceph_queue_flush_snaps()
      *   any other value        no follow-up
      * In both active modes the cap check takes precedence over the snap
      * flush.
2964  */
2965 static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
2966                                 enum put_cap_refs_mode mode)
2967 {
2968         struct inode *inode = &ci->vfs_inode;
             /*
              * last: number of counters that dropped to zero (PIN excluded)
              * put:  inode references to release with iput() at the end
              */
2969         int last = 0, put = 0, flushsnaps = 0, wake = 0;
2970         bool check_flushsnaps = false;
2971
2972         spin_lock(&ci->i_ceph_lock);
2973         if (had & CEPH_CAP_PIN)
2974                 --ci->i_pin_ref;
2975         if (had & CEPH_CAP_FILE_RD)
2976                 if (--ci->i_rd_ref == 0)
2977                         last++;
2978         if (had & CEPH_CAP_FILE_CACHE)
2979                 if (--ci->i_rdcache_ref == 0)
2980                         last++;
2981         if (had & CEPH_CAP_FILE_EXCL)
2982                 if (--ci->i_fx_ref == 0)
2983                         last++;
2984         if (had & CEPH_CAP_FILE_BUFFER) {
2985                 if (--ci->i_wb_ref == 0) {
2986                         last++;
2987                         /* put the ref held by ceph_take_cap_refs() */
2988                         put++;
2989                         check_flushsnaps = true;
2990                 }
2991                 dout("put_cap_refs %p wb %d -> %d (?)\n",
2992                      inode, ci->i_wb_ref+1, ci->i_wb_ref);
2993         }
2994         if (had & CEPH_CAP_FILE_WR) {
2995                 if (--ci->i_wr_ref == 0) {
2996                         last++;
2997                         check_flushsnaps = true;
                             /*
                              * The head snap context is released once there
                              * are no head wrbuffer refs and no dirty or
                              * flushing caps left.
                              */
2998                         if (ci->i_wrbuffer_ref_head == 0 &&
2999                             ci->i_dirty_caps == 0 &&
3000                             ci->i_flushing_caps == 0) {
3001                                 BUG_ON(!ci->i_head_snapc);
3002                                 ceph_put_snap_context(ci->i_head_snapc);
3003                                 ci->i_head_snapc = NULL;
3004                         }
3005                         /* see comment in __ceph_remove_cap() */
3006                         if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm)
3007                                 drop_inode_snap_realm(ci);
3008                 }
3009         }
             /*
              * The last WR or BUFFER ref went away while a cap_snap is
              * pending: stop treating the newest cap_snap as being written,
              * then either drop it or try to finish it.
              */
3010         if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) {
3011                 struct ceph_cap_snap *capsnap =
3012                         list_last_entry(&ci->i_cap_snaps,
3013                                         struct ceph_cap_snap,
3014                                         ci_item);
3015
3016                 capsnap->writing = 0;
3017                 if (ceph_try_drop_cap_snap(ci, capsnap))
3018                         /* put the ref held by ceph_queue_cap_snap() */
3019                         put++;
3020                 else if (__ceph_finish_cap_snap(ci, capsnap))
3021                         flushsnaps = 1;
3022                 wake = 1;
3023         }
3024         spin_unlock(&ci->i_ceph_lock);
3025
3026         dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
3027              last ? " last" : "", put ? " put" : "");
3028
             /* all follow-up work below runs without i_ceph_lock */
3029         switch (mode) {
3030         case PUT_CAP_REFS_SYNC:
3031                 if (last)
3032                         ceph_check_caps(ci, 0, NULL);
3033                 else if (flushsnaps)
3034                         ceph_flush_snaps(ci, NULL);
3035                 break;
3036         case PUT_CAP_REFS_ASYNC:
3037                 if (last)
3038                         ceph_queue_check_caps(inode);
3039                 else if (flushsnaps)
3040                         ceph_queue_flush_snaps(inode);
3041                 break;
3042         default:
3043                 break;
3044         }
3045         if (wake)
3046                 wake_up_all(&ci->i_cap_wq);
3047         while (put-- > 0)
3048                 iput(inode);
3049 }
3050
3051 void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
3052 {
             /*
              * Drop the @had cap references on @ci in PUT_CAP_REFS_SYNC mode:
              * when follow-up work is needed, __ceph_put_cap_refs() calls
              * ceph_check_caps() or ceph_flush_snaps() directly before
              * returning.
              */
3053         __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_SYNC);
3054 }
3055
3056 void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had)
3057 {
             /*
              * Drop the @had cap references on @ci in PUT_CAP_REFS_ASYNC mode:
              * follow-up work is handed to ceph_queue_check_caps() or
              * ceph_queue_flush_snaps() instead of being run inline.
              */
3058         __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC);
3059 }
3060
3061 void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had)
3062 {
             /*
              * Drop the @had cap references on @ci in PUT_CAP_REFS_NO_CHECK
              * mode.  The mode switch in __ceph_put_cap_refs() has explicit
              * cases only for SYNC and ASYNC, so this mode presumably lands in
              * the default branch: no cap check or snap flush is started, but
              * waiters are still woken and inode references still dropped.
              */
3063         __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_NO_CHECK);
3064 }
3065
3066 /*
3067  * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
3068  * context.  Adjust per-snap dirty page accounting as appropriate.
3069  * Once all dirty data for a cap_snap is flushed, flush snapped file
3070  * metadata back to the MDS.  If we dropped the last ref, call
3071  * ceph_check_caps.
3072  */
3073 void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
3074                                 struct ceph_snap_context *snapc)
3075 {
3076         struct inode *inode = &ci->vfs_inode;
3077         struct ceph_cap_snap *capsnap = NULL;
3078         int put = 0;
3079         bool last = false;
3080         bool found = false;
3081         bool flush_snaps = false;
3082         bool complete_capsnap = false;
3083
3084         spin_lock(&ci->i_ceph_lock);
3085         ci->i_wrbuffer_ref -= nr;
3086         if (ci->i_wrbuffer_ref == 0) {
3087                 last = true;
3088                 put++;
3089         }
3090
3091         if (ci->i_head_snapc == snapc) {
3092                 ci->i_wrbuffer_ref_head -= nr;
3093                 if (ci->i_wrbuffer_ref_head == 0 &&
3094                     ci->i_wr_ref == 0 &&
3095                     ci->i_dirty_caps == 0 &&
3096                     ci->i_flushing_caps == 0) {
3097                         BUG_ON(!ci->i_head_snapc);
3098                         ceph_put_snap_context(ci->i_head_snapc);
3099                         ci->i_head_snapc = NULL;
3100                 }
3101                 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
3102                      inode,
3103                      ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
3104                      ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
3105                      last ? " LAST" : "");
3106         } else {
3107                 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
3108                         if (capsnap->context == snapc) {
3109                                 found = true;
3110                                 break;
3111                         }
3112                 }
3113                 BUG_ON(!found);
3114                 capsnap->dirty_pages -= nr;
3115                 if (capsnap->dirty_pages == 0) {
3116                         complete_capsnap = true;
3117                         if (!capsnap->writing) {
3118                                 if (ceph_try_drop_cap_snap(ci, capsnap)) {
3119                                         put++;
3120                                 } else {
3121                                         ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
3122                                         flush_snaps = true;
3123                                 }
3124                         }
3125                 }
3126                 dout("put_wrbuffer_cap_refs on %p cap_snap %p "
3127                      " snap %lld %d/%d -> %d/%d %s%s\n",
3128                      inode, capsnap, capsnap->context->seq,
3129                      ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
3130                      ci->i_wrbuffer_ref, capsnap->dirty_pages,
3131                      last ? " (wrbuffer last)" : "",
3132                      complete_capsnap ? " (complete capsnap)" : "");
3133         }
3134
3135         spin_unlock(&ci->i_ceph_lock);
3136
3137         if (last) {
3138                 ceph_check_caps(ci, 0, NULL);
3139         } else if (flush_snaps) {
3140                 ceph_flush_snaps(ci, NULL);
3141         }
3142         if (complete_capsnap)
3143                 wake_up_all(&ci->i_cap_wq);
3144         while (put-- > 0) {
3145                 iput(inode);
3146         }
3147 }
3148
/*
 * Invalidate every alias of an unlinked inode so that the inode can be
 * dropped as soon as possible.
 *
 * d_find_alias() returns a referenced dentry.  The reference on the most
 * recently invalidated alias is held until the next lookup so that seeing
 * the same dentry twice in a row can be detected and the loop ended.
 */
static void invalidate_aliases(struct inode *inode)
{
        struct dentry *last = NULL;

        dout("invalidate_aliases inode %p\n", inode);
        d_prune_aliases(inode);

        /*
         * For non-directory inode, d_find_alias() only returns
         * hashed dentry. After calling d_invalidate(), the
         * dentry becomes unhashed.
         *
         * For directory inode, d_find_alias() can return
         * unhashed dentry. But directory inode should have
         * one alias at most.
         */
        for (;;) {
                struct dentry *alias = d_find_alias(inode);

                if (!alias)
                        break;

                if (alias == last) {
                        /* got the same dentry again: drop the new ref, stop */
                        dput(alias);
                        break;
                }

                d_invalidate(alias);
                if (last)
                        dput(last);
                last = alias;
        }

        if (last)
                dput(last);
}
3180
3181 struct cap_extra_info {
             /*
              * Additional values handed to handle_cap_grant() next to the raw
              * struct ceph_mds_caps message.
              */
             /*
              * layout pool namespace; handle_cap_grant() installs it on the
              * inode and stores the inode's previous namespace back here
              */
3182         struct ceph_string *pool_ns;
3183         /* inline data */
             /*
              * inline_data/inline_len are passed to ceph_fill_inline_data()
              * once handle_cap_grant() has accepted inline_version
              */
3184         u64 inline_version;
3185         void *inline_data;
3186         u32 inline_len;
3187         /* dirstat */
             /* nfiles/nsubdirs are only copied to the inode if dirstat_valid */
3188         bool dirstat_valid;
3189         u64 nfiles;
3190         u64 nsubdirs;
             /* passed to inode_set_max_iversion_raw() by handle_cap_grant() */
3191         u64 change_attr;
3192         /* currently issued */
             /* cap bits used to test for *_EXCL caps before overwriting fields */
3193         int issued;
             /* copied to ci->i_btime together with mode/uid/gid */
3194         struct timespec64 btime;
3195 };
3196
3197 /*
3198  * Handle a cap GRANT message from the MDS.  (Note that a GRANT may
3199  * actually be a revocation if it specifies a smaller cap set.)
3200  *
3201  * caller holds s_mutex and i_ceph_lock, we drop both.
      *
      * @inode:      inode the message refers to
      * @session:    MDS session the message arrived on; its s_mutex is
      *              unlocked before the final ceph_check_caps() call
      * @cap:        this session's cap on @inode, updated in place
      * @grant:      little-endian wire message with the new cap bits and
      *              the inode metadata that accompanies them
      * @xattr_buf:  xattr blob from the message; a reference is taken if it
      *              replaces the inode's cached blob
      * @extra_info: additional decoded values (struct cap_extra_info); when
      *              the layout is updated, ->pool_ns is overwritten with the
      *              inode's previous pool namespace
      *
      * snap_rwsem is released (up_read) only on the CEPH_CAP_OP_IMPORT path
      * for the auth cap.  NOTE(review): presumably the caller holds it only
      * in that case -- confirm against the call site.
      *
      * Follow-up work (inline data fill, queued truncate / writeback /
      * invalidate, alias invalidation, wakeups, cap check) is recorded in
      * local flags while the inode is updated and carried out after
      * i_ceph_lock has been dropped.
3202  */
3203 static void handle_cap_grant(struct inode *inode,
3204                              struct ceph_mds_session *session,
3205                              struct ceph_cap *cap,
3206                              struct ceph_mds_caps *grant,
3207                              struct ceph_buffer *xattr_buf,
3208                              struct cap_extra_info *extra_info)
3209         __releases(ci->i_ceph_lock)
3210         __releases(session->s_mdsc->snap_rwsem)
3211 {
3212         struct ceph_inode_info *ci = ceph_inode(inode);
3213         int seq = le32_to_cpu(grant->seq);
3214         int newcaps = le32_to_cpu(grant->caps);
3215         int used, wanted, dirty;
3216         u64 size = le64_to_cpu(grant->size);
3217         u64 max_size = le64_to_cpu(grant->max_size);
             /* 0: no cap check, 1: check auth cap only, 2: check all caps */
3218         unsigned char check_caps = 0;
             /* cap was last updated under an older session cap generation */
3219         bool was_stale = cap->cap_gen < atomic_read(&session->s_cap_gen);
3220         bool wake = false;
3221         bool writeback = false;
3222         bool queue_trunc = false;
3223         bool queue_invalidate = false;
3224         bool deleted_inode = false;
3225         bool fill_inline = false;
3226
3227         dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
3228              inode, cap, session->s_mds, seq, ceph_cap_string(newcaps));
3229         dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
3230                 i_size_read(inode));
3231
3232
3233         /*
3234          * If CACHE is being revoked, and we have no dirty buffers,
3235          * try to invalidate (once).  (If there are dirty buffers, we
3236          * will invalidate _after_ writeback.)
3237          */
3238         if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */
3239             ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
3240             (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
3241             !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
3242                 if (try_nonblocking_invalidate(inode)) {
3243                         /* there were locked pages.. invalidate later
3244                            in a separate thread. */
3245                         if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
3246                                 queue_invalidate = true;
3247                                 ci->i_rdcache_revoking = ci->i_rdcache_gen;
3248                         }
3249                 }
3250         }
3251
             /* a stale cap keeps nothing but PIN before the new bits apply */
3252         if (was_stale)
3253                 cap->issued = cap->implemented = CEPH_CAP_PIN;
3254
3255         /*
3256          * auth mds of the inode changed. we received the cap export message,
3257          * but still haven't received the cap import message. handle_cap_export
3258          * updated the new auth MDS' cap.
3259          *
3260          * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
3261          * that was sent before the cap import message. So don't remove caps.
3262          */
3263         if (ceph_seq_cmp(seq, cap->seq) <= 0) {
3264                 WARN_ON(cap != ci->i_auth_cap);
3265                 WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
3266                 seq = cap->seq;
3267                 newcaps |= cap->issued;
3268         }
3269
3270         /* side effects now are allowed */
3271         cap->cap_gen = atomic_read(&session->s_cap_gen);
3272         cap->seq = seq;
3273
3274         __check_cap_issue(ci, cap, newcaps);
3275
3276         inode_set_max_iversion_raw(inode, extra_info->change_attr);
3277
             /*
              * mode/uid/gid/btime come from the MDS when AUTH_SHARED is
              * granted and AUTH_EXCL is not among the issued caps
              */
3278         if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
3279             (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) {
3280                 umode_t mode = le32_to_cpu(grant->mode);
3281
3282                 if (inode_wrong_type(inode, mode))
3283                         pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
3284                                      ceph_vinop(inode), inode->i_mode, mode);
3285                 else
3286                         inode->i_mode = mode;
3287                 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
3288                 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
3289                 ci->i_btime = extra_info->btime;
3290                 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
3291                      from_kuid(&init_user_ns, inode->i_uid),
3292                      from_kgid(&init_user_ns, inode->i_gid));
3293         }
3294
             /*
              * link count: same pattern with the LINK caps; a count of zero
              * schedules alias invalidation after the lock is dropped
              */
3295         if ((newcaps & CEPH_CAP_LINK_SHARED) &&
3296             (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) {
3297                 set_nlink(inode, le32_to_cpu(grant->nlink));
3298                 if (inode->i_nlink == 0 &&
3299                     (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
3300                         deleted_inode = true;
3301         }
3302
             /*
              * xattrs: only a strictly newer version replaces the cached
              * blob; cached ACLs and the security context are invalidated too
              */
3303         if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == 0 &&
3304             grant->xattr_len) {
3305                 int len = le32_to_cpu(grant->xattr_len);
3306                 u64 version = le64_to_cpu(grant->xattr_version);
3307
3308                 if (version > ci->i_xattrs.version) {
3309                         dout(" got new xattrs v%llu on %p len %d\n",
3310                              version, inode, len);
3311                         if (ci->i_xattrs.blob)
3312                                 ceph_buffer_put(ci->i_xattrs.blob);
3313                         ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
3314                         ci->i_xattrs.version = version;
3315                         ceph_forget_all_cached_acls(inode);
3316                         ceph_security_invalidate_secctx(inode);
3317                 }
3318         }
3319
3320         if (newcaps & CEPH_CAP_ANY_RD) {
3321                 struct timespec64 mtime, atime, ctime;
3322                 /* ctime/mtime/atime? */
3323                 ceph_decode_timespec64(&mtime, &grant->mtime);
3324                 ceph_decode_timespec64(&atime, &grant->atime);
3325                 ceph_decode_timespec64(&ctime, &grant->ctime);
3326                 ceph_fill_file_time(inode, extra_info->issued,
3327                                     le32_to_cpu(grant->time_warp_seq),
3328                                     &ctime, &mtime, &atime);
3329         }
3330
3331         if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) {
3332                 ci->i_files = extra_info->nfiles;
3333                 ci->i_subdirs = extra_info->nsubdirs;
3334         }
3335
3336         if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
3337                 /* file layout may have changed */
3338                 s64 old_pool = ci->i_layout.pool_id;
3339                 struct ceph_string *old_ns;
3340
3341                 ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout);
3342                 old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
3343                                         lockdep_is_held(&ci->i_ceph_lock));
3344                 rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns);
3345
                     /* a different pool or namespace clears CEPH_I_POOL_PERM */
3346                 if (ci->i_layout.pool_id != old_pool ||
3347                     extra_info->pool_ns != old_ns)
3348                         ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
3349
                     /*
                      * hand the previous namespace back through extra_info.
                      * NOTE(review): presumably the caller drops that
                      * reference -- confirm.
                      */
3350                 extra_info->pool_ns = old_ns;
3351
3352                 /* size/truncate_seq? */
3353                 queue_trunc = ceph_fill_file_size(inode, extra_info->issued,
3354                                         le32_to_cpu(grant->truncate_seq),
3355                                         le64_to_cpu(grant->truncate_size),
3356                                         size);
3357         }
3358
             /* max_size is only taken from the auth cap, and only with a WR cap */
3359         if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) {
3360                 if (max_size != ci->i_max_size) {
3361                         dout("max_size %lld -> %llu\n",
3362                              ci->i_max_size, max_size);
3363                         ci->i_max_size = max_size;
3364                         if (max_size >= ci->i_wanted_max_size) {
3365                                 ci->i_wanted_max_size = 0;  /* reset */
3366                                 ci->i_requested_max_size = 0;
3367                         }
3368                         wake = true;
3369                 }
3370         }
3371
3372         /* check cap bits */
3373         wanted = __ceph_caps_wanted(ci);
3374         used = __ceph_caps_used(ci);
3375         dirty = __ceph_caps_dirty(ci);
3376         dout(" my wanted = %s, used = %s, dirty %s\n",
3377              ceph_cap_string(wanted),
3378              ceph_cap_string(used),
3379              ceph_cap_string(dirty));
3380
3381         if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) &&
3382             (wanted & ~(cap->mds_wanted | newcaps))) {
3383                 /*
3384                  * If mds is importing cap, prior cap messages that update
3385                  * 'wanted' may get dropped by mds (migrate seq mismatch).
3386                  *
3387                  * We don't send cap message to update 'wanted' if what we
3388                  * want are already issued. If mds revokes caps, cap message
3389                  * that releases caps also tells mds what we want. But if
3390                  * caps got revoked by mds forcedly (session stale). We may
3391                  * haven't told mds what we want.
3392                  */
3393                 check_caps = 1;
3394         }
3395
3396         /* revocation, grant, or no-op? */
3397         if (cap->issued & ~newcaps) {
3398                 int revoking = cap->issued & ~newcaps;
3399
3400                 dout("revocation: %s -> %s (revoking %s)\n",
3401                      ceph_cap_string(cap->issued),
3402                      ceph_cap_string(newcaps),
3403                      ceph_cap_string(revoking));
3404                 if (S_ISREG(inode->i_mode) &&
3405                     (revoking & used & CEPH_CAP_FILE_BUFFER))
3406                         writeback = true;  /* initiate writeback; will delay ack */
3407                 else if (queue_invalidate &&
3408                          revoking == CEPH_CAP_FILE_CACHE &&
3409                          (newcaps & CEPH_CAP_FILE_LAZYIO) == 0)
3410                         ; /* do nothing yet, invalidation will be queued */
3411                 else if (cap == ci->i_auth_cap)
3412                         check_caps = 1; /* check auth cap only */
3413                 else
3414                         check_caps = 2; /* check all caps */
3415                 cap->issued = newcaps;
3416                 cap->implemented |= newcaps;
3417         } else if (cap->issued == newcaps) {
3418                 dout("caps unchanged: %s -> %s\n",
3419                      ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
3420         } else {
3421                 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
3422                      ceph_cap_string(newcaps));
3423                 /* non-auth MDS is revoking the newly grant caps ? */
3424                 if (cap == ci->i_auth_cap &&
3425                     __ceph_caps_revoking_other(ci, cap, newcaps))
3426                     check_caps = 2;
3427
3428                 cap->issued = newcaps;
3429                 cap->implemented |= newcaps; /* add bits only, to
3430                                               * avoid stepping on a
3431                                               * pending revocation */
3432                 wake = true;
3433         }
             /* every issued bit must also be marked implemented */
3434         BUG_ON(cap->issued & ~cap->implemented);
3435
             /*
              * accept an inline version that is non-zero and not older than
              * ours; the data itself is filled in after the lock is dropped
              */
3436         if (extra_info->inline_version > 0 &&
3437             extra_info->inline_version >= ci->i_inline_version) {
3438                 ci->i_inline_version = extra_info->inline_version;
3439                 if (ci->i_inline_version != CEPH_INLINE_NONE &&
3440                     (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)))
3441                         fill_inline = true;
3442         }
3443
             /*
              * import of the auth cap: kick pending cap flushes on this
              * session, then drop i_ceph_lock and snap_rwsem; every other
              * case drops only i_ceph_lock
              */
3444         if (ci->i_auth_cap == cap &&
3445             le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
3446                 if (newcaps & ~extra_info->issued)
3447                         wake = true;
3448
3449                 if (ci->i_requested_max_size > max_size ||
3450                     !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) {
3451                         /* re-request max_size if necessary */
3452                         ci->i_requested_max_size = 0;
3453                         wake = true;
3454                 }
3455
3456                 ceph_kick_flushing_inode_caps(session, ci);
3457                 spin_unlock(&ci->i_ceph_lock);
3458                 up_read(&session->s_mdsc->snap_rwsem);
3459         } else {
3460                 spin_unlock(&ci->i_ceph_lock);
3461         }
3462
             /* i_ceph_lock is no longer held from here on */
3463         if (fill_inline)
3464                 ceph_fill_inline_data(inode, NULL, extra_info->inline_data,
3465                                       extra_info->inline_len);
3466
3467         if (queue_trunc)
3468                 ceph_queue_vmtruncate(inode);
3469
3470         if (writeback)
3471                 /*
3472                  * queue inode for writeback: we can't actually call
3473                  * filemap_write_and_wait, etc. from message handler
3474                  * context.
3475                  */
3476                 ceph_queue_writeback(inode);
3477         if (queue_invalidate)
3478                 ceph_queue_invalidate(inode);
3479         if (deleted_inode)
3480                 invalidate_aliases(inode);
3481         if (wake)
3482                 wake_up_all(&ci->i_cap_wq);
3483
             /* s_mutex is released before the cap check runs */
3484         mutex_unlock(&session->s_mutex);
3485         if (check_caps == 1)
3486                 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
3487                                 session);
3488         else if (check_caps == 2)
3489                 ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
3490 }
3491
3492 /*
3493  * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
3494  * MDS has been safely committed.
      *
      * @inode:     inode the ack refers to
      * @flush_tid: tid of the flush being acknowledged
      * @m:         wire message; only ->seq and ->dirty are read, for logging
      * @session:   MDS session the ack arrived on
      * @cap:       not referenced in this function body
      *
      * Entered with i_ceph_lock held (see the __releases annotation); the
      * lock is dropped at the 'out' label.  mdsc->cap_dirty_lock is taken
      * nested inside it while the per-client flush bookkeeping is updated.
      * Detached flush records are freed, waiters woken and the inode
      * reference dropped only after i_ceph_lock has been released.
3495  */
3496 static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
3497                                  struct ceph_mds_caps *m,
3498                                  struct ceph_mds_session *session,
3499                                  struct ceph_cap *cap)
3500         __releases(ci->i_ceph_lock)
3501 {
3502         struct ceph_inode_info *ci = ceph_inode(inode);
3503         struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
3504         struct ceph_cap_flush *cf, *tmp_cf;
             /* flush records detached from the inode, freed after unlocking */
3505         LIST_HEAD(to_remove);
3506         unsigned seq = le32_to_cpu(m->seq);
3507         int dirty = le32_to_cpu(m->dirty);
             /* cap bits this ack has made clean */
3508         int cleaned = 0;
             /* drop an inode reference once the inode is fully clean */
3509         bool drop = false;
3510         bool wake_ci = false;
3511         bool wake_mdsc = false;
3512
3513         list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
3514                 /* Is this the one that was flushed? */
3515                 if (cf->tid == flush_tid)
3516                         cleaned = cf->caps;
3517
3518                 /* Is this a capsnap? */
3519                 if (cf->caps == 0)
3520                         continue;
3521
3522                 if (cf->tid <= flush_tid) {
3523                         /*
3524                          * An earlier or current tid. The FLUSH_ACK should
3525                          * represent a superset of this flush's caps.
3526                          */
3527                         wake_ci |= __detach_cap_flush_from_ci(ci, cf);
3528                         list_add_tail(&cf->i_list, &to_remove);
3529                 } else {
3530                         /*
3531                          * This is a later one. Any caps in it are still dirty
3532                          * so don't count them as cleaned.
3533                          */
3534                         cleaned &= ~cf->caps;
3535                         if (!cleaned)
3536                                 break;
3537                 }
3538         }
3539
3540         dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
3541              " flushing %s -> %s\n",
3542              inode, session->s_mds, seq, ceph_cap_string(dirty),
3543              ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
3544              ceph_cap_string(ci->i_flushing_caps & ~cleaned));
3545
             /* nothing was detached and nothing became clean: just unlock */
3546         if (list_empty(&to_remove) && !cleaned)
3547                 goto out;
3548
3549         ci->i_flushing_caps &= ~cleaned;
3550
3551         spin_lock(&mdsc->cap_dirty_lock);
3552
3553         list_for_each_entry(cf, &to_remove, i_list)
3554                 wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf);
3555
3556         if (ci->i_flushing_caps == 0) {
                     /*
                      * leave the session's flushing list only when no flush
                      * record at all remains on the inode
                      */
3557                 if (list_empty(&ci->i_cap_flush_list)) {
3558                         list_del_init(&ci->i_flushing_item);
3559                         if (!list_empty(&session->s_cap_flushing)) {
3560                                 dout(" mds%d still flushing cap on %p\n",
3561                                      session->s_mds,
3562                                      &list_first_entry(&session->s_cap_flushing,
3563                                                 struct ceph_inode_info,
3564                                                 i_flushing_item)->vfs_inode);
3565                         }
3566                 }
3567                 mdsc->num_cap_flushing--;
3568                 dout(" inode %p now !flushing\n", inode);
3569
3570                 if (ci->i_dirty_caps == 0) {
3571                         dout(" inode %p now clean\n", inode);
3572                         BUG_ON(!list_empty(&ci->i_dirty_item));
3573                         drop = true;
                             /*
                              * no writers and no dirty head pages either:
                              * the head snap context is released
                              */
3574                         if (ci->i_wr_ref == 0 &&
3575                             ci->i_wrbuffer_ref_head == 0) {
3576                                 BUG_ON(!ci->i_head_snapc);
3577                                 ceph_put_snap_context(ci->i_head_snapc);
3578                                 ci->i_head_snapc = NULL;
3579                         }
3580                 } else {
3581                         BUG_ON(list_empty(&ci->i_dirty_item));
3582                 }
3583         }
3584         spin_unlock(&mdsc->cap_dirty_lock);
3585
3586 out:
3587         spin_unlock(&ci->i_ceph_lock);
3588
             /* free the detached flush records now that no lock is held */
3589         while (!list_empty(&to_remove)) {
3590                 cf = list_first_entry(&to_remove,
3591                                       struct ceph_cap_flush, i_list);
3592                 list_del(&cf->i_list);
3593                 ceph_free_cap_flush(cf);
3594         }
3595
3596         if (wake_ci)
3597                 wake_up_all(&ci->i_cap_wq);
3598         if (wake_mdsc)
3599                 wake_up_all(&mdsc->cap_flushing_wq);
3600         if (drop)
3601                 iput(inode);
3602 }
3603
3604 /*
3605  * Handle FLUSHSNAP_ACK.  MDS has flushed snap data to disk and we can
3606  * throw away our cap_snap.
3607  *
3608  * Caller hold s_mutex.
3609  */
3610 static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
3611                                      struct ceph_mds_caps *m,
3612                                      struct ceph_mds_session *session)
3613 {
3614         struct ceph_inode_info *ci = ceph_inode(inode);
3615         struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
3616         u64 follows = le64_to_cpu(m->snap_follows);
3617         struct ceph_cap_snap *capsnap;
3618         bool flushed = false;
3619         bool wake_ci = false;
3620         bool wake_mdsc = false;
3621
3622         dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
3623              inode, ci, session->s_mds, follows);
3624
3625         spin_lock(&ci->i_ceph_lock);
3626         list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
3627                 if (capsnap->follows == follows) {
3628                         if (capsnap->cap_flush.tid != flush_tid) {
3629                                 dout(" cap_snap %p follows %lld tid %lld !="
3630                                      " %lld\n", capsnap, follows,
3631                                      flush_tid, capsnap->cap_flush.tid);
3632                                 break;
3633                         }
3634                         flushed = true;
3635                         break;
3636                 } else {
3637                         dout(" skipping cap_snap %p follows %lld\n",
3638                              capsnap, capsnap->follows);
3639                 }
3640         }
3641         if (flushed) {
3642                 WARN_ON(capsnap->dirty_pages || capsnap->writing);
3643                 dout(" removing %p cap_snap %p follows %lld\n",
3644                      inode, capsnap, follows);
3645                 list_del(&capsnap->ci_item);
3646                 wake_ci |= __detach_cap_flush_from_ci(ci, &capsnap->cap_flush);
3647
3648                 spin_lock(&mdsc->cap_dirty_lock);
3649
3650                 if (list_empty(&ci->i_cap_flush_list))
3651                         list_del_init(&ci->i_flushing_item);
3652
3653                 wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc,
3654                                                           &capsnap->cap_flush);
3655                 spin_unlock(&mdsc->cap_dirty_lock);
3656         }
3657         spin_unlock(&ci->i_ceph_lock);
3658         if (flushed) {
3659                 ceph_put_snap_context(capsnap->context);
3660                 ceph_put_cap_snap(capsnap);
3661                 if (wake_ci)
3662                         wake_up_all(&ci->i_cap_wq);
3663                 if (wake_mdsc)
3664                         wake_up_all(&mdsc->cap_flushing_wq);
3665                 iput(inode);
3666         }
3667 }
3668
3669 /*
3670  * Handle TRUNC from MDS, indicating file truncation.
3671  *
3672  * caller hold s_mutex.
3673  */
3674 static bool handle_cap_trunc(struct inode *inode,
3675                              struct ceph_mds_caps *trunc,
3676                              struct ceph_mds_session *session)
3677 {
3678         struct ceph_inode_info *ci = ceph_inode(inode);
3679         int mds = session->s_mds;
3680         int seq = le32_to_cpu(trunc->seq);
3681         u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
3682         u64 truncate_size = le64_to_cpu(trunc->truncate_size);
3683         u64 size = le64_to_cpu(trunc->size);
3684         int implemented = 0;
3685         int dirty = __ceph_caps_dirty(ci);
3686         int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
3687         bool queue_trunc = false;
3688
3689         lockdep_assert_held(&ci->i_ceph_lock);
3690
3691         issued |= implemented | dirty;
3692
3693         dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
3694              inode, mds, seq, truncate_size, truncate_seq);
3695         queue_trunc = ceph_fill_file_size(inode, issued,
3696                                           truncate_seq, truncate_size, size);
3697         return queue_trunc;
3698 }
3699
3700 /*
3701  * Handle EXPORT from MDS.  Cap is being migrated _from_ this mds to a
3702  * different one.  If we are the most recent migration we've seen (as
3703  * indicated by mseq), make note of the migrating cap bits for the
3704  * duration (until we see the corresponding IMPORT).
3705  *
3706  * caller holds s_mutex
3707  */
3708 static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
3709                               struct ceph_mds_cap_peer *ph,
3710                               struct ceph_mds_session *session)
3711 {
3712         struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
3713         struct ceph_mds_session *tsession = NULL;
3714         struct ceph_cap *cap, *tcap, *new_cap = NULL;
3715         struct ceph_inode_info *ci = ceph_inode(inode);
3716         u64 t_cap_id;
3717         unsigned mseq = le32_to_cpu(ex->migrate_seq);
3718         unsigned t_seq, t_mseq;
3719         int target, issued;
3720         int mds = session->s_mds;
3721
3722         if (ph) {
3723                 t_cap_id = le64_to_cpu(ph->cap_id);
3724                 t_seq = le32_to_cpu(ph->seq);
3725                 t_mseq = le32_to_cpu(ph->mseq);
3726                 target = le32_to_cpu(ph->mds);
3727         } else {
3728                 t_cap_id = t_seq = t_mseq = 0;
3729                 target = -1;
3730         }
3731
3732         dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
3733              inode, ci, mds, mseq, target);
3734 retry:
3735         spin_lock(&ci->i_ceph_lock);
3736         cap = __get_cap_for_mds(ci, mds);
3737         if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
3738                 goto out_unlock;
3739
3740         if (target < 0) {
3741                 __ceph_remove_cap(cap, false);
3742                 goto out_unlock;
3743         }
3744
3745         /*
3746          * now we know we haven't received the cap import message yet
3747          * because the exported cap still exist.
3748          */
3749
3750         issued = cap->issued;
3751         if (issued != cap->implemented)
3752                 pr_err_ratelimited("handle_cap_export: issued != implemented: "
3753                                 "ino (%llx.%llx) mds%d seq %d mseq %d "
3754                                 "issued %s implemented %s\n",
3755                                 ceph_vinop(inode), mds, cap->seq, cap->mseq,
3756                                 ceph_cap_string(issued),
3757                                 ceph_cap_string(cap->implemented));
3758
3759
3760         tcap = __get_cap_for_mds(ci, target);
3761         if (tcap) {
3762                 /* already have caps from the target */
3763                 if (tcap->cap_id == t_cap_id &&
3764                     ceph_seq_cmp(tcap->seq, t_seq) < 0) {
3765                         dout(" updating import cap %p mds%d\n", tcap, target);
3766                         tcap->cap_id = t_cap_id;
3767                         tcap->seq = t_seq - 1;
3768                         tcap->issue_seq = t_seq - 1;
3769                         tcap->issued |= issued;
3770                         tcap->implemented |= issued;
3771                         if (cap == ci->i_auth_cap) {
3772                                 ci->i_auth_cap = tcap;
3773                                 change_auth_cap_ses(ci, tcap->session);
3774                         }
3775                 }
3776                 __ceph_remove_cap(cap, false);
3777                 goto out_unlock;
3778         } else if (tsession) {
3779                 /* add placeholder for the export tagert */
3780                 int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
3781                 tcap = new_cap;
3782                 ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
3783                              t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
3784
3785                 if (!list_empty(&ci->i_cap_flush_list) &&
3786                     ci->i_auth_cap == tcap) {
3787                         spin_lock(&mdsc->cap_dirty_lock);
3788                         list_move_tail(&ci->i_flushing_item,
3789                                        &tcap->session->s_cap_flushing);
3790                         spin_unlock(&mdsc->cap_dirty_lock);
3791                 }
3792
3793                 __ceph_remove_cap(cap, false);
3794                 goto out_unlock;
3795         }
3796
3797         spin_unlock(&ci->i_ceph_lock);
3798         mutex_unlock(&session->s_mutex);
3799
3800         /* open target session */
3801         tsession = ceph_mdsc_open_export_target_session(mdsc, target);
3802         if (!IS_ERR(tsession)) {
3803                 if (mds > target) {
3804                         mutex_lock(&session->s_mutex);
3805                         mutex_lock_nested(&tsession->s_mutex,
3806                                           SINGLE_DEPTH_NESTING);
3807                 } else {
3808                         mutex_lock(&tsession->s_mutex);
3809                         mutex_lock_nested(&session->s_mutex,
3810                                           SINGLE_DEPTH_NESTING);
3811                 }
3812                 new_cap = ceph_get_cap(mdsc, NULL);
3813         } else {
3814                 WARN_ON(1);
3815                 tsession = NULL;
3816                 target = -1;
3817                 mutex_lock(&session->s_mutex);
3818         }
3819         goto retry;
3820
3821 out_unlock:
3822         spin_unlock(&ci->i_ceph_lock);
3823         mutex_unlock(&session->s_mutex);
3824         if (tsession) {
3825                 mutex_unlock(&tsession->s_mutex);
3826                 ceph_put_mds_session(tsession);
3827         }
3828         if (new_cap)
3829                 ceph_put_cap(mdsc, new_cap);
3830 }
3831
3832 /*
3833  * Handle cap IMPORT.
3834  *
3835  * caller holds s_mutex. acquires i_ceph_lock
3836  */
3837 static void handle_cap_import(struct ceph_mds_client *mdsc,
3838                               struct inode *inode, struct ceph_mds_caps *im,
3839                               struct ceph_mds_cap_peer *ph,
3840                               struct ceph_mds_session *session,
3841                               struct ceph_cap **target_cap, int *old_issued)
3842 {
3843         struct ceph_inode_info *ci = ceph_inode(inode);
3844         struct ceph_cap *cap, *ocap, *new_cap = NULL;
3845         int mds = session->s_mds;
3846         int issued;
3847         unsigned caps = le32_to_cpu(im->caps);
3848         unsigned wanted = le32_to_cpu(im->wanted);
3849         unsigned seq = le32_to_cpu(im->seq);
3850         unsigned mseq = le32_to_cpu(im->migrate_seq);
3851         u64 realmino = le64_to_cpu(im->realm);
3852         u64 cap_id = le64_to_cpu(im->cap_id);
3853         u64 p_cap_id;
3854         int peer;
3855
3856         if (ph) {
3857                 p_cap_id = le64_to_cpu(ph->cap_id);
3858                 peer = le32_to_cpu(ph->mds);
3859         } else {
3860                 p_cap_id = 0;
3861                 peer = -1;
3862         }
3863
3864         dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
3865              inode, ci, mds, mseq, peer);
3866 retry:
3867         cap = __get_cap_for_mds(ci, mds);
3868         if (!cap) {
3869                 if (!new_cap) {
3870                         spin_unlock(&ci->i_ceph_lock);
3871                         new_cap = ceph_get_cap(mdsc, NULL);
3872                         spin_lock(&ci->i_ceph_lock);
3873                         goto retry;
3874                 }
3875                 cap = new_cap;
3876         } else {
3877                 if (new_cap) {
3878                         ceph_put_cap(mdsc, new_cap);
3879                         new_cap = NULL;
3880                 }
3881         }
3882
3883         __ceph_caps_issued(ci, &issued);
3884         issued |= __ceph_caps_dirty(ci);
3885
3886         ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
3887                      realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
3888
3889         ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
3890         if (ocap && ocap->cap_id == p_cap_id) {
3891                 dout(" remove export cap %p mds%d flags %d\n",
3892                      ocap, peer, ph->flags);
3893                 if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
3894                     (ocap->seq != le32_to_cpu(ph->seq) ||
3895                      ocap->mseq != le32_to_cpu(ph->mseq))) {
3896                         pr_err_ratelimited("handle_cap_import: "
3897                                         "mismatched seq/mseq: ino (%llx.%llx) "
3898                                         "mds%d seq %d mseq %d importer mds%d "
3899                                         "has peer seq %d mseq %d\n",
3900                                         ceph_vinop(inode), peer, ocap->seq,
3901                                         ocap->mseq, mds, le32_to_cpu(ph->seq),
3902                                         le32_to_cpu(ph->mseq));
3903                 }
3904                 __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
3905         }
3906
3907         *old_issued = issued;
3908         *target_cap = cap;
3909 }
3910
3911 /*
3912  * Handle a caps message from the MDS.
3913  *
3914  * Identify the appropriate session, inode, and call the right handler
3915  * based on the cap op.
3916  */
3917 void ceph_handle_caps(struct ceph_mds_session *session,
3918                       struct ceph_msg *msg)
3919 {
3920         struct ceph_mds_client *mdsc = session->s_mdsc;
3921         struct inode *inode;
3922         struct ceph_inode_info *ci;
3923         struct ceph_cap *cap;
3924         struct ceph_mds_caps *h;
3925         struct ceph_mds_cap_peer *peer = NULL;
3926         struct ceph_snap_realm *realm = NULL;
3927         int op;
3928         int msg_version = le16_to_cpu(msg->hdr.version);
3929         u32 seq, mseq;
3930         struct ceph_vino vino;
3931         void *snaptrace;
3932         size_t snaptrace_len;
3933         void *p, *end;
3934         struct cap_extra_info extra_info = {};
3935         bool queue_trunc;
3936
3937         dout("handle_caps from mds%d\n", session->s_mds);
3938
3939         /* decode */
3940         end = msg->front.iov_base + msg->front.iov_len;
3941         if (msg->front.iov_len < sizeof(*h))
3942                 goto bad;
3943         h = msg->front.iov_base;
3944         op = le32_to_cpu(h->op);
3945         vino.ino = le64_to_cpu(h->ino);
3946         vino.snap = CEPH_NOSNAP;
3947         seq = le32_to_cpu(h->seq);
3948         mseq = le32_to_cpu(h->migrate_seq);
3949
3950         snaptrace = h + 1;
3951         snaptrace_len = le32_to_cpu(h->snap_trace_len);
3952         p = snaptrace + snaptrace_len;
3953
3954         if (msg_version >= 2) {
3955                 u32 flock_len;
3956                 ceph_decode_32_safe(&p, end, flock_len, bad);
3957                 if (p + flock_len > end)
3958                         goto bad;
3959                 p += flock_len;
3960         }
3961
3962         if (msg_version >= 3) {
3963                 if (op == CEPH_CAP_OP_IMPORT) {
3964                         if (p + sizeof(*peer) > end)
3965                                 goto bad;
3966                         peer = p;
3967                         p += sizeof(*peer);
3968                 } else if (op == CEPH_CAP_OP_EXPORT) {
3969                         /* recorded in unused fields */
3970                         peer = (void *)&h->size;
3971                 }
3972         }
3973
3974         if (msg_version >= 4) {
3975                 ceph_decode_64_safe(&p, end, extra_info.inline_version, bad);
3976                 ceph_decode_32_safe(&p, end, extra_info.inline_len, bad);
3977                 if (p + extra_info.inline_len > end)
3978                         goto bad;
3979                 extra_info.inline_data = p;
3980                 p += extra_info.inline_len;
3981         }
3982
3983         if (msg_version >= 5) {
3984                 struct ceph_osd_client  *osdc = &mdsc->fsc->client->osdc;
3985                 u32                     epoch_barrier;
3986
3987                 ceph_decode_32_safe(&p, end, epoch_barrier, bad);
3988                 ceph_osdc_update_epoch_barrier(osdc, epoch_barrier);
3989         }
3990
3991         if (msg_version >= 8) {
3992                 u32 pool_ns_len;
3993
3994                 /* version >= 6 */
3995                 ceph_decode_skip_64(&p, end, bad);      // flush_tid
3996                 /* version >= 7 */
3997                 ceph_decode_skip_32(&p, end, bad);      // caller_uid
3998                 ceph_decode_skip_32(&p, end, bad);      // caller_gid
3999                 /* version >= 8 */
4000                 ceph_decode_32_safe(&p, end, pool_ns_len, bad);
4001                 if (pool_ns_len > 0) {
4002                         ceph_decode_need(&p, end, pool_ns_len, bad);
4003                         extra_info.pool_ns =
4004                                 ceph_find_or_create_string(p, pool_ns_len);
4005                         p += pool_ns_len;
4006                 }
4007         }
4008
4009         if (msg_version >= 9) {
4010                 struct ceph_timespec *btime;
4011
4012                 if (p + sizeof(*btime) > end)
4013                         goto bad;
4014                 btime = p;
4015                 ceph_decode_timespec64(&extra_info.btime, btime);
4016                 p += sizeof(*btime);
4017                 ceph_decode_64_safe(&p, end, extra_info.change_attr, bad);
4018         }
4019
4020         if (msg_version >= 11) {
4021                 /* version >= 10 */
4022                 ceph_decode_skip_32(&p, end, bad); // flags
4023                 /* version >= 11 */
4024                 extra_info.dirstat_valid = true;
4025                 ceph_decode_64_safe(&p, end, extra_info.nfiles, bad);
4026                 ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad);
4027         }
4028
4029         /* lookup ino */
4030         inode = ceph_find_inode(mdsc->fsc->sb, vino);
4031         ci = ceph_inode(inode);
4032         dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
4033              vino.snap, inode);
4034
4035         mutex_lock(&session->s_mutex);
4036         inc_session_sequence(session);
4037         dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
4038              (unsigned)seq);
4039
4040         if (!inode) {
4041                 dout(" i don't have ino %llx\n", vino.ino);
4042
4043                 if (op == CEPH_CAP_OP_IMPORT) {
4044                         cap = ceph_get_cap(mdsc, NULL);
4045                         cap->cap_ino = vino.ino;
4046                         cap->queue_release = 1;
4047                         cap->cap_id = le64_to_cpu(h->cap_id);
4048                         cap->mseq = mseq;
4049                         cap->seq = seq;
4050                         cap->issue_seq = seq;
4051                         spin_lock(&session->s_cap_lock);
4052                         __ceph_queue_cap_release(session, cap);
4053                         spin_unlock(&session->s_cap_lock);
4054                 }
4055                 goto flush_cap_releases;
4056         }
4057
4058         /* these will work even if we don't have a cap yet */
4059         switch (op) {
4060         case CEPH_CAP_OP_FLUSHSNAP_ACK:
4061                 handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid),
4062                                          h, session);
4063                 goto done;
4064
4065         case CEPH_CAP_OP_EXPORT:
4066                 handle_cap_export(inode, h, peer, session);
4067                 goto done_unlocked;
4068
4069         case CEPH_CAP_OP_IMPORT:
4070                 realm = NULL;
4071                 if (snaptrace_len) {
4072                         down_write(&mdsc->snap_rwsem);
4073                         ceph_update_snap_trace(mdsc, snaptrace,
4074                                                snaptrace + snaptrace_len,
4075                                                false, &realm);
4076                         downgrade_write(&mdsc->snap_rwsem);
4077                 } else {
4078                         down_read(&mdsc->snap_rwsem);
4079                 }
4080                 spin_lock(&ci->i_ceph_lock);
4081                 handle_cap_import(mdsc, inode, h, peer, session,
4082                                   &cap, &extra_info.issued);
4083                 handle_cap_grant(inode, session, cap,
4084                                  h, msg->middle, &extra_info);
4085                 if (realm)
4086                         ceph_put_snap_realm(mdsc, realm);
4087                 goto done_unlocked;
4088         }
4089
4090         /* the rest require a cap */
4091         spin_lock(&ci->i_ceph_lock);
4092         cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds);
4093         if (!cap) {
4094                 dout(" no cap on %p ino %llx.%llx from mds%d\n",
4095                      inode, ceph_ino(inode), ceph_snap(inode),
4096                      session->s_mds);
4097                 spin_unlock(&ci->i_ceph_lock);
4098                 goto flush_cap_releases;
4099         }
4100
4101         /* note that each of these drops i_ceph_lock for us */
4102         switch (op) {
4103         case CEPH_CAP_OP_REVOKE:
4104         case CEPH_CAP_OP_GRANT:
4105                 __ceph_caps_issued(ci, &extra_info.issued);
4106                 extra_info.issued |= __ceph_caps_dirty(ci);
4107                 handle_cap_grant(inode, session, cap,
4108                                  h, msg->middle, &extra_info);
4109                 goto done_unlocked;
4110
4111         case CEPH_CAP_OP_FLUSH_ACK:
4112                 handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid),
4113                                      h, session, cap);
4114                 break;
4115
4116         case CEPH_CAP_OP_TRUNC:
4117                 queue_trunc = handle_cap_trunc(inode, h, session);
4118                 spin_unlock(&ci->i_ceph_lock);
4119                 if (queue_trunc)
4120                         ceph_queue_vmtruncate(inode);
4121                 break;
4122
4123         default:
4124                 spin_unlock(&ci->i_ceph_lock);
4125                 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
4126                        ceph_cap_op_name(op));
4127         }
4128
4129 done:
4130         mutex_unlock(&session->s_mutex);
4131 done_unlocked:
4132         ceph_put_string(extra_info.pool_ns);
4133         iput(inode);
4134         return;
4135
4136 flush_cap_releases:
4137         /*
4138          * send any cap release message to try to move things
4139          * along for the mds (who clearly thinks we still have this
4140          * cap).
4141          */
4142         ceph_flush_cap_releases(mdsc, session);
4143         goto done;
4144
4145 bad:
4146         pr_err("ceph_handle_caps: corrupt message\n");
4147         ceph_msg_dump(msg);
4148         return;
4149 }
4150
4151 /*
4152  * Delayed work handler to process end of delayed cap release LRU list.
4153  */
4154 void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
4155 {
4156         struct inode *inode;
4157         struct ceph_inode_info *ci;
4158
4159         dout("check_delayed_caps\n");
4160         spin_lock(&mdsc->cap_delay_lock);
4161         while (!list_empty(&mdsc->cap_delay_list)) {
4162                 ci = list_first_entry(&mdsc->cap_delay_list,
4163                                       struct ceph_inode_info,
4164                                       i_cap_delay_list);
4165                 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
4166                     time_before(jiffies, ci->i_hold_caps_max))
4167                         break;
4168                 list_del_init(&ci->i_cap_delay_list);
4169
4170                 inode = igrab(&ci->vfs_inode);
4171                 if (inode) {
4172                         spin_unlock(&mdsc->cap_delay_lock);
4173                         dout("check_delayed_caps on %p\n", inode);
4174                         ceph_check_caps(ci, 0, NULL);
4175                         iput(inode);
4176                         spin_lock(&mdsc->cap_delay_lock);
4177                 }
4178         }
4179         spin_unlock(&mdsc->cap_delay_lock);
4180 }
4181
4182 /*
4183  * Flush all dirty caps to the mds
4184  */
4185 static void flush_dirty_session_caps(struct ceph_mds_session *s)
4186 {
4187         struct ceph_mds_client *mdsc = s->s_mdsc;
4188         struct ceph_inode_info *ci;
4189         struct inode *inode;
4190
4191         dout("flush_dirty_caps\n");
4192         spin_lock(&mdsc->cap_dirty_lock);
4193         while (!list_empty(&s->s_cap_dirty)) {
4194                 ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info,
4195                                       i_dirty_item);
4196                 inode = &ci->vfs_inode;
4197                 ihold(inode);
4198                 dout("flush_dirty_caps %p\n", inode);
4199                 spin_unlock(&mdsc->cap_dirty_lock);
4200                 ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
4201                 iput(inode);
4202                 spin_lock(&mdsc->cap_dirty_lock);
4203         }
4204         spin_unlock(&mdsc->cap_dirty_lock);
4205         dout("flush_dirty_caps done\n");
4206 }
4207
4208 static void iterate_sessions(struct ceph_mds_client *mdsc,
4209                              void (*cb)(struct ceph_mds_session *))
4210 {
4211         int mds;
4212
4213         mutex_lock(&mdsc->mutex);
4214         for (mds = 0; mds < mdsc->max_sessions; ++mds) {
4215                 struct ceph_mds_session *s;
4216
4217                 if (!mdsc->sessions[mds])
4218                         continue;
4219
4220                 s = ceph_get_mds_session(mdsc->sessions[mds]);
4221                 if (!s)
4222                         continue;
4223
4224                 mutex_unlock(&mdsc->mutex);
4225                 cb(s);
4226                 ceph_put_mds_session(s);
4227                 mutex_lock(&mdsc->mutex);
4228         }
4229         mutex_unlock(&mdsc->mutex);
4230 }
4231
/*
 * Flush dirty caps on every session of @mdsc by running
 * flush_dirty_session_caps() on each one via iterate_sessions().
 */
void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
{
	iterate_sessions(mdsc, flush_dirty_session_caps);
}
4236
4237 void __ceph_touch_fmode(struct ceph_inode_info *ci,
4238                         struct ceph_mds_client *mdsc, int fmode)
4239 {
4240         unsigned long now = jiffies;
4241         if (fmode & CEPH_FILE_MODE_RD)
4242                 ci->i_last_rd = now;
4243         if (fmode & CEPH_FILE_MODE_WR)
4244                 ci->i_last_wr = now;
4245         /* queue periodic check */
4246         if (fmode &&
4247             __ceph_is_any_real_caps(ci) &&
4248             list_empty(&ci->i_cap_delay_list))
4249                 __cap_delay_requeue(mdsc, ci);
4250 }
4251
4252 void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
4253 {
4254         struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
4255         int bits = (fmode << 1) | 1;
4256         bool is_opened = false;
4257         int i;
4258
4259         if (count == 1)
4260                 atomic64_inc(&mdsc->metric.opened_files);
4261
4262         spin_lock(&ci->i_ceph_lock);
4263         for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
4264                 if (bits & (1 << i))
4265                         ci->i_nr_by_mode[i] += count;
4266
4267                 /*
4268                  * If any of the mode ref is larger than 1,
4269                  * that means it has been already opened by
4270                  * others. Just skip checking the PIN ref.
4271                  */
4272                 if (i && ci->i_nr_by_mode[i] > 1)
4273                         is_opened = true;
4274         }
4275
4276         if (!is_opened)
4277                 percpu_counter_inc(&mdsc->metric.opened_inodes);
4278         spin_unlock(&ci->i_ceph_lock);
4279 }
4280
4281 /*
4282  * Drop open file reference.  If we were the last open file,
4283  * we may need to release capabilities to the MDS (or schedule
4284  * their delayed release).
4285  */
4286 void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
4287 {
4288         struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
4289         int bits = (fmode << 1) | 1;
4290         bool is_closed = true;
4291         int i;
4292
4293         if (count == 1)
4294                 atomic64_dec(&mdsc->metric.opened_files);
4295
4296         spin_lock(&ci->i_ceph_lock);
4297         for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
4298                 if (bits & (1 << i)) {
4299                         BUG_ON(ci->i_nr_by_mode[i] < count);
4300                         ci->i_nr_by_mode[i] -= count;
4301                 }
4302
4303                 /*
4304                  * If any of the mode ref is not 0 after
4305                  * decreased, that means it is still opened
4306                  * by others. Just skip checking the PIN ref.
4307                  */
4308                 if (i && ci->i_nr_by_mode[i])
4309                         is_closed = false;
4310         }
4311
4312         if (is_closed)
4313                 percpu_counter_dec(&mdsc->metric.opened_inodes);
4314         spin_unlock(&ci->i_ceph_lock);
4315 }
4316
4317 /*
4318  * For a soon-to-be unlinked file, drop the LINK caps. If it
4319  * looks like the link count will hit 0, drop any other caps (other
4320  * than PIN) we don't specifically want (due to the file still being
4321  * open).
4322  */
4323 int ceph_drop_caps_for_unlink(struct inode *inode)
4324 {
4325         struct ceph_inode_info *ci = ceph_inode(inode);
4326         int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
4327
4328         spin_lock(&ci->i_ceph_lock);
4329         if (inode->i_nlink == 1) {
4330                 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
4331
4332                 if (__ceph_caps_dirty(ci)) {
4333                         struct ceph_mds_client *mdsc =
4334                                 ceph_inode_to_client(inode)->mdsc;
4335                         __cap_delay_requeue_front(mdsc, ci);
4336                 }
4337         }
4338         spin_unlock(&ci->i_ceph_lock);
4339         return drop;
4340 }
4341
4342 /*
4343  * Helpers for embedding cap and dentry lease releases into mds
4344  * requests.
4345  *
4346  * @force is used by dentry_release (below) to force inclusion of a
4347  * record for the directory inode, even when there aren't any caps to
4348  * drop.
4349  */
4350 int ceph_encode_inode_release(void **p, struct inode *inode,
4351                               int mds, int drop, int unless, int force)
4352 {
4353         struct ceph_inode_info *ci = ceph_inode(inode);
4354         struct ceph_cap *cap;
4355         struct ceph_mds_request_release *rel = *p;
4356         int used, dirty;
4357         int ret = 0;
4358
4359         spin_lock(&ci->i_ceph_lock);
4360         used = __ceph_caps_used(ci);
4361         dirty = __ceph_caps_dirty(ci);
4362
4363         dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
4364              inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
4365              ceph_cap_string(unless));
4366
4367         /* only drop unused, clean caps */
4368         drop &= ~(used | dirty);
4369
4370         cap = __get_cap_for_mds(ci, mds);
4371         if (cap && __cap_is_valid(cap)) {
4372                 unless &= cap->issued;
4373                 if (unless) {
4374                         if (unless & CEPH_CAP_AUTH_EXCL)
4375                                 drop &= ~CEPH_CAP_AUTH_SHARED;
4376                         if (unless & CEPH_CAP_LINK_EXCL)
4377                                 drop &= ~CEPH_CAP_LINK_SHARED;
4378                         if (unless & CEPH_CAP_XATTR_EXCL)
4379                                 drop &= ~CEPH_CAP_XATTR_SHARED;
4380                         if (unless & CEPH_CAP_FILE_EXCL)
4381                                 drop &= ~CEPH_CAP_FILE_SHARED;
4382                 }
4383
4384                 if (force || (cap->issued & drop)) {
4385                         if (cap->issued & drop) {
4386                                 int wanted = __ceph_caps_wanted(ci);
4387                                 dout("encode_inode_release %p cap %p "
4388                                      "%s -> %s, wanted %s -> %s\n", inode, cap,
4389                                      ceph_cap_string(cap->issued),
4390                                      ceph_cap_string(cap->issued & ~drop),
4391                                      ceph_cap_string(cap->mds_wanted),
4392                                      ceph_cap_string(wanted));
4393
4394                                 cap->issued &= ~drop;
4395                                 cap->implemented &= ~drop;
4396                                 cap->mds_wanted = wanted;
4397                                 if (cap == ci->i_auth_cap &&
4398                                     !(wanted & CEPH_CAP_ANY_FILE_WR))
4399                                         ci->i_requested_max_size = 0;
4400                         } else {
4401                                 dout("encode_inode_release %p cap %p %s"
4402                                      " (force)\n", inode, cap,
4403                                      ceph_cap_string(cap->issued));
4404                         }
4405
4406                         rel->ino = cpu_to_le64(ceph_ino(inode));
4407                         rel->cap_id = cpu_to_le64(cap->cap_id);
4408                         rel->seq = cpu_to_le32(cap->seq);
4409                         rel->issue_seq = cpu_to_le32(cap->issue_seq);
4410                         rel->mseq = cpu_to_le32(cap->mseq);
4411                         rel->caps = cpu_to_le32(cap->implemented);
4412                         rel->wanted = cpu_to_le32(cap->mds_wanted);
4413                         rel->dname_len = 0;
4414                         rel->dname_seq = 0;
4415                         *p += sizeof(*rel);
4416                         ret = 1;
4417                 } else {
4418                         dout("encode_inode_release %p cap %p %s (noop)\n",
4419                              inode, cap, ceph_cap_string(cap->issued));
4420                 }
4421         }
4422         spin_unlock(&ci->i_ceph_lock);
4423         return ret;
4424 }
4425
4426 int ceph_encode_dentry_release(void **p, struct dentry *dentry,
4427                                struct inode *dir,
4428                                int mds, int drop, int unless)
4429 {
4430         struct dentry *parent = NULL;
4431         struct ceph_mds_request_release *rel = *p;
4432         struct ceph_dentry_info *di = ceph_dentry(dentry);
4433         int force = 0;
4434         int ret;
4435
4436         /*
4437          * force an record for the directory caps if we have a dentry lease.
4438          * this is racy (can't take i_ceph_lock and d_lock together), but it
4439          * doesn't have to be perfect; the mds will revoke anything we don't
4440          * release.
4441          */
4442         spin_lock(&dentry->d_lock);
4443         if (di->lease_session && di->lease_session->s_mds == mds)
4444                 force = 1;
4445         if (!dir) {
4446                 parent = dget(dentry->d_parent);
4447                 dir = d_inode(parent);
4448         }
4449         spin_unlock(&dentry->d_lock);
4450
4451         ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
4452         dput(parent);
4453
4454         spin_lock(&dentry->d_lock);
4455         if (ret && di->lease_session && di->lease_session->s_mds == mds) {
4456                 dout("encode_dentry_release %p mds%d seq %d\n",
4457                      dentry, mds, (int)di->lease_seq);
4458                 rel->dname_len = cpu_to_le32(dentry->d_name.len);
4459                 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
4460                 *p += dentry->d_name.len;
4461                 rel->dname_seq = cpu_to_le32(di->lease_seq);
4462                 __ceph_mdsc_drop_dentry_lease(dentry);
4463         }
4464         spin_unlock(&dentry->d_lock);
4465         return ret;
4466 }