Fix inotify watch removal/umount races

author Al Viro <viro@ZenIV.linux.org.uk>

Sat, 15 Nov 2008 01:15:43 +0000 (01:15 +0000)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 15 Nov 2008 20:26:44 +0000 (12:26 -0800)
author Al Viro <viro@ZenIV.linux.org.uk>
Sat, 15 Nov 2008 01:15:43 +0000 (01:15 +0000)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 15 Nov 2008 20:26:44 +0000 (12:26 -0800)
diff --git a/fs/inotify.c b/fs/inotify.c

index 690e725..7bbed1b 100644 (file)
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -106,6 +106,20 @@ void get_inotify_watch(struct inotify_watch *watch)
  }
  EXPORT_SYMBOL_GPL(get_inotify_watch);
  
+int pin_inotify_watch(struct inotify_watch *watch)
+{
+       struct super_block *sb = watch->inode->i_sb;
+       spin_lock(&sb_lock);
+       if (sb->s_count >= S_BIAS) {
+               atomic_inc(&sb->s_active);
+               spin_unlock(&sb_lock);
+               atomic_inc(&watch->count);
+               return 1;
+       }
+       spin_unlock(&sb_lock);
+       return 0;
+}
+
  /**
   * put_inotify_watch - decrements the ref count on a given watch.  cleans up
   * watch references if the count reaches zero.  inotify_watch is freed by
@@ -124,6 +138,13 @@ void put_inotify_watch(struct inotify_watch *watch)
  }
  EXPORT_SYMBOL_GPL(put_inotify_watch);
  
+void unpin_inotify_watch(struct inotify_watch *watch)
+{
+       struct super_block *sb = watch->inode->i_sb;
+       put_inotify_watch(watch);
+       deactivate_super(sb);
+}
+
  /*
   * inotify_handle_get_wd - returns the next WD for use by the given handle
   *
@@ -479,6 +500,112 @@ void inotify_init_watch(struct inotify_watch *watch)
  }
  EXPORT_SYMBOL_GPL(inotify_init_watch);
  
+/*
+ * Watch removals suck violently.  To kick the watch out we need (in this
+ * order) inode->inotify_mutex and ih->mutex.  That's fine if we have
+ * a hold on inode; however, for all other cases we need to make damn sure
+ * we don't race with umount.  We can *NOT* just grab a reference to a
+ * watch - inotify_unmount_inodes() will happily sail past it and we'll end
+ * with reference to inode potentially outliving its superblock.  Ideally
+ * we just want to grab an active reference to superblock if we can; that
+ * will make sure we won't go into inotify_umount_inodes() until we are
+ * done.  Cleanup is just deactivate_super().  However, that leaves a messy
+ * case - what if we *are* racing with umount() and active references to
+ * superblock can't be acquired anymore?  We can bump ->s_count, grab
+ * ->s_umount, which will almost certainly wait until the superblock is shut
+ * down and the watch in question is pining for fjords.  That's fine, but
+ * there is a problem - we might have hit the window between ->s_active
+ * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
+ * is past the point of no return and is heading for shutdown) and the
+ * moment when deactivate_super() acquires ->s_umount.  We could just do
+ * drop_super() yield() and retry, but that's rather antisocial and this
+ * stuff is luser-triggerable.  OTOH, having grabbed ->s_umount and having
+ * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
+ * that we won't race with inotify_umount_inodes().  So we could grab a
+ * reference to watch and do the rest as above, just with drop_super() instead
+ * of deactivate_super(), right?  Wrong.  We had to drop ih->mutex before we
+ * could grab ->s_umount.  So the watch could've been gone already.
+ *
+ * That still can be dealt with - we need to save watch->wd, do idr_find()
+ * and compare its result with our pointer.  If they match, we either have
+ * the damn thing still alive or we'd lost not one but two races at once,
+ * the watch had been killed and a new one got created with the same ->wd
+ * at the same address.  That couldn't have happened in inotify_destroy(),
+ * but inotify_rm_wd() could run into that.  Still, "new one got created"
+ * is not a problem - we have every right to kill it or leave it alone,
+ * whatever's more convenient.
+ *
+ * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
+ * "grab it and kill it" check.  If it's been our original watch, we are
+ * fine, if it's a newcomer - nevermind, just pretend that we'd won the
+ * race and kill the fscker anyway; we are safe since we know that its
+ * superblock won't be going away.
+ *
+ * And yes, this is far beyond mere "not very pretty"; so's the entire
+ * concept of inotify to start with.
+ */
+
+/**
+ * pin_to_kill - pin the watch down for removal
+ * @ih: inotify handle
+ * @watch: watch to kill
+ *
+ * Called with ih->mutex held, drops it.  Possible return values:
+ * 0 - nothing to do, it has died
+ * 1 - remove it, drop the reference and deactivate_super()
+ * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
+ * that variant, since it involved a lot of PITA, but that's the best that
+ * could've been done.
+ */
+static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
+{
+       struct super_block *sb = watch->inode->i_sb;
+       s32 wd = watch->wd;
+
+       spin_lock(&sb_lock);
+       if (sb->s_count >= S_BIAS) {
+               atomic_inc(&sb->s_active);
+               spin_unlock(&sb_lock);
+               get_inotify_watch(watch);
+               mutex_unlock(&ih->mutex);
+               return 1;       /* the best outcome */
+       }
+       sb->s_count++;
+       spin_unlock(&sb_lock);
+       mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
+       down_read(&sb->s_umount);
+       if (likely(!sb->s_root)) {
+               /* fs is already shut down; the watch is dead */
+               drop_super(sb);
+               return 0;
+       }
+       /* raced with the final deactivate_super() */
+       mutex_lock(&ih->mutex);
+       if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
+               /* the watch is dead */
+               mutex_unlock(&ih->mutex);
+               drop_super(sb);
+               return 0;
+       }
+       /* still alive or freed and reused with the same sb and wd; kill */
+       get_inotify_watch(watch);
+       mutex_unlock(&ih->mutex);
+       return 2;
+}
+
+static void unpin_and_kill(struct inotify_watch *watch, int how)
+{
+       struct super_block *sb = watch->inode->i_sb;
+       put_inotify_watch(watch);
+       switch (how) {
+       case 1:
+               deactivate_super(sb);
+               break;
+       case 2:
+               drop_super(sb);
+       }
+}
+
  /**
   * inotify_destroy - clean up and destroy an inotify instance
   * @ih: inotify handle
@@ -490,11 +617,15 @@ void inotify_destroy(struct inotify_handle *ih)
          * pretty.  We cannot do a simple iteration over the list, because we
          * do not know the inode until we iterate to the watch.  But we need to
          * hold inode->inotify_mutex before ih->mutex.  The following works.
+        *
+        * AV: it had to become even uglier to start working ;-/
          */
         while (1) {
                 struct inotify_watch *watch;
                 struct list_head *watches;
+               struct super_block *sb;
                 struct inode *inode;
+               int how;
  
                 mutex_lock(&ih->mutex);
                 watches = &ih->watches;
@@ -503,8 +634,10 @@ void inotify_destroy(struct inotify_handle *ih)
                         break;
                 }
                 watch = list_first_entry(watches, struct inotify_watch, h_list);
-               get_inotify_watch(watch);
-               mutex_unlock(&ih->mutex);
+               sb = watch->inode->i_sb;
+               how = pin_to_kill(ih, watch);
+               if (!how)
+                       continue;
  
                 inode = watch->inode;
                 mutex_lock(&inode->inotify_mutex);
@@ -518,7 +651,7 @@ void inotify_destroy(struct inotify_handle *ih)
  
                 mutex_unlock(&ih->mutex);
                 mutex_unlock(&inode->inotify_mutex);
-               put_inotify_watch(watch);
+               unpin_and_kill(watch, how);
         }
  
         /* free this handle: the put matching the get in inotify_init() */
@@ -719,7 +852,9 @@ void inotify_evict_watch(struct inotify_watch *watch)
  int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
  {
         struct inotify_watch *watch;
+       struct super_block *sb;
         struct inode *inode;
+       int how;
  
         mutex_lock(&ih->mutex);
         watch = idr_find(&ih->idr, wd);
@@ -727,9 +862,12 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
                 mutex_unlock(&ih->mutex);
                 return -EINVAL;
         }
-       get_inotify_watch(watch);
+       sb = watch->inode->i_sb;
+       how = pin_to_kill(ih, watch);
+       if (!how)
+               return 0;
+
         inode = watch->inode;
-       mutex_unlock(&ih->mutex);
  
         mutex_lock(&inode->inotify_mutex);
         mutex_lock(&ih->mutex);
@@ -740,7 +878,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
  
         mutex_unlock(&ih->mutex);
         mutex_unlock(&inode->inotify_mutex);
-       put_inotify_watch(watch);
+       unpin_and_kill(watch, how);
  
         return 0;
  }
diff --git a/include/linux/inotify.h b/include/linux/inotify.h

index bd57857..37ea289 100644 (file)
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -134,6 +134,8 @@ extern void inotify_remove_watch_locked(struct inotify_handle *,
                                         struct inotify_watch *);
  extern void get_inotify_watch(struct inotify_watch *);
  extern void put_inotify_watch(struct inotify_watch *);
+extern int pin_inotify_watch(struct inotify_watch *);
+extern void unpin_inotify_watch(struct inotify_watch *);
  
  #else
  
@@ -228,6 +230,15 @@ static inline void put_inotify_watch(struct inotify_watch *watch)
  {
  }
  
+extern inline int pin_inotify_watch(struct inotify_watch *watch)
+{
+       return 0;
+}
+
+extern inline void unpin_inotify_watch(struct inotify_watch *watch)
+{
+}
+
  #endif /* CONFIG_INOTIFY */
  
  #endif /* __KERNEL __ */
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c

index 8ba0e0d..8b50944 100644 (file)
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -24,6 +24,7 @@ struct audit_chunk {
         struct list_head trees;         /* with root here */
         int dead;
         int count;
+       atomic_long_t refs;
         struct rcu_head head;
         struct node {
                 struct list_head list;
@@ -56,7 +57,8 @@ static LIST_HEAD(prune_list);
   * tree is refcounted; one reference for "some rules on rules_list refer to
   * it", one for each chunk with pointer to it.
   *
- * chunk is refcounted by embedded inotify_watch.
+ * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
+ * of watch contributes 1 to .refs).
   *
   * node.index allows to get from node.list to containing chunk.
   * MSB of that sucker is stolen to mark taggings that we might have to
@@ -121,6 +123,7 @@ static struct audit_chunk *alloc_chunk(int count)
         INIT_LIST_HEAD(&chunk->hash);
         INIT_LIST_HEAD(&chunk->trees);
         chunk->count = count;
+       atomic_long_set(&chunk->refs, 1);
         for (i = 0; i < count; i++) {
                 INIT_LIST_HEAD(&chunk->owners[i].list);
                 chunk->owners[i].index = i;
@@ -129,9 +132,8 @@ static struct audit_chunk *alloc_chunk(int count)
         return chunk;
  }
  
-static void __free_chunk(struct rcu_head *rcu)
+static void free_chunk(struct audit_chunk *chunk)
  {
-       struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
         int i;
  
         for (i = 0; i < chunk->count; i++) {
@@ -141,14 +143,16 @@ static void __free_chunk(struct rcu_head *rcu)
         kfree(chunk);
  }
  
-static inline void free_chunk(struct audit_chunk *chunk)
+void audit_put_chunk(struct audit_chunk *chunk)
  {
-       call_rcu(&chunk->head, __free_chunk);
+       if (atomic_long_dec_and_test(&chunk->refs))
+               free_chunk(chunk);
  }
  
-void audit_put_chunk(struct audit_chunk *chunk)
+static void __put_chunk(struct rcu_head *rcu)
  {
-       put_inotify_watch(&chunk->watch);
+       struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
+       audit_put_chunk(chunk);
  }
  
  enum {HASH_SIZE = 128};
@@ -176,7 +180,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
  
         list_for_each_entry_rcu(p, list, hash) {
                 if (p->watch.inode == inode) {
-                       get_inotify_watch(&p->watch);
+                       atomic_long_inc(&p->refs);
                         return p;
                 }
         }
@@ -194,17 +198,49 @@ int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
  
  /* tagging and untagging inodes with trees */
  
-static void untag_chunk(struct audit_chunk *chunk, struct node *p)
+static struct audit_chunk *find_chunk(struct node *p)
+{
+       int index = p->index & ~(1U<<31);
+       p -= index;
+       return container_of(p, struct audit_chunk, owners[0]);
+}
+
+static void untag_chunk(struct node *p)
  {
+       struct audit_chunk *chunk = find_chunk(p);
         struct audit_chunk *new;
         struct audit_tree *owner;
         int size = chunk->count - 1;
         int i, j;
  
+       if (!pin_inotify_watch(&chunk->watch)) {
+               /*
+                * Filesystem is shutting down; all watches are getting
+                * evicted, just take it off the node list for this
+                * tree and let the eviction logics take care of the
+                * rest.
+                */
+               owner = p->owner;
+               if (owner->root == chunk) {
+                       list_del_init(&owner->same_root);
+                       owner->root = NULL;
+               }
+               list_del_init(&p->list);
+               p->owner = NULL;
+               put_tree(owner);
+               return;
+       }
+
+       spin_unlock(&hash_lock);
+
+       /*
+        * pin_inotify_watch() succeeded, so the watch won't go away
+        * from under us.
+        */
         mutex_lock(&chunk->watch.inode->inotify_mutex);
         if (chunk->dead) {
                 mutex_unlock(&chunk->watch.inode->inotify_mutex);
-               return;
+               goto out;
         }
  
         owner = p->owner;
@@ -221,7 +257,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
                 inotify_evict_watch(&chunk->watch);
                 mutex_unlock(&chunk->watch.inode->inotify_mutex);
                 put_inotify_watch(&chunk->watch);
-               return;
+               goto out;
         }
  
         new = alloc_chunk(size);
@@ -263,7 +299,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
         inotify_evict_watch(&chunk->watch);
         mutex_unlock(&chunk->watch.inode->inotify_mutex);
         put_inotify_watch(&chunk->watch);
-       return;
+       goto out;
  
  Fallback:
         // do the best we can
@@ -277,6 +313,9 @@ Fallback:
         put_tree(owner);
         spin_unlock(&hash_lock);
         mutex_unlock(&chunk->watch.inode->inotify_mutex);
+out:
+       unpin_inotify_watch(&chunk->watch);
+       spin_lock(&hash_lock);
  }
  
  static int create_chunk(struct inode *inode, struct audit_tree *tree)
@@ -387,13 +426,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
         return 0;
  }
  
-static struct audit_chunk *find_chunk(struct node *p)
-{
-       int index = p->index & ~(1U<<31);
-       p -= index;
-       return container_of(p, struct audit_chunk, owners[0]);
-}
-
  static void kill_rules(struct audit_tree *tree)
  {
         struct audit_krule *rule, *next;
@@ -431,17 +463,10 @@ static void prune_one(struct audit_tree *victim)
         spin_lock(&hash_lock);
         while (!list_empty(&victim->chunks)) {
                 struct node *p;
-               struct audit_chunk *chunk;
  
                 p = list_entry(victim->chunks.next, struct node, list);
-               chunk = find_chunk(p);
-               get_inotify_watch(&chunk->watch);
-               spin_unlock(&hash_lock);
-
-               untag_chunk(chunk, p);
  
-               put_inotify_watch(&chunk->watch);
-               spin_lock(&hash_lock);
+               untag_chunk(p);
         }
         spin_unlock(&hash_lock);
         put_tree(victim);
@@ -469,7 +494,6 @@ static void trim_marked(struct audit_tree *tree)
  
         while (!list_empty(&tree->chunks)) {
                 struct node *node;
-               struct audit_chunk *chunk;
  
                 node = list_entry(tree->chunks.next, struct node, list);
  
@@ -477,14 +501,7 @@ static void trim_marked(struct audit_tree *tree)
                 if (!(node->index & (1U<<31)))
                         break;
  
-               chunk = find_chunk(node);
-               get_inotify_watch(&chunk->watch);
-               spin_unlock(&hash_lock);
-
-               untag_chunk(chunk, node);
-
-               put_inotify_watch(&chunk->watch);
-               spin_lock(&hash_lock);
+               untag_chunk(node);
         }
         if (!tree->root && !tree->goner) {
                 tree->goner = 1;
@@ -878,7 +895,7 @@ static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
  static void destroy_watch(struct inotify_watch *watch)
  {
         struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
-       free_chunk(chunk);
+       call_rcu(&chunk->head, __put_chunk);
  }
  
  static const struct inotify_operations rtree_inotify_ops = {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c

index b7d354e..9fd85a4 100644 (file)
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1094,8 +1094,8 @@ static void audit_inotify_unregister(struct list_head *in_list)
         list_for_each_entry_safe(p, n, in_list, ilist) {
                 list_del(&p->ilist);
                 inotify_rm_watch(audit_ih, &p->wdata);
-               /* the put matching the get in audit_do_del_rule() */
-               put_inotify_watch(&p->wdata);
+               /* the unpin matching the pin in audit_do_del_rule() */
+               unpin_inotify_watch(&p->wdata);
         }
  }
  
@@ -1389,9 +1389,13 @@ static inline int audit_del_rule(struct audit_entry *entry,
                                 /* Put parent on the inotify un-registration
                                  * list.  Grab a reference before releasing
                                  * audit_filter_mutex, to be released in
-                                * audit_inotify_unregister(). */
-                               list_add(&parent->ilist, &inotify_list);
-                               get_inotify_watch(&parent->wdata);
+                                * audit_inotify_unregister().
+                                * If filesystem is going away, just leave
+                                * the sucker alone, eviction will take
+                                * care of it.
+                                */
+                               if (pin_inotify_watch(&parent->wdata))
+                                       list_add(&parent->ilist, &inotify_list);
                         }
                 }
         }
author	Al Viro <viro@ZenIV.linux.org.uk>
	Sat, 15 Nov 2008 01:15:43 +0000 (01:15 +0000)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 15 Nov 2008 20:26:44 +0000 (12:26 -0800)
fs/inotify.c		patch \| blob \| history
include/linux/inotify.h		patch \| blob \| history
kernel/audit_tree.c		patch \| blob \| history
kernel/auditfilter.c		patch \| blob \| history