erofs: kill hooked chains to avoid loops on deduplicated compressed images
authorGao Xiang <hsiangkao@linux.alibaba.com>
Fri, 26 May 2023 20:14:56 +0000 (04:14 +0800)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 19 Jul 2023 14:20:55 +0000 (16:20 +0200)
[ Upstream commit 967c28b23f6c89bb8eef6a046ea88afe0d7c1029 ]

After heavily stressing EROFS with several images which include a
hand-crafted image of repeated patterns for more than 46 days, I found
two chains could be linked with each other almost simultaneously and
form a loop so that the entire loop won't be submitted.  As a
consequence, the corresponding file pages will remain locked forever.

It can be _only_ observed on data-deduplicated compressed images.
For example, consider two chains with five pclusters in total:
Chain 1:  2->3->4->5    -- The tail pcluster is 5;
        Chain 2:  5->1->2       -- The tail pcluster is 2.

Chain 2 could link to Chain 1 with pcluster 5; and Chain 1 could link
to Chain 2 at the same time with pcluster 2.

Since hooked chains are all linked locklessly now, I have no idea how
to simply avoid the race.  Instead, let's avoid hooked chains completely
until I could work out a proper way to fix this and end users finally
tell us that it's needed to add it back.

Actually, this optimization can be found with multi-threaded workloads
(especially even more often on deduplicated compressed images), yet I'm
not sure about the overall system impacts of not having this compared
with implementation complexity.

Fixes: 267f2492c8f7 ("erofs: introduce multi-reference pclusters (fully-referenced)")
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Reviewed-by: Yue Hu <huyue2@coolpad.com>
Link: https://lore.kernel.org/r/20230526201459.128169-4-hsiangkao@linux.alibaba.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
fs/erofs/zdata.c

index aaddb67..92b2e4d 100644 (file)
@@ -94,11 +94,8 @@ struct z_erofs_pcluster {
 
 /* let's avoid the valid 32-bit kernel addresses */
 
-/* the chained workgroup has't submitted io (still open) */
+/* the end of a chain of pclusters */
 #define Z_EROFS_PCLUSTER_TAIL           ((void *)0x5F0ECAFE)
-/* the chained workgroup has already submitted io */
-#define Z_EROFS_PCLUSTER_TAIL_CLOSED    ((void *)0x5F0EDEAD)
-
 #define Z_EROFS_PCLUSTER_NIL            (NULL)
 
 struct z_erofs_decompressqueue {
@@ -376,20 +373,6 @@ int __init z_erofs_init_zip_subsystem(void)
 enum z_erofs_pclustermode {
        Z_EROFS_PCLUSTER_INFLIGHT,
        /*
-        * The current pclusters was the tail of an exist chain, in addition
-        * that the previous processed chained pclusters are all decided to
-        * be hooked up to it.
-        * A new chain will be created for the remaining pclusters which are
-        * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED,
-        * the next pcluster cannot reuse the whole page safely for inplace I/O
-        * in the following scenario:
-        *  ________________________________________________________________
-        * |      tail (partial) page     |       head (partial) page       |
-        * |   (belongs to the next pcl)  |   (belongs to the current pcl)  |
-        * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________|
-        */
-       Z_EROFS_PCLUSTER_HOOKED,
-       /*
         * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
         * could be dispatched into bypass queue later due to uptodated managed
         * pages. All related online pages cannot be reused for inplace I/O (or
@@ -406,8 +389,8 @@ enum z_erofs_pclustermode {
         *  ________________________________________________________________
         * |  tail (partial) page |          head (partial) page           |
         * |  (of the current cl) |      (of the previous collection)      |
-        * | PCLUSTER_FOLLOWED or |                                        |
-        * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________|
+        * |                      |                                        |
+        * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________|
         *
         * [  (*) the above page can be used as inplace I/O.               ]
         */
@@ -420,7 +403,7 @@ struct z_erofs_decompress_frontend {
        struct z_erofs_bvec_iter biter;
 
        struct page *candidate_bvpage;
-       struct z_erofs_pcluster *pcl, *tailpcl;
+       struct z_erofs_pcluster *pcl;
        z_erofs_next_pcluster_t owned_head;
        enum z_erofs_pclustermode mode;
 
@@ -626,19 +609,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
                return;
        }
 
-       /*
-        * type 2, link to the end of an existing open chain, be careful
-        * that its submission is controlled by the original attached chain.
-        */
-       if (*owned_head != &pcl->next && pcl != f->tailpcl &&
-           cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
-                   *owned_head) == Z_EROFS_PCLUSTER_TAIL) {
-               *owned_head = Z_EROFS_PCLUSTER_TAIL;
-               f->mode = Z_EROFS_PCLUSTER_HOOKED;
-               f->tailpcl = NULL;
-               return;
-       }
-       /* type 3, it belongs to a chain, but it isn't the end of the chain */
+       /* type 2, it belongs to an ongoing chain */
        f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
 }
 
@@ -699,9 +670,6 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
                        goto err_out;
                }
        }
-       /* used to check tail merging loop due to corrupted images */
-       if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
-               fe->tailpcl = pcl;
        fe->owned_head = &pcl->next;
        fe->pcl = pcl;
        return 0;
@@ -722,7 +690,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
 
        /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
        DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
-       DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
 
        if (!(map->m_flags & EROFS_MAP_META)) {
                grp = erofs_find_workgroup(fe->inode->i_sb,
@@ -741,10 +708,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
 
        if (ret == -EEXIST) {
                mutex_lock(&fe->pcl->lock);
-               /* used to check tail merging loop due to corrupted images */
-               if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
-                       fe->tailpcl = fe->pcl;
-
                z_erofs_try_to_claim_pcluster(fe);
        } else if (ret) {
                return ret;
@@ -901,8 +864,7 @@ hitted:
         * those chains are handled asynchronously thus the page cannot be used
         * for inplace I/O or bvpage (should be processed in a strict order.)
         */
-       tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED &&
-                 fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
+       tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
 
        cur = end - min_t(unsigned int, offset + end - map->m_la, end);
        if (!(map->m_flags & EROFS_MAP_MAPPED)) {
@@ -1283,10 +1245,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
        };
        z_erofs_next_pcluster_t owned = io->head;
 
-       while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
-               /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
-               DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
-               /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */
+       while (owned != Z_EROFS_PCLUSTER_TAIL) {
                DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
 
                be.pcl = container_of(owned, struct z_erofs_pcluster, next);
@@ -1303,7 +1262,7 @@ static void z_erofs_decompressqueue_work(struct work_struct *work)
                container_of(work, struct z_erofs_decompressqueue, u.work);
        struct page *pagepool = NULL;
 
-       DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+       DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL);
        z_erofs_decompress_queue(bgq, &pagepool);
 
        erofs_release_pages(&pagepool);
@@ -1465,7 +1424,7 @@ fg_out:
                q->sync = true;
        }
        q->sb = sb;
-       q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
+       q->head = Z_EROFS_PCLUSTER_TAIL;
        return q;
 }
 
@@ -1483,11 +1442,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
        z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
        z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];
 
-       DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
-       if (owned_head == Z_EROFS_PCLUSTER_TAIL)
-               owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
-
-       WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);
+       WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL);
 
        WRITE_ONCE(*submit_qtail, owned_head);
        WRITE_ONCE(*bypass_qtail, &pcl->next);
@@ -1558,15 +1513,10 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
                unsigned int i = 0;
                bool bypass = true;
 
-               /* no possible 'owned_head' equals the following */
-               DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
                DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
-
                pcl = container_of(owned_head, struct z_erofs_pcluster, next);
+               owned_head = READ_ONCE(pcl->next);
 
-               /* close the main owned chain at first */
-               owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
-                                    Z_EROFS_PCLUSTER_TAIL_CLOSED);
                if (z_erofs_is_inline_pcluster(pcl)) {
                        move_to_bypass_jobqueue(pcl, qtail, owned_head);
                        continue;