erofs-utils: lib: treat data blocks filled with 0s as a hole
author Sandeep Dhavale <dhavale@google.com>
Wed, 17 Apr 2024 23:48:44 +0000 (16:48 -0700)
committer Gao Xiang <hsiangkao@linux.alibaba.com>
Mon, 22 Apr 2024 00:39:44 +0000 (08:39 +0800)
Add an optimization to treat data blocks filled with 0s as a hole.
Even though the disk space savings are comparable to those of chunk-based
files or dedupe, having no block assigned saves us redundant disk I/Os
during reads.

To detect blocks filled with zeros during chunking, we insert a block
filled with zeros (zerochunk) into the hashmap. If we detect a possible
dedupe against the zerochunk, we map it to the hole chunk so that no
physical block is assigned.

Signed-off-by: Sandeep Dhavale <dhavale@google.com>
Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Link: https://lore.kernel.org/r/20240417234845.2758882-1-dhavale@google.com
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
include/erofs/blobchunk.h
lib/blobchunk.c
mkfs/main.c

index a67464033dc53147c75e7a4137bbb37e5531a9d8..ebe2efe477185c4061bdf2efaf02302453b5fa31 100644 (file)
@@ -23,7 +23,7 @@ int erofs_write_zero_inode(struct erofs_inode *inode);
 int tarerofs_write_chunkes(struct erofs_inode *inode, erofs_off_t data_offset);
 int erofs_mkfs_dump_blobs(struct erofs_sb_info *sbi);
 void erofs_blob_exit(void);
-int erofs_blob_init(const char *blobfile_path);
+int erofs_blob_init(const char *blobfile_path, erofs_off_t chunksize);
 int erofs_mkfs_init_devices(struct erofs_sb_info *sbi, unsigned int devices);
 
 #ifdef __cplusplus
index 641e3d4d549a1b8bced4082d7d4a3fab4027acb9..645bcc11be50010573dc781e992419a85b397463 100644 (file)
@@ -69,8 +69,15 @@ static struct erofs_blobchunk *erofs_blob_getchunk(struct erofs_sb_info *sbi,
        chunk = hashmap_get_from_hash(&blob_hashmap, hash, sha256);
        if (chunk) {
                DBG_BUGON(chunksize != chunk->chunksize);
+
                sbi->saved_by_deduplication += chunksize;
-               erofs_dbg("Found duplicated chunk at %u", chunk->blkaddr);
+               if (chunk->blkaddr == erofs_holechunk.blkaddr) {
+                       chunk = &erofs_holechunk;
+                       erofs_dbg("Found duplicated hole chunk");
+               } else {
+                       erofs_dbg("Found duplicated chunk at %u",
+                                 chunk->blkaddr);
+               }
                return chunk;
        }
 
@@ -231,7 +238,21 @@ static void erofs_update_minextblks(struct erofs_sb_info *sbi,
        if (lb && lb < *minextblks)
                *minextblks = lb;
 }
-
+static bool erofs_blob_can_merge(struct erofs_sb_info *sbi,
+                                struct erofs_blobchunk *lastch,
+                                struct erofs_blobchunk *chunk)
+{
+       if (!lastch)
+               return true;
+       if (lastch == &erofs_holechunk && chunk == &erofs_holechunk)
+               return true;
+       if (lastch->device_id == chunk->device_id &&
+               erofs_pos(sbi, lastch->blkaddr) + lastch->chunksize ==
+               erofs_pos(sbi, chunk->blkaddr))
+               return true;
+
+       return false;
+}
 int erofs_blob_write_chunked_file(struct erofs_inode *inode, int fd,
                                  erofs_off_t startoff)
 {
@@ -303,16 +324,19 @@ int erofs_blob_write_chunked_file(struct erofs_inode *inode, int fd,
                }
 
                if (offset > pos) {
-                       len = 0;
-                       erofs_update_minextblks(sbi, interval_start, pos,
-                                               &minextblks);
+                       if (!erofs_blob_can_merge(sbi, lastch,
+                                                       &erofs_holechunk)) {
+                               erofs_update_minextblks(sbi, interval_start,
+                                                       pos, &minextblks);
+                               interval_start = pos;
+                       }
                        do {
                                *(void **)idx++ = &erofs_holechunk;
                                pos += chunksize;
                        } while (pos < offset);
                        DBG_BUGON(pos != offset);
-                       lastch = NULL;
-                       interval_start = pos;
+                       lastch = &erofs_holechunk;
+                       len = 0;
                        continue;
                }
 #endif
@@ -330,9 +354,7 @@ int erofs_blob_write_chunked_file(struct erofs_inode *inode, int fd,
                        goto err;
                }
 
-               if (lastch && (lastch->device_id != chunk->device_id ||
-                   erofs_pos(sbi, lastch->blkaddr) + lastch->chunksize !=
-                   erofs_pos(sbi, chunk->blkaddr))) {
+               if (!erofs_blob_can_merge(sbi, lastch, chunk)) {
                        erofs_update_minextblks(sbi, interval_start, pos,
                                                &minextblks);
                        interval_start = pos;
@@ -540,7 +562,36 @@ void erofs_blob_exit(void)
        }
 }
 
-int erofs_blob_init(const char *blobfile_path)
+static int erofs_insert_zerochunk(erofs_off_t chunksize)
+{
+       u8 *zeros;
+       struct erofs_blobchunk *chunk;
+       u8 sha256[32];
+       unsigned int hash;
+       int ret = 0;
+
+       zeros = calloc(1, chunksize);
+       if (!zeros)
+               return -ENOMEM;
+
+       erofs_sha256(zeros, chunksize, sha256);
+       free(zeros);
+       hash = memhash(sha256, sizeof(sha256));
+       chunk = malloc(sizeof(struct erofs_blobchunk));
+       if (!chunk)
+               return -ENOMEM;
+
+       chunk->chunksize = chunksize;
+       /* treat chunk filled with zeros as hole */
+       chunk->blkaddr = erofs_holechunk.blkaddr;
+       memcpy(chunk->sha256, sha256, sizeof(sha256));
+
+       hashmap_entry_init(&chunk->ent, hash);
+       hashmap_add(&blob_hashmap, chunk);
+       return ret;
+}
+
+int erofs_blob_init(const char *blobfile_path, erofs_off_t chunksize)
 {
        if (!blobfile_path) {
 #ifdef HAVE_TMPFILE64
@@ -557,7 +608,7 @@ int erofs_blob_init(const char *blobfile_path)
                return -EACCES;
 
        hashmap_init(&blob_hashmap, erofs_blob_hashmap_cmp, 0);
-       return 0;
+       return erofs_insert_zerochunk(chunksize);
 }
 
 int erofs_mkfs_init_devices(struct erofs_sb_info *sbi, unsigned int devices)
index 2fb4a57e1aed09a6e4ea5196473623bd1a679cee..d632f740ad3fa10c1bf0398bd353a1a6a5ce70a6 100644 (file)
@@ -1255,7 +1255,7 @@ int main(int argc, char **argv)
        }
 
        if (cfg.c_chunkbits) {
-               err = erofs_blob_init(cfg.c_blobdev_path);
+               err = erofs_blob_init(cfg.c_blobdev_path, 1 << cfg.c_chunkbits);
                if (err)
                        return 1;
        }