Btrfs: heuristic: add byte set calculation

author Timofey Titovets <nefelim4ag@gmail.com>

Thu, 28 Sep 2017 14:33:40 +0000 (17:33 +0300)

committer David Sterba <dsterba@suse.com>

Wed, 1 Nov 2017 19:45:36 +0000 (20:45 +0100)
author Timofey Titovets <nefelim4ag@gmail.com>
Thu, 28 Sep 2017 14:33:40 +0000 (17:33 +0300)
committer David Sterba <dsterba@suse.com>
Wed, 1 Nov 2017 19:45:36 +0000 (20:45 +0100)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c

index 0d445c8..e949f07 100644 (file)
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -1222,6 +1222,45 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
         return 1;
  }
  
+/*
+ * Count byte values in buckets.
+ * This heuristic can detect textual data (configs, xml, json, html, etc.),
+ * because in most text-like data the byte set is restricted to a limited
+ * number of possible characters, and that restriction in most cases makes
+ * the data easy to compress.
+ *
+ * @BYTE_SET_THRESHOLD - consider all data within this byte set size:
+ *     less - compressible
+ *     more - need additional analysis
+ */
+#define BYTE_SET_THRESHOLD             (64)
+
+static u32 byte_set_size(const struct heuristic_ws *ws)
+{
+       u32 i;
+       u32 byte_set_size = 0;
+
+       for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
+               if (ws->bucket[i].count > 0)
+                       byte_set_size++;
+       }
+
+       /*
+        * Keep counting non-empty buckets in the rest of the table.  Once the
+        * byte set size is bigger than the threshold it's pointless to go on:
+        * the detection technique would fail for this type of data.
+        */
+       for (; i < BUCKET_SIZE; i++) {
+               if (ws->bucket[i].count > 0) {
+                       byte_set_size++;
+                       if (byte_set_size > BYTE_SET_THRESHOLD)
+                               return byte_set_size;
+               }
+       }
+
+       return byte_set_size;
+}
+
  static bool sample_repeated_patterns(struct heuristic_ws *ws)
  {
         const u32 half_of_sample = ws->sample_size / 2;
@@ -1321,6 +1360,12 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
                 ws->bucket[byte].count++;
         }
  
+       i = byte_set_size(ws);
+       if (i < BYTE_SET_THRESHOLD) {
+               ret = 2;
+               goto out;
+       }
+
  out:
         __free_workspace(0, ws_list, true);
         return ret;
author	Timofey Titovets <nefelim4ag@gmail.com>
	Thu, 28 Sep 2017 14:33:40 +0000 (17:33 +0300)
committer	David Sterba <dsterba@suse.com>
	Wed, 1 Nov 2017 19:45:36 +0000 (20:45 +0100)