drivers/block/drbd/drbd_vli.h

   1 /* SPDX-License-Identifier: GPL-2.0-only */
   2 /*
   3 -*- linux-c -*-
   4    drbd_receiver.c
   5    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   6
   7    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   8    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   9    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
  10
  11  */
  12
  13 #ifndef _DRBD_VLI_H
  14 #define _DRBD_VLI_H
  15
  16 /*
  17  * At a granularity of 4KiB storage represented per bit,
  18  * and stroage sizes of several TiB,
  19  * and possibly small-bandwidth replication,
  20  * the bitmap transfer time can take much too long,
  21  * if transmitted in plain text.
  22  *
  23  * We try to reduce the transferred bitmap information
  24  * by encoding runlengths of bit polarity.
  25  *
  26  * We never actually need to encode a "zero" (runlengths are positive).
  27  * But then we have to store the value of the first bit.
  28  * The first bit of information thus shall encode if the first runlength
  29  * gives the number of set or unset bits.
  30  *
  31  * We assume that large areas are either completely set or unset,
  32  * which gives good compression with any runlength method,
  33  * even when encoding the runlength as fixed size 32bit/64bit integers.
  34  *
  35  * Still, there may be areas where the polarity flips every few bits,
  36  * and encoding the runlength sequence of those areas with fix size
  37  * integers would be much worse than plaintext.
  38  *
  39  * We want to encode small runlength values with minimum code length,
  40  * while still being able to encode a Huge run of all zeros.
  41  *
  42  * Thus we need a Variable Length Integer encoding, VLI.
  43  *
  44  * For some cases, we produce more code bits than plaintext input.
  45  * We need to send incompressible chunks as plaintext, skip over them
  46  * and then see if the next chunk compresses better.
  47  *
  48  * We don't care too much about "excellent" compression ratio for large
  49  * runlengths (all set/all clear): whether we achieve a factor of 100
  50  * or 1000 is not that much of an issue.
  51  * We do not want to waste too much on short runlengths in the "noisy"
  52  * parts of the bitmap, though.
  53  *
  54  * There are endless variants of VLI, we experimented with:
  55  *  * simple byte-based
  56  *  * various bit based with different code word length.
  57  *
  58  * To avoid yet an other configuration parameter (choice of bitmap compression
  59  * algorithm) which was difficult to explain and tune, we just chose the one
  60  * variant that turned out best in all test cases.
  61  * Based on real world usage patterns, with device sizes ranging from a few GiB
  62  * to several TiB, file server/mailserver/webserver/mysql/postgress,
  63  * mostly idle to really busy, the all time winner (though sometimes only
  64  * marginally better) is:
  65  */
  66
  67 /*
  68  * encoding is "visualised" as
  69  * __little endian__ bitstream, least significant bit first (left most)
  70  *
  71  * this particular encoding is chosen so that the prefix code
  72  * starts as unary encoding the level, then modified so that
  73  * 10 levels can be described in 8bit, with minimal overhead
  74  * for the smaller levels.
  75  *
  76  * Number of data bits follow fibonacci sequence, with the exception of the
  77  * last level (+1 data bit, so it makes 64bit total).  The only worse code when
  78  * encoding bit polarity runlength is 1 plain bits => 2 code bits.
  79 prefix    data bits                                    max val  Nº data bits
  80 0 x                                                         0x2            1
  81 10 x                                                        0x4            1
  82 110 xx                                                      0x8            2
  83 1110 xxx                                                   0x10            3
  84 11110 xxx xx                                               0x30            5
  85 111110 xx xxxxxx                                          0x130            8
  86 11111100  xxxxxxxx xxxxx                                 0x2130           13
  87 11111110  xxxxxxxx xxxxxxxx xxxxx                      0x202130           21
  88 11111101  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xx   0x400202130           34
  89 11111111  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
  90  * maximum encodable value: 0x100000400202130 == 2**56 + some */
  91
  92 /* compression "table":
  93  transmitted   x                                0.29
  94  as plaintext x                                  ........................
  95              x                                   ........................
  96             x                                    ........................
  97            x    0.59                         0.21........................
  98           x      ........................................................
  99          x       .. c ...................................................
 100         x    0.44.. o ...................................................
 101        x .......... d ...................................................
 102       x  .......... e ...................................................
 103      X.............   ...................................................
 104     x.............. b ...................................................
 105 2.0x............... i ...................................................
 106  #X................ t ...................................................
 107  #................. s ...........................  plain bits  ..........
 108 -+-----------------------------------------------------------------------
 109  1             16              32                              64
 110 */
 111
 112 /* LEVEL: (total bits, prefix bits, prefix value),
 113  * sorted ascending by number of total bits.
 114  * The rest of the code table is calculated at compiletime from this. */
 115
 116 /* fibonacci data 1, 1, ... */
 117 #define VLI_L_1_1() do { \
 118         LEVEL( 2, 1, 0x00); \
 119         LEVEL( 3, 2, 0x01); \
 120         LEVEL( 5, 3, 0x03); \
 121         LEVEL( 7, 4, 0x07); \
 122         LEVEL(10, 5, 0x0f); \
 123         LEVEL(14, 6, 0x1f); \
 124         LEVEL(21, 8, 0x3f); \
 125         LEVEL(29, 8, 0x7f); \
 126         LEVEL(42, 8, 0xbf); \
 127         LEVEL(64, 8, 0xff); \
 128         } while (0)
 129
 130 /* finds a suitable level to decode the least significant part of in.
 131  * returns number of bits consumed.
 132  *
 133  * BUG() for bad input, as that would mean a buggy code table. */
 134 static inline int vli_decode_bits(u64 *out, const u64 in)
 135 {
 136         u64 adj = 1;
 137
 138 #define LEVEL(t,b,v)                                    \
 139         do {                                            \
 140                 if ((in & ((1 << b) -1)) == v) {        \
 141                         *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
 142                         return t;                       \
 143                 }                                       \
 144                 adj += 1ULL << (t - b);                 \
 145         } while (0)
 146
 147         VLI_L_1_1();
 148
 149         /* NOT REACHED, if VLI_LEVELS code table is defined properly */
 150         BUG();
 151 #undef LEVEL
 152 }
 153
 154 /* return number of code bits needed,
 155  * or negative error number */
 156 static inline int __vli_encode_bits(u64 *out, const u64 in)
 157 {
 158         u64 max = 0;
 159         u64 adj = 1;
 160
 161         if (in == 0)
 162                 return -EINVAL;
 163
 164 #define LEVEL(t,b,v) do {               \
 165                 max += 1ULL << (t - b); \
 166                 if (in <= max) {        \
 167                         if (out)        \
 168                                 *out = ((in - adj) << b) | v;   \
 169                         return t;       \
 170                 }                       \
 171                 adj = max + 1;          \
 172         } while (0)
 173
 174         VLI_L_1_1();
 175
 176         return -EOVERFLOW;
 177 #undef LEVEL
 178 }
 179
 180 #undef VLI_L_1_1
 181
 182 /* code from here down is independend of actually used bit code */
 183
 184 /*
 185  * Code length is determined by some unique (e.g. unary) prefix.
 186  * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
 187  * not a byte stream.
 188  */
 189
 190 /* for the bitstream, we need a cursor */
 191 struct bitstream_cursor {
 192         /* the current byte */
 193         u8 *b;
 194         /* the current bit within *b, nomalized: 0..7 */
 195         unsigned int bit;
 196 };
 197
 198 /* initialize cursor to point to first bit of stream */
 199 static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
 200 {
 201         cur->b = s;
 202         cur->bit = 0;
 203 }
 204
 205 /* advance cursor by that many bits; maximum expected input value: 64,
 206  * but depending on VLI implementation, it may be more. */
 207 static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
 208 {
 209         bits += cur->bit;
 210         cur->b = cur->b + (bits >> 3);
 211         cur->bit = bits & 7;
 212 }
 213
 214 /* the bitstream itself knows its length */
 215 struct bitstream {
 216         struct bitstream_cursor cur;
 217         unsigned char *buf;
 218         size_t buf_len;         /* in bytes */
 219
 220         /* for input stream:
 221          * number of trailing 0 bits for padding
 222          * total number of valid bits in stream: buf_len * 8 - pad_bits */
 223         unsigned int pad_bits;
 224 };
 225
 226 static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
 227 {
 228         bs->buf = s;
 229         bs->buf_len = len;
 230         bs->pad_bits = pad_bits;
 231         bitstream_cursor_reset(&bs->cur, bs->buf);
 232 }
 233
 234 static inline void bitstream_rewind(struct bitstream *bs)
 235 {
 236         bitstream_cursor_reset(&bs->cur, bs->buf);
 237         memset(bs->buf, 0, bs->buf_len);
 238 }
 239
 240 /* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
 241  * Ignores "pad_bits".
 242  * Returns zero if bits == 0 (nothing to do).
 243  * Returns number of bits used if successful.
 244  *
 245  * If there is not enough room left in bitstream,
 246  * leaves bitstream unchanged and returns -ENOBUFS.
 247  */
 248 static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
 249 {
 250         unsigned char *b = bs->cur.b;
 251         unsigned int tmp;
 252
 253         if (bits == 0)
 254                 return 0;
 255
 256         if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
 257                 return -ENOBUFS;
 258
 259         /* paranoia: strip off hi bits; they should not be set anyways. */
 260         if (bits < 64)
 261                 val &= ~0ULL >> (64 - bits);
 262
 263         *b++ |= (val & 0xff) << bs->cur.bit;
 264
 265         for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
 266                 *b++ |= (val >> tmp) & 0xff;
 267
 268         bitstream_cursor_advance(&bs->cur, bits);
 269         return bits;
 270 }
 271
 272 /* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
 273  *
 274  * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
 275  *
 276  * If there are less than the requested number of valid bits left in the
 277  * bitstream, still fetches all available bits.
 278  *
 279  * Returns number of actually fetched bits.
 280  */
 281 static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
 282 {
 283         u64 val;
 284         unsigned int n;
 285
 286         if (bits > 64)
 287                 return -EINVAL;
 288
 289         if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
 290                 bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
 291                         - bs->cur.bit - bs->pad_bits;
 292
 293         if (bits == 0) {
 294                 *out = 0;
 295                 return 0;
 296         }
 297
 298         /* get the high bits */
 299         val = 0;
 300         n = (bs->cur.bit + bits + 7) >> 3;
 301         /* n may be at most 9, if cur.bit + bits > 64 */
 302         /* which means this copies at most 8 byte */
 303         if (n) {
 304                 memcpy(&val, bs->cur.b+1, n - 1);
 305                 val = le64_to_cpu(val) << (8 - bs->cur.bit);
 306         }
 307
 308         /* we still need the low bits */
 309         val |= bs->cur.b[0] >> bs->cur.bit;
 310
 311         /* and mask out bits we don't want */
 312         val &= ~0ULL >> (64 - bits);
 313
 314         bitstream_cursor_advance(&bs->cur, bits);
 315         *out = val;
 316
 317         return bits;
 318 }
 319
 320 /* encodes @in as vli into @bs;
 321
 322  * return values
 323  *  > 0: number of bits successfully stored in bitstream
 324  * -ENOBUFS @bs is full
 325  * -EINVAL input zero (invalid)
 326  * -EOVERFLOW input too large for this vli code (invalid)
 327  */
 328 static inline int vli_encode_bits(struct bitstream *bs, u64 in)
 329 {
 330         u64 code;
 331         int bits = __vli_encode_bits(&code, in);
 332
 333         if (bits <= 0)
 334                 return bits;
 335
 336         return bitstream_put_bits(bs, code, bits);
 337 }
 338
 339 #endif