linux-zen-server/drivers/block/drbd/drbd_vli.h

/* SPDX-License-Identifier: GPL-2.0-only */
/*
-*- linux-c -*-
   drbd_receiver.c
   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

 */

#ifndef _DRBD_VLI_H
#define _DRBD_VLI_H

/*
 * At a granularity of 4KiB storage represented per bit,
 * and stroage sizes of several TiB,
 * and possibly small-bandwidth replication,
 * the bitmap transfer time can take much too long,
 * if transmitted in plain text.
 *
 * We try to reduce the transferred bitmap information
 * by encoding runlengths of bit polarity.
 *
 * We never actually need to encode a "zero" (runlengths are positive).
 * But then we have to store the value of the first bit.
 * The first bit of information thus shall encode if the first runlength
 * gives the number of set or unset bits.
 *
 * We assume that large areas are either completely set or unset,
 * which gives good compression with any runlength method,
 * even when encoding the runlength as fixed size 32bit/64bit integers.
 *
 * Still, there may be areas where the polarity flips every few bits,
 * and encoding the runlength sequence of those areas with fix size
 * integers would be much worse than plaintext.
 *
 * We want to encode small runlength values with minimum code length,
 * while still being able to encode a Huge run of all zeros.
 *
 * Thus we need a Variable Length Integer encoding, VLI.
 *
 * For some cases, we produce more code bits than plaintext input.
 * We need to send incompressible chunks as plaintext, skip over them
 * and then see if the next chunk compresses better.
 *
 * We don't care too much about "excellent" compression ratio for large
 * runlengths (all set/all clear): whether we achieve a factor of 100
 * or 1000 is not that much of an issue.
 * We do not want to waste too much on short runlengths in the "noisy"
 * parts of the bitmap, though.
 *
 * There are endless variants of VLI, we experimented with:
 *  * simple byte-based
 *  * various bit based with different code word length.
 *
 * To avoid yet an other configuration parameter (choice of bitmap compression
 * algorithm) which was difficult to explain and tune, we just chose the one
 * variant that turned out best in all test cases.
 * Based on real world usage patterns, with device sizes ranging from a few GiB
 * to several TiB, file server/mailserver/webserver/mysql/postgress,
 * mostly idle to really busy, the all time winner (though sometimes only
 * marginally better) is:
 */

/*
 * encoding is "visualised" as
 * __little endian__ bitstream, least significant bit first (left most)
 *
 * this particular encoding is chosen so that the prefix code
 * starts as unary encoding the level, then modified so that
 * 10 levels can be described in 8bit, with minimal overhead
 * for the smaller levels.
 *
 * Number of data bits follow fibonacci sequence, with the exception of the
 * last level (+1 data bit, so it makes 64bit total).  The only worse code when
 * encoding bit polarity runlength is 1 plain bits => 2 code bits.
prefix    data bits                                    max val  Nº data bits
0 x                                                         0x2            1
10 x                                                        0x4            1
110 xx                                                      0x8            2
1110 xxx                                                   0x10            3
11110 xxx xx                                               0x30            5
111110 xx xxxxxx                                          0x130            8
11111100  xxxxxxxx xxxxx                                 0x2130           13
11111110  xxxxxxxx xxxxxxxx xxxxx                      0x202130           21
11111101  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xx   0x400202130           34
11111111  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
 * maximum encodable value: 0x100000400202130 == 2**56 + some */

/* compression "table":
 transmitted   x                                0.29
 as plaintext x                                  ........................
             x                                   ........................
            x                                    ........................
           x    0.59                         0.21........................
          x      ........................................................
         x       .. c ...................................................
        x    0.44.. o ...................................................
       x .......... d ...................................................
      x  .......... e ...................................................
     X.............   ...................................................
    x.............. b ...................................................
2.0x............... i ...................................................
 #X................ t ...................................................
 #................. s ...........................  plain bits  ..........
-+-----------------------------------------------------------------------
 1             16              32                              64
*/

/* LEVEL: (total bits, prefix bits, prefix value),
 * sorted ascending by number of total bits.
 * The rest of the code table is calculated at compiletime from this. */

/* fibonacci data 1, 1, ... */
#define VLI_L_1_1() do { \
	LEVEL( 2, 1, 0x00); \
	LEVEL( 3, 2, 0x01); \
	LEVEL( 5, 3, 0x03); \
	LEVEL( 7, 4, 0x07); \
	LEVEL(10, 5, 0x0f); \
	LEVEL(14, 6, 0x1f); \
	LEVEL(21, 8, 0x3f); \
	LEVEL(29, 8, 0x7f); \
	LEVEL(42, 8, 0xbf); \
	LEVEL(64, 8, 0xff); \
	} while (0)

/* finds a suitable level to decode the least significant part of in.
 * returns number of bits consumed.
 *
 * BUG() for bad input, as that would mean a buggy code table. */
static inline int vli_decode_bits(u64 *out, const u64 in)
{
	u64 adj = 1;

#define LEVEL(t,b,v)					\
	do {						\
		if ((in & ((1 << b) -1)) == v) {	\
			*out = ((in & ((~0ULL) >> (64-t))) >> b) + adj;	\
			return t;			\
		}					\
		adj += 1ULL << (t - b);			\
	} while (0)

	VLI_L_1_1();

	/* NOT REACHED, if VLI_LEVELS code table is defined properly */
	BUG();
#undef LEVEL
}

/* return number of code bits needed,
 * or negative error number */
static inline int __vli_encode_bits(u64 *out, const u64 in)
{
	u64 max = 0;
	u64 adj = 1;

	if (in == 0)
		return -EINVAL;

#define LEVEL(t,b,v) do {		\
		max += 1ULL << (t - b);	\
		if (in <= max) {	\
			if (out)	\
				*out = ((in - adj) << b) | v;	\
			return t;	\
		}			\
		adj = max + 1;		\
	} while (0)

	VLI_L_1_1();

	return -EOVERFLOW;
#undef LEVEL
}

#undef VLI_L_1_1

/* code from here down is independend of actually used bit code */

/*
 * Code length is determined by some unique (e.g. unary) prefix.
 * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
 * not a byte stream.
 */

/* for the bitstream, we need a cursor */
struct bitstream_cursor {
	/* the current byte */
	u8 *b;
	/* the current bit within *b, nomalized: 0..7 */
	unsigned int bit;
};

/* initialize cursor to point to first bit of stream */
static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
{
	cur->b = s;
	cur->bit = 0;
}

/* advance cursor by that many bits; maximum expected input value: 64,
 * but depending on VLI implementation, it may be more. */
static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
{
	bits += cur->bit;
	cur->b = cur->b + (bits >> 3);
	cur->bit = bits & 7;
}

/* the bitstream itself knows its length */
struct bitstream {
	struct bitstream_cursor cur;
	unsigned char *buf;
	size_t buf_len;		/* in bytes */

	/* for input stream:
	 * number of trailing 0 bits for padding
	 * total number of valid bits in stream: buf_len * 8 - pad_bits */
	unsigned int pad_bits;
};

static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
{
	bs->buf = s;
	bs->buf_len = len;
	bs->pad_bits = pad_bits;
	bitstream_cursor_reset(&bs->cur, bs->buf);
}

static inline void bitstream_rewind(struct bitstream *bs)
{
	bitstream_cursor_reset(&bs->cur, bs->buf);
	memset(bs->buf, 0, bs->buf_len);
}

/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
 * Ignores "pad_bits".
 * Returns zero if bits == 0 (nothing to do).
 * Returns number of bits used if successful.
 *
 * If there is not enough room left in bitstream,
 * leaves bitstream unchanged and returns -ENOBUFS.
 */
static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
{
	unsigned char *b = bs->cur.b;
	unsigned int tmp;

	if (bits == 0)
		return 0;

	if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
		return -ENOBUFS;

	/* paranoia: strip off hi bits; they should not be set anyways. */
	if (bits < 64)
		val &= ~0ULL >> (64 - bits);

	*b++ |= (val & 0xff) << bs->cur.bit;

	for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
		*b++ |= (val >> tmp) & 0xff;

	bitstream_cursor_advance(&bs->cur, bits);
	return bits;
}

/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
 *
 * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
 *
 * If there are less than the requested number of valid bits left in the
 * bitstream, still fetches all available bits.
 *
 * Returns number of actually fetched bits.
 */
static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
{
	u64 val;
	unsigned int n;

	if (bits > 64)
		return -EINVAL;

	if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
		bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
			- bs->cur.bit - bs->pad_bits;

	if (bits == 0) {
		*out = 0;
		return 0;
	}

	/* get the high bits */
	val = 0;
	n = (bs->cur.bit + bits + 7) >> 3;
	/* n may be at most 9, if cur.bit + bits > 64 */
	/* which means this copies at most 8 byte */
	if (n) {
		memcpy(&val, bs->cur.b+1, n - 1);
		val = le64_to_cpu(val) << (8 - bs->cur.bit);
	}

	/* we still need the low bits */
	val |= bs->cur.b[0] >> bs->cur.bit;

	/* and mask out bits we don't want */
	val &= ~0ULL >> (64 - bits);

	bitstream_cursor_advance(&bs->cur, bits);
	*out = val;

	return bits;
}

/* encodes @in as vli into @bs;

 * return values
 *  > 0: number of bits successfully stored in bitstream
 * -ENOBUFS @bs is full
 * -EINVAL input zero (invalid)
 * -EOVERFLOW input too large for this vli code (invalid)
 */
static inline int vli_encode_bits(struct bitstream *bs, u64 in)
{
	u64 code;
	int bits = __vli_encode_bits(&code, in);

	if (bits <= 0)
		return bits;

	return bitstream_put_bits(bs, code, bits);
}

#endif
Initial commit 2023-08-30 17:53:23 +02:00			`/* SPDX-License-Identifier: GPL-2.0-only */`
			`/*`
			`-- linux-c --`
			`drbd_receiver.c`
			`This file is part of DRBD by Philipp Reisner and Lars Ellenberg.`

			`Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.`
			`Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.`
			`Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.`

			`*/`

			`#ifndef _DRBD_VLI_H`
			`#define _DRBD_VLI_H`

			`/*`
			`* At a granularity of 4KiB storage represented per bit,`
			`* and stroage sizes of several TiB,`
			`* and possibly small-bandwidth replication,`
			`* the bitmap transfer time can take much too long,`
			`* if transmitted in plain text.`
			`*`
			`* We try to reduce the transferred bitmap information`
			`* by encoding runlengths of bit polarity.`
			`*`
			`* We never actually need to encode a "zero" (runlengths are positive).`
			`* But then we have to store the value of the first bit.`
			`* The first bit of information thus shall encode if the first runlength`
			`* gives the number of set or unset bits.`
			`*`
			`* We assume that large areas are either completely set or unset,`
			`* which gives good compression with any runlength method,`
			`* even when encoding the runlength as fixed size 32bit/64bit integers.`
			`*`
			`* Still, there may be areas where the polarity flips every few bits,`
			`* and encoding the runlength sequence of those areas with fix size`
			`* integers would be much worse than plaintext.`
			`*`
			`* We want to encode small runlength values with minimum code length,`
			`* while still being able to encode a Huge run of all zeros.`
			`*`
			`* Thus we need a Variable Length Integer encoding, VLI.`
			`*`
			`* For some cases, we produce more code bits than plaintext input.`
			`* We need to send incompressible chunks as plaintext, skip over them`
			`* and then see if the next chunk compresses better.`
			`*`
			`* We don't care too much about "excellent" compression ratio for large`
			`* runlengths (all set/all clear): whether we achieve a factor of 100`
			`* or 1000 is not that much of an issue.`
			`* We do not want to waste too much on short runlengths in the "noisy"`
			`* parts of the bitmap, though.`
			`*`
			`* There are endless variants of VLI, we experimented with:`
			`* * simple byte-based`
			`* * various bit based with different code word length.`
			`*`
			`* To avoid yet an other configuration parameter (choice of bitmap compression`
			`* algorithm) which was difficult to explain and tune, we just chose the one`
			`* variant that turned out best in all test cases.`
			`* Based on real world usage patterns, with device sizes ranging from a few GiB`
			`* to several TiB, file server/mailserver/webserver/mysql/postgress,`
			`* mostly idle to really busy, the all time winner (though sometimes only`
			`* marginally better) is:`
			`*/`

			`/*`
			`* encoding is "visualised" as`
			`* __little endian__ bitstream, least significant bit first (left most)`
			`*`
			`* this particular encoding is chosen so that the prefix code`
			`* starts as unary encoding the level, then modified so that`
			`* 10 levels can be described in 8bit, with minimal overhead`
			`* for the smaller levels.`
			`*`
			`* Number of data bits follow fibonacci sequence, with the exception of the`
			`* last level (+1 data bit, so it makes 64bit total). The only worse code when`
			`* encoding bit polarity runlength is 1 plain bits => 2 code bits.`
			`prefix data bits max val Nº data bits`
			`0 x 0x2 1`
			`10 x 0x4 1`
			`110 xx 0x8 2`
			`1110 xxx 0x10 3`
			`11110 xxx xx 0x30 5`
			`111110 xx xxxxxx 0x130 8`
			`11111100 xxxxxxxx xxxxx 0x2130 13`
			`11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21`
			`11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34`
			`11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56`
			`* maximum encodable value: 0x100000400202130 == 2*56 + some /`

			`/* compression "table":`
			`transmitted x 0.29`
			`as plaintext x ........................`
			`x ........................`
			`x ........................`
			`x 0.59 0.21........................`
			`x ........................................................`
			`x .. c ...................................................`
			`x 0.44.. o ...................................................`
			`x .......... d ...................................................`
			`x .......... e ...................................................`
			`X............. ...................................................`
			`x.............. b ...................................................`
			`2.0x............... i ...................................................`
			`#X................ t ...................................................`
			`#................. s ........................... plain bits ..........`
			`-+-----------------------------------------------------------------------`
			`1 16 32 64`
			`*/`

			`/* LEVEL: (total bits, prefix bits, prefix value),`
			`* sorted ascending by number of total bits.`
			`* The rest of the code table is calculated at compiletime from this. */`

			`/* fibonacci data 1, 1, ... */`
			`#define VLI_L_1_1() do { \`
			`LEVEL( 2, 1, 0x00); \`
			`LEVEL( 3, 2, 0x01); \`
			`LEVEL( 5, 3, 0x03); \`
			`LEVEL( 7, 4, 0x07); \`
			`LEVEL(10, 5, 0x0f); \`
			`LEVEL(14, 6, 0x1f); \`
			`LEVEL(21, 8, 0x3f); \`
			`LEVEL(29, 8, 0x7f); \`
			`LEVEL(42, 8, 0xbf); \`
			`LEVEL(64, 8, 0xff); \`
			`} while (0)`

			`/* finds a suitable level to decode the least significant part of in.`
			`* returns number of bits consumed.`
			`*`
			`* BUG() for bad input, as that would mean a buggy code table. */`
			`static inline int vli_decode_bits(u64 *out, const u64 in)`
			`{`
			`u64 adj = 1;`

			`#define LEVEL(t,b,v) \`
			`do { \`
			`if ((in & ((1 << b) -1)) == v) { \`
			`*out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \`
			`return t; \`
			`} \`
			`adj += 1ULL << (t - b); \`
			`} while (0)`

			`VLI_L_1_1();`

			`/* NOT REACHED, if VLI_LEVELS code table is defined properly */`
			`BUG();`
			`#undef LEVEL`
			`}`

			`/* return number of code bits needed,`
			`* or negative error number */`
			`static inline int __vli_encode_bits(u64 *out, const u64 in)`
			`{`
			`u64 max = 0;`
			`u64 adj = 1;`

			`if (in == 0)`
			`return -EINVAL;`

			`#define LEVEL(t,b,v) do { \`
			`max += 1ULL << (t - b); \`
			`if (in <= max) { \`
			`if (out) \`
			`*out = ((in - adj) << b) \| v; \`
			`return t; \`
			`} \`
			`adj = max + 1; \`
			`} while (0)`

			`VLI_L_1_1();`

			`return -EOVERFLOW;`
			`#undef LEVEL`
			`}`

			`#undef VLI_L_1_1`

			`/* code from here down is independend of actually used bit code */`

			`/*`
			`* Code length is determined by some unique (e.g. unary) prefix.`
			`* This encodes arbitrary bit length, not whole bytes: we have a bit-stream,`
			`* not a byte stream.`
			`*/`

			`/* for the bitstream, we need a cursor */`
			`struct bitstream_cursor {`
			`/* the current byte */`
			`u8 *b;`
			`/* the current bit within b, nomalized: 0..7 /`
			`unsigned int bit;`
			`};`

			`/* initialize cursor to point to first bit of stream */`
			`static inline void bitstream_cursor_reset(struct bitstream_cursor cur, void s)`
			`{`
			`cur->b = s;`
			`cur->bit = 0;`
			`}`

			`/* advance cursor by that many bits; maximum expected input value: 64,`
			`* but depending on VLI implementation, it may be more. */`
			`static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)`
			`{`
			`bits += cur->bit;`
			`cur->b = cur->b + (bits >> 3);`
			`cur->bit = bits & 7;`
			`}`

			`/* the bitstream itself knows its length */`
			`struct bitstream {`
			`struct bitstream_cursor cur;`
			`unsigned char *buf;`
			`size_t buf_len; /* in bytes */`

			`/* for input stream:`
			`* number of trailing 0 bits for padding`
			`* total number of valid bits in stream: buf_len * 8 - pad_bits */`
			`unsigned int pad_bits;`
			`};`

			`static inline void bitstream_init(struct bitstream bs, void s, size_t len, unsigned int pad_bits)`
			`{`
			`bs->buf = s;`
			`bs->buf_len = len;`
			`bs->pad_bits = pad_bits;`
			`bitstream_cursor_reset(&bs->cur, bs->buf);`
			`}`

			`static inline void bitstream_rewind(struct bitstream *bs)`
			`{`
			`bitstream_cursor_reset(&bs->cur, bs->buf);`
			`memset(bs->buf, 0, bs->buf_len);`
			`}`

			`/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.`
			`* Ignores "pad_bits".`
			`* Returns zero if bits == 0 (nothing to do).`
			`* Returns number of bits used if successful.`
			`*`
			`* If there is not enough room left in bitstream,`
			`* leaves bitstream unchanged and returns -ENOBUFS.`
			`*/`
			`static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)`
			`{`
			`unsigned char *b = bs->cur.b;`
			`unsigned int tmp;`

			`if (bits == 0)`
			`return 0;`

			`if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)`
			`return -ENOBUFS;`

			`/* paranoia: strip off hi bits; they should not be set anyways. */`
			`if (bits < 64)`
			`val &= ~0ULL >> (64 - bits);`

			`*b++ \|= (val & 0xff) << bs->cur.bit;`

			`for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)`
			`*b++ \|= (val >> tmp) & 0xff;`

			`bitstream_cursor_advance(&bs->cur, bits);`
			`return bits;`
			`}`

			`/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.`
			`*`
			`* If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.`
			`*`
			`* If there are less than the requested number of valid bits left in the`
			`* bitstream, still fetches all available bits.`
			`*`
			`* Returns number of actually fetched bits.`
			`*/`
			`static inline int bitstream_get_bits(struct bitstream bs, u64 out, int bits)`
			`{`
			`u64 val;`
			`unsigned int n;`

			`if (bits > 64)`
			`return -EINVAL;`

			`if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)`
			`bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)`
			`- bs->cur.bit - bs->pad_bits;`

			`if (bits == 0) {`
			`*out = 0;`
			`return 0;`
			`}`

			`/* get the high bits */`
			`val = 0;`
			`n = (bs->cur.bit + bits + 7) >> 3;`
			`/* n may be at most 9, if cur.bit + bits > 64 */`
			`/* which means this copies at most 8 byte */`
			`if (n) {`
			`memcpy(&val, bs->cur.b+1, n - 1);`
			`val = le64_to_cpu(val) << (8 - bs->cur.bit);`
			`}`

			`/* we still need the low bits */`
			`val \|= bs->cur.b[0] >> bs->cur.bit;`

			`/* and mask out bits we don't want */`
			`val &= ~0ULL >> (64 - bits);`

			`bitstream_cursor_advance(&bs->cur, bits);`
			`*out = val;`

			`return bits;`
			`}`

			`/* encodes @in as vli into @bs;`

			`* return values`
			`* > 0: number of bits successfully stored in bitstream`
			`* -ENOBUFS @bs is full`
			`* -EINVAL input zero (invalid)`
			`* -EOVERFLOW input too large for this vli code (invalid)`
			`*/`
			`static inline int vli_encode_bits(struct bitstream *bs, u64 in)`
			`{`
			`u64 code;`
			`int bits = __vli_encode_bits(&code, in);`

			`if (bits <= 0)`
			`return bits;`

			`return bitstream_put_bits(bs, code, bits);`
			`}`

			`#endif`