From 6f6c41992772b279ebb2fb0d8f786e98292f23af Mon Sep 17 00:00:00 2001
From: Dongwoo Lee
Date: Thu, 4 Jun 2020 13:54:44 +0900
Subject: [PATCH] lib: zstd: Improve decode performance

To speed up decode performance, optimizations are brought in from the
upstream zstd GitHub repository and ported to kernel style. Since the
low-level algorithms are preferred in Linux for their
compression/decompression performance (the default is level 3), the
optimizations for the low levels were chosen as follows:

[1] lib: zstd: Speed up single segment zstd_fast by 5%
    (https://github.com/facebook/zstd/pull/1562/commits/95624b77e477752b3c380c22be7bcf67f06c9934)
[2] perf improvements for zstd decode
    (https://github.com/facebook/zstd/pull/1668/commits/29d1e81bbdfc21085529623e7bc5abcb3e1627ae)
[3] updated double_fast complementary insertion
    (https://github.com/facebook/zstd/pull/1681/commits/d1327738c277643f09c972a407083ad73c8ecf7b)
[4] Improvements in zstd decode performance
    (https://github.com/facebook/zstd/pull/1756/commits/b83059958246dfcb5b91af9c187fad8c706869a0)
[5] Optimize decompression and fix wildcopy overread
    (https://github.com/facebook/zstd/pull/1804/commits/efd37a64eaff5a0a26ae2566fdb45dc4a0c91673)
[6] Improve ZSTD_highbit32's codegen
    (https://github.com/facebook/zstd/commit/a07da7b0db682c170a330a8c21585be3d68275fa)
[7] Optimize decompression speed for gcc and clang (#1892)
    (https://github.com/facebook/zstd/commit/718f00ff6fe42db7e6ba09a7f7992b3e85283f77)
[8] Fix performance regression on aarch64 with clang
    (https://github.com/facebook/zstd/pull/1973/commits/cb2abc3dbe010113d9e00ca3b612bf61983145a2)

Change-Id: Ia2cf120879a415988dbbc2fce59a994915c8c77c
Signed-off-by: Dongwoo Lee
---
 lib/zstd/bitstream.h      |  25 ++-
 lib/zstd/compress.c       | 201 +++++++++++++------
 lib/zstd/decompress.c     | 490 ++++++++++++++++++++--------------------
 lib/zstd/huf_decompress.c |  24 ++-
 lib/zstd/zstd_internal.h  |  95 ++++++---
 lib/zstd/zstd_opt.h       |   4 +-
 6 files changed, 450 insertions(+), 389 deletions(-)

diff --git a/lib/zstd/bitstream.h b/lib/zstd/bitstream.h index 3a49784..8fbe2ed 100644 --- a/lib/zstd/bitstream.h +++ b/lib/zstd/bitstream.h @@ -145,7 +145,7 @@ ZSTD_STATIC size_t BIT_readBitsFast(BIT_DStream_t *bitD, unsigned nbBits); /*-************************************************************** * Internal functions ****************************************************************/ -ZSTD_STATIC unsigned BIT_highbit32(register U32 val) { return 31 - __builtin_clz(val); } +ZSTD_STATIC unsigned BIT_highbit32(register U32 val) { return __builtin_clz(val) ^ 31; } /*===== Local Constants =====*/ static const unsigned BIT_mask[] = {0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, @@ -333,6 +333,24 @@ ZSTD_STATIC size_t BIT_readBitsFast(BIT_DStream_t *bitD, U32 nbBits) return value; } +/*! BIT_reloadDStreamFast() : + * Similar to BIT_reloadDStream(), but with two differences: + * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! + * 2. Returns BIT_DStream_overflow when bitD->ptr < bitD->limitPtr, at this + * point you must use BIT_reloadDStream() to reload. + */ +ZSTD_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t *bitD) +{ + if (unlikely(bitD->ptr < bitD->start + sizeof(bitD->bitContainer))) + return BIT_DStream_overflow; + + bitD->ptr -= bitD->bitsConsumed >> 3; + bitD->bitsConsumed &= 7; + bitD->bitContainer = ZSTD_readLEST(bitD->ptr); + + return BIT_DStream_unfinished; +} + /*! BIT_reloadDStream() : * Refill `bitD` from buffer previously set in BIT_initDStream() . * This function is safe, it guarantees it will not read beyond src buffer.
@@ -344,10 +362,7 @@ ZSTD_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t *bitD) return BIT_DStream_overflow; if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) { - bitD->ptr -= bitD->bitsConsumed >> 3; - bitD->bitsConsumed &= 7; - bitD->bitContainer = ZSTD_readLEST(bitD->ptr); - return BIT_DStream_unfinished; + return BIT_reloadDStreamFast(bitD); } if (bitD->ptr == bitD->start) { if (bitD->bitsConsumed < sizeof(bitD->bitContainer) * 8) diff --git a/lib/zstd/compress.c b/lib/zstd/compress.c index 5e0b670..4a33566 100644 --- a/lib/zstd/compress.c +++ b/lib/zstd/compress.c @@ -853,15 +853,46 @@ ZSTD_STATIC size_t ZSTD_compressSequences(ZSTD_CCtx *zc, void *dst, size_t dstCa return cSize; } +/*! ZSTD_safecopyLiterals() : + * memcpy() function that won't read beyond more than WILDCOPY_OVERLENGTH bytes past ilimit_w. + * Only called when the sequence ends past ilimit_w, so it only needs to be optimized for single + * large copies. + */ +static void ZSTD_safecopyLiterals(BYTE *op, BYTE const *ip, BYTE const * const iend, BYTE const *ilimit_w) +{ + if (ip <= ilimit_w) { + ZSTD_wildcopy(op, ip, ilimit_w - ip, ZSTD_no_overlap); + + op += ilimit_w - ip; + ip = ilimit_w; + } + while (ip < iend) + *op++ = *ip++; +} + /*! ZSTD_storeSeq() : Store a sequence (literal length, literals, offset code and match length code) into seqStore_t. `offsetCode` : distance to match, or 0 == repCode. `matchCode` : matchLength - MINMATCH */ -ZSTD_STATIC void ZSTD_storeSeq(seqStore_t *seqStorePtr, size_t litLength, const void *literals, U32 offsetCode, size_t matchCode) +ZSTD_STATIC void ZSTD_storeSeq(seqStore_t *seqStorePtr, size_t litLength, const void *literals, const BYTE *litLimit, U32 offsetCode, size_t matchCode) { + BYTE const * const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; + BYTE const * const litEnd = literals + litLength; /* copy Literals */ - ZSTD_wildcopy(seqStorePtr->lit, literals, litLength); + /* We are guaranteed at least 8 bytes of literals space because of HASH_READ_SIZE and + * MINMATCH. + */ + if (litEnd <= litLimit_w) { + /* Common case we can use wildcopy. + * First copy 16 bytes, because literals are likely short. 
+ */ + ZSTD_copy16(seqStorePtr->lit, literals); + if (litLength > 16) + ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); + } else { + ZSTD_safecopyLiterals(seqStorePtr->lit, literals, litEnd, litLimit_w); + } seqStorePtr->lit += litLength; /* literal Length */ @@ -1010,9 +1041,12 @@ void ZSTD_compressBlock_fast_generic(ZSTD_CCtx *cctx, const void *src, size_t sr U32 *const hashTable = cctx->hashTable; U32 const hBits = cctx->params.cParams.hashLog; seqStore_t *seqStorePtr = &(cctx->seqStore); + size_t const stepSize = 2; const BYTE *const base = cctx->base; const BYTE *const istart = (const BYTE *)src; - const BYTE *ip = istart; + /* We check ip0 (ip + 0) and ip1 (ip + 1) each loop */ + const BYTE *ip0 = istart; + const BYTE *ip1; const BYTE *anchor = istart; const U32 lowestIndex = cctx->dictLimit; const BYTE *const lowest = base + lowestIndex; @@ -1022,9 +1056,10 @@ void ZSTD_compressBlock_fast_generic(ZSTD_CCtx *cctx, const void *src, size_t sr U32 offsetSaved = 0; /* init */ - ip += (ip == lowest); + ip0 += (ip0 == lowest); + ip1 = ip0 + 1; { - U32 const maxRep = (U32)(ip - lowest); + U32 const maxRep = (U32)(ip0 - lowest); if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; if (offset_1 > maxRep) @@ -1032,58 +1067,88 @@ void ZSTD_compressBlock_fast_generic(ZSTD_CCtx *cctx, const void *src, size_t sr } /* Main Search Loop */ - while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ + while (ip1 < ilimit) { /* < instead of <=, because check at (ip0+2) */ size_t mLength; - size_t const h = ZSTD_hashPtr(ip, hBits, mls); - U32 const curr = (U32)(ip - base); - U32 const matchIndex = hashTable[h]; - const BYTE *match = base + matchIndex; - hashTable[h] = curr; /* update hash table */ - - if ((offset_1 > 0) & (ZSTD_read32(ip + 1 - offset_1) == ZSTD_read32(ip + 1))) { - mLength = ZSTD_count(ip + 1 + 4, ip + 1 + 4 - offset_1, iend) + 4; - ip++; - ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, 0, mLength - MINMATCH); - } else { - U32 offset; - if ((matchIndex <= lowestIndex) || (ZSTD_read32(match) != ZSTD_read32(ip))) { - ip += ((ip - anchor) >> g_searchStrength) + 1; - continue; - } - mLength = ZSTD_count(ip + 4, match + 4, iend) + 4; - offset = (U32)(ip - match); - while (((ip > anchor) & (match > lowest)) && (ip[-1] == match[-1])) { - ip--; - match--; - mLength++; - } /* catch up */ - offset_2 = offset_1; - offset_1 = offset; + BYTE const *ip2 = ip0 + 2; + size_t const h0 = ZSTD_hashPtr(ip0, hBits, mls); + U32 const val0 = ZSTD_read32(ip0); + size_t const h1 = ZSTD_hashPtr(ip1, hBits, mls); + U32 const val1 = ZSTD_read32(ip1); + U32 const current0 = (U32)(ip0 - base); + U32 const current1 = (U32)(ip1 - base); + U32 const matchIndex0 = hashTable[h0]; + U32 const matchIndex1 = hashTable[h1]; + BYTE const *repMatch = ip2 - offset_1; + const BYTE *match0 = base + matchIndex0; + const BYTE *match1 = base + matchIndex1; + U32 offcode; + + hashTable[h0] = current0; + hashTable[h1] = current1; + + if ((offset_1 > 0) & (ZSTD_read32(repMatch) == ZSTD_read32(ip2))) { + mLength = ip2[-1] == repMatch[-1] ? 
1 : 0; + ip0 = ip2 - mLength; + match0 = repMatch - mLength; + offcode = 0; + goto _match; + } + if ((matchIndex0 > lowestIndex) && ZSTD_read32(match0) == val0) { + goto _offset; + } + if ((matchIndex1 > lowestIndex) && ZSTD_read32(match1) == val1) { + ip0 = ip1; + match0 = match1; + goto _offset; + } + { + size_t const step = ((ip0 - anchor) >> (g_searchStrength - 1)) + stepSize; - ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH); + ip0 += step; + ip1 += step; + continue; } +_offset: /* Requires: ip0, match0 */ + /* Compute the offset code */ + offset_2 = offset_1; + offset_1 = (U32)(ip0 - match0); + offcode = offset_1 + ZSTD_REP_MOVE; + mLength = 0; + /* Count the backwards match length */ + while (((ip0 > anchor) & (match0 > lowest)) && (ip0[-1] == match0[-1])) { + ip0--; + match0--; + mLength++; + } /* catch up */ + +_match: /* Requires: ip0, match0, offcode */ + /* Count the forward length */ + mLength += ZSTD_count(ip0 + mLength + 4, match0 + mLength + 4, iend) + 4; + ZSTD_storeSeq(seqStorePtr, ip0-anchor, anchor, iend, offcode, mLength - MINMATCH); /* match found */ - ip += mLength; - anchor = ip; + ip0 += mLength; + anchor = ip0; + ip1 = ip0 + 1; - if (ip <= ilimit) { + if (ip0 <= ilimit) { /* Fill Table */ - hashTable[ZSTD_hashPtr(base + curr + 2, hBits, mls)] = curr + 2; /* here because curr+2 could be > iend-8 */ - hashTable[ZSTD_hashPtr(ip - 2, hBits, mls)] = (U32)(ip - 2 - base); + hashTable[ZSTD_hashPtr(base + current0 + 2, hBits, mls)] = current0 + 2; /* here because curr+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0 - 2, hBits, mls)] = (U32)(ip0 - 2 - base); /* check immediate repcode */ - while ((ip <= ilimit) && ((offset_2 > 0) & (ZSTD_read32(ip) == ZSTD_read32(ip - offset_2)))) { + while ((ip0 <= ilimit) && ((offset_2 > 0) & (ZSTD_read32(ip0) == ZSTD_read32(ip0 - offset_2)))) { /* store sequence */ - size_t const rLength = ZSTD_count(ip + 4, ip + 4 - offset_2, iend) + 4; + size_t const rLength = ZSTD_count(ip0 + 4, ip0 + 4 - offset_2, iend) + 4; { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */ - hashTable[ZSTD_hashPtr(ip, hBits, mls)] = (U32)(ip - base); - ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, rLength - MINMATCH); - ip += rLength; - anchor = ip; + hashTable[ZSTD_hashPtr(ip0, hBits, mls)] = (U32)(ip0 - base); + ip0 += rLength; + ip1 = ip0 + 1; + ZSTD_storeSeq(seqStorePtr, 0, anchor, iend, 0, rLength - MINMATCH); + anchor = ip0; continue; /* faster when present ... (?) */ } } @@ -1150,7 +1215,7 @@ static void ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx *ctx, const void * const BYTE *repMatchEnd = repIndex < dictLimit ? 
dictEnd : iend; mLength = ZSTD_count_2segments(ip + 1 + EQUAL_READ32, repMatch + EQUAL_READ32, iend, repMatchEnd, lowPrefixPtr) + EQUAL_READ32; ip++; - ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, 0, mLength - MINMATCH); + ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, iend, 0, mLength - MINMATCH); } else { if ((matchIndex < lowestIndex) || (ZSTD_read32(match) != ZSTD_read32(ip))) { ip += ((ip - anchor) >> g_searchStrength) + 1; @@ -1169,7 +1234,7 @@ static void ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx *ctx, const void * offset = curr - matchIndex; offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH); + ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, iend, offset + ZSTD_REP_MOVE, mLength - MINMATCH); } } @@ -1194,7 +1259,7 @@ static void ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx *ctx, const void * U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, repLength2 - MINMATCH); + ZSTD_storeSeq(seqStorePtr, 0, anchor, iend, 0, repLength2 - MINMATCH); hashTable[ZSTD_hashPtr(ip, hBits, mls)] = curr2; ip += repLength2; anchor = ip; @@ -1294,7 +1359,7 @@ void ZSTD_compressBlock_doubleFast_generic(ZSTD_CCtx *cctx, const void *src, siz if ((offset_1 > 0) & (ZSTD_read32(ip + 1 - offset_1) == ZSTD_read32(ip + 1))) { /* note : by construction, offset_1 <= curr */ mLength = ZSTD_count(ip + 1 + 4, ip + 1 + 4 - offset_1, iend) + 4; ip++; - ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, 0, mLength - MINMATCH); + ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, iend, 0, mLength - MINMATCH); } else { U32 offset; if ((matchIndexL > lowestIndex) && (ZSTD_read64(matchLong) == ZSTD_read64(ip))) { @@ -1336,7 +1401,7 @@ void ZSTD_compressBlock_doubleFast_generic(ZSTD_CCtx *cctx, const void *src, siz offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH); + ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, iend, offset + ZSTD_REP_MOVE, mLength - MINMATCH); } /* match found */ @@ -1345,10 +1410,14 @@ void ZSTD_compressBlock_doubleFast_generic(ZSTD_CCtx *cctx, const void *src, siz if (ip <= ilimit) { /* Fill Table */ - hashLong[ZSTD_hashPtr(base + curr + 2, hBitsL, 8)] = hashSmall[ZSTD_hashPtr(base + curr + 2, hBitsS, mls)] = - curr + 2; /* here because curr+2 could be > iend-8 */ - hashLong[ZSTD_hashPtr(ip - 2, hBitsL, 8)] = hashSmall[ZSTD_hashPtr(ip - 2, hBitsS, mls)] = (U32)(ip - 2 - base); + { + U32 const insert_idx = curr + 2; + hashLong[ZSTD_hashPtr(base + insert_idx, hBitsL, 8)] = insert_idx; + hashLong[ZSTD_hashPtr(ip - 2, hBitsL, 8)] = (U32)(ip - 2 - base); + hashSmall[ZSTD_hashPtr(base + insert_idx, hBitsS, mls)] = insert_idx; + hashSmall[ZSTD_hashPtr(ip - 1, hBitsS, mls)] = (U32)(ip - 1 - base); + } /* check immediate repcode */ while ((ip <= ilimit) && ((offset_2 > 0) & (ZSTD_read32(ip) == ZSTD_read32(ip - offset_2)))) { /* store sequence */ @@ -1360,7 +1429,7 @@ void ZSTD_compressBlock_doubleFast_generic(ZSTD_CCtx *cctx, const void *src, siz } /* swap offset_2 <=> offset_1 */ hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip - base); hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip - base); - ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, rLength - MINMATCH); + ZSTD_storeSeq(seqStorePtr, 0, anchor, iend, 0, rLength - MINMATCH); ip += rLength; anchor = ip; continue; /* faster when present ... (?) 
*/ @@ -1437,7 +1506,7 @@ static void ZSTD_compressBlock_doubleFast_extDict_generic(ZSTD_CCtx *ctx, const const BYTE *repMatchEnd = repIndex < dictLimit ? dictEnd : iend; mLength = ZSTD_count_2segments(ip + 1 + 4, repMatch + 4, iend, repMatchEnd, lowPrefixPtr) + 4; ip++; - ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, 0, mLength - MINMATCH); + ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, iend, 0, mLength - MINMATCH); } else { if ((matchLongIndex > lowestIndex) && (ZSTD_read64(matchLong) == ZSTD_read64(ip))) { const BYTE *matchEnd = matchLongIndex < dictLimit ? dictEnd : iend; @@ -1452,7 +1521,7 @@ static void ZSTD_compressBlock_doubleFast_extDict_generic(ZSTD_CCtx *ctx, const } /* catch up */ offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH); + ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, iend, offset + ZSTD_REP_MOVE, mLength - MINMATCH); } else if ((matchIndex > lowestIndex) && (ZSTD_read32(match) == ZSTD_read32(ip))) { size_t const h3 = ZSTD_hashPtr(ip + 1, hBitsL, 8); @@ -1485,7 +1554,7 @@ static void ZSTD_compressBlock_doubleFast_extDict_generic(ZSTD_CCtx *ctx, const } offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH); + ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, iend, offset + ZSTD_REP_MOVE, mLength - MINMATCH); } else { ip += ((ip - anchor) >> g_searchStrength) + 1; @@ -1499,10 +1568,14 @@ static void ZSTD_compressBlock_doubleFast_extDict_generic(ZSTD_CCtx *ctx, const if (ip <= ilimit) { /* Fill Table */ - hashSmall[ZSTD_hashPtr(base + curr + 2, hBitsS, mls)] = curr + 2; - hashLong[ZSTD_hashPtr(base + curr + 2, hBitsL, 8)] = curr + 2; - hashSmall[ZSTD_hashPtr(ip - 2, hBitsS, mls)] = (U32)(ip - 2 - base); - hashLong[ZSTD_hashPtr(ip - 2, hBitsL, 8)] = (U32)(ip - 2 - base); + { + U32 const insert_idx = curr + 2; + + hashLong[ZSTD_hashPtr(base + insert_idx, hBitsL, 8)] = insert_idx; + hashLong[ZSTD_hashPtr(ip - 2, hBitsL, 8)] = (U32)(ip - 2 - base); + hashSmall[ZSTD_hashPtr(base + insert_idx, hBitsS, mls)] = insert_idx; + hashSmall[ZSTD_hashPtr(ip - 1, hBitsS, mls)] = (U32)(ip - 1 - base); + } /* check immediate repcode */ while (ip <= ilimit) { U32 const curr2 = (U32)(ip - base); @@ -1516,7 +1589,7 @@ static void ZSTD_compressBlock_doubleFast_extDict_generic(ZSTD_CCtx *ctx, const U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, repLength2 - MINMATCH); + ZSTD_storeSeq(seqStorePtr, 0, anchor, iend, 0, repLength2 - MINMATCH); hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = curr2; hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = curr2; ip += repLength2; @@ -2016,7 +2089,7 @@ void ZSTD_compressBlock_lazy_generic(ZSTD_CCtx *ctx, const void *src, size_t src _storeSequence: { size_t const litLength = start - anchor; - ZSTD_storeSeq(seqStorePtr, litLength, anchor, (U32)offset, matchLength - MINMATCH); + ZSTD_storeSeq(seqStorePtr, litLength, anchor, iend, (U32)offset, matchLength - MINMATCH); anchor = ip = start + matchLength; } @@ -2027,7 +2100,7 @@ _storeSequence: offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */ - ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, matchLength - MINMATCH); + ZSTD_storeSeq(seqStorePtr, 0, anchor, iend, 0, matchLength - MINMATCH); ip += matchLength; anchor = ip; continue; /* faster when present ... (?) 
*/ @@ -2210,7 +2283,7 @@ void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx *ctx, const void *src, si /* store sequence */ _storeSequence : { size_t const litLength = start - anchor; - ZSTD_storeSeq(seqStorePtr, litLength, anchor, (U32)offset, matchLength - MINMATCH); + ZSTD_storeSeq(seqStorePtr, litLength, anchor, iend, (U32)offset, matchLength - MINMATCH); anchor = ip = start + matchLength; } @@ -2228,7 +2301,7 @@ void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx *ctx, const void *src, si offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */ - ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, matchLength - MINMATCH); + ZSTD_storeSeq(seqStorePtr, 0, anchor, iend, 0, matchLength - MINMATCH); ip += matchLength; anchor = ip; continue; /* faster when present ... (?) */ diff --git a/lib/zstd/decompress.c b/lib/zstd/decompress.c index db6761e..dd9ecf3 100644 --- a/lib/zstd/decompress.c +++ b/lib/zstd/decompress.c @@ -876,40 +876,113 @@ typedef struct { uPtrDiff gotoDict; } seqState_t; +/*! ZSTD_overlapCopy8() : + * Copies 8 bytes from ip to op and updates op and ip where ip <= op. + * If the offset is < 8 then the offset is spread to at least 8 bytes. + * + * Precondition: *ip <= *op + * Postcondition: *op - *op >= 8 + */ +static inline __attribute__((always_inline)) void ZSTD_overlapCopy8(BYTE **op, BYTE const **ip, size_t offset) +{ + if (offset < 8) { + /* close range match, overlap */ + static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */ + static const int dec64table[] = { 8, 8, 8, 7, 8, 9, 10, 11 }; /* subtracted */ + int const sub2 = dec64table[offset]; + (*op)[0] = (*ip)[0]; + (*op)[1] = (*ip)[1]; + (*op)[2] = (*ip)[2]; + (*op)[3] = (*ip)[3]; + *ip += dec32table[offset]; + ZSTD_copy4(*op+4, *ip); + *ip -= sub2; + } else { + ZSTD_copy8(*op, *ip); + } + *ip += 8; + *op += 8; +} + +/*! ZSTD_safecopy() : + * Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer + * and write up to 16 bytes past oend_w (op >= oend_w is allowed). + * This function is only called in the uncommon case where the sequence is near the end of the block. It + * should be fast for a single long sequence, but can be slow for several short sequences. + * + * @param ovtype controls the overlap detection + * - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart. + * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart. + * The src buffer must be before the dst buffer. + */ +static void ZSTD_safecopy(BYTE *op, BYTE *const oend_w, BYTE const *ip, ptrdiff_t length, enum ZSTD_overlap_e ovtype) +{ + ptrdiff_t const diff = op - ip; + BYTE *const oend = op + length; + + if (length < 8) { + /* Handle short lengths. */ + while (op < oend) + *op++ = *ip++; + return; + } + if (ovtype == ZSTD_overlap_src_before_dst) { + /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */ + ZSTD_overlapCopy8(&op, &ip, diff); + } + + if (oend <= oend_w) { + /* No risk of overwrite. */ + ZSTD_wildcopy(op, ip, length, ovtype); + return; + } + if (op <= oend_w) { + /* Wildcopy until we get close to the end. */ + ZSTD_wildcopy(op, ip, oend_w - op, ovtype); + ip += oend_w - op; + op = oend_w; + } + /* Handle the leftovers. */ + while (op < oend) + *op++ = *ip++; +} + +/* ZSTD_execSequenceEnd(): + * This version handles cases that are near the end of the output buffer. It requires + * more careful checks to make sure there is no overflow. 
By separating out these hard + * and unlikely cases, we can speed up the common cases. + * + * NOTE: This function needs to be fast for a single long sequence, but doesn't need + * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). + */ FORCE_NOINLINE -size_t ZSTD_execSequenceLast7(BYTE *op, BYTE *const oend, seq_t sequence, const BYTE **litPtr, const BYTE *const litLimit, const BYTE *const base, - const BYTE *const vBase, const BYTE *const dictEnd) +size_t ZSTD_execSequenceEnd(BYTE *op, BYTE *const oend, seq_t sequence, const BYTE **litPtr, const BYTE *const litLimit, const BYTE *const prefixStart, + const BYTE *const virtualStart, const BYTE *const dictEnd) { BYTE *const oLitEnd = op + sequence.litLength; size_t const sequenceLength = sequence.litLength + sequence.matchLength; BYTE *const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ - BYTE *const oend_w = oend - WILDCOPY_OVERLENGTH; const BYTE *const iLitEnd = *litPtr + sequence.litLength; const BYTE *match = oLitEnd - sequence.offset; + BYTE *const oend_w = oend - WILDCOPY_OVERLENGTH; /* check */ if (oMatchEnd > oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */ if (iLitEnd > litLimit) return ERROR(corruption_detected); /* over-read beyond lit buffer */ - if (oLitEnd <= oend_w) - return ERROR(GENERIC); /* Precondition */ /* copy literals */ - if (op < oend_w) { - ZSTD_wildcopy(op, *litPtr, oend_w - op); - *litPtr += oend_w - op; - op = oend_w; - } - while (op < oLitEnd) - *op++ = *(*litPtr)++; + ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap); + op = oLitEnd; + *litPtr = iLitEnd; /* copy Match */ - if (sequence.offset > (size_t)(oLitEnd - base)) { + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { /* offset beyond prefix */ - if (sequence.offset > (size_t)(oLitEnd - vBase)) + if (sequence.offset > (size_t)(oLitEnd - virtualStart)) return ERROR(corruption_detected); - match = dictEnd - (base - match); + match = dictEnd - (prefixStart - match); if (match + sequence.matchLength <= dictEnd) { memmove(oLitEnd, match, sequence.matchLength); return sequenceLength; @@ -920,15 +993,16 @@ size_t ZSTD_execSequenceLast7(BYTE *op, BYTE *const oend, seq_t sequence, const memmove(oLitEnd, match, length1); op = oLitEnd + length1; sequence.matchLength -= length1; - match = base; + match = prefixStart; } } - while (op < oMatchEnd) - *op++ = *match++; + ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); return sequenceLength; } -static seq_t ZSTD_decodeSequence(seqState_t *seqState) +enum ZSTD_prefetch_e { ZSTD_p_noPrefetch = 0, ZSTD_p_prefetch = 1 }; + +static seq_t ZSTD_decodeSequence(seqState_t *seqState, int const longOffsets, const enum ZSTD_prefetch_e prefetch) { seq_t seq; @@ -955,30 +1029,47 @@ static seq_t ZSTD_decodeSequence(seqState_t *seqState) /* sequence */ { size_t offset; - if (!ofCode) - offset = 0; - else { - offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ - if (ZSTD_32bits()) - BIT_reloadDStream(&seqState->DStream); - } - if (ofCode <= 1) { - offset += (llCode == 0); - if (offset) { - size_t temp = (offset == 3) ? 
seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; - temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ - if (offset != 1) - seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset = temp; + if (ofCode > 1) { + if (longOffsets) { + int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN); + offset = OF_base[ofCode] + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + if (ZSTD_32bits() || extraBits) + BIT_reloadDStream(&seqState->DStream); + if (extraBits) + offset += BIT_readBitsFast(&seqState->DStream, extraBits); } else { - offset = seqState->prevOffset[0]; + offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (ZSTD_32bits()) + BIT_reloadDStream(&seqState->DStream); } - } else { + seqState->prevOffset[2] = seqState->prevOffset[1]; seqState->prevOffset[1] = seqState->prevOffset[0]; seqState->prevOffset[0] = offset; + + } else { + U32 const ll0 = (llCode == 0); + + if (likely((ofCode == 0))) { + if (likely(!ll0)) + offset = seqState->prevOffset[0]; + else { + offset = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; + } + } else { + offset = OF_base[ofCode] + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { + size_t temp = (offset == 3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; + temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ + if (offset != 1) + seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; + } + } } seq.offset = offset; } @@ -991,15 +1082,30 @@ static seq_t ZSTD_decodeSequence(seqState_t *seqState) if (ZSTD_32bits() || (totalBits > 64 - 7 - (LLFSELog + MLFSELog + OffFSELog))) BIT_reloadDStream(&seqState->DStream); - /* ANS state update */ + seq.match = NULL; + + if (prefetch == ZSTD_p_prefetch) { + size_t const pos = seqState->pos + seq.litLength; + const BYTE *const matchBase = (seq.offset > pos) ? seqState->gotoDict + seqState->base : seqState->base; + seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. + * No consequence though : no memory access will occur, offset is only used for prefetching */ + seqState->pos = pos + seq.matchLength; + } + + /* ANS state update + * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo(). + * clang-9.2.0 does 7% worse with ZSTD_updateFseState(). + * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the + * better option, so it is the default for other compilers. But, if you + * measure that it is worse, please put up a pull request. 
+ */ + FSE_updateState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ FSE_updateState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ if (ZSTD_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ FSE_updateState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ - seq.match = NULL; - return seq; } @@ -1014,26 +1120,24 @@ size_t ZSTD_execSequence(BYTE *op, BYTE *const oend, seq_t sequence, const BYTE const BYTE *const iLitEnd = *litPtr + sequence.litLength; const BYTE *match = oLitEnd - sequence.offset; - /* check */ - if (oMatchEnd > oend) - return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */ - if (iLitEnd > litLimit) - return ERROR(corruption_detected); /* over-read beyond lit buffer */ - if (oLitEnd > oend_w) - return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd); - - /* copy Literals */ - ZSTD_copy8(op, *litPtr); - if (sequence.litLength > 8) - ZSTD_wildcopy(op + 8, (*litPtr) + 8, - sequence.litLength - 8); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */ + /* Errors and uncommon cases handled here. */ + if (unlikely(iLitEnd > litLimit || oMatchEnd > oend_w)) + return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd); + + /* Copy Literals: + * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. + * We likely don't need the full 32-byte wildcopy. + */ + ZSTD_copy16(op, *litPtr); + if (unlikely(sequence.litLength > 16)) + ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap); op = oLitEnd; - *litPtr = iLitEnd; /* update for next sequence */ + *litPtr = iLitEnd; /* update for next sequence */ /* copy Match */ if (sequence.offset > (size_t)(oLitEnd - base)) { /* offset beyond prefix */ - if (sequence.offset > (size_t)(oLitEnd - vBase)) + if (unlikely(sequence.offset > (size_t)(oLitEnd - vBase))) return ERROR(corruption_detected); match = dictEnd + (match - base); if (match + sequence.matchLength <= dictEnd) { @@ -1047,45 +1151,27 @@ size_t ZSTD_execSequence(BYTE *op, BYTE *const oend, seq_t sequence, const BYTE op = oLitEnd + length1; sequence.matchLength -= length1; match = base; - if (op > oend_w || sequence.matchLength < MINMATCH) { - U32 i; - for (i = 0; i < sequence.matchLength; ++i) - op[i] = match[i]; - return sequenceLength; - } } } - /* Requirement: op <= oend_w && sequence.matchLength >= MINMATCH */ - /* match within prefix */ - if (sequence.offset < 8) { - /* close range match, overlap */ - static const U32 dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4}; /* added */ - static const int dec64table[] = {8, 8, 8, 7, 8, 9, 10, 11}; /* subtracted */ - int const sub2 = dec64table[sequence.offset]; - op[0] = match[0]; - op[1] = match[1]; - op[2] = match[2]; - op[3] = match[3]; - match += dec32table[sequence.offset]; - ZSTD_copy4(op + 4, match); - match -= sub2; - } else { - ZSTD_copy8(op, match); + /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy + * without overlap checking. + */ + if (likely(sequence.offset >= WILDCOPY_VECLEN)) { + /* We bet on a full wildcopy for matches, since we expect matches to be + * longer than literals (in general). In silesia, ~10% of matches are longer + * than 16 bytes. 
+ */ + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); + return sequenceLength; } - op += 8; - match += 8; - if (oMatchEnd > oend - (16 - MINMATCH)) { - if (op < oend_w) { - ZSTD_wildcopy(op, match, oend_w - op); - match += oend_w - op; - op = oend_w; - } - while (op < oMatchEnd) - *op++ = *match++; - } else { - ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8); /* works even if matchLength < 8 */ + /* Copy 8 bytes and spread the offset to be >= 8. */ + ZSTD_overlapCopy8(&op, &match, sequence.offset); + + /* If the match length is > 8 bytes, then continue with the wildcopy. */ + if (sequence.matchLength > 8) { + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst); } return sequenceLength; } @@ -1094,7 +1180,7 @@ static size_t ZSTD_decompressSequences(ZSTD_DCtx *dctx, void *dst, size_t maxDst { const BYTE *ip = (const BYTE *)seqStart; const BYTE *const iend = ip + seqSize; - BYTE *const ostart = (BYTE * const)dst; + BYTE *const ostart = (BYTE *const)dst; BYTE *const oend = ostart + maxDstSize; BYTE *op = ostart; const BYTE *litPtr = dctx->litPtr; @@ -1115,6 +1201,7 @@ static size_t ZSTD_decompressSequences(ZSTD_DCtx *dctx, void *dst, size_t maxDst /* Regen sequences */ if (nbSeq) { seqState_t seqState; + size_t error = 0; dctx->fseEntropy = 1; { U32 i; @@ -1126,18 +1213,29 @@ static size_t ZSTD_decompressSequences(ZSTD_DCtx *dctx, void *dst, size_t maxDst FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); - for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq;) { - nbSeq--; - { - seq_t const sequence = ZSTD_decodeSequence(&seqState); - size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd); - if (ZSTD_isError(oneSeqSize)) - return oneSeqSize; + for ( ; ; ) { + seq_t const sequence = ZSTD_decodeSequence(&seqState, 0, ZSTD_p_noPrefetch); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd); + + BIT_reloadDStream(&(seqState.DStream)); + /* gcc and clang both don't like early returns in this loop. + * gcc doesn't like early breaks either. + * Instead save an error and report it at the end. + * When there is an error, don't increment op, so we don't + * overwrite. 
+ */ + if (unlikely(ZSTD_isError(oneSeqSize))) + error = oneSeqSize; + else op += oneSeqSize; - } + + if (unlikely(!--nbSeq)) + break; } /* check if reached exact end */ + if (ZSTD_isError(error)) + return error; if (nbSeq) return ERROR(corruption_detected); /* save reps for next block */ @@ -1160,196 +1258,20 @@ static size_t ZSTD_decompressSequences(ZSTD_DCtx *dctx, void *dst, size_t maxDst return op - ostart; } -FORCE_INLINE seq_t ZSTD_decodeSequenceLong_generic(seqState_t *seqState, int const longOffsets) -{ - seq_t seq; - - U32 const llCode = FSE_peekSymbol(&seqState->stateLL); - U32 const mlCode = FSE_peekSymbol(&seqState->stateML); - U32 const ofCode = FSE_peekSymbol(&seqState->stateOffb); /* <= maxOff, by table construction */ - - U32 const llBits = LL_bits[llCode]; - U32 const mlBits = ML_bits[mlCode]; - U32 const ofBits = ofCode; - U32 const totalBits = llBits + mlBits + ofBits; - - static const U32 LL_base[MaxLL + 1] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, - 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000, 0x10000}; - - static const U32 ML_base[MaxML + 1] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, - 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 39, 41, - 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, 0x1003, 0x2003, 0x4003, 0x8003, 0x10003}; - - static const U32 OF_base[MaxOff + 1] = {0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, 0xFD, 0x1FD, - 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, - 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD}; - - /* sequence */ - { - size_t offset; - if (!ofCode) - offset = 0; - else { - if (longOffsets) { - int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN); - offset = OF_base[ofCode] + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); - if (ZSTD_32bits() || extraBits) - BIT_reloadDStream(&seqState->DStream); - if (extraBits) - offset += BIT_readBitsFast(&seqState->DStream, extraBits); - } else { - offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ - if (ZSTD_32bits()) - BIT_reloadDStream(&seqState->DStream); - } - } - - if (ofCode <= 1) { - offset += (llCode == 0); - if (offset) { - size_t temp = (offset == 3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; - temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ - if (offset != 1) - seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset = temp; - } else { - offset = seqState->prevOffset[0]; - } - } else { - seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset; - } - seq.offset = offset; - } - - seq.matchLength = ML_base[mlCode] + ((mlCode > 31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */ - if (ZSTD_32bits() && (mlBits + llBits > 24)) - BIT_reloadDStream(&seqState->DStream); - - seq.litLength = LL_base[llCode] + ((llCode > 15) ? 
BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */ - if (ZSTD_32bits() || (totalBits > 64 - 7 - (LLFSELog + MLFSELog + OffFSELog))) - BIT_reloadDStream(&seqState->DStream); - - { - size_t const pos = seqState->pos + seq.litLength; - seq.match = seqState->base + pos - seq.offset; /* single memory segment */ - if (seq.offset > pos) - seq.match += seqState->gotoDict; /* separate memory segment */ - seqState->pos = pos + seq.matchLength; - } - - /* ANS state update */ - FSE_updateState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ - FSE_updateState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ - if (ZSTD_32bits()) - BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ - FSE_updateState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ - - return seq; -} - -static seq_t ZSTD_decodeSequenceLong(seqState_t *seqState, unsigned const windowSize) +static seq_t ZSTD_decodeSequenceLong(seqState_t *seqState, unsigned const windowSize, enum ZSTD_prefetch_e const prefetch) { if (ZSTD_highbit32(windowSize) > STREAM_ACCUMULATOR_MIN) { - return ZSTD_decodeSequenceLong_generic(seqState, 1); + return ZSTD_decodeSequence(seqState, 1, prefetch); } else { - return ZSTD_decodeSequenceLong_generic(seqState, 0); + return ZSTD_decodeSequence(seqState, 0, prefetch); } } -FORCE_INLINE -size_t ZSTD_execSequenceLong(BYTE *op, BYTE *const oend, seq_t sequence, const BYTE **litPtr, const BYTE *const litLimit, const BYTE *const base, - const BYTE *const vBase, const BYTE *const dictEnd) -{ - BYTE *const oLitEnd = op + sequence.litLength; - size_t const sequenceLength = sequence.litLength + sequence.matchLength; - BYTE *const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ - BYTE *const oend_w = oend - WILDCOPY_OVERLENGTH; - const BYTE *const iLitEnd = *litPtr + sequence.litLength; - const BYTE *match = sequence.match; - - /* check */ - if (oMatchEnd > oend) - return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */ - if (iLitEnd > litLimit) - return ERROR(corruption_detected); /* over-read beyond lit buffer */ - if (oLitEnd > oend_w) - return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd); - - /* copy Literals */ - ZSTD_copy8(op, *litPtr); - if (sequence.litLength > 8) - ZSTD_wildcopy(op + 8, (*litPtr) + 8, - sequence.litLength - 8); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */ - op = oLitEnd; - *litPtr = iLitEnd; /* update for next sequence */ - - /* copy Match */ - if (sequence.offset > (size_t)(oLitEnd - base)) { - /* offset beyond prefix */ - if (sequence.offset > (size_t)(oLitEnd - vBase)) - return ERROR(corruption_detected); - if (match + sequence.matchLength <= dictEnd) { - memmove(oLitEnd, match, sequence.matchLength); - return sequenceLength; - } - /* span extDict & currPrefixSegment */ - { - size_t const length1 = dictEnd - match; - memmove(oLitEnd, match, length1); - op = oLitEnd + length1; - sequence.matchLength -= length1; - match = base; - if (op > oend_w || sequence.matchLength < MINMATCH) { - U32 i; - for (i = 0; i < sequence.matchLength; ++i) - op[i] = match[i]; - return sequenceLength; - } - } - } - /* Requirement: op <= oend_w && sequence.matchLength >= MINMATCH */ - - /* match within prefix */ - if (sequence.offset < 8) { - /* close range match, overlap */ - static const U32 dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4}; /* added */ - static const int dec64table[] = {8, 8, 8, 7, 8, 9, 10, 11}; 
/* subtracted */ - int const sub2 = dec64table[sequence.offset]; - op[0] = match[0]; - op[1] = match[1]; - op[2] = match[2]; - op[3] = match[3]; - match += dec32table[sequence.offset]; - ZSTD_copy4(op + 4, match); - match -= sub2; - } else { - ZSTD_copy8(op, match); - } - op += 8; - match += 8; - - if (oMatchEnd > oend - (16 - MINMATCH)) { - if (op < oend_w) { - ZSTD_wildcopy(op, match, oend_w - op); - match += oend_w - op; - op = oend_w; - } - while (op < oMatchEnd) - *op++ = *match++; - } else { - ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8); /* works even if matchLength < 8 */ - } - return sequenceLength; -} - static size_t ZSTD_decompressSequencesLong(ZSTD_DCtx *dctx, void *dst, size_t maxDstSize, const void *seqStart, size_t seqSize) { const BYTE *ip = (const BYTE *)seqStart; const BYTE *const iend = ip + seqSize; - BYTE *const ostart = (BYTE * const)dst; + BYTE *const ostart = (BYTE *const)dst; BYTE *const oend = ostart + maxDstSize; BYTE *op = ostart; const BYTE *litPtr = dctx->litPtr; @@ -1394,16 +1316,18 @@ static size_t ZSTD_decompressSequencesLong(ZSTD_DCtx *dctx, void *dst, size_t ma /* prepare in advance */ for (seqNb = 0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && seqNb < seqAdvance; seqNb++) { - sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, windowSize); + sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, windowSize, ZSTD_p_prefetch); + ZSTD_PREFETCH(sequences[seqNb].match); + ZSTD_PREFETCH(sequences[seqNb].match + sequences[seqNb].matchLength - 1); } if (seqNb < seqAdvance) return ERROR(corruption_detected); /* decode and decompress */ for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && seqNb < nbSeq; seqNb++) { - seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, windowSize); + seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, windowSize, ZSTD_p_prefetch); size_t const oneSeqSize = - ZSTD_execSequenceLong(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STOSEQ_MASK], &litPtr, litEnd, base, vBase, dictEnd); + ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STOSEQ_MASK], &litPtr, litEnd, base, vBase, dictEnd); if (ZSTD_isError(oneSeqSize)) return oneSeqSize; ZSTD_PREFETCH(sequence.match); @@ -1416,7 +1340,7 @@ static size_t ZSTD_decompressSequencesLong(ZSTD_DCtx *dctx, void *dst, size_t ma /* finish queue */ seqNb -= seqAdvance; for (; seqNb < nbSeq; seqNb++) { - size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[seqNb & STOSEQ_MASK], &litPtr, litEnd, base, vBase, dictEnd); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb & STOSEQ_MASK], &litPtr, litEnd, base, vBase, dictEnd); if (ZSTD_isError(oneSeqSize)) return oneSeqSize; op += oneSeqSize; @@ -1566,7 +1490,7 @@ size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) static size_t ZSTD_decompressFrame(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void **srcPtr, size_t *srcSizePtr) { const BYTE *ip = (const BYTE *)(*srcPtr); - BYTE *const ostart = (BYTE * const)dst; + BYTE *const ostart = (BYTE *const)dst; BYTE *const oend = ostart + dstCapacity; BYTE *op = ostart; size_t remainingSize = *srcSizePtr; diff --git a/lib/zstd/huf_decompress.c b/lib/zstd/huf_decompress.c index 6526482..6753153 100644 --- a/lib/zstd/huf_decompress.c +++ b/lib/zstd/huf_decompress.c @@ -254,6 +254,7 @@ static size_t HUF_decompress4X2_usingDTable_internal(void *dst, size_t dstSize, const BYTE *const istart = (const BYTE *)cSrc; BYTE *const ostart = (BYTE *)dst; BYTE *const oend = ostart 
+ dstSize; + BYTE *const olimit = oend - 3; const void *const dtPtr = DTable + 1; const HUF_DEltX2 *const dt = (const HUF_DEltX2 *)dtPtr; @@ -278,7 +279,7 @@ static size_t HUF_decompress4X2_usingDTable_internal(void *dst, size_t dstSize, BYTE *op2 = opStart2; BYTE *op3 = opStart3; BYTE *op4 = opStart4; - U32 endSignal; + U32 endSignal = 1; DTableDesc const dtd = HUF_getDTableDesc(DTable); U32 const dtLog = dtd.tableLog; @@ -306,8 +307,7 @@ static size_t HUF_decompress4X2_usingDTable_internal(void *dst, size_t dstSize, } /* 16-32 symbols per loop (4-8 symbols per stream) */ - endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); - for (; (endSignal == BIT_DStream_unfinished) && (op4 < (oend - 7));) { + for ( ; (endSignal) & (op4 < olimit); ) { HUF_DECODE_SYMBOLX2_2(op1, &bitD1); HUF_DECODE_SYMBOLX2_2(op2, &bitD2); HUF_DECODE_SYMBOLX2_2(op3, &bitD3); @@ -324,7 +324,10 @@ static size_t HUF_decompress4X2_usingDTable_internal(void *dst, size_t dstSize, HUF_DECODE_SYMBOLX2_0(op2, &bitD2); HUF_DECODE_SYMBOLX2_0(op3, &bitD3); HUF_DECODE_SYMBOLX2_0(op4, &bitD4); - endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; } /* check corruption */ @@ -713,6 +716,7 @@ static size_t HUF_decompress4X4_usingDTable_internal(void *dst, size_t dstSize, const BYTE *const istart = (const BYTE *)cSrc; BYTE *const ostart = (BYTE *)dst; BYTE *const oend = ostart + dstSize; + BYTE *const olimit = oend - (sizeof(size_t) - 1); const void *const dtPtr = DTable + 1; const HUF_DEltX4 *const dt = (const HUF_DEltX4 *)dtPtr; @@ -737,7 +741,7 @@ static size_t HUF_decompress4X4_usingDTable_internal(void *dst, size_t dstSize, BYTE *op2 = opStart2; BYTE *op3 = opStart3; BYTE *op4 = opStart4; - U32 endSignal; + U32 endSignal = 1; DTableDesc const dtd = HUF_getDTableDesc(DTable); U32 const dtLog = dtd.tableLog; @@ -765,8 +769,7 @@ static size_t HUF_decompress4X4_usingDTable_internal(void *dst, size_t dstSize, } /* 16-32 symbols per loop (4-8 symbols per stream) */ - endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); - for (; (endSignal == BIT_DStream_unfinished) & (op4 < (oend - (sizeof(bitD4.bitContainer) - 1)));) { + for ( ; (endSignal) & (op4 < olimit); ) { HUF_DECODE_SYMBOLX4_2(op1, &bitD1); HUF_DECODE_SYMBOLX4_2(op2, &bitD2); HUF_DECODE_SYMBOLX4_2(op3, &bitD3); @@ -783,8 +786,11 @@ static size_t HUF_decompress4X4_usingDTable_internal(void *dst, size_t dstSize, HUF_DECODE_SYMBOLX4_0(op2, &bitD2); HUF_DECODE_SYMBOLX4_0(op3, &bitD3); HUF_DECODE_SYMBOLX4_0(op4, &bitD4); - - endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + endSignal = likely( + (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished)); } /* check corruption */ diff --git a/lib/zstd/zstd_internal.h b/lib/zstd/zstd_internal.h index dac7533..53950d2 100644 --- a/lib/zstd/zstd_internal.h +++ b/lib/zstd/zstd_internal.h @@ 
-126,34 +126,77 @@ static const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG; /*-******************************************* * Shared functions to include for inlining *********************************************/ -ZSTD_STATIC void ZSTD_copy8(void *dst, const void *src) { - /* - * zstd relies heavily on gcc being able to analyze and inline this - * memcpy() call, since it is called in a tight loop. Preboot mode - * is compiled in freestanding mode, which stops gcc from analyzing - * memcpy(). Use __builtin_memcpy() to tell gcc to analyze this as a - * regular memcpy(). - */ - __builtin_memcpy(dst, src, 8); + +FORCE_INLINE void ZSTD_copy8(void *dst, const void *src) +{ + memcpy(dst, src, 8); +} + +FORCE_INLINE void ZSTD_copy16(void *dst, const void *src) +{ + memcpy(dst, src, 16); } + +enum ZSTD_overlap_e { + ZSTD_no_overlap, + ZSTD_overlap_src_before_dst, + /* ZSTD_overlap_dst_before_src, */ +}; + +#define WILDCOPY_OVERLENGTH 32 +#define WILDCOPY_VECLEN 16 + /*! ZSTD_wildcopy() : -* custom version of memcpy(), can copy up to 7 bytes too many (8 bytes if length==0) */ -#define WILDCOPY_OVERLENGTH 8 -ZSTD_STATIC void ZSTD_wildcopy(void *dst, const void *src, ptrdiff_t length) + * Custom version of memcpy(), can over read/write up to WILDCOPY_OVERLENGTH bytes (if length==0) + * @param ovtype controls the overlap detection + * - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart. + * - ZSTD_overlap_src_before_dst: The src and dst may overlap, but they MUST be at least 8 bytes apart. + * The src buffer must be before the dst buffer. + */ +FORCE_INLINE void ZSTD_wildcopy(void *dst, const void *src, ptrdiff_t length, enum ZSTD_overlap_e const ovtype) { - const BYTE* ip = (const BYTE*)src; - BYTE* op = (BYTE*)dst; - BYTE* const oend = op + length; -#if defined(GCC_VERSION) && GCC_VERSION >= 70000 && GCC_VERSION < 70200 - /* - * Work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81388. - * Avoid the bad case where the loop only runs once by handling the - * special case separately. This doesn't trigger the bug because it - * doesn't involve pointer/integer overflow. - */ - if (length <= 8) - return ZSTD_copy8(dst, src); -#endif + ptrdiff_t diff = (BYTE *)dst - (const BYTE *)src; + const BYTE *ip = (const BYTE *)src; + BYTE *op = (BYTE *)dst; + BYTE *const oend = op + length; + + if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) { + /* Handle short offset copies */ + do { + ZSTD_copy8(op, ip); + op += 8; + ip += 8; + } while (op < oend); + } else { + ZSTD_copy16(op, ip); + op += 16; + ip += 16; + ZSTD_copy16(op, ip); + op += 16; + ip += 16; + if (op >= oend) + return; + do { + ZSTD_copy16(op, ip); + op += 16; + ip += 16; + ZSTD_copy16(op, ip); + op += 16; + ip += 16; + } while (op < oend); + } +} + +/*! ZSTD_wildcopy8() : + * The same as ZSTD_wildcopy(), but it can only overwrite 8 bytes, and works for + * overlapping buffers that are at least 8 bytes apart. 
+ */ +ZSTD_STATIC void ZSTD_wildcopy8(void *dst, const void *src, ptrdiff_t length) +{ + const BYTE *ip = (const BYTE *)src; + BYTE *op = (BYTE *)dst; + BYTE *const oend = (BYTE *)op + length; + do { ZSTD_copy8(op, ip); op += 8; @@ -253,7 +296,7 @@ void ZSTD_stackFree(void *opaque, void *address); /*====== common function ======*/ -ZSTD_STATIC U32 ZSTD_highbit32(U32 val) { return 31 - __builtin_clz(val); } +ZSTD_STATIC U32 ZSTD_highbit32(U32 val) { return __builtin_clz(val) ^ 31; } /* hidden functions */ diff --git a/lib/zstd/zstd_opt.h b/lib/zstd/zstd_opt.h index 55e1b4c..d731c42 100644 --- a/lib/zstd/zstd_opt.h +++ b/lib/zstd/zstd_opt.h @@ -676,7 +676,7 @@ _storeSequence: /* cur, last_pos, best_mlen, best_off have to be set */ } ZSTD_updatePrice(seqStorePtr, litLength, anchor, offset, mlen - MINMATCH); - ZSTD_storeSeq(seqStorePtr, litLength, anchor, offset, mlen - MINMATCH); + ZSTD_storeSeq(seqStorePtr, litLength, anchor, iend, offset, mlen - MINMATCH); anchor = ip = ip + mlen; } } /* for (cur=0; cur < last_pos; ) */ @@ -991,7 +991,7 @@ _storeSequence: /* cur, last_pos, best_mlen, best_off have to be set */ } ZSTD_updatePrice(seqStorePtr, litLength, anchor, offset, mlen - MINMATCH); - ZSTD_storeSeq(seqStorePtr, litLength, anchor, offset, mlen - MINMATCH); + ZSTD_storeSeq(seqStorePtr, litLength, anchor, iend, offset, mlen - MINMATCH); anchor = ip = ip + mlen; } } /* for (cur=0; cur < last_pos; ) */ -- 2.7.4
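As a standalone illustration of item [6] above (not part of the patch itself), the sketch below shows why BIT_highbit32()/ZSTD_highbit32() switch from `31 - __builtin_clz(val)` to `__builtin_clz(val) ^ 31`: for any nonzero 32-bit value both expressions return the index of the highest set bit, but the XOR form lets GCC/Clang fold the operation into the BSR/CLZ result instead of emitting a separate subtract. The highbit32_* names and the demo value are invented for this example, and it assumes a GCC/Clang-style compiler for __builtin_clz().

/* Minimal user-space sketch of the BIT_highbit32() codegen change; not kernel code. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static unsigned highbit32_sub(uint32_t val)
{
	return 31 - __builtin_clz(val);	/* old form: clz, then subtract */
}

static unsigned highbit32_xor(uint32_t val)
{
	return __builtin_clz(val) ^ 31;	/* new form: folds into the BSR/CLZ result */
}

int main(void)
{
	uint32_t v;

	/* Both forms agree for every nonzero value; check all powers of two. */
	for (v = 1; v != 0; v <<= 1)
		assert(highbit32_sub(v) == highbit32_xor(v));

	printf("highbit32(1000) = %u\n", highbit32_xor(1000));	/* prints 9 */
	return 0;
}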
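The literal-copy strategy shared by the decode changes ([2], [4], [5]) can be pictured with the sketch below: copy 16 bytes unconditionally to cover the common short-literal case, and only fall back to a wider wildcopy-style loop for longer runs, which requires the caller to reserve WILDCOPY_OVERLENGTH-style headroom past the end of the buffers. The demo_* names and buffer sizes are invented for this example and deliberately over-simplify the real ZSTD_wildcopy()/ZSTD_execSequence() logic (no overlap handling, no safecopy fallback near the block end).

/* Simplified user-space model of the "copy 16 first, wildcopy the rest" literal path. */
#include <stdio.h>
#include <string.h>

#define DEMO_OVERLENGTH 32	/* headroom the caller must leave after dst + length */

static void demo_copy16(void *dst, const void *src)
{
	memcpy(dst, src, 16);	/* compilers lower this to one or two vector moves */
}

/* May read/write up to 15 bytes past src/dst + length, like a wildcopy. */
static void demo_copy_literals(unsigned char *dst, const unsigned char *src, size_t length)
{
	size_t i;

	demo_copy16(dst, src);	/* covers the common case: literals <= 16 bytes */
	if (length <= 16)
		return;
	for (i = 16; i < length; i += 16)	/* overshoot is absorbed by the headroom */
		demo_copy16(dst + i, src + i);
}

int main(void)
{
	unsigned char src[64] = "0123456789abcdef0123456789abcdefXYZ";
	unsigned char dst[64 + DEMO_OVERLENGTH] = { 0 };

	demo_copy_literals(dst, src, 35);
	printf("%s\n", dst);	/* prints the 35 copied characters */
	return 0;
}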