[LZ4_compress_destSize] Fix rare data corruption bug

author Nick Terrell <terrelln@fb.com>

Wed, 17 Jul 2019 18:28:38 +0000 (11:28 -0700)

committer Nick Terrell <terrelln@fb.com>

Wed, 17 Jul 2019 18:38:38 +0000 (11:38 -0700)
author Nick Terrell <terrelln@fb.com>
Wed, 17 Jul 2019 18:28:38 +0000 (11:28 -0700)
committer Nick Terrell <terrelln@fb.com>
Wed, 17 Jul 2019 18:38:38 +0000 (11:38 -0700)
diff --git a/lib/lz4.c b/lib/lz4.c

index 1e80c98..461644d 100644 (file)
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -648,6 +648,18 @@ LZ4_FORCE_INLINE U32 LZ4_hashPosition(const void* const p, tableType_t const tab
      return LZ4_hash4(LZ4_read32(p), tableType);
  }
  
+static void LZ4_clearHash(U32 h, void* tableBase, tableType_t const tableType)
+{ /* Overwrite slot h of the hash table at tableBase with NULL (byPtr) or 0 (byU32/byU16); tableType selects the element type. */
+    switch (tableType)
+    {
+    default: /* fallthrough: any other tableType value is handled like clearedTable */
+    case clearedTable: { /* illegal! trips assert(0) when assertions are enabled; returns without writing to the table */ assert(0); return; }
+    case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = NULL; return; } /* table of const BYTE* entries: slot set to NULL */
+    case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = 0; return; } /* table of U32 entries: slot set to 0 */
+    case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = 0; return; } /* table of U16 entries: slot set to 0 */
+    }
+}
+
  static void LZ4_putIndexOnHash(U32 idx, U32 h, void* tableBase, tableType_t const tableType)
  {
      switch (tableType)
@@ -848,6 +860,7 @@ LZ4_FORCE_INLINE int LZ4_compress_generic(
      for ( ; ; ) {
          const BYTE* match;
          BYTE* token;
+        const BYTE* filledIp;
  
          /* Find a match */
          if (tableType == byPtr) {
@@ -934,6 +947,7 @@ LZ4_FORCE_INLINE int LZ4_compress_generic(
          }
  
          /* Catch up */
+        filledIp = ip;
          while (((ip>anchor) & (match > lowLimit)) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; }
  
          /* Encode Literals */
@@ -1018,7 +1032,21 @@ _next_match:
                      /* Match description too long : reduce it */
                      U32 newMatchCode = 15 /* in token */ - 1 /* to avoid needing a zero byte */ + ((U32)(olimit - op) - 1 - LASTLITERALS) * 255;
                      ip -= matchCode - newMatchCode;
+                    assert(newMatchCode < matchCode);
                      matchCode = newMatchCode;
+                    if (unlikely(ip < filledIp)) {
+                        /* We have already filled up to filledIp so if ip ends up less than filledIp
+                         * we have positions in the hash table beyond the current position. This is
+                         * a problem if we reuse the hash table. So we have to remove these positions
+                         * from the hash table.
+                         */
+                        const BYTE* ptr;
+                        DEBUGLOG(5, "Clearing %u positions", (U32)(filledIp - ip));
+                        for (ptr = ip + 1; ptr <= filledIp; ++ptr) {
+                            U32 const h = LZ4_hashPosition(ptr, tableType);
+                            LZ4_clearHash(h, cctx->hashTable, tableType);
+                        }
+                    }
                  } else {
                      assert(outputDirective == limitedOutput);
                      return 0;   /* cannot compress within `dst` budget. Stored indexes in hash table are nonetheless fine */
@@ -1038,6 +1066,8 @@ _next_match:
              } else
                  *token += (BYTE)(matchCode);
          }
+        /* Ensure we have enough space for the last literals. */
+        assert(!(outputDirective == fillOutput && op + 1 + LASTLITERALS > olimit));
  
          anchor = ip;
author	Nick Terrell <terrelln@fb.com>
	Wed, 17 Jul 2019 18:28:38 +0000 (11:28 -0700)
committer	Nick Terrell <terrelln@fb.com>
	Wed, 17 Jul 2019 18:38:38 +0000 (11:38 -0700)