Improved compression speed for big-endian CPU
author yann.collet.73@gmail.com <yann.collet.73@gmail.com@650e7d94-2a16-8b24-b05c-7c0b3f6821cd>
Tue, 10 Jan 2012 20:03:01 +0000 (20:03 +0000)
committer yann.collet.73@gmail.com <yann.collet.73@gmail.com@650e7d94-2a16-8b24-b05c-7c0b3f6821cd>
Tue, 10 Jan 2012 20:03:01 +0000 (20:03 +0000)
git-svn-id: https://lz4.googlecode.com/svn/trunk@48 650e7d94-2a16-8b24-b05c-7c0b3f6821cd

Makefile
lz4.c

index da77c9d..84da2c0 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,10 +1,10 @@
 all: lz4demo64 lz4demo32 
 
 lz4demo64: lz4.c lz4.h bench.c lz4demo.c  
-       gcc      -g -O3 -I. -std=c99 -Wall -W -Wno-implicit-function-declaration lz4.c bench.c lz4demo.c -o lz4demo64.exe
+       gcc      -O3 -I. -std=c99 -Wall -W -Wno-implicit-function-declaration lz4.c bench.c lz4demo.c -o lz4demo64.exe
 
 lz4demo32: lz4.c lz4.h bench.c lz4demo.c
-       gcc -m32 -g -O3 -I. -std=c99 -Wall -W -Wno-implicit-function-declaration lz4.c bench.c lz4demo.c -o lz4demo32.exe
+       gcc -m32 -O3 -I. -std=c99 -Wall -W -Wno-implicit-function-declaration lz4.c bench.c lz4demo.c -o lz4demo32.exe
 
 clean:
        rm -f core *.o lz4demo32.exe lz4demo64.exe
diff --git a/lz4.c b/lz4.c
index a5f34e6..5ad13ee 100644 (file)
--- a/lz4.c
+++ b/lz4.c
@@ -95,7 +95,6 @@
 #define SKIPSTRENGTH 6\r
 #define STACKLIMIT 13\r
 #define HEAPMODE (HASH_LOG>STACKLIMIT)  // Defines if memory is allocated into the stack (local variable), or into the heap (malloc()).\r
-#define COPYTOKEN 4\r
 #define COPYLENGTH 8\r
 #define LASTLITERALS 5\r
 #define MFLIMIT (COPYLENGTH+MINMATCH)\r
@@ -142,44 +141,47 @@ typedef struct _U16_S
 \r
 \r
 //**************************************\r
-// Macros\r
-//**************************************\r
-#define LZ4_HASH_FUNCTION(i)   (((i) * 2654435761U) >> ((MINMATCH*8)-HASH_LOG))\r
-#define LZ4_HASH_VALUE(p)              LZ4_HASH_FUNCTION(A32(p))\r
-#define LZ4_COPYPACKET32(s,d)  A32(d) = A32(s); d+=4; s+=4; A32(d) = A32(s); d+=4; s+=4;\r
-#define LZ4_COPYPACKET64(s,d)  A64(d) = A64(s); d+=8; s+=8;\r
-#define LZ4_WILDCOPY32(s,d,e)  do { LZ4_COPYPACKET32(s,d) } while (d<e);\r
-#define LZ4_WILDCOPY64(s,d,e)  do { LZ4_COPYPACKET64(s,d) } while (d<e);\r
-\r
-\r
-//**************************************\r
 // Architecture-specific macros\r
 //**************************************\r
 #if ARCH64     // 64-bit\r
-#define LZ4_WILDCOPY LZ4_WILDCOPY64\r
-#define LZ4_BLINDCOPY(s,d,l)   { BYTE* e=d+l; LZ4_WILDCOPY(s,d,e); d=e; }\r
+#define COPYSTEP 8\r
+#define LZ4_COPYSTEP(s,d)              A64(d) = A64(s); d+=8; s+=8;\r
+#define LZ4_COPYPACKET(s,d)            LZ4_COPYSTEP(s,d)\r
 #else          // 32-bit\r
-#define LZ4_WILDCOPY LZ4_WILDCOPY32\r
-#define LZ4_BLINDCOPY(s,d,l)   { BYTE* e=d+l; LZ4_WILDCOPY(s,d,e); d=e; }\r
+#define COPYSTEP 4\r
+#define LZ4_COPYSTEP(s,d)              A32(d) = A32(s); d+=4; s+=4;\r
+#define LZ4_COPYPACKET(s,d)            LZ4_COPYSTEP(s,d); LZ4_COPYSTEP(s,d);\r
 #endif\r
 \r
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\r
 #define LZ4_READ_LITTLEENDIAN_16(d,s,p) { d = s - A16(p); }\r
 #define LZ4_WRITE_LITTLEENDIAN_16(p,v) { A16(p) = v; p+=2; }\r
+#define LZ4_NbCommonBytes LZ4_NbCommonBytes_LittleEndian\r
 #else          // Big Endian\r
 #define LZ4_READ_LITTLEENDIAN_16(d,s,p) { int delta = p[0]; delta += p[1] << 8; d = s - delta; }\r
 #define LZ4_WRITE_LITTLEENDIAN_16(p,v) { int delta = v; *p++ = delta; *op++ = delta>>8; }\r
+#define LZ4_NbCommonBytes LZ4_NbCommonBytes_BigEndian\r
 #endif\r
 \r
+\r
+//**************************************\r
+// Macros\r
+//**************************************\r
+#define LZ4_HASH_FUNCTION(i)   (((i) * 2654435761U) >> ((MINMATCH*8)-HASH_LOG))\r
+#define LZ4_HASH_VALUE(p)              LZ4_HASH_FUNCTION(A32(p))\r
+#define LZ4_WILDCOPY(s,d,e)            do { LZ4_COPYPACKET(s,d) } while (d<e);\r
+#define LZ4_BLINDCOPY(s,d,l)   { BYTE* e=d+l; LZ4_WILDCOPY(s,d,e); d=e; }\r
+\r
+\r
 //****************************\r
 // Private functions\r
 //****************************\r
-inline static int LZ4_NbCommonBytes_LittleEndian( register U32 val )\r
+inline static int LZ4_NbCommonBytes_LittleEndian (register U32 val)\r
 {\r
     #if defined(_MSC_VER) && !defined(_FORCE_SW_BITCOUNT)\r
-    unsigned long b = 0;\r
-    _BitScanForward( &b, val );\r
-    return (int)(b>>3);\r
+    unsigned long r = 0;\r
+    _BitScanForward( &r, val );\r
+    return (int)(r>>3);\r
     #elif defined(__GNUC__)  && !defined(_FORCE_SW_BITCOUNT)\r
     return (__builtin_ctz(val) >> 3); \r
     #else\r
@@ -188,6 +190,22 @@ inline static int LZ4_NbCommonBytes_LittleEndian( register U32 val )
     #endif\r
 }\r
 \r
+inline static int LZ4_NbCommonBytes_BigEndian (register U32 val)\r
+{\r
+    #if defined(_MSC_VER) && !defined(_FORCE_SW_BITCOUNT)\r
+    unsigned long r = 0;\r
+    _BitScanReverse( &r, val );\r
+    return (int)(r>>3);\r
+    #elif defined(__GNUC__)  && !defined(_FORCE_SW_BITCOUNT)\r
+    return (__builtin_clz(val) >> 3); \r
+    #else\r
+       int r;\r
+       if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }\r
+       r += (!val);\r
+       return r;\r
+    #endif\r
+}\r
+\r
 \r
 //******************************\r
 // Public Compression functions\r
@@ -281,15 +299,9 @@ _next_match:
                anchor = ip;\r
                while (ip<matchlimit-3)\r
                {\r
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\r
                        U32 diff = A32(ref) ^ A32(ip);\r
                        if (!diff) { ip+=4; ref+=4; continue; }\r
-                       ip += LZ4_NbCommonBytes_LittleEndian(diff);\r
-#else\r
-                       if (A32(ref) == A32(ip)) { ip+=4; ref+=4; continue; }\r
-                       if (A16(ref) == A16(ip)) { ip+=2; ref+=2; }\r
-                       if (*ref == *ip) ip++;\r
-#endif\r
+                       ip += LZ4_NbCommonBytes(diff);\r
                        goto _endCount;\r
                }\r
                if ((ip<(matchlimit-1)) && (A16(ref) == A16(ip))) { ip+=2; ref+=2; }\r
@@ -427,15 +439,9 @@ _next_match:
                anchor = ip;\r
                while (ip<matchlimit-3)\r
                {\r
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\r
                        U32 diff = A32(ref) ^ A32(ip);\r
                        if (!diff) { ip+=4; ref+=4; continue; }\r
-                       ip += LZ4_NbCommonBytes_LittleEndian(diff);\r
-#else\r
-                       if (A32(ref) == A32(ip)) { ip+=4; ref+=4; continue; }\r
-                       if (A16(ref) == A16(ip)) { ip+=2; ref+=2; }\r
-                       if (*ref == *ip) ip++;\r
-#endif\r
+                       ip += LZ4_NbCommonBytes(diff);\r
                        goto _endCount;\r
                }\r
                if ((ip<(matchlimit-1)) && (A16(ref) == A16(ip))) { ip+=2; ref+=2; }\r
@@ -501,14 +507,13 @@ int LZ4_compress(char* source,
 \r
 \r
 //****************************\r
-// Decompression CODE\r
+// Decompression functions\r
 //****************************\r
 \r
 // Note : The decoding functions LZ4_uncompress() and LZ4_uncompress_unknownOutputSize() \r
 //             are safe against "buffer overflow" attack type.\r
-//             They will *never* write nor read outside of the provided input and output buffer :\r
-//             they both check this condition *before* writing anything.\r
-//             A corrupted packet will trigger an error result, a negative int, indicating the position of the error within input stream.\r
+//             They will never write nor read outside of the provided input and output buffers.\r
+//             A corrupted input will produce an error result, a negative int, indicating the position of the error within input stream.\r
 \r
 int LZ4_uncompress(char* source, \r
                                 char* dest,\r
@@ -525,10 +530,7 @@ int LZ4_uncompress(char* source,
        BYTE token;\r
        \r
        int     len, length;\r
-       size_t dec1[] ={0, 3, 2, 3, 0, 0, 0, 0};\r
-#if ARCH64\r
-       size_t dec2[]={0, 4, 4, 3, 4, 5, 6, 7};\r
-#endif\r
+       size_t dec[] ={0, 3, 2, 3, 0, 0, 0, 0};\r
 \r
 \r
        // Main Loop\r
@@ -557,53 +559,34 @@ int LZ4_uncompress(char* source,
                if ((length=(token&ML_MASK)) == ML_MASK) { for (;*ip==255;length+=255) {ip++;} length += *ip++; } \r
 \r
                // copy repeated sequence\r
-#if ARCH64\r
-               if (op-ref<8)\r
+               if (op-ref<COPYSTEP)\r
                {\r
-                       int tmp = dec2[op-ref];\r
-                       *op++ = *ref++;\r
-                       *op++ = *ref++;\r
-                       *op++ = *ref++;\r
-                       *op++ = *ref++;\r
-                       ref -= dec1[op-ref];\r
-                       A32(op)=A32(ref); op += 4; ref += 4;\r
-                       ref -= tmp;\r
-               } else { LZ4_COPYPACKET64(ref,op); }\r
-               cpy = op + length - 4;\r
-               if (cpy > oend-COPYLENGTH)\r
-               {\r
-                       if (cpy > oend) goto _output_error;     \r
-                       LZ4_WILDCOPY64(ref, op, (oend-COPYLENGTH));\r
-                       while(op<cpy) *op++=*ref++;\r
-                       op=cpy;\r
-                       if (op == oend) break;    // Check EOF (should never happen, since last 5 bytes are supposed to be literals)\r
-                       continue;\r
-               }\r
-               LZ4_WILDCOPY64(ref, op, cpy);\r
-               op=cpy;         // correction\r
+#if ARCH64\r
+                       size_t dec2table[]={0, 4, 4, 3, 4, 5, 6, 7};\r
+                       size_t dec2 = dec2table[op-ref];\r
 #else\r
-               if (op-ref<COPYTOKEN)\r
-               {\r
+                       int dec2 = 0;\r
+#endif\r
                        *op++ = *ref++;\r
                        *op++ = *ref++;\r
                        *op++ = *ref++;\r
                        *op++ = *ref++;\r
-                       ref -= dec1[op-ref];\r
-                       A32(op)=A32(ref); \r
-               } else { A32(op)=A32(ref); op+=4; ref+=4; }\r
-               cpy = op + length;\r
+                       ref -= dec[op-ref];\r
+                       A32(op)=A32(ref); op += COPYSTEP-4; ref += COPYSTEP-4;\r
+                       ref -= dec2;\r
+               } else { LZ4_COPYSTEP(ref,op); }\r
+               cpy = op + length - (COPYSTEP-4);\r
                if (cpy>oend-COPYLENGTH)\r
                {\r
                        if (cpy > oend) goto _output_error;     \r
-                       LZ4_WILDCOPY32(ref, op, (oend-COPYLENGTH));\r
+                       LZ4_WILDCOPY(ref, op, (oend-COPYLENGTH));\r
                        while(op<cpy) *op++=*ref++;\r
                        op=cpy;\r
                        if (op == oend) break;    // Check EOF (should never happen, since last 5 bytes are supposed to be literals)\r
                        continue;\r
                }\r
-               LZ4_WILDCOPY32(ref, op, cpy);\r
+               LZ4_WILDCOPY(ref, op, cpy);\r
                op=cpy;         // correction\r
-#endif\r
        }\r
 \r
        // end of decoding\r
@@ -633,10 +616,7 @@ int LZ4_uncompress_unknownOutputSize(
        BYTE token;\r
        \r
        int     len, length;\r
-       size_t dec1[] ={0, 3, 2, 3, 0, 0, 0, 0};\r
-#if ARCH64\r
-       size_t dec2[]={0, 4, 4, 3, 4, 5, 6, 7};\r
-#endif\r
+       size_t dec[] ={0, 3, 2, 3, 0, 0, 0, 0};\r
 \r
 \r
        // Main Loop\r
@@ -660,59 +640,40 @@ int LZ4_uncompress_unknownOutputSize(
 \r
                // get offset\r
                LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2;\r
-               if (ref < (BYTE* const)dest) goto _output_error;                \r\r
+               if (ref < (BYTE* const)dest) goto _output_error;\r
+\r
                // get matchlength\r
                if ((length=(token&ML_MASK)) == ML_MASK) { for (;(len=*ip++)==255;length+=255){} length += len; }\r
 \r
                // copy repeated sequence\r
-#if ARCH64\r
-               if (op-ref<8)\r
+               if (op-ref<COPYSTEP)\r
                {\r
-                       int tmp = dec2[op-ref];\r
-                       *op++ = *ref++;\r
-                       *op++ = *ref++;\r
-                       *op++ = *ref++;\r
-                       *op++ = *ref++;\r
-                       ref -= dec1[op-ref];\r
-                       A32(op)=A32(ref); op += 4; ref += 4;\r
-                       ref -= tmp;\r
-               } else { LZ4_COPYPACKET64(ref,op); }\r
-               cpy = op + length - 4;\r
-               if (cpy > oend-COPYLENGTH)\r
-               {\r
-                       if (cpy > oend) goto _output_error;     \r
-                       LZ4_WILDCOPY64(ref, op, (oend-COPYLENGTH));\r
-                       while(op<cpy) *op++=*ref++;\r
-                       op=cpy;\r
-                       if (op == oend) break;    // Check EOF (should never happen, since last 5 bytes are supposed to be literals)\r
-                       continue;\r
-               }\r
-               LZ4_WILDCOPY64(ref, op, cpy);\r
-               op=cpy;         // correction\r
+#if ARCH64\r
+                       size_t dec2table[]={0, 4, 4, 3, 4, 5, 6, 7};\r
+                       size_t dec2 = dec2table[op-ref];\r
 #else\r
-               if (op-ref<COPYTOKEN)\r
-               {\r
+                       int dec2 = 0;\r
+#endif\r
                        *op++ = *ref++;\r
                        *op++ = *ref++;\r
                        *op++ = *ref++;\r
                        *op++ = *ref++;\r
-                       ref -= dec1[op-ref];\r
-                       A32(op)=A32(ref); \r
-               } else { A32(op)=A32(ref); op+=4; ref+=4; }\r
-               cpy = op + length;\r
+                       ref -= dec[op-ref];\r
+                       A32(op)=A32(ref); op += COPYSTEP-4; ref += COPYSTEP-4;\r
+                       ref -= dec2;\r
+               } else { LZ4_COPYSTEP(ref,op); }\r
+               cpy = op + length - (COPYSTEP-4);\r
                if (cpy>oend-COPYLENGTH)\r
                {\r
                        if (cpy > oend) goto _output_error;     \r
-                       LZ4_WILDCOPY32(ref, op, (oend-COPYLENGTH));\r
+                       LZ4_WILDCOPY(ref, op, (oend-COPYLENGTH));\r
                        while(op<cpy) *op++=*ref++;\r
                        op=cpy;\r
                        if (op == oend) break;    // Check EOF (should never happen, since last 5 bytes are supposed to be literals)\r
                        continue;\r
                }\r
-               LZ4_WILDCOPY32(ref, op, cpy);\r
+               LZ4_WILDCOPY(ref, op, cpy);\r
                op=cpy;         // correction\r
-#endif\r
-\r
        }\r
 \r
        // end of decoding\r