Merge pull request lz4#530 from lz4/lz4fRingBuffer

Random lz4f clarifications
tanaynv · May 3, 2018 · 2b6c4f3 · 2b6c4f3
2 parents 95607a7 + ffbff1f
commit 2b6c4f3
Show file tree

Hide file tree

Showing 4 changed files with 103 additions and 64 deletions.
diff --git a/NEWS b/NEWS
@@ -1,13 +1,13 @@
 v1.8.2
 perf: *much* faster dictionary compression on small files, by @felixhandte
+perf: improved decompression speed and binary size, by Alexey Tourbin (@svpv)
 perf: slightly faster HC compression and decompression speed
 perf: very small compression ratio improvement
-perf: improved decompression binary size and speed, by Alexey Tourbin (@svpv)
 fix : compression compatible with low memory addresses (< 0xFFFF)
 fix : decompression segfault when provided with NULL input, by @terrelln
 cli : new command --favor-decSpeed
 cli : benchmark mode more accurate for small inputs
-fullbench : can measure _destSize() variants, by @felixhandte
+fullbench : can bench _destSize() variants, by @felixhandte
 doc : clarified block format parsing restrictions, by Alexey Tourbin (@svpv)
 
 v1.8.1

diff --git a/README.md b/README.md
@@ -43,33 +43,32 @@ Benchmarks
 -------------------------
 
 The benchmark uses [lzbench], from @inikep
-compiled with GCC v6.2.0 on Linux 64-bits.
-The reference system uses a Core i7-3930K CPU @ 4.5GHz.
+compiled with GCC v7.3.0 on Linux 64-bits (Debian 4.15.17-1).
+The reference system uses a Core i7-6700K CPU @ 4.0GHz.
 Benchmark evaluates the compression of reference [Silesia Corpus]
 in single-thread mode.
 
 [lzbench]: https://github.com/inikep/lzbench
 [Silesia Corpus]: http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia
 
-|  Compressor            | Ratio   | Compression | Decompression |
-|  ----------            | -----   | ----------- | ------------- |
-|  memcpy                |  1.000  | 7300 MB/s   |   7300 MB/s   |
-|**LZ4 fast 8  (v1.7.3)**|  1.799  |**911 MB/s** | **3360 MB/s** |
-|**LZ4 default (v1.7.3)**|**2.101**|**625 MB/s** | **3220 MB/s** |
-|  LZO 2.09              |  2.108  |  620 MB/s   |    845 MB/s   |
-|  QuickLZ 1.5.0         |  2.238  |  510 MB/s   |    600 MB/s   |
-|  Snappy 1.1.3          |  2.091  |  450 MB/s   |   1550 MB/s   |
-|  LZF v3.6              |  2.073  |  365 MB/s   |    820 MB/s   |
-|  [Zstandard] 1.1.1 -1  |  2.876  |  330 MB/s   |    930 MB/s   |
-|  [Zstandard] 1.1.1 -3  |  3.164  |  200 MB/s   |    810 MB/s   |
-| [zlib] deflate 1.2.8 -1|  2.730  |  100 MB/s   |    370 MB/s   |
-|**LZ4 HC -9 (v1.7.3)**  |**2.720**|   34 MB/s   | **3240 MB/s** |
-| [zlib] deflate 1.2.8 -6|  3.099  |   33 MB/s   |    390 MB/s   |
+|  Compressor             | Ratio   | Compression | Decompression |
+|  ----------             | -----   | ----------- | ------------- |
+|  memcpy                 |  1.000  |13100 MB/s   |  13100 MB/s   |
+|**LZ4 default (v1.8.2)** |**2.101**|**730 MB/s** | **3900 MB/s** |
+|  LZO 2.09               |  2.108  |  630 MB/s   |    800 MB/s   |
+|  QuickLZ 1.5.0          |  2.238  |  530 MB/s   |    720 MB/s   |
+|  Snappy 1.1.4           |  2.091  |  525 MB/s   |   1750 MB/s   |
+|  [Zstandard] 1.3.4 -1   |  2.877  |  470 MB/s   |   1380 MB/s   |
+|  LZF v3.6               |  2.073  |  380 MB/s   |    840 MB/s   |
+| [zlib] deflate 1.2.11 -1|  2.730  |  100 MB/s   |    380 MB/s   |
+|**LZ4 HC -9 (v1.8.2)**   |**2.721**|   40 MB/s   | **3920 MB/s** |
+| [zlib] deflate 1.2.11 -6|  3.099  |   34 MB/s   |    410 MB/s   |
 
 [zlib]: http://www.zlib.net/
 [Zstandard]: http://www.zstd.net/
 
-LZ4 is also compatible and well optimized for x32 mode, for which it provides an additional +10% speed performance.
+LZ4 is also compatible and well optimized for x32 mode,
+for which it provides some additional speed performance.
 
 
 Installation

diff --git a/doc/lz4_manual.html b/doc/lz4_manual.html
@@ -206,45 +206,67 @@ <h1>1.8.2 Manual</h1>
 
 <pre><b>LZ4_streamDecode_t* LZ4_createStreamDecode(void);
 int                 LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream);
-</b><p>  creation / destruction of streaming decompression tracking structure.
-  A tracking structure can be re-used multiple times sequentially. 
+</b><p>  creation / destruction of streaming decompression tracking context.
+  A tracking context can be re-used multiple times.
+
 </p></pre><BR>
 
 <pre><b>int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize);
-</b><p>  An LZ4_streamDecode_t structure can be allocated once and re-used multiple times.
+</b><p>  An LZ4_streamDecode_t context can be allocated once and re-used multiple times.
   Use this function to start decompression of a new stream of blocks.
   A dictionary can optionally be set. Use NULL or size 0 for a reset order.
+  Dictionary is presumed stable : it must remain accessible and unmodified during next decompression.
  @return : 1 if OK, 0 if error
 
 </p></pre><BR>
 
+<pre><b>int LZ4_decoderRingBufferSize(int maxBlockSize);
+#define LZ4_DECODER_RING_BUFFER_SIZE(mbs) (65536 + 14 + (mbs))  </b>/* for static allocation; mbs presumed valid */<b>
+</b><p>  Note : in a ring buffer scenario (optional),
+  blocks are presumed decompressed next to each other
+  up to the moment there is not enough remaining space for next block (remainingSize < maxBlockSize),
+  at which stage it resumes from beginning of ring buffer.
+  When setting such a ring buffer for streaming decompression,
+  this function provides the minimum size of this ring buffer
+  to be compatible with any source respecting maxBlockSize condition.
+ @return : minimum ring buffer size,
+           or 0 if there is an error (invalid maxBlockSize).
+
+</p></pre><BR>
+
 <pre><b>int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int srcSize, int dstCapacity);
 int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int originalSize);
 </b><p>  These decoding functions allow decompression of consecutive blocks in "streaming" mode.
   A block is an unsplittable entity, it must be presented entirely to a decompression function.
-  Decompression functions only accept one block at a time.
+  Decompression functions only accept one block at a time.
   The last 64KB of previously decoded data *must* remain available and unmodified at the memory position where they were decoded.
-  If less than 64KB of data has been decoded all the data must be present.
-
-  Special : if application sets a ring buffer for decompression, it must respect one of the following conditions :
-  - Exactly same size as encoding buffer, with same update rule (block boundaries at same positions)
-    In which case, the decoding & encoding ring buffer can have any size, including very small ones ( < 64 KB).
-  - Larger than encoding buffer, by a minimum of maxBlockSize more bytes.
-    maxBlockSize is implementation dependent. It's the maximum size of any single block.
+  If less than 64KB of data has been decoded, all the data must be present.
+
+  Special : if decompression side sets a ring buffer, it must respect one of the following conditions :
+  - Decompression buffer size is _at least_ LZ4_decoderRingBufferSize(maxBlockSize).
+    maxBlockSize is the maximum size of any single block. It can have any value > 16 bytes.
+    In which case, encoding and decoding buffers do not need to be synchronized.
+    Actually, data can be produced by any source compliant with LZ4 format specification, and respecting maxBlockSize.
+  - Synchronized mode :
+    Decompression buffer size is _exactly_ the same as compression buffer size,
+    and follows exactly same update rule (block boundaries at same positions),
+    and decoding function is provided with exact decompressed size of each block (exception for last block of the stream),
+    _then_ decoding & encoding ring buffer can have any size, including small ones ( < 64 KB).
+  - Decompression buffer is larger than encoding buffer, by a minimum of maxBlockSize more bytes.
     In which case, encoding and decoding buffers do not need to be synchronized,
     and encoding ring buffer can have any size, including small ones ( < 64 KB).
-  - _At least_ 64 KB + 8 bytes + maxBlockSize.
-    In which case, encoding and decoding buffers do not need to be synchronized,
-    and encoding ring buffer can have any size, including larger than decoding buffer.
-  Whenever these conditions are not possible, save the last 64KB of decoded data into a safe buffer,
-  and indicate where it is saved using LZ4_setStreamDecode() before decompressing next block.
+
+  Whenever these conditions are not possible,
+  save the last 64KB of decoded data into a safe buffer where it can't be modified during decompression,
+  then indicate where this data is saved using LZ4_setStreamDecode(), before decompressing next block.
 </p></pre><BR>
 
 <pre><b>int LZ4_decompress_safe_usingDict (const char* src, char* dst, int srcSize, int dstCapacity, const char* dictStart, int dictSize);
 int LZ4_decompress_fast_usingDict (const char* src, char* dst, int originalSize, const char* dictStart, int dictSize);
 </b><p>  These decoding functions work the same as
   a combination of LZ4_setStreamDecode() followed by LZ4_decompress_*_continue()
   They are stand-alone, and don't need an LZ4_streamDecode_t structure.
+  Dictionary is presumed stable : it must remain accessible and unmodified during next decompression.
 
 </p></pre><BR>
 

diff --git a/lib/lz4frame.c b/lib/lz4frame.c
@@ -96,6 +96,19 @@ You can contact the author at :
 
 #define LZ4F_STATIC_ASSERT(c)    { enum { LZ4F_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
 
+#if defined(LZ4_DEBUG) && (LZ4_DEBUG>=2) && !defined(DEBUGLOG)
+#  include <stdio.h>
+static int g_debuglog_enable = 1;
+#  define DEBUGLOG(l, ...) {                                  \
+                if ((g_debuglog_enable) && (l<=LZ4_DEBUG)) {  \
+                    fprintf(stderr, __FILE__ ": ");           \
+                    fprintf(stderr, __VA_ARGS__);             \
+                    fprintf(stderr, " \n");                   \
+            }   }
+#else
+#  define DEBUGLOG(l, ...)      {}    /* disabled */
+#endif
+
 
 /*-************************************
 *  Basic Types
@@ -408,6 +421,7 @@ size_t LZ4F_compressFrame(void* dstBuffer, size_t dstCapacity,
     LZ4_stream_t lz4ctx;
     LZ4F_cctx_t *cctxPtr = &cctx;
 
+    DEBUGLOG(4, "LZ4F_compressFrame");
     MEM_INIT(&cctx, 0, sizeof(cctx));
     cctx.version = LZ4F_VERSION;
     cctx.maxBufferSize = 5 MB;   /* mess with real buffer size to prevent dynamic allocation; works only because autoflush==1 & stableSrc==1 */
@@ -1198,24 +1212,31 @@ LZ4F_errorCode_t LZ4F_getFrameInfo(LZ4F_dctx* dctx, LZ4F_frameInfo_t* frameInfoP
 
 /* LZ4F_updateDict() :
  * only used for LZ4F_blockLinked mode */
-static void LZ4F_updateDict(LZ4F_dctx* dctx, const BYTE* dstPtr, size_t dstSize, const BYTE* dstPtr0, unsigned withinTmp)
+static void LZ4F_updateDict(LZ4F_dctx* dctx,
+                      const BYTE* dstPtr, size_t dstSize, const BYTE* dstBufferStart,
+                      unsigned withinTmp)
 {
     if (dctx->dictSize==0)
         dctx->dict = (const BYTE*)dstPtr;   /* priority to dictionary continuity */
 
-    if (dctx->dict + dctx->dictSize == dstPtr) {  /* dictionary continuity */
+    if (dctx->dict + dctx->dictSize == dstPtr) {  /* dictionary continuity, directly within dstBuffer */
         dctx->dictSize += dstSize;
         return;
     }
 
-    if (dstPtr - dstPtr0 + dstSize >= 64 KB) {  /* dstBuffer large enough to become dictionary */
-        dctx->dict = (const BYTE*)dstPtr0;
-        dctx->dictSize = dstPtr - dstPtr0 + dstSize;
+    if (dstPtr - dstBufferStart + dstSize >= 64 KB) {  /* history in dstBuffer becomes large enough to become dictionary */
+        dctx->dict = (const BYTE*)dstBufferStart;
+        dctx->dictSize = dstPtr - dstBufferStart + dstSize;
         return;
     }
 
-    if ((withinTmp) && (dctx->dict == dctx->tmpOutBuffer)) {
-        /* assumption : dctx->dict + dctx->dictSize == dctx->tmpOut + dctx->tmpOutStart */
+    assert(dstSize < 64 KB);   /* if dstSize >= 64 KB, dictionary would be set into dstBuffer directly */
+
+    /* dstBuffer does not contain whole useful history (64 KB), so it must be saved within tmpOut */
+
+    if ((withinTmp) && (dctx->dict == dctx->tmpOutBuffer)) {   /* continue history within tmpOutBuffer */
+        /* withinTmp expectation : content of [dstPtr,dstSize] is same as [dict+dictSize,dstSize], so we just extend it */
+        assert(dctx->dict + dctx->dictSize == dctx->tmpOut + dctx->tmpOutStart);
         dctx->dictSize += dstSize;
         return;
     }
@@ -1236,7 +1257,7 @@ static void LZ4F_updateDict(LZ4F_dctx* dctx, const BYTE* dstPtr, size_t dstSize,
 
     if (dctx->dict == dctx->tmpOutBuffer) {    /* copy dst into tmp to complete dict */
         if (dctx->dictSize + dstSize > dctx->maxBufferSize) {  /* tmp buffer not large enough */
-            size_t const preserveSize = 64 KB - dstSize;   /* note : dstSize < 64 KB */
+            size_t const preserveSize = 64 KB - dstSize;
             memcpy(dctx->tmpOutBuffer, dctx->dict + dctx->dictSize - preserveSize, preserveSize);
             dctx->dictSize = preserveSize;
         }
@@ -1246,7 +1267,7 @@ static void LZ4F_updateDict(LZ4F_dctx* dctx, const BYTE* dstPtr, size_t dstSize,
     }
 
     /* join dict & dest into tmp */
-    {   size_t preserveSize = 64 KB - dstSize;   /* note : dstSize < 64 KB */
+    {   size_t preserveSize = 64 KB - dstSize;
         if (preserveSize > dctx->dictSize) preserveSize = dctx->dictSize;
         memcpy(dctx->tmpOutBuffer, dctx->dict + dctx->dictSize - preserveSize, preserveSize);
         memcpy(dctx->tmpOutBuffer + preserveSize, dstPtr, dstSize);
@@ -1313,7 +1334,7 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
             }
             dctx->tmpInSize = 0;
             if (srcEnd-srcPtr == 0) return minFHSize;   /* 0-size input */
-            dctx->tmpInTarget = minFHSize;   /* minimum to attempt decode */
+            dctx->tmpInTarget = minFHSize;   /* minimum size to decode header */
             dctx->dStage = dstage_storeFrameHeader;
             /* fall-through */
 
@@ -1470,8 +1491,7 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
                     U32 const calcCRC = XXH32_digest(&dctx->blockChecksum);
                     if (readCRC != calcCRC)
                         return err0r(LZ4F_ERROR_blockChecksum_invalid);
-                }
-            }
+            }   }
             dctx->dStage = dstage_getBlockHeader;  /* new block */
             break;
 
@@ -1512,13 +1532,13 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
             }   }
 
             if ((size_t)(dstEnd-dstPtr) >= dctx->maxBlockSize) {
-                const char *dict = (const char *)dctx->dict;
+                const char* dict = (const char*)dctx->dict;
                 size_t dictSize = dctx->dictSize;
                 int decodedSize;
                 if (dict && dictSize > 1 GB) {
                     /* the dictSize param is an int, avoid truncation / sign issues */
-                    dict += dictSize - 1 GB;
-                    dictSize = 1 GB;
+                    dict += dictSize - 64 KB;
+                    dictSize = 64 KB;
                 }
                 /* enough capacity in `dst` to decompress directly there */
                 decodedSize = LZ4_decompress_safe_usingDict(
@@ -1552,18 +1572,16 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
                 } else {  /* dict not within tmp */
                     size_t const reservedDictSpace = MIN(dctx->dictSize, 64 KB);
                     dctx->tmpOut = dctx->tmpOutBuffer + reservedDictSpace;
-                }
-            }
+            }   }
 
             /* Decode block */
-            {
-                const char *dict = (const char *)dctx->dict;
+            {   const char* dict = (const char*)dctx->dict;
                 size_t dictSize = dctx->dictSize;
                 int decodedSize;
                 if (dict && dictSize > 1 GB) {
                     /* the dictSize param is an int, avoid truncation / sign issues */
-                    dict += dictSize - 1 GB;
-                    dictSize = 1 GB;
+                    dict += dictSize - 64 KB;
+                    dictSize = 64 KB;
                 }
                 decodedSize = LZ4_decompress_safe_usingDict(
                         (const char*)selectedIn, (char*)dctx->tmpOut,
@@ -1586,8 +1604,8 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
                 memcpy(dstPtr, dctx->tmpOut + dctx->tmpOutStart, sizeToCopy);
 
                 /* dictionary management */
-                if (dctx->frameInfo.blockMode==LZ4F_blockLinked)
-                    LZ4F_updateDict(dctx, dstPtr, sizeToCopy, dstStart, 1);
+                if (dctx->frameInfo.blockMode == LZ4F_blockLinked)
+                    LZ4F_updateDict(dctx, dstPtr, sizeToCopy, dstStart, 1 /*withinTmp*/);
 
                 dctx->tmpOutStart += sizeToCopy;
                 dstPtr += sizeToCopy;
@@ -1596,8 +1614,9 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
                     dctx->dStage = dstage_getBlockHeader;  /* get next block */
                     break;
                 }
+                /* could not flush everything : stop there, just request a block header */
+                doAnotherStage = 0;
                 nextSrcSizeHint = BHSize;
-                doAnotherStage = 0;   /* still some data to flush */
                 break;
             }
 
@@ -1634,7 +1653,7 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
                 selectedIn = dctx->tmpIn;
             }   /* if (dctx->dStage == dstage_storeSuffix) */
 
-        /* case dstage_checkSuffix: */   /* no direct call, avoid scan-build warning */
+        /* case dstage_checkSuffix: */   /* no direct entry, avoid initialization risks */
             {   U32 const readCRC = LZ4F_readLE32(selectedIn);
                 U32 const resultCRC = XXH32_digest(&(dctx->xxh));
                 if (readCRC != resultCRC)
@@ -1658,8 +1677,7 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
 
             if (dctx->dStage == dstage_storeSFrameSize)
         case dstage_storeSFrameSize:
-            {
-                size_t const sizeToCopy = MIN(dctx->tmpInTarget - dctx->tmpInSize,
+            {   size_t const sizeToCopy = MIN(dctx->tmpInTarget - dctx->tmpInSize,
                                              (size_t)(srcEnd - srcPtr) );
                 memcpy(dctx->header + dctx->tmpInSize, srcPtr, sizeToCopy);
                 srcPtr += sizeToCopy;
@@ -1673,7 +1691,7 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
                 selectedIn = dctx->header + 4;
             }   /* if (dctx->dStage == dstage_storeSFrameSize) */
 
-        /* case dstage_decodeSFrameSize: */   /* no direct access */
+        /* case dstage_decodeSFrameSize: */   /* no direct entry */
             {   size_t const SFrameSize = LZ4F_readLE32(selectedIn);
                 dctx->frameInfo.contentSize = SFrameSize;
                 dctx->tmpInTarget = SFrameSize;
@@ -1692,7 +1710,7 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
                 LZ4F_resetDecompressionContext(dctx);
                 break;
             }
-        }
+        }   /* switch (dctx->dStage) */
     }   /* while (doAnotherStage) */
 
     /* preserve history within tmp whenever necessary */