liblzma: Add lzma_stream_encoder_mt() for threaded compression.
author Lasse Collin <lasse.collin@tukaani.org>
Mon, 11 Apr 2011 19:03:30 +0000 (22:03 +0300)
committer Lasse Collin <lasse.collin@tukaani.org>
Mon, 11 Apr 2011 19:03:30 +0000 (22:03 +0300)
This is the simplest method to do threading, which splits
the uncompressed data into blocks and compresses them
independently from each other. There's room for improvement
especially to reduce the memory usage, but nevertheless,
this is a good start.

configure.ac
src/liblzma/api/lzma/container.h
src/liblzma/common/Makefile.inc
src/liblzma/common/common.c
src/liblzma/common/common.h
src/liblzma/common/outqueue.c [new file with mode: 0644]
src/liblzma/common/outqueue.h [new file with mode: 0644]
src/liblzma/common/stream_encoder_mt.c [new file with mode: 0644]

index 72ea6cc..8cba630 100644 (file)
@@ -437,6 +437,7 @@ if test "x$enable_threads" = xyes; then
        CC="$PTHREAD_CC"
        AC_SEARCH_LIBS([clock_gettime], [rt])
 fi
+AM_CONDITIONAL([COND_THREADS], [test "x$ax_pthread_ok" = xyes])
 
 echo
 echo "Initializing Libtool:"
index 7a9ffc6..e68c894 100644 (file)
 
 
 /**
+ * \brief       Multithreading options
+ */
+typedef struct {
+       /**
+        * \brief       Flags
+        *
+        * Set this to zero if no flags are wanted.
+        *
+        * No flags are currently supported.
+        */
+       uint32_t flags;
+
+       /**
+        * \brief       Number of worker threads to use
+        */
+       uint32_t threads;
+
+       /**
+        * \brief       Maximum uncompressed size of a Block
+        *
+        * The encoder will start a new .xz Block every block_size bytes.
+        * Using LZMA_FULL_FLUSH or LZMA_FULL_BARRIER with lzma_code()
+        * the caller may tell liblzma to start a new Block earlier.
+        *
+        * With LZMA2, a recommended block size is 2-4 times the LZMA2
+        * dictionary size. With very small dictionaries, it is recommended
+        * to use at least 1 MiB block size for good compression ratio, even
+        * if this is more than four times the dictionary size. Note that
+        * these are only recommendations for typical use cases; feel free
+        * to use other values. Just keep in mind that using a block size
+        * less than the LZMA2 dictionary size is a waste of RAM.
+        *
+        * Set this to 0 to let liblzma choose the block size depending
+        * on the compression options. For LZMA2 it will be 3*dict_size
+        * or 1 MiB, whichever is more.
+        */
+       uint64_t block_size;
+
+       /**
+        * \brief       Timeout to allow lzma_code() to return early
+        *
+        * Multithreading can make liblzma consume input and produce
+        * output in a very bursty way: it may first read a lot of input
+        * to fill internal buffers, then no input or output occurs for
+        * a while.
+        *
+        * In single-threaded mode, lzma_code() won't return until it has
+        * either consumed all the input or filled the output buffer. If
+        * this is done in multithreaded mode, it may cause a call to
+        * lzma_code() to take even tens of seconds, which isn't acceptable
+        * in all applications.
+        *
+        * To avoid very long blocking times in lzma_code(), a timeout
+        * (in milliseconds) may be set here. If lzma_code() would block
+        * longer than this number of milliseconds, it will return with
+        * LZMA_OK. Reasonable values are 100 ms or more. The xz command
+        * line tool uses 300 ms.
+        *
+        * If long blocking times are fine for you, set timeout to a special
+        * value of 0, which will disable the timeout mechanism and will make
+        * lzma_code() block until all the input is consumed or the output
+        * buffer has been filled.
+        *
+        * \note        Even with a timeout, lzma_code() might sometimes take
+        *              a somewhat long time to return. No timing guarantees
+        *              are made.
+        */
+       uint32_t timeout;
+
+       /**
+        * \brief       Compression preset (level and possible flags)
+        *
+        * The preset is set just like with lzma_easy_encoder().
+        * The preset is ignored if filters below is non-NULL.
+        */
+       uint32_t preset;
+
+       /**
+        * \brief       Filter chain (alternative to a preset)
+        *
+        * If this is NULL, the preset above is used. Otherwise the preset
+        * is ignored and the filter chain specified here is used.
+        */
+       const lzma_filter *filters;
+
+       /**
+        * \brief       Integrity check type
+        *
+        * See check.h for available checks. The xz command line tool
+        * defaults to LZMA_CHECK_CRC64, which is a good choice if you
+        * are unsure.
+        */
+       lzma_check check;
+
+       /*
+        * Reserved space to allow possible future extensions without
+        * breaking the ABI. You should not touch these, because the names
+        * of these variables may change. These are not used and never will
+        * be used with the currently supported options, so it is safe to
+        * leave these uninitialized.
+        */
+       lzma_reserved_enum reserved_enum1;
+       lzma_reserved_enum reserved_enum2;
+       lzma_reserved_enum reserved_enum3;
+       uint32_t reserved_int1;
+       uint32_t reserved_int2;
+       uint32_t reserved_int3;
+       uint32_t reserved_int4;
+       uint64_t reserved_int5;
+       uint64_t reserved_int6;
+       uint64_t reserved_int7;
+       uint64_t reserved_int8;
+       void *reserved_ptr1;
+       void *reserved_ptr2;
+       void *reserved_ptr3;
+       void *reserved_ptr4;
+
+} lzma_mt;
+
+
+/**
  * \brief       Calculate approximate memory usage of easy encoder
  *
  * This function is a wrapper for lzma_raw_encoder_memusage().
@@ -191,6 +312,48 @@ extern LZMA_API(lzma_ret) lzma_stream_encoder(lzma_stream *strm,
 
 
 /**
+ * \brief       Calculate approximate memory usage of multithreaded .xz encoder
+ *
+ * Since doing the encoding in threaded mode doesn't affect the memory
+ * requirements of single-threaded decompressor, you can use
+ * lzma_easy_decoder_memusage(options->preset) or
+ * lzma_raw_decoder_memusage(options->filters) to calculate
+ * the decompressor memory requirements.
+ *
+ * \param       options Compression options
+ *
+ * \return      Number of bytes of memory required for encoding with the
+ *              given options. If an error occurs, for example due to
+ *              unsupported preset or filter chain, UINT64_MAX is returned.
+ */
+extern LZMA_API(uint64_t) lzma_stream_encoder_mt_memusage(
+               const lzma_mt *options) lzma_nothrow lzma_attr_pure;
+
+
+/**
+ * \brief       Initialize multithreaded .xz Stream encoder
+ *
+ * This provides the functionality of lzma_easy_encoder() and
+ * lzma_stream_encoder() as a single function for multithreaded use.
+ *
+ * TODO: For lzma_code(), only LZMA_RUN and LZMA_FINISH are currently
+ * supported. Support for other actions has been planned.
+ *
+ * \param       strm    Pointer to properly prepared lzma_stream
+ * \param       options Pointer to multithreaded compression options
+ *
+ * \return      - LZMA_OK
+ *              - LZMA_MEM_ERROR
+ *              - LZMA_UNSUPPORTED_CHECK
+ *              - LZMA_OPTIONS_ERROR
+ *              - LZMA_PROG_ERROR
+ */
+extern LZMA_API(lzma_ret) lzma_stream_encoder_mt(
+               lzma_stream *strm, const lzma_mt *options)
+               lzma_nothrow lzma_attr_warn_unused_result;
+
+
+/**
  * \brief       Initialize .lzma encoder (legacy file format)
  *
  * The .lzma format is sometimes called the LZMA_Alone format, which is the
index 81d751e..dd5a8c8 100644 (file)
@@ -40,6 +40,13 @@ liblzma_la_SOURCES += \
        common/stream_encoder.c \
        common/stream_flags_encoder.c \
        common/vli_encoder.c
+
+if COND_THREADS
+liblzma_la_SOURCES += \
+       common/outqueue.c \
+       common/outqueue.h \
+       common/stream_encoder_mt.c
+endif
 endif
 
 if COND_MAIN_DECODER
index 3005cca..6afb4fb 100644 (file)
@@ -263,7 +263,9 @@ lzma_code(lzma_stream *strm, lzma_action action)
 
        strm->internal->avail_in = strm->avail_in;
 
-       switch (ret) {
+       // Cast is needed to silence a warning about LZMA_TIMED_OUT, which
+       // isn't part of lzma_ret enumeration.
+       switch ((unsigned int)(ret)) {
        case LZMA_OK:
                // Don't return LZMA_BUF_ERROR when it happens the first time.
                // This is to avoid returning LZMA_BUF_ERROR when avail_out
@@ -279,6 +281,11 @@ lzma_code(lzma_stream *strm, lzma_action action)
                }
                break;
 
+       case LZMA_TIMED_OUT:
+               strm->internal->allow_buf_error = false;
+               ret = LZMA_OK;
+               break;
+
        case LZMA_STREAM_END:
                if (strm->internal->sequence == ISEQ_SYNC_FLUSH
                                || strm->internal->sequence == ISEQ_FULL_FLUSH)
index b819432..8e9a387 100644 (file)
 #define LZMA_BUFFER_SIZE 4096
 
 
+/// Maximum number of worker threads within one multithreaded component.
+/// The limit exists solely to make it simpler to prevent integer overflows
+/// when allocating structures etc. This should be big enough for now...
+/// the code won't scale anywhere close to this number anyway.
+#define LZMA_THREADS_MAX 16384
+
+
 /// Starting value for memory usage estimates. Instead of calculating size
 /// of _every_ structure and taking into account malloc() overhead etc., we
 /// add a base size to all memory usage estimates. It's not very accurate
        | LZMA_CONCATENATED )
 
 
+/// Special return value (lzma_ret) to indicate that a timeout was reached
+/// and lzma_code() must not return LZMA_BUF_ERROR. This is converted to
+/// LZMA_OK in lzma_code(). This is not in the lzma_ret enumeration because
+/// there's no need to have it in the public API.
+#define LZMA_TIMED_OUT 32
+
+
 /// Type of encoder/decoder specific data; the actual structure is defined
 /// differently in different coders.
 typedef struct lzma_coder_s lzma_coder;
diff --git a/src/liblzma/common/outqueue.c b/src/liblzma/common/outqueue.c
new file mode 100644 (file)
index 0000000..b9eac16
--- /dev/null
@@ -0,0 +1,180 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file       outqueue.c
+/// \brief      Output queue handling in multithreaded coding
+//
+//  Author:     Lasse Collin
+//
+//  This file has been put into the public domain.
+//  You can do whatever you want with this file.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include "outqueue.h"
+
+
+/// This is to ease integer overflow checking: We may allocate up to
+/// 2 * LZMA_THREADS_MAX buffers and we need some extra memory for other
+/// data structures (that's the second /2).
+#define BUF_SIZE_MAX (UINT64_MAX / LZMA_THREADS_MAX / 2 / 2)
+
+
+/// Validate the queue parameters and derive the buffer count and the
+/// total payload size from them.
+///
+/// \param      bufs_alloc_size Set to bufs_count * buf_size_max on success
+/// \param      bufs_count      Set to the number of buffers on success
+/// \param      buf_size_max    Requested capacity of a single buffer
+/// \param      threads         Requested number of threads
+///
+/// \return     LZMA_OK, or LZMA_OPTIONS_ERROR if either input exceeds
+///             its limit (the outputs are left untouched in that case).
+static lzma_ret
+get_options(uint64_t *bufs_alloc_size, uint32_t *bufs_count,
+               uint64_t buf_size_max, uint32_t threads)
+{
+       const bool too_many_threads = threads > LZMA_THREADS_MAX;
+       const bool buf_too_big = buf_size_max > BUF_SIZE_MAX;
+
+       if (too_many_threads || buf_too_big)
+               return LZMA_OPTIONS_ERROR;
+
+       // Two buffers per thread: it costs RAM, but lets the threads
+       // keep working when Blocks complete out of order.
+       //
+       // NOTE: BUF_SIZE_MAX is derived from this factor of two; keep
+       // them in sync if this is changed.
+       const uint32_t count = 2 * threads;
+
+       *bufs_count = count;
+       *bufs_alloc_size = (uint64_t)(count) * buf_size_max;
+
+       return LZMA_OK;
+}
+
+
+extern uint64_t
+lzma_outq_memusage(uint64_t buf_size_max, uint32_t threads)
+{
+       uint64_t payload_bytes;
+       uint32_t buf_count;
+
+       const lzma_ret ret = get_options(&payload_bytes, &buf_count,
+                       buf_size_max, threads);
+       if (ret != LZMA_OK)
+               return UINT64_MAX;
+
+       // Queue structure + per-buffer bookkeeping + the payload area.
+       uint64_t total = sizeof(lzma_outq);
+       total += buf_count * sizeof(lzma_outbuf);
+       total += payload_bytes;
+
+       return total;
+}
+
+
+extern lzma_ret
+lzma_outq_init(lzma_outq *outq, lzma_allocator *allocator,
+               uint64_t buf_size_max, uint32_t threads)
+{
+       uint64_t bufs_alloc_size;
+       uint32_t bufs_count;
+
+       // Set bufs_count and bufs_alloc_size.
+       return_if_error(get_options(&bufs_alloc_size, &bufs_count,
+                       buf_size_max, threads));
+
+       // Allocate memory if needed.
+       if (outq->buf_size_max != buf_size_max
+                       || outq->bufs_allocated != bufs_count) {
+               lzma_outq_end(outq, allocator);
+
+#if SIZE_MAX < UINT64_MAX
+               if (bufs_alloc_size > SIZE_MAX)
+                       return LZMA_MEM_ERROR;
+#endif
+
+               outq->bufs = lzma_alloc(bufs_count * sizeof(lzma_outbuf),
+                               allocator);
+               outq->bufs_mem = lzma_alloc((size_t)(bufs_alloc_size),
+                               allocator);
+
+               if (outq->bufs == NULL || outq->bufs_mem == NULL) {
+                       lzma_outq_end(outq, allocator);
+                       return LZMA_MEM_ERROR;
+               }
+       }
+
+       // Initialize the rest of the main structure. Initialization of
+       // outq->bufs[] is done when they are actually needed.
+       outq->buf_size_max = (size_t)(buf_size_max);
+       outq->bufs_allocated = bufs_count;
+       outq->bufs_pos = 0;
+       outq->bufs_used = 0;
+       outq->read_pos = 0;
+
+       return LZMA_OK;
+}
+
+
+/// Free the arrays owned by the queue.
+///
+/// The pointers are reset to NULL afterwards, so calling this again on
+/// the same queue is harmless. This matters because lzma_outq_init()
+/// calls this function on its own failure paths and the owner of the
+/// queue may call it once more when it cleans up.
+extern void
+lzma_outq_end(lzma_outq *outq, lzma_allocator *allocator)
+{
+       lzma_free(outq->bufs, allocator);
+       outq->bufs = NULL;
+
+       lzma_free(outq->bufs_mem, allocator);
+       outq->bufs_mem = NULL;
+
+       return;
+}
+
+
+extern lzma_outbuf *
+lzma_outq_get_buf(lzma_outq *outq)
+{
+       // The caller is expected to have verified with
+       // lzma_outq_has_buf() that a free slot exists.
+       assert(outq->bufs_used < outq->bufs_allocated);
+
+       const uint32_t slot = outq->bufs_pos;
+       lzma_outbuf *const fresh = &outq->bufs[slot];
+
+       // Point the slot at its slice of the shared payload area and
+       // reset it to "empty and still being written".
+       fresh->buf = outq->bufs_mem + slot * outq->buf_size_max;
+       fresh->finished = false;
+       fresh->size = 0;
+
+       // Advance the cyclic write position and account for the slot.
+       outq->bufs_pos = slot + 1 == outq->bufs_allocated ? 0 : slot + 1;
+       outq->bufs_used += 1;
+
+       return fresh;
+}
+
+
+/// Tell whether the oldest buffer in the queue has been marked finished
+/// and can thus be passed to lzma_outq_read().
+///
+/// \return     true if the oldest in-use buffer is finished; false if it
+///             is still being written or if the queue is empty.
+extern bool
+lzma_outq_is_readable(const lzma_outq *outq)
+{
+       // With no buffers in use, the index computed below would refer to
+       // bufs[bufs_pos], which is the next slot to be handed out. Its
+       // "finished" flag is either uninitialized or left over from an
+       // earlier use, so it must not be consulted.
+       if (outq->bufs_used == 0)
+               return false;
+
+       // Index of the oldest in-use buffer in the cyclic array.
+       uint32_t i = outq->bufs_pos - outq->bufs_used;
+       if (outq->bufs_pos < outq->bufs_used)
+               i += outq->bufs_allocated;
+
+       return outq->bufs[i].finished;
+}
+
+
+extern lzma_ret
+lzma_outq_read(lzma_outq *restrict outq, uint8_t *restrict out,
+               size_t *restrict out_pos, size_t out_size,
+               lzma_vli *restrict unpadded_size,
+               lzma_vli *restrict uncompressed_size)
+{
+       // Nothing is queued, so there is nothing to copy.
+       if (outq->bufs_used == 0)
+               return LZMA_OK;
+
+       // Locate the oldest in-use slot of the cyclic array.
+       const uint32_t oldest = outq->bufs_pos >= outq->bufs_used
+                       ? outq->bufs_pos - outq->bufs_used
+                       : outq->bufs_pos + outq->bufs_allocated
+                               - outq->bufs_used;
+       lzma_outbuf *const src = &outq->bufs[oldest];
+
+       // A buffer that is still being written cannot be read yet.
+       if (!src->finished)
+               return LZMA_OK;
+
+       // Move as much as fits into the caller's output buffer.
+       // outq->read_pos remembers how far we got in this buffer.
+       lzma_bufcpy(src->buf, &outq->read_pos, src->size,
+                       out, out_pos, out_size);
+
+       // Part of the buffer is still unread; come back later.
+       if (outq->read_pos < src->size)
+               return LZMA_OK;
+
+       // The whole buffer has been copied. Pass on its size information.
+       *unpadded_size = src->unpadded_size;
+       *uncompressed_size = src->uncompressed_size;
+
+       // Release the slot so that it can be handed out again.
+       outq->read_pos = 0;
+       --outq->bufs_used;
+
+       return LZMA_STREAM_END;
+}
diff --git a/src/liblzma/common/outqueue.h b/src/liblzma/common/outqueue.h
new file mode 100644 (file)
index 0000000..154f91b
--- /dev/null
@@ -0,0 +1,155 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file       outqueue.h
+/// \brief      Output queue handling in multithreaded coding
+//
+//  Author:     Lasse Collin
+//
+//  This file has been put into the public domain.
+//  You can do whatever you want with this file.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include "common.h"
+
+
+/// Output buffer for a single thread
+typedef struct {
+       /// Pointer to the output buffer of lzma_outq.buf_size_max bytes.
+       /// lzma_outq_get_buf() points this into lzma_outq.bufs_mem.
+       uint8_t *buf;
+
+       /// Amount of data written to buf. lzma_outq_get_buf() resets
+       /// this to zero.
+       size_t size;
+
+       /// Additional size information. lzma_outq_read() passes these
+       /// to its caller once the whole buffer has been read.
+       lzma_vli unpadded_size;
+       lzma_vli uncompressed_size;
+
+       /// True when no more data will be written into this buffer.
+       ///
+       /// \note       This is read by another thread and thus access
+       ///             to this variable needs a mutex.
+       bool finished;
+
+} lzma_outbuf;
+
+
+/// Queue of output buffers that are handed out and read back in
+/// the same cyclic order
+typedef struct {
+       /// Array of buffers that are used cyclically.
+       lzma_outbuf *bufs;
+
+       /// Memory allocated for all the buffers. bufs[i].buf points to
+       /// bufs_mem + i * buf_size_max once the slot has been taken
+       /// into use.
+       uint8_t *bufs_mem;
+
+       /// Amount of buffer space available in each buffer
+       size_t buf_size_max;
+
+       /// Number of buffers allocated
+       uint32_t bufs_allocated;
+
+       /// Position in the bufs array. The next buffer to be taken
+       /// into use is bufs[bufs_pos].
+       uint32_t bufs_pos;
+
+       /// Number of buffers in use. The oldest buffer in use is
+       /// bufs_used slots behind bufs_pos (wrapping around at
+       /// bufs_allocated).
+       uint32_t bufs_used;
+
+       /// Position in the buffer in lzma_outq_read()
+       size_t read_pos;
+
+} lzma_outq;
+
+
+/**
+ * \brief       Calculate the memory usage of an output queue
+ *
+ * \return      Approximate memory usage in bytes or UINT64_MAX on error.
+ */
+extern uint64_t lzma_outq_memusage(uint64_t buf_size_max, uint32_t threads);
+
+
+/// \brief      Initialize an output queue
+///
+/// \param      outq            Pointer to an output queue. Before calling
+///                             this function the first time, *outq should
+///                             have been zeroed with memzero() so that this
+///                             function knows that there are no previous
+///                             allocations to free.
+/// \param      allocator       Pointer to allocator or NULL
+/// \param      buf_size_max    Maximum amount of data that a single buffer
+///                             in the queue may need to store.
+/// \param      threads         Number of buffers that may be in use
+///                             concurrently. Note that more than this number
+///                             of buffers will actually get allocated to
+///                             improve performance when buffers finish
+///                             out of order.
+///
+/// \return     - LZMA_OK
+///             - LZMA_MEM_ERROR
+///             - LZMA_OPTIONS_ERROR: buf_size_max or threads is too big
+///
+extern lzma_ret lzma_outq_init(lzma_outq *outq, lzma_allocator *allocator,
+               uint64_t buf_size_max, uint32_t threads);
+
+
+/// \brief      Free the memory associated with the output queue
+extern void lzma_outq_end(lzma_outq *outq, lzma_allocator *allocator);
+
+
+/// \brief      Get a new buffer
+///
+/// lzma_outq_has_buf() must be used to check that there is a buffer
+/// available before calling lzma_outq_get_buf().
+///
+extern lzma_outbuf *lzma_outq_get_buf(lzma_outq *outq);
+
+
+/// \brief      Test if there is data ready to be read
+///
+/// Call to this function must be protected with the same mutex that
+/// is used to protect lzma_outbuf.finished.
+///
+extern bool lzma_outq_is_readable(const lzma_outq *outq);
+
+
+/// \brief      Read finished data
+///
+/// \param      outq            Pointer to an output queue
+/// \param      out             Beginning of the output buffer
+/// \param      out_pos         The next byte will be written to
+///                             out[*out_pos].
+/// \param      out_size        Size of the out buffer; the first byte into
+///                             which no data is written to is out[out_size].
+/// \param      unpadded_size   Unpadded Size from the Block encoder
+/// \param      uncompressed_size Uncompressed Size from the Block encoder
+///
+/// \return     - LZMA_OK: All OK. Either no data was available or the buffer
+///               being read didn't become empty yet.
+///             - LZMA_STREAM_END: The buffer being read was finished.
+///               *unpadded_size and *uncompressed_size were set.
+///
+/// \note       This reads lzma_outbuf.finished variables and thus call
+///             to this function needs to be protected with a mutex.
+///
+extern lzma_ret lzma_outq_read(lzma_outq *restrict outq,
+               uint8_t *restrict out, size_t *restrict out_pos,
+               size_t out_size, lzma_vli *restrict unpadded_size,
+               lzma_vli *restrict uncompressed_size);
+
+
+/// \brief      Test if there is at least one buffer free
+///
+/// This must be used before getting a new buffer with lzma_outq_get_buf().
+///
+static inline bool
+lzma_outq_has_buf(const lzma_outq *outq)
+{
+       // A slot is free as long as not every allocated buffer is in use.
+       return outq->bufs_allocated > outq->bufs_used;
+}
+
+
+/// \brief      Test if the queue is completely empty
+static inline bool
+lzma_outq_is_empty(const lzma_outq *outq)
+{
+       // bufs_used is unsigned, so "greater than zero" means "in use".
+       const bool any_in_use = outq->bufs_used > 0;
+       return !any_in_use;
+}
diff --git a/src/liblzma/common/stream_encoder_mt.c b/src/liblzma/common/stream_encoder_mt.c
new file mode 100644 (file)
index 0000000..323f04a
--- /dev/null
@@ -0,0 +1,1011 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file       stream_encoder_mt.c
+/// \brief      Multithreaded .xz Stream encoder
+//
+//  Author:     Lasse Collin
+//
+//  This file has been put into the public domain.
+//  You can do whatever you want with this file.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include "filter_encoder.h"
+#include "easy_preset.h"
+#include "block_encoder.h"
+#include "index_encoder.h"
+#include "outqueue.h"
+
+
+/// Maximum supported block size. This makes it simpler to prevent integer
+/// overflows if we are given unusually large block size.
+#define BLOCK_SIZE_MAX (UINT64_MAX / LZMA_THREADS_MAX)
+
+
+/// State of a worker thread.
+///
+/// NOTE: The order of the enumerators matters. The worker code compares
+/// states numerically: "state <= THR_FINISH" means there is encoding
+/// work to do, and "state >= THR_STOP" means the main thread wants the
+/// current work to be abandoned.
+typedef enum {
+       /// Waiting for work.
+       THR_IDLE,
+
+       /// Encoding is in progress.
+       THR_RUN,
+
+       /// Encoding is in progress but no more input data will
+       /// be read.
+       THR_FINISH,
+
+       /// The main thread wants the thread to stop whatever it was doing
+       /// but not exit.
+       THR_STOP,
+
+       /// The main thread wants the thread to exit. We could use
+       /// cancellation, but since the stop mechanism (THR_STOP) is
+       /// needed anyway, handling exit the same way is lazier.
+       THR_EXIT,
+
+} worker_state;
+
+
+/// Per-thread data of one worker thread
+typedef struct worker_thread_s worker_thread;
+struct worker_thread_s {
+       /// Current state of the thread. This is read and written while
+       /// holding the mutex below, both by the worker itself and by the
+       /// main thread (see threads_stop()).
+       worker_state state;
+
+       /// Input buffer of coder->block_size bytes. The main thread will
+       /// put new input into this and update in_size accordingly. Once
+       /// no more input is coming, state will be set to THR_FINISH.
+       uint8_t *in;
+
+       /// Amount of data available in the input buffer. This is modified
+       /// only by the main thread.
+       size_t in_size;
+
+       /// Output buffer for this thread. This is set by the main
+       /// thread every time a new Block is started with this thread
+       /// structure.
+       lzma_outbuf *outbuf;
+
+       /// Pointer to the main structure is needed when putting this
+       /// thread back to the stack of free threads.
+       lzma_coder *coder;
+
+       /// The allocator is set by the main thread. Since a copy of the
+       /// pointer is kept here, the application must not change the
+       /// allocator before calling lzma_end().
+       lzma_allocator *allocator;
+
+       /// Block encoder
+       lzma_next_coder block_encoder;
+
+       /// Compression options for this Block
+       lzma_block block_options;
+
+       /// Next structure in the stack of free worker threads.
+       worker_thread *next;
+
+       /// Mutex for "state"; worker_encode() also reads in_size while
+       /// holding it.
+       pthread_mutex_t mutex;
+
+       /// Signalled when "state" changes: the worker waits on this for
+       /// new work or input, and threads_stop() waits on this for the
+       /// worker to become idle.
+       pthread_cond_t cond;
+
+       /// The ID of this thread is used to join the thread
+       /// when it's not needed anymore.
+       pthread_t thread_id;
+};
+
+
+/// State of the multithreaded Stream encoder, owned by the main thread
+/// and shared with the worker threads through worker_thread.coder
+struct lzma_coder_s {
+       /// Which part of the .xz Stream is being produced
+       enum {
+               SEQ_STREAM_HEADER,
+               SEQ_BLOCK,
+               SEQ_INDEX,
+               SEQ_STREAM_FOOTER,
+       } sequence;
+
+       /// Start a new Block every block_size bytes of input unless
+       /// LZMA_FULL_FLUSH or LZMA_FULL_BARRIER is used earlier.
+       size_t block_size;
+
+       /// The filter chain currently in use. Worker threads read this
+       /// directly when they set up their Block options.
+       lzma_filter filters[LZMA_FILTERS_MAX + 1];
+
+
+       /// Index to hold sizes of the Blocks
+       lzma_index *index;
+
+       /// Index encoder
+       lzma_next_coder index_encoder;
+
+
+       /// Stream Flags for encoding the Stream Header and Stream Footer.
+       lzma_stream_flags stream_flags;
+
+       /// Buffer to hold Stream Header and Stream Footer.
+       uint8_t header[LZMA_STREAM_HEADER_SIZE];
+
+       /// Read position in header[]
+       size_t header_pos;
+
+
+       /// Output buffer queue for compressed data
+       lzma_outq outq;
+
+
+       /// True if wait_max is used.
+       bool has_timeout;
+
+       /// Maximum wait time if cannot use all the input and cannot
+       /// fill the output buffer.
+       struct timespec wait_max;
+
+
+       /// Error code from a worker thread. Only the first error is
+       /// kept; see worker_error().
+       lzma_ret thread_error;
+
+       /// Array of allocated thread-specific structures
+       worker_thread *threads;
+
+       /// Number of structures in "threads" above. This is also the
+       /// number of threads that will be created at maximum.
+       uint32_t threads_max;
+
+       /// Number of thread structures that have been initialized, and
+       /// thus the number of worker threads actually created so far.
+       uint32_t threads_initialized;
+
+       /// Stack of free threads. When a thread finishes, it puts itself
+       /// back into this stack. This starts as empty because threads
+       /// are created only when actually needed.
+       worker_thread *threads_free;
+
+       /// The most recent worker thread to which the main thread writes
+       /// the new input from the application.
+       worker_thread *thr;
+
+       /// Mutex that the worker threads hold when they set
+       /// thread_error, mark an output buffer as finished, or push
+       /// themselves to threads_free.
+       pthread_mutex_t mutex;
+
+       /// Signalled by a worker thread when it has reported an error
+       /// or has finished with its Block.
+       mythread_cond cond;
+};
+
+
+/// Tell the main thread that something has gone wrong.
+static void
+worker_error(worker_thread *thr, lzma_ret ret)
+{
+       // Only real failures may be reported through this function.
+       assert(ret != LZMA_OK);
+       assert(ret != LZMA_STREAM_END);
+
+       lzma_coder *const coder = thr->coder;
+
+       mythread_sync(coder->mutex) {
+               // Keep the first error; later ones are dropped.
+               if (coder->thread_error == LZMA_OK)
+                       coder->thread_error = ret;
+
+               // Wake up the main thread so that it notices the error.
+               mythread_cond_signal(&coder->cond);
+       }
+
+       return;
+}
+
+
+/// Encode one Block using the input that the main thread feeds into
+/// thr->in and write the result into thr->outbuf.
+///
+/// \param      thr     Worker thread structure
+/// \param      state   State on entry; it is replaced by thr->state on
+///                     every round of the loop.
+///
+/// \return     - THR_FINISH: The Block, including its header, is complete
+///               in thr->outbuf and the size fields of the outbuf are set.
+///             - THR_STOP: Either an error occurred (it has then been
+///               reported with worker_error()) or thr->state asked us
+///               to stop.
+///             - THR_EXIT: thr->state asked us to exit.
+static worker_state
+worker_encode(worker_thread *thr, worker_state state)
+{
+       // Set the Block options.
+       thr->block_options = (lzma_block){
+               .version = 0,
+               .check = thr->coder->stream_flags.check,
+               .compressed_size = thr->coder->outq.buf_size_max,
+               .uncompressed_size = thr->coder->block_size,
+
+               // TODO: To allow changing the filter chain, the filters
+               // array must be copied to each worker_thread.
+               .filters = thr->coder->filters,
+       };
+
+       // Calculate maximum size of the Block Header. This amount is
+       // reserved in the beginning of the buffer so that Block Header
+       // along with Compressed Size and Uncompressed Size can be
+       // written there.
+       lzma_ret ret = lzma_block_header_size(&thr->block_options);
+       if (ret != LZMA_OK) {
+               worker_error(thr, ret);
+               return THR_STOP;
+       }
+
+       // Initialize the Block encoder.
+       ret = lzma_block_encoder_init(&thr->block_encoder,
+                       thr->allocator, &thr->block_options);
+       if (ret != LZMA_OK) {
+               worker_error(thr, ret);
+               return THR_STOP;
+       }
+
+       // in_pos is how much of thr->in has been given to the encoder;
+       // in_size is our latest snapshot of thr->in_size.
+       size_t in_pos = 0;
+       size_t in_size = 0;
+
+       // Compressed data starts right after the space reserved for
+       // the Block Header.
+       thr->outbuf->size = thr->block_options.header_size;
+       const size_t out_size = thr->coder->outq.buf_size_max;
+
+       do {
+               mythread_sync(thr->mutex) {
+                       // Sleep until there is new input or the state
+                       // is something else than THR_RUN.
+                       while (in_size == thr->in_size
+                                       && thr->state == THR_RUN)
+                               pthread_cond_wait(&thr->cond, &thr->mutex);
+
+                       state = thr->state;
+                       in_size = thr->in_size;
+
+                       // TODO? Store in_pos and out_pos into *thr here
+                       // so that the application may read them via
+                       // some currently non-existing function to get
+                       // progress information.
+               }
+
+               // Return if we were asked to stop or exit.
+               if (state >= THR_STOP)
+                       return state;
+
+               lzma_action action = state == THR_FINISH
+                               ? LZMA_FINISH : LZMA_RUN;
+
+               // Limit the amount of input given to the Block encoder
+               // at once. This way this thread can react fairly quickly
+               // if the main thread wants us to stop or exit.
+               static const size_t in_chunk_max = 16384;
+               size_t in_limit = in_size;
+               if (in_size - in_pos > in_chunk_max) {
+                       in_limit = in_pos + in_chunk_max;
+
+                       // More input remains after this chunk, so the
+                       // encoder must not be told to finish yet.
+                       action = LZMA_RUN;
+               }
+
+               ret = thr->block_encoder.code(
+                               thr->block_encoder.coder, thr->allocator,
+                               thr->in, &in_pos, in_limit, thr->outbuf->buf,
+                               &thr->outbuf->size, out_size, action);
+       } while (ret == LZMA_OK);
+
+       // Anything else than LZMA_STREAM_END is an error at this point.
+       if (ret != LZMA_STREAM_END) {
+               worker_error(thr, ret);
+               return THR_STOP;
+       }
+
+       assert(state == THR_FINISH);
+
+       // Encode the Block Header. By doing it after the compression,
+       // we can store the Compressed Size and Uncompressed Size fields.
+       ret = lzma_block_header_encode(&thr->block_options, thr->outbuf->buf);
+       if (ret != LZMA_OK) {
+               worker_error(thr, ret);
+               return THR_STOP;
+       }
+
+       // Set the size information that will be read by the main thread
+       // to write the Index field.
+       thr->outbuf->unpadded_size
+                       = lzma_block_unpadded_size(&thr->block_options);
+       assert(thr->outbuf->unpadded_size != 0);
+       thr->outbuf->uncompressed_size = thr->block_options.uncompressed_size;
+
+       return THR_FINISH;
+}
+
+
+/// Entry point of a worker thread.
+///
+/// The thread sleeps until thr->state becomes something else than
+/// THR_IDLE, encodes a Block with worker_encode() when asked to, hands
+/// the result over to the main thread, and repeats this until it is
+/// told to exit with THR_EXIT.
+///
+/// \param      thr_ptr Pointer to the worker_thread structure of
+///                     this thread
+///
+/// \return     Always NULL
+static void *
+worker_start(void *thr_ptr)
+{
+       worker_thread *thr = thr_ptr;
+       worker_state state = THR_IDLE; // Init to silence a warning
+
+       while (true) {
+               // Wait for work.
+               mythread_sync(thr->mutex) {
+                       while (true) {
+                               // The thread is already idle so if we are
+                               // requested to stop, just set the state.
+                               if (thr->state == THR_STOP)
+                                       thr->state = THR_IDLE;
+
+                               state = thr->state;
+                               if (state != THR_IDLE)
+                                       break;
+
+                               pthread_cond_wait(&thr->cond, &thr->mutex);
+                       }
+               }
+
+               assert(state != THR_IDLE);
+               assert(state != THR_STOP);
+
+               if (state <= THR_FINISH)
+                       state = worker_encode(thr, state);
+
+               if (state == THR_EXIT)
+                       break;
+
+               // Mark the thread as idle unless the main thread has
+               // told us to exit. worker_encode() looks at thr->state
+               // only between the calls to the Block encoder, so an
+               // exit request may have been stored after its last look.
+               // Overwriting it with THR_IDLE would lose the request and
+               // this thread would sleep forever. By leaving THR_EXIT
+               // in place, the next round of the outer loop sees it.
+               //
+               // Signal is needed for the case where the main thread
+               // is waiting for the threads to stop.
+               mythread_sync(thr->mutex) {
+                       if (thr->state != THR_EXIT) {
+                               thr->state = THR_IDLE;
+                               pthread_cond_signal(&thr->cond);
+                       }
+               }
+
+               mythread_sync(thr->coder->mutex) {
+                       // Mark the output buffer as finished if
+                       // no errors occurred.
+                       thr->outbuf->finished = state == THR_FINISH;
+
+                       // Return this thread to the stack of free threads.
+                       thr->next = thr->coder->threads_free;
+                       thr->coder->threads_free = thr;
+
+                       mythread_cond_signal(&thr->coder->cond);
+               }
+       }
+
+       // Exiting, free the resources.
+       pthread_mutex_destroy(&thr->mutex);
+       pthread_cond_destroy(&thr->cond);
+
+       lzma_next_end(&thr->block_encoder, thr->allocator);
+       lzma_free(thr->in, thr->allocator);
+       return NULL;
+}
+
+
+/// Ask every initialized worker thread to abandon its current work
+/// without exiting.
+///
+/// When wait is true, the function returns only after each worker has
+/// reported THR_IDLE; otherwise it returns right after the requests
+/// have been posted.
+static void
+threads_stop(lzma_coder *coder, bool wait)
+{
+       // Post the stop request to each worker and wake it up in case it
+       // is sleeping on its condition variable.
+       for (uint32_t n = 0; n < coder->threads_initialized; ++n) {
+               worker_thread *const thr = &coder->threads[n];
+
+               mythread_sync(thr->mutex) {
+                       thr->state = THR_STOP;
+                       pthread_cond_signal(&thr->cond);
+               }
+       }
+
+       if (wait) {
+               // Block until every worker has acknowledged the request
+               // by moving to the idle state.
+               for (uint32_t n = 0; n < coder->threads_initialized; ++n) {
+                       worker_thread *const thr = &coder->threads[n];
+
+                       mythread_sync(thr->mutex) {
+                               while (thr->state != THR_IDLE)
+                                       pthread_cond_wait(&thr->cond,
+                                                       &thr->mutex);
+                       }
+               }
+       }
+}
+
+
+/// Tell every initialized worker thread to exit, join them all, and
+/// free the array of worker_thread structures.
+static void
+threads_end(lzma_coder *coder, lzma_allocator *allocator)
+{
+       // Post the exit request to each worker and wake it up.
+       for (uint32_t n = 0; n < coder->threads_initialized; ++n) {
+               worker_thread *const thr = &coder->threads[n];
+
+               mythread_sync(thr->mutex) {
+                       thr->state = THR_EXIT;
+                       pthread_cond_signal(&thr->cond);
+               }
+       }
+
+       // Wait for each worker to terminate. The result is only checked
+       // with assert(); the cast keeps builds without assertions free
+       // of unused-variable warnings.
+       for (uint32_t n = 0; n < coder->threads_initialized; ++n) {
+               const int join_ret = pthread_join(
+                               coder->threads[n].thread_id, NULL);
+               assert(join_ret == 0);
+               (void)join_ret;
+       }
+
+       lzma_free(coder->threads, allocator);
+}
+
+
+/// Set up the next unused worker_thread slot and start its thread.
+///
+/// On success the slot is counted in coder->threads_initialized and
+/// becomes coder->thr. On any failure everything acquired so far is
+/// released and LZMA_MEM_ERROR is returned.
+static lzma_ret
+initialize_new_thread(lzma_coder *coder, lzma_allocator *allocator)
+{
+       worker_thread *const slot
+                       = &coder->threads[coder->threads_initialized];
+
+       // Input buffer that can hold one Block worth of uncompressed data.
+       slot->in = lzma_alloc(coder->block_size, allocator);
+       if (slot->in == NULL)
+               return LZMA_MEM_ERROR;
+
+       if (pthread_mutex_init(&slot->mutex, NULL) != 0)
+               goto fail_free_in;
+
+       if (pthread_cond_init(&slot->cond, NULL) != 0)
+               goto fail_destroy_mutex;
+
+       // These must be in place before the thread starts running.
+       slot->state = THR_IDLE;
+       slot->allocator = allocator;
+       slot->coder = coder;
+       slot->block_encoder = LZMA_NEXT_CODER_INIT;
+
+       if (mythread_create(&slot->thread_id, &worker_start, slot) != 0)
+               goto fail_destroy_cond;
+
+       ++coder->threads_initialized;
+       coder->thr = slot;
+
+       return LZMA_OK;
+
+       // Unwind in the reverse order of acquisition.
+fail_destroy_cond:
+       pthread_cond_destroy(&slot->cond);
+
+fail_destroy_mutex:
+       pthread_mutex_destroy(&slot->mutex);
+
+fail_free_in:
+       lzma_free(slot->in, allocator);
+       return LZMA_MEM_ERROR;
+}
+
+
+/// Try to make a worker thread available in coder->thr.
+///
+/// A thread is taken from coder->threads_free or, if that is empty and
+/// fewer than coder->threads_max threads exist, a new one is created.
+/// LZMA_OK with coder->thr still NULL means that nothing was available
+/// right now; an error code is returned only if creating a new thread
+/// failed.
+static lzma_ret
+get_thread(lzma_coder *coder, lzma_allocator *allocator)
+{
+       // A thread is useless without an output buffer to write into.
+       if (!lzma_outq_has_buf(&coder->outq))
+               return LZMA_OK;
+
+       // Pop a thread from the stack of free threads if there is one.
+       mythread_sync(coder->mutex) {
+               worker_thread *const idle = coder->threads_free;
+               if (idle != NULL) {
+                       coder->thr = idle;
+                       coder->threads_free = idle->next;
+               }
+       }
+
+       if (coder->thr == NULL) {
+               // Every allowed thread already exists and all are busy.
+               if (coder->threads_initialized == coder->threads_max)
+                       return LZMA_OK;
+
+               // There is still room for one more thread.
+               return_if_error(initialize_new_thread(coder, allocator));
+       }
+
+       // Hand the thread a fresh output buffer and start it. These
+       // fields have to be reset by the main thread.
+       worker_thread *const thr = coder->thr;
+
+       mythread_sync(thr->mutex) {
+               thr->state = THR_RUN;
+               thr->in_size = 0;
+               thr->outbuf = lzma_outq_get_buf(&coder->outq);
+               pthread_cond_signal(&thr->cond);
+       }
+
+       return LZMA_OK;
+}
+
+
+/// Feed uncompressed input to worker threads.
+///
+/// Input is copied into the buffer of the current thread (coder->thr).
+/// A thread is told to finish its Block when its buffer has reached
+/// coder->block_size bytes, or when all input has been consumed and
+/// action is something other than LZMA_RUN. The loop ends when there is
+/// nothing more to do, when no thread could be obtained, or when the
+/// current thread turns out to have stopped due to an error.
+static lzma_ret
+stream_encode_in(lzma_coder *coder, lzma_allocator *allocator,
+               const uint8_t *restrict in, size_t *restrict in_pos,
+               size_t in_size, lzma_action action)
+{
+       while (*in_pos < in_size
+                       || (coder->thr != NULL && action != LZMA_RUN)) {
+               if (coder->thr == NULL) {
+                       // No thread is accepting input at the moment so
+                       // try to get one. If that isn't possible, pass
+                       // on whatever get_thread() said.
+                       const lzma_ret ret = get_thread(coder, allocator);
+                       if (coder->thr == NULL)
+                               return ret;
+               }
+
+               worker_thread *const thr = coder->thr;
+
+               // Append as much input as fits into the thread's buffer.
+               size_t filled = thr->in_size;
+               lzma_bufcpy(in, in_pos, in_size, thr->in,
+                               &filled, coder->block_size);
+
+               // The Block has to be finished if
+               //  - the buffer holds block_size bytes; or
+               //  - all input was used and LZMA_FINISH, LZMA_FULL_FLUSH,
+               //    or LZMA_FULL_BARRIER was used.
+               //
+               // TODO: LZMA_SYNC_FLUSH and LZMA_SYNC_BARRIER.
+               const bool input_done
+                               = *in_pos == in_size && action != LZMA_RUN;
+               const bool finish
+                               = filled == coder->block_size || input_done;
+
+               bool worker_failed = false;
+
+               mythread_sync(thr->mutex) {
+                       // A thread that is idle here has hit an error in
+                       // the Block encoder. It has set
+                       // coder->thread_error, which is read below.
+                       worker_failed = thr->state == THR_IDLE;
+
+                       if (!worker_failed) {
+                               // Publish the new amount of input and
+                               // update the state if needed.
+                               thr->in_size = filled;
+
+                               if (finish)
+                                       thr->state = THR_FINISH;
+
+                               pthread_cond_signal(&thr->cond);
+                       }
+               }
+
+               if (worker_failed) {
+                       lzma_ret ret;
+
+                       mythread_sync(coder->mutex) {
+                               ret = coder->thread_error;
+                       }
+
+                       return ret;
+               }
+
+               // A thread that was told to finish takes no more input.
+               if (finish)
+                       coder->thr = NULL;
+       }
+
+       return LZMA_OK;
+}
+
+
+/// Sleep until there is something useful for the main thread to do.
+///
+/// Returns when input can be handed to a worker (only considered if
+/// has_input is true), when the output queue is readable, when a worker
+/// has reported an error, or when the optional timeout expires.
+/// The return value is true only in the timeout case.
+static bool
+wait_for_work(lzma_coder *coder, struct timespec *wait_abs,
+               bool *has_blocked, bool has_input)
+{
+       if (coder->has_timeout && !*has_blocked) {
+               // *has_blocked starts as false on every call to
+               // stream_encode_mt() made via lzma_code(). On the first
+               // wait of such a call, remember that we have blocked and
+               // calculate the absolute time at which we must return if
+               // there is still nothing to do.
+               //
+               // Tracking this in *has_blocked avoids unneeded calls to
+               // mythread_cond_abstime(), which may do a syscall
+               // depending on the operating system.
+               *has_blocked = true;
+               *wait_abs = coder->wait_max;
+               mythread_cond_abstime(&coder->cond, wait_abs);
+       }
+
+       bool timed_out = false;
+
+       mythread_sync(coder->mutex) {
+               while (!timed_out) {
+                       // Input can be consumed only if there is input,
+                       // a free worker thread, and an output buffer
+                       // for that thread.
+                       const bool can_take_input = has_input
+                                       && coder->threads_free != NULL
+                                       && lzma_outq_has_buf(&coder->outq);
+
+                       // Stop waiting as soon as input can be consumed,
+                       // output can be read, or a worker has failed.
+                       if (can_take_input
+                                       || lzma_outq_is_readable(&coder->outq)
+                                       || coder->thread_error != LZMA_OK)
+                               break;
+
+                       if (coder->has_timeout)
+                               timed_out = mythread_cond_timedwait(
+                                               &coder->cond, &coder->mutex,
+                                               wait_abs) != 0;
+                       else
+                               mythread_cond_wait(&coder->cond,
+                                               &coder->mutex);
+               }
+       }
+
+       return timed_out;
+}
+
+
+/// The code() callback of the threaded Stream encoder.
+///
+/// Works through four stages tracked in coder->sequence: copy the
+/// Stream Header to out[], run the Block loop, encode the Index, and
+/// copy the Stream Footer. The Block loop alternates between reading
+/// finished data from the output queue and handing input to worker
+/// threads, and sleeps in wait_for_work() when neither is possible.
+///
+/// Returns LZMA_OK when more input or output space is needed,
+/// LZMA_STREAM_END once the Stream Footer has been fully written,
+/// LZMA_TIMED_OUT if wait_for_work() timed out, or an error code from
+/// a worker thread or from one of the functions called here.
+static lzma_ret
+stream_encode_mt(lzma_coder *coder, lzma_allocator *allocator,
+               const uint8_t *restrict in, size_t *restrict in_pos,
+               size_t in_size, uint8_t *restrict out,
+               size_t *restrict out_pos, size_t out_size, lzma_action action)
+{
+       switch (coder->sequence) {
+       case SEQ_STREAM_HEADER:
+               lzma_bufcpy(coder->header, &coder->header_pos,
+                               sizeof(coder->header),
+                               out, out_pos, out_size);
+               if (coder->header_pos < sizeof(coder->header))
+                       return LZMA_OK;
+
+               coder->header_pos = 0;
+               coder->sequence = SEQ_BLOCK;
+
+       // Fall through
+
+       case SEQ_BLOCK: {
+               // Initialized to silence warnings.
+               lzma_vli unpadded_size = 0;
+               lzma_vli uncompressed_size = 0;
+               lzma_ret ret = LZMA_OK;
+
+               // These are for wait_for_work().
+               bool has_blocked = false;
+               struct timespec wait_abs;
+
+               while (true) {
+                       mythread_sync(coder->mutex) {
+                               // Check for Block encoder errors.
+                               // NOTE(review): the break below is expected
+                               // to leave only the mythread_sync block,
+                               // not the enclosing while loop -- confirm
+                               // against the macro definition.
+                               ret = coder->thread_error;
+                               if (ret != LZMA_OK) {
+                                       assert(ret != LZMA_STREAM_END);
+                                       break;
+                               }
+
+                               // Try to read compressed data to out[].
+                               ret = lzma_outq_read(&coder->outq,
+                                               out, out_pos, out_size,
+                                               &unpadded_size,
+                                               &uncompressed_size);
+                       }
+
+                       if (ret == LZMA_STREAM_END) {
+                               // End of Block. Add it to the Index.
+                               ret = lzma_index_append(coder->index,
+                                               allocator, unpadded_size,
+                                               uncompressed_size);
+
+                               // The result has to be checked before the
+                               // "continue" below; otherwise a failure
+                               // would be discarded whenever there is
+                               // still room in out[].
+                               if (ret != LZMA_OK) {
+                                       threads_stop(coder, false);
+                                       return ret;
+                               }
+
+                               // If we didn't fill the output buffer yet,
+                               // try to read more data. Maybe the next
+                               // outbuf has been finished already too.
+                               if (*out_pos < out_size)
+                                       continue;
+                       }
+
+                       if (ret != LZMA_OK) {
+                               // coder->thread_error was set.
+                               threads_stop(coder, false);
+                               return ret;
+                       }
+
+                       // Check if the last Block was finished.
+                       if (action == LZMA_FINISH
+                                       && *in_pos == in_size
+                                       && lzma_outq_is_empty(
+                                               &coder->outq))
+                               break;
+
+                       // Try to give uncompressed data to a worker thread.
+                       ret = stream_encode_in(coder, allocator,
+                                       in, in_pos, in_size, action);
+                       if (ret != LZMA_OK) {
+                               threads_stop(coder, false);
+                               return ret;
+                       }
+
+                       // Return if
+                       //  - we have used all the input and expect to
+                       //    get more input; or
+                       //  - the output buffer has been filled.
+                       //
+                       // TODO: Support flushing.
+                       if ((*in_pos == in_size && action != LZMA_FINISH)
+                                       || *out_pos == out_size)
+                               return LZMA_OK;
+
+                       // Neither in nor out has been used completely.
+                       // Wait until there's something we can do.
+                       if (wait_for_work(coder, &wait_abs, &has_blocked,
+                                       *in_pos < in_size))
+                               return LZMA_TIMED_OUT;
+               }
+
+               // All Blocks have been encoded and the threads have stopped.
+               // Prepare to encode the Index field.
+               return_if_error(lzma_index_encoder_init(
+                               &coder->index_encoder, allocator,
+                               coder->index));
+               coder->sequence = SEQ_INDEX;
+       }
+
+       // Fall through
+
+       case SEQ_INDEX: {
+               // Call the Index encoder. It doesn't take any input, so
+               // those pointers can be NULL.
+               const lzma_ret ret = coder->index_encoder.code(
+                               coder->index_encoder.coder, allocator,
+                               NULL, NULL, 0,
+                               out, out_pos, out_size, LZMA_RUN);
+               if (ret != LZMA_STREAM_END)
+                       return ret;
+
+               // Encode the Stream Footer into coder->header. The buffer
+               // is free for reuse because the Stream Header has already
+               // been copied out in full.
+               coder->stream_flags.backward_size
+                               = lzma_index_size(coder->index);
+               if (lzma_stream_footer_encode(&coder->stream_flags,
+                               coder->header) != LZMA_OK)
+                       return LZMA_PROG_ERROR;
+
+               coder->sequence = SEQ_STREAM_FOOTER;
+       }
+
+       // Fall through
+
+       case SEQ_STREAM_FOOTER:
+               lzma_bufcpy(coder->header, &coder->header_pos,
+                               sizeof(coder->header),
+                               out, out_pos, out_size);
+               return coder->header_pos < sizeof(coder->header)
+                               ? LZMA_OK : LZMA_STREAM_END;
+       }
+
+       assert(0);
+       return LZMA_PROG_ERROR;
+}
+
+
+/// The end() callback: shut down the worker threads and free everything
+/// that is owned by the coder, including the coder structure itself.
+static void
+stream_encoder_mt_end(lzma_coder *coder, lzma_allocator *allocator)
+{
+       // The workers use the output queue, so they have to be gone
+       // before the queue can be freed.
+       threads_end(coder, allocator);
+       lzma_outq_end(&coder->outq, allocator);
+
+       // Free the options of each filter in the copied chain. The chain
+       // is terminated with LZMA_VLI_UNKNOWN.
+       for (lzma_filter *f = coder->filters;
+                       f->id != LZMA_VLI_UNKNOWN; ++f)
+               lzma_free(f->options, allocator);
+
+       // Index encoder and the Index itself
+       lzma_next_end(&coder->index_encoder, allocator);
+       lzma_index_end(coder->index, allocator);
+
+       // Synchronization objects of the main coder
+       mythread_cond_destroy(&coder->cond);
+       pthread_mutex_destroy(&coder->mutex);
+
+       lzma_free(coder, allocator);
+}
+
+
+/// Shared option parsing for stream_encoder_mt_init() and
+/// lzma_stream_encoder_mt_memusage().
+///
+/// On success *filters points to the filter chain to use (either the
+/// caller's chain or one built from the preset into *opt_easy),
+/// *block_size holds the Block size, and *outbuf_size_max holds the
+/// largest size that a single output buffer may need.
+///
+/// Returns LZMA_PROG_ERROR if options is NULL and LZMA_OPTIONS_ERROR
+/// if any of the values is unsupported.
+static lzma_ret
+get_options(const lzma_mt *options, lzma_options_easy *opt_easy,
+               const lzma_filter **filters, uint64_t *block_size,
+               uint64_t *outbuf_size_max)
+{
+       if (options == NULL)
+               return LZMA_PROG_ERROR;
+
+       // No flags are supported.
+       if (options->flags != 0)
+               return LZMA_OPTIONS_ERROR;
+
+       // The thread count has to be in the range [1, LZMA_THREADS_MAX].
+       if (options->threads == 0 || options->threads > LZMA_THREADS_MAX)
+               return LZMA_OPTIONS_ERROR;
+
+       // Filter chain: build it from the preset unless the caller
+       // has given an explicit chain.
+       if (options->filters == NULL) {
+               if (lzma_easy_preset(opt_easy, options->preset))
+                       return LZMA_OPTIONS_ERROR;
+
+               *filters = opt_easy->filters;
+       } else {
+               *filters = options->filters;
+       }
+
+       // Block size: an explicit value only needs a range check.
+       // Otherwise the size is derived from the filter chain.
+       if (options->block_size > 0) {
+               if (options->block_size > BLOCK_SIZE_MAX)
+                       return LZMA_OPTIONS_ERROR;
+
+               *block_size = options->block_size;
+       } else {
+               const uint64_t derived = lzma_mt_block_size(*filters);
+               *block_size = derived;
+               if (derived == 0)
+                       return LZMA_OPTIONS_ERROR;
+
+               assert(derived <= BLOCK_SIZE_MAX);
+       }
+
+       // The biggest amount of output that one output buffer may have
+       // to hold, which is the same as the maximum total size of
+       // a Block.
+       //
+       // FIXME: As long as the encoder keeps the whole input buffer
+       // available and doesn't start writing output before finishing
+       // the Block, it could use lzma_stream_buffer_bound() and use
+       // uncompressed LZMA2 chunks if the data doesn't compress.
+       const uint64_t size = *block_size;
+       *outbuf_size_max = size + size / 16 + 16384;
+
+       return LZMA_OK;
+}
+
+
+/// Initialize, or reinitialize, the threaded Stream encoder.
+///
+/// The options are validated first. The coder structure is allocated
+/// on first use and reused on later calls. Existing worker threads are
+/// kept if the requested thread count hasn't changed; otherwise they
+/// are ended and a new thread array is allocated. The output queue,
+/// the timeout, the copy of the filter chain, the Index, and the
+/// encoded Stream Header are then set up for a new Stream.
+///
+/// Returns LZMA_OK on success or the error code of the step that
+/// failed.
+static lzma_ret
+stream_encoder_mt_init(lzma_next_coder *next, lzma_allocator *allocator,
+               const lzma_mt *options)
+{
+       lzma_next_coder_init(&stream_encoder_mt_init, next, allocator);
+
+       // Get the filter chain.
+       lzma_options_easy easy;
+       const lzma_filter *filters;
+       uint64_t block_size;
+       uint64_t outbuf_size_max;
+       return_if_error(get_options(options, &easy, &filters,
+                       &block_size, &outbuf_size_max));
+
+#if SIZE_MAX < UINT64_MAX
+       if (block_size > SIZE_MAX)
+               return LZMA_MEM_ERROR;
+#endif
+
+       // FIXME TODO: Validate the filter chain so that we can give
+       // an error in this function instead of delaying it to the first
+       // call to lzma_code().
+
+       // Validate the Check ID.
+       if ((unsigned int)(options->check) > LZMA_CHECK_ID_MAX)
+               return LZMA_PROG_ERROR;
+
+       if (!lzma_check_is_supported(options->check))
+               return LZMA_UNSUPPORTED_CHECK;
+
+       // Allocate and initialize the base structure if needed.
+       if (next->coder == NULL) {
+               next->coder = lzma_alloc(sizeof(lzma_coder), allocator);
+               if (next->coder == NULL)
+                       return LZMA_MEM_ERROR;
+
+               // For the mutex and condition variable initializations
+               // the error handling has to be done here because
+               // stream_encoder_mt_end() doesn't know if they have
+               // already been initialized or not.
+               if (pthread_mutex_init(&next->coder->mutex, NULL)) {
+                       lzma_free(next->coder, allocator);
+                       next->coder = NULL;
+                       return LZMA_MEM_ERROR;
+               }
+
+               if (mythread_cond_init(&next->coder->cond)) {
+                       pthread_mutex_destroy(&next->coder->mutex);
+                       lzma_free(next->coder, allocator);
+                       next->coder = NULL;
+                       return LZMA_MEM_ERROR;
+               }
+
+               next->code = &stream_encode_mt;
+               next->end = &stream_encoder_mt_end;
+//             next->update = &stream_encoder_mt_update;
+
+               next->coder->filters[0].id = LZMA_VLI_UNKNOWN;
+               next->coder->index_encoder = LZMA_NEXT_CODER_INIT;
+               next->coder->index = NULL;
+               memzero(&next->coder->outq, sizeof(next->coder->outq));
+               next->coder->threads = NULL;
+               next->coder->threads_max = 0;
+               next->coder->threads_initialized = 0;
+       }
+
+       // Basic initializations
+       next->coder->sequence = SEQ_STREAM_HEADER;
+       next->coder->block_size = (size_t)(block_size);
+       next->coder->thread_error = LZMA_OK;
+       next->coder->thr = NULL;
+
+       // Allocate the thread-specific base structures.
+       assert(options->threads > 0);
+       if (next->coder->threads_max != options->threads) {
+               threads_end(next->coder, allocator);
+
+               next->coder->threads = NULL;
+               next->coder->threads_max = 0;
+
+               next->coder->threads_initialized = 0;
+               next->coder->threads_free = NULL;
+
+               next->coder->threads = lzma_alloc(
+                               options->threads * sizeof(worker_thread),
+                               allocator);
+               if (next->coder->threads == NULL)
+                       return LZMA_MEM_ERROR;
+
+               next->coder->threads_max = options->threads;
+       } else {
+               // Reuse the old structures and threads. Tell the running
+               // threads to stop and wait until they have stopped.
+               threads_stop(next->coder, true);
+       }
+
+       // Output queue
+       return_if_error(lzma_outq_init(&next->coder->outq, allocator,
+                       outbuf_size_max, options->threads));
+
+       // Timeout
+       if (options->timeout > 0) {
+               next->coder->wait_max.tv_sec = options->timeout / 1000;
+               next->coder->wait_max.tv_nsec
+                               = (options->timeout % 1000) * 1000000L;
+               next->coder->has_timeout = true;
+       } else {
+               next->coder->has_timeout = false;
+       }
+
+       // Free the old filter chain and copy the new one.
+       for (size_t i = 0; next->coder->filters[i].id != LZMA_VLI_UNKNOWN; ++i)
+               lzma_free(next->coder->filters[i].options, allocator);
+
+       // Copy the chain that get_options() resolved, not
+       // options->filters: when a preset is used, options->filters is
+       // NULL and the real chain lives in the local "easy" structure.
+       return_if_error(lzma_filters_copy(filters,
+                       next->coder->filters, allocator));
+
+       // Index
+       lzma_index_end(next->coder->index, allocator);
+       next->coder->index = lzma_index_init(allocator);
+       if (next->coder->index == NULL)
+               return LZMA_MEM_ERROR;
+
+       // Stream Header
+       next->coder->stream_flags.version = 0;
+       next->coder->stream_flags.check = options->check;
+       return_if_error(lzma_stream_header_encode(
+                       &next->coder->stream_flags, next->coder->header));
+
+       next->coder->header_pos = 0;
+
+       return LZMA_OK;
+}
+
+
+/// Public entry point: initialize strm as a multithreaded .xz Stream
+/// encoder using the given options.
+extern LZMA_API(lzma_ret)
+lzma_stream_encoder_mt(lzma_stream *strm, const lzma_mt *options)
+{
+       lzma_next_strm_init(stream_encoder_mt_init, strm, options);
+
+       // Only plain encoding and finishing are enabled for now.
+       // LZMA_SYNC_FLUSH, LZMA_FULL_FLUSH, and LZMA_FULL_BARRIER are
+       // intentionally left disabled:
+//     strm->internal->supported_actions[LZMA_SYNC_FLUSH] = true;
+//     strm->internal->supported_actions[LZMA_FULL_FLUSH] = true;
+//     strm->internal->supported_actions[LZMA_FULL_BARRIER] = true;
+       strm->internal->supported_actions[LZMA_FINISH] = true;
+       strm->internal->supported_actions[LZMA_RUN] = true;
+
+       return LZMA_OK;
+}
+
+
+// This function name is a monster but it's consistent with the older
+// monster names. :-( 31 chars is the max that C99 requires so in that
+// sense it's not too long. ;-)
+//
+// Calculates an estimate of the memory usage of the threaded encoder
+// for the given options: the base structures, one input buffer and one
+// filter chain encoder per thread, and the output queue. UINT64_MAX is
+// returned if the options are invalid or if the total doesn't fit into
+// uint64_t.
+extern LZMA_API(uint64_t)
+lzma_stream_encoder_mt_memusage(const lzma_mt *options)
+{
+       lzma_options_easy easy;
+       const lzma_filter *filters;
+       uint64_t block_size;
+       uint64_t outbuf_size_max;
+
+       if (get_options(options, &easy, &filters, &block_size,
+                       &outbuf_size_max) != LZMA_OK)
+               return UINT64_MAX;
+
+       // Memory usage of the input buffers
+       const uint64_t inbuf_memusage = options->threads * block_size;
+
+       // Memory usage of the filter encoders. Use the chain resolved by
+       // get_options(), not options->filters: when a preset is used,
+       // options->filters is NULL and the real chain is in "easy".
+       uint64_t filters_memusage = lzma_raw_encoder_memusage(filters);
+       if (filters_memusage == UINT64_MAX)
+               return UINT64_MAX;
+
+       filters_memusage *= options->threads;
+
+       // Memory usage of the output queue
+       const uint64_t outq_memusage = lzma_outq_memusage(
+                       outbuf_size_max, options->threads);
+       if (outq_memusage == UINT64_MAX)
+               return UINT64_MAX;
+
+       // Sum them with overflow checking.
+       uint64_t total_memusage = LZMA_MEMUSAGE_BASE + sizeof(lzma_coder)
+                       + options->threads * sizeof(worker_thread);
+
+       if (UINT64_MAX - total_memusage < inbuf_memusage)
+               return UINT64_MAX;
+
+       total_memusage += inbuf_memusage;
+
+       if (UINT64_MAX - total_memusage < filters_memusage)
+               return UINT64_MAX;
+
+       total_memusage += filters_memusage;
+
+       if (UINT64_MAX - total_memusage < outq_memusage)
+               return UINT64_MAX;
+
+       return total_memusage + outq_memusage;
+}