From: Fangrui Song
Date: Wed, 21 Sep 2022 18:13:02 +0000 (-0700)
Subject: [ELF] Parallelize --compress-debug-sections=zstd
X-Git-Tag: upstream/17.0.6~32861
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=fa74144c64dff6b145b0b3fa9397f913ddaa87bf;p=platform%2Fupstream%2Fllvm.git

[ELF] Parallelize --compress-debug-sections=zstd

See D117853: compressing debug sections is a bottleneck, so there is
significant value in parallelizing the step. zstd provides a
multi-threading API, and its output is deterministic even with different
numbers of threads (see https://github.com/facebook/zstd/issues/2238), so
we can leverage it instead of the pigz-style sharding approach.

Also, switch to the default compression level 3. The current level 5 is
significantly slower without providing a size benefit that justifies the
cost.

```
'dash b.sh 1' ran
  1.05 ± 0.01 times faster than 'dash b.sh 3'
  1.18 ± 0.01 times faster than 'dash b.sh 4'
  1.29 ± 0.02 times faster than 'dash b.sh 5'

level=1 size: 358946945
level=3 size: 309002145
level=4 size: 307693204
level=5 size: 297828315
```

Reviewed By: andrewng, peter.smith

Differential Revision: https://reviews.llvm.org/D133679
---
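For reference (not part of the patch): zstd's multi-threaded compression is driven through its streaming API, where a worker count is set on the ZSTD_CCtx and the input is fed through ZSTD_compressStream2 until the frame is fully flushed. Below is a minimal standalone sketch of that pattern; the helper name compressWithWorkers and the ZSTD_compressBound-sized output buffer are illustrative choices, not lld code (the patch instead starts its output shard at half the input size and grows it by 1.5x when the output buffer fills).

```
// Minimal sketch (illustrative, not lld code) of multi-threaded zstd
// streaming compression. Requires a zstd build with ZSTD_MULTITHREAD for
// nbWorkers to take effect; otherwise compression runs on a single thread.
#include <zstd.h>
#include <cassert>
#include <cstddef>
#include <vector>

std::vector<char> compressWithWorkers(const char *in, size_t size, int nbWorkers) {
  ZSTD_CCtx *cctx = ZSTD_createCCtx();
  // nbWorkers >= 1 enables multi-threaded compression; per the note above,
  // the compressed output does not depend on the number of workers.
  ZSTD_CCtx_setParameter(cctx, ZSTD_c_nbWorkers, nbWorkers);
  // Worst-case output size, so this sketch never needs to grow the buffer.
  std::vector<char> out(ZSTD_compressBound(size));
  ZSTD_outBuffer zob = {out.data(), out.size(), 0};
  ZSTD_inBuffer zib = {in, size, 0};
  size_t remaining;
  do {
    // ZSTD_e_end consumes the remaining input, flushes, and writes the frame
    // epilogue; the return value is the number of bytes still pending
    // (0 once the frame is complete).
    remaining = ZSTD_compressStream2(cctx, &zob, &zib, ZSTD_e_end);
    assert(!ZSTD_isError(remaining));
  } while (remaining != 0);
  ZSTD_freeCCtx(cctx);
  out.resize(zob.pos);
  return out;
}
```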
diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
index 517e815..42af115 100644
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -24,6 +24,9 @@
 #if LLVM_ENABLE_ZLIB
 #include <zlib.h>
 #endif
+#if LLVM_ENABLE_ZSTD
+#include <zstd.h>
+#endif
 
 using namespace llvm;
 using namespace llvm::dwarf;
@@ -331,25 +334,60 @@ template <class ELFT> void OutputSection::maybeCompress() {
   llvm::TimeTraceScope timeScope("Compress debug sections");
   compressed.uncompressedSize = size;
   auto buf = std::make_unique<uint8_t[]>(size);
+  // Write uncompressed data to a temporary zero-initialized buffer.
+  {
+    parallel::TaskGroup tg;
+    writeTo<ELFT>(buf.get(), tg);
+  }
+
+#if LLVM_ENABLE_ZSTD
+  // Use ZSTD's streaming compression API which permits parallel workers working
+  // on the stream. See http://facebook.github.io/zstd/zstd_manual.html
+  // "Streaming compression - HowTo".
   if (config->compressDebugSections == DebugCompressionType::Zstd) {
-    {
-      parallel::TaskGroup tg;
-      writeTo<ELFT>(buf.get(), tg);
-    }
+    // Allocate a buffer of half of the input size, and grow it by 1.5x if
+    // insufficient.
     compressed.shards = std::make_unique<SmallVector<uint8_t, 0>[]>(1);
-    compression::zstd::compress(makeArrayRef(buf.get(), size),
-                                compressed.shards[0]);
-    size = sizeof(Elf_Chdr) + compressed.shards[0].size();
+    SmallVector<uint8_t, 0> &out = compressed.shards[0];
+    out.resize_for_overwrite(std::max<size_t>(size / 2, 32));
+    size_t pos = 0;
+
+    ZSTD_CCtx *cctx = ZSTD_createCCtx();
+    size_t ret = ZSTD_CCtx_setParameter(
+        cctx, ZSTD_c_nbWorkers, parallel::strategy.compute_thread_count());
+    if (ZSTD_isError(ret))
+      fatal(Twine("ZSTD_CCtx_setParameter: ") + ZSTD_getErrorName(ret));
+    ZSTD_outBuffer zob = {out.data(), out.size(), 0};
+    ZSTD_EndDirective directive = ZSTD_e_continue;
+    const size_t blockSize = ZSTD_CStreamInSize();
+    do {
+      const size_t n = std::min(size - pos, blockSize);
+      if (n == size - pos)
+        directive = ZSTD_e_end;
+      ZSTD_inBuffer zib = {buf.get() + pos, n, 0};
+      size_t bytesRemaining = 0;
+      while (zib.pos != zib.size ||
+             (directive == ZSTD_e_end && bytesRemaining != 0)) {
+        if (zob.pos == zob.size) {
+          out.resize_for_overwrite(out.size() * 3 / 2);
+          zob.dst = out.data();
+          zob.size = out.size();
+        }
+        bytesRemaining = ZSTD_compressStream2(cctx, &zob, &zib, directive);
+        assert(!ZSTD_isError(bytesRemaining));
+      }
+      pos += n;
+    } while (directive != ZSTD_e_end);
+    out.resize(zob.pos);
+    ZSTD_freeCCtx(cctx);
+
+    size = sizeof(Elf_Chdr) + out.size();
     flags |= SHF_COMPRESSED;
     return;
   }
+#endif
 
 #if LLVM_ENABLE_ZLIB
-  // Write uncompressed data to a temporary zero-initialized buffer.
-  {
-    parallel::TaskGroup tg;
-    writeTo<ELFT>(buf.get(), tg);
-  }
   // We chose 1 (Z_BEST_SPEED) as the default compression level because it is
   // the fastest. If -O2 is given, we use level 6 to compress debug info more by
   // ~15%. We found that level 7 to 9 doesn't make much difference (~1% more