Find zip file end of central directory backwards up to max possible size (resubmit...
authorCarlos Sanchez Lopez <1175054+carlossanlop@users.noreply.github.com>
Tue, 1 Oct 2019 16:10:58 +0000 (09:10 -0700)
committerGitHub <noreply@github.com>
Tue, 1 Oct 2019 16:10:58 +0000 (09:10 -0700)
Issue
Huge zip files without an end of central directory take a long time to throw because we seek to the end then search backwards for the EOCD signature all the way to the end.

Fix
Ensure we only search for the signature from the end within the maximum possible number of bytes where it can be found. If the signature is not found, we fail faster.
Submitting again due to build break caused by a conflict in the files modified by this commit.

Note
This may be a breaking change for the edge case where a zip file contains an EOCD comment with the maximum length + additional trailing garbage.

Commit migrated from https://github.com/dotnet/corefx/commit/861259bf87d8e4ee2c4f8dab31b73d377faff77d

src/libraries/System.IO.Compression/src/System/IO/Compression/ZipArchive.cs
src/libraries/System.IO.Compression/src/System/IO/Compression/ZipBlocks.cs
src/libraries/System.IO.Compression/src/System/IO/Compression/ZipHelper.cs
src/libraries/System.IO.Compression/tests/ZipArchive/zip_InvalidParametersAndStrangeFiles.cs

index a62b274..c6a91b5 100644 (file)
@@ -525,9 +525,15 @@ namespace System.IO.Compression
         {
             try
             {
-                // this seeks to the start of the end of central directory record
+                // This seeks backwards almost to the beginning of the EOCD, one byte after where the signature would be
+                // located if the EOCD had the minimum possible size (no file zip comment)
                 _archiveStream.Seek(-ZipEndOfCentralDirectoryBlock.SizeOfBlockWithoutSignature, SeekOrigin.End);
-                if (!ZipHelper.SeekBackwardsToSignature(_archiveStream, ZipEndOfCentralDirectoryBlock.SignatureConstant))
+
+                // If the EOCD has the minimum possible size (no zip file comment), then exactly the previous 4 bytes will contain the signature
+                // But if the EOCD has max possible size, the signature should be found somewhere in the previous 64K + 4 bytes
+                if (!ZipHelper.SeekBackwardsToSignature(_archiveStream,
+                        ZipEndOfCentralDirectoryBlock.SignatureConstant,
+                        ZipEndOfCentralDirectoryBlock.ZipFileCommentMaxLength + ZipEndOfCentralDirectoryBlock.SignatureSize))
                     throw new InvalidDataException(SR.EOCDNotFound);
 
                 long eocdStart = _archiveStream.Position;
@@ -543,56 +549,17 @@ namespace System.IO.Compression
 
                 _numberOfThisDisk = eocd.NumberOfThisDisk;
                 _centralDirectoryStart = eocd.OffsetOfStartOfCentralDirectoryWithRespectToTheStartingDiskNumber;
+
                 if (eocd.NumberOfEntriesInTheCentralDirectory != eocd.NumberOfEntriesInTheCentralDirectoryOnThisDisk)
                     throw new InvalidDataException(SR.SplitSpanned);
+
                 _expectedNumberOfEntries = eocd.NumberOfEntriesInTheCentralDirectory;
 
                 // only bother saving the comment if we are in update mode
                 if (_mode == ZipArchiveMode.Update)
                     _archiveComment = eocd.ArchiveComment;
 
-                // only bother looking for zip64 EOCD stuff if we suspect it is needed because some value is FFFFFFFFF
-                // because these are the only two values we need, we only worry about these
-                // if we don't find the zip64 EOCD, we just give up and try to use the original values
-                if (eocd.NumberOfThisDisk == ZipHelper.Mask16Bit ||
-                    eocd.OffsetOfStartOfCentralDirectoryWithRespectToTheStartingDiskNumber == ZipHelper.Mask32Bit ||
-                    eocd.NumberOfEntriesInTheCentralDirectory == ZipHelper.Mask16Bit)
-                {
-                    // we need to look for zip 64 EOCD stuff
-                    // seek to the zip 64 EOCD locator
-                    _archiveStream.Seek(eocdStart - Zip64EndOfCentralDirectoryLocator.SizeOfBlockWithoutSignature, SeekOrigin.Begin);
-                    // if we don't find it, assume it doesn't exist and use data from normal eocd
-                    if (ZipHelper.SeekBackwardsToSignature(_archiveStream, Zip64EndOfCentralDirectoryLocator.SignatureConstant))
-                    {
-                        // use locator to get to Zip64EOCD
-                        Zip64EndOfCentralDirectoryLocator locator;
-                        bool zip64eocdLocatorProper = Zip64EndOfCentralDirectoryLocator.TryReadBlock(_archiveReader, out locator);
-                        Debug.Assert(zip64eocdLocatorProper); // we just found this using the signature finder, so it should be okay
-
-                        if (locator.OffsetOfZip64EOCD > long.MaxValue)
-                            throw new InvalidDataException(SR.FieldTooBigOffsetToZip64EOCD);
-                        long zip64EOCDOffset = (long)locator.OffsetOfZip64EOCD;
-
-                        _archiveStream.Seek(zip64EOCDOffset, SeekOrigin.Begin);
-
-                        // read Zip64EOCD
-                        Zip64EndOfCentralDirectoryRecord record;
-                        if (!Zip64EndOfCentralDirectoryRecord.TryReadBlock(_archiveReader, out record))
-                            throw new InvalidDataException(SR.Zip64EOCDNotWhereExpected);
-
-                        _numberOfThisDisk = record.NumberOfThisDisk;
-
-                        if (record.NumberOfEntriesTotal > long.MaxValue)
-                            throw new InvalidDataException(SR.FieldTooBigNumEntries);
-                        if (record.OffsetOfCentralDirectory > long.MaxValue)
-                            throw new InvalidDataException(SR.FieldTooBigOffsetToCD);
-                        if (record.NumberOfEntriesTotal != record.NumberOfEntriesOnThisDisk)
-                            throw new InvalidDataException(SR.SplitSpanned);
-
-                        _expectedNumberOfEntries = (long)record.NumberOfEntriesTotal;
-                        _centralDirectoryStart = (long)record.OffsetOfCentralDirectory;
-                    }
-                }
+                TryReadZip64EndOfCentralDirectory(eocd, eocdStart);
 
                 if (_centralDirectoryStart > _archiveStream.Length)
                 {
@@ -609,6 +576,65 @@ namespace System.IO.Compression
             }
         }
 
+        // Tries to find the Zip64 End of Central Directory Locator, then the Zip64 End of Central Directory, assuming the
+        // End of Central Directory block has already been found, as well as the location in the stream where the EOCD starts.
+        private void TryReadZip64EndOfCentralDirectory(ZipEndOfCentralDirectoryBlock eocd, long eocdStart)
+        {
+            // Only bother looking for the Zip64-EOCD stuff if we suspect it is needed because some value is FFFFFFFFF
+            // because these are the only two values we need, we only worry about these
+            // if we don't find the Zip64-EOCD, we just give up and try to use the original values
+            if (eocd.NumberOfThisDisk == ZipHelper.Mask16Bit ||
+                eocd.OffsetOfStartOfCentralDirectoryWithRespectToTheStartingDiskNumber == ZipHelper.Mask32Bit ||
+                eocd.NumberOfEntriesInTheCentralDirectory == ZipHelper.Mask16Bit)
+            {
+                // Read Zip64 End of Central Directory Locator
+
+                // This seeks forwards almost to the beginning of the Zip64-EOCDL, one byte after where the signature would be located
+                _archiveStream.Seek(eocdStart - Zip64EndOfCentralDirectoryLocator.SizeOfBlockWithoutSignature, SeekOrigin.Begin);
+
+                // Exactly the previous 4 bytes should contain the Zip64-EOCDL signature
+                // if we don't find it, assume it doesn't exist and use data from normal EOCD
+                if (ZipHelper.SeekBackwardsToSignature(_archiveStream,
+                        Zip64EndOfCentralDirectoryLocator.SignatureConstant,
+                        Zip64EndOfCentralDirectoryLocator.SignatureSize))
+                {
+                    Debug.Assert(_archiveReader != null);
+
+                    // use locator to get to Zip64-EOCD
+                    Zip64EndOfCentralDirectoryLocator locator;
+                    bool zip64eocdLocatorProper = Zip64EndOfCentralDirectoryLocator.TryReadBlock(_archiveReader, out locator);
+                    Debug.Assert(zip64eocdLocatorProper); // we just found this using the signature finder, so it should be okay
+
+                    if (locator.OffsetOfZip64EOCD > long.MaxValue)
+                        throw new InvalidDataException(SR.FieldTooBigOffsetToZip64EOCD);
+
+                    long zip64EOCDOffset = (long)locator.OffsetOfZip64EOCD;
+
+                    _archiveStream.Seek(zip64EOCDOffset, SeekOrigin.Begin);
+
+                    // Read Zip64 End of Central Directory Record
+
+                    Zip64EndOfCentralDirectoryRecord record;
+                    if (!Zip64EndOfCentralDirectoryRecord.TryReadBlock(_archiveReader, out record))
+                        throw new InvalidDataException(SR.Zip64EOCDNotWhereExpected);
+
+                    _numberOfThisDisk = record.NumberOfThisDisk;
+
+                    if (record.NumberOfEntriesTotal > long.MaxValue)
+                        throw new InvalidDataException(SR.FieldTooBigNumEntries);
+
+                    if (record.OffsetOfCentralDirectory > long.MaxValue)
+                        throw new InvalidDataException(SR.FieldTooBigOffsetToCD);
+
+                    if (record.NumberOfEntriesTotal != record.NumberOfEntriesOnThisDisk)
+                        throw new InvalidDataException(SR.SplitSpanned);
+
+                    _expectedNumberOfEntries = (long)record.NumberOfEntriesTotal;
+                    _centralDirectoryStart = (long)record.OffsetOfCentralDirectory;
+                }
+            }
+        }
+
         private void WriteFile()
         {
             // if we are in create mode, we always set readEntries to true in Init
index bad189c..20fb1e7 100644 (file)
@@ -289,6 +289,8 @@ namespace System.IO.Compression
     internal struct Zip64EndOfCentralDirectoryLocator
     {
         public const uint SignatureConstant = 0x07064B50;
+        public const int SignatureSize = sizeof(uint);
+
         public const int SizeOfBlockWithoutSignature = 16;
 
         public uint NumberOfDiskWithZip64EOCD;
@@ -643,7 +645,16 @@ namespace System.IO.Compression
     internal struct ZipEndOfCentralDirectoryBlock
     {
         public const uint SignatureConstant = 0x06054B50;
+        public const int SignatureSize = sizeof(uint);
+
+        // This is the minimum possible size, assuming the zip file comments variable section is empty
         public const int SizeOfBlockWithoutSignature = 18;
+
+        // The end of central directory can have a variable size zip file comment at the end, but its max length can be 64K
+        // The Zip File Format Specification does not explicitly mention a max size for this field, but we are assuming this
+        // max size because that is the maximum value an ushort can hold.
+        public const int ZipFileCommentMaxLength = ushort.MaxValue;
+
         public uint Signature;
         public ushort NumberOfThisDisk;
         public ushort NumberOfTheDiskWithTheStartOfTheCentralDirectory;
@@ -673,7 +684,7 @@ namespace System.IO.Compression
             writer.Write(startOfCentralDirectoryTruncated);
 
             // Should be valid because of how we read archiveComment in TryReadBlock:
-            Debug.Assert((archiveComment == null) || (archiveComment.Length < ushort.MaxValue));
+            Debug.Assert((archiveComment == null) || (archiveComment.Length <= ZipFileCommentMaxLength));
 
             writer.Write(archiveComment != null ? (ushort)archiveComment.Length : (ushort)0); // zip file comment length
             if (archiveComment != null)
index f8e3a3e..22664cb 100644 (file)
@@ -102,11 +102,15 @@ namespace System.IO.Compression
             return (uint)ret;
         }
 
-        // assumes all bytes of signatureToFind are non zero, looks backwards from current position in stream,
+        // Assumes all bytes of signatureToFind are non zero, looks backwards from current position in stream,
+        // assumes maxBytesToRead is positive, ensures to not read beyond the provided max number of bytes,
         // if the signature is found then returns true and positions stream at first byte of signature
         // if the signature is not found, returns false
-        internal static bool SeekBackwardsToSignature(Stream stream, uint signatureToFind)
+        internal static bool SeekBackwardsToSignature(Stream stream, uint signatureToFind, int maxBytesToRead)
         {
+            Debug.Assert(signatureToFind != 0);
+            Debug.Assert(maxBytesToRead > 0);
+
             int bufferPointer = 0;
             uint currentSignature = 0;
             byte[] buffer = new byte[BackwardsSeekingBufferSize];
@@ -114,7 +118,8 @@ namespace System.IO.Compression
             bool outOfBytes = false;
             bool signatureFound = false;
 
-            while (!signatureFound && !outOfBytes)
+            int bytesRead = 0;
+            while (!signatureFound && !outOfBytes && bytesRead <= maxBytesToRead)
             {
                 outOfBytes = SeekBackwardsAndRead(stream, buffer, out bufferPointer);
 
@@ -132,6 +137,8 @@ namespace System.IO.Compression
                         bufferPointer--;
                     }
                 }
+
+                bytesRead += buffer.Length;
             }
 
             if (!signatureFound)
index cff07e0..b08064f 100644 (file)
@@ -3,12 +3,9 @@
 // See the LICENSE file in the project root for more information.
 
 using System.Collections.Generic;
-using System.Diagnostics;
-using System.IO;
 using System.Text;
 using System.Threading.Tasks;
 using Xunit;
-using Xunit.Sdk;
 
 namespace System.IO.Compression.Tests
 {
@@ -760,5 +757,94 @@ namespace System.IO.Compression.Tests
                 }
             }
         }
+
+        /// <summary>
+        /// Opens an empty file that has a 64KB EOCD comment.
+        /// Adds two 64KB text entries. Verifies they can be read correctly.
+        /// Appends 64KB of garbage at the end of the file. Verifies we throw.
+        /// Prepends 64KB of garbage at the beginning of the file. Verifies we throw.
+        /// </summary>
+        [Fact]
+        public static void ReadArchive_WithEOCDComment_TrailingPrecedingGarbage()
+        {
+            void InsertEntry(ZipArchive archive, string name, string contents)
+            {
+                ZipArchiveEntry entry = archive.CreateEntry(name);
+                using (StreamWriter writer = new StreamWriter(entry.Open()))
+                {
+                    writer.WriteLine(contents);
+                }
+            }
+
+            int GetEntryContentsLength(ZipArchiveEntry entry)
+            {
+                int length = 0;
+                using (Stream stream = entry.Open())
+                {
+                    using (var reader = new StreamReader(stream))
+                    {
+                        length = reader.ReadToEnd().Length;
+                    }
+                }
+                return length;
+            }
+
+            void VerifyValidEntry(ZipArchiveEntry entry, string expectedName, int expectedMinLength)
+            {
+                Assert.NotNull(entry);
+                Assert.Equal(expectedName, entry.Name);
+                // The file has a few more bytes, but should be at least as large as its contents
+                Assert.True(GetEntryContentsLength(entry) >= expectedMinLength);
+            }
+
+            string name0 = "huge0.txt";
+            string name1 = "huge1.txt";
+            string str64KB = new string('x', ushort.MaxValue);
+            byte[] byte64KB = Text.Encoding.ASCII.GetBytes(str64KB);
+
+            // Open empty file with 64KB EOCD comment
+            string path = strange("extradata/emptyWith64KBComment.zip");
+            using (MemoryStream archiveStream = StreamHelpers.CreateTempCopyStream(path).Result)
+            {
+                // Insert 2 64KB txt entries
+                using (ZipArchive archive = new ZipArchive(archiveStream, ZipArchiveMode.Update, leaveOpen: true))
+                {
+                    InsertEntry(archive, name0, str64KB);
+                    InsertEntry(archive, name1, str64KB);
+                }
+
+                // Open and verify items
+                archiveStream.Seek(0, SeekOrigin.Begin);
+                using (ZipArchive archive = new ZipArchive(archiveStream, ZipArchiveMode.Read, leaveOpen: true))
+                {
+                    Assert.Equal(2, archive.Entries.Count);
+                    VerifyValidEntry(archive.Entries[0], name0, ushort.MaxValue);
+                    VerifyValidEntry(archive.Entries[1], name1, ushort.MaxValue);
+                }
+
+                // Append 64KB of garbage
+                archiveStream.Seek(0, SeekOrigin.End);
+                archiveStream.Write(byte64KB, 0, byte64KB.Length);
+
+                // Open should not be possible because we can't find the EOCD in the max search length from the end
+                Assert.Throws<InvalidDataException>(() =>
+                {
+                    ZipArchive archive = new ZipArchive(archiveStream, ZipArchiveMode.Read, leaveOpen: true);
+                });
+
+                // Create stream with 64KB of prepended garbage, then the above stream appended
+                // Attempting to create a ZipArchive should fail: no EOCD found
+                using (MemoryStream prependStream = new MemoryStream())
+                {
+                    prependStream.Write(byte64KB, 0, byte64KB.Length);
+                    archiveStream.WriteTo(prependStream);
+
+                    Assert.Throws<InvalidDataException>(() =>
+                    {
+                        ZipArchive archive = new ZipArchive(prependStream, ZipArchiveMode.Read);
+                    });
+                }
+            }
+        }
     }
 }