Skip to content

Commit

Permalink
Remove BinaryReader and BinaryWriter references from ZipArchive (#103153
Browse files Browse the repository at this point in the history
)

* Removed BinaryReader, BinaryWriter from ZipArchive and ZipBlocks
We now read the data into a (sometimes stack-allocated) byte array and slice it up with BinaryPrimitives.
This reduces the number of reads and writes required to read and write a ZipArchive. It also makes future work to enable async APIs easier, since BinaryReader and BinaryWriter lack this support.
Also changed approach to reading central file directory headers. Rather than performing X reads per header, we read 4KB of data at a time and look for all applicable headers in that data. This should improve performance when dealing with many small files.
* Removed BinaryReader from ZipArchiveEntry
This allowed the removal of the ArchiveReader property from ZipArchive.
* Removed BinaryWriter from ZipArchiveEntry
* Re-adding missing assertion
* Reduced memory usage with array pooling
Now pooling the file IO buffers and the temporary buffers for extra fields of the CD file header (which would otherwise be allocated and deallocated in a loop.)
* Corrected duplicate comment
* Added test for changed central directory read method
This handles 64x entries with 19-character filenames (and thus, 65-byte file headers.) As a result, it straddles two 4KB read buffers.
Also corrected the seek logic while reading the central directory header
* Test changes: formatting, converted Theory to Fact
* Resolving test failures
The buffer returned from the ArrayPool contained older data (including a ZIP header.) When reading the last chunk of the file (i.e a chunk which was less than BackwardsSeekingBufferSize) the buffer's Span wasn't resized to account for this.
SeekBackwardsToSignature would thus find the older data, try to seek beyond the end of the stream and fail to read the file.
* Responded to code review
Reads and writes are now performed using a new set of field lengths and locations.
* Code review response
Formatting change; added one comment to Zip64EndOfCentralDirectoryLocator.SignatureConstantBytes; clarified comment on ZipHelper.SeekBackwardsAndRead
* Additional code review comments
Added comment to SeekBackwardsToStream.
Lingering references to SignatureConstantBytes.Length.
Added two asserts to CanReadLargeCentralDirectoryHeader test, verifying that the archive entry metadata is in a sensible state.

---------

Co-authored-by: Carlos Sánchez López <[email protected]>
  • Loading branch information
edwardneal and carlossanlop authored Jan 23, 2025
1 parent 4dabc7b commit 77fc88a
Show file tree
Hide file tree
Showing 8 changed files with 1,031 additions and 401 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
<Compile Include="$(SharedOpenSourcePath)System\IO\Compression\ZipArchiveEntry.cs" />
<Compile Include="$(SharedOpenSourcePath)System\IO\Compression\ZipArchiveMode.cs" />
<Compile Include="$(SharedOpenSourcePath)System\IO\Compression\ZipBlocks.cs" />
<Compile Include="$(SharedOpenSourcePath)System\IO\Compression\ZipBlocks.FieldLengths.cs" />
<Compile Include="$(SharedOpenSourcePath)System\IO\Compression\ZipBlocks.FieldLocations.cs" />
<Compile Include="$(SharedOpenSourcePath)System\IO\Compression\ZipCustomStreams.cs" />
<Compile Include="$(SharedOpenSourcePath)System\IO\Compression\ZipHelper.cs" />
<Compile Include="$(SharedOpenSourcePath)System\IO\Compression\ZipVersion.cs" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using System.Collections.ObjectModel;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.InteropServices;
using System.Text;

namespace System.IO.Compression
Expand All @@ -16,16 +17,15 @@ public class ZipArchive : IDisposable
{
private readonly Stream _archiveStream;
private ZipArchiveEntry? _archiveStreamOwner;
private readonly BinaryReader? _archiveReader;
private readonly ZipArchiveMode _mode;
private readonly List<ZipArchiveEntry> _entries;
private readonly ReadOnlyCollection<ZipArchiveEntry> _entriesCollection;
private readonly Dictionary<string, ZipArchiveEntry> _entriesDictionary;
private bool _readEntries;
private readonly bool _leaveOpen;
private long _centralDirectoryStart; //only valid after ReadCentralDirectory
private long _centralDirectoryStart; // only valid after ReadCentralDirectory
private bool _isDisposed;
private uint _numberOfThisDisk; //only valid after ReadCentralDirectory
private uint _numberOfThisDisk; // only valid after ReadCentralDirectory
private long _expectedNumberOfEntries;
private readonly Stream? _backingStream;
private byte[] _archiveComment;
Expand Down Expand Up @@ -161,10 +161,6 @@ public ZipArchive(Stream stream, ZipArchiveMode mode, bool leaveOpen, Encoding?
else
_archiveStream = stream;
_archiveStreamOwner = null;
if (mode == ZipArchiveMode.Create)
_archiveReader = null;
else
_archiveReader = new BinaryReader(_archiveStream, Encoding.UTF8, leaveOpen: true);
_entries = new List<ZipArchiveEntry>();
_entriesCollection = new ReadOnlyCollection<ZipArchiveEntry>(_entries);
_entriesDictionary = new Dictionary<string, ZipArchiveEntry>();
Expand Down Expand Up @@ -351,8 +347,6 @@ public void Dispose()
return result;
}

internal BinaryReader? ArchiveReader => _archiveReader;

internal Stream ArchiveStream => _archiveStream;

internal uint NumberOfThisDisk => _numberOfThisDisk;
Expand Down Expand Up @@ -460,7 +454,6 @@ private void CloseStreams()
{
_archiveStream.Dispose();
_backingStream?.Dispose();
_archiveReader?.Dispose();
}
else
{
Expand All @@ -483,32 +476,90 @@ private void EnsureCentralDirectoryRead()

private void ReadCentralDirectory()
{
const int ReadBufferSize = 4096;

byte[] fileBuffer = System.Buffers.ArrayPool<byte>.Shared.Rent(ReadBufferSize);
Span<byte> fileBufferSpan = fileBuffer.AsSpan(0, ReadBufferSize);

try
{
// assume ReadEndOfCentralDirectory has been called and has populated _centralDirectoryStart

_archiveStream.Seek(_centralDirectoryStart, SeekOrigin.Begin);

long numberOfEntries = 0;

Debug.Assert(_archiveReader != null);
//read the central directory
ZipCentralDirectoryFileHeader currentHeader;
bool saveExtraFieldsAndComments = Mode == ZipArchiveMode.Update;
while (ZipCentralDirectoryFileHeader.TryReadBlock(_archiveReader,
saveExtraFieldsAndComments, out currentHeader))

bool continueReadingCentralDirectory = true;
// total bytes read from central directory
int bytesRead = 0;
// current position in the current buffer
int currPosition = 0;
// total bytes read from all file headers starting in the current buffer
int bytesConsumed = 0;

_entries.Clear();
_entriesDictionary.Clear();

// read the central directory
while (continueReadingCentralDirectory)
{
AddEntry(new ZipArchiveEntry(this, currentHeader));
numberOfEntries++;
int currBytesRead = _archiveStream.Read(fileBufferSpan);
ReadOnlySpan<byte> sizedFileBuffer = fileBufferSpan.Slice(0, currBytesRead);

// the buffer read must always be large enough to fit the constant section size of at least one header
continueReadingCentralDirectory = continueReadingCentralDirectory
&& sizedFileBuffer.Length >= ZipCentralDirectoryFileHeader.BlockConstantSectionSize;

while (continueReadingCentralDirectory
&& currPosition + ZipCentralDirectoryFileHeader.BlockConstantSectionSize < sizedFileBuffer.Length)
{
ZipCentralDirectoryFileHeader currentHeader = default;

continueReadingCentralDirectory = continueReadingCentralDirectory &&
ZipCentralDirectoryFileHeader.TryReadBlock(sizedFileBuffer.Slice(currPosition), _archiveStream,
saveExtraFieldsAndComments, out bytesConsumed, out currentHeader);

if (!continueReadingCentralDirectory)
{
break;
}

AddEntry(new ZipArchiveEntry(this, currentHeader));
numberOfEntries++;
if (numberOfEntries > _expectedNumberOfEntries)
{
throw new InvalidDataException(SR.NumEntriesWrong);
}

currPosition += bytesConsumed;
bytesRead += bytesConsumed;
}

// We've run out of possible space in the entry - seek backwards by the number of bytes remaining in
// this buffer (so that the next buffer overlaps with this one) and retry.
if (currPosition < sizedFileBuffer.Length)
{
_archiveStream.Seek(-(sizedFileBuffer.Length - currPosition), SeekOrigin.Current);
}
currPosition = 0;
}

if (numberOfEntries != _expectedNumberOfEntries)
{
throw new InvalidDataException(SR.NumEntriesWrong);
}

_archiveStream.Seek(_centralDirectoryStart + bytesRead, SeekOrigin.Begin);
}
catch (EndOfStreamException ex)
{
throw new InvalidDataException(SR.Format(SR.CentralDirectoryInvalid, ex));
}
finally
{
System.Buffers.ArrayPool<byte>.Shared.Return(fileBuffer);
}
}

// This function reads all the EOCD stuff it needs to find the offset to the start of the central directory
Expand All @@ -526,16 +577,15 @@ private void ReadEndOfCentralDirectory()
// If the EOCD has the minimum possible size (no zip file comment), then exactly the previous 4 bytes will contain the signature
// But if the EOCD has max possible size, the signature should be found somewhere in the previous 64K + 4 bytes
if (!ZipHelper.SeekBackwardsToSignature(_archiveStream,
ZipEndOfCentralDirectoryBlock.SignatureConstant,
ZipEndOfCentralDirectoryBlock.ZipFileCommentMaxLength + ZipEndOfCentralDirectoryBlock.SignatureSize))
ZipEndOfCentralDirectoryBlock.SignatureConstantBytes,
ZipEndOfCentralDirectoryBlock.ZipFileCommentMaxLength + ZipEndOfCentralDirectoryBlock.FieldLengths.Signature))
throw new InvalidDataException(SR.EOCDNotFound);

long eocdStart = _archiveStream.Position;

Debug.Assert(_archiveReader != null);
// read the EOCD
ZipEndOfCentralDirectoryBlock eocd;
bool eocdProper = ZipEndOfCentralDirectoryBlock.TryReadBlock(_archiveReader, out eocd);
bool eocdProper = ZipEndOfCentralDirectoryBlock.TryReadBlock(_archiveStream, out eocd);
Debug.Assert(eocdProper); // we just found this using the signature finder, so it should be okay

if (eocd.NumberOfThisDisk != eocd.NumberOfTheDiskWithTheStartOfTheCentralDirectory)
Expand Down Expand Up @@ -587,14 +637,12 @@ private void TryReadZip64EndOfCentralDirectory(ZipEndOfCentralDirectoryBlock eoc
// Exactly the previous 4 bytes should contain the Zip64-EOCDL signature
// if we don't find it, assume it doesn't exist and use data from normal EOCD
if (ZipHelper.SeekBackwardsToSignature(_archiveStream,
Zip64EndOfCentralDirectoryLocator.SignatureConstant,
Zip64EndOfCentralDirectoryLocator.SignatureSize))
Zip64EndOfCentralDirectoryLocator.SignatureConstantBytes,
Zip64EndOfCentralDirectoryLocator.FieldLengths.Signature))
{
Debug.Assert(_archiveReader != null);

// use locator to get to Zip64-EOCD
Zip64EndOfCentralDirectoryLocator locator;
bool zip64eocdLocatorProper = Zip64EndOfCentralDirectoryLocator.TryReadBlock(_archiveReader, out locator);
bool zip64eocdLocatorProper = Zip64EndOfCentralDirectoryLocator.TryReadBlock(_archiveStream, out locator);
Debug.Assert(zip64eocdLocatorProper); // we just found this using the signature finder, so it should be okay

if (locator.OffsetOfZip64EOCD > long.MaxValue)
Expand All @@ -607,7 +655,7 @@ private void TryReadZip64EndOfCentralDirectory(ZipEndOfCentralDirectoryBlock eoc
// Read Zip64 End of Central Directory Record

Zip64EndOfCentralDirectoryRecord record;
if (!Zip64EndOfCentralDirectoryRecord.TryReadBlock(_archiveReader, out record))
if (!Zip64EndOfCentralDirectoryRecord.TryReadBlock(_archiveStream, out record))
throw new InvalidDataException(SR.Zip64EOCDNotWhereExpected);

_numberOfThisDisk = record.NumberOfThisDisk;
Expand Down
Loading

0 comments on commit 77fc88a

Please sign in to comment.