From f9dfd58fd69108c2bd0857b734e84844170e888c Mon Sep 17 00:00:00 2001 From: Matthew Holt Date: Fri, 8 Nov 2024 10:34:42 -0700 Subject: [PATCH] Refactor and simplify interfaces Split Archival into Archival/Extraction since some archive formats can't do both. Rar is proprietary for creating, and there's no pure-Go 7z writing implementation that I know of. - Extractor no longer requires a filename filter (kind of pointless at best, confusing at worst) - CompressedArchive renamed to Archive - Archival is now just creating archives - New Extraction interface is for reading archives - Archive format can compose compression, archival, and extraction --- 7z.go | 13 +++--- formats.go | 116 +++++++++++++++++++++++++++--------------------- formats_test.go | 4 +- fs.go | 24 +++++----- interfaces.go | 16 +++---- rar.go | 13 +++--- tar.go | 5 +-- zip.go | 5 +-- 8 files changed, 98 insertions(+), 98 deletions(-) diff --git a/7z.go b/7z.go index 4a3dbd4a..06e4dd17 100644 --- a/7z.go +++ b/7z.go @@ -51,10 +51,7 @@ func (z SevenZip) Match(_ context.Context, filename string, stream io.Reader) (M return mr, nil } -// Archive is not implemented for 7z, but the method exists so that SevenZip satisfies the ArchiveFormat interface. -func (z SevenZip) Archive(_ context.Context, _ io.Writer, _ []FileInfo) error { - return fmt.Errorf("not implemented for 7z because there is no pure Go implementation found") -} +// Archive is not implemented for 7z because I do not know of a pure-Go 7z writer. // Extract extracts files from z, implementing the Extractor interface. Uniquely, however, // sourceArchive must be an io.ReaderAt and io.Seeker, which are oddly disjoint interfaces @@ -62,7 +59,7 @@ func (z SevenZip) Archive(_ context.Context, _ io.Writer, _ []FileInfo) error { // the interface because we figure you can Read() from anything you can ReadAt() or Seek() // with. Due to the nature of the zip archive format, if sourceArchive is not an io.Seeker // and io.ReaderAt, an error is returned. -func (z SevenZip) Extract(ctx context.Context, sourceArchive io.Reader, pathsInArchive []string, handleFile FileHandler) error { +func (z SevenZip) Extract(ctx context.Context, sourceArchive io.Reader, handleFile FileHandler) error { sra, ok := sourceArchive.(seekReaderAt) if !ok { return fmt.Errorf("input type must be an io.ReaderAt and io.Seeker because of zip format constraints") @@ -87,9 +84,6 @@ func (z SevenZip) Extract(ctx context.Context, sourceArchive io.Reader, pathsInA return err // honor context cancellation } - if !fileIsIncluded(pathsInArchive, f.Name) { - continue - } if fileIsIncluded(skipDirs, f.Name) { continue } @@ -130,3 +124,6 @@ func (z SevenZip) Extract(ctx context.Context, sourceArchive io.Reader, pathsInA // https://py7zr.readthedocs.io/en/latest/archive_format.html#signature var sevenZipHeader = []byte("7z\xBC\xAF\x27\x1C") + +// Interface guard +var _ Extractor = SevenZip{} diff --git a/formats.go b/formats.go index 24865fea..837114d9 100644 --- a/formats.go +++ b/formats.go @@ -42,13 +42,14 @@ func RegisterFormat(format Format) { func Identify(ctx context.Context, filename string, stream io.Reader) (Format, io.Reader, error) { var compression Compression var archival Archival + var extraction Extraction rewindableStream, err := newRewindReader(stream) if err != nil { return nil, nil, err } - // try compression format first, since that's the outer "layer" + // try compression format first, since that's the outer "layer" if combined for name, format := range formats { cf, isCompression := format.(Compression) if !isCompression { @@ -68,10 +69,11 @@ func Identify(ctx context.Context, filename string, stream io.Reader) (Format, i } } - // try archive format next + // try archival and extraction format next for name, format := range formats { - af, isArchive := format.(Archival) - if !isArchive { + ar, isArchive := format.(Archival) + ex, isExtract := format.(Extraction) + if !isArchive && !isExtract { continue } @@ -81,20 +83,23 @@ func Identify(ctx context.Context, filename string, stream io.Reader) (Format, i } if matchResult.Matched() { - archival = af + archival = ar + extraction = ex break } } - // the stream should be rewound by identifyOne + // the stream should be rewound by identifyOne; then return the most specific type of match bufferedStream := rewindableStream.reader() switch { - case compression != nil && archival == nil: + case compression != nil && archival == nil && extraction == nil: return compression, bufferedStream, nil - case compression == nil && archival != nil: + case compression == nil && archival != nil && extraction == nil: return archival, bufferedStream, nil - case compression != nil && archival != nil: - return CompressedArchive{compression, archival}, bufferedStream, nil + case compression == nil && archival == nil && extraction != nil: + return extraction, bufferedStream, nil + case archival != nil || extraction != nil: + return Archive{compression, archival, extraction}, bufferedStream, nil default: return nil, bufferedStream, NoMatch } @@ -161,44 +166,44 @@ func readAtMost(stream io.Reader, n int) ([]byte, error) { return nil, err } -// CompressedArchive combines a compression format on top of an archive -// format (e.g. "tar.gz") and provides both functionalities in a single -// type. It ensures that archive functions are wrapped by compressors and +// Archive represents an archive which may be compressed at the outer layer. +// It combines a compression format on top of an archive/extraction +// format (e.g. ".tar.gz") and provides both functionalities in a single +// type. It ensures that archival functions are wrapped by compressors and // decompressors. However, compressed archives have some limitations; for // example, files cannot be inserted/appended because of complexities with // modifying existing compression state (perhaps this could be overcome, // but I'm not about to try it). // -// As this type is intended to compose compression and archive formats, -// both must be specified in order for this value to be valid, or its -// methods will return errors. -type CompressedArchive struct { +// The embedded Archival and Extraction values are used for writing and +// reading, respectively. Compression is optional and is only needed if the +// format is compressed externally (for example, tar archives). +type Archive struct { Compression Archival + Extraction } -// Name returns a concatenation of the archive format name -// and the compression format name. -func (caf CompressedArchive) Extension() string { - if caf.Compression == nil && caf.Archival == nil { - panic("missing both compression and archive formats") - } +// Name returns a concatenation of the archive and compression format extensions. +func (ar Archive) Extension() string { var name string - if caf.Archival != nil { - name += caf.Archival.Extension() + if ar.Archival != nil { + name += ar.Archival.Extension() + } else if ar.Extraction != nil { + name += ar.Extraction.Extension() } - if caf.Compression != nil { - name += caf.Compression.Extension() + if ar.Compression != nil { + name += ar.Compression.Extension() } return name } -// Match matches if the input matches both the compression and archive format. -func (caf CompressedArchive) Match(ctx context.Context, filename string, stream io.Reader) (MatchResult, error) { +// Match matches if the input matches both the compression and archival/extraction format. +func (ar Archive) Match(ctx context.Context, filename string, stream io.Reader) (MatchResult, error) { var conglomerate MatchResult - if caf.Compression != nil { - matchResult, err := caf.Compression.Match(ctx, filename, stream) + if ar.Compression != nil { + matchResult, err := ar.Compression.Match(ctx, filename, stream) if err != nil { return MatchResult{}, err } @@ -208,7 +213,7 @@ func (caf CompressedArchive) Match(ctx context.Context, filename string, stream // wrap the reader with the decompressor so we can // attempt to match the archive by reading the stream - rc, err := caf.Compression.OpenReader(stream) + rc, err := ar.Compression.OpenReader(stream) if err != nil { return matchResult, err } @@ -218,8 +223,8 @@ func (caf CompressedArchive) Match(ctx context.Context, filename string, stream conglomerate = matchResult } - if caf.Archival != nil { - matchResult, err := caf.Archival.Match(ctx, filename, stream) + if ar.Archival != nil { + matchResult, err := ar.Archival.Match(ctx, filename, stream) if err != nil { return MatchResult{}, err } @@ -234,26 +239,32 @@ func (caf CompressedArchive) Match(ctx context.Context, filename string, stream } // Archive adds files to the output archive while compressing the result. -func (caf CompressedArchive) Archive(ctx context.Context, output io.Writer, files []FileInfo) error { - if caf.Compression != nil { - wc, err := caf.Compression.OpenWriter(output) +func (ar Archive) Archive(ctx context.Context, output io.Writer, files []FileInfo) error { + if ar.Archival == nil { + return fmt.Errorf("no archival format") + } + if ar.Compression != nil { + wc, err := ar.Compression.OpenWriter(output) if err != nil { return err } defer wc.Close() output = wc } - return caf.Archival.Archive(ctx, output, files) + return ar.Archival.Archive(ctx, output, files) } // ArchiveAsync adds files to the output archive while compressing the result asynchronously. -func (caf CompressedArchive) ArchiveAsync(ctx context.Context, output io.Writer, jobs <-chan ArchiveAsyncJob) error { - do, ok := caf.Archival.(ArchiverAsync) +func (ar Archive) ArchiveAsync(ctx context.Context, output io.Writer, jobs <-chan ArchiveAsyncJob) error { + if ar.Archival == nil { + return fmt.Errorf("no archival format") + } + do, ok := ar.Archival.(ArchiverAsync) if !ok { - return fmt.Errorf("%s archive does not support async writing", caf.Extension()) + return fmt.Errorf("%T archive does not support async writing", ar.Archival) } - if caf.Compression != nil { - wc, err := caf.Compression.OpenWriter(output) + if ar.Compression != nil { + wc, err := ar.Compression.OpenWriter(output) if err != nil { return err } @@ -264,16 +275,19 @@ func (caf CompressedArchive) ArchiveAsync(ctx context.Context, output io.Writer, } // Extract reads files out of an archive while decompressing the results. -func (caf CompressedArchive) Extract(ctx context.Context, sourceArchive io.Reader, pathsInArchive []string, handleFile FileHandler) error { - if caf.Compression != nil { - rc, err := caf.Compression.OpenReader(sourceArchive) +func (ar Archive) Extract(ctx context.Context, sourceArchive io.Reader, handleFile FileHandler) error { + if ar.Extraction == nil { + return fmt.Errorf("no extraction format") + } + if ar.Compression != nil { + rc, err := ar.Compression.OpenReader(sourceArchive) if err != nil { return err } defer rc.Close() sourceArchive = rc } - return caf.Archival.Extract(ctx, sourceArchive, pathsInArchive, handleFile) + return ar.Extraction.Extract(ctx, sourceArchive, handleFile) } // MatchResult returns true if the format was matched either @@ -408,8 +422,8 @@ var formats = make(map[string]Format) // Interface guards var ( - _ Format = (*CompressedArchive)(nil) - _ Archiver = (*CompressedArchive)(nil) - _ ArchiverAsync = (*CompressedArchive)(nil) - _ Extractor = (*CompressedArchive)(nil) + _ Format = (*Archive)(nil) + _ Archiver = (*Archive)(nil) + _ ArchiverAsync = (*Archive)(nil) + _ Extractor = (*Archive)(nil) ) diff --git a/formats_test.go b/formats_test.go index 6c8d621f..20349c2a 100644 --- a/formats_test.go +++ b/formats_test.go @@ -111,7 +111,7 @@ func checkErr(t *testing.T, err error, msgFmt string, args ...any) { return } args = append(args, err) - t.Errorf(msgFmt+": %s", args...) + t.Fatalf(msgFmt+": %s", args...) } func TestIdentifyDoesNotMatchContentFromTrimmedKnownHeaderHaving0Suffix(t *testing.T) { @@ -418,7 +418,7 @@ func TestIdentifyAndOpenZip(t *testing.T) { t.Errorf("unexpected format found: expected=.zip actual=%s", format.Extension()) } - err = format.(Extractor).Extract(context.Background(), reader, nil, func(ctx context.Context, f FileInfo) error { + err = format.(Extractor).Extract(context.Background(), reader, func(ctx context.Context, f FileInfo) error { rc, err := f.Open() if err != nil { return err diff --git a/fs.go b/fs.go index a2c7510a..56042cf6 100644 --- a/fs.go +++ b/fs.go @@ -350,14 +350,12 @@ func (f ArchiveFS) Open(name string) (fs.File, error) { } var decompressor io.ReadCloser - if caf, ok := f.Format.(CompressedArchive); ok { - if caf.Compression != nil { - decompressor, err = caf.Compression.OpenReader(inputStream) - if err != nil { - return nil, err - } - inputStream = decompressor + if decomp, ok := f.Format.(Decompressor); ok { + decompressor, err = decomp.OpenReader(inputStream) + if err != nil { + return nil, err } + inputStream = decompressor } // prepare the handler that we'll need if we have to iterate the @@ -413,13 +411,13 @@ func (f ArchiveFS) Open(name string) (fs.File, error) { // files may have a "." component in them, and the underlying format doesn't // know about our file system semantics, so we need to filter ourselves (it's // not significantly less efficient). - if caf, ok := f.Format.(CompressedArchive); ok { + if ar, ok := f.Format.(Archive); ok { // bypass the CompressedArchive format's opening of the decompressor, since - // we already did it, since we need to keep it open after returning + // we already did it because we need to keep it open after returning. // "I BYPASSED THE COMPRESSOR!" -Rey - err = caf.Archival.Extract(f.context(), inputStream, nil, handler) + err = ar.Extraction.Extract(f.context(), inputStream, handler) } else { - err = f.Format.Extract(f.context(), inputStream, nil, handler) + err = f.Format.Extract(f.context(), inputStream, handler) } if err != nil { return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("extract: %w", err)} @@ -486,7 +484,7 @@ func (f ArchiveFS) Stat(name string) (fs.FileInfo, error) { if f.Stream != nil { inputStream = io.NewSectionReader(f.Stream, 0, f.Stream.Size()) } - err = f.Format.Extract(f.context(), inputStream, nil, handler) + err = f.Format.Extract(f.context(), inputStream, handler) if err != nil && result.FileInfo == nil { return nil, err } @@ -601,7 +599,7 @@ func (f *ArchiveFS) ReadDir(name string) ([]fs.DirEntry, error) { inputStream = io.NewSectionReader(f.Stream, 0, f.Stream.Size()) } - err = f.Format.Extract(f.context(), inputStream, nil, handler) + err = f.Format.Extract(f.context(), inputStream, handler) if err != nil { // these being non-nil implies that we have indexed the archive, // but if an error occurred, we likely only got part of the way diff --git a/interfaces.go b/interfaces.go index f675f0e2..fd817864 100644 --- a/interfaces.go +++ b/interfaces.go @@ -33,10 +33,15 @@ type Compression interface { Decompressor } -// Archival is an archival format with both archive and extract methods. +// Archival is an archival format that can create/write archives. type Archival interface { Format Archiver +} + +// Extraction is an archival format that extract from (read) archives. +type Extraction interface { + Format Extractor } @@ -86,19 +91,14 @@ type ArchiverAsync interface { // Extractor can extract files from an archive. type Extractor interface { // Extract walks entries in the archive and calls handleFile for each - // entry that matches the pathsInArchive filter by path/name. - // - // If pathsInArchive is nil, all files are extracted without discretion. - // If pathsInArchive is empty, no files are extracted. - // If a path refers to a directory, all files within it are extracted. - // Extracted files are passed to the handleFile callback for handling. + // entry in the archive. // // Any files opened in the FileHandler should be closed when it returns, // as there is no guarantee the files can be read outside the handler // or after the walk has proceeded to the next file. // // Context cancellation must be honored. - Extract(ctx context.Context, archive io.Reader, pathsInArchive []string, handleFile FileHandler) error + Extract(ctx context.Context, archive io.Reader, handleFile FileHandler) error } // Inserter can insert files into an existing archive. diff --git a/rar.go b/rar.go index bece6071..8ca559e6 100644 --- a/rar.go +++ b/rar.go @@ -56,12 +56,9 @@ func (r Rar) Match(_ context.Context, filename string, stream io.Reader) (MatchR return mr, nil } -// Archive is not implemented for RAR, but the method exists so that Rar satisfies the ArchiveFormat interface. -func (r Rar) Archive(_ context.Context, _ io.Writer, _ []FileInfo) error { - return fmt.Errorf("not implemented because RAR is a proprietary format") -} +// Archive is not implemented for RAR because it is patent-encumbered. -func (r Rar) Extract(ctx context.Context, sourceArchive io.Reader, pathsInArchive []string, handleFile FileHandler) error { +func (r Rar) Extract(ctx context.Context, sourceArchive io.Reader, handleFile FileHandler) error { var options []rardecode.Option if r.Password != "" { options = append(options, rardecode.Password(r.Password)) @@ -91,9 +88,6 @@ func (r Rar) Extract(ctx context.Context, sourceArchive io.Reader, pathsInArchiv } return err } - if !fileIsIncluded(pathsInArchive, hdr.Name) { - continue - } if fileIsIncluded(skipDirs, hdr.Name) { continue } @@ -142,3 +136,6 @@ var ( rarHeaderV1_5 = []byte("Rar!\x1a\x07\x00") // v1.5 rarHeaderV5_0 = []byte("Rar!\x1a\x07\x01\x00") // v5.0 ) + +// Interface guard +var _ Extractor = Rar{} diff --git a/tar.go b/tar.go index d4106257..d84fed43 100644 --- a/tar.go +++ b/tar.go @@ -179,7 +179,7 @@ func (t Tar) Insert(ctx context.Context, into io.ReadWriteSeeker, files []FileIn return nil } -func (t Tar) Extract(ctx context.Context, sourceArchive io.Reader, pathsInArchive []string, handleFile FileHandler) error { +func (t Tar) Extract(ctx context.Context, sourceArchive io.Reader, handleFile FileHandler) error { tr := tar.NewReader(sourceArchive) // important to initialize to non-nil, empty value due to how fileIsIncluded works @@ -201,9 +201,6 @@ func (t Tar) Extract(ctx context.Context, sourceArchive io.Reader, pathsInArchiv } return err } - if !fileIsIncluded(pathsInArchive, hdr.Name) { - continue - } if fileIsIncluded(skipDirs, hdr.Name) { continue } diff --git a/zip.go b/zip.go index 73361d8b..1de5b516 100644 --- a/zip.go +++ b/zip.go @@ -183,7 +183,7 @@ func (z Zip) archiveOneFile(ctx context.Context, zw *zip.Writer, idx int, file F // the interface because we figure you can Read() from anything you can ReadAt() or Seek() // with. Due to the nature of the zip archive format, if sourceArchive is not an io.Seeker // and io.ReaderAt, an error is returned. -func (z Zip) Extract(ctx context.Context, sourceArchive io.Reader, pathsInArchive []string, handleFile FileHandler) error { +func (z Zip) Extract(ctx context.Context, sourceArchive io.Reader, handleFile FileHandler) error { sra, ok := sourceArchive.(seekReaderAt) if !ok { return fmt.Errorf("input type must be an io.ReaderAt and io.Seeker because of zip format constraints") @@ -211,9 +211,6 @@ func (z Zip) Extract(ctx context.Context, sourceArchive io.Reader, pathsInArchiv // ensure filename and comment are UTF-8 encoded (issue #147 and PR #305) z.decodeText(&f.FileHeader) - if !fileIsIncluded(pathsInArchive, f.Name) { - continue - } if fileIsIncluded(skipDirs, f.Name) { continue }