From e310539bed5a26da3ebd604bd83f652d50012818 Mon Sep 17 00:00:00 2001 From: Matt Holt Date: Thu, 7 Nov 2024 21:01:39 -0700 Subject: [PATCH] Refactor FS types; improve performance (#426) * WIP * More WIP * Finish improvements (probably) --- .github/workflows/macos-latest.yml | 2 +- .github/workflows/ubuntu-latest.yml | 2 +- .github/workflows/windows-latest.yml | 2 +- 7z.go | 21 +- README.md | 16 +- archiver.go | 33 +- archiver_test.go | 3 +- brotli.go | 19 +- bz2.go | 7 +- formats.go | 119 ++-- formats_test.go | 119 ++-- fs.go | 862 +++++++++++++-------------- fs_test.go | 21 +- go.mod | 14 +- go.sum | 26 +- gz.go | 7 +- interfaces.go | 26 +- lz4.go | 7 +- lzip.go | 7 +- rar.go | 19 +- sz.go | 7 +- tar.go | 27 +- xz.go | 7 +- zip.go | 25 +- zlib.go | 7 +- zstd.go | 7 +- 26 files changed, 716 insertions(+), 696 deletions(-) diff --git a/.github/workflows/macos-latest.yml b/.github/workflows/macos-latest.yml index 70b0aa57..5f2bdf3d 100644 --- a/.github/workflows/macos-latest.yml +++ b/.github/workflows/macos-latest.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: - go-version: [1.22] + go-version: [1.23] runs-on: macos-latest steps: - name: Install Go diff --git a/.github/workflows/ubuntu-latest.yml b/.github/workflows/ubuntu-latest.yml index 0502d0d8..d25b72d2 100644 --- a/.github/workflows/ubuntu-latest.yml +++ b/.github/workflows/ubuntu-latest.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: - go-version: [1.22] + go-version: [1.23] runs-on: ubuntu-latest steps: - name: Install Go diff --git a/.github/workflows/windows-latest.yml b/.github/workflows/windows-latest.yml index d27e28d9..b53e3eed 100644 --- a/.github/workflows/windows-latest.yml +++ b/.github/workflows/windows-latest.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: - go-version: [1.22] + go-version: [1.23] runs-on: windows-latest steps: - name: Install Go diff --git a/7z.go b/7z.go index d57bd452..4a3dbd4a 100644 --- a/7z.go +++ b/7z.go @@ -31,13 +31,13 @@ type SevenZip struct { Password string } -func (z SevenZip) 
Name() string { return ".7z" } +func (z SevenZip) Extension() string { return ".7z" } -func (z SevenZip) Match(filename string, stream io.Reader) (MatchResult, error) { +func (z SevenZip) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), z.Name()) { + if strings.Contains(strings.ToLower(filename), z.Extension()) { mr.ByName = true } @@ -52,7 +52,7 @@ func (z SevenZip) Match(filename string, stream io.Reader) (MatchResult, error) } // Archive is not implemented for 7z, but the method exists so that SevenZip satisfies the ArchiveFormat interface. -func (z SevenZip) Archive(_ context.Context, _ io.Writer, _ []File) error { +func (z SevenZip) Archive(_ context.Context, _ io.Writer, _ []FileInfo) error { return fmt.Errorf("not implemented for 7z because there is no pure Go implementation found") } @@ -94,11 +94,18 @@ func (z SevenZip) Extract(ctx context.Context, sourceArchive io.Reader, pathsInA continue } - file := File{ - FileInfo: f.FileInfo(), + fi := f.FileInfo() + file := FileInfo{ + FileInfo: fi, Header: f.FileHeader, NameInArchive: f.Name, - Open: func() (io.ReadCloser, error) { return f.Open() }, + Open: func() (fs.File, error) { + openedFile, err := f.Open() + if err != nil { + return nil, err + } + return fileInArchive{openedFile, fi}, nil + }, } err := handleFile(ctx, file) diff --git a/README.md b/README.md index 2d7dcf7c..9b1cdc78 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # archiver [![Go Reference](https://pkg.go.dev/badge/github.com/mholt/archiver/v4.svg)](https://pkg.go.dev/github.com/mholt/archiver/v4) [![Ubuntu-latest](https://github.com/mholt/archiver/actions/workflows/ubuntu-latest.yml/badge.svg)](https://github.com/mholt/archiver/actions/workflows/ubuntu-latest.yml) 
[![Macos-latest](https://github.com/mholt/archiver/actions/workflows/macos-latest.yml/badge.svg)](https://github.com/mholt/archiver/actions/workflows/macos-latest.yml) [![Windows-latest](https://github.com/mholt/archiver/actions/workflows/windows-latest.yml/badge.svg)](https://github.com/mholt/archiver/actions/workflows/windows-latest.yml) -Introducing **Archiver 4.0** - a cross-platform, multi-format archive utility and Go library. A powerful and flexible library meets an elegant CLI in this generic replacement for several platform-specific or format-specific archive utilities. +Introducing **Archiver 4.0 (alpha)** - a cross-platform, multi-format archive utility and Go library. A powerful and flexible library meets an elegant CLI in this generic replacement for several platform-specific or format-specific archive utilities. **:warning: v4 is in ALPHA. The core library APIs work pretty well but the command has not been implemented yet, nor have most automated tests. If you need the `arc` command, stick with v3 for now.** @@ -11,8 +11,8 @@ Introducing **Archiver 4.0** - a cross-platform, multi-format archive utility an - By file name - By header - Traverse directories, archive files, and any other file uniformly as [`io/fs`](https://pkg.go.dev/io/fs) file systems: - - [`DirFS`](https://pkg.go.dev/github.com/mholt/archiver/v4#DirFS) - [`FileFS`](https://pkg.go.dev/github.com/mholt/archiver/v4#FileFS) + - [`DirFS`](https://pkg.go.dev/github.com/mholt/archiver/v4#DirFS) - [`ArchiveFS`](https://pkg.go.dev/github.com/mholt/archiver/v4#ArchiveFS) - Compress and decompress files - Create and extract archive files @@ -117,7 +117,7 @@ If you want all the files, pass in a nil list of file paths. 
```go // the type that will be used to read the input stream -format := archiver.Zip{} +var format archiver.Zip // the list of files we want out of the archive; any // directories will include all their contents unless @@ -141,7 +141,7 @@ if err != nil { Have an input stream with unknown contents? No problem, archiver can identify it for you. It will try matching based on filename and/or the header (which peeks at the stream): ```go -format, input, err := archiver.Identify("filename.tar.zst", input) +format, input, err := archiver.Identify(ctx, "filename.tar.zst", input) if err != nil { return err } @@ -165,7 +165,7 @@ if decom, ok := format.(archiver.Decompressor); ok { } ``` -`Identify()` works by reading an arbitrary number of bytes from the beginning of the stream (just enough to check for file headers). It buffers them and returns a new reader that lets you re-read them anew. +`Identify()` works by reading an arbitrary number of bytes from the beginning of the stream (just enough to check for file headers). It buffers them and returns a new reader that lets you re-read them anew. If your input stream is `io.Seeker` however, no buffer is created (it uses `Seek()` instead). ### Virtual file systems @@ -212,7 +212,7 @@ if dir, ok := f.(fs.ReadDirFile); ok { return err } for _, e := range entries { - fmt.Println(e.Name()) + fmt.Println(e.Name()) } } ``` @@ -225,7 +225,7 @@ if err != nil { return err } for _, e := range entries { - fmt.Println(e.Name()) + fmt.Println(e.Name()) } ``` @@ -247,6 +247,8 @@ if err != nil { } ``` +**Important .tar note:** Tar files do not efficiently implement file system semantics due to their roots in sequential-access design for tapes. File systems inherently assume random access, but tar files need to be read from the beginning to access something at the end. This is especially slow when the archive is compressed.
Optimizations have been implemented to amortize `ReadDir()` calls so that `fs.WalkDir()` only has to scan the archive once, but they use more memory. Open calls require another scan to find the file. It may be more efficient to use `Tar.Extract()` directly if file system semantics are not important to you. + #### Use with `http.FileServer` It can be used with http.FileServer to browse archives and directories in a browser. However, due to how http.FileServer works, don't directly use http.FileServer with compressed files; instead wrap it like following: diff --git a/archiver.go b/archiver.go index c968e1f7..7e68f30d 100644 --- a/archiver.go +++ b/archiver.go @@ -12,14 +12,14 @@ import ( "time" ) -// File is a virtualized, generalized file abstraction for interacting with archives. -type File struct { +// FileInfo is a virtualized, generalized file abstraction for interacting with archives. +type FileInfo struct { fs.FileInfo // The file header as used/provided by the archive format. // Typically, you do not need to set this field when creating // an archive. - Header interface{} + Header any // The path of the file as it appears in the archive. // This is equivalent to Header.Name (for most Header @@ -28,6 +28,10 @@ type File struct { // format-agnosticism (no type assertions) for basic // operations. // + // When extracting, this name or path may not have + // been sanitized; it should not be trusted at face + // value. Consider using path.Clean() before using. + // // EXPERIMENTAL: If inserting a file into an archive, // and this is left blank, the implementation of the // archive format can default to using the file's base @@ -40,12 +44,11 @@ type File struct { // A callback function that opens the file to read its // contents. The file must be closed when reading is - // complete. Nil for files that don't have content - // (such as directories and links). - Open func() (io.ReadCloser, error) + // complete. 
+ Open func() (fs.File, error) } -func (f File) Stat() (fs.FileInfo, error) { return f.FileInfo, nil } +func (f FileInfo) Stat() (fs.FileInfo, error) { return f.FileInfo, nil } // FilesFromDisk returns a list of files by walking the directories in the // given filenames map. The keys are the names on disk, and the values are @@ -68,8 +71,8 @@ func (f File) Stat() (fs.FileInfo, error) { return f.FileInfo, nil } // // This function is used primarily when preparing a list of files to add to // an archive. -func FilesFromDisk(options *FromDiskOptions, filenames map[string]string) ([]File, error) { - var files []File +func FilesFromDisk(options *FromDiskOptions, filenames map[string]string) ([]FileInfo, error) { + var files []FileInfo for rootOnDisk, rootInArchive := range filenames { walkErr := filepath.WalkDir(rootOnDisk, func(filename string, d fs.DirEntry, err error) error { if err != nil { @@ -114,11 +117,11 @@ func FilesFromDisk(options *FromDiskOptions, filenames map[string]string) ([]Fil info = noAttrFileInfo{info} } - file := File{ + file := FileInfo{ FileInfo: info, NameInArchive: nameInArchive, LinkTarget: linkTarget, - Open: func() (io.ReadCloser, error) { + Open: func() (fs.File, error) { return os.Open(filename) }, } @@ -191,7 +194,7 @@ func (no noAttrFileInfo) Mode() fs.FileMode { return no.FileInfo.Mode() & (fs.ModeType | fs.ModePerm) } func (noAttrFileInfo) ModTime() time.Time { return time.Time{} } -func (noAttrFileInfo) Sys() interface{} { return nil } +func (noAttrFileInfo) Sys() any { return nil } // FromDiskOptions specifies various options for gathering files from disk. type FromDiskOptions struct { @@ -215,12 +218,12 @@ type FromDiskOptions struct { // archive contents are not necessarily ordered, skipping directories requires // memory, and skipping lots of directories may run up your memory bill. // -// Any other returned error will terminate a walk. 
-type FileHandler func(ctx context.Context, f File) error +// Any other returned error will terminate a walk and be returned to the caller. +type FileHandler func(ctx context.Context, info FileInfo) error // openAndCopyFile opens file for reading, copies its // contents to w, then closes file. -func openAndCopyFile(file File, w io.Writer) error { +func openAndCopyFile(file FileInfo, w io.Writer) error { fileReader, err := file.Open() if err != nil { return err diff --git a/archiver_test.go b/archiver_test.go index e4355540..0cf8124c 100644 --- a/archiver_test.go +++ b/archiver_test.go @@ -245,7 +245,8 @@ func TestNameOnDiskToNameInArchive(t *testing.T) { }, } { if !strings.HasPrefix(tc.nameOnDisk, tc.rootOnDisk) { - t.Fatalf("Test %d: Invalid test case! Filename (on disk) will have rootOnDisk as a prefix according to the fs.WalkDirFunc godoc.", i) + t.Errorf("Test %d: Invalid test case! Filename (on disk) will have rootOnDisk as a prefix according to the fs.WalkDirFunc godoc.", i) + continue } if tc.windows && runtime.GOOS != "windows" { t.Logf("Test %d: Skipping test that is only compatible with Windows", i) diff --git a/brotli.go b/brotli.go index 5d17fae7..c650f40e 100644 --- a/brotli.go +++ b/brotli.go @@ -1,6 +1,7 @@ package archiver import ( + "context" "io" "strings" @@ -16,19 +17,25 @@ type Brotli struct { Quality int } -func (Brotli) Name() string { return ".br" } +func (Brotli) Extension() string { return ".br" } -func (br Brotli) Match(filename string, stream io.Reader) (MatchResult, error) { +func (br Brotli) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), br.Name()) { + if strings.Contains(strings.ToLower(filename), br.Extension()) { mr.ByName = true } - // brotli does not have well-defined file headers; the - // best way to match the stream would be to try decoding - // part of it, and this is not implemented for now + // brotli 
does not have well-defined file headers or a magic number; + // the best way to match the stream is probably to try decoding part + // of it, but we'll just have to guess a large-enough size that is + // still small enough for the smallest streams we'll encounter + r := brotli.NewReader(stream) + buf := make([]byte, 16) + if _, err := io.ReadFull(r, buf); err == nil { + mr.ByStream = true + } return mr, nil } diff --git a/bz2.go b/bz2.go index 57a278f4..a2a5f05e 100644 --- a/bz2.go +++ b/bz2.go @@ -2,6 +2,7 @@ package archiver import ( "bytes" + "context" "io" "strings" @@ -17,13 +18,13 @@ type Bz2 struct { CompressionLevel int } -func (Bz2) Name() string { return ".bz2" } +func (Bz2) Extension() string { return ".bz2" } -func (bz Bz2) Match(filename string, stream io.Reader) (MatchResult, error) { +func (bz Bz2) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), bz.Name()) { + if strings.Contains(strings.ToLower(filename), bz.Extension()) { mr.ByName = true } diff --git a/formats.go b/formats.go index 546d3122..24865fea 100644 --- a/formats.go +++ b/formats.go @@ -12,7 +12,7 @@ import ( // RegisterFormat registers a format. It should be called during init. // Duplicate formats by name are not allowed and will panic. func RegisterFormat(format Format) { - name := strings.Trim(strings.ToLower(format.Name()), ".") + name := strings.Trim(strings.ToLower(format.Extension()), ".") if _, ok := formats[name]; ok { panic("format " + name + " is already registered") } @@ -32,14 +32,21 @@ func RegisterFormat(format Format) { // // If stream is non-nil then the returned io.Reader will always be // non-nil and will read from the same point as the reader which was -// passed in; it should be used in place of the input stream after +// passed in. 
If the input stream is not an io.Seeker, the returned +// io.Reader value should be used in place of the input stream after // calling Identify() because it preserves and re-reads the bytes that // were already read during the identification process. -func Identify(filename string, stream io.Reader) (Format, io.Reader, error) { +// +// If the input stream is an io.Seeker, Seek() must work, and the +// original input value will be returned instead of a wrapper value. +func Identify(ctx context.Context, filename string, stream io.Reader) (Format, io.Reader, error) { var compression Compression var archival Archival - rewindableStream := newRewindReader(stream) + rewindableStream, err := newRewindReader(stream) + if err != nil { + return nil, nil, err + } // try compression format first, since that's the outer "layer" for name, format := range formats { @@ -48,7 +55,7 @@ func Identify(filename string, stream io.Reader) (Format, io.Reader, error) { continue } - matchResult, err := identifyOne(format, filename, rewindableStream, nil) + matchResult, err := identifyOne(ctx, format, filename, rewindableStream, nil) if err != nil { return nil, rewindableStream.reader(), fmt.Errorf("matching %s: %w", name, err) } @@ -68,7 +75,7 @@ func Identify(filename string, stream io.Reader) (Format, io.Reader, error) { continue } - matchResult, err := identifyOne(format, filename, rewindableStream, compression) + matchResult, err := identifyOne(ctx, format, filename, rewindableStream, compression) if err != nil { return nil, rewindableStream.reader(), fmt.Errorf("matching %s: %w", name, err) } @@ -89,13 +96,17 @@ func Identify(filename string, stream io.Reader) (Format, io.Reader, error) { case compression != nil && archival != nil: return CompressedArchive{compression, archival}, bufferedStream, nil default: - return nil, bufferedStream, ErrNoMatch + return nil, bufferedStream, NoMatch } } -func identifyOne(format Format, filename string, stream *rewindReader, comp Compression) (mr 
MatchResult, err error) { +func identifyOne(ctx context.Context, format Format, filename string, stream *rewindReader, comp Compression) (mr MatchResult, err error) { defer stream.rewind() + if filename == "." { + filename = "" + } + // if looking within a compressed format, wrap the stream in a // reader that can decompress it so we can match the "inner" format // (yes, we have to make a new reader every time we do a match, @@ -107,14 +118,14 @@ func identifyOne(format Format, filename string, stream *rewindReader, comp Comp return MatchResult{}, openErr } defer decompressedStream.Close() - mr, err = format.Match(filename, decompressedStream) + mr, err = format.Match(ctx, filename, decompressedStream) } else { // Make sure we pass a nil io.Reader not a *rewindReader(nil) var r io.Reader if stream != nil { r = stream } - mr, err = format.Match(filename, r) + mr, err = format.Match(ctx, filename, r) } // if the error is EOF, we can just ignore it. @@ -168,26 +179,26 @@ type CompressedArchive struct { // Name returns a concatenation of the archive format name // and the compression format name. -func (caf CompressedArchive) Name() string { +func (caf CompressedArchive) Extension() string { if caf.Compression == nil && caf.Archival == nil { panic("missing both compression and archive formats") } var name string if caf.Archival != nil { - name += caf.Archival.Name() + name += caf.Archival.Extension() } if caf.Compression != nil { - name += caf.Compression.Name() + name += caf.Compression.Extension() } return name } // Match matches if the input matches both the compression and archive format. 
-func (caf CompressedArchive) Match(filename string, stream io.Reader) (MatchResult, error) { +func (caf CompressedArchive) Match(ctx context.Context, filename string, stream io.Reader) (MatchResult, error) { var conglomerate MatchResult if caf.Compression != nil { - matchResult, err := caf.Compression.Match(filename, stream) + matchResult, err := caf.Compression.Match(ctx, filename, stream) if err != nil { return MatchResult{}, err } @@ -208,7 +219,7 @@ func (caf CompressedArchive) Match(filename string, stream io.Reader) (MatchResu } if caf.Archival != nil { - matchResult, err := caf.Archival.Match(filename, stream) + matchResult, err := caf.Archival.Match(ctx, filename, stream) if err != nil { return MatchResult{}, err } @@ -223,7 +234,7 @@ func (caf CompressedArchive) Match(filename string, stream io.Reader) (MatchResu } // Archive adds files to the output archive while compressing the result. -func (caf CompressedArchive) Archive(ctx context.Context, output io.Writer, files []File) error { +func (caf CompressedArchive) Archive(ctx context.Context, output io.Writer, files []FileInfo) error { if caf.Compression != nil { wc, err := caf.Compression.OpenWriter(output) if err != nil { @@ -239,7 +250,7 @@ func (caf CompressedArchive) Archive(ctx context.Context, output io.Writer, file func (caf CompressedArchive) ArchiveAsync(ctx context.Context, output io.Writer, jobs <-chan ArchiveAsyncJob) error { do, ok := caf.Archival.(ArchiverAsync) if !ok { - return fmt.Errorf("%s archive does not support async writing", caf.Name()) + return fmt.Errorf("%s archive does not support async writing", caf.Extension()) } if caf.Compression != nil { wc, err := caf.Compression.OpenWriter(output) @@ -253,27 +264,13 @@ func (caf CompressedArchive) ArchiveAsync(ctx context.Context, output io.Writer, } // Extract reads files out of an archive while decompressing the results. 
-// If Extract is not called from ArchiveFS.Open, then the FileHandler passed -// in must close all opened files by the time the Extract walk finishes. func (caf CompressedArchive) Extract(ctx context.Context, sourceArchive io.Reader, pathsInArchive []string, handleFile FileHandler) error { if caf.Compression != nil { rc, err := caf.Compression.OpenReader(sourceArchive) if err != nil { return err } - // I don't like this solution, but we have to close the decompressor. - // The problem is that if we simply defer rc.Close(), we potentially - // close it before the caller is done using files it opened. Ideally - // it should be closed when the sourceArchive is also closed. But since - // we don't originate sourceArchive, we can't close it when it closes. - // The best I can think of for now is this hack where we tell a type - // that supports this to close another reader when itself closes. - // See issue #365. - if cc, ok := sourceArchive.(compressorCloser); ok { - cc.closeCompressor(rc) - } else { - defer rc.Close() - } + defer rc.Close() sourceArchive = rc } return caf.Archival.Extract(ctx, sourceArchive, pathsInArchive, handleFile) @@ -299,26 +296,42 @@ func (mr MatchResult) Matched() bool { return mr.ByName || mr.ByStream } // read from the stream. This is useful for "peeking" a stream an // arbitrary number of bytes. Loosely based on the Connection type // from https://github.com/mholt/caddy-l4. +// +// If the reader is also an io.Seeker, no buffer is used, and instead +// the stream seeks back to the starting position. 
type rewindReader struct { io.Reader + start int64 buf *bytes.Buffer bufReader io.Reader } -func newRewindReader(r io.Reader) *rewindReader { +func newRewindReader(r io.Reader) (*rewindReader, error) { if r == nil { - return nil + return nil, nil } - return &rewindReader{ - Reader: r, - buf: new(bytes.Buffer), + + rr := &rewindReader{Reader: r} + + // avoid buffering if we have a seeker we can use + if seeker, ok := r.(io.Seeker); ok { + var err error + rr.start, err = seeker.Seek(0, io.SeekCurrent) + if err != nil { + return nil, fmt.Errorf("seek to determine current position: %w", err) + } + } else { + rr.buf = new(bytes.Buffer) } + + return rr, nil } func (rr *rewindReader) Read(p []byte) (n int, err error) { if rr == nil { - panic("internal error: reading from nil rewindReader") + panic("reading from nil rewindReader") } + // if there is a buffer we should read from, start // with that; we only read from the underlying stream // after the buffer has been "depleted" @@ -333,13 +346,13 @@ func (rr *rewindReader) Read(p []byte) (n int, err error) { } } - // buffer has been "depleted" so read from - // underlying connection + // buffer has been depleted or we are not using one, + // so read from underlying stream nr, err := rr.Reader.Read(p[n:]) // anything that was read needs to be written to - // the buffer, even if there was an error - if nr > 0 { + // the buffer (if used), even if there was an error + if nr > 0 && rr.buf != nil { if nw, errw := rr.buf.Write(p[n : n+nr]); errw != nil { return nw, errw } @@ -355,18 +368,24 @@ func (rr *rewindReader) Read(p []byte) (n int, err error) { // rewind resets the stream to the beginning by causing // Read() to start reading from the beginning of the -// buffered bytes. +// stream, or, if buffering, the buffered bytes. 
func (rr *rewindReader) rewind() { if rr == nil { return } + if ras, ok := rr.Reader.(io.Seeker); ok { + if _, err := ras.Seek(rr.start, io.SeekStart); err == nil { + return + } + } rr.bufReader = bytes.NewReader(rr.buf.Bytes()) } // reader returns a reader that reads first from the buffered -// bytes, then from the underlying stream. After calling this, -// no more rewinding is allowed since reads from the stream are -// not recorded, so rewinding properly is impossible. +// bytes (if buffering), then from the underlying stream; if a +// Seeker, the stream will be seeked back to the start. After +// calling this, no more rewinding is allowed since reads from +// the stream are not recorded, so rewinding properly is impossible. // If the underlying reader implements io.Seeker, then the // underlying reader will be used directly. func (rr *rewindReader) reader() io.Reader { @@ -374,15 +393,15 @@ func (rr *rewindReader) reader() io.Reader { return nil } if ras, ok := rr.Reader.(io.Seeker); ok { - if _, err := ras.Seek(0, io.SeekStart); err == nil { + if _, err := ras.Seek(rr.start, io.SeekStart); err == nil { return rr.Reader } } return io.MultiReader(bytes.NewReader(rr.buf.Bytes()), rr.Reader) } -// ErrNoMatch is returned if there are no matching formats. -var ErrNoMatch = fmt.Errorf("no formats matched") +// NoMatch is a special error returned if there are no matching formats. +var NoMatch = fmt.Errorf("no formats matched") // Registered formats. 
var formats = make(map[string]Format) diff --git a/formats_test.go b/formats_test.go index ec5e3bb8..6c8d621f 100644 --- a/formats_test.go +++ b/formats_test.go @@ -16,7 +16,10 @@ import ( func TestRewindReader(t *testing.T) { data := "the header\nthe body\n" - r := newRewindReader(strings.NewReader(data)) + r, err := newRewindReader(strings.NewReader(data)) + if err != nil { + t.Errorf("creating rewindReader: %v", err) + } buf := make([]byte, 10) // enough for 'the header' @@ -25,10 +28,10 @@ func TestRewindReader(t *testing.T) { r.rewind() n, err := r.Read(buf) if err != nil { - t.Fatalf("Read failed: %s", err) + t.Errorf("Read failed: %s", err) } if string(buf[:n]) != "the header" { - t.Fatalf("iteration %d: expected 'the header' but got '%s' (n=%d)", i, string(buf[:n]), n) + t.Errorf("iteration %d: expected 'the header' but got '%s' (n=%d)", i, string(buf[:n]), n) } } @@ -38,10 +41,10 @@ func TestRewindReader(t *testing.T) { buf = make([]byte, len(data)) n, err := io.ReadFull(finalReader, buf) if err != nil { - t.Fatalf("ReadFull failed: %s (n=%d)", err, n) + t.Errorf("ReadFull failed: %s (n=%d)", err, n) } if string(buf) != data { - t.Fatalf("expected '%s' but got '%s'", string(data), string(buf)) + t.Errorf("expected '%s' but got '%s'", string(data), string(buf)) } } @@ -65,24 +68,24 @@ func TestCompression(t *testing.T) { checkErr(t, wc.Close(), "closing writer") // make sure Identify correctly chooses this compression method - format, stream, err := Identify(testFilename, compressed) + format, stream, err := Identify(context.Background(), testFilename, compressed) checkErr(t, err, "identifying") - if format.Name() != comp.Name() { - t.Fatalf("expected format %s but got %s", comp.Name(), format.Name()) + if format.Extension() != comp.Extension() { + t.Errorf("expected format %s but got %s", comp.Extension(), format.Extension()) } // read the contents back out and compare decompReader, err := format.(Decompressor).OpenReader(stream) - checkErr(t, err, 
"opening with decompressor '%s'", format.Name()) + checkErr(t, err, "opening with decompressor '%s'", format.Extension()) data, err := io.ReadAll(decompReader) checkErr(t, err, "reading decompressed data") checkErr(t, decompReader.Close(), "closing decompressor") if !bytes.Equal(data, contents) { - t.Fatalf("not equal to original") + t.Errorf("not equal to original") } } - var cannotIdentifyFromStream = map[string]bool{Brotli{}.Name(): true} + var cannotIdentifyFromStream = map[string]bool{Brotli{}.Extension(): true} for _, f := range formats { // only test compressors @@ -91,24 +94,24 @@ func TestCompression(t *testing.T) { continue } - t.Run(f.Name()+"_with_extension", func(t *testing.T) { - testOK(t, comp, "file"+f.Name()) + t.Run(f.Extension()+"_with_extension", func(t *testing.T) { + testOK(t, comp, "file"+f.Extension()) }) - if !cannotIdentifyFromStream[f.Name()] { - t.Run(f.Name()+"_without_extension", func(t *testing.T) { + if !cannotIdentifyFromStream[f.Extension()] { + t.Run(f.Extension()+"_without_extension", func(t *testing.T) { testOK(t, comp, "") }) } } } -func checkErr(t *testing.T, err error, msgFmt string, args ...interface{}) { +func checkErr(t *testing.T, err error, msgFmt string, args ...any) { t.Helper() if err == nil { return } args = append(args, err) - t.Fatalf(msgFmt+": %s", args...) + t.Errorf(msgFmt+": %s", args...) 
} func TestIdentifyDoesNotMatchContentFromTrimmedKnownHeaderHaving0Suffix(t *testing.T) { @@ -142,13 +145,13 @@ func TestIdentifyDoesNotMatchContentFromTrimmedKnownHeaderHaving0Suffix(t *testi } headerTrimmed := tt.header[:headerLen-1] stream := bytes.NewReader(headerTrimmed) - got, _, err := Identify("", stream) + got, _, err := Identify(context.Background(), "", stream) if got != nil { - t.Errorf("no Format expected for trimmed know %s header: found Format= %v", tt.name, got.Name()) + t.Errorf("no Format expected for trimmed know %s header: found Format= %v", tt.name, got.Extension()) return } - if ErrNoMatch != err { - t.Fatalf("ErrNoMatch expected for for trimmed know %s header: err :=%#v", tt.name, err) + if !errors.Is(err, NoMatch) { + t.Errorf("NoMatch expected for for trimmed know %s header: err :=%#v", tt.name, err) return } @@ -185,13 +188,13 @@ func TestIdentifyCanAssessSmallOrNoContent(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, _, err := Identify("", tt.args.stream) + got, _, err := Identify(context.Background(), "", tt.args.stream) if got != nil { - t.Errorf("no Format expected for non archive and not compressed stream: found Format= %v", got.Name()) + t.Errorf("no Format expected for non archive and not compressed stream: found Format=%#v", got) return } - if ErrNoMatch != err { - t.Fatalf("ErrNoMatch expected for non archive and not compressed stream: err :=%#v", err) + if !errors.Is(err, NoMatch) { + t.Errorf("NoMatch expected for non archive and not compressed stream: %#v", err) return } @@ -206,36 +209,36 @@ func compress( buf := bytes.NewBuffer(make([]byte, 0, 128)) cwriter, err := openwriter(buf) if err != nil { - t.Fatalf("fail to open compression writer: compression-name=%s, err=%#v", compName, err) + t.Errorf("fail to open compression writer: compression-name=%s, err=%#v", compName, err) return nil } _, err = cwriter.Write(content) if err != nil { cerr := cwriter.Close() - t.Fatalf( + t.Errorf( 
"fail to write using compression writer: compression-name=%s, err=%#v, close-err=%#v", compName, err, cerr) return nil } err = cwriter.Close() if err != nil { - t.Fatalf("fail to close compression writer: compression-name=%s, err=%#v", compName, err) + t.Errorf("fail to close compression writer: compression-name=%s, err=%#v", compName, err) return nil } return buf.Bytes() } func archive(t *testing.T, arch Archiver, fname string, fileInfo fs.FileInfo) []byte { - files := []File{ + files := []FileInfo{ {FileInfo: fileInfo, NameInArchive: "tmp.txt", - Open: func() (io.ReadCloser, error) { + Open: func() (fs.File, error) { return os.Open(fname) }}, } buf := bytes.NewBuffer(make([]byte, 0, 128)) err := arch.Archive(context.TODO(), buf, files) if err != nil { - t.Fatalf("fail to create archive: err=%#v", err) + t.Errorf("fail to create archive: err=%#v", err) return nil } return buf.Bytes() @@ -251,29 +254,24 @@ func newWriteNopCloser(w io.Writer) (io.WriteCloser, error) { } func newTmpTextFile(t *testing.T, content string) (string, fs.FileInfo) { - tmpTxtFile, err := os.CreateTemp("", "TestIdentifyFindFormatByStreamContent-tmp-*.txt") if err != nil { - t.Fatalf("fail to create tmp test file for archive tests: err=%v", err) + t.Errorf("fail to create tmp test file for archive tests: err=%v", err) return "", nil } fname := tmpTxtFile.Name() if _, err = tmpTxtFile.Write([]byte(content)); err != nil { - tmpTxtFile.Close() - os.Remove(fname) - t.Fatalf("fail to write content to tmp-txt-file: err=%#v", err) + t.Errorf("fail to write content to tmp-txt-file: err=%#v", err) return "", nil } if err = tmpTxtFile.Close(); err != nil { - os.Remove(fname) - t.Fatalf("fail to close tmp-txt-file: err=%#v", err) + t.Errorf("fail to close tmp-txt-file: err=%#v", err) return "", nil } fi, err := os.Stat(fname) if err != nil { - os.Remove(fname) - t.Fatalf("fail to get tmp-txt-file stats: err=%v", err) + t.Errorf("fail to get tmp-txt-file stats: err=%v", err) return "", nil } @@ -281,9 
+279,9 @@ func newTmpTextFile(t *testing.T, content string) (string, fs.FileInfo) { } func TestIdentifyFindFormatByStreamContent(t *testing.T) { - tmpTxtFileName, tmpTxtFileInfo := newTmpTextFile(t, "this is text") + tmpTxtFileName, tmpTxtFileInfo := newTmpTextFile(t, "this is text that has to be long enough for brotli to match") t.Cleanup(func() { - os.Remove(tmpTxtFileName) + os.RemoveAll(tmpTxtFileName) }) tests := []struct { @@ -293,7 +291,13 @@ func TestIdentifyFindFormatByStreamContent(t *testing.T) { compressorName string wantFormatName string }{ - //TODO add test case for brotli when Brotli.Match() by stream content is implemented + { + name: "should recognize brotli", + openCompressionWriter: Brotli{}.OpenWriter, + content: []byte("this is text, but it has to be long enough to match brotli which doesn't have a magic number"), + compressorName: ".br", + wantFormatName: ".br", + }, { name: "should recognize bz2", openCompressionWriter: Bz2{}.OpenWriter, @@ -389,13 +393,13 @@ func TestIdentifyFindFormatByStreamContent(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { stream := bytes.NewReader(compress(t, tt.compressorName, tt.content, tt.openCompressionWriter)) - got, _, err := Identify("", stream) + got, _, err := Identify(context.Background(), "", stream) if err != nil { - t.Fatalf("should have found a corresponding Format: err :=%+v", err) + t.Errorf("should have found a corresponding Format, but got err=%+v", err) return } - if tt.wantFormatName != got.Name() { - t.Errorf("unexpected format found: expected=%s actual:%s", tt.wantFormatName, got.Name()) + if tt.wantFormatName != got.Extension() { + t.Errorf("unexpected format found: expected=%s actual=%s", tt.wantFormatName, got.Extension()) return } @@ -408,13 +412,13 @@ func TestIdentifyAndOpenZip(t *testing.T) { checkErr(t, err, "opening zip") defer f.Close() - format, reader, err := Identify("test.zip", f) + format, reader, err := Identify(context.Background(), "test.zip", 
f) checkErr(t, err, "identifying zip") - if format.Name() != ".zip" { - t.Fatalf("unexpected format found: expected=.zip actual:%s", format.Name()) + if format.Extension() != ".zip" { + t.Errorf("unexpected format found: expected=.zip actual=%s", format.Extension()) } - err = format.(Extractor).Extract(context.Background(), reader, nil, func(ctx context.Context, f File) error { + err = format.(Extractor).Extract(context.Background(), reader, nil, func(ctx context.Context, f FileInfo) error { rc, err := f.Open() if err != nil { return err @@ -430,25 +434,26 @@ func TestIdentifyASCIIFileStartingWithX(t *testing.T) { // Create a temporary file starting with the letter 'x' tmpFile, err := os.CreateTemp("", "TestIdentifyASCIIFileStartingWithX-tmp-*.txt") if err != nil { - t.Fatalf("fail to create tmp test file for archive tests: err=%v", err) + t.Errorf("fail to create tmp test file for archive tests: err=%v", err) } + defer os.Remove(tmpFile.Name()) _, err = tmpFile.Write([]byte("xThis is a test file")) if err != nil { - t.Fatalf("Failed to write to temp file: %v", err) + t.Errorf("Failed to write to temp file: %v", err) } tmpFile.Close() // Open the file and use the Identify function file, err := os.Open(tmpFile.Name()) if err != nil { - t.Fatalf("Failed to open temp file: %v", err) + t.Errorf("Failed to open temp file: %v", err) } defer file.Close() - _, _, err = Identify(tmpFile.Name(), file) - if !errors.Is(err, ErrNoMatch) { - t.Fatalf("Identify failed: %v", err) + _, _, err = Identify(context.Background(), tmpFile.Name(), file) + if !errors.Is(err, NoMatch) { + t.Errorf("Identify failed: %v", err) } } diff --git a/fs.go b/fs.go index bceda446..560727b5 100644 --- a/fs.go +++ b/fs.go @@ -9,132 +9,121 @@ import ( "os" "path" "path/filepath" - "runtime" - "sort" + "slices" "strings" "time" - - "github.com/klauspost/compress/zip" ) -// FileSystem opens the file at root as a read-only file system. 
The root may be a -// path to a directory, archive file, compressed archive file, compressed file, or -// any other file on disk. +// FileSystem identifies the format of the input and returns a read-only file system. +// The input can be a filename, stream, or both. // -// If root is a directory, its contents are accessed directly from the disk's file system. -// If root is an archive file, its contents can be accessed like a normal directory; -// compressed archive files are transparently decompressed as contents are accessed. -// And if root is any other file, it is the only file in the file system; if the file -// is compressed, it is transparently decompressed when read from. +// If only a filename is specified, it may be a path to a directory, archive file, +// compressed archive file, compressed regular file, or any other regular file on +// disk. If the filename is a directory, its contents are accessed directly from +// the device's file system. If the filename is an archive file, the contents can +// be accessed like a normal directory; compressed archive files are transparently +// decompressed as contents are accessed. And if the filename is any other file, it +// is the only file in the returned file system; if the file is compressed, it is +// transparently decompressed when read from. // -// This method essentially offers uniform read access to various kinds of files: -// directories, archives, compressed archives, and individual files are all treated -// the same way. +// If a stream is specified, the filename (if available) is used as a hint to help +// identify its format. Streams of archive files must be able to be made into an +// io.SectionReader (for safe concurrency) which requires io.ReaderAt and io.Seeker +// (to efficiently determine size). The automatic format identification requires +// io.Reader and will use io.Seeker if supported to avoid buffering. 
// -// Except for zip files, the returned FS values are guaranteed to be fs.ReadDirFS and -// fs.StatFS types, and may also be fs.SubFS. -func FileSystem(ctx context.Context, root string) (fs.FS, error) { - info, err := os.Stat(root) - if err != nil { - return nil, err - } +// Whether the data comes from disk or a stream, it is peeked at to automatically +// detect which format to use. +// +// This function essentially offers uniform read access to various kinds of files: +// directories, archives, compressed archives, individual files, and file streams +// are all treated the same way. +// +// NOTE: The performance of compressed tar archives is not great due to overhead +// with decompression. However, the fs.WalkDir() use case has been optimized to +// create an index on first call to ReadDir(). +func FileSystem(ctx context.Context, filename string, stream ReaderAtSeeker) (fs.FS, error) { + if filename == "" && stream == nil { + return nil, errors.New("no input") + } + + // if an input stream is specified, we'll use that for identification + // and for ArchiveFS (if it's an archive); but if not, we'll open the + // file and read it for identification, but in that case we won't want + // to also use it for the ArchiveFS (because we need to close what we + // opened, and ArchiveFS opens its own files), hence this separate var + idStream := stream + + // if input is only a filename (no stream), check if it's a directory; + // if not, open it so we can determine which format to use (filename + // is not always a good indicator of file format) + if filename != "" && stream == nil { + info, err := os.Stat(filename) + if err != nil { + return nil, err + } - // real folders can be accessed easily - if info.IsDir() { - return DirFS(root), nil - } + // real folders can be accessed easily + if info.IsDir() { + return os.DirFS(filename), nil + } - // if any archive formats recognize this file, access it like a folder - file, err := os.Open(root) - if err != nil { - return 
nil, err + // if any archive formats recognize this file, access it like a folder + file, err := os.Open(filename) + if err != nil { + return nil, err + } + defer file.Close() + idStream = file // use file for format identification only } - defer file.Close() - format, _, err := Identify(filepath.Base(root), file) - if err != nil && !errors.Is(err, ErrNoMatch) { - return nil, err + // normally, callers should use the Reader value returned from Identify, but + // our input is a Seeker, so we know the original input value gets returned + format, _, err := Identify(ctx, filepath.Base(filename), idStream) + if errors.Is(err, NoMatch) { + return FileFS{Path: filename}, nil // must be an ordinary file + } + if err != nil { + return nil, fmt.Errorf("identify format: %w", err) } - if format != nil { - switch ff := format.(type) { - case Zip: - // zip.Reader is more performant than ArchiveFS, because zip.Reader caches content information - // and zip.Reader can open several content files concurrently because of io.ReaderAt requirement - // while ArchiveFS can't. - // zip.Reader doesn't suffer from issue #330 and #310 according to local test (but they should be fixed anyway) - - // open the file anew, as our original handle will be closed when we return - file, err := os.Open(root) - if err != nil { - return nil, err - } - return zip.NewReader(file, info.Size()) - case Archival: - // TODO: we only really need Extractor and Decompressor here, not the combined interfaces... 
- return ArchiveFS{Path: root, Format: ff, Context: ctx}, nil - case Compression: - return FileFS{Path: root, Compression: ff}, nil + switch fileFormat := format.(type) { + case Extractor: + // if no stream was input, return an ArchiveFS that relies on the filepath + if stream == nil { + return &ArchiveFS{Path: filename, Format: fileFormat, Context: ctx}, nil } - } - // otherwise consider it an ordinary file; make a file system with it as its only file - return FileFS{Path: root}, nil -} + // otherwise, if a stream was input, return an ArchiveFS that relies on that -// DirFS allows accessing a directory on disk with a consistent file system interface. -// It is almost the same as os.DirFS, except for some reason os.DirFS only implements -// Open() and Stat(), but we also need ReadDir(). Seems like an obvious miss (as of Go 1.17) -// and I have questions: https://twitter.com/mholt6/status/1476058551432876032 -type DirFS string + // determine size -- we know that the stream value we get back from + // Identify is the same type as what we input because it is a Seeker + size, err := stream.Seek(0, io.SeekEnd) + if err != nil { + return nil, fmt.Errorf("seeking for size: %w", err) + } + _, err = stream.Seek(0, io.SeekStart) + if err != nil { + return nil, fmt.Errorf("seeking back to beginning: %w", err) + } -// Open opens the named file. -func (f DirFS) Open(name string) (fs.File, error) { - if err := f.checkName(name, "open"); err != nil { - return nil, err - } - return os.Open(filepath.Join(string(f), name)) -} + sr := io.NewSectionReader(stream, 0, size) -// ReadDir returns a listing of all the files in the named directory. -func (f DirFS) ReadDir(name string) ([]fs.DirEntry, error) { - if err := f.checkName(name, "readdir"); err != nil { - return nil, err - } - return os.ReadDir(filepath.Join(string(f), name)) -} + return &ArchiveFS{Stream: sr, Format: fileFormat, Context: ctx}, nil -// Stat returns info about the named file. 
-func (f DirFS) Stat(name string) (fs.FileInfo, error) { - if err := f.checkName(name, "stat"); err != nil { - return nil, err + case Compression: + return FileFS{Path: filename, Compression: fileFormat}, nil } - return os.Stat(filepath.Join(string(f), name)) -} -// Sub returns an FS corresponding to the subtree rooted at dir. -func (f DirFS) Sub(dir string) (fs.FS, error) { - if err := f.checkName(dir, "sub"); err != nil { - return nil, err - } - info, err := f.Stat(dir) - if err != nil { - return nil, err - } - if !info.IsDir() { - return nil, fmt.Errorf("%s is not a directory", dir) - } - return DirFS(filepath.Join(string(f), dir)), nil + return nil, fmt.Errorf("unable to create file system rooted at %s due to unsupported file or folder type", filename) } -// checkName returns an error if name is not a valid path according to the docs of -// the io/fs package, with an extra cue taken from the standard lib's implementation -// of os.dirFS.Open(), which checks for invalid characters in Windows paths. -func (f DirFS) checkName(name, op string) error { - if !fs.ValidPath(name) || runtime.GOOS == "windows" && strings.ContainsAny(name, `\:`) { - return &fs.PathError{Op: op, Path: name, Err: fs.ErrInvalid} - } - return nil +// ReaderAtSeeker is a type that can read, read at, and seek. +// os.File and io.SectionReader both implement this interface. +type ReaderAtSeeker interface { + io.Reader + io.ReaderAt + io.Seeker } // FileFS allows accessing a file on disk using a consistent file system interface. @@ -169,7 +158,15 @@ func (f FileFS) Open(name string) (fs.File, error) { if err != nil { return nil, err } - return compressedFile{file, r}, nil + return compressedFile{r, closeBoth{file, r}}, nil +} + +// Stat stats the named file, which must be the file used to create the file system. 
+func (f FileFS) Stat(name string) (fs.FileInfo, error) { + if err := f.checkName(name, "stat"); err != nil { + return nil, err + } + return os.Stat(f.Path) } // ReadDir returns a directory listing with the file as the singular entry. @@ -184,23 +181,18 @@ func (f FileFS) ReadDir(name string) ([]fs.DirEntry, error) { return []fs.DirEntry{fs.FileInfoToDirEntry(info)}, nil } -// Stat stats the named file, which must be the file used to create the file system. -func (f FileFS) Stat(name string) (fs.FileInfo, error) { - if err := f.checkName(name, "stat"); err != nil { - return nil, err - } - return os.Stat(f.Path) -} - // checkName ensures the name is a valid path and also, in the case of // the FileFS, that it is either ".", the filename originally passed in // to create the FileFS, or the base of the filename (name without path). // Other names do not make sense for a FileFS since the FS is only 1 file. func (f FileFS) checkName(name, op string) error { + if name == f.Path { + return nil + } if !fs.ValidPath(name) { - return &fs.PathError{Op: "open", Path: name, Err: fs.ErrInvalid} + return &fs.PathError{Op: op, Path: name, Err: fs.ErrInvalid} } - if name != "." && name != f.Path && name != filepath.Base(f.Path) { + if name != "." && name != filepath.Base(f.Path) { return &fs.PathError{Op: op, Path: name, Err: fs.ErrNotExist} } return nil @@ -210,50 +202,66 @@ func (f FileFS) checkName(name, op string) error { // from a decompression reader, and which closes both // that reader and the underlying file. 
type compressedFile struct { - *os.File - decomp io.ReadCloser + io.Reader // decompressor + closeBoth // file and decompressor } -func (cf compressedFile) Read(p []byte) (int, error) { return cf.decomp.Read(p) } -func (cf compressedFile) Close() error { - err := cf.File.Close() - err2 := cf.decomp.Close() - if err2 != nil && err == nil { - err = err2 - } - return err -} - -// ArchiveFS allows accessing an archive (or a compressed archive) using a +// ArchiveFS allows reading an archive (or a compressed archive) using a // consistent file system interface. Essentially, it allows traversal and // reading of archive contents the same way as any normal directory on disk. // The contents of compressed archives are transparently decompressed. // -// A valid ArchiveFS value must set either Path or Stream. If Path is set, -// a literal file will be opened from the disk. If Stream is set, new -// SectionReaders will be implicitly created to access the stream, enabling -// safe, concurrent access. +// A valid ArchiveFS value must set either Path or Stream, but not both. +// If Path is set, a literal file will be opened from the disk. +// If Stream is set, new SectionReaders will be implicitly created to +// access the stream, enabling safe, concurrent access. // // NOTE: Due to Go's file system APIs (see package io/fs), the performance -// of ArchiveFS when used with fs.WalkDir() is poor for archives with lots -// of files (see issue #326). The fs.WalkDir() API requires listing each -// directory's contents in turn, and the only way to ensure we return the -// complete list of folder contents is to traverse the whole archive and -// build a slice; so if this is done for the root of an archive with many -// files, performance tends toward O(n^2) as the entire archive is walked -// for every folder that is enumerated (WalkDir calls ReadDir recursively). 
-// If you do not need each directory's contents walked in order, please -// prefer calling Extract() from an archive type directly; this will perform -// a O(n) walk of the contents in archive order, rather than the slower -// directory tree order. +// of ArchiveFS can suffer when using fs.WalkDir(). To mitigate this, +// an optimized fs.ReadDirFS has been implemented that indexes the entire +// archive on the first call to ReadDir() (since the entire archive needs +// to be walked for every call to ReadDir() anyway, as archive contents are +// often unordered). The first call to ReadDir(), i.e. near the start of the +// walk, will be slow for large archives, but should be instantaneous after. +// If you don't care about walking a file system in directory order, consider +// calling Extract() on the underlying archive format type directly, which +// walks the archive in entry order, without needing to do any sorting. +// +// Note that fs.FS implementations, including this one, reject paths starting +// with "./". This can be problematic sometimes, as it is not uncommon for +// tarballs to contain a top-level/root directory literally named ".", which +// can happen if a tarball is created in the same directory it is archiving. +// The underlying Extract() calls are faithful to entries with this name, +// but file systems have certain semantics around "." that restrict its use. +// For example, a file named "." cannot be created on a real file system +// because it is a special name that means "current directory". +// +// We had to decide whether to honor the true name in the archive, or honor +// file system semantics. Given that this is a virtual file system and other +// code using the fs.FS APIs will trip over a literal directory named ".", +// we choose to honor file system semantics. Files named "." are ignored; +// directories with this name are effectively transparent; their contents +// get promoted up a directory/level. This means a file at "./x" where "." 
+// is a literal directory name, its name will be passed in as "x" in +// WalkDir callbacks. If you need the raw, uninterpreted values from an +// archive, use the formats' Extract() method directly. See +// https://github.com/golang/go/issues/70155 for a little more background. +// +// This does have one negative edge case... a tar containing contents like +// [x . ./x] will have a conflict on the file named "x" because "./x" will +// also be accessed with the name of "x". type ArchiveFS struct { // set one of these Path string // path to the archive file on disk, or... Stream *io.SectionReader // ...stream from which to read archive - Format Archival // the archive format + Format Extractor // the archive format Prefix string // optional subdirectory in which to root the fs - Context context.Context // optional + Context context.Context // optional; mainly for cancellation + + // amortizing cache speeds up walks (esp. ReadDir) + contents map[string]fs.FileInfo + dirs map[string][]fs.DirEntry } // context always return a context, preferring f.Context if not nil. @@ -268,12 +276,33 @@ func (f ArchiveFS) context() context.Context { // the archive file itself will be opened as a directory file. 
func (f ArchiveFS) Open(name string) (fs.File, error) { if !fs.ValidPath(name) { - return nil, &fs.PathError{Op: "open", Path: name, Err: fs.ErrInvalid} + return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("%w: %s", fs.ErrInvalid, name)} + } + + // apply prefix if fs is rooted in a subtree + name = path.Join(f.Prefix, name) + + // if we've already indexed the archive, we can know quickly if the file doesn't exist, + // and we can also return directory files with their entries instantly + if f.contents != nil { + if info, found := f.contents[name]; found { + if info.IsDir() { + if entries, ok := f.dirs[name]; ok { + return &dirFile{info: info, entries: entries}, nil + } + } + } else { + if entries, found := f.dirs[name]; found { + return &dirFile{info: implicitDirInfo{implicitDirEntry{name}}, entries: entries}, nil + } + return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("open %s: %w", name, fs.ErrNotExist)} + } } - var archiveFile fs.File + // if a filename is specified, open the archive file + var archiveFile *os.File var err error - if f.Path != "" { + if f.Stream == nil { archiveFile, err = os.Open(f.Path) if err != nil { return nil, err @@ -286,210 +315,133 @@ func (f ArchiveFS) Open(name string) (fs.File, error) { archiveFile.Close() } }() - } else if f.Stream != nil { - archiveFile = fakeArchiveFile{} + } else if f.Stream == nil { + return nil, fmt.Errorf("no input; one of Path or Stream must be set") } - // apply prefix if fs is rooted in a subtree - name = path.Join(f.Prefix, name) - // handle special case of opening the archive root - if name == "." && archiveFile != nil { - archiveInfo, err := archiveFile.Stat() + if name == "." 
{ + var archiveInfo fs.FileInfo + if archiveFile != nil { + archiveInfo, err = archiveFile.Stat() + if err != nil { + return nil, err + } + } else { + archiveInfo = implicitDirInfo{ + implicitDirEntry{"."}, + } + } + var entries []fs.DirEntry + entries, err = f.ReadDir(name) if err != nil { return nil, err } - entries, err := f.ReadDir(name) - if err != nil { + if err := archiveFile.Close(); err != nil { return nil, err } return &dirFile{ - extractedFile: extractedFile{ - File: File{ - FileInfo: dirFileInfo{archiveInfo}, - NameInArchive: ".", - }, - }, + info: dirFileInfo{archiveInfo}, entries: entries, }, nil } - var ( - files []File - found bool - ) - // collect them all or stop at exact file match, note we don't stop at folder match - handler := func(_ context.Context, file File) error { - file.NameInArchive = strings.Trim(file.NameInArchive, "/") - files = append(files, file) - if file.NameInArchive == name && !file.IsDir() { - found = true - return errStopWalk - } - return nil - } - var inputStream io.Reader if f.Stream == nil { - // when the archive file is closed, any (soon-to-be) associated decompressor should also be closed; see #365 - archiveFile = &closeBoth{File: archiveFile} inputStream = archiveFile } else { inputStream = io.NewSectionReader(f.Stream, 0, f.Stream.Size()) } - err = f.Format.Extract(f.context(), inputStream, []string{name}, handler) - if found { - err = nil - } - if err != nil { - return nil, err - } - - if len(files) == 0 { - return nil, fs.ErrNotExist + var decompressor io.ReadCloser + if caf, ok := f.Format.(CompressedArchive); ok { + if caf.Compression != nil { + decompressor, err = caf.Compression.OpenReader(inputStream) + if err != nil { + return nil, err + } + inputStream = decompressor + } } - // exactly one or exact file found, test name match to detect implicit dir name https://github.com/mholt/archiver/issues/340 - if (len(files) == 1 && files[0].NameInArchive == name) || found { - file := files[len(files)-1] - if 
file.IsDir() { - return &dirFile{extractedFile: extractedFile{File: file}}, nil + // prepare the handler that we'll need if we have to iterate the + // archive to find the file being requested + var fsFile fs.File + handler := func(ctx context.Context, file FileInfo) error { + if err := ctx.Err(); err != nil { + return err } - // if named file is not a regular file, it can't be opened - if !file.Mode().IsRegular() { - return extractedFile{File: file}, nil + // paths in archives can't necessarily be trusted; also clean up any "./" prefix + file.NameInArchive = path.Clean(file.NameInArchive) + + if !strings.HasPrefix(file.NameInArchive, name) { + return nil } - // regular files can be read, so open it for reading - rc, err := file.Open() - if err != nil { - return nil, err + // if this is the requested file, and it's a directory, set up the dirFile, + // which will include a listing of all its contents as we continue the walk + if file.NameInArchive == name && file.IsDir() { + fsFile = &dirFile{info: file} // will fill entries slice as we continue the walk + return nil } - return extractedFile{File: file, ReadCloser: rc, parentArchive: archiveFile}, nil - } - // implicit files - files = fillImplicit(files) - file, foundFile := search(name, files) - if !foundFile { - return nil, fs.ErrNotExist - } + // if the named file was a directory and we are filling its entries, + // add this entry to the list + if df, ok := fsFile.(*dirFile); ok { + df.entries = append(df.entries, fs.FileInfoToDirEntry(file)) - if file.IsDir() { - return &dirFile{extractedFile: extractedFile{File: file}, entries: openReadDir(name, files)}, nil - } + // don't traverse into subfolders + if file.IsDir() { + return fs.SkipDir + } - // very unlikely - // maybe just panic, because extractor already walk through all the entries, file is impossible to read - // unless it's from a zip file. 
+ return nil + } - // if named file is not a regular file, it can't be opened - if !file.Mode().IsRegular() { - return extractedFile{File: file}, nil - } + innerFile, err := file.Open() + if err != nil { + return err + } - // regular files can be read, so open it for reading - rc, err := file.Open() - if err != nil { - return nil, err - } - return extractedFile{File: file, ReadCloser: rc, parentArchive: archiveFile}, nil -} + fsFile = closeBoth{File: innerFile, c: archiveFile} -// copy of the same function from zip -func split(name string) (dir, elem string, isDir bool) { - if name[len(name)-1] == '/' { - isDir = true - name = name[:len(name)-1] - } - i := len(name) - 1 - for i >= 0 && name[i] != '/' { - i-- - } - if i < 0 { - return ".", name, isDir + if decompressor != nil { + fsFile = closeBoth{fsFile, decompressor} + } + + return fs.SkipAll } - return name[:i], name[i+1:], isDir -} -// modified from zip.Reader initFileList, it's used to find all implicit dirs -func fillImplicit(files []File) []File { - dirs := make(map[string]bool) - knownDirs := make(map[string]bool) - entries := make([]File, 0) - for _, file := range files { - for dir := path.Dir(file.NameInArchive); dir != "."; dir = path.Dir(dir) { - dirs[dir] = true - } - entries = append(entries, file) - if file.IsDir() { - knownDirs[file.NameInArchive] = true - } + // when we start the walk, we pass in a nil list of files to extract, since + // files may have a "." component in them, and the underlying format doesn't + // know about our file system semantics, so we need to filter ourselves (it's + // not significantly less efficient). + if caf, ok := f.Format.(CompressedArchive); ok { + // bypass the CompressedArchive format's opening of the decompressor, since + // we already did it, since we need to keep it open after returning + // "I BYPASSED THE COMPRESSOR!" 
-Rey + err = caf.Archival.Extract(f.context(), inputStream, nil, handler) + } else { + err = f.Format.Extract(f.context(), inputStream, nil, handler) } - for dir := range dirs { - if !knownDirs[dir] { - entries = append(entries, File{FileInfo: implicitDirInfo{implicitDirEntry{path.Base(dir)}}, NameInArchive: dir}) - } + if err != nil { + return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("extract: %w", err)} } - - sort.Slice(entries, func(i, j int) bool { - fi, fj := entries[i], entries[j] - di, ei, _ := split(fi.NameInArchive) - dj, ej, _ := split(fj.NameInArchive) - - if di != dj { - return di < dj - } - return ei < ej - }) - return entries -} - -// modified from zip.Reader openLookup -func search(name string, entries []File) (File, bool) { - dir, elem, _ := split(name) - i := sort.Search(len(entries), func(i int) bool { - idir, ielem, _ := split(entries[i].NameInArchive) - return idir > dir || idir == dir && ielem >= elem - }) - if i < len(entries) { - fname := entries[i].NameInArchive - if fname == name || len(fname) == len(name)+1 && fname[len(name)] == '/' && fname[:len(name)] == name { - return entries[i], true - } + if fsFile == nil { + return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("open %s: %w", name, fs.ErrNotExist)} } - return File{}, false -} -// modified from zip.Reader openReadDir -func openReadDir(dir string, entries []File) []fs.DirEntry { - i := sort.Search(len(entries), func(i int) bool { - idir, _, _ := split(entries[i].NameInArchive) - return idir >= dir - }) - j := sort.Search(len(entries), func(j int) bool { - jdir, _, _ := split(entries[j].NameInArchive) - return jdir > dir - }) - dirs := make([]fs.DirEntry, j-i) - for idx := range dirs { - dirs[idx] = fs.FileInfoToDirEntry(entries[i+idx]) - } - return dirs + return fsFile, nil } // Stat stats the named file from within the archive. If name is "." then // the archive file itself is statted and treated as a directory file. 
func (f ArchiveFS) Stat(name string) (fs.FileInfo, error) { if !fs.ValidPath(name) { - return nil, &fs.PathError{Op: "stat", Path: name, Err: fs.ErrInvalid} + return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("%s: %w", name, fs.ErrInvalid)} } - // apply prefix if fs is rooted in a subtree - name = path.Join(f.Prefix, name) - if name == "." { if f.Path != "" { fileInfo, err := os.Stat(f.Path) @@ -502,6 +454,17 @@ func (f ArchiveFS) Stat(name string) (fs.FileInfo, error) { } } + // apply prefix if fs is rooted in a subtree + name = path.Join(f.Prefix, name) + + // if archive has already been indexed, simply use it + if f.contents != nil { + if info, ok := f.contents[name]; ok { + return info, nil + } + return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("stat %s: %w", name, fs.ErrNotExist)} + } + var archiveFile *os.File var err error if f.Stream == nil { @@ -512,16 +475,14 @@ func (f ArchiveFS) Stat(name string) (fs.FileInfo, error) { defer archiveFile.Close() } - var ( - files []File - found bool - ) - handler := func(_ context.Context, file File) error { - file.NameInArchive = strings.Trim(file.NameInArchive, "/") - files = append(files, file) - if file.NameInArchive == name { - found = true - return errStopWalk + var result FileInfo + handler := func(ctx context.Context, file FileInfo) error { + if err := ctx.Err(); err != nil { + return err + } + if path.Clean(file.NameInArchive) == name { + result = file + return fs.SkipAll } return nil } @@ -529,33 +490,38 @@ func (f ArchiveFS) Stat(name string) (fs.FileInfo, error) { if f.Stream != nil { inputStream = io.NewSectionReader(f.Stream, 0, f.Stream.Size()) } - err = f.Format.Extract(f.context(), inputStream, []string{name}, handler) - if found { - err = nil - } - if err != nil { + err = f.Format.Extract(f.context(), inputStream, nil, handler) + if err != nil && result.FileInfo == nil { return nil, err } - - // exactly one or exact file found, test name match to detect implicit dir name 
https://github.com/mholt/archiver/issues/340 - if (len(files) == 1 && files[0].NameInArchive == name) || found { - return files[len(files)-1].FileInfo, nil - } - - files = fillImplicit(files) - file, found := search(name, files) - if !found { + if result.FileInfo == nil { return nil, fs.ErrNotExist } - return file.FileInfo, nil + return result.FileInfo, nil } -// ReadDir reads the named directory from within the archive. -func (f ArchiveFS) ReadDir(name string) ([]fs.DirEntry, error) { +// ReadDir reads the named directory from within the archive. If name is "." +// then the root of the archive content is listed. +func (f *ArchiveFS) ReadDir(name string) ([]fs.DirEntry, error) { if !fs.ValidPath(name) { return nil, &fs.PathError{Op: "readdir", Path: name, Err: fs.ErrInvalid} } + // apply prefix if fs is rooted in a subtree + name = path.Join(f.Prefix, name) + + // fs.WalkDir() calls ReadDir() once per directory, and for archives with + // lots of directories, that is very slow, since we have to traverse the + // entire archive in order to ensure that we got all the entries for a + // directory -- so we can fast-track this lookup if we've done the + // traversal already + if len(f.dirs) > 0 { + return f.dirs[name], nil + } + + f.contents = make(map[string]fs.FileInfo) + f.dirs = make(map[string][]fs.DirEntry) + var archiveFile *os.File var err error if f.Stream == nil { @@ -566,31 +532,72 @@ func (f ArchiveFS) ReadDir(name string) ([]fs.DirEntry, error) { defer archiveFile.Close() } - // apply prefix if fs is rooted in a subtree - name = path.Join(f.Prefix, name) + handler := func(ctx context.Context, file FileInfo) error { + if err := ctx.Err(); err != nil { + return err + } - // collect all files with prefix - var ( - files []File - foundFile bool - ) - handler := func(_ context.Context, file File) error { - file.NameInArchive = strings.Trim(file.NameInArchive, "/") + // can't always trust path names + file.NameInArchive = path.Clean(file.NameInArchive) + + // 
avoid infinite walk; apparently, creating a tar file in the target + // directory may result in an entry called "." in the archive; see #384 if file.NameInArchive == "." { return nil } - files = append(files, file) + + // if the name being requested isn't a directory, return an error similar to + // what most OSes return from the readdir system call when given a non-dir if file.NameInArchive == name && !file.IsDir() { - foundFile = true - return errStopWalk + return &fs.PathError{Op: "readdir", Path: name, Err: errors.New("not a directory")} } - return nil - } - // handle special case of reading from root of archive - var filter []string - if name != "." { - filter = []string{name} + // index this file info for quick access + f.contents[file.NameInArchive] = file + + // this is a real directory; prefer its DirEntry over an implicit/fake one we may have created earlier; + // first try to find if it exists, and if so, replace the value; otherwise insert it in sorted position + if file.IsDir() { + dirEntry := fs.FileInfoToDirEntry(file) + idx, found := slices.BinarySearchFunc(f.dirs[path.Dir(file.NameInArchive)], dirEntry, func(a, b fs.DirEntry) int { + return strings.Compare(a.Name(), b.Name()) + }) + if found { + f.dirs[path.Dir(file.NameInArchive)][idx] = dirEntry + } else { + f.dirs[path.Dir(file.NameInArchive)] = slices.Insert(f.dirs[path.Dir(file.NameInArchive)], idx, dirEntry) + } + } + + // this loop looks like an abomination, but it's really quite simple: we're + // just iterating the directories of the path up to the root; i.e. we lob off + // the base (last component) of the path until no separators remain, i.e. 
only + // one component remains -- then loop again to make sure it's not a duplicate + for dir, base := path.Dir(file.NameInArchive), path.Base(file.NameInArchive); ; dir, base = path.Dir(dir), path.Base(dir) { + if err := ctx.Err(); err != nil { + return err + } + + var dirInfo fs.DirEntry = implicitDirInfo{implicitDirEntry{base}} + + // we are "filling in" any directories that could potentially be only implicit, + // and since a nested directory can have more than 1 item, we need to prevent + // duplication; for example: given a/b/c and a/b/d, we need to avoid adding + // an entry for "b" twice within "a" -- hence we search for it first, and if + // it doesn't already exist, we insert it in sorted position + idx, found := slices.BinarySearchFunc(f.dirs[dir], dirInfo, func(a, b fs.DirEntry) int { + return strings.Compare(a.Name(), b.Name()) + }) + if !found { + f.dirs[dir] = slices.Insert(f.dirs[dir], idx, dirInfo) + } + + if dir == "." { + break + } + } + + return nil } var inputStream io.Reader = archiveFile @@ -598,30 +605,18 @@ func (f ArchiveFS) ReadDir(name string) ([]fs.DirEntry, error) { inputStream = io.NewSectionReader(f.Stream, 0, f.Stream.Size()) } - err = f.Format.Extract(f.context(), inputStream, filter, handler) - if foundFile { - return nil, &fs.PathError{Op: "readdir", Path: name, Err: errors.New("not a dir")} - } + err = f.Format.Extract(f.context(), inputStream, nil, handler) if err != nil { - return nil, err + // these being non-nil implies that we have indexed the archive, + // but if an error occurred, we likely only got part of the way + // through and our index is incomplete, and we'd have to re-walk + // the whole thing anyway; so reset these to nil to avoid bugs + f.dirs = nil + f.contents = nil + return nil, fmt.Errorf("extract: %w", err) } - // always find all implicit directories - files = fillImplicit(files) - // and return early for dot file - if name == "." 
{ - return openReadDir(name, files), nil - } - - file, foundFile := search(name, files) - if !foundFile { - return nil, fs.ErrNotExist - } - - if !file.IsDir() { - return nil, &fs.PathError{Op: "readdir", Path: name, Err: errors.New("not a dir")} - } - return openReadDir(name, files), nil + return f.dirs[name], nil } // Sub returns an FS corresponding to the subtree rooted at dir. @@ -636,6 +631,11 @@ func (f *ArchiveFS) Sub(dir string) (fs.FS, error) { if !info.IsDir() { return nil, fmt.Errorf("%s is not a directory", dir) } + // result is the same as what we're starting with, except + // we indicate a path prefix to be used for all operations; + // the reason we don't append to the Path field directly + // is because the input might be a stream rather than a + // path on disk, and the Prefix field is applied on both result := f result.Prefix = dir return result, nil @@ -705,44 +705,18 @@ func pathWithoutTopDir(fpath string) string { return fpath[slashIdx+1:] } -// errStopWalk is an arbitrary error value, since returning -// any error (other than fs.SkipDir) will stop a walk. We -// use this as we may only want 1 file from an extraction, -// even if that file is a directory and would otherwise be -// traversed during the walk. -var errStopWalk = fmt.Errorf("stop walk") - -type fakeArchiveFile struct{} - -func (f fakeArchiveFile) Stat() (fs.FileInfo, error) { - return implicitDirInfo{ - implicitDirEntry{name: "."}, - }, nil -} -func (f fakeArchiveFile) Read([]byte) (int, error) { return 0, io.EOF } -func (f fakeArchiveFile) Close() error { return nil } - // dirFile implements the fs.ReadDirFile interface. type dirFile struct { - extractedFile - - // TODO: We could probably be more memory-efficient by not loading - // all the entries at once and then "faking" the paging for ReadDir(). 
- // Instead, we could maybe store a reference to the parent archive FS, - // then walk it each time ReadDir is called, skipping entriesRead - // files, then continuing the listing, until n are listed. But that - // might be kinda messy and a lot of work, so I leave it for a future - // optimization if needed. + info fs.FileInfo entries []fs.DirEntry - entriesRead int + entriesRead int // used for paging with ReadDir(n) } -// If this represents the root of the archive, we use the archive's -// FileInfo which says it's a file, not a directory; the whole point -// of this package is to treat the archive as a directory, so always -// return true in our case. -func (dirFile) IsDir() bool { return true } +func (dirFile) Read([]byte) (int, error) { return 0, errors.New("cannot read a directory file") } +func (df dirFile) Stat() (fs.FileInfo, error) { return df.info, nil } +func (dirFile) Close() error { return nil } +// ReadDir implements [fs.ReadDirFile]. func (df *dirFile) ReadDir(n int) ([]fs.DirEntry, error) { if n <= 0 { return df.entries, nil @@ -771,46 +745,14 @@ func (dirFileInfo) Size() int64 { return 0 } func (info dirFileInfo) Mode() fs.FileMode { return info.FileInfo.Mode() | fs.ModeDir } func (dirFileInfo) IsDir() bool { return true } -// extractedFile implements fs.File, thus it represents an "opened" file, -// which is slightly different from our File type which represents a file -// that possibly may be opened. If the file is actually opened, this type -// ensures that the parent archive is closed when this file from within it -// is also closed. -type extractedFile struct { - File - - // Set these fields if a "regular file" which has actual content - // that can be read, i.e. a file that is open for reading. - // ReadCloser should be the file's reader, and parentArchive is - // a reference to the archive the files comes out of. - // If parentArchive is set, it will also be closed along with - // the file when Close() is called. 
+// fileInArchive represents a file that is opened from within an archive. +// It implements fs.File. +type fileInArchive struct { io.ReadCloser - parentArchive io.Closer -} - -// Close closes the the current file if opened and -// the parent archive if specified. This is a no-op -// for directories which do not set those fields. -func (ef extractedFile) Close() error { - if ef.parentArchive != nil { - if err := ef.parentArchive.Close(); err != nil { - return err - } - } - if ef.ReadCloser != nil { - return ef.ReadCloser.Close() - } - return nil + info fs.FileInfo } -// compressorCloser is a type that closes two closers at the same time. -// It only exists to fix #365. If a better solution can be found, I'd -// likely prefer it. -type compressorCloser interface { - io.Closer - closeCompressor(io.Closer) -} +func (af fileInArchive) Stat() (fs.FileInfo, error) { return af.info, nil } // closeBoth closes both the file and an associated // closer, such as a (de)compressor that wraps the @@ -818,28 +760,34 @@ type compressorCloser interface { // better solution is found, I'd probably prefer that. type closeBoth struct { fs.File - c io.Closer + c io.Closer // usually the archive or the decompressor } -// closeCompressor will have the closer closed when the associated File closes. -func (dc *closeBoth) closeCompressor(c io.Closer) { dc.c = c } - // Close closes both the file and the associated closer. It always calls -// Close() on both, but returns only the first error, if any. +// Close() on both, but if multiple errors occur they are wrapped together. 
func (dc closeBoth) Close() error { - err1, err2 := dc.File.Close(), dc.c.Close() - if err1 != nil { - return err1 + var err error + if dc.File != nil { + if err2 := dc.File.Close(); err2 != nil { + err = fmt.Errorf("closing file: %w", err2) + } + } + if dc.c != nil { + if err2 := dc.c.Close(); err2 != nil { + if err == nil { + err = fmt.Errorf("closing closer: %w", err2) + } else { + err = fmt.Errorf("%w; additionally, closing closer: %w", err, err2) + } + } } - return err2 + return err } // implicitDirEntry represents a directory that does // not actually exist in the archive but is inferred // from the paths of actual files in the archive. -type implicitDirEntry struct { - name string -} +type implicitDirEntry struct{ name string } func (e implicitDirEntry) Name() string { return e.name } func (implicitDirEntry) IsDir() bool { return true } @@ -853,28 +801,20 @@ func (e implicitDirEntry) Info() (fs.FileInfo, error) { // not contain actual entries for a directory, but we need to // pretend it exists so its contents can be discovered and // traversed. 
-type implicitDirInfo struct { - implicitDirEntry -} +type implicitDirInfo struct{ implicitDirEntry } func (d implicitDirInfo) Name() string { return d.name } func (implicitDirInfo) Size() int64 { return 0 } func (d implicitDirInfo) Mode() fs.FileMode { return d.Type() } func (implicitDirInfo) ModTime() time.Time { return time.Time{} } -func (implicitDirInfo) Sys() interface{} { return nil } +func (implicitDirInfo) Sys() any { return nil } // Interface guards var ( - _ fs.ReadDirFS = (*DirFS)(nil) - _ fs.StatFS = (*DirFS)(nil) - _ fs.SubFS = (*DirFS)(nil) - _ fs.ReadDirFS = (*FileFS)(nil) _ fs.StatFS = (*FileFS)(nil) _ fs.ReadDirFS = (*ArchiveFS)(nil) _ fs.StatFS = (*ArchiveFS)(nil) _ fs.SubFS = (*ArchiveFS)(nil) - - _ compressorCloser = (*closeBoth)(nil) ) diff --git a/fs_test.go b/fs_test.go index 9180fbf3..5d6a8bd3 100644 --- a/fs_test.go +++ b/fs_test.go @@ -58,13 +58,13 @@ func TestSelfTar(t *testing.T) { fn := "testdata/self-tar.tar" fh, err := os.Open(fn) if err != nil { - t.Fatalf("Could not load test tar: %v", fn) + t.Errorf("Could not load test tar: %v", fn) } fstat, err := os.Stat(fn) if err != nil { - t.Fatalf("Could not stat test tar: %v", fn) + t.Errorf("Could not stat test tar: %v", fn) } - fsys := ArchiveFS{ + fsys := &ArchiveFS{ Stream: io.NewSectionReader(fh, 0, fstat.Size()), Format: Tar{}, } @@ -78,12 +78,12 @@ func TestSelfTar(t *testing.T) { return nil }) if err != nil { - t.Fatal(err) + t.Error(err) } } func ExampleArchiveFS_Stream() { - fsys := ArchiveFS{ + fsys := &ArchiveFS{ Stream: io.NewSectionReader(bytes.NewReader(testZIP), 0, int64(len(testZIP))), Format: Zip{}, } @@ -158,9 +158,7 @@ func TestArchiveFS_ReadDir(t *testing.T) { t.Parallel() fsys := tc.archive for baseDir, wantLS := range tc.want { - baseDir := baseDir - wantLS := wantLS - t.Run(fmt.Sprintf("ReadDir(%s)", baseDir), func(t *testing.T) { + t.Run(fmt.Sprintf("ReadDir(%q)", baseDir), func(t *testing.T) { dis, err := fsys.ReadDir(baseDir) if err != nil { t.Error(err) @@ 
-183,17 +181,18 @@ func TestArchiveFS_ReadDir(t *testing.T) { t.Run(fmt.Sprintf("Open(%s)", baseDir), func(t *testing.T) { f, err := fsys.Open(baseDir) if err != nil { - t.Error(err) + t.Errorf("fsys.Open(%q): %#v %s", baseDir, err, err) + return } rdf, ok := f.(fs.ReadDirFile) if !ok { - t.Fatalf("'%s' did not return a fs.ReadDirFile, %+v", baseDir, rdf) + t.Errorf("fsys.Open(%q) did not return a fs.ReadDirFile, got: %#v", baseDir, f) } dis, err := rdf.ReadDir(-1) if err != nil { - t.Fatal(err) + t.Error(err) } dirs := []string{} diff --git a/go.mod b/go.mod index 3adbffad..0dacae9b 100644 --- a/go.mod +++ b/go.mod @@ -1,26 +1,26 @@ module github.com/mholt/archiver/v4 -go 1.22 +go 1.22.2 -toolchain go1.22.2 +toolchain go1.23.2 require ( - github.com/andybalholm/brotli v1.1.0 + github.com/andybalholm/brotli v1.1.1 github.com/dsnet/compress v0.0.2-0.20230904184137-39efe44ab707 - github.com/klauspost/compress v1.17.8 + github.com/klauspost/compress v1.17.11 github.com/klauspost/pgzip v1.2.6 - github.com/nwaples/rardecode/v2 v2.0.0-beta.3 + github.com/nwaples/rardecode/v2 v2.0.0-beta.4 github.com/therootcompany/xz v1.0.1 github.com/ulikunitz/xz v0.5.12 ) require ( github.com/STARRY-S/zip v0.1.0 - github.com/bodgit/sevenzip v1.5.1 + github.com/bodgit/sevenzip v1.5.2 github.com/golang/snappy v0.0.4 github.com/pierrec/lz4/v4 v4.1.21 github.com/sorairolake/lzip-go v0.3.5 - golang.org/x/text v0.16.0 + golang.org/x/text v0.19.0 ) require ( diff --git a/go.sum b/go.sum index e1803b1c..5e844566 100644 --- a/go.sum +++ b/go.sum @@ -19,12 +19,12 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03 github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/STARRY-S/zip v0.1.0 h1:eUER3jKmHKXjv+iy3BekLa+QnNSo1Lqz4eTzYBcGDqo= github.com/STARRY-S/zip v0.1.0/go.mod h1:qj/mTZkvb3AvfGQ2e775/3AODRvB4peSw8KNMvrM8/I= -github.com/andybalholm/brotli v1.1.0 
h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= -github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= +github.com/andybalholm/brotli v1.1.1 h1:PR2pgnyFznKEugtsUo0xLdDop5SKXd5Qf5ysW+7XdTA= +github.com/andybalholm/brotli v1.1.1/go.mod h1:05ib4cKhjx3OQYUY22hTVd34Bc8upXjOLL2rKwwZBoA= github.com/bodgit/plumbing v1.3.0 h1:pf9Itz1JOQgn7vEOE7v7nlEfBykYqvUYioC61TwWCFU= github.com/bodgit/plumbing v1.3.0/go.mod h1:JOTb4XiRu5xfnmdnDJo6GmSbSbtSyufrsyZFByMtKEs= -github.com/bodgit/sevenzip v1.5.1 h1:rVj0baZsooZFy64DJN0zQogPzhPrT8BQ8TTRd1H4WHw= -github.com/bodgit/sevenzip v1.5.1/go.mod h1:Q3YMySuVWq6pyGEolyIE98828lOfEoeWg5zeH6x22rc= +github.com/bodgit/sevenzip v1.5.2 h1:acMIYRaqoHAdeu9LhEGGjL9UzBD4RNf9z7+kWDNignI= +github.com/bodgit/sevenzip v1.5.2/go.mod h1:gTGzXA67Yko6/HLSD0iK4kWaWzPlPmLfDO73jTjSRqc= github.com/bodgit/windows v1.0.1 h1:tF7K6KOluPYygXa3Z2594zxlkbKPAOvqr97etrGNIz4= github.com/bodgit/windows v1.0.1/go.mod h1:a6JLwrB4KrTR5hBpp8FI9/9W9jJfeQ2h4XDXU74ZCdM= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= @@ -84,16 +84,16 @@ github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1 github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= -github.com/klauspost/compress v1.17.8 h1:YcnTYrq7MikUT7k0Yb5eceMmALQPYBW/Xltxn0NAMnU= -github.com/klauspost/compress v1.17.8/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= +github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= +github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= github.com/klauspost/pgzip v1.2.6 
h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU= github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= -github.com/nwaples/rardecode/v2 v2.0.0-beta.3 h1:evQTW0IjM2GAL5AaPHiQrT+laWohkt5zHKA3yCsGQGU= -github.com/nwaples/rardecode/v2 v2.0.0-beta.3/go.mod h1:yntwv/HfMc/Hbvtq9I19D1n58te3h6KsqCf3GxyfBGY= +github.com/nwaples/rardecode/v2 v2.0.0-beta.4 h1:sdiJxQdPjECn2lh9nLFFhgLCf+0ulDU5rODbtERTlUY= +github.com/nwaples/rardecode/v2 v2.0.0-beta.4/go.mod h1:yntwv/HfMc/Hbvtq9I19D1n58te3h6KsqCf3GxyfBGY= github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ= github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= @@ -117,6 +117,8 @@ github.com/therootcompany/xz v1.0.1/go.mod h1:3K3UH1yCKgBneZYhuQUvJ9HPD19UEXEI0B github.com/ulikunitz/xz v0.5.8/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= github.com/ulikunitz/xz v0.5.12 h1:37Nm15o69RwBkXM0J6A5OlE67RZTfzUxTj8fB3dfcsc= github.com/ulikunitz/xz v0.5.12/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= +github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU= +github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= @@ -183,8 +185,8 @@ golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod 
h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= -golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -212,8 +214,8 @@ golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= -golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= +golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= +golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/gz.go b/gz.go index b9873f19..e8b3f98d 100644 --- a/gz.go +++ b/gz.go @@ -2,6 +2,7 @@ package archiver import ( 
"bytes" + "context" "io" "strings" @@ -29,13 +30,13 @@ type Gz struct { Multithreaded bool } -func (Gz) Name() string { return ".gz" } +func (Gz) Extension() string { return ".gz" } -func (gz Gz) Match(filename string, stream io.Reader) (MatchResult, error) { +func (gz Gz) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), gz.Name()) { + if strings.Contains(strings.ToLower(filename), gz.Extension()) { mr.ByName = true } diff --git a/interfaces.go b/interfaces.go index 9a41e1f1..f675f0e2 100644 --- a/interfaces.go +++ b/interfaces.go @@ -5,10 +5,12 @@ import ( "io" ) -// Format represents either an archive or compression format. +// Format represents a way of getting data out of something else. +// A format usually represents compression or an archive (or both). type Format interface { - // Name returns the name of the format. - Name() string + // Extension returns the conventional file extension for this + // format. + Extension() string // Match returns true if the given name/stream is recognized. // One of the arguments is optional: filename might be empty @@ -21,7 +23,7 @@ type Format interface { // preserve the stream through matching, you should either // buffer what is read by Match, or seek to the last position // before Match was called. - Match(filename string, stream io.Reader) (MatchResult, error) + Match(ctx context.Context, filename string, stream io.Reader) (MatchResult, error) } // Compression is a compression format with both compress and decompress methods. @@ -57,13 +59,13 @@ type Archiver interface { // Archive writes an archive file to output with the given files. // // Context cancellation must be honored. 
- Archive(ctx context.Context, output io.Writer, files []File) error + Archive(ctx context.Context, output io.Writer, files []FileInfo) error } // ArchiveAsyncJob contains a File to be archived and a channel that // the result of the archiving should be returned on. type ArchiveAsyncJob struct { - File File + File FileInfo Result chan<- error } @@ -83,14 +85,20 @@ type ArchiverAsync interface { // Extractor can extract files from an archive. type Extractor interface { - // Extract reads the files at pathsInArchive from sourceArchive. + // Extract walks entries in the archive and calls handleFile for each + // entry that matches the pathsInArchive filter by path/name. + // // If pathsInArchive is nil, all files are extracted without discretion. // If pathsInArchive is empty, no files are extracted. // If a path refers to a directory, all files within it are extracted. // Extracted files are passed to the handleFile callback for handling. // + // Any files opened in the FileHandler should be closed when it returns, + // as there is no guarantee the files can be read outside the handler + // or after the walk has proceeded to the next file. + // // Context cancellation must be honored. - Extract(ctx context.Context, sourceArchive io.Reader, pathsInArchive []string, handleFile FileHandler) error + Extract(ctx context.Context, archive io.Reader, pathsInArchive []string, handleFile FileHandler) error } // Inserter can insert files into an existing archive. @@ -99,5 +107,5 @@ type Inserter interface { // Insert inserts the files into archive. // // Context cancellation must be honored. 
- Insert(ctx context.Context, archive io.ReadWriteSeeker, files []File) error + Insert(ctx context.Context, archive io.ReadWriteSeeker, files []FileInfo) error } diff --git a/lz4.go b/lz4.go index aaa22a54..7425ad2a 100644 --- a/lz4.go +++ b/lz4.go @@ -2,6 +2,7 @@ package archiver import ( "bytes" + "context" "io" "strings" @@ -17,13 +18,13 @@ type Lz4 struct { CompressionLevel int } -func (Lz4) Name() string { return ".lz4" } +func (Lz4) Extension() string { return ".lz4" } -func (lz Lz4) Match(filename string, stream io.Reader) (MatchResult, error) { +func (lz Lz4) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), lz.Name()) { + if strings.Contains(strings.ToLower(filename), lz.Extension()) { mr.ByName = true } diff --git a/lzip.go b/lzip.go index a861a487..1cbffa50 100644 --- a/lzip.go +++ b/lzip.go @@ -2,6 +2,7 @@ package archiver import ( "bytes" + "context" "io" "path/filepath" "strings" @@ -16,13 +17,13 @@ func init() { // Lzip facilitates lzip compression. 
type Lzip struct{} -func (Lzip) Name() string { return ".lz" } +func (Lzip) Extension() string { return ".lz" } -func (lz Lzip) Match(filename string, stream io.Reader) (MatchResult, error) { +func (lz Lzip) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if filepath.Ext(strings.ToLower(filename)) == lz.Name() { + if filepath.Ext(strings.ToLower(filename)) == lz.Extension() { mr.ByName = true } diff --git a/rar.go b/rar.go index ed0099f6..bece6071 100644 --- a/rar.go +++ b/rar.go @@ -30,13 +30,13 @@ type Rar struct { Password string } -func (Rar) Name() string { return ".rar" } +func (Rar) Extension() string { return ".rar" } -func (r Rar) Match(filename string, stream io.Reader) (MatchResult, error) { +func (r Rar) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), r.Name()) { + if strings.Contains(strings.ToLower(filename), r.Extension()) { mr.ByName = true } @@ -57,7 +57,7 @@ func (r Rar) Match(filename string, stream io.Reader) (MatchResult, error) { } // Archive is not implemented for RAR, but the method exists so that Rar satisfies the ArchiveFormat interface. 
-func (r Rar) Archive(_ context.Context, _ io.Writer, _ []File) error { +func (r Rar) Archive(_ context.Context, _ io.Writer, _ []FileInfo) error { return fmt.Errorf("not implemented because RAR is a proprietary format") } @@ -98,11 +98,14 @@ func (r Rar) Extract(ctx context.Context, sourceArchive io.Reader, pathsInArchiv continue } - file := File{ - FileInfo: rarFileInfo{hdr}, + info := rarFileInfo{hdr} + file := FileInfo{ + FileInfo: info, Header: hdr, NameInArchive: hdr.Name, - Open: func() (io.ReadCloser, error) { return io.NopCloser(rr), nil }, + Open: func() (fs.File, error) { + return fileInArchive{io.NopCloser(rr), info}, nil + }, } err = handleFile(ctx, file) @@ -133,7 +136,7 @@ func (rfi rarFileInfo) Size() int64 { return rfi.fh.UnPackedSize } func (rfi rarFileInfo) Mode() os.FileMode { return rfi.fh.Mode() } func (rfi rarFileInfo) ModTime() time.Time { return rfi.fh.ModificationTime } func (rfi rarFileInfo) IsDir() bool { return rfi.fh.IsDir } -func (rfi rarFileInfo) Sys() interface{} { return nil } +func (rfi rarFileInfo) Sys() any { return nil } var ( rarHeaderV1_5 = []byte("Rar!\x1a\x07\x00") // v1.5 diff --git a/sz.go b/sz.go index 9d10604a..8a926b7f 100644 --- a/sz.go +++ b/sz.go @@ -2,6 +2,7 @@ package archiver import ( "bytes" + "context" "io" "strings" @@ -15,13 +16,13 @@ func init() { // Sz facilitates Snappy compression. 
type Sz struct{} -func (sz Sz) Name() string { return ".sz" } +func (sz Sz) Extension() string { return ".sz" } -func (sz Sz) Match(filename string, stream io.Reader) (MatchResult, error) { +func (sz Sz) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), sz.Name()) { + if strings.Contains(strings.ToLower(filename), sz.Extension()) { mr.ByName = true } diff --git a/tar.go b/tar.go index 0db0a665..d4106257 100644 --- a/tar.go +++ b/tar.go @@ -26,13 +26,13 @@ type Tar struct { ContinueOnError bool } -func (Tar) Name() string { return ".tar" } +func (Tar) Extension() string { return ".tar" } -func (t Tar) Match(filename string, stream io.Reader) (MatchResult, error) { +func (t Tar) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), t.Name()) { + if strings.Contains(strings.ToLower(filename), t.Extension()) { mr.ByName = true } @@ -46,7 +46,7 @@ func (t Tar) Match(filename string, stream io.Reader) (MatchResult, error) { return mr, nil } -func (t Tar) Archive(ctx context.Context, output io.Writer, files []File) error { +func (t Tar) Archive(ctx context.Context, output io.Writer, files []FileInfo) error { tw := tar.NewWriter(output) defer tw.Close() @@ -74,7 +74,7 @@ func (t Tar) ArchiveAsync(ctx context.Context, output io.Writer, jobs <-chan Arc return nil } -func (t Tar) writeFileToArchive(ctx context.Context, tw *tar.Writer, file File) error { +func (t Tar) writeFileToArchive(ctx context.Context, tw *tar.Writer, file FileInfo) error { if err := ctx.Err(); err != nil { return err // honor context cancellation } @@ -109,7 +109,7 @@ func (t Tar) writeFileToArchive(ctx context.Context, tw *tar.Writer, file File) return nil } -func (t Tar) Insert(ctx context.Context, into io.ReadWriteSeeker, files []File) error { +func (t Tar) Insert(ctx 
context.Context, into io.ReadWriteSeeker, files []FileInfo) error { // Tar files may end with some, none, or a lot of zero-byte padding. The spec says // it should end with two 512-byte trailer records consisting solely of null/0 // bytes: https://www.gnu.org/software/tar/manual/html_node/Standard.html. However, @@ -212,16 +212,25 @@ func (t Tar) Extract(ctx context.Context, sourceArchive io.Reader, pathsInArchiv continue } - file := File{ - FileInfo: hdr.FileInfo(), + info := hdr.FileInfo() + file := FileInfo{ + FileInfo: info, Header: hdr, NameInArchive: hdr.Name, LinkTarget: hdr.Linkname, - Open: func() (io.ReadCloser, error) { return io.NopCloser(tr), nil }, + Open: func() (fs.File, error) { + return fileInArchive{io.NopCloser(tr), info}, nil + }, } err = handleFile(ctx, file) if errors.Is(err, fs.SkipAll) { + // At first, I wasn't sure if fs.SkipAll implied that the rest of the entries + // should still be iterated and just "skipped" (i.e. no-ops) or if the walk + // should stop; both have the same net effect, one is just less efficient... + // apparently the name of fs.StopWalk was the preferred name, but it still + // became fs.SkipAll because of semantics with documentation; see + // https://github.com/golang/go/issues/47209 -- anyway, the walk should stop. break } else if errors.Is(err, fs.SkipDir) { // if a directory, skip this path; if a file, skip the folder path diff --git a/xz.go b/xz.go index 4e1b6b41..edb61373 100644 --- a/xz.go +++ b/xz.go @@ -2,6 +2,7 @@ package archiver import ( "bytes" + "context" "io" "strings" @@ -16,13 +17,13 @@ func init() { // Xz facilitates xz compression. 
type Xz struct{} -func (Xz) Name() string { return ".xz" } +func (Xz) Extension() string { return ".xz" } -func (x Xz) Match(filename string, stream io.Reader) (MatchResult, error) { +func (x Xz) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), x.Name()) { + if strings.Contains(strings.ToLower(filename), x.Extension()) { mr.ByName = true } diff --git a/zip.go b/zip.go index 0a4d04a8..c012c080 100644 --- a/zip.go +++ b/zip.go @@ -83,13 +83,13 @@ type Zip struct { TextEncoding string } -func (z Zip) Name() string { return ".zip" } +func (z Zip) Extension() string { return ".zip" } -func (z Zip) Match(filename string, stream io.Reader) (MatchResult, error) { +func (z Zip) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), z.Name()) { + if strings.Contains(strings.ToLower(filename), z.Extension()) { mr.ByName = true } @@ -103,7 +103,7 @@ func (z Zip) Match(filename string, stream io.Reader) (MatchResult, error) { return mr, nil } -func (z Zip) Archive(ctx context.Context, output io.Writer, files []File) error { +func (z Zip) Archive(ctx context.Context, output io.Writer, files []FileInfo) error { zw := zip.NewWriter(output) defer zw.Close() @@ -129,7 +129,7 @@ func (z Zip) ArchiveAsync(ctx context.Context, output io.Writer, jobs <-chan Arc return nil } -func (z Zip) archiveOneFile(ctx context.Context, zw *zip.Writer, idx int, file File) error { +func (z Zip) archiveOneFile(ctx context.Context, zw *zip.Writer, idx int, file FileInfo) error { if err := ctx.Err(); err != nil { return err // honor context cancellation } @@ -218,11 +218,18 @@ func (z Zip) Extract(ctx context.Context, sourceArchive io.Reader, pathsInArchiv continue } - file := File{ - FileInfo: f.FileInfo(), + info := f.FileInfo() + file := FileInfo{ + FileInfo: info, 
Header: f.FileHeader, NameInArchive: f.Name, - Open: func() (io.ReadCloser, error) { return f.Open() }, + Open: func() (fs.File, error) { + openedFile, err := f.Open() + if err != nil { + return nil, err + } + return fileInArchive{openedFile, info}, nil + }, } err := handleFile(ctx, file) @@ -266,7 +273,7 @@ func (z Zip) decodeText(hdr *zip.FileHeader) { } // Insert appends the listed files into the provided Zip archive stream. -func (z Zip) Insert(ctx context.Context, into io.ReadWriteSeeker, files []File) error { +func (z Zip) Insert(ctx context.Context, into io.ReadWriteSeeker, files []FileInfo) error { // following very simple example at https://github.com/STARRY-S/zip?tab=readme-ov-file#usage zu, err := szip.NewUpdater(into) if err != nil { diff --git a/zlib.go b/zlib.go index 84275186..485991e6 100644 --- a/zlib.go +++ b/zlib.go @@ -1,6 +1,7 @@ package archiver import ( + "context" "io" "strings" @@ -16,13 +17,13 @@ type Zlib struct { CompressionLevel int } -func (Zlib) Name() string { return ".zz" } +func (Zlib) Extension() string { return ".zz" } -func (zz Zlib) Match(filename string, stream io.Reader) (MatchResult, error) { +func (zz Zlib) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if strings.Contains(strings.ToLower(filename), zz.Name()) { + if strings.Contains(strings.ToLower(filename), zz.Extension()) { mr.ByName = true } diff --git a/zstd.go b/zstd.go index fe07b76f..cd0c2814 100644 --- a/zstd.go +++ b/zstd.go @@ -2,6 +2,7 @@ package archiver import ( "bytes" + "context" "io" "strings" @@ -18,13 +19,13 @@ type Zstd struct { DecoderOptions []zstd.DOption } -func (Zstd) Name() string { return ".zst" } +func (Zstd) Extension() string { return ".zst" } -func (zs Zstd) Match(filename string, stream io.Reader) (MatchResult, error) { +func (zs Zstd) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult // match filename - if 
strings.Contains(strings.ToLower(filename), zs.Name()) { + if strings.Contains(strings.ToLower(filename), zs.Extension()) { mr.ByName = true }