Skip to content
This repository has been archived by the owner on Nov 19, 2024. It is now read-only.

Commit

Permalink
Refactor FS types; improve performance (#426)
Browse files Browse the repository at this point in the history
* WIP

* More WIP

* Finish improvements (probably)
  • Loading branch information
mholt authored Nov 8, 2024
1 parent 264c901 commit e310539
Show file tree
Hide file tree
Showing 26 changed files with 716 additions and 696 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/macos-latest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:

strategy:
matrix:
go-version: [1.22]
go-version: [1.23]
runs-on: macos-latest
steps:
- name: Install Go
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ubuntu-latest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:

strategy:
matrix:
go-version: [1.22]
go-version: [1.23]
runs-on: ubuntu-latest
steps:
- name: Install Go
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/windows-latest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:

strategy:
matrix:
go-version: [1.22]
go-version: [1.23]
runs-on: windows-latest
steps:
- name: Install Go
Expand Down
21 changes: 14 additions & 7 deletions 7z.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,13 @@ type SevenZip struct {
Password string
}

func (z SevenZip) Name() string { return ".7z" }
func (z SevenZip) Extension() string { return ".7z" }

func (z SevenZip) Match(filename string, stream io.Reader) (MatchResult, error) {
func (z SevenZip) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) {
var mr MatchResult

// match filename
if strings.Contains(strings.ToLower(filename), z.Name()) {
if strings.Contains(strings.ToLower(filename), z.Extension()) {
mr.ByName = true
}

Expand All @@ -52,7 +52,7 @@ func (z SevenZip) Match(filename string, stream io.Reader) (MatchResult, error)
}

// Archive is not implemented for 7z, but the method exists so that SevenZip satisfies the ArchiveFormat interface.
func (z SevenZip) Archive(_ context.Context, _ io.Writer, _ []File) error {
func (z SevenZip) Archive(_ context.Context, _ io.Writer, _ []FileInfo) error {
return fmt.Errorf("not implemented for 7z because there is no pure Go implementation found")
}

Expand Down Expand Up @@ -94,11 +94,18 @@ func (z SevenZip) Extract(ctx context.Context, sourceArchive io.Reader, pathsInA
continue
}

file := File{
FileInfo: f.FileInfo(),
fi := f.FileInfo()
file := FileInfo{
FileInfo: fi,
Header: f.FileHeader,
NameInArchive: f.Name,
Open: func() (io.ReadCloser, error) { return f.Open() },
Open: func() (fs.File, error) {
openedFile, err := f.Open()
if err != nil {
return nil, err
}
return fileInArchive{openedFile, fi}, nil
},
}

err := handleFile(ctx, file)
Expand Down
16 changes: 9 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# archiver [![Go Reference](https://pkg.go.dev/badge/github.com/mholt/archiver/v4.svg)](https://pkg.go.dev/github.com/mholt/archiver/v4) [![Ubuntu-latest](https://github.com/mholt/archiver/actions/workflows/ubuntu-latest.yml/badge.svg)](https://github.com/mholt/archiver/actions/workflows/ubuntu-latest.yml) [![Macos-latest](https://github.com/mholt/archiver/actions/workflows/macos-latest.yml/badge.svg)](https://github.com/mholt/archiver/actions/workflows/macos-latest.yml) [![Windows-latest](https://github.com/mholt/archiver/actions/workflows/windows-latest.yml/badge.svg)](https://github.com/mholt/archiver/actions/workflows/windows-latest.yml)

Introducing **Archiver 4.0** - a cross-platform, multi-format archive utility and Go library. A powerful and flexible library meets an elegant CLI in this generic replacement for several platform-specific or format-specific archive utilities.
Introducing **Archiver 4.0 (alpha)** - a cross-platform, multi-format archive utility and Go library. A powerful and flexible library meets an elegant CLI in this generic replacement for several platform-specific or format-specific archive utilities.

**:warning: v4 is in ALPHA. The core library APIs work pretty well but the command has not been implemented yet, nor have most automated tests. If you need the `arc` command, stick with v3 for now.**

Expand All @@ -11,8 +11,8 @@ Introducing **Archiver 4.0** - a cross-platform, multi-format archive utility an
- By file name
- By header
- Traverse directories, archive files, and any other file uniformly as [`io/fs`](https://pkg.go.dev/io/fs) file systems:
- [`DirFS`](https://pkg.go.dev/github.com/mholt/archiver/v4#DirFS)
- [`FileFS`](https://pkg.go.dev/github.com/mholt/archiver/v4#FileFS)
- [`DirFS`](https://pkg.go.dev/github.com/mholt/archiver/v4#DirFS)
- [`ArchiveFS`](https://pkg.go.dev/github.com/mholt/archiver/v4#ArchiveFS)
- Compress and decompress files
- Create and extract archive files
Expand Down Expand Up @@ -117,7 +117,7 @@ If you want all the files, pass in a nil list of file paths.

```go
// the type that will be used to read the input stream
format := archiver.Zip{}
var format archiver.Zip

// the list of files we want out of the archive; any
// directories will include all their contents unless
Expand All @@ -141,7 +141,7 @@ if err != nil {
Have an input stream with unknown contents? No problem, archiver can identify it for you. It will try matching based on filename and/or the header (which peeks at the stream):

```go
format, input, err := archiver.Identify("filename.tar.zst", input)
format, input, err := archiver.Identify(ctx, "filename.tar.zst", input)
if err != nil {
return err
}
Expand All @@ -165,7 +165,7 @@ if decom, ok := format.(archiver.Decompressor); ok {
}
```

`Identify()` works by reading an arbitrary number of bytes from the beginning of the stream (just enough to check for file headers). It buffers them and returns a new reader that lets you re-read them anew.
`Identify()` works by reading an arbitrary number of bytes from the beginning of the stream (just enough to check for file headers). It buffers them and returns a new reader that lets you re-read them anew. If your input stream is an `io.Seeker`, however, no buffer is created (it uses `Seek()` instead).

### Virtual file systems

Expand Down Expand Up @@ -212,7 +212,7 @@ if dir, ok := f.(fs.ReadDirFile); ok {
return err
}
for _, e := range entries {
fmt.Println(e.Name())
fmt.Println(e.Extension())
}
}
```
Expand All @@ -225,7 +225,7 @@ if err != nil {
return err
}
for _, e := range entries {
fmt.Println(e.Name())
fmt.Println(e.Extension())
}
```

Expand All @@ -247,6 +247,8 @@ if err != nil {
}
```

**Important .tar note:** Tar files do not efficiently implement file system semantics due to their roots in sequential-access design for tapes. File systems inherently assume random access, but tar files need to be read from the beginning to access something at the end. This is especially slow when the archive is compressed. Optimizations have been implemented to amortize `ReadDir()` calls so that `fs.WalkDir()` only has to scan the archive once, but they use more memory. Open calls require another scan to find the file. It may be more efficient to use `Tar.Extract()` directly if file system semantics are not important to you.

#### Use with `http.FileServer`

It can be used with http.FileServer to browse archives and directories in a browser. However, due to how http.FileServer works, don't directly use http.FileServer with compressed files; instead, wrap it as follows:
Expand Down
33 changes: 18 additions & 15 deletions archiver.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@ import (
"time"
)

// File is a virtualized, generalized file abstraction for interacting with archives.
type File struct {
// FileInfo is a virtualized, generalized file abstraction for interacting with archives.
type FileInfo struct {
fs.FileInfo

// The file header as used/provided by the archive format.
// Typically, you do not need to set this field when creating
// an archive.
Header interface{}
Header any

// The path of the file as it appears in the archive.
// This is equivalent to Header.Name (for most Header
Expand All @@ -28,6 +28,10 @@ type File struct {
// format-agnosticism (no type assertions) for basic
// operations.
//
// When extracting, this name or path may not have
// been sanitized; it should not be trusted at face
// value. Consider using path.Clean() before using.
//
// EXPERIMENTAL: If inserting a file into an archive,
// and this is left blank, the implementation of the
// archive format can default to using the file's base
Expand All @@ -40,12 +44,11 @@ type File struct {

// A callback function that opens the file to read its
// contents. The file must be closed when reading is
// complete. Nil for files that don't have content
// (such as directories and links).
Open func() (io.ReadCloser, error)
// complete.
Open func() (fs.File, error)
}

func (f File) Stat() (fs.FileInfo, error) { return f.FileInfo, nil }
func (f FileInfo) Stat() (fs.FileInfo, error) { return f.FileInfo, nil }

// FilesFromDisk returns a list of files by walking the directories in the
// given filenames map. The keys are the names on disk, and the values are
Expand All @@ -68,8 +71,8 @@ func (f File) Stat() (fs.FileInfo, error) { return f.FileInfo, nil }
//
// This function is used primarily when preparing a list of files to add to
// an archive.
func FilesFromDisk(options *FromDiskOptions, filenames map[string]string) ([]File, error) {
var files []File
func FilesFromDisk(options *FromDiskOptions, filenames map[string]string) ([]FileInfo, error) {
var files []FileInfo
for rootOnDisk, rootInArchive := range filenames {
walkErr := filepath.WalkDir(rootOnDisk, func(filename string, d fs.DirEntry, err error) error {
if err != nil {
Expand Down Expand Up @@ -114,11 +117,11 @@ func FilesFromDisk(options *FromDiskOptions, filenames map[string]string) ([]Fil
info = noAttrFileInfo{info}
}

file := File{
file := FileInfo{
FileInfo: info,
NameInArchive: nameInArchive,
LinkTarget: linkTarget,
Open: func() (io.ReadCloser, error) {
Open: func() (fs.File, error) {
return os.Open(filename)
},
}
Expand Down Expand Up @@ -191,7 +194,7 @@ func (no noAttrFileInfo) Mode() fs.FileMode {
return no.FileInfo.Mode() & (fs.ModeType | fs.ModePerm)
}
func (noAttrFileInfo) ModTime() time.Time { return time.Time{} }
func (noAttrFileInfo) Sys() interface{} { return nil }
func (noAttrFileInfo) Sys() any { return nil }

// FromDiskOptions specifies various options for gathering files from disk.
type FromDiskOptions struct {
Expand All @@ -215,12 +218,12 @@ type FromDiskOptions struct {
// archive contents are not necessarily ordered, skipping directories requires
// memory, and skipping lots of directories may run up your memory bill.
//
// Any other returned error will terminate a walk.
type FileHandler func(ctx context.Context, f File) error
// Any other returned error will terminate a walk and be returned to the caller.
type FileHandler func(ctx context.Context, info FileInfo) error

// openAndCopyFile opens file for reading, copies its
// contents to w, then closes file.
func openAndCopyFile(file File, w io.Writer) error {
func openAndCopyFile(file FileInfo, w io.Writer) error {
fileReader, err := file.Open()
if err != nil {
return err
Expand Down
3 changes: 2 additions & 1 deletion archiver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,8 @@ func TestNameOnDiskToNameInArchive(t *testing.T) {
},
} {
if !strings.HasPrefix(tc.nameOnDisk, tc.rootOnDisk) {
t.Fatalf("Test %d: Invalid test case! Filename (on disk) will have rootOnDisk as a prefix according to the fs.WalkDirFunc godoc.", i)
t.Errorf("Test %d: Invalid test case! Filename (on disk) will have rootOnDisk as a prefix according to the fs.WalkDirFunc godoc.", i)
continue
}
if tc.windows && runtime.GOOS != "windows" {
t.Logf("Test %d: Skipping test that is only compatible with Windows", i)
Expand Down
19 changes: 13 additions & 6 deletions brotli.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package archiver

import (
"context"
"io"
"strings"

Expand All @@ -16,19 +17,25 @@ type Brotli struct {
Quality int
}

func (Brotli) Name() string { return ".br" }
func (Brotli) Extension() string { return ".br" }

func (br Brotli) Match(filename string, stream io.Reader) (MatchResult, error) {
func (br Brotli) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) {
var mr MatchResult

// match filename
if strings.Contains(strings.ToLower(filename), br.Name()) {
if strings.Contains(strings.ToLower(filename), br.Extension()) {
mr.ByName = true
}

// brotli does not have well-defined file headers; the
// best way to match the stream would be to try decoding
// part of it, and this is not implemented for now
// brotli does not have well-defined file headers or a magic number;
// the best way to match the stream is probably to try decoding part
// of it, but we'll just have to guess a large-enough size that is
// still small enough for the smallest streams we'll encounter
r := brotli.NewReader(stream)
buf := make([]byte, 16)
if _, err := io.ReadFull(r, buf); err == nil {
mr.ByStream = true
}

return mr, nil
}
Expand Down
7 changes: 4 additions & 3 deletions bz2.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package archiver

import (
"bytes"
"context"
"io"
"strings"

Expand All @@ -17,13 +18,13 @@ type Bz2 struct {
CompressionLevel int
}

func (Bz2) Name() string { return ".bz2" }
func (Bz2) Extension() string { return ".bz2" }

func (bz Bz2) Match(filename string, stream io.Reader) (MatchResult, error) {
func (bz Bz2) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) {
var mr MatchResult

// match filename
if strings.Contains(strings.ToLower(filename), bz.Name()) {
if strings.Contains(strings.ToLower(filename), bz.Extension()) {
mr.ByName = true
}

Expand Down
Loading

0 comments on commit e310539

Please sign in to comment.