From e534bfea6e39a07bedec19d5fa34bb0c0b986f9c Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Tue, 21 Sep 2021 13:34:59 -0400 Subject: [PATCH 1/2] Revert "Compression Updates (#961)" This reverts commit b89f2ae1269095f7601e5848d89cd224ac1b5778. --- CHANGELOG.md | 3 +- docs/tempo/website/configuration/_index.md | 4 +- .../website/configuration/compression.md | 1 - example/docker-compose/azure/tempo-azure.yaml | 4 +- .../distributed/tempo-distributed.yaml | 2 +- example/docker-compose/gcs/tempo-gcs.yaml | 4 +- example/docker-compose/local/tempo-local.yaml | 4 +- example/docker-compose/s3/tempo-s3.yaml | 4 +- go.mod | 2 +- tempodb/backend/encoding.go | 4 - tempodb/encoding/streaming_block_test.go | 72 +- tempodb/encoding/v2/pool.go | 70 +- .../klauspost/compress/s2/.gitignore | 15 - .../github.com/klauspost/compress/s2/LICENSE | 28 - .../klauspost/compress/s2/README.md | 717 - .../klauspost/compress/s2/decode.go | 565 - .../klauspost/compress/s2/decode_amd64.s | 571 - .../klauspost/compress/s2/decode_arm64.s | 574 - .../klauspost/compress/s2/decode_asm.go | 17 - .../klauspost/compress/s2/decode_other.go | 267 - .../klauspost/compress/s2/encode.go | 1172 -- .../klauspost/compress/s2/encode_all.go | 456 - .../klauspost/compress/s2/encode_amd64.go | 142 - .../klauspost/compress/s2/encode_best.go | 604 - .../klauspost/compress/s2/encode_better.go | 431 - .../klauspost/compress/s2/encode_go.go | 298 - .../compress/s2/encodeblock_amd64.go | 189 - .../klauspost/compress/s2/encodeblock_amd64.s | 15678 ---------------- vendor/github.com/klauspost/compress/s2/s2.go | 139 - .../klauspost/compress/snappy/.gitignore | 16 - .../klauspost/compress/snappy/AUTHORS | 18 - .../klauspost/compress/snappy/CONTRIBUTORS | 41 - .../klauspost/compress/snappy/LICENSE | 27 - .../klauspost/compress/snappy/README.md | 17 - .../klauspost/compress/snappy/decode.go | 60 - .../klauspost/compress/snappy/encode.go | 59 - .../klauspost/compress/snappy/snappy.go | 46 - vendor/modules.txt | 2 - 38 files changed, 52 insertions(+), 22271 deletions(-) delete mode 100644 vendor/github.com/klauspost/compress/s2/.gitignore delete mode 100644 vendor/github.com/klauspost/compress/s2/LICENSE delete mode 100644 vendor/github.com/klauspost/compress/s2/README.md delete mode 100644 vendor/github.com/klauspost/compress/s2/decode.go delete mode 100644 vendor/github.com/klauspost/compress/s2/decode_amd64.s delete mode 100644 vendor/github.com/klauspost/compress/s2/decode_arm64.s delete mode 100644 vendor/github.com/klauspost/compress/s2/decode_asm.go delete mode 100644 vendor/github.com/klauspost/compress/s2/decode_other.go delete mode 100644 vendor/github.com/klauspost/compress/s2/encode.go delete mode 100644 vendor/github.com/klauspost/compress/s2/encode_all.go delete mode 100644 vendor/github.com/klauspost/compress/s2/encode_amd64.go delete mode 100644 vendor/github.com/klauspost/compress/s2/encode_best.go delete mode 100644 vendor/github.com/klauspost/compress/s2/encode_better.go delete mode 100644 vendor/github.com/klauspost/compress/s2/encode_go.go delete mode 100644 vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go delete mode 100644 vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s delete mode 100644 vendor/github.com/klauspost/compress/s2/s2.go delete mode 100644 vendor/github.com/klauspost/compress/snappy/.gitignore delete mode 100644 vendor/github.com/klauspost/compress/snappy/AUTHORS delete mode 100644 vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS delete mode 100644 vendor/github.com/klauspost/compress/snappy/LICENSE delete mode 100644 vendor/github.com/klauspost/compress/snappy/README.md delete mode 100644 vendor/github.com/klauspost/compress/snappy/decode.go delete mode 100644 vendor/github.com/klauspost/compress/snappy/encode.go delete mode 100644 vendor/github.com/klauspost/compress/snappy/snappy.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f93bdcfb55..4063e3c6df8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,8 +10,8 @@ * [ENHANCEMENT] Added traceid to `trace too large message`. [#888](https://github.com/grafana/tempo/pull/888) (@mritunjaysharma394) * [ENHANCEMENT] Add support to tempo workloads to `overrides` from single configmap in microservice mode. [#896](https://github.com/grafana/tempo/pull/896) (@kavirajk) * [ENHANCEMENT] Make `overrides_config` block name consistent with Loki and Cortex in microservice mode. [#906](https://github.com/grafana/tempo/pull/906) (@kavirajk) -* [ENHANCEMENT] Changes the metrics name from `cortex_runtime_config_last_reload_successful` to `tempo_runtime_config_last_reload_successful` [#945](https://github.com/grafana/tempo/pull/945) (@kavirajk) * [ENHANCEMENT] Updated config defaults to reflect better capture operational knowledge. [#913](https://github.com/grafana/tempo/pull/913) (@joe-elliott) +* [ENHANCEMENT] Changes the metrics name from `cortex_runtime_config_last_reload_successful` to `tempo_runtime_config_last_reload_successful` [#945](https://github.com/grafana/tempo/pull/945) (@kavirajk) ``` ingester: trace_idle_period: 30s => 10s # reduce ingester memory requirements with little impact on querying @@ -37,7 +37,6 @@ * [ENHANCEMENT] Improve zstd read throughput using zstd.Decoder [#948](https://github.com/grafana/tempo/pull/948) (@joe-elliott) * [ENHANCEMENT] Dedupe search records while replaying WAL [#940](https://github.com/grafana/tempo/pull/940) (@annanay25) * [ENHANCEMENT] Add status endpoint to list the available endpoints [#938](https://github.com/grafana/tempo/pull/938) (@zalegrala) -* [ENHANCEMENT] Compression updates: Added s2, improved snappy performance [#961](https://github.com/grafana/tempo/pull/961) (@joe-elliott) * [ENHANCEMENT] Add search block headers [#943](https://github.com/grafana/tempo/pull/943) (@mdisibio) * [ENHANCEMENT] Add search block headers for wal blocks [#963](https://github.com/grafana/tempo/pull/963) (@mdisibio) * [CHANGE] Renamed CLI flag from `--storage.trace.maintenance-cycle` to `--storage.trace.blocklist_poll`. This is a **breaking change** [#897](https://github.com/grafana/tempo/pull/897) (@mritunjaysharma394) diff --git a/docs/tempo/website/configuration/_index.md b/docs/tempo/website/configuration/_index.md index 2f4ec8f6fca..ae96a0306e8 100644 --- a/docs/tempo/website/configuration/_index.md +++ b/docs/tempo/website/configuration/_index.md @@ -523,7 +523,7 @@ storage: [path: ] # wal encoding/compression. - # options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd, s2 + # options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd # (default: snappy) [encoding: ] @@ -542,7 +542,7 @@ storage: # (default: 1MiB) [index_downsample_bytes: ] - # block encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd, s2 + # block encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd [encoding: ] ``` diff --git a/docs/tempo/website/configuration/compression.md b/docs/tempo/website/configuration/compression.md index 6f65ae6bc99..a587c740165 100644 --- a/docs/tempo/website/configuration/compression.md +++ b/docs/tempo/website/configuration/compression.md @@ -29,7 +29,6 @@ The following options are supported: - lz4 - snappy - zstd -- s2 It is important to note that although all of these compression formats are supported in Tempo, at Grafana we use zstd and it's possible/probable that the other compression algorithms may have issue at scale. Please diff --git a/example/docker-compose/azure/tempo-azure.yaml b/example/docker-compose/azure/tempo-azure.yaml index 36324d8c0aa..b5d262dd009 100644 --- a/example/docker-compose/azure/tempo-azure.yaml +++ b/example/docker-compose/azure/tempo-azure.yaml @@ -34,10 +34,10 @@ storage: block: bloom_filter_false_positive: .05 # bloom filter false positive rate. lower values create larger filters but fewer false positives index_downsample_bytes: 1000 # number of bytes per index record - encoding: zstd # block encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd, s2 + encoding: zstd # block encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd wal: path: /tmp/tempo/wal # where to store the the wal locally - encoding: snappy # wal encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd, s2 + encoding: snappy # wal encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd azure: container-name: tempo # how to store data in azure endpoint-suffix: azurite:10000 diff --git a/example/docker-compose/distributed/tempo-distributed.yaml b/example/docker-compose/distributed/tempo-distributed.yaml index ab5f2c7ebca..30fd0f2111c 100644 --- a/example/docker-compose/distributed/tempo-distributed.yaml +++ b/example/docker-compose/distributed/tempo-distributed.yaml @@ -55,7 +55,7 @@ storage: insecure: true wal: path: /tmp/tempo/wal # where to store the the wal locally - encoding: snappy # wal encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd, s2 + encoding: snappy # wal encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd local: path: /tmp/tempo/blocks pool: diff --git a/example/docker-compose/gcs/tempo-gcs.yaml b/example/docker-compose/gcs/tempo-gcs.yaml index d0270367a90..303be04a97c 100644 --- a/example/docker-compose/gcs/tempo-gcs.yaml +++ b/example/docker-compose/gcs/tempo-gcs.yaml @@ -35,10 +35,10 @@ storage: block: bloom_filter_false_positive: .05 # bloom filter false positive rate. lower values create larger filters but fewer false positives index_downsample_bytes: 1000 # number of bytes per index record - encoding: zstd # block encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd, s2 + encoding: zstd # block encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd wal: path: /tmp/tempo/wal # where to store the the wal locally - encoding: snappy # wal encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd, s2 + encoding: snappy # wal encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd gcs: bucket_name: tempo endpoint: https://gcs:4443/storage/v1/ diff --git a/example/docker-compose/local/tempo-local.yaml b/example/docker-compose/local/tempo-local.yaml index d85591d107e..9667b5d8367 100644 --- a/example/docker-compose/local/tempo-local.yaml +++ b/example/docker-compose/local/tempo-local.yaml @@ -34,10 +34,10 @@ storage: block: bloom_filter_false_positive: .05 # bloom filter false positive rate. lower values create larger filters but fewer false positives index_downsample_bytes: 1000 # number of bytes per index record - encoding: zstd # block encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd, s2 + encoding: zstd # block encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd wal: path: /tmp/tempo/wal # where to store the the wal locally - encoding: snappy # wal encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd, s2 + encoding: snappy # wal encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd local: path: /tmp/tempo/blocks pool: diff --git a/example/docker-compose/s3/tempo-s3.yaml b/example/docker-compose/s3/tempo-s3.yaml index 8f8281d70eb..7bd6272d051 100644 --- a/example/docker-compose/s3/tempo-s3.yaml +++ b/example/docker-compose/s3/tempo-s3.yaml @@ -35,10 +35,10 @@ storage: block: bloom_filter_false_positive: .05 # bloom filter false positive rate. lower values create larger filters but fewer false positives index_downsample_bytes: 1000 # number of bytes per index record - encoding: zstd # block encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd, s2 + encoding: zstd # block encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd wal: path: /tmp/tempo/wal # where to store the the wal locally - encoding: snappy # wal encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd, s2 + encoding: snappy # wal encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd s3: bucket: tempo # how to store data in s3 endpoint: minio:9000 diff --git a/go.mod b/go.mod index f8370c68ede..96eceda15cb 100644 --- a/go.mod +++ b/go.mod @@ -19,6 +19,7 @@ require ( github.com/gogo/protobuf v1.3.2 github.com/gogo/status v1.1.0 github.com/golang/protobuf v1.5.2 + github.com/golang/snappy v0.0.4 github.com/google/flatbuffers v2.0.0+incompatible github.com/google/go-cmp v0.5.6 github.com/google/uuid v1.2.0 @@ -140,7 +141,6 @@ require ( github.com/gogo/googleapis v1.4.0 // indirect github.com/golang-migrate/migrate/v4 v4.7.0 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect - github.com/golang/snappy v0.0.4 // indirect github.com/google/btree v1.0.1 // indirect github.com/google/pprof v0.0.0-20210609004039-a478d1d731e9 // indirect github.com/googleapis/gax-go/v2 v2.0.5 // indirect diff --git a/tempodb/backend/encoding.go b/tempodb/backend/encoding.go index 13bb8cba8ef..e482674ed90 100644 --- a/tempodb/backend/encoding.go +++ b/tempodb/backend/encoding.go @@ -21,7 +21,6 @@ const ( EncLZ4_4M EncSnappy EncZstd - EncS2 ) // SupportedEncoding is a slice of all supported encodings @@ -34,7 +33,6 @@ var SupportedEncoding = []Encoding{ EncLZ4_4M, EncSnappy, EncZstd, - EncS2, } func (e Encoding) String() string { @@ -55,8 +53,6 @@ func (e Encoding) String() string { return "snappy" case EncZstd: return "zstd" - case EncS2: - return "s2" default: return "unsupported" } diff --git a/tempodb/encoding/streaming_block_test.go b/tempodb/encoding/streaming_block_test.go index 74e02c065b2..a7e14106a58 100644 --- a/tempodb/encoding/streaming_block_test.go +++ b/tempodb/encoding/streaming_block_test.go @@ -9,7 +9,6 @@ import ( "io/ioutil" "math/rand" "os" - "path" "sort" "testing" "time" @@ -302,7 +301,7 @@ func streamingBlock(t *testing.T, cfg *BlockConfig, w backend.Writer) (*Streamin return block, ids, reqs } -const benchDownsample = 1024 * 1024 +const benchDownsample = 200 func BenchmarkWriteGzip(b *testing.B) { benchmarkCompressBlock(b, backend.EncGZIP, benchDownsample, false) @@ -320,12 +319,10 @@ func BenchmarkWriteLZ41M(b *testing.B) { func BenchmarkWriteNone(b *testing.B) { benchmarkCompressBlock(b, backend.EncNone, benchDownsample, false) } + func BenchmarkWriteZstd(b *testing.B) { benchmarkCompressBlock(b, backend.EncZstd, benchDownsample, false) } -func BenchmarkWriteS2(b *testing.B) { - benchmarkCompressBlock(b, backend.EncS2, benchDownsample, false) -} func BenchmarkReadGzip(b *testing.B) { benchmarkCompressBlock(b, backend.EncGZIP, benchDownsample, true) @@ -342,12 +339,10 @@ func BenchmarkReadLZ41M(b *testing.B) { func BenchmarkReadNone(b *testing.B) { benchmarkCompressBlock(b, backend.EncNone, benchDownsample, true) } + func BenchmarkReadZstd(b *testing.B) { benchmarkCompressBlock(b, backend.EncZstd, benchDownsample, true) } -func BenchmarkReadS2(b *testing.B) { - benchmarkCompressBlock(b, backend.EncS2, benchDownsample, true) -} // Download a block from your backend and place in ./benchmark_block// //nolint:unparam @@ -362,7 +357,7 @@ func benchmarkCompressBlock(b *testing.B, encoding backend.Encoding, indexDownsa require.NoError(b, err, "error creating backend") r := backend.NewReader(rawR) - meta, err := r.BlockMeta(context.Background(), uuid.MustParse("20a614f8-8cda-4b9d-9789-cb626f9fab28"), "1") + meta, err := r.BlockMeta(context.Background(), uuid.MustParse("00006e9d-94f0-4487-8e62-99f951be9349"), "1") require.NoError(b, err) backendBlock, err := NewBackendBlock(meta, r) @@ -385,13 +380,13 @@ func benchmarkCompressBlock(b *testing.B, encoding backend.Encoding, indexDownsa b.ResetTimer() } + originatingMeta := backend.NewBlockMeta(testTenantID, uuid.New(), "should_be_ignored", encoding, "") block, err := NewStreamingBlock(&BlockConfig{ IndexDownsampleBytes: indexDownsample, BloomFP: .05, Encoding: encoding, IndexPageSizeBytes: 10 * 1024 * 1024, - BloomShardSizeBytes: 100000, - }, uuid.New(), meta.TenantID, []*backend.BlockMeta{meta}, meta.TotalObjects) + }, originatingMeta.BlockID, originatingMeta.TenantID, []*backend.BlockMeta{originatingMeta}, originatingMeta.TotalObjects) require.NoError(b, err, "unexpected error completing block") ctx := context.Background() @@ -400,7 +395,8 @@ func benchmarkCompressBlock(b *testing.B, encoding backend.Encoding, indexDownsa if err != io.EOF { require.NoError(b, err) } - if err == io.EOF { + + if id == nil { break } @@ -420,30 +416,30 @@ func benchmarkCompressBlock(b *testing.B, encoding backend.Encoding, indexDownsa return } - b.ResetTimer() - - fullFilename := path.Join(backendTmpDir, block.compactedMeta.TenantID, block.compactedMeta.BlockID.String(), "data") - file, err := os.Open(fullFilename) - require.NoError(b, err) - pr, err := v2.NewDataReader(backend.NewContextReaderWithAllReader(file), encoding) - require.NoError(b, err) - - var tempBuffer []byte - o := v2.NewObjectReaderWriter() - for { - tempBuffer, _, err = pr.NextPage(tempBuffer) - if err == io.EOF { - break - } - require.NoError(b, err) - - bufferReader := bytes.NewReader(tempBuffer) - - for { - _, _, err = o.UnmarshalObjectFromReader(bufferReader) - if err == io.EOF { - break - } - } - } + // todo: restore read benchmarks + // b.ResetTimer() + + // file, err := os.Open(block.fullFilename()) + // require.NoError(b, err) + // pr, err := v2.NewDataReader(backend.NewContextReaderWithAllReader(file), encoding) + // require.NoError(b, err) + + // var tempBuffer []byte + // o := v2.NewObjectReaderWriter() + // for { + // tempBuffer, _, err = pr.NextPage(tempBuffer) + // if err == io.EOF { + // break + // } + // require.NoError(b, err) + + // bufferReader := bytes.NewReader(tempBuffer) + + // for { + // _, _, err = o.UnmarshalObjectFromReader(bufferReader) + // if err == io.EOF { + // break + // } + // } + // } } diff --git a/tempodb/encoding/v2/pool.go b/tempodb/encoding/v2/pool.go index 9e3bd4c0cd3..3bbea9a3385 100644 --- a/tempodb/encoding/v2/pool.go +++ b/tempodb/encoding/v2/pool.go @@ -5,16 +5,15 @@ import ( "io" "sync" + "github.com/golang/snappy" "github.com/grafana/tempo/tempodb/backend" "github.com/klauspost/compress/gzip" - "github.com/klauspost/compress/s2" - "github.com/klauspost/compress/snappy" "github.com/klauspost/compress/zstd" "github.com/pierrec/lz4/v4" "github.com/prometheus/prometheus/pkg/pool" ) -const maxEncoding = backend.EncS2 +const maxEncoding = backend.EncZstd // WriterPool is a pool of io.Writer // This is used by every chunk to avoid unnecessary allocations. @@ -50,8 +49,6 @@ var ( Noop NoopPool // Zstd Pool Zstd = ZstdPool{} - // S2 Pool - S2 = S2Pool{} // BytesBufferPool is a bytes buffer used for lines decompressed. // Buckets [0.5KB,1KB,2KB,4KB,8KB] @@ -85,8 +82,6 @@ func getReaderPool(enc backend.Encoding) (ReaderPool, error) { return &Snappy, nil case backend.EncZstd: return &Zstd, nil - case backend.EncS2: - return &S2, nil default: return nil, fmt.Errorf("Unknown pool encoding %d", enc) } @@ -262,7 +257,7 @@ func (pool *SnappyPool) Encoding() backend.Encoding { // GetReader gets or creates a new CompressionReader and reset it to read from src func (pool *SnappyPool) GetReader(src io.Reader) (io.Reader, error) { if r := pool.readers.Get(); r != nil { - reader := r.(*s2.Reader) + reader := r.(*snappy.Reader) reader.Reset(src) return reader, nil } @@ -276,7 +271,7 @@ func (pool *SnappyPool) PutReader(reader io.Reader) { // ResetReader implements ReaderPool func (pool *SnappyPool) ResetReader(src io.Reader, resetReader io.Reader) (io.Reader, error) { - reader := resetReader.(*s2.Reader) + reader := resetReader.(*snappy.Reader) reader.Reset(src) return reader, nil } @@ -284,7 +279,7 @@ func (pool *SnappyPool) ResetReader(src io.Reader, resetReader io.Reader) (io.Re // GetWriter gets or creates a new CompressionWriter and reset it to write to dst func (pool *SnappyPool) GetWriter(dst io.Writer) (io.WriteCloser, error) { if w := pool.writers.Get(); w != nil { - writer := w.(*s2.Writer) + writer := w.(*snappy.Writer) writer.Reset(dst) return writer, nil } @@ -399,58 +394,3 @@ func (pool *ZstdPool) ResetWriter(dst io.Writer, resetWriter io.WriteCloser) (io writer.Reset(dst) return writer, nil } - -// S2Pool is one s short of s3 -type S2Pool struct { - readers sync.Pool - writers sync.Pool -} - -// Encoding implements WriterPool and ReaderPool -func (pool *S2Pool) Encoding() backend.Encoding { - return backend.EncS2 -} - -// GetReader gets or creates a new CompressionReader and reset it to read from src -func (pool *S2Pool) GetReader(src io.Reader) (io.Reader, error) { - if r := pool.readers.Get(); r != nil { - reader := r.(*s2.Reader) - reader.Reset(src) - return reader, nil - } - return s2.NewReader(src), nil -} - -// PutReader places back in the pool a CompressionReader -func (pool *S2Pool) PutReader(reader io.Reader) { - pool.readers.Put(reader) -} - -// ResetReader implements ReaderPool -func (pool *S2Pool) ResetReader(src io.Reader, resetReader io.Reader) (io.Reader, error) { - reader := resetReader.(*s2.Reader) - reader.Reset(src) - return reader, nil -} - -// GetWriter gets or creates a new CompressionWriter and reset it to write to dst -func (pool *S2Pool) GetWriter(dst io.Writer) (io.WriteCloser, error) { - if w := pool.writers.Get(); w != nil { - writer := w.(*s2.Writer) - writer.Reset(dst) - return writer, nil - } - return s2.NewWriter(dst), nil -} - -// PutWriter places back in the pool a CompressionWriter -func (pool *S2Pool) PutWriter(writer io.WriteCloser) { - pool.writers.Put(writer) -} - -// ResetWriter implements WriterPool -func (pool *S2Pool) ResetWriter(dst io.Writer, resetWriter io.WriteCloser) (io.WriteCloser, error) { - writer := resetWriter.(*s2.Writer) - writer.Reset(dst) - return writer, nil -} diff --git a/vendor/github.com/klauspost/compress/s2/.gitignore b/vendor/github.com/klauspost/compress/s2/.gitignore deleted file mode 100644 index 3a89c6e3e26..00000000000 --- a/vendor/github.com/klauspost/compress/s2/.gitignore +++ /dev/null @@ -1,15 +0,0 @@ -testdata/bench - -# These explicitly listed benchmark data files are for an obsolete version of -# snappy_test.go. -testdata/alice29.txt -testdata/asyoulik.txt -testdata/fireworks.jpeg -testdata/geo.protodata -testdata/html -testdata/html_x_4 -testdata/kppkn.gtb -testdata/lcet10.txt -testdata/paper-100k.pdf -testdata/plrabn12.txt -testdata/urls.10K diff --git a/vendor/github.com/klauspost/compress/s2/LICENSE b/vendor/github.com/klauspost/compress/s2/LICENSE deleted file mode 100644 index 1d2d645bd93..00000000000 --- a/vendor/github.com/klauspost/compress/s2/LICENSE +++ /dev/null @@ -1,28 +0,0 @@ -Copyright (c) 2011 The Snappy-Go Authors. All rights reserved. -Copyright (c) 2019 Klaus Post. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md deleted file mode 100644 index 81fad652436..00000000000 --- a/vendor/github.com/klauspost/compress/s2/README.md +++ /dev/null @@ -1,717 +0,0 @@ -# S2 Compression - -S2 is an extension of [Snappy](https://github.com/google/snappy). - -S2 is aimed for high throughput, which is why it features concurrent compression for bigger payloads. - -Decoding is compatible with Snappy compressed content, but content compressed with S2 cannot be decompressed by Snappy. -This means that S2 can seamlessly replace Snappy without converting compressed content. - -S2 can produce Snappy compatible output, faster and better than Snappy. -If you want full benefit of the changes you should use s2 without Snappy compatibility. - -S2 is designed to have high throughput on content that cannot be compressed. -This is important, so you don't have to worry about spending CPU cycles on already compressed data. - -## Benefits over Snappy - -* Better compression -* Adjustable compression (3 levels) -* Concurrent stream compression -* Faster decompression, even for Snappy compatible content -* Ability to quickly skip forward in compressed stream -* Compatible with reading Snappy compressed content -* Smaller block size overhead on incompressible blocks -* Block concatenation -* Uncompressed stream mode -* Automatic stream size padding -* Snappy compatible block compression - -## Drawbacks over Snappy - -* Not optimized for 32 bit systems. -* Streams use slightly more memory due to larger blocks and concurrency (configurable). - -# Usage - -Installation: `go get -u github.com/klauspost/compress/s2` - -Full package documentation: - -[![godoc][1]][2] - -[1]: https://godoc.org/github.com/klauspost/compress?status.svg -[2]: https://godoc.org/github.com/klauspost/compress/s2 - -## Compression - -```Go -func EncodeStream(src io.Reader, dst io.Writer) error { - enc := s2.NewWriter(dst) - _, err := io.Copy(enc, src) - if err != nil { - enc.Close() - return err - } - // Blocks until compression is done. - return enc.Close() -} -``` - -You should always call `enc.Close()`, otherwise you will leak resources and your encode will be incomplete. - -For the best throughput, you should attempt to reuse the `Writer` using the `Reset()` method. - -The Writer in S2 is always buffered, therefore `NewBufferedWriter` in Snappy can be replaced with `NewWriter` in S2. -It is possible to flush any buffered data using the `Flush()` method. -This will block until all data sent to the encoder has been written to the output. - -S2 also supports the `io.ReaderFrom` interface, which will consume all input from a reader. - -As a final method to compress data, if you have a single block of data you would like to have encoded as a stream, -a slightly more efficient method is to use the `EncodeBuffer` method. -This will take ownership of the buffer until the stream is closed. - -```Go -func EncodeStream(src []byte, dst io.Writer) error { - enc := s2.NewWriter(dst) - // The encoder owns the buffer until Flush or Close is called. - err := enc.EncodeBuffer(buf) - if err != nil { - enc.Close() - return err - } - // Blocks until compression is done. - return enc.Close() -} -``` - -Each call to `EncodeBuffer` will result in discrete blocks being created without buffering, -so it should only be used a single time per stream. -If you need to write several blocks, you should use the regular io.Writer interface. - - -## Decompression - -```Go -func DecodeStream(src io.Reader, dst io.Writer) error { - dec := s2.NewReader(src) - _, err := io.Copy(dst, dec) - return err -} -``` - -Similar to the Writer, a Reader can be reused using the `Reset` method. - -For the best possible throughput, there is a `EncodeBuffer(buf []byte)` function available. -However, it requires that the provided buffer isn't used after it is handed over to S2 and until the stream is flushed or closed. - -For smaller data blocks, there is also a non-streaming interface: `Encode()`, `EncodeBetter()` and `Decode()`. -Do however note that these functions (similar to Snappy) does not provide validation of data, -so data corruption may be undetected. Stream encoding provides CRC checks of data. - -It is possible to efficiently skip forward in a compressed stream using the `Skip()` method. -For big skips the decompressor is able to skip blocks without decompressing them. - -## Single Blocks - -Similar to Snappy S2 offers single block compression. -Blocks do not offer the same flexibility and safety as streams, -but may be preferable for very small payloads, less than 100K. - -Using a simple `dst := s2.Encode(nil, src)` will compress `src` and return the compressed result. -It is possible to provide a destination buffer. -If the buffer has a capacity of `s2.MaxEncodedLen(len(src))` it will be used. -If not a new will be allocated. - -Alternatively `EncodeBetter`/`EncodeBest` can also be used for better, but slightly slower compression. - -Similarly to decompress a block you can use `dst, err := s2.Decode(nil, src)`. -Again an optional destination buffer can be supplied. -The `s2.DecodedLen(src)` can be used to get the minimum capacity needed. -If that is not satisfied a new buffer will be allocated. - -Block function always operate on a single goroutine since it should only be used for small payloads. - -# Commandline tools - -Some very simply commandline tools are provided; `s2c` for compression and `s2d` for decompression. - -Binaries can be downloaded on the [Releases Page](https://github.com/klauspost/compress/releases). - -Installing then requires Go to be installed. To install them, use: - -`go install github.com/klauspost/compress/s2/cmd/s2c && go install github.com/klauspost/compress/s2/cmd/s2d` - -To build binaries to the current folder use: - -`go build github.com/klauspost/compress/s2/cmd/s2c && go build github.com/klauspost/compress/s2/cmd/s2d` - - -## s2c - -``` -Usage: s2c [options] file1 file2 - -Compresses all files supplied as input separately. -Output files are written as 'filename.ext.s2' or 'filename.ext.snappy'. -By default output files will be overwritten. -Use - as the only file name to read from stdin and write to stdout. - -Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt -Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt - -File names beginning with 'http://' and 'https://' will be downloaded and compressed. -Only http response code 200 is accepted. - -Options: - -bench int - Run benchmark n times. No output will be written - -blocksize string - Max block size. Examples: 64K, 256K, 1M, 4M. Must be power of two and <= 4MB (default "4M") - -c Write all output to stdout. Multiple input files will be concatenated - -cpu int - Compress using this amount of threads (default 32) - -faster - Compress faster, but with a minor compression loss - -help - Display help - -o string - Write output to another file. Single input file only - -pad string - Pad size to a multiple of this value, Examples: 500, 64K, 256K, 1M, 4M, etc (default "1") - -q Don't write any output to terminal, except errors - -rm - Delete source file(s) after successful compression - -safe - Do not overwrite output files - -slower - Compress more, but a lot slower - -snappy - Generate Snappy compatible output stream - -verify - Verify written files - -``` - -## s2d - -``` -Usage: s2d [options] file1 file2 - -Decompresses all files supplied as input. Input files must end with '.s2' or '.snappy'. -Output file names have the extension removed. By default output files will be overwritten. -Use - as the only file name to read from stdin and write to stdout. - -Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt -Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt - -File names beginning with 'http://' and 'https://' will be downloaded and decompressed. -Extensions on downloaded files are ignored. Only http response code 200 is accepted. - -Options: - -bench int - Run benchmark n times. No output will be written - -c Write all output to stdout. Multiple input files will be concatenated - -help - Display help - -o string - Write output to another file. Single input file only - -q Don't write any output to terminal, except errors - -rm - Delete source file(s) after successful decompression - -safe - Do not overwrite output files - -verify - Verify files, but do not write output -``` - -## s2sx: self-extracting archives - -s2sx allows creating self-extracting archives with no dependencies. - -By default, executables are created for the same platforms as the host os, -but this can be overridden with `-os` and `-arch` parameters. - -Extracted files have 0666 permissions, except when untar option used. - -``` -Usage: s2sx [options] file1 file2 - -Compresses all files supplied as input separately. -If files have '.s2' extension they are assumed to be compressed already. -Output files are written as 'filename.s2sx' and with '.exe' for windows targets. -If output is big, an additional file with ".more" is written. This must be included as well. -By default output files will be overwritten. - -Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt -Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt - -Options: - -arch string - Destination architecture (default "amd64") - -c Write all output to stdout. Multiple input files will be concatenated - -cpu int - Compress using this amount of threads (default 32) - -help - Display help - -max string - Maximum executable size. Rest will be written to another file. (default "1G") - -os string - Destination operating system (default "windows") - -q Don't write any output to terminal, except errors - -rm - Delete source file(s) after successful compression - -safe - Do not overwrite output files - -untar - Untar on destination -``` - -Available platforms are: - - * darwin-amd64 - * darwin-arm64 - * linux-amd64 - * linux-arm - * linux-arm64 - * linux-mips64 - * linux-ppc64le - * windows-386 - * windows-amd64 - -By default, there is a size limit of 1GB for the output executable. - -When this is exceeded the remaining file content is written to a file called -output+`.more`. This file must be included for a successful extraction and -placed alongside the executable for a successful extraction. - -This file *must* have the same name as the executable, so if the executable is renamed, -so must the `.more` file. - -This functionality is disabled with stdin/stdout. - -### Self-extracting TAR files - -If you wrap a TAR file you can specify `-untar` to make it untar on the destination host. - -Files are extracted to the current folder with the path specified in the tar file. - -Note that tar files are not validated before they are wrapped. - -For security reasons files that move below the root folder are not allowed. - -# Performance - -This section will focus on comparisons to Snappy. -This package is solely aimed at replacing Snappy as a high speed compression package. -If you are mainly looking for better compression [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) -gives better compression, but typically at speeds slightly below "better" mode in this package. - -Compression is increased compared to Snappy, mostly around 5-20% and the throughput is typically 25-40% increased (single threaded) compared to the Snappy Go implementation. - -Streams are concurrently compressed. The stream will be distributed among all available CPU cores for the best possible throughput. - -A "better" compression mode is also available. This allows to trade a bit of speed for a minor compression gain. -The content compressed in this mode is fully compatible with the standard decoder. - -Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all threads and a single thread (1 CPU): - -| File | S2 speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller | -|-----------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------| -| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 12.70x | 10556 MB/s | 7.35% | 4.15x | 3455 MB/s | 12.79% | -| (1 CPU) | 1.14x | 948 MB/s | - | 0.42x | 349 MB/s | - | -| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 17.13x | 14484 MB/s | 31.60% | 10.09x | 8533 MB/s | 37.71% | -| (1 CPU) | 1.33x | 1127 MB/s | - | 0.70x | 589 MB/s | - | -| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 15.14x | 12000 MB/s | -5.79% | 6.59x | 5223 MB/s | 5.80% | -| (1 CPU) | 1.11x | 877 MB/s | - | 0.47x | 370 MB/s | - | -| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 14.62x | 12116 MB/s | 15.90% | 5.35x | 4430 MB/s | 16.08% | -| (1 CPU) | 1.38x | 1146 MB/s | - | 0.38x | 312 MB/s | - | -| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst) | 8.83x | 17579 MB/s | 43.86% | 6.54x | 13011 MB/s | 47.23% | -| (1 CPU) | 1.14x | 2259 MB/s | - | 0.74x | 1475 MB/s | - | -| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 16.72x | 14019 MB/s | 24.02% | 10.11x | 8477 MB/s | 30.48% | -| (1 CPU) | 1.24x | 1043 MB/s | - | 0.70x | 586 MB/s | - | -| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 13.33x | 9254 MB/s | 1.84% | 6.75x | 4686 MB/s | 6.72% | -| (1 CPU) | 0.97x | 672 MB/s | - | 0.53x | 366 MB/s | - | -| sharnd.out.2gb | 2.11x | 12639 MB/s | 0.01% | 1.98x | 11833 MB/s | 0.01% | -| (1 CPU) | 0.93x | 5594 MB/s | - | 1.34x | 8030 MB/s | - | -| [enwik9](http://mattmahoney.net/dc/textdata.html) | 19.34x | 8220 MB/s | 3.98% | 7.87x | 3345 MB/s | 15.82% | -| (1 CPU) | 1.06x | 452 MB/s | - | 0.50x | 213 MB/s | - | -| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 10.48x | 6124 MB/s | 5.67% | 3.76x | 2197 MB/s | 12.60% | -| (1 CPU) | 0.97x | 568 MB/s | - | 0.46x | 271 MB/s | - | -| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results) | 21.07x | 9020 MB/s | 6.36% | 6.91x | 2959 MB/s | 16.95% | -| (1 CPU) | 1.07x | 460 MB/s | - | 0.51x | 220 MB/s | - | - -### Legend - -* `S2 speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core. -* `S2 throughput`: Throughput of S2 in MB/s. -* `S2 % smaller`: How many percent of the Snappy output size is S2 better. -* `S2 "better"`: Speed when enabling "better" compression mode in S2 compared to Snappy. -* `"better" throughput`: Speed when enabling "better" compression mode in S2 compared to Snappy. -* `"better" % smaller`: How many percent of the Snappy output size is S2 better when using "better" compression. - -There is a good speedup across the board when using a single thread and a significant speedup when using multiple threads. - -Machine generated data gets by far the biggest compression boost, with size being being reduced by up to 45% of Snappy size. - -The "better" compression mode sees a good improvement in all cases, but usually at a performance cost. - -Incompressible content (`sharnd.out.2gb`, 2GB random data) sees the smallest speedup. -This is likely dominated by synchronization overhead, which is confirmed by the fact that single threaded performance is higher (see above). - -## Decompression - -S2 attempts to create content that is also fast to decompress, except in "better" mode where the smallest representation is used. - -S2 vs Snappy **decompression** speed. Both operating on single core: - -| File | S2 Throughput | vs. Snappy | Better Throughput | vs. Snappy | -|-----------------------------------------------------------------------------------------------------|---------------|------------|-------------------|------------| -| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 2117 MB/s | 1.14x | 1738 MB/s | 0.94x | -| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 2401 MB/s | 1.25x | 2307 MB/s | 1.20x | -| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 2075 MB/s | 0.98x | 1764 MB/s | 0.83x | -| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 2967 MB/s | 1.05x | 2885 MB/s | 1.02x | -| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst) | 4141 MB/s | 1.07x | 4184 MB/s | 1.08x | -| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 2264 MB/s | 1.12x | 2185 MB/s | 1.08x | -| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 1525 MB/s | 1.03x | 1347 MB/s | 0.91x | -| sharnd.out.2gb | 3813 MB/s | 0.79x | 3900 MB/s | 0.81x | -| [enwik9](http://mattmahoney.net/dc/textdata.html) | 1246 MB/s | 1.29x | 967 MB/s | 1.00x | -| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 1433 MB/s | 1.12x | 1203 MB/s | 0.94x | -| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results) | 1284 MB/s | 1.32x | 1010 MB/s | 1.04x | - -### Legend - -* `S2 Throughput`: Decompression speed of S2 encoded content. -* `Better Throughput`: Decompression speed of S2 "better" encoded content. -* `vs Snappy`: Decompression speed of S2 "better" mode compared to Snappy and absolute speed. - - -While the decompression code hasn't changed, there is a significant speedup in decompression speed. -S2 prefers longer matches and will typically only find matches that are 6 bytes or longer. -While this reduces compression a bit, it improves decompression speed. - -The "better" compression mode will actively look for shorter matches, which is why it has a decompression speed quite similar to Snappy. - -Without assembly decompression is also very fast; single goroutine decompression speed. No assembly: - -| File | S2 Throughput | S2 throughput | -|--------------------------------|--------------|---------------| -| consensus.db.10gb.s2 | 1.84x | 2289.8 MB/s | -| 10gb.tar.s2 | 1.30x | 867.07 MB/s | -| rawstudio-mint14.tar.s2 | 1.66x | 1329.65 MB/s | -| github-june-2days-2019.json.s2 | 2.36x | 1831.59 MB/s | -| github-ranks-backup.bin.s2 | 1.73x | 1390.7 MB/s | -| enwik9.s2 | 1.67x | 681.53 MB/s | -| adresser.json.s2 | 3.41x | 4230.53 MB/s | -| silesia.tar.s2 | 1.52x | 811.58 | - -Even though S2 typically compresses better than Snappy, decompression speed is always better. - -## Block compression - - -When compressing blocks no concurrent compression is performed just as Snappy. -This is because blocks are for smaller payloads and generally will not benefit from concurrent compression. - -An important change is that incompressible blocks will not be more than at most 10 bytes bigger than the input. -In rare, worst case scenario Snappy blocks could be significantly bigger than the input. - -### Mixed content blocks - -The most reliable is a wide dataset. -For this we use [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z), -53927 files, total input size: 4,014,735,833 bytes. Single goroutine used. - -| * | Input | Output | Reduction | MB/s | -|-------------------|------------|------------|-----------|--------| -| S2 | 4014735833 | 1059723369 | 73.60% | **934.34** | -| S2 Better | 4014735833 | 969670507 | 75.85% | 532.70 | -| S2 Best | 4014735833 | 906625668 | **77.85%** | 46.84 | -| Snappy | 4014735833 | 1128706759 | 71.89% | 762.59 | -| S2, Snappy Output | 4014735833 | 1093821420 | 72.75% | 908.60 | -| LZ4 | 4014735833 | 1079259294 | 73.12% | 526.94 | - -S2 delivers both the best single threaded throughput with regular mode and the best compression rate with "best". -"Better" mode provides the same compression speed as LZ4 with better compression ratio. - -When outputting Snappy compatible output it still delivers better throughput (150MB/s more) and better compression. - -As can be seen from the other benchmarks decompression should also be easier on the S2 generated output. - -Though they cannot be compared due to different decompression speeds here are the speed/size comparisons for -other Go compressors: - -| * | Input | Output | Reduction | MB/s | -|-------------------|------------|------------|-----------|--------| -| Zstd Fastest (Go) | 4014735833 | 794608518 | 80.21% | 236.04 | -| Zstd Best (Go) | 4014735833 | 704603356 | 82.45% | 35.63 | -| Deflate (Go) l1 | 4014735833 | 871294239 | 78.30% | 214.04 | -| Deflate (Go) l9 | 4014735833 | 730389060 | 81.81% | 41.17 | - -### Standard block compression - -Benchmarking single block performance is subject to a lot more variation since it only tests a limited number of file patterns. -So individual benchmarks should only be seen as a guideline and the overall picture is more important. - -These micro-benchmarks are with data in cache and trained branch predictors. For a more realistic benchmark see the mixed content above. - -Block compression. Parallel benchmark running on 16 cores, 16 goroutines. - -AMD64 assembly is use for both S2 and Snappy. - -| Absolute Perf | Snappy size | S2 Size | Snappy Speed | S2 Speed | Snappy dec | S2 dec | -|-----------------------|-------------|---------|--------------|-------------|-------------|-------------| -| html | 22843 | 21111 | 16246 MB/s | 17438 MB/s | 40972 MB/s | 49263 MB/s | -| urls.10K | 335492 | 287326 | 7943 MB/s | 9693 MB/s | 22523 MB/s | 26484 MB/s | -| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 273889 MB/s | 718321 MB/s | 827552 MB/s | -| fireworks.jpeg (200B) | 146 | 155 | 8869 MB/s | 17773 MB/s | 33691 MB/s | 52421 MB/s | -| paper-100k.pdf | 85304 | 84459 | 167546 MB/s | 101263 MB/s | 326905 MB/s | 291944 MB/s | -| html_x_4 | 92234 | 21113 | 15194 MB/s | 50670 MB/s | 30843 MB/s | 32217 MB/s | -| alice29.txt | 88034 | 85975 | 5936 MB/s | 6139 MB/s | 12882 MB/s | 20044 MB/s | -| asyoulik.txt | 77503 | 79650 | 5517 MB/s | 6366 MB/s | 12735 MB/s | 22806 MB/s | -| lcet10.txt | 234661 | 220670 | 6235 MB/s | 6067 MB/s | 14519 MB/s | 18697 MB/s | -| plrabn12.txt | 319267 | 317985 | 5159 MB/s | 5726 MB/s | 11923 MB/s | 19901 MB/s | -| geo.protodata | 23335 | 18690 | 21220 MB/s | 26529 MB/s | 56271 MB/s | 62540 MB/s | -| kppkn.gtb | 69526 | 65312 | 9732 MB/s | 8559 MB/s | 18491 MB/s | 18969 MB/s | -| alice29.txt (128B) | 80 | 82 | 6691 MB/s | 15489 MB/s | 31883 MB/s | 38874 MB/s | -| alice29.txt (1000B) | 774 | 774 | 12204 MB/s | 13000 MB/s | 48056 MB/s | 52341 MB/s | -| alice29.txt (10000B) | 6648 | 6933 | 10044 MB/s | 12806 MB/s | 32378 MB/s | 46322 MB/s | -| alice29.txt (20000B) | 12686 | 13574 | 7733 MB/s | 11210 MB/s | 30566 MB/s | 58969 MB/s | - - -| Relative Perf | Snappy size | S2 size improved | S2 Speed | S2 Dec Speed | -|-----------------------|-------------|------------------|----------|--------------| -| html | 22.31% | 7.58% | 1.07x | 1.20x | -| urls.10K | 47.78% | 14.36% | 1.22x | 1.18x | -| fireworks.jpeg | 99.95% | -0.05% | 0.78x | 1.15x | -| fireworks.jpeg (200B) | 73.00% | -6.16% | 2.00x | 1.56x | -| paper-100k.pdf | 83.30% | 0.99% | 0.60x | 0.89x | -| html_x_4 | 22.52% | 77.11% | 3.33x | 1.04x | -| alice29.txt | 57.88% | 2.34% | 1.03x | 1.56x | -| asyoulik.txt | 61.91% | -2.77% | 1.15x | 1.79x | -| lcet10.txt | 54.99% | 5.96% | 0.97x | 1.29x | -| plrabn12.txt | 66.26% | 0.40% | 1.11x | 1.67x | -| geo.protodata | 19.68% | 19.91% | 1.25x | 1.11x | -| kppkn.gtb | 37.72% | 6.06% | 0.88x | 1.03x | -| alice29.txt (128B) | 62.50% | -2.50% | 2.31x | 1.22x | -| alice29.txt (1000B) | 77.40% | 0.00% | 1.07x | 1.09x | -| alice29.txt (10000B) | 66.48% | -4.29% | 1.27x | 1.43x | -| alice29.txt (20000B) | 63.43% | -7.00% | 1.45x | 1.93x | - -Speed is generally at or above Snappy. Small blocks gets a significant speedup, although at the expense of size. - -Decompression speed is better than Snappy, except in one case. - -Since payloads are very small the variance in terms of size is rather big, so they should only be seen as a general guideline. - -Size is on average around Snappy, but varies on content type. -In cases where compression is worse, it usually is compensated by a speed boost. - - -### Better compression - -Benchmarking single block performance is subject to a lot more variation since it only tests a limited number of file patterns. -So individual benchmarks should only be seen as a guideline and the overall picture is more important. - -| Absolute Perf | Snappy size | Better Size | Snappy Speed | Better Speed | Snappy dec | Better dec | -|-----------------------|-------------|-------------|--------------|--------------|-------------|-------------| -| html | 22843 | 19833 | 16246 MB/s | 7731 MB/s | 40972 MB/s | 40292 MB/s | -| urls.10K | 335492 | 253529 | 7943 MB/s | 3980 MB/s | 22523 MB/s | 20981 MB/s | -| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 9760 MB/s | 718321 MB/s | 823698 MB/s | -| fireworks.jpeg (200B) | 146 | 142 | 8869 MB/s | 594 MB/s | 33691 MB/s | 30101 MB/s | -| paper-100k.pdf | 85304 | 82915 | 167546 MB/s | 7470 MB/s | 326905 MB/s | 198869 MB/s | -| html_x_4 | 92234 | 19841 | 15194 MB/s | 23403 MB/s | 30843 MB/s | 30937 MB/s | -| alice29.txt | 88034 | 73218 | 5936 MB/s | 2945 MB/s | 12882 MB/s | 16611 MB/s | -| asyoulik.txt | 77503 | 66844 | 5517 MB/s | 2739 MB/s | 12735 MB/s | 14975 MB/s | -| lcet10.txt | 234661 | 190589 | 6235 MB/s | 3099 MB/s | 14519 MB/s | 16634 MB/s | -| plrabn12.txt | 319267 | 270828 | 5159 MB/s | 2600 MB/s | 11923 MB/s | 13382 MB/s | -| geo.protodata | 23335 | 18278 | 21220 MB/s | 11208 MB/s | 56271 MB/s | 57961 MB/s | -| kppkn.gtb | 69526 | 61851 | 9732 MB/s | 4556 MB/s | 18491 MB/s | 16524 MB/s | -| alice29.txt (128B) | 80 | 81 | 6691 MB/s | 529 MB/s | 31883 MB/s | 34225 MB/s | -| alice29.txt (1000B) | 774 | 748 | 12204 MB/s | 1943 MB/s | 48056 MB/s | 42068 MB/s | -| alice29.txt (10000B) | 6648 | 6234 | 10044 MB/s | 2949 MB/s | 32378 MB/s | 28813 MB/s | -| alice29.txt (20000B) | 12686 | 11584 | 7733 MB/s | 2822 MB/s | 30566 MB/s | 27315 MB/s | - - -| Relative Perf | Snappy size | Better size | Better Speed | Better dec | -|-----------------------|-------------|-------------|--------------|------------| -| html | 22.31% | 13.18% | 0.48x | 0.98x | -| urls.10K | 47.78% | 24.43% | 0.50x | 0.93x | -| fireworks.jpeg | 99.95% | -0.05% | 0.03x | 1.15x | -| fireworks.jpeg (200B) | 73.00% | 2.74% | 0.07x | 0.89x | -| paper-100k.pdf | 83.30% | 2.80% | 0.07x | 0.61x | -| html_x_4 | 22.52% | 78.49% | 0.04x | 1.00x | -| alice29.txt | 57.88% | 16.83% | 1.54x | 1.29x | -| asyoulik.txt | 61.91% | 13.75% | 0.50x | 1.18x | -| lcet10.txt | 54.99% | 18.78% | 0.50x | 1.15x | -| plrabn12.txt | 66.26% | 15.17% | 0.50x | 1.12x | -| geo.protodata | 19.68% | 21.67% | 0.50x | 1.03x | -| kppkn.gtb | 37.72% | 11.04% | 0.53x | 0.89x | -| alice29.txt (128B) | 62.50% | -1.25% | 0.47x | 1.07x | -| alice29.txt (1000B) | 77.40% | 3.36% | 0.08x | 0.88x | -| alice29.txt (10000B) | 66.48% | 6.23% | 0.16x | 0.89x | -| alice29.txt (20000B) | 63.43% | 8.69% | 0.29x | 0.89x | - -Except for the mostly incompressible JPEG image compression is better and usually in the -double digits in terms of percentage reduction over Snappy. - -The PDF sample shows a significant slowdown compared to Snappy, as this mode tries harder -to compress the data. Very small blocks are also not favorable for better compression, so throughput is way down. - -This mode aims to provide better compression at the expense of performance and achieves that -without a huge performance penalty, except on very small blocks. - -Decompression speed suffers a little compared to the regular S2 mode, -but still manages to be close to Snappy in spite of increased compression. - -# Best compression mode - -S2 offers a "best" compression mode. - -This will compress as much as possible with little regard to CPU usage. - -Mainly for offline compression, but where decompression speed should still -be high and compatible with other S2 compressed data. - -Some examples compared on 16 core CPU, amd64 assembly used: - -``` -* enwik10 -Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s -Better... 10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s -Best... 10000000000 -> 3627364337 [36.27%]; 43.051s, 221.5MB/s - -* github-june-2days-2019.json -Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s -Better... 6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s -Best... 6273951764 -> 832855506 [13.27%]; 9.455s, 632.8MB/s - -* nyc-taxi-data-10M.csv -Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s -Better... 3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s -Best... 3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s - -* 10gb.tar -Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s -Better... 10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s -Best... 10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/ - -* consensus.db.10gb -Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s -Better... 10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s -Best... 10737418240 -> 4244773384 [39.53%]; 42.96s, 238.4MB/s -``` - -Decompression speed should be around the same as using the 'better' compression mode. - -# Snappy Compatibility - -S2 now offers full compatibility with Snappy. - -This means that the efficient encoders of S2 can be used to generate fully Snappy compatible output. - -There is a [snappy](https://github.com/klauspost/compress/tree/master/snappy) package that can be used by -simply changing imports from `github.com/golang/snappy` to `github.com/klauspost/compress/snappy`. -This uses "better" mode for all operations. -If you would like more control, you can use the s2 package as described below: - -## Blocks - -Snappy compatible blocks can be generated with the S2 encoder. -Compression and speed is typically a bit better `MaxEncodedLen` is also smaller for smaller memory usage. Replace - -| Snappy | S2 replacement | -|----------------------------|-------------------------| -| snappy.Encode(...) | s2.EncodeSnappy(...) | -| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) | - -`s2.EncodeSnappy` can be replaced with `s2.EncodeSnappyBetter` or `s2.EncodeSnappyBest` to get more efficiently compressed snappy compatible output. - -`s2.ConcatBlocks` is compatible with snappy blocks. - -Comparison of [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z), -53927 files, total input size: 4,014,735,833 bytes. amd64, single goroutine used: - -| Encoder | Size | MB/s | Reduction | -|-----------------------|------------|--------|------------ -| snappy.Encode | 1128706759 | 725.59 | 71.89% | -| s2.EncodeSnappy | 1093823291 | 899.16 | 72.75% | -| s2.EncodeSnappyBetter | 1001158548 | 578.49 | 75.06% | -| s2.EncodeSnappyBest | 944507998 | 66.00 | 76.47% | - -## Streams - -For streams, replace `enc = snappy.NewBufferedWriter(w)` with `enc = s2.NewWriter(w, s2.WriterSnappyCompat())`. -All other options are available, but note that block size limit is different for snappy. - -Comparison of different streams, AMD Ryzen 3950x, 16 cores. Size and throughput: - -| File | snappy.NewWriter | S2 Snappy | S2 Snappy, Better | S2 Snappy, Best | -|-----------------------------|--------------------------|---------------------------|--------------------------|-------------------------| -| nyc-taxi-data-10M.csv | 1316042016 - 517.54MB/s | 1307003093 - 8406.29MB/s | 1174534014 - 4984.35MB/s | 1115904679 - 177.81MB/s | -| enwik10 | 5088294643 - 433.45MB/s | 5175840939 - 8454.52MB/s | 4560784526 - 4403.10MB/s | 4340299103 - 159.71MB/s | -| 10gb.tar | 6056946612 - 703.25MB/s | 6208571995 - 9035.75MB/s | 5741646126 - 2402.08MB/s | 5548973895 - 171.17MB/s | -| github-june-2days-2019.json | 1525176492 - 908.11MB/s | 1476519054 - 12625.93MB/s | 1400547532 - 6163.61MB/s | 1321887137 - 200.71MB/s | -| consensus.db.10gb | 5412897703 - 1054.38MB/s | 5354073487 - 12634.82MB/s | 5335069899 - 2472.23MB/s | 5201000954 - 166.32MB/s | - -# Decompression - -All decompression functions map directly to equivalent s2 functions. - -| Snappy | S2 replacement | -|------------------------|--------------------| -| snappy.Decode(...) | s2.Decode(...) | -| snappy.DecodedLen(...) | s2.DecodedLen(...) | -| snappy.NewReader(...) | s2.NewReader(...) | - -Features like [quick forward skipping without decompression](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.Skip) -are also available for Snappy streams. - -If you know you are only decompressing snappy streams, setting [`ReaderMaxBlockSize(64<<10)`](https://pkg.go.dev/github.com/klauspost/compress/s2#ReaderMaxBlockSize) -on your Reader will reduce memory consumption. - -# Concatenating blocks and streams. - -Concatenating streams will concatenate the output of both without recompressing them. -While this is inefficient in terms of compression it might be usable in certain scenarios. -The 10 byte 'stream identifier' of the second stream can optionally be stripped, but it is not a requirement. - -Blocks can be concatenated using the `ConcatBlocks` function. - -Snappy blocks/streams can safely be concatenated with S2 blocks and streams. - -# Format Extensions - -* Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`. -* [Framed compressed blocks](https://github.com/google/snappy/blob/master/format_description.txt) can be up to 4MB (up from 64KB). -* Compressed blocks can have an offset of `0`, which indicates to repeat the last seen offset. - -Repeat offsets must be encoded as a [2.2.1. Copy with 1-byte offset (01)](https://github.com/google/snappy/blob/master/format_description.txt#L89), where the offset is 0. - -The length is specified by reading the 3-bit length specified in the tag and decode using this table: - -| Length | Actual Length | -|--------|----------------------| -| 0 | 4 | -| 1 | 5 | -| 2 | 6 | -| 3 | 7 | -| 4 | 8 | -| 5 | 8 + read 1 byte | -| 6 | 260 + read 2 bytes | -| 7 | 65540 + read 3 bytes | - -This allows any repeat offset + length to be represented by 2 to 5 bytes. - -Lengths are stored as little endian values. - -The first copy of a block cannot be a repeat offset and the offset is not carried across blocks in streams. - -Default streaming block size is 1MB. - -# LICENSE - -This code is based on the [Snappy-Go](https://github.com/golang/snappy) implementation. - -Use of this source code is governed by a BSD-style license that can be found in the LICENSE file. diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go deleted file mode 100644 index d0ae5304efe..00000000000 --- a/vendor/github.com/klauspost/compress/s2/decode.go +++ /dev/null @@ -1,565 +0,0 @@ -// Copyright 2011 The Snappy-Go Authors. All rights reserved. -// Copyright (c) 2019 Klaus Post. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package s2 - -import ( - "encoding/binary" - "errors" - "io" -) - -var ( - // ErrCorrupt reports that the input is invalid. - ErrCorrupt = errors.New("s2: corrupt input") - // ErrCRC reports that the input failed CRC validation (streams only) - ErrCRC = errors.New("s2: corrupt input, crc mismatch") - // ErrTooLarge reports that the uncompressed length is too large. - ErrTooLarge = errors.New("s2: decoded block is too large") - // ErrUnsupported reports that the input isn't supported. - ErrUnsupported = errors.New("s2: unsupported input") -) - -// DecodedLen returns the length of the decoded block. -func DecodedLen(src []byte) (int, error) { - v, _, err := decodedLen(src) - return v, err -} - -// decodedLen returns the length of the decoded block and the number of bytes -// that the length header occupied. -func decodedLen(src []byte) (blockLen, headerLen int, err error) { - v, n := binary.Uvarint(src) - if n <= 0 || v > 0xffffffff { - return 0, 0, ErrCorrupt - } - - const wordSize = 32 << (^uint(0) >> 32 & 1) - if wordSize == 32 && v > 0x7fffffff { - return 0, 0, ErrTooLarge - } - return int(v), n, nil -} - -const ( - decodeErrCodeCorrupt = 1 -) - -// Decode returns the decoded form of src. The returned slice may be a sub- -// slice of dst if dst was large enough to hold the entire decoded block. -// Otherwise, a newly allocated slice will be returned. -// -// The dst and src must not overlap. It is valid to pass a nil dst. -func Decode(dst, src []byte) ([]byte, error) { - dLen, s, err := decodedLen(src) - if err != nil { - return nil, err - } - if dLen <= cap(dst) { - dst = dst[:dLen] - } else { - dst = make([]byte, dLen) - } - if s2Decode(dst, src[s:]) != 0 { - return nil, ErrCorrupt - } - return dst, nil -} - -// NewReader returns a new Reader that decompresses from r, using the framing -// format described at -// https://github.com/google/snappy/blob/master/framing_format.txt with S2 changes. -func NewReader(r io.Reader, opts ...ReaderOption) *Reader { - nr := Reader{ - r: r, - maxBlock: maxBlockSize, - } - for _, opt := range opts { - if err := opt(&nr); err != nil { - nr.err = err - return &nr - } - } - nr.maxBufSize = MaxEncodedLen(nr.maxBlock) + checksumSize - if nr.lazyBuf > 0 { - nr.buf = make([]byte, MaxEncodedLen(nr.lazyBuf)+checksumSize) - } else { - nr.buf = make([]byte, MaxEncodedLen(defaultBlockSize)+checksumSize) - } - nr.paramsOK = true - return &nr -} - -// ReaderOption is an option for creating a decoder. -type ReaderOption func(*Reader) error - -// ReaderMaxBlockSize allows to control allocations if the stream -// has been compressed with a smaller WriterBlockSize, or with the default 1MB. -// Blocks must be this size or smaller to decompress, -// otherwise the decoder will return ErrUnsupported. -// -// For streams compressed with Snappy this can safely be set to 64KB (64 << 10). -// -// Default is the maximum limit of 4MB. -func ReaderMaxBlockSize(blockSize int) ReaderOption { - return func(r *Reader) error { - if blockSize > maxBlockSize || blockSize <= 0 { - return errors.New("s2: block size too large. Must be <= 4MB and > 0") - } - if r.lazyBuf == 0 && blockSize < defaultBlockSize { - r.lazyBuf = blockSize - } - r.maxBlock = blockSize - return nil - } -} - -// ReaderAllocBlock allows to control upfront stream allocations -// and not allocate for frames bigger than this initially. -// If frames bigger than this is seen a bigger buffer will be allocated. -// -// Default is 1MB, which is default output size. -func ReaderAllocBlock(blockSize int) ReaderOption { - return func(r *Reader) error { - if blockSize > maxBlockSize || blockSize < 1024 { - return errors.New("s2: invalid ReaderAllocBlock. Must be <= 4MB and >= 1024") - } - r.lazyBuf = blockSize - return nil - } -} - -// Reader is an io.Reader that can read Snappy-compressed bytes. -type Reader struct { - r io.Reader - err error - decoded []byte - buf []byte - // decoded[i:j] contains decoded bytes that have not yet been passed on. - i, j int - // maximum block size allowed. - maxBlock int - // maximum expected buffer size. - maxBufSize int - // alloc a buffer this size if > 0. - lazyBuf int - readHeader bool - paramsOK bool - snappyFrame bool -} - -// ensureBufferSize will ensure that the buffer can take at least n bytes. -// If false is returned the buffer exceeds maximum allowed size. -func (r *Reader) ensureBufferSize(n int) bool { - if len(r.buf) >= n { - return true - } - if n > r.maxBufSize { - r.err = ErrCorrupt - return false - } - // Realloc buffer. - r.buf = make([]byte, n) - return true -} - -// Reset discards any buffered data, resets all state, and switches the Snappy -// reader to read from r. This permits reusing a Reader rather than allocating -// a new one. -func (r *Reader) Reset(reader io.Reader) { - if !r.paramsOK { - return - } - r.r = reader - r.err = nil - r.i = 0 - r.j = 0 - r.readHeader = false -} - -func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) { - if _, r.err = io.ReadFull(r.r, p); r.err != nil { - if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) { - r.err = ErrCorrupt - } - return false - } - return true -} - -// skipN will skip n bytes. -// If the supplied reader supports seeking that is used. -// tmp is used as a temporary buffer for reading. -// The supplied slice does not need to be the size of the read. -func (r *Reader) skipN(tmp []byte, n int, allowEOF bool) (ok bool) { - if rs, ok := r.r.(io.ReadSeeker); ok { - _, err := rs.Seek(int64(n), io.SeekCurrent) - if err == nil { - return true - } - if err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) { - r.err = ErrCorrupt - return false - } - } - for n > 0 { - if n < len(tmp) { - tmp = tmp[:n] - } - if _, r.err = io.ReadFull(r.r, tmp); r.err != nil { - if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) { - r.err = ErrCorrupt - } - return false - } - n -= len(tmp) - } - return true -} - -// Read satisfies the io.Reader interface. -func (r *Reader) Read(p []byte) (int, error) { - if r.err != nil { - return 0, r.err - } - for { - if r.i < r.j { - n := copy(p, r.decoded[r.i:r.j]) - r.i += n - return n, nil - } - if !r.readFull(r.buf[:4], true) { - return 0, r.err - } - chunkType := r.buf[0] - if !r.readHeader { - if chunkType != chunkTypeStreamIdentifier { - r.err = ErrCorrupt - return 0, r.err - } - r.readHeader = true - } - chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 - - // The chunk types are specified at - // https://github.com/google/snappy/blob/master/framing_format.txt - switch chunkType { - case chunkTypeCompressedData: - // Section 4.2. Compressed data (chunk type 0x00). - if chunkLen < checksumSize { - r.err = ErrCorrupt - return 0, r.err - } - if !r.ensureBufferSize(chunkLen) { - if r.err == nil { - r.err = ErrUnsupported - } - return 0, r.err - } - buf := r.buf[:chunkLen] - if !r.readFull(buf, false) { - return 0, r.err - } - checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 - buf = buf[checksumSize:] - - n, err := DecodedLen(buf) - if err != nil { - r.err = err - return 0, r.err - } - if r.snappyFrame && n > maxSnappyBlockSize { - r.err = ErrCorrupt - return 0, r.err - } - - if n > len(r.decoded) { - if n > r.maxBlock { - r.err = ErrCorrupt - return 0, r.err - } - r.decoded = make([]byte, n) - } - if _, err := Decode(r.decoded, buf); err != nil { - r.err = err - return 0, r.err - } - if crc(r.decoded[:n]) != checksum { - r.err = ErrCRC - return 0, r.err - } - r.i, r.j = 0, n - continue - - case chunkTypeUncompressedData: - // Section 4.3. Uncompressed data (chunk type 0x01). - if chunkLen < checksumSize { - r.err = ErrCorrupt - return 0, r.err - } - if !r.ensureBufferSize(chunkLen) { - if r.err == nil { - r.err = ErrUnsupported - } - return 0, r.err - } - buf := r.buf[:checksumSize] - if !r.readFull(buf, false) { - return 0, r.err - } - checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 - // Read directly into r.decoded instead of via r.buf. - n := chunkLen - checksumSize - if r.snappyFrame && n > maxSnappyBlockSize { - r.err = ErrCorrupt - return 0, r.err - } - if n > len(r.decoded) { - if n > r.maxBlock { - r.err = ErrCorrupt - return 0, r.err - } - r.decoded = make([]byte, n) - } - if !r.readFull(r.decoded[:n], false) { - return 0, r.err - } - if crc(r.decoded[:n]) != checksum { - r.err = ErrCRC - return 0, r.err - } - r.i, r.j = 0, n - continue - - case chunkTypeStreamIdentifier: - // Section 4.1. Stream identifier (chunk type 0xff). - if chunkLen != len(magicBody) { - r.err = ErrCorrupt - return 0, r.err - } - if !r.readFull(r.buf[:len(magicBody)], false) { - return 0, r.err - } - if string(r.buf[:len(magicBody)]) != magicBody { - if string(r.buf[:len(magicBody)]) != magicBodySnappy { - r.err = ErrCorrupt - return 0, r.err - } else { - r.snappyFrame = true - } - } else { - r.snappyFrame = false - } - continue - } - - if chunkType <= 0x7f { - // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f). - r.err = ErrUnsupported - return 0, r.err - } - // Section 4.4 Padding (chunk type 0xfe). - // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd). - if chunkLen > maxBlockSize { - r.err = ErrUnsupported - return 0, r.err - } - - if !r.skipN(r.buf, chunkLen, false) { - return 0, r.err - } - } -} - -// Skip will skip n bytes forward in the decompressed output. -// For larger skips this consumes less CPU and is faster than reading output and discarding it. -// CRC is not checked on skipped blocks. -// io.ErrUnexpectedEOF is returned if the stream ends before all bytes have been skipped. -// If a decoding error is encountered subsequent calls to Read will also fail. -func (r *Reader) Skip(n int64) error { - if n < 0 { - return errors.New("attempted negative skip") - } - if r.err != nil { - return r.err - } - - for n > 0 { - if r.i < r.j { - // Skip in buffer. - // decoded[i:j] contains decoded bytes that have not yet been passed on. - left := int64(r.j - r.i) - if left >= n { - r.i += int(n) - return nil - } - n -= int64(r.j - r.i) - r.i, r.j = 0, 0 - } - - // Buffer empty; read blocks until we have content. - if !r.readFull(r.buf[:4], true) { - if r.err == io.EOF { - r.err = io.ErrUnexpectedEOF - } - return r.err - } - chunkType := r.buf[0] - if !r.readHeader { - if chunkType != chunkTypeStreamIdentifier { - r.err = ErrCorrupt - return r.err - } - r.readHeader = true - } - chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 - - // The chunk types are specified at - // https://github.com/google/snappy/blob/master/framing_format.txt - switch chunkType { - case chunkTypeCompressedData: - // Section 4.2. Compressed data (chunk type 0x00). - if chunkLen < checksumSize { - r.err = ErrCorrupt - return r.err - } - if !r.ensureBufferSize(chunkLen) { - if r.err == nil { - r.err = ErrUnsupported - } - return r.err - } - buf := r.buf[:chunkLen] - if !r.readFull(buf, false) { - return r.err - } - checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 - buf = buf[checksumSize:] - - dLen, err := DecodedLen(buf) - if err != nil { - r.err = err - return r.err - } - if dLen > r.maxBlock { - r.err = ErrCorrupt - return r.err - } - // Check if destination is within this block - if int64(dLen) > n { - if len(r.decoded) < dLen { - r.decoded = make([]byte, dLen) - } - if _, err := Decode(r.decoded, buf); err != nil { - r.err = err - return r.err - } - if crc(r.decoded[:dLen]) != checksum { - r.err = ErrCorrupt - return r.err - } - } else { - // Skip block completely - n -= int64(dLen) - dLen = 0 - } - r.i, r.j = 0, dLen - continue - case chunkTypeUncompressedData: - // Section 4.3. Uncompressed data (chunk type 0x01). - if chunkLen < checksumSize { - r.err = ErrCorrupt - return r.err - } - if !r.ensureBufferSize(chunkLen) { - if r.err != nil { - r.err = ErrUnsupported - } - return r.err - } - buf := r.buf[:checksumSize] - if !r.readFull(buf, false) { - return r.err - } - checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 - // Read directly into r.decoded instead of via r.buf. - n2 := chunkLen - checksumSize - if n2 > len(r.decoded) { - if n2 > r.maxBlock { - r.err = ErrCorrupt - return r.err - } - r.decoded = make([]byte, n2) - } - if !r.readFull(r.decoded[:n2], false) { - return r.err - } - if int64(n2) < n { - if crc(r.decoded[:n2]) != checksum { - r.err = ErrCorrupt - return r.err - } - } - r.i, r.j = 0, n2 - continue - case chunkTypeStreamIdentifier: - // Section 4.1. Stream identifier (chunk type 0xff). - if chunkLen != len(magicBody) { - r.err = ErrCorrupt - return r.err - } - if !r.readFull(r.buf[:len(magicBody)], false) { - return r.err - } - if string(r.buf[:len(magicBody)]) != magicBody { - if string(r.buf[:len(magicBody)]) != magicBodySnappy { - r.err = ErrCorrupt - return r.err - } - } - - continue - } - - if chunkType <= 0x7f { - // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f). - r.err = ErrUnsupported - return r.err - } - if chunkLen > maxBlockSize { - r.err = ErrUnsupported - return r.err - } - // Section 4.4 Padding (chunk type 0xfe). - // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd). - if !r.skipN(r.buf, chunkLen, false) { - return r.err - } - } - return nil -} - -// ReadByte satisfies the io.ByteReader interface. -func (r *Reader) ReadByte() (byte, error) { - if r.err != nil { - return 0, r.err - } - if r.i < r.j { - c := r.decoded[r.i] - r.i++ - return c, nil - } - var tmp [1]byte - for i := 0; i < 10; i++ { - n, err := r.Read(tmp[:]) - if err != nil { - return 0, err - } - if n == 1 { - return tmp[0], nil - } - } - return 0, io.ErrNoProgress -} diff --git a/vendor/github.com/klauspost/compress/s2/decode_amd64.s b/vendor/github.com/klauspost/compress/s2/decode_amd64.s deleted file mode 100644 index baa3bf60f5b..00000000000 --- a/vendor/github.com/klauspost/compress/s2/decode_amd64.s +++ /dev/null @@ -1,571 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Copyright (c) 2019 Klaus Post. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine -// +build gc -// +build !noasm - -#include "textflag.h" - -#define R_TMP0 AX -#define R_TMP1 BX -#define R_LEN CX -#define R_OFF DX -#define R_SRC SI -#define R_DST DI -#define R_DBASE R8 -#define R_DLEN R9 -#define R_DEND R10 -#define R_SBASE R11 -#define R_SLEN R12 -#define R_SEND R13 -#define R_TMP2 R14 -#define R_TMP3 R15 - -// The asm code generally follows the pure Go code in decode_other.go, except -// where marked with a "!!!". - -// func decode(dst, src []byte) int -// -// All local variables fit into registers. The non-zero stack size is only to -// spill registers and push args when issuing a CALL. The register allocation: -// - R_TMP0 scratch -// - R_TMP1 scratch -// - R_LEN length or x (shared) -// - R_OFF offset -// - R_SRC &src[s] -// - R_DST &dst[d] -// + R_DBASE dst_base -// + R_DLEN dst_len -// + R_DEND dst_base + dst_len -// + R_SBASE src_base -// + R_SLEN src_len -// + R_SEND src_base + src_len -// - R_TMP2 used by doCopy -// - R_TMP3 used by doCopy -// -// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the -// function, and after a CALL returns, and are not otherwise modified. -// -// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST. -// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC. -TEXT ·s2Decode(SB), NOSPLIT, $48-56 - // Initialize R_SRC, R_DST and R_DBASE-R_SEND. - MOVQ dst_base+0(FP), R_DBASE - MOVQ dst_len+8(FP), R_DLEN - MOVQ R_DBASE, R_DST - MOVQ R_DBASE, R_DEND - ADDQ R_DLEN, R_DEND - MOVQ src_base+24(FP), R_SBASE - MOVQ src_len+32(FP), R_SLEN - MOVQ R_SBASE, R_SRC - MOVQ R_SBASE, R_SEND - ADDQ R_SLEN, R_SEND - XORQ R_OFF, R_OFF - -loop: - // for s < len(src) - CMPQ R_SRC, R_SEND - JEQ end - - // R_LEN = uint32(src[s]) - // - // switch src[s] & 0x03 - MOVBLZX (R_SRC), R_LEN - MOVL R_LEN, R_TMP1 - ANDL $3, R_TMP1 - CMPL R_TMP1, $1 - JAE tagCopy - - // ---------------------------------------- - // The code below handles literal tags. - - // case tagLiteral: - // x := uint32(src[s] >> 2) - // switch - SHRL $2, R_LEN - CMPL R_LEN, $60 - JAE tagLit60Plus - - // case x < 60: - // s++ - INCQ R_SRC - -doLit: - // This is the end of the inner "switch", when we have a literal tag. - // - // We assume that R_LEN == x and x fits in a uint32, where x is the variable - // used in the pure Go decode_other.go code. - - // length = int(x) + 1 - // - // Unlike the pure Go code, we don't need to check if length <= 0 because - // R_LEN can hold 64 bits, so the increment cannot overflow. - INCQ R_LEN - - // Prepare to check if copying length bytes will run past the end of dst or - // src. - // - // R_TMP0 = len(dst) - d - // R_TMP1 = len(src) - s - MOVQ R_DEND, R_TMP0 - SUBQ R_DST, R_TMP0 - MOVQ R_SEND, R_TMP1 - SUBQ R_SRC, R_TMP1 - - // !!! Try a faster technique for short (16 or fewer bytes) copies. - // - // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { - // goto callMemmove // Fall back on calling runtime·memmove. - // } - // - // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s - // against 21 instead of 16, because it cannot assume that all of its input - // is contiguous in memory and so it needs to leave enough source bytes to - // read the next tag without refilling buffers, but Go's Decode assumes - // contiguousness (the src argument is a []byte). - CMPQ R_LEN, $16 - JGT callMemmove - CMPQ R_TMP0, $16 - JLT callMemmove - CMPQ R_TMP1, $16 - JLT callMemmove - - // !!! Implement the copy from src to dst as a 16-byte load and store. - // (Decode's documentation says that dst and src must not overlap.) - // - // This always copies 16 bytes, instead of only length bytes, but that's - // OK. If the input is a valid Snappy encoding then subsequent iterations - // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a - // non-nil error), so the overrun will be ignored. - // - // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or - // 16-byte loads and stores. This technique probably wouldn't be as - // effective on architectures that are fussier about alignment. - MOVOU 0(R_SRC), X0 - MOVOU X0, 0(R_DST) - - // d += length - // s += length - ADDQ R_LEN, R_DST - ADDQ R_LEN, R_SRC - JMP loop - -callMemmove: - // if length > len(dst)-d || length > len(src)-s { etc } - CMPQ R_LEN, R_TMP0 - JGT errCorrupt - CMPQ R_LEN, R_TMP1 - JGT errCorrupt - - // copy(dst[d:], src[s:s+length]) - // - // This means calling runtime·memmove(&dst[d], &src[s], length), so we push - // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those - // three registers to the stack, to save local variables across the CALL. - MOVQ R_DST, 0(SP) - MOVQ R_SRC, 8(SP) - MOVQ R_LEN, 16(SP) - MOVQ R_DST, 24(SP) - MOVQ R_SRC, 32(SP) - MOVQ R_LEN, 40(SP) - MOVQ R_OFF, 48(SP) - CALL runtime·memmove(SB) - - // Restore local variables: unspill registers from the stack and - // re-calculate R_DBASE-R_SEND. - MOVQ 24(SP), R_DST - MOVQ 32(SP), R_SRC - MOVQ 40(SP), R_LEN - MOVQ 48(SP), R_OFF - MOVQ dst_base+0(FP), R_DBASE - MOVQ dst_len+8(FP), R_DLEN - MOVQ R_DBASE, R_DEND - ADDQ R_DLEN, R_DEND - MOVQ src_base+24(FP), R_SBASE - MOVQ src_len+32(FP), R_SLEN - MOVQ R_SBASE, R_SEND - ADDQ R_SLEN, R_SEND - - // d += length - // s += length - ADDQ R_LEN, R_DST - ADDQ R_LEN, R_SRC - JMP loop - -tagLit60Plus: - // !!! This fragment does the - // - // s += x - 58; if uint(s) > uint(len(src)) { etc } - // - // checks. In the asm version, we code it once instead of once per switch case. - ADDQ R_LEN, R_SRC - SUBQ $58, R_SRC - CMPQ R_SRC, R_SEND - JA errCorrupt - - // case x == 60: - CMPL R_LEN, $61 - JEQ tagLit61 - JA tagLit62Plus - - // x = uint32(src[s-1]) - MOVBLZX -1(R_SRC), R_LEN - JMP doLit - -tagLit61: - // case x == 61: - // x = uint32(src[s-2]) | uint32(src[s-1])<<8 - MOVWLZX -2(R_SRC), R_LEN - JMP doLit - -tagLit62Plus: - CMPL R_LEN, $62 - JA tagLit63 - - // case x == 62: - // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 - MOVWLZX -3(R_SRC), R_LEN - MOVBLZX -1(R_SRC), R_TMP1 - SHLL $16, R_TMP1 - ORL R_TMP1, R_LEN - JMP doLit - -tagLit63: - // case x == 63: - // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 - MOVL -4(R_SRC), R_LEN - JMP doLit - -// The code above handles literal tags. -// ---------------------------------------- -// The code below handles copy tags. - -tagCopy4: - // case tagCopy4: - // s += 5 - ADDQ $5, R_SRC - - // if uint(s) > uint(len(src)) { etc } - CMPQ R_SRC, R_SEND - JA errCorrupt - - // length = 1 + int(src[s-5])>>2 - SHRQ $2, R_LEN - INCQ R_LEN - - // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) - MOVLQZX -4(R_SRC), R_OFF - JMP doCopy - -tagCopy2: - // case tagCopy2: - // s += 3 - ADDQ $3, R_SRC - - // if uint(s) > uint(len(src)) { etc } - CMPQ R_SRC, R_SEND - JA errCorrupt - - // length = 1 + int(src[s-3])>>2 - SHRQ $2, R_LEN - INCQ R_LEN - - // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) - MOVWQZX -2(R_SRC), R_OFF - JMP doCopy - -tagCopy: - // We have a copy tag. We assume that: - // - R_TMP1 == src[s] & 0x03 - // - R_LEN == src[s] - CMPQ R_TMP1, $2 - JEQ tagCopy2 - JA tagCopy4 - - // case tagCopy1: - // s += 2 - ADDQ $2, R_SRC - - // if uint(s) > uint(len(src)) { etc } - CMPQ R_SRC, R_SEND - JA errCorrupt - - // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) - MOVQ R_LEN, R_TMP0 - ANDQ $0xe0, R_TMP0 - SHLQ $3, R_TMP0 - MOVBQZX -1(R_SRC), R_TMP1 - ORQ R_TMP1, R_TMP0 - - // length = 4 + int(src[s-2])>>2&0x7 - SHRQ $2, R_LEN - ANDQ $7, R_LEN - ADDQ $4, R_LEN - - // check if repeat code - CMPQ R_TMP0, $0 - JE repeatCode - - // This is a regular copy, transfer our temporary value to R_OFF (length) - MOVQ R_TMP0, R_OFF - JMP doCopy - -// This is a repeat code. -repeatCode: - // If length < 9, reuse last offset, with the length already calculated. - CMPQ R_LEN, $9 - JL doCopyRepeat - - // Read additional bytes for length. - JE repeatLen1 - - // Rare, so the extra branch shouldn't hurt too much. - CMPQ R_LEN, $10 - JE repeatLen2 - JMP repeatLen3 - -// Read repeat lengths. -repeatLen1: - // s ++ - ADDQ $1, R_SRC - - // if uint(s) > uint(len(src)) { etc } - CMPQ R_SRC, R_SEND - JA errCorrupt - - // length = src[s-1] + 8 - MOVBQZX -1(R_SRC), R_LEN - ADDL $8, R_LEN - JMP doCopyRepeat - -repeatLen2: - // s +=2 - ADDQ $2, R_SRC - - // if uint(s) > uint(len(src)) { etc } - CMPQ R_SRC, R_SEND - JA errCorrupt - - // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8) - MOVWQZX -2(R_SRC), R_LEN - ADDL $260, R_LEN - JMP doCopyRepeat - -repeatLen3: - // s +=3 - ADDQ $3, R_SRC - - // if uint(s) > uint(len(src)) { etc } - CMPQ R_SRC, R_SEND - JA errCorrupt - - // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16) - MOVBLZX -1(R_SRC), R_TMP0 - MOVWLZX -3(R_SRC), R_LEN - SHLL $16, R_TMP0 - ORL R_TMP0, R_LEN - ADDL $65540, R_LEN - JMP doCopyRepeat - -doCopy: - // This is the end of the outer "switch", when we have a copy tag. - // - // We assume that: - // - R_LEN == length && R_LEN > 0 - // - R_OFF == offset - - // if d < offset { etc } - MOVQ R_DST, R_TMP1 - SUBQ R_DBASE, R_TMP1 - CMPQ R_TMP1, R_OFF - JLT errCorrupt - - // Repeat values can skip the test above, since any offset > 0 will be in dst. -doCopyRepeat: - // if offset <= 0 { etc } - CMPQ R_OFF, $0 - JLE errCorrupt - - // if length > len(dst)-d { etc } - MOVQ R_DEND, R_TMP1 - SUBQ R_DST, R_TMP1 - CMPQ R_LEN, R_TMP1 - JGT errCorrupt - - // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length - // - // Set: - // - R_TMP2 = len(dst)-d - // - R_TMP3 = &dst[d-offset] - MOVQ R_DEND, R_TMP2 - SUBQ R_DST, R_TMP2 - MOVQ R_DST, R_TMP3 - SUBQ R_OFF, R_TMP3 - - // !!! Try a faster technique for short (16 or fewer bytes) forward copies. - // - // First, try using two 8-byte load/stores, similar to the doLit technique - // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is - // still OK if offset >= 8. Note that this has to be two 8-byte load/stores - // and not one 16-byte load/store, and the first store has to be before the - // second load, due to the overlap if offset is in the range [8, 16). - // - // if length > 16 || offset < 8 || len(dst)-d < 16 { - // goto slowForwardCopy - // } - // copy 16 bytes - // d += length - CMPQ R_LEN, $16 - JGT slowForwardCopy - CMPQ R_OFF, $8 - JLT slowForwardCopy - CMPQ R_TMP2, $16 - JLT slowForwardCopy - MOVQ 0(R_TMP3), R_TMP0 - MOVQ R_TMP0, 0(R_DST) - MOVQ 8(R_TMP3), R_TMP1 - MOVQ R_TMP1, 8(R_DST) - ADDQ R_LEN, R_DST - JMP loop - -slowForwardCopy: - // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we - // can still try 8-byte load stores, provided we can overrun up to 10 extra - // bytes. As above, the overrun will be fixed up by subsequent iterations - // of the outermost loop. - // - // The C++ snappy code calls this technique IncrementalCopyFastPath. Its - // commentary says: - // - // ---- - // - // The main part of this loop is a simple copy of eight bytes at a time - // until we've copied (at least) the requested amount of bytes. However, - // if d and d-offset are less than eight bytes apart (indicating a - // repeating pattern of length < 8), we first need to expand the pattern in - // order to get the correct results. For instance, if the buffer looks like - // this, with the eight-byte and patterns marked as - // intervals: - // - // abxxxxxxxxxxxx - // [------] d-offset - // [------] d - // - // a single eight-byte copy from to will repeat the pattern - // once, after which we can move two bytes without moving : - // - // ababxxxxxxxxxx - // [------] d-offset - // [------] d - // - // and repeat the exercise until the two no longer overlap. - // - // This allows us to do very well in the special case of one single byte - // repeated many times, without taking a big hit for more general cases. - // - // The worst case of extra writing past the end of the match occurs when - // offset == 1 and length == 1; the last copy will read from byte positions - // [0..7] and write to [4..11], whereas it was only supposed to write to - // position 1. Thus, ten excess bytes. - // - // ---- - // - // That "10 byte overrun" worst case is confirmed by Go's - // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy - // and finishSlowForwardCopy algorithm. - // - // if length > len(dst)-d-10 { - // goto verySlowForwardCopy - // } - SUBQ $10, R_TMP2 - CMPQ R_LEN, R_TMP2 - JGT verySlowForwardCopy - - // We want to keep the offset, so we use R_TMP2 from here. - MOVQ R_OFF, R_TMP2 - -makeOffsetAtLeast8: - // !!! As above, expand the pattern so that offset >= 8 and we can use - // 8-byte load/stores. - // - // for offset < 8 { - // copy 8 bytes from dst[d-offset:] to dst[d:] - // length -= offset - // d += offset - // offset += offset - // // The two previous lines together means that d-offset, and therefore - // // R_TMP3, is unchanged. - // } - CMPQ R_TMP2, $8 - JGE fixUpSlowForwardCopy - MOVQ (R_TMP3), R_TMP1 - MOVQ R_TMP1, (R_DST) - SUBQ R_TMP2, R_LEN - ADDQ R_TMP2, R_DST - ADDQ R_TMP2, R_TMP2 - JMP makeOffsetAtLeast8 - -fixUpSlowForwardCopy: - // !!! Add length (which might be negative now) to d (implied by R_DST being - // &dst[d]) so that d ends up at the right place when we jump back to the - // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if - // length is positive, copying the remaining length bytes will write to the - // right place. - MOVQ R_DST, R_TMP0 - ADDQ R_LEN, R_DST - -finishSlowForwardCopy: - // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative - // length means that we overrun, but as above, that will be fixed up by - // subsequent iterations of the outermost loop. - CMPQ R_LEN, $0 - JLE loop - MOVQ (R_TMP3), R_TMP1 - MOVQ R_TMP1, (R_TMP0) - ADDQ $8, R_TMP3 - ADDQ $8, R_TMP0 - SUBQ $8, R_LEN - JMP finishSlowForwardCopy - -verySlowForwardCopy: - // verySlowForwardCopy is a simple implementation of forward copy. In C - // parlance, this is a do/while loop instead of a while loop, since we know - // that length > 0. In Go syntax: - // - // for { - // dst[d] = dst[d - offset] - // d++ - // length-- - // if length == 0 { - // break - // } - // } - MOVB (R_TMP3), R_TMP1 - MOVB R_TMP1, (R_DST) - INCQ R_TMP3 - INCQ R_DST - DECQ R_LEN - JNZ verySlowForwardCopy - JMP loop - -// The code above handles copy tags. -// ---------------------------------------- - -end: - // This is the end of the "for s < len(src)". - // - // if d != len(dst) { etc } - CMPQ R_DST, R_DEND - JNE errCorrupt - - // return 0 - MOVQ $0, ret+48(FP) - RET - -errCorrupt: - // return decodeErrCodeCorrupt - MOVQ $1, ret+48(FP) - RET diff --git a/vendor/github.com/klauspost/compress/s2/decode_arm64.s b/vendor/github.com/klauspost/compress/s2/decode_arm64.s deleted file mode 100644 index 4b63d5086a9..00000000000 --- a/vendor/github.com/klauspost/compress/s2/decode_arm64.s +++ /dev/null @@ -1,574 +0,0 @@ -// Copyright 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine -// +build gc -// +build !noasm - -#include "textflag.h" - -#define R_TMP0 R2 -#define R_TMP1 R3 -#define R_LEN R4 -#define R_OFF R5 -#define R_SRC R6 -#define R_DST R7 -#define R_DBASE R8 -#define R_DLEN R9 -#define R_DEND R10 -#define R_SBASE R11 -#define R_SLEN R12 -#define R_SEND R13 -#define R_TMP2 R14 -#define R_TMP3 R15 - -// TEST_SRC will check if R_SRC is <= SRC_END -#define TEST_SRC() \ - CMP R_SEND, R_SRC \ - BGT errCorrupt - -// MOVD R_SRC, R_TMP1 -// SUB R_SBASE, R_TMP1, R_TMP1 -// CMP R_SLEN, R_TMP1 -// BGT errCorrupt - -// The asm code generally follows the pure Go code in decode_other.go, except -// where marked with a "!!!". - -// func decode(dst, src []byte) int -// -// All local variables fit into registers. The non-zero stack size is only to -// spill registers and push args when issuing a CALL. The register allocation: -// - R_TMP0 scratch -// - R_TMP1 scratch -// - R_LEN length or x -// - R_OFF offset -// - R_SRC &src[s] -// - R_DST &dst[d] -// + R_DBASE dst_base -// + R_DLEN dst_len -// + R_DEND dst_base + dst_len -// + R_SBASE src_base -// + R_SLEN src_len -// + R_SEND src_base + src_len -// - R_TMP2 used by doCopy -// - R_TMP3 used by doCopy -// -// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the -// function, and after a CALL returns, and are not otherwise modified. -// -// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST. -// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC. -TEXT ·s2Decode(SB), NOSPLIT, $56-64 - // Initialize R_SRC, R_DST and R_DBASE-R_SEND. - MOVD dst_base+0(FP), R_DBASE - MOVD dst_len+8(FP), R_DLEN - MOVD R_DBASE, R_DST - MOVD R_DBASE, R_DEND - ADD R_DLEN, R_DEND, R_DEND - MOVD src_base+24(FP), R_SBASE - MOVD src_len+32(FP), R_SLEN - MOVD R_SBASE, R_SRC - MOVD R_SBASE, R_SEND - ADD R_SLEN, R_SEND, R_SEND - MOVD $0, R_OFF - -loop: - // for s < len(src) - CMP R_SEND, R_SRC - BEQ end - - // R_LEN = uint32(src[s]) - // - // switch src[s] & 0x03 - MOVBU (R_SRC), R_LEN - MOVW R_LEN, R_TMP1 - ANDW $3, R_TMP1 - MOVW $1, R1 - CMPW R1, R_TMP1 - BGE tagCopy - - // ---------------------------------------- - // The code below handles literal tags. - - // case tagLiteral: - // x := uint32(src[s] >> 2) - // switch - MOVW $60, R1 - LSRW $2, R_LEN, R_LEN - CMPW R_LEN, R1 - BLS tagLit60Plus - - // case x < 60: - // s++ - ADD $1, R_SRC, R_SRC - -doLit: - // This is the end of the inner "switch", when we have a literal tag. - // - // We assume that R_LEN == x and x fits in a uint32, where x is the variable - // used in the pure Go decode_other.go code. - - // length = int(x) + 1 - // - // Unlike the pure Go code, we don't need to check if length <= 0 because - // R_LEN can hold 64 bits, so the increment cannot overflow. - ADD $1, R_LEN, R_LEN - - // Prepare to check if copying length bytes will run past the end of dst or - // src. - // - // R_TMP0 = len(dst) - d - // R_TMP1 = len(src) - s - MOVD R_DEND, R_TMP0 - SUB R_DST, R_TMP0, R_TMP0 - MOVD R_SEND, R_TMP1 - SUB R_SRC, R_TMP1, R_TMP1 - - // !!! Try a faster technique for short (16 or fewer bytes) copies. - // - // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { - // goto callMemmove // Fall back on calling runtime·memmove. - // } - // - // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s - // against 21 instead of 16, because it cannot assume that all of its input - // is contiguous in memory and so it needs to leave enough source bytes to - // read the next tag without refilling buffers, but Go's Decode assumes - // contiguousness (the src argument is a []byte). - CMP $16, R_LEN - BGT callMemmove - CMP $16, R_TMP0 - BLT callMemmove - CMP $16, R_TMP1 - BLT callMemmove - - // !!! Implement the copy from src to dst as a 16-byte load and store. - // (Decode's documentation says that dst and src must not overlap.) - // - // This always copies 16 bytes, instead of only length bytes, but that's - // OK. If the input is a valid Snappy encoding then subsequent iterations - // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a - // non-nil error), so the overrun will be ignored. - // - // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or - // 16-byte loads and stores. This technique probably wouldn't be as - // effective on architectures that are fussier about alignment. - LDP 0(R_SRC), (R_TMP2, R_TMP3) - STP (R_TMP2, R_TMP3), 0(R_DST) - - // d += length - // s += length - ADD R_LEN, R_DST, R_DST - ADD R_LEN, R_SRC, R_SRC - B loop - -callMemmove: - // if length > len(dst)-d || length > len(src)-s { etc } - CMP R_TMP0, R_LEN - BGT errCorrupt - CMP R_TMP1, R_LEN - BGT errCorrupt - - // copy(dst[d:], src[s:s+length]) - // - // This means calling runtime·memmove(&dst[d], &src[s], length), so we push - // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those - // three registers to the stack, to save local variables across the CALL. - MOVD R_DST, 8(RSP) - MOVD R_SRC, 16(RSP) - MOVD R_LEN, 24(RSP) - MOVD R_DST, 32(RSP) - MOVD R_SRC, 40(RSP) - MOVD R_LEN, 48(RSP) - MOVD R_OFF, 56(RSP) - CALL runtime·memmove(SB) - - // Restore local variables: unspill registers from the stack and - // re-calculate R_DBASE-R_SEND. - MOVD 32(RSP), R_DST - MOVD 40(RSP), R_SRC - MOVD 48(RSP), R_LEN - MOVD 56(RSP), R_OFF - MOVD dst_base+0(FP), R_DBASE - MOVD dst_len+8(FP), R_DLEN - MOVD R_DBASE, R_DEND - ADD R_DLEN, R_DEND, R_DEND - MOVD src_base+24(FP), R_SBASE - MOVD src_len+32(FP), R_SLEN - MOVD R_SBASE, R_SEND - ADD R_SLEN, R_SEND, R_SEND - - // d += length - // s += length - ADD R_LEN, R_DST, R_DST - ADD R_LEN, R_SRC, R_SRC - B loop - -tagLit60Plus: - // !!! This fragment does the - // - // s += x - 58; if uint(s) > uint(len(src)) { etc } - // - // checks. In the asm version, we code it once instead of once per switch case. - ADD R_LEN, R_SRC, R_SRC - SUB $58, R_SRC, R_SRC - TEST_SRC() - - // case x == 60: - MOVW $61, R1 - CMPW R1, R_LEN - BEQ tagLit61 - BGT tagLit62Plus - - // x = uint32(src[s-1]) - MOVBU -1(R_SRC), R_LEN - B doLit - -tagLit61: - // case x == 61: - // x = uint32(src[s-2]) | uint32(src[s-1])<<8 - MOVHU -2(R_SRC), R_LEN - B doLit - -tagLit62Plus: - CMPW $62, R_LEN - BHI tagLit63 - - // case x == 62: - // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 - MOVHU -3(R_SRC), R_LEN - MOVBU -1(R_SRC), R_TMP1 - ORR R_TMP1<<16, R_LEN - B doLit - -tagLit63: - // case x == 63: - // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 - MOVWU -4(R_SRC), R_LEN - B doLit - - // The code above handles literal tags. - // ---------------------------------------- - // The code below handles copy tags. - -tagCopy4: - // case tagCopy4: - // s += 5 - ADD $5, R_SRC, R_SRC - - // if uint(s) > uint(len(src)) { etc } - MOVD R_SRC, R_TMP1 - SUB R_SBASE, R_TMP1, R_TMP1 - CMP R_SLEN, R_TMP1 - BGT errCorrupt - - // length = 1 + int(src[s-5])>>2 - MOVD $1, R1 - ADD R_LEN>>2, R1, R_LEN - - // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) - MOVWU -4(R_SRC), R_OFF - B doCopy - -tagCopy2: - // case tagCopy2: - // s += 3 - ADD $3, R_SRC, R_SRC - - // if uint(s) > uint(len(src)) { etc } - TEST_SRC() - - // length = 1 + int(src[s-3])>>2 - MOVD $1, R1 - ADD R_LEN>>2, R1, R_LEN - - // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) - MOVHU -2(R_SRC), R_OFF - B doCopy - -tagCopy: - // We have a copy tag. We assume that: - // - R_TMP1 == src[s] & 0x03 - // - R_LEN == src[s] - CMP $2, R_TMP1 - BEQ tagCopy2 - BGT tagCopy4 - - // case tagCopy1: - // s += 2 - ADD $2, R_SRC, R_SRC - - // if uint(s) > uint(len(src)) { etc } - TEST_SRC() - - // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) - // Calculate offset in R_TMP0 in case it is a repeat. - MOVD R_LEN, R_TMP0 - AND $0xe0, R_TMP0 - MOVBU -1(R_SRC), R_TMP1 - ORR R_TMP0<<3, R_TMP1, R_TMP0 - - // length = 4 + int(src[s-2])>>2&0x7 - MOVD $7, R1 - AND R_LEN>>2, R1, R_LEN - ADD $4, R_LEN, R_LEN - - // check if repeat code with offset 0. - CMP $0, R_TMP0 - BEQ repeatCode - - // This is a regular copy, transfer our temporary value to R_OFF (offset) - MOVD R_TMP0, R_OFF - B doCopy - - // This is a repeat code. -repeatCode: - // If length < 9, reuse last offset, with the length already calculated. - CMP $9, R_LEN - BLT doCopyRepeat - BEQ repeatLen1 - CMP $10, R_LEN - BEQ repeatLen2 - -repeatLen3: - // s +=3 - ADD $3, R_SRC, R_SRC - - // if uint(s) > uint(len(src)) { etc } - TEST_SRC() - - // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540 - MOVBU -1(R_SRC), R_TMP0 - MOVHU -3(R_SRC), R_LEN - ORR R_TMP0<<16, R_LEN, R_LEN - ADD $65540, R_LEN, R_LEN - B doCopyRepeat - -repeatLen2: - // s +=2 - ADD $2, R_SRC, R_SRC - - // if uint(s) > uint(len(src)) { etc } - TEST_SRC() - - // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260 - MOVHU -2(R_SRC), R_LEN - ADD $260, R_LEN, R_LEN - B doCopyRepeat - -repeatLen1: - // s +=1 - ADD $1, R_SRC, R_SRC - - // if uint(s) > uint(len(src)) { etc } - TEST_SRC() - - // length = src[s-1] + 8 - MOVBU -1(R_SRC), R_LEN - ADD $8, R_LEN, R_LEN - B doCopyRepeat - -doCopy: - // This is the end of the outer "switch", when we have a copy tag. - // - // We assume that: - // - R_LEN == length && R_LEN > 0 - // - R_OFF == offset - - // if d < offset { etc } - MOVD R_DST, R_TMP1 - SUB R_DBASE, R_TMP1, R_TMP1 - CMP R_OFF, R_TMP1 - BLT errCorrupt - - // Repeat values can skip the test above, since any offset > 0 will be in dst. -doCopyRepeat: - - // if offset <= 0 { etc } - CMP $0, R_OFF - BLE errCorrupt - - // if length > len(dst)-d { etc } - MOVD R_DEND, R_TMP1 - SUB R_DST, R_TMP1, R_TMP1 - CMP R_TMP1, R_LEN - BGT errCorrupt - - // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length - // - // Set: - // - R_TMP2 = len(dst)-d - // - R_TMP3 = &dst[d-offset] - MOVD R_DEND, R_TMP2 - SUB R_DST, R_TMP2, R_TMP2 - MOVD R_DST, R_TMP3 - SUB R_OFF, R_TMP3, R_TMP3 - - // !!! Try a faster technique for short (16 or fewer bytes) forward copies. - // - // First, try using two 8-byte load/stores, similar to the doLit technique - // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is - // still OK if offset >= 8. Note that this has to be two 8-byte load/stores - // and not one 16-byte load/store, and the first store has to be before the - // second load, due to the overlap if offset is in the range [8, 16). - // - // if length > 16 || offset < 8 || len(dst)-d < 16 { - // goto slowForwardCopy - // } - // copy 16 bytes - // d += length - CMP $16, R_LEN - BGT slowForwardCopy - CMP $8, R_OFF - BLT slowForwardCopy - CMP $16, R_TMP2 - BLT slowForwardCopy - MOVD 0(R_TMP3), R_TMP0 - MOVD R_TMP0, 0(R_DST) - MOVD 8(R_TMP3), R_TMP1 - MOVD R_TMP1, 8(R_DST) - ADD R_LEN, R_DST, R_DST - B loop - -slowForwardCopy: - // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we - // can still try 8-byte load stores, provided we can overrun up to 10 extra - // bytes. As above, the overrun will be fixed up by subsequent iterations - // of the outermost loop. - // - // The C++ snappy code calls this technique IncrementalCopyFastPath. Its - // commentary says: - // - // ---- - // - // The main part of this loop is a simple copy of eight bytes at a time - // until we've copied (at least) the requested amount of bytes. However, - // if d and d-offset are less than eight bytes apart (indicating a - // repeating pattern of length < 8), we first need to expand the pattern in - // order to get the correct results. For instance, if the buffer looks like - // this, with the eight-byte and patterns marked as - // intervals: - // - // abxxxxxxxxxxxx - // [------] d-offset - // [------] d - // - // a single eight-byte copy from to will repeat the pattern - // once, after which we can move two bytes without moving : - // - // ababxxxxxxxxxx - // [------] d-offset - // [------] d - // - // and repeat the exercise until the two no longer overlap. - // - // This allows us to do very well in the special case of one single byte - // repeated many times, without taking a big hit for more general cases. - // - // The worst case of extra writing past the end of the match occurs when - // offset == 1 and length == 1; the last copy will read from byte positions - // [0..7] and write to [4..11], whereas it was only supposed to write to - // position 1. Thus, ten excess bytes. - // - // ---- - // - // That "10 byte overrun" worst case is confirmed by Go's - // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy - // and finishSlowForwardCopy algorithm. - // - // if length > len(dst)-d-10 { - // goto verySlowForwardCopy - // } - SUB $10, R_TMP2, R_TMP2 - CMP R_TMP2, R_LEN - BGT verySlowForwardCopy - - // We want to keep the offset, so we use R_TMP2 from here. - MOVD R_OFF, R_TMP2 - -makeOffsetAtLeast8: - // !!! As above, expand the pattern so that offset >= 8 and we can use - // 8-byte load/stores. - // - // for offset < 8 { - // copy 8 bytes from dst[d-offset:] to dst[d:] - // length -= offset - // d += offset - // offset += offset - // // The two previous lines together means that d-offset, and therefore - // // R_TMP3, is unchanged. - // } - CMP $8, R_TMP2 - BGE fixUpSlowForwardCopy - MOVD (R_TMP3), R_TMP1 - MOVD R_TMP1, (R_DST) - SUB R_TMP2, R_LEN, R_LEN - ADD R_TMP2, R_DST, R_DST - ADD R_TMP2, R_TMP2, R_TMP2 - B makeOffsetAtLeast8 - -fixUpSlowForwardCopy: - // !!! Add length (which might be negative now) to d (implied by R_DST being - // &dst[d]) so that d ends up at the right place when we jump back to the - // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if - // length is positive, copying the remaining length bytes will write to the - // right place. - MOVD R_DST, R_TMP0 - ADD R_LEN, R_DST, R_DST - -finishSlowForwardCopy: - // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative - // length means that we overrun, but as above, that will be fixed up by - // subsequent iterations of the outermost loop. - MOVD $0, R1 - CMP R1, R_LEN - BLE loop - MOVD (R_TMP3), R_TMP1 - MOVD R_TMP1, (R_TMP0) - ADD $8, R_TMP3, R_TMP3 - ADD $8, R_TMP0, R_TMP0 - SUB $8, R_LEN, R_LEN - B finishSlowForwardCopy - -verySlowForwardCopy: - // verySlowForwardCopy is a simple implementation of forward copy. In C - // parlance, this is a do/while loop instead of a while loop, since we know - // that length > 0. In Go syntax: - // - // for { - // dst[d] = dst[d - offset] - // d++ - // length-- - // if length == 0 { - // break - // } - // } - MOVB (R_TMP3), R_TMP1 - MOVB R_TMP1, (R_DST) - ADD $1, R_TMP3, R_TMP3 - ADD $1, R_DST, R_DST - SUB $1, R_LEN, R_LEN - CBNZ R_LEN, verySlowForwardCopy - B loop - - // The code above handles copy tags. - // ---------------------------------------- - -end: - // This is the end of the "for s < len(src)". - // - // if d != len(dst) { etc } - CMP R_DEND, R_DST - BNE errCorrupt - - // return 0 - MOVD $0, ret+48(FP) - RET - -errCorrupt: - // return decodeErrCodeCorrupt - MOVD $1, R_TMP0 - MOVD R_TMP0, ret+48(FP) - RET diff --git a/vendor/github.com/klauspost/compress/s2/decode_asm.go b/vendor/github.com/klauspost/compress/s2/decode_asm.go deleted file mode 100644 index cb3576edd47..00000000000 --- a/vendor/github.com/klauspost/compress/s2/decode_asm.go +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2016 The Snappy-Go Authors. All rights reserved. -// Copyright (c) 2019 Klaus Post. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build (amd64 || arm64) && !appengine && gc && !noasm -// +build amd64 arm64 -// +build !appengine -// +build gc -// +build !noasm - -package s2 - -// decode has the same semantics as in decode_other.go. -// -//go:noescape -func s2Decode(dst, src []byte) int diff --git a/vendor/github.com/klauspost/compress/s2/decode_other.go b/vendor/github.com/klauspost/compress/s2/decode_other.go deleted file mode 100644 index 1074ebd215e..00000000000 --- a/vendor/github.com/klauspost/compress/s2/decode_other.go +++ /dev/null @@ -1,267 +0,0 @@ -// Copyright 2016 The Snappy-Go Authors. All rights reserved. -// Copyright (c) 2019 Klaus Post. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build (!amd64 && !arm64) || appengine || !gc || noasm -// +build !amd64,!arm64 appengine !gc noasm - -package s2 - -import ( - "fmt" - "strconv" -) - -// decode writes the decoding of src to dst. It assumes that the varint-encoded -// length of the decompressed bytes has already been read, and that len(dst) -// equals that length. -// -// It returns 0 on success or a decodeErrCodeXxx error code on failure. -func s2Decode(dst, src []byte) int { - const debug = false - if debug { - fmt.Println("Starting decode, dst len:", len(dst)) - } - var d, s, length int - offset := 0 - - // As long as we can read at least 5 bytes... - for s < len(src)-5 { - switch src[s] & 0x03 { - case tagLiteral: - x := uint32(src[s] >> 2) - switch { - case x < 60: - s++ - case x == 60: - s += 2 - x = uint32(src[s-1]) - case x == 61: - s += 3 - x = uint32(src[s-2]) | uint32(src[s-1])<<8 - case x == 62: - s += 4 - x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 - case x == 63: - s += 5 - x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 - } - length = int(x) + 1 - if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) { - return decodeErrCodeCorrupt - } - if debug { - fmt.Println("literals, length:", length, "d-after:", d+length) - } - - copy(dst[d:], src[s:s+length]) - d += length - s += length - continue - - case tagCopy1: - s += 2 - length = int(src[s-2]) >> 2 & 0x7 - toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) - if toffset == 0 { - if debug { - fmt.Print("(repeat) ") - } - // keep last offset - switch length { - case 5: - s += 1 - length = int(uint32(src[s-1])) + 4 - case 6: - s += 2 - length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8) - case 7: - s += 3 - length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16) - default: // 0-> 4 - } - } else { - offset = toffset - } - length += 4 - case tagCopy2: - s += 3 - length = 1 + int(src[s-3])>>2 - offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) - - case tagCopy4: - s += 5 - length = 1 + int(src[s-5])>>2 - offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) - } - - if offset <= 0 || d < offset || length > len(dst)-d { - return decodeErrCodeCorrupt - } - - if debug { - fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length) - } - - // Copy from an earlier sub-slice of dst to a later sub-slice. - // If no overlap, use the built-in copy: - if offset > length { - copy(dst[d:d+length], dst[d-offset:]) - d += length - continue - } - - // Unlike the built-in copy function, this byte-by-byte copy always runs - // forwards, even if the slices overlap. Conceptually, this is: - // - // d += forwardCopy(dst[d:d+length], dst[d-offset:]) - // - // We align the slices into a and b and show the compiler they are the same size. - // This allows the loop to run without bounds checks. - a := dst[d : d+length] - b := dst[d-offset:] - b = b[:len(a)] - for i := range a { - a[i] = b[i] - } - d += length - } - - // Remaining with extra checks... - for s < len(src) { - switch src[s] & 0x03 { - case tagLiteral: - x := uint32(src[s] >> 2) - switch { - case x < 60: - s++ - case x == 60: - s += 2 - if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. - return decodeErrCodeCorrupt - } - x = uint32(src[s-1]) - case x == 61: - s += 3 - if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. - return decodeErrCodeCorrupt - } - x = uint32(src[s-2]) | uint32(src[s-1])<<8 - case x == 62: - s += 4 - if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. - return decodeErrCodeCorrupt - } - x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 - case x == 63: - s += 5 - if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. - return decodeErrCodeCorrupt - } - x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 - } - length = int(x) + 1 - if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) { - return decodeErrCodeCorrupt - } - if debug { - fmt.Println("literals, length:", length, "d-after:", d+length) - } - - copy(dst[d:], src[s:s+length]) - d += length - s += length - continue - - case tagCopy1: - s += 2 - if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. - return decodeErrCodeCorrupt - } - length = int(src[s-2]) >> 2 & 0x7 - toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) - if toffset == 0 { - if debug { - fmt.Print("(repeat) ") - } - // keep last offset - switch length { - case 5: - s += 1 - if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. - return decodeErrCodeCorrupt - } - length = int(uint32(src[s-1])) + 4 - case 6: - s += 2 - if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. - return decodeErrCodeCorrupt - } - length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8) - case 7: - s += 3 - if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. - return decodeErrCodeCorrupt - } - length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16) - default: // 0-> 4 - } - } else { - offset = toffset - } - length += 4 - case tagCopy2: - s += 3 - if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. - return decodeErrCodeCorrupt - } - length = 1 + int(src[s-3])>>2 - offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) - - case tagCopy4: - s += 5 - if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. - return decodeErrCodeCorrupt - } - length = 1 + int(src[s-5])>>2 - offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) - } - - if offset <= 0 || d < offset || length > len(dst)-d { - return decodeErrCodeCorrupt - } - - if debug { - fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length) - } - - // Copy from an earlier sub-slice of dst to a later sub-slice. - // If no overlap, use the built-in copy: - if offset > length { - copy(dst[d:d+length], dst[d-offset:]) - d += length - continue - } - - // Unlike the built-in copy function, this byte-by-byte copy always runs - // forwards, even if the slices overlap. Conceptually, this is: - // - // d += forwardCopy(dst[d:d+length], dst[d-offset:]) - // - // We align the slices into a and b and show the compiler they are the same size. - // This allows the loop to run without bounds checks. - a := dst[d : d+length] - b := dst[d-offset:] - b = b[:len(a)] - for i := range a { - a[i] = b[i] - } - d += length - } - - if d != len(dst) { - return decodeErrCodeCorrupt - } - return 0 -} diff --git a/vendor/github.com/klauspost/compress/s2/encode.go b/vendor/github.com/klauspost/compress/s2/encode.go deleted file mode 100644 index aa8b108d011..00000000000 --- a/vendor/github.com/klauspost/compress/s2/encode.go +++ /dev/null @@ -1,1172 +0,0 @@ -// Copyright 2011 The Snappy-Go Authors. All rights reserved. -// Copyright (c) 2019 Klaus Post. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package s2 - -import ( - "crypto/rand" - "encoding/binary" - "errors" - "fmt" - "io" - "math" - "math/bits" - "runtime" - "sync" -) - -// Encode returns the encoded form of src. The returned slice may be a sub- -// slice of dst if dst was large enough to hold the entire encoded block. -// Otherwise, a newly allocated slice will be returned. -// -// The dst and src must not overlap. It is valid to pass a nil dst. -// -// The blocks will require the same amount of memory to decode as encoding, -// and does not make for concurrent decoding. -// Also note that blocks do not contain CRC information, so corruption may be undetected. -// -// If you need to encode larger amounts of data, consider using -// the streaming interface which gives all of these features. -func Encode(dst, src []byte) []byte { - if n := MaxEncodedLen(len(src)); n < 0 { - panic(ErrTooLarge) - } else if cap(dst) < n { - dst = make([]byte, n) - } else { - dst = dst[:n] - } - - // The block starts with the varint-encoded length of the decompressed bytes. - d := binary.PutUvarint(dst, uint64(len(src))) - - if len(src) == 0 { - return dst[:d] - } - if len(src) < minNonLiteralBlockSize { - d += emitLiteral(dst[d:], src) - return dst[:d] - } - n := encodeBlock(dst[d:], src) - if n > 0 { - d += n - return dst[:d] - } - // Not compressible - d += emitLiteral(dst[d:], src) - return dst[:d] -} - -// EncodeBetter returns the encoded form of src. The returned slice may be a sub- -// slice of dst if dst was large enough to hold the entire encoded block. -// Otherwise, a newly allocated slice will be returned. -// -// EncodeBetter compresses better than Encode but typically with a -// 10-40% speed decrease on both compression and decompression. -// -// The dst and src must not overlap. It is valid to pass a nil dst. -// -// The blocks will require the same amount of memory to decode as encoding, -// and does not make for concurrent decoding. -// Also note that blocks do not contain CRC information, so corruption may be undetected. -// -// If you need to encode larger amounts of data, consider using -// the streaming interface which gives all of these features. -func EncodeBetter(dst, src []byte) []byte { - if n := MaxEncodedLen(len(src)); n < 0 { - panic(ErrTooLarge) - } else if len(dst) < n { - dst = make([]byte, n) - } - - // The block starts with the varint-encoded length of the decompressed bytes. - d := binary.PutUvarint(dst, uint64(len(src))) - - if len(src) == 0 { - return dst[:d] - } - if len(src) < minNonLiteralBlockSize { - d += emitLiteral(dst[d:], src) - return dst[:d] - } - n := encodeBlockBetter(dst[d:], src) - if n > 0 { - d += n - return dst[:d] - } - // Not compressible - d += emitLiteral(dst[d:], src) - return dst[:d] -} - -// EncodeBest returns the encoded form of src. The returned slice may be a sub- -// slice of dst if dst was large enough to hold the entire encoded block. -// Otherwise, a newly allocated slice will be returned. -// -// EncodeBest compresses as good as reasonably possible but with a -// big speed decrease. -// -// The dst and src must not overlap. It is valid to pass a nil dst. -// -// The blocks will require the same amount of memory to decode as encoding, -// and does not make for concurrent decoding. -// Also note that blocks do not contain CRC information, so corruption may be undetected. -// -// If you need to encode larger amounts of data, consider using -// the streaming interface which gives all of these features. -func EncodeBest(dst, src []byte) []byte { - if n := MaxEncodedLen(len(src)); n < 0 { - panic(ErrTooLarge) - } else if len(dst) < n { - dst = make([]byte, n) - } - - // The block starts with the varint-encoded length of the decompressed bytes. - d := binary.PutUvarint(dst, uint64(len(src))) - - if len(src) == 0 { - return dst[:d] - } - if len(src) < minNonLiteralBlockSize { - d += emitLiteral(dst[d:], src) - return dst[:d] - } - n := encodeBlockBest(dst[d:], src) - if n > 0 { - d += n - return dst[:d] - } - // Not compressible - d += emitLiteral(dst[d:], src) - return dst[:d] -} - -// EncodeSnappy returns the encoded form of src. The returned slice may be a sub- -// slice of dst if dst was large enough to hold the entire encoded block. -// Otherwise, a newly allocated slice will be returned. -// -// The output is Snappy compatible and will likely decompress faster. -// -// The dst and src must not overlap. It is valid to pass a nil dst. -// -// The blocks will require the same amount of memory to decode as encoding, -// and does not make for concurrent decoding. -// Also note that blocks do not contain CRC information, so corruption may be undetected. -// -// If you need to encode larger amounts of data, consider using -// the streaming interface which gives all of these features. -func EncodeSnappy(dst, src []byte) []byte { - if n := MaxEncodedLen(len(src)); n < 0 { - panic(ErrTooLarge) - } else if cap(dst) < n { - dst = make([]byte, n) - } else { - dst = dst[:n] - } - - // The block starts with the varint-encoded length of the decompressed bytes. - d := binary.PutUvarint(dst, uint64(len(src))) - - if len(src) == 0 { - return dst[:d] - } - if len(src) < minNonLiteralBlockSize { - d += emitLiteral(dst[d:], src) - return dst[:d] - } - - n := encodeBlockSnappy(dst[d:], src) - if n > 0 { - d += n - return dst[:d] - } - // Not compressible - d += emitLiteral(dst[d:], src) - return dst[:d] -} - -// EncodeSnappyBetter returns the encoded form of src. The returned slice may be a sub- -// slice of dst if dst was large enough to hold the entire encoded block. -// Otherwise, a newly allocated slice will be returned. -// -// The output is Snappy compatible and will likely decompress faster. -// -// The dst and src must not overlap. It is valid to pass a nil dst. -// -// The blocks will require the same amount of memory to decode as encoding, -// and does not make for concurrent decoding. -// Also note that blocks do not contain CRC information, so corruption may be undetected. -// -// If you need to encode larger amounts of data, consider using -// the streaming interface which gives all of these features. -func EncodeSnappyBetter(dst, src []byte) []byte { - if n := MaxEncodedLen(len(src)); n < 0 { - panic(ErrTooLarge) - } else if cap(dst) < n { - dst = make([]byte, n) - } else { - dst = dst[:n] - } - - // The block starts with the varint-encoded length of the decompressed bytes. - d := binary.PutUvarint(dst, uint64(len(src))) - - if len(src) == 0 { - return dst[:d] - } - if len(src) < minNonLiteralBlockSize { - d += emitLiteral(dst[d:], src) - return dst[:d] - } - - n := encodeBlockBetterSnappy(dst[d:], src) - if n > 0 { - d += n - return dst[:d] - } - // Not compressible - d += emitLiteral(dst[d:], src) - return dst[:d] -} - -// EncodeSnappyBest returns the encoded form of src. The returned slice may be a sub- -// slice of dst if dst was large enough to hold the entire encoded block. -// Otherwise, a newly allocated slice will be returned. -// -// The output is Snappy compatible and will likely decompress faster. -// -// The dst and src must not overlap. It is valid to pass a nil dst. -// -// The blocks will require the same amount of memory to decode as encoding, -// and does not make for concurrent decoding. -// Also note that blocks do not contain CRC information, so corruption may be undetected. -// -// If you need to encode larger amounts of data, consider using -// the streaming interface which gives all of these features. -func EncodeSnappyBest(dst, src []byte) []byte { - if n := MaxEncodedLen(len(src)); n < 0 { - panic(ErrTooLarge) - } else if cap(dst) < n { - dst = make([]byte, n) - } else { - dst = dst[:n] - } - - // The block starts with the varint-encoded length of the decompressed bytes. - d := binary.PutUvarint(dst, uint64(len(src))) - - if len(src) == 0 { - return dst[:d] - } - if len(src) < minNonLiteralBlockSize { - d += emitLiteral(dst[d:], src) - return dst[:d] - } - - n := encodeBlockBestSnappy(dst[d:], src) - if n > 0 { - d += n - return dst[:d] - } - // Not compressible - d += emitLiteral(dst[d:], src) - return dst[:d] -} - -// ConcatBlocks will concatenate the supplied blocks and append them to the supplied destination. -// If the destination is nil or too small, a new will be allocated. -// The blocks are not validated, so garbage in = garbage out. -// dst may not overlap block data. -// Any data in dst is preserved as is, so it will not be considered a block. -func ConcatBlocks(dst []byte, blocks ...[]byte) ([]byte, error) { - totalSize := uint64(0) - compSize := 0 - for _, b := range blocks { - l, hdr, err := decodedLen(b) - if err != nil { - return nil, err - } - totalSize += uint64(l) - compSize += len(b) - hdr - } - if totalSize == 0 { - dst = append(dst, 0) - return dst, nil - } - if totalSize > math.MaxUint32 { - return nil, ErrTooLarge - } - var tmp [binary.MaxVarintLen32]byte - hdrSize := binary.PutUvarint(tmp[:], totalSize) - wantSize := hdrSize + compSize - - if cap(dst)-len(dst) < wantSize { - dst = append(make([]byte, 0, wantSize+len(dst)), dst...) - } - dst = append(dst, tmp[:hdrSize]...) - for _, b := range blocks { - _, hdr, err := decodedLen(b) - if err != nil { - return nil, err - } - dst = append(dst, b[hdr:]...) - } - return dst, nil -} - -// inputMargin is the minimum number of extra input bytes to keep, inside -// encodeBlock's inner loop. On some architectures, this margin lets us -// implement a fast path for emitLiteral, where the copy of short (<= 16 byte) -// literals can be implemented as a single load to and store from a 16-byte -// register. That literal's actual length can be as short as 1 byte, so this -// can copy up to 15 bytes too much, but that's OK as subsequent iterations of -// the encoding loop will fix up the copy overrun, and this inputMargin ensures -// that we don't overrun the dst and src buffers. -const inputMargin = 8 - -// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that -// will be accepted by the encoder. -const minNonLiteralBlockSize = 32 - -// MaxBlockSize is the maximum value where MaxEncodedLen will return a valid block size. -// Blocks this big are highly discouraged, though. -const MaxBlockSize = math.MaxUint32 - binary.MaxVarintLen32 - 5 - -// MaxEncodedLen returns the maximum length of a snappy block, given its -// uncompressed length. -// -// It will return a negative value if srcLen is too large to encode. -// 32 bit platforms will have lower thresholds for rejecting big content. -func MaxEncodedLen(srcLen int) int { - n := uint64(srcLen) - if n > 0xffffffff { - // Also includes negative. - return -1 - } - // Size of the varint encoded block size. - n = n + uint64((bits.Len64(n)+7)/7) - - // Add maximum size of encoding block as literals. - n += uint64(literalExtraSize(int64(srcLen))) - if n > 0xffffffff { - return -1 - } - return int(n) -} - -var errClosed = errors.New("s2: Writer is closed") - -// NewWriter returns a new Writer that compresses to w, using the -// framing format described at -// https://github.com/google/snappy/blob/master/framing_format.txt -// -// Users must call Close to guarantee all data has been forwarded to -// the underlying io.Writer and that resources are released. -// They may also call Flush zero or more times before calling Close. -func NewWriter(w io.Writer, opts ...WriterOption) *Writer { - w2 := Writer{ - blockSize: defaultBlockSize, - concurrency: runtime.GOMAXPROCS(0), - randSrc: rand.Reader, - level: levelFast, - } - for _, opt := range opts { - if err := opt(&w2); err != nil { - w2.errState = err - return &w2 - } - } - w2.obufLen = obufHeaderLen + MaxEncodedLen(w2.blockSize) - w2.paramsOK = true - w2.ibuf = make([]byte, 0, w2.blockSize) - w2.buffers.New = func() interface{} { - return make([]byte, w2.obufLen) - } - w2.Reset(w) - return &w2 -} - -// Writer is an io.Writer that can write Snappy-compressed bytes. -type Writer struct { - errMu sync.Mutex - errState error - - // ibuf is a buffer for the incoming (uncompressed) bytes. - ibuf []byte - - blockSize int - obufLen int - concurrency int - written int64 - output chan chan result - buffers sync.Pool - pad int - - writer io.Writer - randSrc io.Reader - writerWg sync.WaitGroup - - // wroteStreamHeader is whether we have written the stream header. - wroteStreamHeader bool - paramsOK bool - snappy bool - flushOnWrite bool - level uint8 -} - -const ( - levelUncompressed = iota + 1 - levelFast - levelBetter - levelBest -) - -type result []byte - -// err returns the previously set error. -// If no error has been set it is set to err if not nil. -func (w *Writer) err(err error) error { - w.errMu.Lock() - errSet := w.errState - if errSet == nil && err != nil { - w.errState = err - errSet = err - } - w.errMu.Unlock() - return errSet -} - -// Reset discards the writer's state and switches the Snappy writer to write to w. -// This permits reusing a Writer rather than allocating a new one. -func (w *Writer) Reset(writer io.Writer) { - if !w.paramsOK { - return - } - // Close previous writer, if any. - if w.output != nil { - close(w.output) - w.writerWg.Wait() - w.output = nil - } - w.errState = nil - w.ibuf = w.ibuf[:0] - w.wroteStreamHeader = false - w.written = 0 - w.writer = writer - // If we didn't get a writer, stop here. - if writer == nil { - return - } - // If no concurrency requested, don't spin up writer goroutine. - if w.concurrency == 1 { - return - } - - toWrite := make(chan chan result, w.concurrency) - w.output = toWrite - w.writerWg.Add(1) - - // Start a writer goroutine that will write all output in order. - go func() { - defer w.writerWg.Done() - - // Get a queued write. - for write := range toWrite { - // Wait for the data to be available. - in := <-write - if len(in) > 0 { - if w.err(nil) == nil { - // Don't expose data from previous buffers. - toWrite := in[:len(in):len(in)] - // Write to output. - n, err := writer.Write(toWrite) - if err == nil && n != len(toWrite) { - err = io.ErrShortBuffer - } - _ = w.err(err) - w.written += int64(n) - } - } - if cap(in) >= w.obufLen { - w.buffers.Put([]byte(in)) - } - // close the incoming write request. - // This can be used for synchronizing flushes. - close(write) - } - }() -} - -// Write satisfies the io.Writer interface. -func (w *Writer) Write(p []byte) (nRet int, errRet error) { - if w.flushOnWrite { - return w.write(p) - } - // If we exceed the input buffer size, start writing - for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err(nil) == nil { - var n int - if len(w.ibuf) == 0 { - // Large write, empty buffer. - // Write directly from p to avoid copy. - n, _ = w.write(p) - } else { - n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p) - w.ibuf = w.ibuf[:len(w.ibuf)+n] - w.write(w.ibuf) - w.ibuf = w.ibuf[:0] - } - nRet += n - p = p[n:] - } - if err := w.err(nil); err != nil { - return nRet, err - } - // p should always be able to fit into w.ibuf now. - n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p) - w.ibuf = w.ibuf[:len(w.ibuf)+n] - nRet += n - return nRet, nil -} - -// ReadFrom implements the io.ReaderFrom interface. -// Using this is typically more efficient since it avoids a memory copy. -// ReadFrom reads data from r until EOF or error. -// The return value n is the number of bytes read. -// Any error except io.EOF encountered during the read is also returned. -func (w *Writer) ReadFrom(r io.Reader) (n int64, err error) { - if len(w.ibuf) > 0 { - err := w.Flush() - if err != nil { - return 0, err - } - } - if br, ok := r.(byter); ok { - buf := br.Bytes() - if err := w.EncodeBuffer(buf); err != nil { - return 0, err - } - return int64(len(buf)), w.Flush() - } - for { - inbuf := w.buffers.Get().([]byte)[:w.blockSize+obufHeaderLen] - n2, err := io.ReadFull(r, inbuf[obufHeaderLen:]) - if err != nil { - if err == io.ErrUnexpectedEOF { - err = io.EOF - } - if err != io.EOF { - return n, w.err(err) - } - } - if n2 == 0 { - break - } - n += int64(n2) - err2 := w.writeFull(inbuf[:n2+obufHeaderLen]) - if w.err(err2) != nil { - break - } - - if err != nil { - // We got EOF and wrote everything - break - } - } - - return n, w.err(nil) -} - -// EncodeBuffer will add a buffer to the stream. -// This is the fastest way to encode a stream, -// but the input buffer cannot be written to by the caller -// until Flush or Close has been called when concurrency != 1. -// -// If you cannot control that, use the regular Write function. -// -// Note that input is not buffered. -// This means that each write will result in discrete blocks being created. -// For buffered writes, use the regular Write function. -func (w *Writer) EncodeBuffer(buf []byte) (err error) { - if err := w.err(nil); err != nil { - return err - } - - if w.flushOnWrite { - _, err := w.write(buf) - return err - } - // Flush queued data first. - if len(w.ibuf) > 0 { - err := w.Flush() - if err != nil { - return err - } - } - if w.concurrency == 1 { - _, err := w.writeSync(buf) - return err - } - - // Spawn goroutine and write block to output channel. - if !w.wroteStreamHeader { - w.wroteStreamHeader = true - hWriter := make(chan result) - w.output <- hWriter - if w.snappy { - hWriter <- []byte(magicChunkSnappy) - } else { - hWriter <- []byte(magicChunk) - } - } - - for len(buf) > 0 { - // Cut input. - uncompressed := buf - if len(uncompressed) > w.blockSize { - uncompressed = uncompressed[:w.blockSize] - } - buf = buf[len(uncompressed):] - // Get an output buffer. - obuf := w.buffers.Get().([]byte)[:len(uncompressed)+obufHeaderLen] - output := make(chan result) - // Queue output now, so we keep order. - w.output <- output - go func() { - checksum := crc(uncompressed) - - // Set to uncompressed. - chunkType := uint8(chunkTypeUncompressedData) - chunkLen := 4 + len(uncompressed) - - // Attempt compressing. - n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) - n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) - - // Check if we should use this, or store as uncompressed instead. - if n2 > 0 { - chunkType = uint8(chunkTypeCompressedData) - chunkLen = 4 + n + n2 - obuf = obuf[:obufHeaderLen+n+n2] - } else { - // copy uncompressed - copy(obuf[obufHeaderLen:], uncompressed) - } - - // Fill in the per-chunk header that comes before the body. - obuf[0] = chunkType - obuf[1] = uint8(chunkLen >> 0) - obuf[2] = uint8(chunkLen >> 8) - obuf[3] = uint8(chunkLen >> 16) - obuf[4] = uint8(checksum >> 0) - obuf[5] = uint8(checksum >> 8) - obuf[6] = uint8(checksum >> 16) - obuf[7] = uint8(checksum >> 24) - - // Queue final output. - output <- obuf - }() - } - return nil -} - -func (w *Writer) encodeBlock(obuf, uncompressed []byte) int { - if w.snappy { - switch w.level { - case levelFast: - return encodeBlockSnappy(obuf, uncompressed) - case levelBetter: - return encodeBlockBetterSnappy(obuf, uncompressed) - case levelBest: - return encodeBlockBestSnappy(obuf, uncompressed) - } - return 0 - } - switch w.level { - case levelFast: - return encodeBlock(obuf, uncompressed) - case levelBetter: - return encodeBlockBetter(obuf, uncompressed) - case levelBest: - return encodeBlockBest(obuf, uncompressed) - } - return 0 -} - -func (w *Writer) write(p []byte) (nRet int, errRet error) { - if err := w.err(nil); err != nil { - return 0, err - } - if w.concurrency == 1 { - return w.writeSync(p) - } - - // Spawn goroutine and write block to output channel. - for len(p) > 0 { - if !w.wroteStreamHeader { - w.wroteStreamHeader = true - hWriter := make(chan result) - w.output <- hWriter - if w.snappy { - hWriter <- []byte(magicChunkSnappy) - } else { - hWriter <- []byte(magicChunk) - } - } - - var uncompressed []byte - if len(p) > w.blockSize { - uncompressed, p = p[:w.blockSize], p[w.blockSize:] - } else { - uncompressed, p = p, nil - } - - // Copy input. - // If the block is incompressible, this is used for the result. - inbuf := w.buffers.Get().([]byte)[:len(uncompressed)+obufHeaderLen] - obuf := w.buffers.Get().([]byte)[:w.obufLen] - copy(inbuf[obufHeaderLen:], uncompressed) - uncompressed = inbuf[obufHeaderLen:] - - output := make(chan result) - // Queue output now, so we keep order. - w.output <- output - go func() { - checksum := crc(uncompressed) - - // Set to uncompressed. - chunkType := uint8(chunkTypeUncompressedData) - chunkLen := 4 + len(uncompressed) - - // Attempt compressing. - n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) - n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) - - // Check if we should use this, or store as uncompressed instead. - if n2 > 0 { - chunkType = uint8(chunkTypeCompressedData) - chunkLen = 4 + n + n2 - obuf = obuf[:obufHeaderLen+n+n2] - } else { - // Use input as output. - obuf, inbuf = inbuf, obuf - } - - // Fill in the per-chunk header that comes before the body. - obuf[0] = chunkType - obuf[1] = uint8(chunkLen >> 0) - obuf[2] = uint8(chunkLen >> 8) - obuf[3] = uint8(chunkLen >> 16) - obuf[4] = uint8(checksum >> 0) - obuf[5] = uint8(checksum >> 8) - obuf[6] = uint8(checksum >> 16) - obuf[7] = uint8(checksum >> 24) - - // Queue final output. - output <- obuf - - // Put unused buffer back in pool. - w.buffers.Put(inbuf) - }() - nRet += len(uncompressed) - } - return nRet, nil -} - -// writeFull is a special version of write that will always write the full buffer. -// Data to be compressed should start at offset obufHeaderLen and fill the remainder of the buffer. -// The data will be written as a single block. -// The caller is not allowed to use inbuf after this function has been called. -func (w *Writer) writeFull(inbuf []byte) (errRet error) { - if err := w.err(nil); err != nil { - return err - } - - if w.concurrency == 1 { - _, err := w.writeSync(inbuf[obufHeaderLen:]) - return err - } - - // Spawn goroutine and write block to output channel. - if !w.wroteStreamHeader { - w.wroteStreamHeader = true - hWriter := make(chan result) - w.output <- hWriter - if w.snappy { - hWriter <- []byte(magicChunkSnappy) - } else { - hWriter <- []byte(magicChunk) - } - } - - // Get an output buffer. - obuf := w.buffers.Get().([]byte)[:w.obufLen] - uncompressed := inbuf[obufHeaderLen:] - - output := make(chan result) - // Queue output now, so we keep order. - w.output <- output - go func() { - checksum := crc(uncompressed) - - // Set to uncompressed. - chunkType := uint8(chunkTypeUncompressedData) - chunkLen := 4 + len(uncompressed) - - // Attempt compressing. - n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) - n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) - - // Check if we should use this, or store as uncompressed instead. - if n2 > 0 { - chunkType = uint8(chunkTypeCompressedData) - chunkLen = 4 + n + n2 - obuf = obuf[:obufHeaderLen+n+n2] - } else { - // Use input as output. - obuf, inbuf = inbuf, obuf - } - - // Fill in the per-chunk header that comes before the body. - obuf[0] = chunkType - obuf[1] = uint8(chunkLen >> 0) - obuf[2] = uint8(chunkLen >> 8) - obuf[3] = uint8(chunkLen >> 16) - obuf[4] = uint8(checksum >> 0) - obuf[5] = uint8(checksum >> 8) - obuf[6] = uint8(checksum >> 16) - obuf[7] = uint8(checksum >> 24) - - // Queue final output. - output <- obuf - - // Put unused buffer back in pool. - w.buffers.Put(inbuf) - }() - return nil -} - -func (w *Writer) writeSync(p []byte) (nRet int, errRet error) { - if err := w.err(nil); err != nil { - return 0, err - } - if !w.wroteStreamHeader { - w.wroteStreamHeader = true - var n int - var err error - if w.snappy { - n, err = w.writer.Write([]byte(magicChunkSnappy)) - } else { - n, err = w.writer.Write([]byte(magicChunk)) - } - if err != nil { - return 0, w.err(err) - } - if n != len(magicChunk) { - return 0, w.err(io.ErrShortWrite) - } - w.written += int64(n) - } - - for len(p) > 0 { - var uncompressed []byte - if len(p) > w.blockSize { - uncompressed, p = p[:w.blockSize], p[w.blockSize:] - } else { - uncompressed, p = p, nil - } - - obuf := w.buffers.Get().([]byte)[:w.obufLen] - checksum := crc(uncompressed) - - // Set to uncompressed. - chunkType := uint8(chunkTypeUncompressedData) - chunkLen := 4 + len(uncompressed) - - // Attempt compressing. - n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) - n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) - - if n2 > 0 { - chunkType = uint8(chunkTypeCompressedData) - chunkLen = 4 + n + n2 - obuf = obuf[:obufHeaderLen+n+n2] - } else { - obuf = obuf[:8] - } - - // Fill in the per-chunk header that comes before the body. - obuf[0] = chunkType - obuf[1] = uint8(chunkLen >> 0) - obuf[2] = uint8(chunkLen >> 8) - obuf[3] = uint8(chunkLen >> 16) - obuf[4] = uint8(checksum >> 0) - obuf[5] = uint8(checksum >> 8) - obuf[6] = uint8(checksum >> 16) - obuf[7] = uint8(checksum >> 24) - - n, err := w.writer.Write(obuf) - if err != nil { - return 0, w.err(err) - } - if n != len(obuf) { - return 0, w.err(io.ErrShortWrite) - } - w.written += int64(n) - if chunkType == chunkTypeUncompressedData { - // Write uncompressed data. - n, err := w.writer.Write(uncompressed) - if err != nil { - return 0, w.err(err) - } - if n != len(uncompressed) { - return 0, w.err(io.ErrShortWrite) - } - w.written += int64(n) - } - w.buffers.Put(obuf) - // Queue final output. - nRet += len(uncompressed) - } - return nRet, nil -} - -// Flush flushes the Writer to its underlying io.Writer. -// This does not apply padding. -func (w *Writer) Flush() error { - if err := w.err(nil); err != nil { - return err - } - - // Queue any data still in input buffer. - if len(w.ibuf) != 0 { - if !w.wroteStreamHeader { - _, err := w.writeSync(w.ibuf) - w.ibuf = w.ibuf[:0] - return w.err(err) - } else { - _, err := w.write(w.ibuf) - w.ibuf = w.ibuf[:0] - err = w.err(err) - if err != nil { - return err - } - } - } - if w.output == nil { - return w.err(nil) - } - - // Send empty buffer - res := make(chan result) - w.output <- res - // Block until this has been picked up. - res <- nil - // When it is closed, we have flushed. - <-res - return w.err(nil) -} - -// Close calls Flush and then closes the Writer. -// Calling Close multiple times is ok. -func (w *Writer) Close() error { - err := w.Flush() - if w.output != nil { - close(w.output) - w.writerWg.Wait() - w.output = nil - } - if w.err(nil) == nil && w.writer != nil && w.pad > 0 { - add := calcSkippableFrame(w.written, int64(w.pad)) - frame, err := skippableFrame(w.ibuf[:0], add, w.randSrc) - if err = w.err(err); err != nil { - return err - } - _, err2 := w.writer.Write(frame) - _ = w.err(err2) - } - _ = w.err(errClosed) - if err == errClosed { - return nil - } - return err -} - -const skippableFrameHeader = 4 - -// calcSkippableFrame will return a total size to be added for written -// to be divisible by multiple. -// The value will always be > skippableFrameHeader. -// The function will panic if written < 0 or wantMultiple <= 0. -func calcSkippableFrame(written, wantMultiple int64) int { - if wantMultiple <= 0 { - panic("wantMultiple <= 0") - } - if written < 0 { - panic("written < 0") - } - leftOver := written % wantMultiple - if leftOver == 0 { - return 0 - } - toAdd := wantMultiple - leftOver - for toAdd < skippableFrameHeader { - toAdd += wantMultiple - } - return int(toAdd) -} - -// skippableFrame will add a skippable frame with a total size of bytes. -// total should be >= skippableFrameHeader and < maxBlockSize + skippableFrameHeader -func skippableFrame(dst []byte, total int, r io.Reader) ([]byte, error) { - if total == 0 { - return dst, nil - } - if total < skippableFrameHeader { - return dst, fmt.Errorf("s2: requested skippable frame (%d) < 4", total) - } - if int64(total) >= maxBlockSize+skippableFrameHeader { - return dst, fmt.Errorf("s2: requested skippable frame (%d) >= max 1<<24", total) - } - // Chunk type 0xfe "Section 4.4 Padding (chunk type 0xfe)" - dst = append(dst, chunkTypePadding) - f := uint32(total - skippableFrameHeader) - // Add chunk length. - dst = append(dst, uint8(f), uint8(f>>8), uint8(f>>16)) - // Add data - start := len(dst) - dst = append(dst, make([]byte, f)...) - _, err := io.ReadFull(r, dst[start:]) - return dst, err -} - -// WriterOption is an option for creating a encoder. -type WriterOption func(*Writer) error - -// WriterConcurrency will set the concurrency, -// meaning the maximum number of decoders to run concurrently. -// The value supplied must be at least 1. -// By default this will be set to GOMAXPROCS. -func WriterConcurrency(n int) WriterOption { - return func(w *Writer) error { - if n <= 0 { - return errors.New("concurrency must be at least 1") - } - w.concurrency = n - return nil - } -} - -// WriterBetterCompression will enable better compression. -// EncodeBetter compresses better than Encode but typically with a -// 10-40% speed decrease on both compression and decompression. -func WriterBetterCompression() WriterOption { - return func(w *Writer) error { - w.level = levelBetter - return nil - } -} - -// WriterBestCompression will enable better compression. -// EncodeBetter compresses better than Encode but typically with a -// big speed decrease on compression. -func WriterBestCompression() WriterOption { - return func(w *Writer) error { - w.level = levelBest - return nil - } -} - -// WriterUncompressed will bypass compression. -// The stream will be written as uncompressed blocks only. -// If concurrency is > 1 CRC and output will still be done async. -func WriterUncompressed() WriterOption { - return func(w *Writer) error { - w.level = levelUncompressed - return nil - } -} - -// WriterBlockSize allows to override the default block size. -// Blocks will be this size or smaller. -// Minimum size is 4KB and and maximum size is 4MB. -// -// Bigger blocks may give bigger throughput on systems with many cores, -// and will increase compression slightly, but it will limit the possible -// concurrency for smaller payloads for both encoding and decoding. -// Default block size is 1MB. -// -// When writing Snappy compatible output using WriterSnappyCompat, -// the maximum block size is 64KB. -func WriterBlockSize(n int) WriterOption { - return func(w *Writer) error { - if w.snappy && n > maxSnappyBlockSize || n < minBlockSize { - return errors.New("s2: block size too large. Must be <= 64K and >=4KB on for snappy compatible output") - } - if n > maxBlockSize || n < minBlockSize { - return errors.New("s2: block size too large. Must be <= 4MB and >=4KB") - } - w.blockSize = n - return nil - } -} - -// WriterPadding will add padding to all output so the size will be a multiple of n. -// This can be used to obfuscate the exact output size or make blocks of a certain size. -// The contents will be a skippable frame, so it will be invisible by the decoder. -// n must be > 0 and <= 4MB. -// The padded area will be filled with data from crypto/rand.Reader. -// The padding will be applied whenever Close is called on the writer. -func WriterPadding(n int) WriterOption { - return func(w *Writer) error { - if n <= 0 { - return fmt.Errorf("s2: padding must be at least 1") - } - // No need to waste our time. - if n == 1 { - w.pad = 0 - } - if n > maxBlockSize { - return fmt.Errorf("s2: padding must less than 4MB") - } - w.pad = n - return nil - } -} - -// WriterPaddingSrc will get random data for padding from the supplied source. -// By default crypto/rand is used. -func WriterPaddingSrc(reader io.Reader) WriterOption { - return func(w *Writer) error { - w.randSrc = reader - return nil - } -} - -// WriterSnappyCompat will write snappy compatible output. -// The output can be decompressed using either snappy or s2. -// If block size is more than 64KB it is set to that. -func WriterSnappyCompat() WriterOption { - return func(w *Writer) error { - w.snappy = true - if w.blockSize > 64<<10 { - // We choose 8 bytes less than 64K, since that will make literal emits slightly more effective. - // And allows us to skip some size checks. - w.blockSize = (64 << 10) - 8 - } - return nil - } -} - -// WriterFlushOnWrite will compress blocks on each call to the Write function. -// -// This is quite inefficient as blocks size will depend on the write size. -// -// Use WriterConcurrency(1) to also make sure that output is flushed. -// When Write calls return, otherwise they will be written when compression is done. -func WriterFlushOnWrite() WriterOption { - return func(w *Writer) error { - w.flushOnWrite = true - return nil - } -} diff --git a/vendor/github.com/klauspost/compress/s2/encode_all.go b/vendor/github.com/klauspost/compress/s2/encode_all.go deleted file mode 100644 index 8b16c38a68f..00000000000 --- a/vendor/github.com/klauspost/compress/s2/encode_all.go +++ /dev/null @@ -1,456 +0,0 @@ -// Copyright 2016 The Snappy-Go Authors. All rights reserved. -// Copyright (c) 2019 Klaus Post. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package s2 - -import ( - "bytes" - "encoding/binary" - "math/bits" -) - -func load32(b []byte, i int) uint32 { - return binary.LittleEndian.Uint32(b[i:]) -} - -func load64(b []byte, i int) uint64 { - return binary.LittleEndian.Uint64(b[i:]) -} - -// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <64. -func hash6(u uint64, h uint8) uint32 { - const prime6bytes = 227718039650203 - return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63)) -} - -func encodeGo(dst, src []byte) []byte { - if n := MaxEncodedLen(len(src)); n < 0 { - panic(ErrTooLarge) - } else if len(dst) < n { - dst = make([]byte, n) - } - - // The block starts with the varint-encoded length of the decompressed bytes. - d := binary.PutUvarint(dst, uint64(len(src))) - - if len(src) == 0 { - return dst[:d] - } - if len(src) < minNonLiteralBlockSize { - d += emitLiteral(dst[d:], src) - return dst[:d] - } - n := encodeBlockGo(dst[d:], src) - if n > 0 { - d += n - return dst[:d] - } - // Not compressible - d += emitLiteral(dst[d:], src) - return dst[:d] -} - -// encodeBlockGo encodes a non-empty src to a guaranteed-large-enough dst. It -// assumes that the varint-encoded length of the decompressed bytes has already -// been written. -// -// It also assumes that: -// len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize -func encodeBlockGo(dst, src []byte) (d int) { - // Initialize the hash table. - const ( - tableBits = 14 - maxTableSize = 1 << tableBits - - debug = false - ) - - var table [maxTableSize]uint32 - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := len(src) - inputMargin - - // Bail if we can't compress to at least this. - dstLimit := len(src) - len(src)>>5 - 5 - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := 0 - - // The encoded form must start with a literal, as there are no previous - // bytes to copy, so we start looking for hash matches at s == 1. - s := 1 - cv := load64(src, s) - - // We search for a repeat at -1, but don't output repeats when nextEmit == 0 - repeat := 1 - - for { - candidate := 0 - for { - // Next src position to check - nextS := s + (s-nextEmit)>>6 + 4 - if nextS > sLimit { - goto emitRemainder - } - hash0 := hash6(cv, tableBits) - hash1 := hash6(cv>>8, tableBits) - candidate = int(table[hash0]) - candidate2 := int(table[hash1]) - table[hash0] = uint32(s) - table[hash1] = uint32(s + 1) - hash2 := hash6(cv>>16, tableBits) - - // Check repeat at offset checkRep. - const checkRep = 1 - if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { - base := s + checkRep - // Extend back - for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { - i-- - base-- - } - d += emitLiteral(dst[d:], src[nextEmit:base]) - - // Extend forward - candidate := s - repeat + 4 + checkRep - s += 4 + checkRep - for s <= sLimit { - if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { - s += bits.TrailingZeros64(diff) >> 3 - break - } - s += 8 - candidate += 8 - } - if debug { - // Validate match. - if s <= candidate { - panic("s <= candidate") - } - a := src[base:s] - b := src[base-repeat : base-repeat+(s-base)] - if !bytes.Equal(a, b) { - panic("mismatch") - } - } - if nextEmit > 0 { - // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. - d += emitRepeat(dst[d:], repeat, s-base) - } else { - // First match, cannot be repeat. - d += emitCopy(dst[d:], repeat, s-base) - } - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - cv = load64(src, s) - continue - } - - if uint32(cv) == load32(src, candidate) { - break - } - candidate = int(table[hash2]) - if uint32(cv>>8) == load32(src, candidate2) { - table[hash2] = uint32(s + 2) - candidate = candidate2 - s++ - break - } - table[hash2] = uint32(s + 2) - if uint32(cv>>16) == load32(src, candidate) { - s += 2 - break - } - - cv = load64(src, nextS) - s = nextS - } - - // Extend backwards. - // The top bytes will be rechecked to get the full match. - for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { - candidate-- - s-- - } - - // Bail if we exceed the maximum size. - if d+(s-nextEmit) > dstLimit { - return 0 - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. - - d += emitLiteral(dst[d:], src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - base := s - repeat = base - candidate - - // Extend the 4-byte match as long as possible. - s += 4 - candidate += 4 - for s <= len(src)-8 { - if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { - s += bits.TrailingZeros64(diff) >> 3 - break - } - s += 8 - candidate += 8 - } - - d += emitCopy(dst[d:], repeat, s-base) - if debug { - // Validate match. - if s <= candidate { - panic("s <= candidate") - } - a := src[base:s] - b := src[base-repeat : base-repeat+(s-base)] - if !bytes.Equal(a, b) { - panic("mismatch") - } - } - - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - if d > dstLimit { - // Do we have space for more, if not bail. - return 0 - } - // Check for an immediate match, otherwise start search at s+1 - x := load64(src, s-2) - m2Hash := hash6(x, tableBits) - currHash := hash6(x>>16, tableBits) - candidate = int(table[currHash]) - table[m2Hash] = uint32(s - 2) - table[currHash] = uint32(s) - if debug && s == candidate { - panic("s == candidate") - } - if uint32(x>>16) != load32(src, candidate) { - cv = load64(src, s+1) - s++ - break - } - } - } - -emitRemainder: - if nextEmit < len(src) { - // Bail if we exceed the maximum size. - if d+len(src)-nextEmit > dstLimit { - return 0 - } - d += emitLiteral(dst[d:], src[nextEmit:]) - } - return d -} - -func encodeBlockSnappyGo(dst, src []byte) (d int) { - // Initialize the hash table. - const ( - tableBits = 14 - maxTableSize = 1 << tableBits - ) - - var table [maxTableSize]uint32 - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := len(src) - inputMargin - - // Bail if we can't compress to at least this. - dstLimit := len(src) - len(src)>>5 - 5 - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := 0 - - // The encoded form must start with a literal, as there are no previous - // bytes to copy, so we start looking for hash matches at s == 1. - s := 1 - cv := load64(src, s) - - // We search for a repeat at -1, but don't output repeats when nextEmit == 0 - repeat := 1 - - for { - candidate := 0 - for { - // Next src position to check - nextS := s + (s-nextEmit)>>6 + 4 - if nextS > sLimit { - goto emitRemainder - } - hash0 := hash6(cv, tableBits) - hash1 := hash6(cv>>8, tableBits) - candidate = int(table[hash0]) - candidate2 := int(table[hash1]) - table[hash0] = uint32(s) - table[hash1] = uint32(s + 1) - hash2 := hash6(cv>>16, tableBits) - - // Check repeat at offset checkRep. - const checkRep = 1 - if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { - base := s + checkRep - // Extend back - for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { - i-- - base-- - } - d += emitLiteral(dst[d:], src[nextEmit:base]) - - // Extend forward - candidate := s - repeat + 4 + checkRep - s += 4 + checkRep - for s <= sLimit { - if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { - s += bits.TrailingZeros64(diff) >> 3 - break - } - s += 8 - candidate += 8 - } - - d += emitCopyNoRepeat(dst[d:], repeat, s-base) - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - cv = load64(src, s) - continue - } - - if uint32(cv) == load32(src, candidate) { - break - } - candidate = int(table[hash2]) - if uint32(cv>>8) == load32(src, candidate2) { - table[hash2] = uint32(s + 2) - candidate = candidate2 - s++ - break - } - table[hash2] = uint32(s + 2) - if uint32(cv>>16) == load32(src, candidate) { - s += 2 - break - } - - cv = load64(src, nextS) - s = nextS - } - - // Extend backwards - for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { - candidate-- - s-- - } - - // Bail if we exceed the maximum size. - if d+(s-nextEmit) > dstLimit { - return 0 - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. - - d += emitLiteral(dst[d:], src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - base := s - repeat = base - candidate - - // Extend the 4-byte match as long as possible. - s += 4 - candidate += 4 - for s <= len(src)-8 { - if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { - s += bits.TrailingZeros64(diff) >> 3 - break - } - s += 8 - candidate += 8 - } - - d += emitCopyNoRepeat(dst[d:], repeat, s-base) - if false { - // Validate match. - a := src[base:s] - b := src[base-repeat : base-repeat+(s-base)] - if !bytes.Equal(a, b) { - panic("mismatch") - } - } - - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - if d > dstLimit { - // Do we have space for more, if not bail. - return 0 - } - // Check for an immediate match, otherwise start search at s+1 - x := load64(src, s-2) - m2Hash := hash6(x, tableBits) - currHash := hash6(x>>16, tableBits) - candidate = int(table[currHash]) - table[m2Hash] = uint32(s - 2) - table[currHash] = uint32(s) - if uint32(x>>16) != load32(src, candidate) { - cv = load64(src, s+1) - s++ - break - } - } - } - -emitRemainder: - if nextEmit < len(src) { - // Bail if we exceed the maximum size. - if d+len(src)-nextEmit > dstLimit { - return 0 - } - d += emitLiteral(dst[d:], src[nextEmit:]) - } - return d -} diff --git a/vendor/github.com/klauspost/compress/s2/encode_amd64.go b/vendor/github.com/klauspost/compress/s2/encode_amd64.go deleted file mode 100644 index e612225f4d3..00000000000 --- a/vendor/github.com/klauspost/compress/s2/encode_amd64.go +++ /dev/null @@ -1,142 +0,0 @@ -//go:build !appengine && !noasm && gc -// +build !appengine,!noasm,gc - -package s2 - -// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It -// assumes that the varint-encoded length of the decompressed bytes has already -// been written. -// -// It also assumes that: -// len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize -func encodeBlock(dst, src []byte) (d int) { - const ( - // Use 12 bit table when less than... - limit12B = 16 << 10 - // Use 10 bit table when less than... - limit10B = 4 << 10 - // Use 8 bit table when less than... - limit8B = 512 - ) - - if len(src) >= 4<<20 { - return encodeBlockAsm(dst, src) - } - if len(src) >= limit12B { - return encodeBlockAsm4MB(dst, src) - } - if len(src) >= limit10B { - return encodeBlockAsm12B(dst, src) - } - if len(src) >= limit8B { - return encodeBlockAsm10B(dst, src) - } - if len(src) < minNonLiteralBlockSize { - return 0 - } - return encodeBlockAsm8B(dst, src) -} - -// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It -// assumes that the varint-encoded length of the decompressed bytes has already -// been written. -// -// It also assumes that: -// len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize -func encodeBlockBetter(dst, src []byte) (d int) { - const ( - // Use 12 bit table when less than... - limit12B = 16 << 10 - // Use 10 bit table when less than... - limit10B = 4 << 10 - // Use 8 bit table when less than... - limit8B = 512 - ) - - if len(src) > 4<<20 { - return encodeBetterBlockAsm(dst, src) - } - if len(src) >= limit12B { - return encodeBetterBlockAsm4MB(dst, src) - } - if len(src) >= limit10B { - return encodeBetterBlockAsm12B(dst, src) - } - if len(src) >= limit8B { - return encodeBetterBlockAsm10B(dst, src) - } - if len(src) < minNonLiteralBlockSize { - return 0 - } - return encodeBetterBlockAsm8B(dst, src) -} - -// encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst. It -// assumes that the varint-encoded length of the decompressed bytes has already -// been written. -// -// It also assumes that: -// len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize -func encodeBlockSnappy(dst, src []byte) (d int) { - const ( - // Use 12 bit table when less than... - limit12B = 16 << 10 - // Use 10 bit table when less than... - limit10B = 4 << 10 - // Use 8 bit table when less than... - limit8B = 512 - ) - if len(src) >= 64<<10 { - return encodeSnappyBlockAsm(dst, src) - } - if len(src) >= limit12B { - return encodeSnappyBlockAsm64K(dst, src) - } - if len(src) >= limit10B { - return encodeSnappyBlockAsm12B(dst, src) - } - if len(src) >= limit8B { - return encodeSnappyBlockAsm10B(dst, src) - } - if len(src) < minNonLiteralBlockSize { - return 0 - } - return encodeSnappyBlockAsm8B(dst, src) -} - -// encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst. It -// assumes that the varint-encoded length of the decompressed bytes has already -// been written. -// -// It also assumes that: -// len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize -func encodeBlockBetterSnappy(dst, src []byte) (d int) { - const ( - // Use 12 bit table when less than... - limit12B = 16 << 10 - // Use 10 bit table when less than... - limit10B = 4 << 10 - // Use 8 bit table when less than... - limit8B = 512 - ) - if len(src) >= 64<<10 { - return encodeSnappyBetterBlockAsm(dst, src) - } - if len(src) >= limit12B { - return encodeSnappyBetterBlockAsm64K(dst, src) - } - if len(src) >= limit10B { - return encodeSnappyBetterBlockAsm12B(dst, src) - } - if len(src) >= limit8B { - return encodeSnappyBetterBlockAsm10B(dst, src) - } - if len(src) < minNonLiteralBlockSize { - return 0 - } - return encodeSnappyBetterBlockAsm8B(dst, src) -} diff --git a/vendor/github.com/klauspost/compress/s2/encode_best.go b/vendor/github.com/klauspost/compress/s2/encode_best.go deleted file mode 100644 index 4480347769a..00000000000 --- a/vendor/github.com/klauspost/compress/s2/encode_best.go +++ /dev/null @@ -1,604 +0,0 @@ -// Copyright 2016 The Snappy-Go Authors. All rights reserved. -// Copyright (c) 2019 Klaus Post. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package s2 - -import ( - "fmt" - "math/bits" -) - -// encodeBlockBest encodes a non-empty src to a guaranteed-large-enough dst. It -// assumes that the varint-encoded length of the decompressed bytes has already -// been written. -// -// It also assumes that: -// len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize -func encodeBlockBest(dst, src []byte) (d int) { - // Initialize the hash tables. - const ( - // Long hash matches. - lTableBits = 19 - maxLTableSize = 1 << lTableBits - - // Short hash matches. - sTableBits = 16 - maxSTableSize = 1 << sTableBits - - inputMargin = 8 + 2 - ) - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := len(src) - inputMargin - if len(src) < minNonLiteralBlockSize { - return 0 - } - - var lTable [maxLTableSize]uint64 - var sTable [maxSTableSize]uint64 - - // Bail if we can't compress to at least this. - dstLimit := len(src) - 5 - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := 0 - - // The encoded form must start with a literal, as there are no previous - // bytes to copy, so we start looking for hash matches at s == 1. - s := 1 - cv := load64(src, s) - - // We search for a repeat at -1, but don't output repeats when nextEmit == 0 - repeat := 1 - const lowbitMask = 0xffffffff - getCur := func(x uint64) int { - return int(x & lowbitMask) - } - getPrev := func(x uint64) int { - return int(x >> 32) - } - const maxSkip = 64 - - for { - type match struct { - offset int - s int - length int - score int - rep bool - } - var best match - for { - // Next src position to check - nextS := (s-nextEmit)>>8 + 1 - if nextS > maxSkip { - nextS = s + maxSkip - } else { - nextS += s - } - if nextS > sLimit { - goto emitRemainder - } - hashL := hash8(cv, lTableBits) - hashS := hash4(cv, sTableBits) - candidateL := lTable[hashL] - candidateS := sTable[hashS] - - score := func(m match) int { - // Matches that are longer forward are penalized since we must emit it as a literal. - score := m.length - m.s - if nextEmit == m.s { - // If we do not have to emit literals, we save 1 byte - score++ - } - offset := m.s - m.offset - if m.rep { - return score - emitRepeatSize(offset, m.length) - } - return score - emitCopySize(offset, m.length) - } - - matchAt := func(offset, s int, first uint32, rep bool) match { - if best.length != 0 && best.s-best.offset == s-offset { - // Don't retest if we have the same offset. - return match{offset: offset, s: s} - } - if load32(src, offset) != first { - return match{offset: offset, s: s} - } - m := match{offset: offset, s: s, length: 4 + offset, rep: rep} - s += 4 - for s <= sLimit { - if diff := load64(src, s) ^ load64(src, m.length); diff != 0 { - m.length += bits.TrailingZeros64(diff) >> 3 - break - } - s += 8 - m.length += 8 - } - m.length -= offset - m.score = score(m) - if m.score <= -m.s { - // Eliminate if no savings, we might find a better one. - m.length = 0 - } - return m - } - - bestOf := func(a, b match) match { - if b.length == 0 { - return a - } - if a.length == 0 { - return b - } - as := a.score + b.s - bs := b.score + a.s - if as >= bs { - return a - } - return b - } - - best = bestOf(matchAt(getCur(candidateL), s, uint32(cv), false), matchAt(getPrev(candidateL), s, uint32(cv), false)) - best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv), false)) - best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv), false)) - - { - best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true)) - if best.length > 0 { - // s+1 - nextShort := sTable[hash4(cv>>8, sTableBits)] - s := s + 1 - cv := load64(src, s) - nextLong := lTable[hash8(cv, lTableBits)] - best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false)) - best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false)) - best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false)) - best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false)) - // Repeat at + 2 - best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true)) - - // s+2 - if true { - nextShort = sTable[hash4(cv>>8, sTableBits)] - s++ - cv = load64(src, s) - nextLong = lTable[hash8(cv, lTableBits)] - best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false)) - best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false)) - best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false)) - best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false)) - } - // Search for a match at best match end, see if that is better. - if sAt := best.s + best.length; sAt < sLimit { - sBack := best.s - backL := best.length - // Load initial values - cv = load64(src, sBack) - // Search for mismatch - next := lTable[hash8(load64(src, sAt), lTableBits)] - //next := sTable[hash4(load64(src, sAt), sTableBits)] - - if checkAt := getCur(next) - backL; checkAt > 0 { - best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) - } - if checkAt := getPrev(next) - backL; checkAt > 0 { - best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) - } - } - } - } - - // Update table - lTable[hashL] = uint64(s) | candidateL<<32 - sTable[hashS] = uint64(s) | candidateS<<32 - - if best.length > 0 { - break - } - - cv = load64(src, nextS) - s = nextS - } - - // Extend backwards, not needed for repeats... - s = best.s - if !best.rep { - for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] { - best.offset-- - best.length++ - s-- - } - } - if false && best.offset >= s { - panic(fmt.Errorf("t %d >= s %d", best.offset, s)) - } - // Bail if we exceed the maximum size. - if d+(s-nextEmit) > dstLimit { - return 0 - } - - base := s - offset := s - best.offset - - s += best.length - - if offset > 65535 && s-base <= 5 && !best.rep { - // Bail if the match is equal or worse to the encoding. - s = best.s + 1 - if s >= sLimit { - goto emitRemainder - } - cv = load64(src, s) - continue - } - d += emitLiteral(dst[d:], src[nextEmit:base]) - if best.rep { - if nextEmit > 0 { - // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. - d += emitRepeat(dst[d:], offset, best.length) - } else { - // First match, cannot be repeat. - d += emitCopy(dst[d:], offset, best.length) - } - } else { - d += emitCopy(dst[d:], offset, best.length) - } - repeat = offset - - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - if d > dstLimit { - // Do we have space for more, if not bail. - return 0 - } - // Fill tables... - for i := best.s + 1; i < s; i++ { - cv0 := load64(src, i) - long0 := hash8(cv0, lTableBits) - short0 := hash4(cv0, sTableBits) - lTable[long0] = uint64(i) | lTable[long0]<<32 - sTable[short0] = uint64(i) | sTable[short0]<<32 - } - cv = load64(src, s) - } - -emitRemainder: - if nextEmit < len(src) { - // Bail if we exceed the maximum size. - if d+len(src)-nextEmit > dstLimit { - return 0 - } - d += emitLiteral(dst[d:], src[nextEmit:]) - } - return d -} - -// encodeBlockBestSnappy encodes a non-empty src to a guaranteed-large-enough dst. It -// assumes that the varint-encoded length of the decompressed bytes has already -// been written. -// -// It also assumes that: -// len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize -func encodeBlockBestSnappy(dst, src []byte) (d int) { - // Initialize the hash tables. - const ( - // Long hash matches. - lTableBits = 19 - maxLTableSize = 1 << lTableBits - - // Short hash matches. - sTableBits = 16 - maxSTableSize = 1 << sTableBits - - inputMargin = 8 + 2 - ) - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := len(src) - inputMargin - if len(src) < minNonLiteralBlockSize { - return 0 - } - - var lTable [maxLTableSize]uint64 - var sTable [maxSTableSize]uint64 - - // Bail if we can't compress to at least this. - dstLimit := len(src) - 5 - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := 0 - - // The encoded form must start with a literal, as there are no previous - // bytes to copy, so we start looking for hash matches at s == 1. - s := 1 - cv := load64(src, s) - - // We search for a repeat at -1, but don't output repeats when nextEmit == 0 - repeat := 1 - const lowbitMask = 0xffffffff - getCur := func(x uint64) int { - return int(x & lowbitMask) - } - getPrev := func(x uint64) int { - return int(x >> 32) - } - const maxSkip = 64 - - for { - type match struct { - offset int - s int - length int - score int - } - var best match - for { - // Next src position to check - nextS := (s-nextEmit)>>8 + 1 - if nextS > maxSkip { - nextS = s + maxSkip - } else { - nextS += s - } - if nextS > sLimit { - goto emitRemainder - } - hashL := hash8(cv, lTableBits) - hashS := hash4(cv, sTableBits) - candidateL := lTable[hashL] - candidateS := sTable[hashS] - - score := func(m match) int { - // Matches that are longer forward are penalized since we must emit it as a literal. - score := m.length - m.s - if nextEmit == m.s { - // If we do not have to emit literals, we save 1 byte - score++ - } - offset := m.s - m.offset - - return score - emitCopySize(offset, m.length) - } - - matchAt := func(offset, s int, first uint32) match { - if best.length != 0 && best.s-best.offset == s-offset { - // Don't retest if we have the same offset. - return match{offset: offset, s: s} - } - if load32(src, offset) != first { - return match{offset: offset, s: s} - } - m := match{offset: offset, s: s, length: 4 + offset} - s += 4 - for s <= sLimit { - if diff := load64(src, s) ^ load64(src, m.length); diff != 0 { - m.length += bits.TrailingZeros64(diff) >> 3 - break - } - s += 8 - m.length += 8 - } - m.length -= offset - m.score = score(m) - if m.score <= -m.s { - // Eliminate if no savings, we might find a better one. - m.length = 0 - } - return m - } - - bestOf := func(a, b match) match { - if b.length == 0 { - return a - } - if a.length == 0 { - return b - } - as := a.score + b.s - bs := b.score + a.s - if as >= bs { - return a - } - return b - } - - best = bestOf(matchAt(getCur(candidateL), s, uint32(cv)), matchAt(getPrev(candidateL), s, uint32(cv))) - best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv))) - best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv))) - - { - best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8))) - if best.length > 0 { - // s+1 - nextShort := sTable[hash4(cv>>8, sTableBits)] - s := s + 1 - cv := load64(src, s) - nextLong := lTable[hash8(cv, lTableBits)] - best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv))) - best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv))) - best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv))) - best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv))) - // Repeat at + 2 - best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8))) - - // s+2 - if true { - nextShort = sTable[hash4(cv>>8, sTableBits)] - s++ - cv = load64(src, s) - nextLong = lTable[hash8(cv, lTableBits)] - best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv))) - best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv))) - best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv))) - best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv))) - } - // Search for a match at best match end, see if that is better. - if sAt := best.s + best.length; sAt < sLimit { - sBack := best.s - backL := best.length - // Load initial values - cv = load64(src, sBack) - // Search for mismatch - next := lTable[hash8(load64(src, sAt), lTableBits)] - //next := sTable[hash4(load64(src, sAt), sTableBits)] - - if checkAt := getCur(next) - backL; checkAt > 0 { - best = bestOf(best, matchAt(checkAt, sBack, uint32(cv))) - } - if checkAt := getPrev(next) - backL; checkAt > 0 { - best = bestOf(best, matchAt(checkAt, sBack, uint32(cv))) - } - } - } - } - - // Update table - lTable[hashL] = uint64(s) | candidateL<<32 - sTable[hashS] = uint64(s) | candidateS<<32 - - if best.length > 0 { - break - } - - cv = load64(src, nextS) - s = nextS - } - - // Extend backwards, not needed for repeats... - s = best.s - if true { - for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] { - best.offset-- - best.length++ - s-- - } - } - if false && best.offset >= s { - panic(fmt.Errorf("t %d >= s %d", best.offset, s)) - } - // Bail if we exceed the maximum size. - if d+(s-nextEmit) > dstLimit { - return 0 - } - - base := s - offset := s - best.offset - - s += best.length - - if offset > 65535 && s-base <= 5 { - // Bail if the match is equal or worse to the encoding. - s = best.s + 1 - if s >= sLimit { - goto emitRemainder - } - cv = load64(src, s) - continue - } - d += emitLiteral(dst[d:], src[nextEmit:base]) - d += emitCopyNoRepeat(dst[d:], offset, best.length) - repeat = offset - - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - if d > dstLimit { - // Do we have space for more, if not bail. - return 0 - } - // Fill tables... - for i := best.s + 1; i < s; i++ { - cv0 := load64(src, i) - long0 := hash8(cv0, lTableBits) - short0 := hash4(cv0, sTableBits) - lTable[long0] = uint64(i) | lTable[long0]<<32 - sTable[short0] = uint64(i) | sTable[short0]<<32 - } - cv = load64(src, s) - } - -emitRemainder: - if nextEmit < len(src) { - // Bail if we exceed the maximum size. - if d+len(src)-nextEmit > dstLimit { - return 0 - } - d += emitLiteral(dst[d:], src[nextEmit:]) - } - return d -} - -// emitCopySize returns the size to encode the offset+length -// -// It assumes that: -// 1 <= offset && offset <= math.MaxUint32 -// 4 <= length && length <= 1 << 24 -func emitCopySize(offset, length int) int { - if offset >= 65536 { - i := 0 - if length > 64 { - length -= 64 - if length >= 4 { - // Emit remaining as repeats - return 5 + emitRepeatSize(offset, length) - } - i = 5 - } - if length == 0 { - return i - } - return i + 5 - } - - // Offset no more than 2 bytes. - if length > 64 { - // Emit remaining as repeats, at least 4 bytes remain. - return 3 + emitRepeatSize(offset, length-60) - } - if length >= 12 || offset >= 2048 { - return 3 - } - // Emit the remaining copy, encoded as 2 bytes. - return 2 -} - -// emitRepeatSize returns the number of bytes required to encode a repeat. -// Length must be at least 4 and < 1<<24 -func emitRepeatSize(offset, length int) int { - // Repeat offset, make length cheaper - if length <= 4+4 || (length < 8+4 && offset < 2048) { - return 2 - } - if length < (1<<8)+4+4 { - return 3 - } - if length < (1<<16)+(1<<8)+4 { - return 4 - } - const maxRepeat = (1 << 24) - 1 - length -= (1 << 16) - 4 - left := 0 - if length > maxRepeat { - left = length - maxRepeat + 4 - length = maxRepeat - 4 - } - if left > 0 { - return 5 + emitRepeatSize(offset, left) - } - return 5 -} diff --git a/vendor/github.com/klauspost/compress/s2/encode_better.go b/vendor/github.com/klauspost/compress/s2/encode_better.go deleted file mode 100644 index 943215b8ae8..00000000000 --- a/vendor/github.com/klauspost/compress/s2/encode_better.go +++ /dev/null @@ -1,431 +0,0 @@ -// Copyright 2016 The Snappy-Go Authors. All rights reserved. -// Copyright (c) 2019 Klaus Post. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package s2 - -import ( - "math/bits" -) - -// hash4 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <32. -func hash4(u uint64, h uint8) uint32 { - const prime4bytes = 2654435761 - return (uint32(u) * prime4bytes) >> ((32 - h) & 31) -} - -// hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <64. -func hash5(u uint64, h uint8) uint32 { - const prime5bytes = 889523592379 - return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63)) -} - -// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <64. -func hash7(u uint64, h uint8) uint32 { - const prime7bytes = 58295818150454627 - return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63)) -} - -// hash8 returns the hash of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <64. -func hash8(u uint64, h uint8) uint32 { - const prime8bytes = 0xcf1bbcdcb7a56463 - return uint32((u * prime8bytes) >> ((64 - h) & 63)) -} - -// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It -// assumes that the varint-encoded length of the decompressed bytes has already -// been written. -// -// It also assumes that: -// len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize -func encodeBlockBetterGo(dst, src []byte) (d int) { - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := len(src) - inputMargin - if len(src) < minNonLiteralBlockSize { - return 0 - } - - // Initialize the hash tables. - const ( - // Long hash matches. - lTableBits = 16 - maxLTableSize = 1 << lTableBits - - // Short hash matches. - sTableBits = 14 - maxSTableSize = 1 << sTableBits - ) - - var lTable [maxLTableSize]uint32 - var sTable [maxSTableSize]uint32 - - // Bail if we can't compress to at least this. - dstLimit := len(src) - len(src)>>5 - 6 - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := 0 - - // The encoded form must start with a literal, as there are no previous - // bytes to copy, so we start looking for hash matches at s == 1. - s := 1 - cv := load64(src, s) - - // We initialize repeat to 0, so we never match on first attempt - repeat := 0 - - for { - candidateL := 0 - nextS := 0 - for { - // Next src position to check - nextS = s + (s-nextEmit)>>7 + 1 - if nextS > sLimit { - goto emitRemainder - } - hashL := hash7(cv, lTableBits) - hashS := hash4(cv, sTableBits) - candidateL = int(lTable[hashL]) - candidateS := int(sTable[hashS]) - lTable[hashL] = uint32(s) - sTable[hashS] = uint32(s) - - // Check repeat at offset checkRep. - const checkRep = 1 - if false && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { - base := s + checkRep - // Extend back - for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { - i-- - base-- - } - d += emitLiteral(dst[d:], src[nextEmit:base]) - - // Extend forward - candidate := s - repeat + 4 + checkRep - s += 4 + checkRep - for s < len(src) { - if len(src)-s < 8 { - if src[s] == src[candidate] { - s++ - candidate++ - continue - } - break - } - if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { - s += bits.TrailingZeros64(diff) >> 3 - break - } - s += 8 - candidate += 8 - } - if nextEmit > 0 { - // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. - d += emitRepeat(dst[d:], repeat, s-base) - } else { - // First match, cannot be repeat. - d += emitCopy(dst[d:], repeat, s-base) - } - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - cv = load64(src, s) - continue - } - - if uint32(cv) == load32(src, candidateL) { - break - } - - // Check our short candidate - if uint32(cv) == load32(src, candidateS) { - // Try a long candidate at s+1 - hashL = hash7(cv>>8, lTableBits) - candidateL = int(lTable[hashL]) - lTable[hashL] = uint32(s + 1) - if uint32(cv>>8) == load32(src, candidateL) { - s++ - break - } - // Use our short candidate. - candidateL = candidateS - break - } - - cv = load64(src, nextS) - s = nextS - } - - // Extend backwards - for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { - candidateL-- - s-- - } - - // Bail if we exceed the maximum size. - if d+(s-nextEmit) > dstLimit { - return 0 - } - - base := s - offset := base - candidateL - - // Extend the 4-byte match as long as possible. - s += 4 - candidateL += 4 - for s < len(src) { - if len(src)-s < 8 { - if src[s] == src[candidateL] { - s++ - candidateL++ - continue - } - break - } - if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { - s += bits.TrailingZeros64(diff) >> 3 - break - } - s += 8 - candidateL += 8 - } - - if offset > 65535 && s-base <= 5 && repeat != offset { - // Bail if the match is equal or worse to the encoding. - s = nextS + 1 - if s >= sLimit { - goto emitRemainder - } - cv = load64(src, s) - continue - } - - d += emitLiteral(dst[d:], src[nextEmit:base]) - if repeat == offset { - d += emitRepeat(dst[d:], offset, s-base) - } else { - d += emitCopy(dst[d:], offset, s-base) - repeat = offset - } - - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - if d > dstLimit { - // Do we have space for more, if not bail. - return 0 - } - // Index match start+1 (long) and start+2 (short) - index0 := base + 1 - // Index match end-2 (long) and end-1 (short) - index1 := s - 2 - - cv0 := load64(src, index0) - cv1 := load64(src, index1) - cv = load64(src, s) - lTable[hash7(cv0, lTableBits)] = uint32(index0) - lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1) - lTable[hash7(cv1, lTableBits)] = uint32(index1) - lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1) - sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) - sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2) - sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) - } - -emitRemainder: - if nextEmit < len(src) { - // Bail if we exceed the maximum size. - if d+len(src)-nextEmit > dstLimit { - return 0 - } - d += emitLiteral(dst[d:], src[nextEmit:]) - } - return d -} - -// encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It -// assumes that the varint-encoded length of the decompressed bytes has already -// been written. -// -// It also assumes that: -// len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize -func encodeBlockBetterSnappyGo(dst, src []byte) (d int) { - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := len(src) - inputMargin - if len(src) < minNonLiteralBlockSize { - return 0 - } - - // Initialize the hash tables. - const ( - // Long hash matches. - lTableBits = 16 - maxLTableSize = 1 << lTableBits - - // Short hash matches. - sTableBits = 14 - maxSTableSize = 1 << sTableBits - ) - - var lTable [maxLTableSize]uint32 - var sTable [maxSTableSize]uint32 - - // Bail if we can't compress to at least this. - dstLimit := len(src) - len(src)>>5 - 6 - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := 0 - - // The encoded form must start with a literal, as there are no previous - // bytes to copy, so we start looking for hash matches at s == 1. - s := 1 - cv := load64(src, s) - - // We initialize repeat to 0, so we never match on first attempt - repeat := 0 - const maxSkip = 100 - - for { - candidateL := 0 - nextS := 0 - for { - // Next src position to check - nextS = (s-nextEmit)>>7 + 1 - if nextS > maxSkip { - nextS = s + maxSkip - } else { - nextS += s - } - - if nextS > sLimit { - goto emitRemainder - } - hashL := hash7(cv, lTableBits) - hashS := hash4(cv, sTableBits) - candidateL = int(lTable[hashL]) - candidateS := int(sTable[hashS]) - lTable[hashL] = uint32(s) - sTable[hashS] = uint32(s) - - if uint32(cv) == load32(src, candidateL) { - break - } - - // Check our short candidate - if uint32(cv) == load32(src, candidateS) { - // Try a long candidate at s+1 - hashL = hash7(cv>>8, lTableBits) - candidateL = int(lTable[hashL]) - lTable[hashL] = uint32(s + 1) - if uint32(cv>>8) == load32(src, candidateL) { - s++ - break - } - // Use our short candidate. - candidateL = candidateS - break - } - - cv = load64(src, nextS) - s = nextS - } - - // Extend backwards - for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { - candidateL-- - s-- - } - - // Bail if we exceed the maximum size. - if d+(s-nextEmit) > dstLimit { - return 0 - } - - base := s - offset := base - candidateL - - // Extend the 4-byte match as long as possible. - s += 4 - candidateL += 4 - for s < len(src) { - if len(src)-s < 8 { - if src[s] == src[candidateL] { - s++ - candidateL++ - continue - } - break - } - if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { - s += bits.TrailingZeros64(diff) >> 3 - break - } - s += 8 - candidateL += 8 - } - - if offset > 65535 && s-base <= 5 && repeat != offset { - // Bail if the match is equal or worse to the encoding. - s = nextS + 1 - if s >= sLimit { - goto emitRemainder - } - cv = load64(src, s) - continue - } - - d += emitLiteral(dst[d:], src[nextEmit:base]) - d += emitCopyNoRepeat(dst[d:], offset, s-base) - repeat = offset - - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - if d > dstLimit { - // Do we have space for more, if not bail. - return 0 - } - // Index match start+1 (long) and start+2 (short) - index0 := base + 1 - // Index match end-2 (long) and end-1 (short) - index1 := s - 2 - - cv0 := load64(src, index0) - cv1 := load64(src, index1) - cv = load64(src, s) - lTable[hash7(cv0, lTableBits)] = uint32(index0) - lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1) - lTable[hash7(cv1, lTableBits)] = uint32(index1) - lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1) - sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) - sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2) - sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) - } - -emitRemainder: - if nextEmit < len(src) { - // Bail if we exceed the maximum size. - if d+len(src)-nextEmit > dstLimit { - return 0 - } - d += emitLiteral(dst[d:], src[nextEmit:]) - } - return d -} diff --git a/vendor/github.com/klauspost/compress/s2/encode_go.go b/vendor/github.com/klauspost/compress/s2/encode_go.go deleted file mode 100644 index 43d43534e4f..00000000000 --- a/vendor/github.com/klauspost/compress/s2/encode_go.go +++ /dev/null @@ -1,298 +0,0 @@ -//go:build !amd64 || appengine || !gc || noasm -// +build !amd64 appengine !gc noasm - -package s2 - -import ( - "math/bits" -) - -// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It -// assumes that the varint-encoded length of the decompressed bytes has already -// been written. -// -// It also assumes that: -// len(dst) >= MaxEncodedLen(len(src)) -func encodeBlock(dst, src []byte) (d int) { - if len(src) < minNonLiteralBlockSize { - return 0 - } - return encodeBlockGo(dst, src) -} - -// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It -// assumes that the varint-encoded length of the decompressed bytes has already -// been written. -// -// It also assumes that: -// len(dst) >= MaxEncodedLen(len(src)) -func encodeBlockBetter(dst, src []byte) (d int) { - return encodeBlockBetterGo(dst, src) -} - -// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It -// assumes that the varint-encoded length of the decompressed bytes has already -// been written. -// -// It also assumes that: -// len(dst) >= MaxEncodedLen(len(src)) -func encodeBlockBetterSnappy(dst, src []byte) (d int) { - return encodeBlockBetterSnappyGo(dst, src) -} - -// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It -// assumes that the varint-encoded length of the decompressed bytes has already -// been written. -// -// It also assumes that: -// len(dst) >= MaxEncodedLen(len(src)) -func encodeBlockSnappy(dst, src []byte) (d int) { - if len(src) < minNonLiteralBlockSize { - return 0 - } - return encodeBlockSnappyGo(dst, src) -} - -// emitLiteral writes a literal chunk and returns the number of bytes written. -// -// It assumes that: -// dst is long enough to hold the encoded bytes -// 0 <= len(lit) && len(lit) <= math.MaxUint32 -func emitLiteral(dst, lit []byte) int { - if len(lit) == 0 { - return 0 - } - const num = 63<<2 | tagLiteral - i, n := 0, uint(len(lit)-1) - switch { - case n < 60: - dst[0] = uint8(n)<<2 | tagLiteral - i = 1 - case n < 1<<8: - dst[1] = uint8(n) - dst[0] = 60<<2 | tagLiteral - i = 2 - case n < 1<<16: - dst[2] = uint8(n >> 8) - dst[1] = uint8(n) - dst[0] = 61<<2 | tagLiteral - i = 3 - case n < 1<<24: - dst[3] = uint8(n >> 16) - dst[2] = uint8(n >> 8) - dst[1] = uint8(n) - dst[0] = 62<<2 | tagLiteral - i = 4 - default: - dst[4] = uint8(n >> 24) - dst[3] = uint8(n >> 16) - dst[2] = uint8(n >> 8) - dst[1] = uint8(n) - dst[0] = 63<<2 | tagLiteral - i = 5 - } - return i + copy(dst[i:], lit) -} - -// emitRepeat writes a repeat chunk and returns the number of bytes written. -// Length must be at least 4 and < 1<<24 -func emitRepeat(dst []byte, offset, length int) int { - // Repeat offset, make length cheaper - length -= 4 - if length <= 4 { - dst[0] = uint8(length)<<2 | tagCopy1 - dst[1] = 0 - return 2 - } - if length < 8 && offset < 2048 { - // Encode WITH offset - dst[1] = uint8(offset) - dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1 - return 2 - } - if length < (1<<8)+4 { - length -= 4 - dst[2] = uint8(length) - dst[1] = 0 - dst[0] = 5<<2 | tagCopy1 - return 3 - } - if length < (1<<16)+(1<<8) { - length -= 1 << 8 - dst[3] = uint8(length >> 8) - dst[2] = uint8(length >> 0) - dst[1] = 0 - dst[0] = 6<<2 | tagCopy1 - return 4 - } - const maxRepeat = (1 << 24) - 1 - length -= 1 << 16 - left := 0 - if length > maxRepeat { - left = length - maxRepeat + 4 - length = maxRepeat - 4 - } - dst[4] = uint8(length >> 16) - dst[3] = uint8(length >> 8) - dst[2] = uint8(length >> 0) - dst[1] = 0 - dst[0] = 7<<2 | tagCopy1 - if left > 0 { - return 5 + emitRepeat(dst[5:], offset, left) - } - return 5 -} - -// emitCopy writes a copy chunk and returns the number of bytes written. -// -// It assumes that: -// dst is long enough to hold the encoded bytes -// 1 <= offset && offset <= math.MaxUint32 -// 4 <= length && length <= 1 << 24 -func emitCopy(dst []byte, offset, length int) int { - if offset >= 65536 { - i := 0 - if length > 64 { - // Emit a length 64 copy, encoded as 5 bytes. - dst[4] = uint8(offset >> 24) - dst[3] = uint8(offset >> 16) - dst[2] = uint8(offset >> 8) - dst[1] = uint8(offset) - dst[0] = 63<<2 | tagCopy4 - length -= 64 - if length >= 4 { - // Emit remaining as repeats - return 5 + emitRepeat(dst[5:], offset, length) - } - i = 5 - } - if length == 0 { - return i - } - // Emit a copy, offset encoded as 4 bytes. - dst[i+0] = uint8(length-1)<<2 | tagCopy4 - dst[i+1] = uint8(offset) - dst[i+2] = uint8(offset >> 8) - dst[i+3] = uint8(offset >> 16) - dst[i+4] = uint8(offset >> 24) - return i + 5 - } - - // Offset no more than 2 bytes. - if length > 64 { - // Emit a length 60 copy, encoded as 3 bytes. - // Emit remaining as repeat value (minimum 4 bytes). - dst[2] = uint8(offset >> 8) - dst[1] = uint8(offset) - dst[0] = 59<<2 | tagCopy2 - length -= 60 - // Emit remaining as repeats, at least 4 bytes remain. - return 3 + emitRepeat(dst[3:], offset, length) - } - if length >= 12 || offset >= 2048 { - // Emit the remaining copy, encoded as 3 bytes. - dst[2] = uint8(offset >> 8) - dst[1] = uint8(offset) - dst[0] = uint8(length-1)<<2 | tagCopy2 - return 3 - } - // Emit the remaining copy, encoded as 2 bytes. - dst[1] = uint8(offset) - dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 - return 2 -} - -// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written. -// -// It assumes that: -// dst is long enough to hold the encoded bytes -// 1 <= offset && offset <= math.MaxUint32 -// 4 <= length && length <= 1 << 24 -func emitCopyNoRepeat(dst []byte, offset, length int) int { - if offset >= 65536 { - i := 0 - if length > 64 { - // Emit a length 64 copy, encoded as 5 bytes. - dst[4] = uint8(offset >> 24) - dst[3] = uint8(offset >> 16) - dst[2] = uint8(offset >> 8) - dst[1] = uint8(offset) - dst[0] = 63<<2 | tagCopy4 - length -= 64 - if length >= 4 { - // Emit remaining as repeats - return 5 + emitCopyNoRepeat(dst[5:], offset, length) - } - i = 5 - } - if length == 0 { - return i - } - // Emit a copy, offset encoded as 4 bytes. - dst[i+0] = uint8(length-1)<<2 | tagCopy4 - dst[i+1] = uint8(offset) - dst[i+2] = uint8(offset >> 8) - dst[i+3] = uint8(offset >> 16) - dst[i+4] = uint8(offset >> 24) - return i + 5 - } - - // Offset no more than 2 bytes. - if length > 64 { - // Emit a length 60 copy, encoded as 3 bytes. - // Emit remaining as repeat value (minimum 4 bytes). - dst[2] = uint8(offset >> 8) - dst[1] = uint8(offset) - dst[0] = 59<<2 | tagCopy2 - length -= 60 - // Emit remaining as repeats, at least 4 bytes remain. - return 3 + emitCopyNoRepeat(dst[3:], offset, length) - } - if length >= 12 || offset >= 2048 { - // Emit the remaining copy, encoded as 3 bytes. - dst[2] = uint8(offset >> 8) - dst[1] = uint8(offset) - dst[0] = uint8(length-1)<<2 | tagCopy2 - return 3 - } - // Emit the remaining copy, encoded as 2 bytes. - dst[1] = uint8(offset) - dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 - return 2 -} - -// matchLen returns how many bytes match in a and b -// -// It assumes that: -// len(a) <= len(b) -// -func matchLen(a []byte, b []byte) int { - b = b[:len(a)] - var checked int - if len(a) > 4 { - // Try 4 bytes first - if diff := load32(a, 0) ^ load32(b, 0); diff != 0 { - return bits.TrailingZeros32(diff) >> 3 - } - // Switch to 8 byte matching. - checked = 4 - a = a[4:] - b = b[4:] - for len(a) >= 8 { - b = b[:len(a)] - if diff := load64(a, 0) ^ load64(b, 0); diff != 0 { - return checked + (bits.TrailingZeros64(diff) >> 3) - } - checked += 8 - a = a[8:] - b = b[8:] - } - } - b = b[:len(a)] - for i := range a { - if a[i] != b[i] { - return int(i) + checked - } - } - return len(a) + checked -} diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go deleted file mode 100644 index c8cf7b69e81..00000000000 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go +++ /dev/null @@ -1,189 +0,0 @@ -// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. - -//go:build !appengine && !noasm && gc -// +build !appengine,!noasm,gc - -package s2 - -// encodeBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4294967295 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBlockAsm(dst []byte, src []byte) int - -// encodeBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4194304 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBlockAsm4MB(dst []byte, src []byte) int - -// encodeBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 16383 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBlockAsm12B(dst []byte, src []byte) int - -// encodeBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4095 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBlockAsm10B(dst []byte, src []byte) int - -// encodeBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 511 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBlockAsm8B(dst []byte, src []byte) int - -// encodeBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4294967295 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBetterBlockAsm(dst []byte, src []byte) int - -// encodeBetterBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4194304 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBetterBlockAsm4MB(dst []byte, src []byte) int - -// encodeBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 16383 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBetterBlockAsm12B(dst []byte, src []byte) int - -// encodeBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4095 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBetterBlockAsm10B(dst []byte, src []byte) int - -// encodeBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 511 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBetterBlockAsm8B(dst []byte, src []byte) int - -// encodeSnappyBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4294967295 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBlockAsm(dst []byte, src []byte) int - -// encodeSnappyBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 65535 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBlockAsm64K(dst []byte, src []byte) int - -// encodeSnappyBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 16383 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBlockAsm12B(dst []byte, src []byte) int - -// encodeSnappyBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4095 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBlockAsm10B(dst []byte, src []byte) int - -// encodeSnappyBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 511 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBlockAsm8B(dst []byte, src []byte) int - -// encodeSnappyBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4294967295 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int - -// encodeSnappyBetterBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 65535 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int - -// encodeSnappyBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 16383 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int - -// encodeSnappyBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4095 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int - -// encodeSnappyBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 511 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int - -// emitLiteral writes a literal chunk and returns the number of bytes written. -// -// It assumes that: -// dst is long enough to hold the encoded bytes with margin of 0 bytes -// 0 <= len(lit) && len(lit) <= math.MaxUint32 -// -//go:noescape -func emitLiteral(dst []byte, lit []byte) int - -// emitRepeat writes a repeat chunk and returns the number of bytes written. -// Length must be at least 4 and < 1<<32 -// -//go:noescape -func emitRepeat(dst []byte, offset int, length int) int - -// emitCopy writes a copy chunk and returns the number of bytes written. -// -// It assumes that: -// dst is long enough to hold the encoded bytes -// 1 <= offset && offset <= math.MaxUint32 -// 4 <= length && length <= 1 << 24 -// -//go:noescape -func emitCopy(dst []byte, offset int, length int) int - -// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written. -// -// It assumes that: -// dst is long enough to hold the encoded bytes -// 1 <= offset && offset <= math.MaxUint32 -// 4 <= length && length <= 1 << 24 -// -//go:noescape -func emitCopyNoRepeat(dst []byte, offset int, length int) int - -// matchLen returns how many bytes match in a and b -// -// It assumes that: -// len(a) <= len(b) -// -//go:noescape -func matchLen(a []byte, b []byte) int diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s deleted file mode 100644 index 1ac65a0e352..00000000000 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s +++ /dev/null @@ -1,15678 +0,0 @@ -// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. - -// +build !appengine -// +build !noasm -// +build gc - -#include "textflag.h" - -// func encodeBlockAsm(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeBlockAsm(SB), $65560-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000200, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBlockAsm: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBlockAsm - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBlockAsm: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeBlockAsm - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - SHLQ $0x10, R11 - IMULQ R9, R11 - SHRQ $0x32, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm - LEAL 1(CX), DI - MOVL 12(SP), R8 - MOVL DI, SI - SUBL 16(SP), SI - JZ repeat_extend_back_end_encodeBlockAsm - -repeat_extend_back_loop_encodeBlockAsm: - CMPL DI, R8 - JLE repeat_extend_back_end_encodeBlockAsm - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeBlockAsm - LEAL -1(DI), DI - DECL SI - JNZ repeat_extend_back_loop_encodeBlockAsm - -repeat_extend_back_end_encodeBlockAsm: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeBlockAsm - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeBlockAsm - CMPL SI, $0x00010000 - JLT three_bytes_repeat_emit_encodeBlockAsm - CMPL SI, $0x01000000 - JLT four_bytes_repeat_emit_encodeBlockAsm - MOVB $0xfc, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_repeat_emit_encodeBlockAsm - -four_bytes_repeat_emit_encodeBlockAsm: - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB R11, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_repeat_emit_encodeBlockAsm - -three_bytes_repeat_emit_encodeBlockAsm: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm - -two_bytes_repeat_emit_encodeBlockAsm: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeBlockAsm - JMP memmove_long_repeat_emit_encodeBlockAsm - -one_byte_repeat_emit_encodeBlockAsm: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBlockAsm: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm - -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm - -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm - -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm - -memmove_long_repeat_emit_encodeBlockAsm: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R10)(R13*1), R11 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R10)(R13*1), X4 - MOVOU -16(R10)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R9, R13 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeBlockAsm: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R12, R12 - CMPL R9, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm - -matchlen_loopback_repeat_extend_encodeBlockAsm: - MOVQ (R10)(R12*1), R11 - XORQ (SI)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_repeat_extend_encodeBlockAsm - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm - -matchlen_loop_repeat_extend_encodeBlockAsm: - LEAL -8(R9), R9 - LEAL 8(R12), R12 - CMPL R9, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBlockAsm - -matchlen_single_repeat_extend_encodeBlockAsm: - TESTL R9, R9 - JZ repeat_extend_forward_end_encodeBlockAsm - -matchlen_single_loopback_repeat_extend_encodeBlockAsm: - MOVB (R10)(R12*1), R11 - CMPB (SI)(R12*1), R11 - JNE repeat_extend_forward_end_encodeBlockAsm - LEAL 1(R12), R12 - DECL R9 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm - -repeat_extend_forward_end_encodeBlockAsm: - ADDL R12, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - TESTL R8, R8 - JZ repeat_as_copy_encodeBlockAsm - - // emitRepeat -emit_repeat_again_match_repeat_encodeBlockAsm: - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm - CMPL R8, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm - CMPL DI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBlockAsm - -cant_repeat_two_offset_match_repeat_encodeBlockAsm: - CMPL SI, $0x00000104 - JLT repeat_three_match_repeat_encodeBlockAsm - CMPL SI, $0x00010100 - JLT repeat_four_match_repeat_encodeBlockAsm - CMPL SI, $0x0100ffff - JLT repeat_five_match_repeat_encodeBlockAsm - LEAL -16842747(SI), SI - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_repeat_encodeBlockAsm - -repeat_five_match_repeat_encodeBlockAsm: - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (AX) - MOVW SI, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_four_match_repeat_encodeBlockAsm: - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_three_match_repeat_encodeBlockAsm: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_match_repeat_encodeBlockAsm: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_offset_match_repeat_encodeBlockAsm: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_as_copy_encodeBlockAsm: - // emitCopy - CMPL DI, $0x00010000 - JL two_byte_offset_repeat_as_copy_encodeBlockAsm - -four_bytes_loop_back_repeat_as_copy_encodeBlockAsm: - CMPL SI, $0x40 - JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm - MOVB $0xff, (AX) - MOVL DI, 1(AX) - LEAL -64(SI), SI - ADDQ $0x05, AX - CMPL SI, $0x04 - JL four_bytes_remain_repeat_as_copy_encodeBlockAsm - - // emitRepeat -emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy: - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL R8, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL DI, $0x00000800 - JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: - CMPL SI, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL SI, $0x00010100 - JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL SI, $0x0100ffff - JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy - LEAL -16842747(SI), SI - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy - -repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy: - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (AX) - MOVW SI, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy: - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm - -four_bytes_remain_repeat_as_copy_encodeBlockAsm: - TESTL SI, SI - JZ repeat_end_emit_encodeBlockAsm - MOVB $0x03, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVL DI, 1(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm - -two_byte_offset_repeat_as_copy_encodeBlockAsm: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - - // emitRepeat -emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short: - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL R8, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL DI, $0x00000800 - JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: - CMPL SI, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL SI, $0x00010100 - JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL SI, $0x0100ffff - JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short - LEAL -16842747(SI), SI - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short - -repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short: - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (AX) - MOVW SI, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short: - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm - CMPL DI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -emit_copy_three_repeat_as_copy_encodeBlockAsm: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeBlockAsm: - MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm - -no_repeat_found_encodeBlockAsm: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeBlockAsm - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm - MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm - -candidate3_match_encodeBlockAsm: - ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm - -candidate2_match_encodeBlockAsm: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeBlockAsm: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBlockAsm - -match_extend_back_loop_encodeBlockAsm: - CMPL CX, DI - JLE match_extend_back_end_encodeBlockAsm - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBlockAsm - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBlockAsm - JMP match_extend_back_loop_encodeBlockAsm - -match_extend_back_end_encodeBlockAsm: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 5(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBlockAsm: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeBlockAsm - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeBlockAsm - CMPL R8, $0x00010000 - JLT three_bytes_match_emit_encodeBlockAsm - CMPL R8, $0x01000000 - JLT four_bytes_match_emit_encodeBlockAsm - MOVB $0xfc, (AX) - MOVL R8, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_match_emit_encodeBlockAsm - -four_bytes_match_emit_encodeBlockAsm: - MOVL R8, R10 - SHRL $0x10, R10 - MOVB $0xf8, (AX) - MOVW R8, 1(AX) - MOVB R10, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeBlockAsm - -three_bytes_match_emit_encodeBlockAsm: - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm - -two_bytes_match_emit_encodeBlockAsm: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeBlockAsm - JMP memmove_long_match_emit_encodeBlockAsm - -one_byte_match_emit_encodeBlockAsm: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBlockAsm: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm - -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm - -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm - -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBlockAsm: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeBlockAsm - -memmove_long_match_emit_encodeBlockAsm: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeBlockAsm: -match_nolit_loop_encodeBlockAsm: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm - -matchlen_loopback_match_nolit_encodeBlockAsm: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeBlockAsm - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm - -matchlen_loop_match_nolit_encodeBlockAsm: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBlockAsm - -matchlen_single_match_nolit_encodeBlockAsm: - TESTL DI, DI - JZ match_nolit_end_encodeBlockAsm - -matchlen_single_loopback_match_nolit_encodeBlockAsm: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeBlockAsm - LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm - -match_nolit_end_encodeBlockAsm: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy - CMPL SI, $0x00010000 - JL two_byte_offset_match_nolit_encodeBlockAsm - -four_bytes_loop_back_match_nolit_encodeBlockAsm: - CMPL R10, $0x40 - JLE four_bytes_remain_match_nolit_encodeBlockAsm - MOVB $0xff, (AX) - MOVL SI, 1(AX) - LEAL -64(R10), R10 - ADDQ $0x05, AX - CMPL R10, $0x04 - JL four_bytes_remain_match_nolit_encodeBlockAsm - - // emitRepeat -emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy: - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy - CMPL SI, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy - -cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: - CMPL R10, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy - CMPL R10, $0x00010100 - JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy - CMPL R10, $0x0100ffff - JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy - LEAL -16842747(R10), R10 - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy - -repeat_five_match_nolit_encodeBlockAsm_emit_copy: - LEAL -65536(R10), R10 - MOVL R10, SI - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_four_match_nolit_encodeBlockAsm_emit_copy: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_three_match_nolit_encodeBlockAsm_emit_copy: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_match_nolit_encodeBlockAsm_emit_copy: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - JMP four_bytes_loop_back_match_nolit_encodeBlockAsm - -four_bytes_remain_match_nolit_encodeBlockAsm: - TESTL R10, R10 - JZ match_nolit_emitcopy_end_encodeBlockAsm - MOVB $0x03, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -two_byte_offset_match_nolit_encodeBlockAsm: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBlockAsm - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - - // emitRepeat -emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short: - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short - CMPL SI, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: - CMPL R10, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short - CMPL R10, $0x00010100 - JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short - CMPL R10, $0x0100ffff - JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short - LEAL -16842747(R10), R10 - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short - -repeat_five_match_nolit_encodeBlockAsm_emit_copy_short: - LEAL -65536(R10), R10 - MOVL R10, SI - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_four_match_nolit_encodeBlockAsm_emit_copy_short: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_three_match_nolit_encodeBlockAsm_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_match_nolit_encodeBlockAsm_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - JMP two_byte_offset_match_nolit_encodeBlockAsm - -two_byte_offset_short_match_nolit_encodeBlockAsm: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeBlockAsm - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBlockAsm - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -emit_copy_three_match_nolit_encodeBlockAsm: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeBlockAsm: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBlockAsm - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm: - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x10, R8 - IMULQ R9, R8 - SHRQ $0x32, R8 - SHLQ $0x10, SI - IMULQ R9, SI - SHRQ $0x32, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeBlockAsm - INCL CX - JMP search_loop_encodeBlockAsm - -emit_remainder_encodeBlockAsm: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 5(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBlockAsm: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBlockAsm - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBlockAsm - CMPL DX, $0x00010000 - JLT three_bytes_emit_remainder_encodeBlockAsm - CMPL DX, $0x01000000 - JLT four_bytes_emit_remainder_encodeBlockAsm - MOVB $0xfc, (AX) - MOVL DX, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_emit_remainder_encodeBlockAsm - -four_bytes_emit_remainder_encodeBlockAsm: - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (AX) - MOVW DX, 1(AX) - MOVB BL, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeBlockAsm - -three_bytes_emit_remainder_encodeBlockAsm: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm - -two_bytes_emit_remainder_encodeBlockAsm: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBlockAsm - JMP memmove_long_emit_remainder_encodeBlockAsm - -one_byte_emit_remainder_encodeBlockAsm: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm - -memmove_long_emit_remainder_encodeBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBlockAsm: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBlockAsm4MB(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeBlockAsm4MB(SB), $65560-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000200, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBlockAsm4MB: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBlockAsm4MB - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBlockAsm4MB: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeBlockAsm4MB - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - SHLQ $0x10, R11 - IMULQ R9, R11 - SHRQ $0x32, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm4MB - LEAL 1(CX), DI - MOVL 12(SP), R8 - MOVL DI, SI - SUBL 16(SP), SI - JZ repeat_extend_back_end_encodeBlockAsm4MB - -repeat_extend_back_loop_encodeBlockAsm4MB: - CMPL DI, R8 - JLE repeat_extend_back_end_encodeBlockAsm4MB - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeBlockAsm4MB - LEAL -1(DI), DI - DECL SI - JNZ repeat_extend_back_loop_encodeBlockAsm4MB - -repeat_extend_back_end_encodeBlockAsm4MB: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeBlockAsm4MB - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeBlockAsm4MB - CMPL SI, $0x00010000 - JLT three_bytes_repeat_emit_encodeBlockAsm4MB - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB R11, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_repeat_emit_encodeBlockAsm4MB - -three_bytes_repeat_emit_encodeBlockAsm4MB: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm4MB - -two_bytes_repeat_emit_encodeBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeBlockAsm4MB - JMP memmove_long_repeat_emit_encodeBlockAsm4MB - -one_byte_repeat_emit_encodeBlockAsm4MB: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBlockAsm4MB: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB - -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB - -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB - -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm4MB: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB - -memmove_long_repeat_emit_encodeBlockAsm4MB: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(R10)(R13*1), R11 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(R10)(R13*1), X4 - MOVOU -16(R10)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R9, R13 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeBlockAsm4MB: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R12, R12 - CMPL R9, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm4MB - -matchlen_loopback_repeat_extend_encodeBlockAsm4MB: - MOVQ (R10)(R12*1), R11 - XORQ (SI)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm4MB - -matchlen_loop_repeat_extend_encodeBlockAsm4MB: - LEAL -8(R9), R9 - LEAL 8(R12), R12 - CMPL R9, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBlockAsm4MB - -matchlen_single_repeat_extend_encodeBlockAsm4MB: - TESTL R9, R9 - JZ repeat_extend_forward_end_encodeBlockAsm4MB - -matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB: - MOVB (R10)(R12*1), R11 - CMPB (SI)(R12*1), R11 - JNE repeat_extend_forward_end_encodeBlockAsm4MB - LEAL 1(R12), R12 - DECL R9 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB - -repeat_extend_forward_end_encodeBlockAsm4MB: - ADDL R12, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - TESTL R8, R8 - JZ repeat_as_copy_encodeBlockAsm4MB - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm4MB - CMPL R8, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB - CMPL DI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBlockAsm4MB - -cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB: - CMPL SI, $0x00000104 - JLT repeat_three_match_repeat_encodeBlockAsm4MB - CMPL SI, $0x00010100 - JLT repeat_four_match_repeat_encodeBlockAsm4MB - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (AX) - MOVW SI, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_four_match_repeat_encodeBlockAsm4MB: - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_three_match_repeat_encodeBlockAsm4MB: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_match_repeat_encodeBlockAsm4MB: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_offset_match_repeat_encodeBlockAsm4MB: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_as_copy_encodeBlockAsm4MB: - // emitCopy - CMPL DI, $0x00010000 - JL two_byte_offset_repeat_as_copy_encodeBlockAsm4MB - -four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB: - CMPL SI, $0x40 - JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB - MOVB $0xff, (AX) - MOVL DI, 1(AX) - LEAL -64(SI), SI - ADDQ $0x05, AX - CMPL SI, $0x04 - JL four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy - CMPL R8, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy - CMPL DI, $0x00000800 - JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - CMPL SI, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy - CMPL SI, $0x00010100 - JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (AX) - MOVW SI, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB - -four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB: - TESTL SI, SI - JZ repeat_end_emit_encodeBlockAsm4MB - MOVB $0x03, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVL DI, 1(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -two_byte_offset_repeat_as_copy_encodeBlockAsm4MB: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - CMPL R8, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - CMPL DI, $0x00000800 - JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - CMPL SI, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - CMPL SI, $0x00010100 - JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (AX) - MOVW SI, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB - CMPL DI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -emit_copy_three_repeat_as_copy_encodeBlockAsm4MB: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeBlockAsm4MB: - MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm4MB - -no_repeat_found_encodeBlockAsm4MB: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm4MB - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeBlockAsm4MB - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm4MB - MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm4MB - -candidate3_match_encodeBlockAsm4MB: - ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm4MB - -candidate2_match_encodeBlockAsm4MB: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeBlockAsm4MB: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBlockAsm4MB - -match_extend_back_loop_encodeBlockAsm4MB: - CMPL CX, DI - JLE match_extend_back_end_encodeBlockAsm4MB - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBlockAsm4MB - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBlockAsm4MB - JMP match_extend_back_loop_encodeBlockAsm4MB - -match_extend_back_end_encodeBlockAsm4MB: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBlockAsm4MB - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBlockAsm4MB: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeBlockAsm4MB - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeBlockAsm4MB - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeBlockAsm4MB - CMPL R8, $0x00010000 - JLT three_bytes_match_emit_encodeBlockAsm4MB - MOVL R8, R10 - SHRL $0x10, R10 - MOVB $0xf8, (AX) - MOVW R8, 1(AX) - MOVB R10, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeBlockAsm4MB - -three_bytes_match_emit_encodeBlockAsm4MB: - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm4MB - -two_bytes_match_emit_encodeBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeBlockAsm4MB - JMP memmove_long_match_emit_encodeBlockAsm4MB - -one_byte_match_emit_encodeBlockAsm4MB: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBlockAsm4MB: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBlockAsm4MB: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeBlockAsm4MB - -memmove_long_match_emit_encodeBlockAsm4MB: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeBlockAsm4MB: -match_nolit_loop_encodeBlockAsm4MB: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm4MB - -matchlen_loopback_match_nolit_encodeBlockAsm4MB: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeBlockAsm4MB - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm4MB - -matchlen_loop_match_nolit_encodeBlockAsm4MB: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBlockAsm4MB - -matchlen_single_match_nolit_encodeBlockAsm4MB: - TESTL DI, DI - JZ match_nolit_end_encodeBlockAsm4MB - -matchlen_single_loopback_match_nolit_encodeBlockAsm4MB: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeBlockAsm4MB - LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm4MB - -match_nolit_end_encodeBlockAsm4MB: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy - CMPL SI, $0x00010000 - JL two_byte_offset_match_nolit_encodeBlockAsm4MB - -four_bytes_loop_back_match_nolit_encodeBlockAsm4MB: - CMPL R10, $0x40 - JLE four_bytes_remain_match_nolit_encodeBlockAsm4MB - MOVB $0xff, (AX) - MOVL SI, 1(AX) - LEAL -64(R10), R10 - ADDQ $0x05, AX - CMPL R10, $0x04 - JL four_bytes_remain_match_nolit_encodeBlockAsm4MB - - // emitRepeat - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy - CMPL SI, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy - -cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: - CMPL R10, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy - CMPL R10, $0x00010100 - JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy - LEAL -65536(R10), R10 - MOVL R10, SI - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - JMP four_bytes_loop_back_match_nolit_encodeBlockAsm4MB - -four_bytes_remain_match_nolit_encodeBlockAsm4MB: - TESTL R10, R10 - JZ match_nolit_emitcopy_end_encodeBlockAsm4MB - MOVB $0x03, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -two_byte_offset_match_nolit_encodeBlockAsm4MB: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBlockAsm4MB - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - - // emitRepeat - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short - CMPL SI, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: - CMPL R10, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short - CMPL R10, $0x00010100 - JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short - LEAL -65536(R10), R10 - MOVL R10, SI - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - JMP two_byte_offset_match_nolit_encodeBlockAsm4MB - -two_byte_offset_short_match_nolit_encodeBlockAsm4MB: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeBlockAsm4MB - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBlockAsm4MB - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -emit_copy_three_match_nolit_encodeBlockAsm4MB: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeBlockAsm4MB: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBlockAsm4MB - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBlockAsm4MB - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm4MB: - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x10, R8 - IMULQ R9, R8 - SHRQ $0x32, R8 - SHLQ $0x10, SI - IMULQ R9, SI - SHRQ $0x32, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeBlockAsm4MB - INCL CX - JMP search_loop_encodeBlockAsm4MB - -emit_remainder_encodeBlockAsm4MB: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBlockAsm4MB - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBlockAsm4MB: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBlockAsm4MB - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBlockAsm4MB - CMPL DX, $0x00010000 - JLT three_bytes_emit_remainder_encodeBlockAsm4MB - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (AX) - MOVW DX, 1(AX) - MOVB BL, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeBlockAsm4MB - -three_bytes_emit_remainder_encodeBlockAsm4MB: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm4MB - -two_bytes_emit_remainder_encodeBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBlockAsm4MB - JMP memmove_long_emit_remainder_encodeBlockAsm4MB - -one_byte_emit_remainder_encodeBlockAsm4MB: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBlockAsm4MB: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm4MB: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB - -memmove_long_emit_remainder_encodeBlockAsm4MB: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBlockAsm4MB: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBlockAsm12B(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeBlockAsm12B(SB), $16408-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000080, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBlockAsm12B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBlockAsm12B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBlockAsm12B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeBlockAsm12B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x18, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - SHLQ $0x18, R11 - IMULQ R9, R11 - SHRQ $0x34, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x18, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm12B - LEAL 1(CX), DI - MOVL 12(SP), R8 - MOVL DI, SI - SUBL 16(SP), SI - JZ repeat_extend_back_end_encodeBlockAsm12B - -repeat_extend_back_loop_encodeBlockAsm12B: - CMPL DI, R8 - JLE repeat_extend_back_end_encodeBlockAsm12B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeBlockAsm12B - LEAL -1(DI), DI - DECL SI - JNZ repeat_extend_back_loop_encodeBlockAsm12B - -repeat_extend_back_end_encodeBlockAsm12B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeBlockAsm12B - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeBlockAsm12B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm12B - -two_bytes_repeat_emit_encodeBlockAsm12B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeBlockAsm12B - JMP memmove_long_repeat_emit_encodeBlockAsm12B - -one_byte_repeat_emit_encodeBlockAsm12B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBlockAsm12B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm12B: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm12B - -memmove_long_repeat_emit_encodeBlockAsm12B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R13*1), R11 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R13*1), X4 - MOVOU -16(R10)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R9, R13 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeBlockAsm12B: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R12, R12 - CMPL R9, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm12B - -matchlen_loopback_repeat_extend_encodeBlockAsm12B: - MOVQ (R10)(R12*1), R11 - XORQ (SI)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_repeat_extend_encodeBlockAsm12B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm12B - -matchlen_loop_repeat_extend_encodeBlockAsm12B: - LEAL -8(R9), R9 - LEAL 8(R12), R12 - CMPL R9, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBlockAsm12B - -matchlen_single_repeat_extend_encodeBlockAsm12B: - TESTL R9, R9 - JZ repeat_extend_forward_end_encodeBlockAsm12B - -matchlen_single_loopback_repeat_extend_encodeBlockAsm12B: - MOVB (R10)(R12*1), R11 - CMPB (SI)(R12*1), R11 - JNE repeat_extend_forward_end_encodeBlockAsm12B - LEAL 1(R12), R12 - DECL R9 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm12B - -repeat_extend_forward_end_encodeBlockAsm12B: - ADDL R12, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - TESTL R8, R8 - JZ repeat_as_copy_encodeBlockAsm12B - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm12B - CMPL R8, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B - CMPL DI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBlockAsm12B - -cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: - CMPL SI, $0x00000104 - JLT repeat_three_match_repeat_encodeBlockAsm12B - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_three_match_repeat_encodeBlockAsm12B: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_match_repeat_encodeBlockAsm12B: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_offset_match_repeat_encodeBlockAsm12B: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_as_copy_encodeBlockAsm12B: - // emitCopy -two_byte_offset_repeat_as_copy_encodeBlockAsm12B: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - CMPL R8, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - CMPL DI, $0x00000800 - JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - CMPL SI, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B - CMPL DI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - -emit_copy_three_repeat_as_copy_encodeBlockAsm12B: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeBlockAsm12B: - MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm12B - -no_repeat_found_encodeBlockAsm12B: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm12B - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeBlockAsm12B - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm12B - -candidate3_match_encodeBlockAsm12B: - ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm12B - -candidate2_match_encodeBlockAsm12B: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeBlockAsm12B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBlockAsm12B - -match_extend_back_loop_encodeBlockAsm12B: - CMPL CX, DI - JLE match_extend_back_end_encodeBlockAsm12B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBlockAsm12B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBlockAsm12B - JMP match_extend_back_loop_encodeBlockAsm12B - -match_extend_back_end_encodeBlockAsm12B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBlockAsm12B: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeBlockAsm12B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeBlockAsm12B - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeBlockAsm12B - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm12B - -two_bytes_match_emit_encodeBlockAsm12B: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeBlockAsm12B - JMP memmove_long_match_emit_encodeBlockAsm12B - -one_byte_match_emit_encodeBlockAsm12B: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBlockAsm12B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B - -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B - -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B - -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBlockAsm12B: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeBlockAsm12B - -memmove_long_match_emit_encodeBlockAsm12B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeBlockAsm12B: -match_nolit_loop_encodeBlockAsm12B: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm12B - -matchlen_loopback_match_nolit_encodeBlockAsm12B: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeBlockAsm12B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm12B - -matchlen_loop_match_nolit_encodeBlockAsm12B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBlockAsm12B - -matchlen_single_match_nolit_encodeBlockAsm12B: - TESTL DI, DI - JZ match_nolit_end_encodeBlockAsm12B - -matchlen_single_loopback_match_nolit_encodeBlockAsm12B: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeBlockAsm12B - LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm12B - -match_nolit_end_encodeBlockAsm12B: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeBlockAsm12B: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - - // emitRepeat - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short - CMPL SI, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: - CMPL R10, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - JMP two_byte_offset_match_nolit_encodeBlockAsm12B - -two_byte_offset_short_match_nolit_encodeBlockAsm12B: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeBlockAsm12B - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBlockAsm12B - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -emit_copy_three_match_nolit_encodeBlockAsm12B: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeBlockAsm12B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBlockAsm12B - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm12B: - MOVQ $0x000000cf1bbcdcbb, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x18, R8 - IMULQ R9, R8 - SHRQ $0x34, R8 - SHLQ $0x18, SI - IMULQ R9, SI - SHRQ $0x34, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeBlockAsm12B - INCL CX - JMP search_loop_encodeBlockAsm12B - -emit_remainder_encodeBlockAsm12B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBlockAsm12B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBlockAsm12B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBlockAsm12B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm12B - -two_bytes_emit_remainder_encodeBlockAsm12B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBlockAsm12B - JMP memmove_long_emit_remainder_encodeBlockAsm12B - -one_byte_emit_remainder_encodeBlockAsm12B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm12B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm12B - -memmove_long_emit_remainder_encodeBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBlockAsm12B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBlockAsm10B(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeBlockAsm10B(SB), $4120-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000020, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBlockAsm10B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBlockAsm10B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBlockAsm10B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeBlockAsm10B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x9e3779b1, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ R9, R11 - SHRQ $0x36, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm10B - LEAL 1(CX), DI - MOVL 12(SP), R8 - MOVL DI, SI - SUBL 16(SP), SI - JZ repeat_extend_back_end_encodeBlockAsm10B - -repeat_extend_back_loop_encodeBlockAsm10B: - CMPL DI, R8 - JLE repeat_extend_back_end_encodeBlockAsm10B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeBlockAsm10B - LEAL -1(DI), DI - DECL SI - JNZ repeat_extend_back_loop_encodeBlockAsm10B - -repeat_extend_back_end_encodeBlockAsm10B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeBlockAsm10B - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeBlockAsm10B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm10B - -two_bytes_repeat_emit_encodeBlockAsm10B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeBlockAsm10B - JMP memmove_long_repeat_emit_encodeBlockAsm10B - -one_byte_repeat_emit_encodeBlockAsm10B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBlockAsm10B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm10B: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm10B - -memmove_long_repeat_emit_encodeBlockAsm10B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R13*1), R11 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R13*1), X4 - MOVOU -16(R10)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R9, R13 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeBlockAsm10B: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R12, R12 - CMPL R9, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm10B - -matchlen_loopback_repeat_extend_encodeBlockAsm10B: - MOVQ (R10)(R12*1), R11 - XORQ (SI)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_repeat_extend_encodeBlockAsm10B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm10B - -matchlen_loop_repeat_extend_encodeBlockAsm10B: - LEAL -8(R9), R9 - LEAL 8(R12), R12 - CMPL R9, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B - -matchlen_single_repeat_extend_encodeBlockAsm10B: - TESTL R9, R9 - JZ repeat_extend_forward_end_encodeBlockAsm10B - -matchlen_single_loopback_repeat_extend_encodeBlockAsm10B: - MOVB (R10)(R12*1), R11 - CMPB (SI)(R12*1), R11 - JNE repeat_extend_forward_end_encodeBlockAsm10B - LEAL 1(R12), R12 - DECL R9 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm10B - -repeat_extend_forward_end_encodeBlockAsm10B: - ADDL R12, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - TESTL R8, R8 - JZ repeat_as_copy_encodeBlockAsm10B - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm10B - CMPL R8, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B - CMPL DI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBlockAsm10B - -cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: - CMPL SI, $0x00000104 - JLT repeat_three_match_repeat_encodeBlockAsm10B - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_three_match_repeat_encodeBlockAsm10B: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_match_repeat_encodeBlockAsm10B: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_offset_match_repeat_encodeBlockAsm10B: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_as_copy_encodeBlockAsm10B: - // emitCopy -two_byte_offset_repeat_as_copy_encodeBlockAsm10B: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - CMPL R8, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - CMPL DI, $0x00000800 - JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - CMPL SI, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B - CMPL DI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - -emit_copy_three_repeat_as_copy_encodeBlockAsm10B: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeBlockAsm10B: - MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm10B - -no_repeat_found_encodeBlockAsm10B: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm10B - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeBlockAsm10B - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm10B - MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm10B - -candidate3_match_encodeBlockAsm10B: - ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm10B - -candidate2_match_encodeBlockAsm10B: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeBlockAsm10B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBlockAsm10B - -match_extend_back_loop_encodeBlockAsm10B: - CMPL CX, DI - JLE match_extend_back_end_encodeBlockAsm10B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBlockAsm10B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBlockAsm10B - JMP match_extend_back_loop_encodeBlockAsm10B - -match_extend_back_end_encodeBlockAsm10B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBlockAsm10B: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeBlockAsm10B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeBlockAsm10B - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeBlockAsm10B - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm10B - -two_bytes_match_emit_encodeBlockAsm10B: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeBlockAsm10B - JMP memmove_long_match_emit_encodeBlockAsm10B - -one_byte_match_emit_encodeBlockAsm10B: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBlockAsm10B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B - -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B - -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B - -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBlockAsm10B: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeBlockAsm10B - -memmove_long_match_emit_encodeBlockAsm10B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeBlockAsm10B: -match_nolit_loop_encodeBlockAsm10B: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm10B - -matchlen_loopback_match_nolit_encodeBlockAsm10B: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeBlockAsm10B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm10B - -matchlen_loop_match_nolit_encodeBlockAsm10B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBlockAsm10B - -matchlen_single_match_nolit_encodeBlockAsm10B: - TESTL DI, DI - JZ match_nolit_end_encodeBlockAsm10B - -matchlen_single_loopback_match_nolit_encodeBlockAsm10B: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeBlockAsm10B - LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm10B - -match_nolit_end_encodeBlockAsm10B: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeBlockAsm10B: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - - // emitRepeat - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short - CMPL SI, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: - CMPL R10, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - JMP two_byte_offset_match_nolit_encodeBlockAsm10B - -two_byte_offset_short_match_nolit_encodeBlockAsm10B: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeBlockAsm10B - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBlockAsm10B - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -emit_copy_three_match_nolit_encodeBlockAsm10B: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeBlockAsm10B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBlockAsm10B - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm10B: - MOVQ $0x9e3779b1, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x20, R8 - IMULQ R9, R8 - SHRQ $0x36, R8 - SHLQ $0x20, SI - IMULQ R9, SI - SHRQ $0x36, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeBlockAsm10B - INCL CX - JMP search_loop_encodeBlockAsm10B - -emit_remainder_encodeBlockAsm10B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBlockAsm10B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBlockAsm10B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBlockAsm10B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm10B - -two_bytes_emit_remainder_encodeBlockAsm10B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBlockAsm10B - JMP memmove_long_emit_remainder_encodeBlockAsm10B - -one_byte_emit_remainder_encodeBlockAsm10B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm10B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm10B - -memmove_long_emit_remainder_encodeBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBlockAsm10B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBlockAsm8B(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeBlockAsm8B(SB), $1048-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000008, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBlockAsm8B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBlockAsm8B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBlockAsm8B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x04, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeBlockAsm8B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x9e3779b1, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x38, R10 - SHLQ $0x20, R11 - IMULQ R9, R11 - SHRQ $0x38, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x38, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm8B - LEAL 1(CX), DI - MOVL 12(SP), R8 - MOVL DI, SI - SUBL 16(SP), SI - JZ repeat_extend_back_end_encodeBlockAsm8B - -repeat_extend_back_loop_encodeBlockAsm8B: - CMPL DI, R8 - JLE repeat_extend_back_end_encodeBlockAsm8B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeBlockAsm8B - LEAL -1(DI), DI - DECL SI - JNZ repeat_extend_back_loop_encodeBlockAsm8B - -repeat_extend_back_end_encodeBlockAsm8B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeBlockAsm8B - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeBlockAsm8B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm8B - -two_bytes_repeat_emit_encodeBlockAsm8B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeBlockAsm8B - JMP memmove_long_repeat_emit_encodeBlockAsm8B - -one_byte_repeat_emit_encodeBlockAsm8B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBlockAsm8B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm8B: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm8B - -memmove_long_repeat_emit_encodeBlockAsm8B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R13*1), R11 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R13*1), X4 - MOVOU -16(R10)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R9, R13 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeBlockAsm8B: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R12, R12 - CMPL R9, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm8B - -matchlen_loopback_repeat_extend_encodeBlockAsm8B: - MOVQ (R10)(R12*1), R11 - XORQ (SI)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_repeat_extend_encodeBlockAsm8B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm8B - -matchlen_loop_repeat_extend_encodeBlockAsm8B: - LEAL -8(R9), R9 - LEAL 8(R12), R12 - CMPL R9, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B - -matchlen_single_repeat_extend_encodeBlockAsm8B: - TESTL R9, R9 - JZ repeat_extend_forward_end_encodeBlockAsm8B - -matchlen_single_loopback_repeat_extend_encodeBlockAsm8B: - MOVB (R10)(R12*1), R11 - CMPB (SI)(R12*1), R11 - JNE repeat_extend_forward_end_encodeBlockAsm8B - LEAL 1(R12), R12 - DECL R9 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm8B - -repeat_extend_forward_end_encodeBlockAsm8B: - ADDL R12, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - TESTL R8, R8 - JZ repeat_as_copy_encodeBlockAsm8B - - // emitRepeat - MOVL SI, DI - LEAL -4(SI), SI - CMPL DI, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm8B - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B - -cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: - CMPL SI, $0x00000104 - JLT repeat_three_match_repeat_encodeBlockAsm8B - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_three_match_repeat_encodeBlockAsm8B: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_two_match_repeat_encodeBlockAsm8B: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_as_copy_encodeBlockAsm8B: - // emitCopy -two_byte_offset_repeat_as_copy_encodeBlockAsm8B: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - - // emitRepeat - MOVL SI, DI - LEAL -4(SI), SI - CMPL DI, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - CMPL DI, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - CMPL SI, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8B - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B - -emit_copy_three_repeat_as_copy_encodeBlockAsm8B: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeBlockAsm8B: - MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm8B - -no_repeat_found_encodeBlockAsm8B: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm8B - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeBlockAsm8B - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm8B - MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm8B - -candidate3_match_encodeBlockAsm8B: - ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm8B - -candidate2_match_encodeBlockAsm8B: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeBlockAsm8B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBlockAsm8B - -match_extend_back_loop_encodeBlockAsm8B: - CMPL CX, DI - JLE match_extend_back_end_encodeBlockAsm8B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBlockAsm8B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBlockAsm8B - JMP match_extend_back_loop_encodeBlockAsm8B - -match_extend_back_end_encodeBlockAsm8B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBlockAsm8B: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeBlockAsm8B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeBlockAsm8B - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeBlockAsm8B - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm8B - -two_bytes_match_emit_encodeBlockAsm8B: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeBlockAsm8B - JMP memmove_long_match_emit_encodeBlockAsm8B - -one_byte_match_emit_encodeBlockAsm8B: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBlockAsm8B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B - -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B - -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B - -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBlockAsm8B: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeBlockAsm8B - -memmove_long_match_emit_encodeBlockAsm8B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeBlockAsm8B: -match_nolit_loop_encodeBlockAsm8B: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm8B - -matchlen_loopback_match_nolit_encodeBlockAsm8B: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeBlockAsm8B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm8B - -matchlen_loop_match_nolit_encodeBlockAsm8B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBlockAsm8B - -matchlen_single_match_nolit_encodeBlockAsm8B: - TESTL DI, DI - JZ match_nolit_end_encodeBlockAsm8B - -matchlen_single_loopback_match_nolit_encodeBlockAsm8B: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeBlockAsm8B - LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm8B - -match_nolit_end_encodeBlockAsm8B: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeBlockAsm8B: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - - // emitRepeat - MOVL R10, SI - LEAL -4(R10), R10 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short: - CMPL R10, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - JMP two_byte_offset_match_nolit_encodeBlockAsm8B - -two_byte_offset_short_match_nolit_encodeBlockAsm8B: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeBlockAsm8B - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -emit_copy_three_match_nolit_encodeBlockAsm8B: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeBlockAsm8B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBlockAsm8B - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm8B: - MOVQ $0x9e3779b1, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x20, R8 - IMULQ R9, R8 - SHRQ $0x38, R8 - SHLQ $0x20, SI - IMULQ R9, SI - SHRQ $0x38, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeBlockAsm8B - INCL CX - JMP search_loop_encodeBlockAsm8B - -emit_remainder_encodeBlockAsm8B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBlockAsm8B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBlockAsm8B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBlockAsm8B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm8B - -two_bytes_emit_remainder_encodeBlockAsm8B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBlockAsm8B - JMP memmove_long_emit_remainder_encodeBlockAsm8B - -one_byte_emit_remainder_encodeBlockAsm8B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm8B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm8B - -memmove_long_emit_remainder_encodeBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBlockAsm8B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBetterBlockAsm(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeBetterBlockAsm(SB), $327704-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000a00, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBetterBlockAsm - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -6(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBetterBlockAsm: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x07, SI - CMPL SI, $0x63 - JLE check_maxskip_ok_encodeBetterBlockAsm - LEAL 100(CX), SI - JMP check_maxskip_cont_encodeBetterBlockAsm - -check_maxskip_ok_encodeBetterBlockAsm: - LEAL 1(CX)(SI*1), SI - -check_maxskip_cont_encodeBetterBlockAsm: - CMPL SI, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x32, R11 - MOVL 24(SP)(R10*4), SI - MOVL 262168(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm - -candidateS_match_encodeBetterBlockAsm: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm - DECL CX - MOVL R8, SI - -candidate_match_encodeBetterBlockAsm: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBetterBlockAsm - -match_extend_back_loop_encodeBetterBlockAsm: - CMPL CX, DI - JLE match_extend_back_end_encodeBetterBlockAsm - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBetterBlockAsm - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBetterBlockAsm - JMP match_extend_back_loop_encodeBetterBlockAsm - -match_extend_back_end_encodeBetterBlockAsm: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 5(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBetterBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm - -matchlen_loopback_match_nolit_encodeBetterBlockAsm: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm - -matchlen_loop_match_nolit_encodeBetterBlockAsm: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm - -matchlen_single_match_nolit_encodeBetterBlockAsm: - TESTL R8, R8 - JZ match_nolit_end_encodeBetterBlockAsm - -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeBetterBlockAsm - LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm - -match_nolit_end_encodeBetterBlockAsm: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - CMPL 16(SP), R8 - JEQ match_is_repeat_encodeBetterBlockAsm - CMPL R12, $0x01 - JG match_length_ok_encodeBetterBlockAsm - CMPL R8, $0x0000ffff - JLE match_length_ok_encodeBetterBlockAsm - MOVL 20(SP), CX - INCL CX - JMP search_loop_encodeBetterBlockAsm - -match_length_ok_encodeBetterBlockAsm: - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeBetterBlockAsm - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeBetterBlockAsm - CMPL SI, $0x00010000 - JLT three_bytes_match_emit_encodeBetterBlockAsm - CMPL SI, $0x01000000 - JLT four_bytes_match_emit_encodeBetterBlockAsm - MOVB $0xfc, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm - -four_bytes_match_emit_encodeBetterBlockAsm: - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB R11, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm - -three_bytes_match_emit_encodeBetterBlockAsm: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm - -two_bytes_match_emit_encodeBetterBlockAsm: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeBetterBlockAsm - JMP memmove_long_match_emit_encodeBetterBlockAsm - -one_byte_match_emit_encodeBetterBlockAsm: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBetterBlockAsm: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (AX) - MOVL R10, -4(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm - -memmove_long_match_emit_encodeBetterBlockAsm: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeBetterBlockAsm: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy - CMPL R8, $0x00010000 - JL two_byte_offset_match_nolit_encodeBetterBlockAsm - -four_bytes_loop_back_match_nolit_encodeBetterBlockAsm: - CMPL R12, $0x40 - JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm - MOVB $0xff, (AX) - MOVL R8, 1(AX) - LEAL -64(R12), R12 - ADDQ $0x05, AX - CMPL R12, $0x04 - JL four_bytes_remain_match_nolit_encodeBetterBlockAsm - - // emitRepeat -emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy: - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL R12, $0x00010100 - JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL R12, $0x0100ffff - JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy - LEAL -16842747(R12), R12 - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy - -repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy: - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (AX) - MOVW R12, 2(AX) - SARL $0x10, R8 - MOVB R8, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy: - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm - -four_bytes_remain_match_nolit_encodeBetterBlockAsm: - TESTL R12, R12 - JZ match_nolit_emitcopy_end_encodeBetterBlockAsm - MOVB $0x03, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVL R8, 1(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -two_byte_offset_match_nolit_encodeBetterBlockAsm: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - - // emitRepeat -emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short: - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL R12, $0x00010100 - JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL R12, $0x0100ffff - JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short - LEAL -16842747(R12), R12 - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short - -repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short: - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (AX) - MOVW R12, 2(AX) - SARL $0x10, R8 - MOVB R8, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short: - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm - CMPL R8, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -emit_copy_three_match_nolit_encodeBetterBlockAsm: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -match_is_repeat_encodeBetterBlockAsm: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_repeat_encodeBetterBlockAsm - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm - CMPL SI, $0x00010000 - JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm - CMPL SI, $0x01000000 - JLT four_bytes_match_emit_repeat_encodeBetterBlockAsm - MOVB $0xfc, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm - -four_bytes_match_emit_repeat_encodeBetterBlockAsm: - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB R11, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm - -three_bytes_match_emit_repeat_encodeBetterBlockAsm: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm - -two_bytes_match_emit_repeat_encodeBetterBlockAsm: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_repeat_encodeBetterBlockAsm - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm - -one_byte_match_emit_repeat_encodeBetterBlockAsm: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_repeat_encodeBetterBlockAsm: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (AX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (AX) - MOVL R10, -4(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm: - MOVQ SI, AX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm - -memmove_long_match_emit_repeat_encodeBetterBlockAsm: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitRepeat -emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm: - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm - CMPL R12, $0x00010100 - JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm - CMPL R12, $0x0100ffff - JLT repeat_five_match_nolit_repeat_encodeBetterBlockAsm - LEAL -16842747(R12), R12 - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm - -repeat_five_match_nolit_repeat_encodeBetterBlockAsm: - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (AX) - MOVW R12, 2(AX) - SARL $0x10, R8 - MOVB R8, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_four_match_nolit_repeat_encodeBetterBlockAsm: - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - -match_nolit_emitcopy_end_encodeBetterBlockAsm: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBetterBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm: - MOVQ $0x00cf1bbcdcbfa563, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm - -emit_remainder_encodeBetterBlockAsm: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 5(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBetterBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBetterBlockAsm - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBetterBlockAsm - CMPL DX, $0x00010000 - JLT three_bytes_emit_remainder_encodeBetterBlockAsm - CMPL DX, $0x01000000 - JLT four_bytes_emit_remainder_encodeBetterBlockAsm - MOVB $0xfc, (AX) - MOVL DX, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm - -four_bytes_emit_remainder_encodeBetterBlockAsm: - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (AX) - MOVW DX, 1(AX) - MOVB BL, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm - -three_bytes_emit_remainder_encodeBetterBlockAsm: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm - -two_bytes_emit_remainder_encodeBetterBlockAsm: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBetterBlockAsm - JMP memmove_long_emit_remainder_encodeBetterBlockAsm - -one_byte_emit_remainder_encodeBetterBlockAsm: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBetterBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm - -memmove_long_emit_remainder_encodeBetterBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000a00, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm4MB: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBetterBlockAsm4MB - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -6(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBetterBlockAsm4MB: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x07, SI - CMPL SI, $0x63 - JLE check_maxskip_ok_encodeBetterBlockAsm4MB - LEAL 100(CX), SI - JMP check_maxskip_cont_encodeBetterBlockAsm4MB - -check_maxskip_ok_encodeBetterBlockAsm4MB: - LEAL 1(CX)(SI*1), SI - -check_maxskip_cont_encodeBetterBlockAsm4MB: - CMPL SI, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm4MB - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x32, R11 - MOVL 24(SP)(R10*4), SI - MOVL 262168(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm4MB - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm4MB - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm4MB - -candidateS_match_encodeBetterBlockAsm4MB: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm4MB - DECL CX - MOVL R8, SI - -candidate_match_encodeBetterBlockAsm4MB: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBetterBlockAsm4MB - -match_extend_back_loop_encodeBetterBlockAsm4MB: - CMPL CX, DI - JLE match_extend_back_end_encodeBetterBlockAsm4MB - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBetterBlockAsm4MB - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBetterBlockAsm4MB - JMP match_extend_back_loop_encodeBetterBlockAsm4MB - -match_extend_back_end_encodeBetterBlockAsm4MB: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBetterBlockAsm4MB - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm4MB: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm4MB - -matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm4MB - -matchlen_loop_match_nolit_encodeBetterBlockAsm4MB: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB - -matchlen_single_match_nolit_encodeBetterBlockAsm4MB: - TESTL R8, R8 - JZ match_nolit_end_encodeBetterBlockAsm4MB - -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeBetterBlockAsm4MB - LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB - -match_nolit_end_encodeBetterBlockAsm4MB: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - CMPL 16(SP), R8 - JEQ match_is_repeat_encodeBetterBlockAsm4MB - CMPL R12, $0x01 - JG match_length_ok_encodeBetterBlockAsm4MB - CMPL R8, $0x0000ffff - JLE match_length_ok_encodeBetterBlockAsm4MB - MOVL 20(SP), CX - INCL CX - JMP search_loop_encodeBetterBlockAsm4MB - -match_length_ok_encodeBetterBlockAsm4MB: - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeBetterBlockAsm4MB - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeBetterBlockAsm4MB - CMPL SI, $0x00010000 - JLT three_bytes_match_emit_encodeBetterBlockAsm4MB - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB R11, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm4MB - -three_bytes_match_emit_encodeBetterBlockAsm4MB: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm4MB - -two_bytes_match_emit_encodeBetterBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeBetterBlockAsm4MB - JMP memmove_long_match_emit_encodeBetterBlockAsm4MB - -one_byte_match_emit_encodeBetterBlockAsm4MB: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBetterBlockAsm4MB: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (AX) - MOVL R10, -4(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm4MB: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB - -memmove_long_match_emit_encodeBetterBlockAsm4MB: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeBetterBlockAsm4MB: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy - CMPL R8, $0x00010000 - JL two_byte_offset_match_nolit_encodeBetterBlockAsm4MB - -four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB: - CMPL R12, $0x40 - JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB - MOVB $0xff, (AX) - MOVL R8, 1(AX) - LEAL -64(R12), R12 - ADDQ $0x05, AX - CMPL R12, $0x04 - JL four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy - CMPL R12, $0x00010100 - JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (AX) - MOVW R12, 2(AX) - SARL $0x10, R8 - MOVB R8, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB - -four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB: - TESTL R12, R12 - JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - MOVB $0x03, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVL R8, 1(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -two_byte_offset_match_nolit_encodeBetterBlockAsm4MB: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - CMPL R12, $0x00010100 - JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (AX) - MOVW R12, 2(AX) - SARL $0x10, R8 - MOVB R8, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm4MB - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB - CMPL R8, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -emit_copy_three_match_nolit_encodeBetterBlockAsm4MB: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -match_is_repeat_encodeBetterBlockAsm4MB: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_repeat_encodeBetterBlockAsm4MB - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB - CMPL SI, $0x00010000 - JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB R11, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB - -three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB - -two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_repeat_encodeBetterBlockAsm4MB - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB - -one_byte_match_emit_repeat_encodeBetterBlockAsm4MB: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_repeat_encodeBetterBlockAsm4MB: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (AX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (AX) - MOVL R10, -4(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB: - MOVQ SI, AX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB - -memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB - CMPL R12, $0x00010100 - JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (AX) - MOVW R12, 2(AX) - SARL $0x10, R8 - MOVB R8, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB: - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - -match_nolit_emitcopy_end_encodeBetterBlockAsm4MB: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm4MB - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBetterBlockAsm4MB - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm4MB: - MOVQ $0x00cf1bbcdcbfa563, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm4MB - -emit_remainder_encodeBetterBlockAsm4MB: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBetterBlockAsm4MB - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm4MB: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBetterBlockAsm4MB - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBetterBlockAsm4MB - CMPL DX, $0x00010000 - JLT three_bytes_emit_remainder_encodeBetterBlockAsm4MB - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (AX) - MOVW DX, 1(AX) - MOVB BL, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB - -three_bytes_emit_remainder_encodeBetterBlockAsm4MB: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB - -two_bytes_emit_remainder_encodeBetterBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBetterBlockAsm4MB - JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB - -one_byte_emit_remainder_encodeBetterBlockAsm4MB: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBetterBlockAsm4MB: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB - -memmove_long_emit_remainder_encodeBetterBlockAsm4MB: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBetterBlockAsm12B(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeBetterBlockAsm12B(SB), $81944-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000280, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm12B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBetterBlockAsm12B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -6(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBetterBlockAsm12B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 1(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm12B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x34, R11 - MOVL 24(SP)(R10*4), SI - MOVL 65560(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 65560(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm12B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm12B - -candidateS_match_encodeBetterBlockAsm12B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm12B - DECL CX - MOVL R8, SI - -candidate_match_encodeBetterBlockAsm12B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBetterBlockAsm12B - -match_extend_back_loop_encodeBetterBlockAsm12B: - CMPL CX, DI - JLE match_extend_back_end_encodeBetterBlockAsm12B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBetterBlockAsm12B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBetterBlockAsm12B - JMP match_extend_back_loop_encodeBetterBlockAsm12B - -match_extend_back_end_encodeBetterBlockAsm12B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBetterBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm12B: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm12B - -matchlen_loopback_match_nolit_encodeBetterBlockAsm12B: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm12B - -matchlen_loop_match_nolit_encodeBetterBlockAsm12B: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B - -matchlen_single_match_nolit_encodeBetterBlockAsm12B: - TESTL R8, R8 - JZ match_nolit_end_encodeBetterBlockAsm12B - -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeBetterBlockAsm12B - LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B - -match_nolit_end_encodeBetterBlockAsm12B: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - CMPL 16(SP), R8 - JEQ match_is_repeat_encodeBetterBlockAsm12B - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeBetterBlockAsm12B - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeBetterBlockAsm12B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm12B - -two_bytes_match_emit_encodeBetterBlockAsm12B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeBetterBlockAsm12B - JMP memmove_long_match_emit_encodeBetterBlockAsm12B - -one_byte_match_emit_encodeBetterBlockAsm12B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBetterBlockAsm12B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (AX) - MOVL R10, -4(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm12B: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B - -memmove_long_match_emit_encodeBetterBlockAsm12B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeBetterBlockAsm12B: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeBetterBlockAsm12B: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B - CMPL R8, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -emit_copy_three_match_nolit_encodeBetterBlockAsm12B: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -match_is_repeat_encodeBetterBlockAsm12B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_repeat_encodeBetterBlockAsm12B - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm12B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B - -two_bytes_match_emit_repeat_encodeBetterBlockAsm12B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_repeat_encodeBetterBlockAsm12B - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B - -one_byte_match_emit_repeat_encodeBetterBlockAsm12B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_repeat_encodeBetterBlockAsm12B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (AX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (AX) - MOVL R10, -4(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B: - MOVQ SI, AX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B - -memmove_long_match_emit_repeat_encodeBetterBlockAsm12B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - -match_nolit_emitcopy_end_encodeBetterBlockAsm12B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm12B - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBetterBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm12B: - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x32, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x34, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x34, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 65560(SP)(R11*4) - MOVL R15, 65560(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x32, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x34, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 65560(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm12B - -emit_remainder_encodeBetterBlockAsm12B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBetterBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm12B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBetterBlockAsm12B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBetterBlockAsm12B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B - -two_bytes_emit_remainder_encodeBetterBlockAsm12B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBetterBlockAsm12B - JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B - -one_byte_emit_remainder_encodeBetterBlockAsm12B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBetterBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B - -memmove_long_emit_remainder_encodeBetterBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm12B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBetterBlockAsm10B(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeBetterBlockAsm10B(SB), $20504-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x000000a0, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm10B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBetterBlockAsm10B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -6(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBetterBlockAsm10B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 1(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm10B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x36, R11 - MOVL 24(SP)(R10*4), SI - MOVL 16408(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 16408(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm10B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm10B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm10B - -candidateS_match_encodeBetterBlockAsm10B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm10B - DECL CX - MOVL R8, SI - -candidate_match_encodeBetterBlockAsm10B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBetterBlockAsm10B - -match_extend_back_loop_encodeBetterBlockAsm10B: - CMPL CX, DI - JLE match_extend_back_end_encodeBetterBlockAsm10B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBetterBlockAsm10B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBetterBlockAsm10B - JMP match_extend_back_loop_encodeBetterBlockAsm10B - -match_extend_back_end_encodeBetterBlockAsm10B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBetterBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm10B: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm10B - -matchlen_loopback_match_nolit_encodeBetterBlockAsm10B: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm10B - -matchlen_loop_match_nolit_encodeBetterBlockAsm10B: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B - -matchlen_single_match_nolit_encodeBetterBlockAsm10B: - TESTL R8, R8 - JZ match_nolit_end_encodeBetterBlockAsm10B - -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeBetterBlockAsm10B - LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B - -match_nolit_end_encodeBetterBlockAsm10B: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - CMPL 16(SP), R8 - JEQ match_is_repeat_encodeBetterBlockAsm10B - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeBetterBlockAsm10B - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeBetterBlockAsm10B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm10B - -two_bytes_match_emit_encodeBetterBlockAsm10B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeBetterBlockAsm10B - JMP memmove_long_match_emit_encodeBetterBlockAsm10B - -one_byte_match_emit_encodeBetterBlockAsm10B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBetterBlockAsm10B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (AX) - MOVL R10, -4(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm10B: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B - -memmove_long_match_emit_encodeBetterBlockAsm10B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeBetterBlockAsm10B: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeBetterBlockAsm10B: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B - CMPL R8, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -emit_copy_three_match_nolit_encodeBetterBlockAsm10B: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -match_is_repeat_encodeBetterBlockAsm10B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_repeat_encodeBetterBlockAsm10B - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm10B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B - -two_bytes_match_emit_repeat_encodeBetterBlockAsm10B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_repeat_encodeBetterBlockAsm10B - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B - -one_byte_match_emit_repeat_encodeBetterBlockAsm10B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_repeat_encodeBetterBlockAsm10B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (AX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (AX) - MOVL R10, -4(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B: - MOVQ SI, AX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B - -memmove_long_match_emit_repeat_encodeBetterBlockAsm10B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - -match_nolit_emitcopy_end_encodeBetterBlockAsm10B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm10B - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBetterBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm10B: - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x34, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x36, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x36, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 16408(SP)(R11*4) - MOVL R15, 16408(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x36, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 16408(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm10B - -emit_remainder_encodeBetterBlockAsm10B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBetterBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm10B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBetterBlockAsm10B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBetterBlockAsm10B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B - -two_bytes_emit_remainder_encodeBetterBlockAsm10B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBetterBlockAsm10B - JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B - -one_byte_emit_remainder_encodeBetterBlockAsm10B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBetterBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B - -memmove_long_emit_remainder_encodeBetterBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm10B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBetterBlockAsm8B(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeBetterBlockAsm8B(SB), $5144-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000028, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm8B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBetterBlockAsm8B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -6(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBetterBlockAsm8B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x04, SI - LEAL 1(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm8B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x38, R11 - MOVL 24(SP)(R10*4), SI - MOVL 4120(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 4120(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm8B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm8B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm8B - -candidateS_match_encodeBetterBlockAsm8B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm8B - DECL CX - MOVL R8, SI - -candidate_match_encodeBetterBlockAsm8B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBetterBlockAsm8B - -match_extend_back_loop_encodeBetterBlockAsm8B: - CMPL CX, DI - JLE match_extend_back_end_encodeBetterBlockAsm8B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBetterBlockAsm8B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBetterBlockAsm8B - JMP match_extend_back_loop_encodeBetterBlockAsm8B - -match_extend_back_end_encodeBetterBlockAsm8B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBetterBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm8B: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm8B - -matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm8B - -matchlen_loop_match_nolit_encodeBetterBlockAsm8B: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B - -matchlen_single_match_nolit_encodeBetterBlockAsm8B: - TESTL R8, R8 - JZ match_nolit_end_encodeBetterBlockAsm8B - -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeBetterBlockAsm8B - LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B - -match_nolit_end_encodeBetterBlockAsm8B: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - CMPL 16(SP), R8 - JEQ match_is_repeat_encodeBetterBlockAsm8B - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeBetterBlockAsm8B - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeBetterBlockAsm8B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm8B - -two_bytes_match_emit_encodeBetterBlockAsm8B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeBetterBlockAsm8B - JMP memmove_long_match_emit_encodeBetterBlockAsm8B - -one_byte_match_emit_encodeBetterBlockAsm8B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBetterBlockAsm8B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (AX) - MOVL R10, -4(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm8B: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B - -memmove_long_match_emit_encodeBetterBlockAsm8B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeBetterBlockAsm8B: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeBetterBlockAsm8B: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm8B - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm8B - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -emit_copy_three_match_nolit_encodeBetterBlockAsm8B: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -match_is_repeat_encodeBetterBlockAsm8B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_repeat_encodeBetterBlockAsm8B - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm8B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B - -two_bytes_match_emit_repeat_encodeBetterBlockAsm8B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_repeat_encodeBetterBlockAsm8B - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B - -one_byte_match_emit_repeat_encodeBetterBlockAsm8B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_repeat_encodeBetterBlockAsm8B: - LEAQ (AX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x04 - JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4: - MOVL (R9), R10 - MOVL R10, (AX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B: - MOVQ SI, AX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B - -memmove_long_match_emit_repeat_encodeBetterBlockAsm8B: - LEAQ (AX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R11 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ SI, AX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - -match_nolit_emitcopy_end_encodeBetterBlockAsm8B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm8B - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBetterBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm8B: - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x36, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x38, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x38, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 4120(SP)(R11*4) - MOVL R15, 4120(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x38, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 4120(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm8B - -emit_remainder_encodeBetterBlockAsm8B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBetterBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm8B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBetterBlockAsm8B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBetterBlockAsm8B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B - -two_bytes_emit_remainder_encodeBetterBlockAsm8B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBetterBlockAsm8B - JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B - -one_byte_emit_remainder_encodeBetterBlockAsm8B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBetterBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B - -memmove_long_emit_remainder_encodeBetterBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm8B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBlockAsm(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeSnappyBlockAsm(SB), $65560-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000200, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBlockAsm: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - SHLQ $0x10, R11 - IMULQ R9, R11 - SHRQ $0x32, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeSnappyBlockAsm - LEAL 1(CX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeSnappyBlockAsm - -repeat_extend_back_loop_encodeSnappyBlockAsm: - CMPL DI, SI - JLE repeat_extend_back_end_encodeSnappyBlockAsm - MOVB -1(DX)(R8*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeSnappyBlockAsm - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm - -repeat_extend_back_end_encodeSnappyBlockAsm: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeSnappyBlockAsm - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeSnappyBlockAsm - CMPL SI, $0x00010000 - JLT three_bytes_repeat_emit_encodeSnappyBlockAsm - CMPL SI, $0x01000000 - JLT four_bytes_repeat_emit_encodeSnappyBlockAsm - MOVB $0xfc, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm - -four_bytes_repeat_emit_encodeSnappyBlockAsm: - MOVL SI, R10 - SHRL $0x10, R10 - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB R10, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm - -three_bytes_repeat_emit_encodeSnappyBlockAsm: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm - -two_bytes_repeat_emit_encodeSnappyBlockAsm: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeSnappyBlockAsm - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm - -one_byte_repeat_emit_encodeSnappyBlockAsm: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeSnappyBlockAsm: - LEAQ (AX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm - -memmove_long_repeat_emit_encodeSnappyBlockAsm: - LEAQ (AX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R11, R11 - CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm - -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm: - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm - -matchlen_loop_repeat_extend_encodeSnappyBlockAsm: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm - -matchlen_single_repeat_extend_encodeSnappyBlockAsm: - TESTL R8, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm - -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm - LEAL 1(R11), R11 - DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm - -repeat_extend_forward_end_encodeSnappyBlockAsm: - ADDL R11, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitCopy - CMPL DI, $0x00010000 - JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm - -four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm: - CMPL SI, $0x40 - JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm - MOVB $0xff, (AX) - MOVL DI, 1(AX) - LEAL -64(SI), SI - ADDQ $0x05, AX - CMPL SI, $0x04 - JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm - JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm - -four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm: - TESTL SI, SI - JZ repeat_end_emit_encodeSnappyBlockAsm - MOVB $0x03, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVL DI, 1(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeSnappyBlockAsm - -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm - CMPL DI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeSnappyBlockAsm: - MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm - -no_repeat_found_encodeSnappyBlockAsm: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBlockAsm - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeSnappyBlockAsm - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeSnappyBlockAsm - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm - -candidate3_match_encodeSnappyBlockAsm: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm - -candidate2_match_encodeSnappyBlockAsm: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeSnappyBlockAsm: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBlockAsm - -match_extend_back_loop_encodeSnappyBlockAsm: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBlockAsm - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBlockAsm - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBlockAsm - JMP match_extend_back_loop_encodeSnappyBlockAsm - -match_extend_back_end_encodeSnappyBlockAsm: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 5(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeSnappyBlockAsm - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBlockAsm - CMPL R8, $0x00010000 - JLT three_bytes_match_emit_encodeSnappyBlockAsm - CMPL R8, $0x01000000 - JLT four_bytes_match_emit_encodeSnappyBlockAsm - MOVB $0xfc, (AX) - MOVL R8, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm - -four_bytes_match_emit_encodeSnappyBlockAsm: - MOVL R8, R10 - SHRL $0x10, R10 - MOVB $0xf8, (AX) - MOVW R8, 1(AX) - MOVB R10, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm - -three_bytes_match_emit_encodeSnappyBlockAsm: - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm - -two_bytes_match_emit_encodeSnappyBlockAsm: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeSnappyBlockAsm - JMP memmove_long_match_emit_encodeSnappyBlockAsm - -one_byte_match_emit_encodeSnappyBlockAsm: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBlockAsm: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm - -memmove_long_match_emit_encodeSnappyBlockAsm: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeSnappyBlockAsm: -match_nolit_loop_encodeSnappyBlockAsm: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm - -matchlen_loopback_match_nolit_encodeSnappyBlockAsm: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm - -matchlen_loop_match_nolit_encodeSnappyBlockAsm: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm - -matchlen_single_match_nolit_encodeSnappyBlockAsm: - TESTL DI, DI - JZ match_nolit_end_encodeSnappyBlockAsm - -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeSnappyBlockAsm - LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm - -match_nolit_end_encodeSnappyBlockAsm: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy - CMPL SI, $0x00010000 - JL two_byte_offset_match_nolit_encodeSnappyBlockAsm - -four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm: - CMPL R10, $0x40 - JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm - MOVB $0xff, (AX) - MOVL SI, 1(AX) - LEAL -64(R10), R10 - ADDQ $0x05, AX - CMPL R10, $0x04 - JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm - JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm - -four_bytes_remain_match_nolit_encodeSnappyBlockAsm: - TESTL R10, R10 - JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm - MOVB $0x03, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm - -two_byte_offset_match_nolit_encodeSnappyBlockAsm: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm - -emit_copy_three_match_nolit_encodeSnappyBlockAsm: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm: - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x10, R8 - IMULQ R9, R8 - SHRQ $0x32, R8 - SHLQ $0x10, SI - IMULQ R9, SI - SHRQ $0x32, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeSnappyBlockAsm - INCL CX - JMP search_loop_encodeSnappyBlockAsm - -emit_remainder_encodeSnappyBlockAsm: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 5(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBlockAsm - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBlockAsm - CMPL DX, $0x00010000 - JLT three_bytes_emit_remainder_encodeSnappyBlockAsm - CMPL DX, $0x01000000 - JLT four_bytes_emit_remainder_encodeSnappyBlockAsm - MOVB $0xfc, (AX) - MOVL DX, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm - -four_bytes_emit_remainder_encodeSnappyBlockAsm: - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (AX) - MOVW DX, 1(AX) - MOVB BL, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm - -three_bytes_emit_remainder_encodeSnappyBlockAsm: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm - -two_bytes_emit_remainder_encodeSnappyBlockAsm: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBlockAsm - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm - -one_byte_emit_remainder_encodeSnappyBlockAsm: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm - -memmove_long_emit_remainder_encodeSnappyBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000200, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm64K: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm64K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBlockAsm64K: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm64K - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - SHLQ $0x10, R11 - IMULQ R9, R11 - SHRQ $0x32, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeSnappyBlockAsm64K - LEAL 1(CX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeSnappyBlockAsm64K - -repeat_extend_back_loop_encodeSnappyBlockAsm64K: - CMPL DI, SI - JLE repeat_extend_back_end_encodeSnappyBlockAsm64K - MOVB -1(DX)(R8*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeSnappyBlockAsm64K - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K - -repeat_extend_back_end_encodeSnappyBlockAsm64K: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeSnappyBlockAsm64K - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeSnappyBlockAsm64K - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K - -two_bytes_repeat_emit_encodeSnappyBlockAsm64K: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeSnappyBlockAsm64K - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K - -one_byte_repeat_emit_encodeSnappyBlockAsm64K: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeSnappyBlockAsm64K: - LEAQ (AX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K - -memmove_long_repeat_emit_encodeSnappyBlockAsm64K: - LEAQ (AX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R11, R11 - CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm64K - -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K: - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K - -matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K - -matchlen_single_repeat_extend_encodeSnappyBlockAsm64K: - TESTL R8, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K - -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K - LEAL 1(R11), R11 - DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K - -repeat_extend_forward_end_encodeSnappyBlockAsm64K: - ADDL R11, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitCopy -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K - CMPL DI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm64K - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeSnappyBlockAsm64K: - MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm64K - -no_repeat_found_encodeSnappyBlockAsm64K: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBlockAsm64K - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeSnappyBlockAsm64K - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeSnappyBlockAsm64K - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm64K - -candidate3_match_encodeSnappyBlockAsm64K: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm64K - -candidate2_match_encodeSnappyBlockAsm64K: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeSnappyBlockAsm64K: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBlockAsm64K - -match_extend_back_loop_encodeSnappyBlockAsm64K: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBlockAsm64K - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBlockAsm64K - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBlockAsm64K - JMP match_extend_back_loop_encodeSnappyBlockAsm64K - -match_extend_back_end_encodeSnappyBlockAsm64K: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBlockAsm64K - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm64K: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeSnappyBlockAsm64K - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBlockAsm64K - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm64K - -two_bytes_match_emit_encodeSnappyBlockAsm64K: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeSnappyBlockAsm64K - JMP memmove_long_match_emit_encodeSnappyBlockAsm64K - -one_byte_match_emit_encodeSnappyBlockAsm64K: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBlockAsm64K: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm64K: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K - -memmove_long_match_emit_encodeSnappyBlockAsm64K: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeSnappyBlockAsm64K: -match_nolit_loop_encodeSnappyBlockAsm64K: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm64K - -matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm64K - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm64K - -matchlen_loop_match_nolit_encodeSnappyBlockAsm64K: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K - -matchlen_single_match_nolit_encodeSnappyBlockAsm64K: - TESTL DI, DI - JZ match_nolit_end_encodeSnappyBlockAsm64K - -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeSnappyBlockAsm64K - LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K - -match_nolit_end_encodeSnappyBlockAsm64K: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBlockAsm64K: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K - -emit_copy_three_match_nolit_encodeSnappyBlockAsm64K: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm64K: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm64K - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBlockAsm64K - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm64K: - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x10, R8 - IMULQ R9, R8 - SHRQ $0x32, R8 - SHLQ $0x10, SI - IMULQ R9, SI - SHRQ $0x32, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeSnappyBlockAsm64K - INCL CX - JMP search_loop_encodeSnappyBlockAsm64K - -emit_remainder_encodeSnappyBlockAsm64K: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBlockAsm64K - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm64K: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBlockAsm64K - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBlockAsm64K - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K - -two_bytes_emit_remainder_encodeSnappyBlockAsm64K: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBlockAsm64K - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K - -one_byte_emit_remainder_encodeSnappyBlockAsm64K: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBlockAsm64K: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K - -memmove_long_emit_remainder_encodeSnappyBlockAsm64K: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000080, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm12B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm12B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBlockAsm12B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm12B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x18, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - SHLQ $0x18, R11 - IMULQ R9, R11 - SHRQ $0x34, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x18, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeSnappyBlockAsm12B - LEAL 1(CX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeSnappyBlockAsm12B - -repeat_extend_back_loop_encodeSnappyBlockAsm12B: - CMPL DI, SI - JLE repeat_extend_back_end_encodeSnappyBlockAsm12B - MOVB -1(DX)(R8*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeSnappyBlockAsm12B - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B - -repeat_extend_back_end_encodeSnappyBlockAsm12B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeSnappyBlockAsm12B - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeSnappyBlockAsm12B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B - -two_bytes_repeat_emit_encodeSnappyBlockAsm12B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeSnappyBlockAsm12B - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B - -one_byte_repeat_emit_encodeSnappyBlockAsm12B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B - -memmove_long_repeat_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R11, R11 - CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm12B - -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B: - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B - -matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B - -matchlen_single_repeat_extend_encodeSnappyBlockAsm12B: - TESTL R8, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B - -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B - LEAL 1(R11), R11 - DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B - -repeat_extend_forward_end_encodeSnappyBlockAsm12B: - ADDL R11, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitCopy -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B - CMPL DI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm12B - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeSnappyBlockAsm12B: - MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm12B - -no_repeat_found_encodeSnappyBlockAsm12B: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBlockAsm12B - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeSnappyBlockAsm12B - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeSnappyBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm12B - -candidate3_match_encodeSnappyBlockAsm12B: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm12B - -candidate2_match_encodeSnappyBlockAsm12B: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeSnappyBlockAsm12B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBlockAsm12B - -match_extend_back_loop_encodeSnappyBlockAsm12B: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBlockAsm12B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBlockAsm12B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBlockAsm12B - JMP match_extend_back_loop_encodeSnappyBlockAsm12B - -match_extend_back_end_encodeSnappyBlockAsm12B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm12B: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeSnappyBlockAsm12B - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBlockAsm12B - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm12B - -two_bytes_match_emit_encodeSnappyBlockAsm12B: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeSnappyBlockAsm12B - JMP memmove_long_match_emit_encodeSnappyBlockAsm12B - -one_byte_match_emit_encodeSnappyBlockAsm12B: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm12B: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B - -memmove_long_match_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeSnappyBlockAsm12B: -match_nolit_loop_encodeSnappyBlockAsm12B: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm12B - -matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm12B - -matchlen_loop_match_nolit_encodeSnappyBlockAsm12B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B - -matchlen_single_match_nolit_encodeSnappyBlockAsm12B: - TESTL DI, DI - JZ match_nolit_end_encodeSnappyBlockAsm12B - -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeSnappyBlockAsm12B - LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B - -match_nolit_end_encodeSnappyBlockAsm12B: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBlockAsm12B: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B - -emit_copy_three_match_nolit_encodeSnappyBlockAsm12B: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm12B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm12B - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm12B: - MOVQ $0x000000cf1bbcdcbb, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x18, R8 - IMULQ R9, R8 - SHRQ $0x34, R8 - SHLQ $0x18, SI - IMULQ R9, SI - SHRQ $0x34, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeSnappyBlockAsm12B - INCL CX - JMP search_loop_encodeSnappyBlockAsm12B - -emit_remainder_encodeSnappyBlockAsm12B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm12B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBlockAsm12B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBlockAsm12B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B - -two_bytes_emit_remainder_encodeSnappyBlockAsm12B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBlockAsm12B - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B - -one_byte_emit_remainder_encodeSnappyBlockAsm12B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B - -memmove_long_emit_remainder_encodeSnappyBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000020, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm10B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm10B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBlockAsm10B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm10B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x9e3779b1, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ R9, R11 - SHRQ $0x36, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeSnappyBlockAsm10B - LEAL 1(CX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeSnappyBlockAsm10B - -repeat_extend_back_loop_encodeSnappyBlockAsm10B: - CMPL DI, SI - JLE repeat_extend_back_end_encodeSnappyBlockAsm10B - MOVB -1(DX)(R8*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeSnappyBlockAsm10B - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B - -repeat_extend_back_end_encodeSnappyBlockAsm10B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeSnappyBlockAsm10B - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeSnappyBlockAsm10B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B - -two_bytes_repeat_emit_encodeSnappyBlockAsm10B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeSnappyBlockAsm10B - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B - -one_byte_repeat_emit_encodeSnappyBlockAsm10B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B - -memmove_long_repeat_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R11, R11 - CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm10B - -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B: - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B - -matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B - -matchlen_single_repeat_extend_encodeSnappyBlockAsm10B: - TESTL R8, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B - -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B - LEAL 1(R11), R11 - DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B - -repeat_extend_forward_end_encodeSnappyBlockAsm10B: - ADDL R11, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitCopy -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B - CMPL DI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm10B - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeSnappyBlockAsm10B: - MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm10B - -no_repeat_found_encodeSnappyBlockAsm10B: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBlockAsm10B - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeSnappyBlockAsm10B - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeSnappyBlockAsm10B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm10B - -candidate3_match_encodeSnappyBlockAsm10B: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm10B - -candidate2_match_encodeSnappyBlockAsm10B: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeSnappyBlockAsm10B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBlockAsm10B - -match_extend_back_loop_encodeSnappyBlockAsm10B: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBlockAsm10B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBlockAsm10B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBlockAsm10B - JMP match_extend_back_loop_encodeSnappyBlockAsm10B - -match_extend_back_end_encodeSnappyBlockAsm10B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm10B: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeSnappyBlockAsm10B - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBlockAsm10B - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm10B - -two_bytes_match_emit_encodeSnappyBlockAsm10B: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeSnappyBlockAsm10B - JMP memmove_long_match_emit_encodeSnappyBlockAsm10B - -one_byte_match_emit_encodeSnappyBlockAsm10B: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm10B: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B - -memmove_long_match_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeSnappyBlockAsm10B: -match_nolit_loop_encodeSnappyBlockAsm10B: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm10B - -matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm10B - -matchlen_loop_match_nolit_encodeSnappyBlockAsm10B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B - -matchlen_single_match_nolit_encodeSnappyBlockAsm10B: - TESTL DI, DI - JZ match_nolit_end_encodeSnappyBlockAsm10B - -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeSnappyBlockAsm10B - LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B - -match_nolit_end_encodeSnappyBlockAsm10B: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBlockAsm10B: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B - -emit_copy_three_match_nolit_encodeSnappyBlockAsm10B: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm10B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm10B - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm10B: - MOVQ $0x9e3779b1, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x20, R8 - IMULQ R9, R8 - SHRQ $0x36, R8 - SHLQ $0x20, SI - IMULQ R9, SI - SHRQ $0x36, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeSnappyBlockAsm10B - INCL CX - JMP search_loop_encodeSnappyBlockAsm10B - -emit_remainder_encodeSnappyBlockAsm10B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm10B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBlockAsm10B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBlockAsm10B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B - -two_bytes_emit_remainder_encodeSnappyBlockAsm10B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBlockAsm10B - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B - -one_byte_emit_remainder_encodeSnappyBlockAsm10B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B - -memmove_long_emit_remainder_encodeSnappyBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000008, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm8B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm8B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBlockAsm8B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x04, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm8B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x9e3779b1, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x38, R10 - SHLQ $0x20, R11 - IMULQ R9, R11 - SHRQ $0x38, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x38, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeSnappyBlockAsm8B - LEAL 1(CX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeSnappyBlockAsm8B - -repeat_extend_back_loop_encodeSnappyBlockAsm8B: - CMPL DI, SI - JLE repeat_extend_back_end_encodeSnappyBlockAsm8B - MOVB -1(DX)(R8*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeSnappyBlockAsm8B - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B - -repeat_extend_back_end_encodeSnappyBlockAsm8B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeSnappyBlockAsm8B - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeSnappyBlockAsm8B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B - -two_bytes_repeat_emit_encodeSnappyBlockAsm8B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeSnappyBlockAsm8B - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B - -one_byte_repeat_emit_encodeSnappyBlockAsm8B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B - -memmove_long_repeat_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R11, R11 - CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm8B - -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B: - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B - -matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B - -matchlen_single_repeat_extend_encodeSnappyBlockAsm8B: - TESTL R8, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B - -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B - LEAL 1(R11), R11 - DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B - -repeat_extend_forward_end_encodeSnappyBlockAsm8B: - ADDL R11, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitCopy -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm8B - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeSnappyBlockAsm8B: - MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm8B - -no_repeat_found_encodeSnappyBlockAsm8B: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBlockAsm8B - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeSnappyBlockAsm8B - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeSnappyBlockAsm8B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm8B - -candidate3_match_encodeSnappyBlockAsm8B: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm8B - -candidate2_match_encodeSnappyBlockAsm8B: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeSnappyBlockAsm8B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBlockAsm8B - -match_extend_back_loop_encodeSnappyBlockAsm8B: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBlockAsm8B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBlockAsm8B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBlockAsm8B - JMP match_extend_back_loop_encodeSnappyBlockAsm8B - -match_extend_back_end_encodeSnappyBlockAsm8B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm8B: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeSnappyBlockAsm8B - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBlockAsm8B - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm8B - -two_bytes_match_emit_encodeSnappyBlockAsm8B: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeSnappyBlockAsm8B - JMP memmove_long_match_emit_encodeSnappyBlockAsm8B - -one_byte_match_emit_encodeSnappyBlockAsm8B: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm8B: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B - -memmove_long_match_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeSnappyBlockAsm8B: -match_nolit_loop_encodeSnappyBlockAsm8B: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm8B - -matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm8B - -matchlen_loop_match_nolit_encodeSnappyBlockAsm8B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B - -matchlen_single_match_nolit_encodeSnappyBlockAsm8B: - TESTL DI, DI - JZ match_nolit_end_encodeSnappyBlockAsm8B - -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeSnappyBlockAsm8B - LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B - -match_nolit_end_encodeSnappyBlockAsm8B: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBlockAsm8B: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B - -emit_copy_three_match_nolit_encodeSnappyBlockAsm8B: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm8B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm8B - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm8B: - MOVQ $0x9e3779b1, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x20, R8 - IMULQ R9, R8 - SHRQ $0x38, R8 - SHLQ $0x20, SI - IMULQ R9, SI - SHRQ $0x38, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeSnappyBlockAsm8B - INCL CX - JMP search_loop_encodeSnappyBlockAsm8B - -emit_remainder_encodeSnappyBlockAsm8B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm8B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBlockAsm8B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBlockAsm8B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B - -two_bytes_emit_remainder_encodeSnappyBlockAsm8B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBlockAsm8B - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B - -one_byte_emit_remainder_encodeSnappyBlockAsm8B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B - -memmove_long_emit_remainder_encodeSnappyBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000a00, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBetterBlockAsm: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBetterBlockAsm - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBetterBlockAsm: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x07, SI - CMPL SI, $0x63 - JLE check_maxskip_ok_encodeSnappyBetterBlockAsm - LEAL 100(CX), SI - JMP check_maxskip_cont_encodeSnappyBetterBlockAsm - -check_maxskip_ok_encodeSnappyBetterBlockAsm: - LEAL 1(CX)(SI*1), SI - -check_maxskip_cont_encodeSnappyBetterBlockAsm: - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x32, R11 - MOVL 24(SP)(R10*4), SI - MOVL 262168(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm - -candidateS_match_encodeSnappyBetterBlockAsm: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm - DECL CX - MOVL R8, SI - -candidate_match_encodeSnappyBetterBlockAsm: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm - -match_extend_back_loop_encodeSnappyBetterBlockAsm: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBetterBlockAsm - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm - -match_extend_back_end_encodeSnappyBetterBlockAsm: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 5(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBetterBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm - -matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm - -matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm - -matchlen_single_match_nolit_encodeSnappyBetterBlockAsm: - TESTL R8, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm - -matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeSnappyBetterBlockAsm - LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm - -match_nolit_end_encodeSnappyBetterBlockAsm: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - CMPL R12, $0x01 - JG match_length_ok_encodeSnappyBetterBlockAsm - CMPL R8, $0x0000ffff - JLE match_length_ok_encodeSnappyBetterBlockAsm - MOVL 20(SP), CX - INCL CX - JMP search_loop_encodeSnappyBetterBlockAsm - -match_length_ok_encodeSnappyBetterBlockAsm: - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeSnappyBetterBlockAsm - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm - CMPL SI, $0x00010000 - JLT three_bytes_match_emit_encodeSnappyBetterBlockAsm - CMPL SI, $0x01000000 - JLT four_bytes_match_emit_encodeSnappyBetterBlockAsm - MOVB $0xfc, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm - -four_bytes_match_emit_encodeSnappyBetterBlockAsm: - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB R11, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm - -three_bytes_match_emit_encodeSnappyBetterBlockAsm: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm - -two_bytes_match_emit_encodeSnappyBetterBlockAsm: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeSnappyBetterBlockAsm - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm - -one_byte_match_emit_encodeSnappyBetterBlockAsm: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBetterBlockAsm: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm - -memmove_long_match_emit_encodeSnappyBetterBlockAsm: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy - CMPL R8, $0x00010000 - JL two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm - -four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm: - CMPL R12, $0x40 - JLE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm - MOVB $0xff, (AX) - MOVL R8, 1(AX) - LEAL -64(R12), R12 - ADDQ $0x05, AX - CMPL R12, $0x04 - JL four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm - JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm - -four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm: - TESTL R12, R12 - JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm - MOVB $0x03, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVL R8, 1(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm - -two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm - CMPL R8, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm: - MOVQ $0x00cf1bbcdcbfa563, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm - -emit_remainder_encodeSnappyBetterBlockAsm: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 5(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBetterBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm - CMPL DX, $0x00010000 - JLT three_bytes_emit_remainder_encodeSnappyBetterBlockAsm - CMPL DX, $0x01000000 - JLT four_bytes_emit_remainder_encodeSnappyBetterBlockAsm - MOVB $0xfc, (AX) - MOVL DX, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm - -four_bytes_emit_remainder_encodeSnappyBetterBlockAsm: - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (AX) - MOVW DX, 1(AX) - MOVB BL, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm - -three_bytes_emit_remainder_encodeSnappyBetterBlockAsm: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm - -two_bytes_emit_remainder_encodeSnappyBetterBlockAsm: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBetterBlockAsm - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000a00, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBetterBlockAsm64K: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBetterBlockAsm64K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBetterBlockAsm64K: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x07, SI - LEAL 1(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm64K - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x32, R11 - MOVL 24(SP)(R10*4), SI - MOVL 262168(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm64K - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm64K - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm64K - -candidateS_match_encodeSnappyBetterBlockAsm64K: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm64K - DECL CX - MOVL R8, SI - -candidate_match_encodeSnappyBetterBlockAsm64K: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K - -match_extend_back_loop_encodeSnappyBetterBlockAsm64K: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBetterBlockAsm64K - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K - -match_extend_back_end_encodeSnappyBetterBlockAsm64K: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBetterBlockAsm64K - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm64K: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K - -matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm64K - -matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K - -matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K: - TESTL R8, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm64K - -matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeSnappyBetterBlockAsm64K - LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K - -match_nolit_end_encodeSnappyBetterBlockAsm64K: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeSnappyBetterBlockAsm64K - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm64K - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K - -two_bytes_match_emit_encodeSnappyBetterBlockAsm64K: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeSnappyBetterBlockAsm64K - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K - -one_byte_match_emit_encodeSnappyBetterBlockAsm64K: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBetterBlockAsm64K: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K - -memmove_long_match_emit_encodeSnappyBetterBlockAsm64K: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K - CMPL R8, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm64K - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K: - MOVQ $0x00cf1bbcdcbfa563, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm64K - -emit_remainder_encodeSnappyBetterBlockAsm64K: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBetterBlockAsm64K - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm64K: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K - -two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBetterBlockAsm64K - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm64K: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000280, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBetterBlockAsm12B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBetterBlockAsm12B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBetterBlockAsm12B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 1(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm12B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x34, R11 - MOVL 24(SP)(R10*4), SI - MOVL 65560(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 65560(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm12B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm12B - -candidateS_match_encodeSnappyBetterBlockAsm12B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm12B - DECL CX - MOVL R8, SI - -candidate_match_encodeSnappyBetterBlockAsm12B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B - -match_extend_back_loop_encodeSnappyBetterBlockAsm12B: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBetterBlockAsm12B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B - -match_extend_back_end_encodeSnappyBetterBlockAsm12B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBetterBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm12B: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B - -matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm12B - -matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B - -matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B: - TESTL R8, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm12B - -matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeSnappyBetterBlockAsm12B - LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B - -match_nolit_end_encodeSnappyBetterBlockAsm12B: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeSnappyBetterBlockAsm12B - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm12B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B - -two_bytes_match_emit_encodeSnappyBetterBlockAsm12B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeSnappyBetterBlockAsm12B - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B - -one_byte_match_emit_encodeSnappyBetterBlockAsm12B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBetterBlockAsm12B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B - -memmove_long_match_emit_encodeSnappyBetterBlockAsm12B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B - CMPL R8, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm12B - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B: - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x32, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x34, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x34, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 65560(SP)(R11*4) - MOVL R15, 65560(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x32, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x34, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 65560(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm12B - -emit_remainder_encodeSnappyBetterBlockAsm12B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBetterBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm12B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B - -two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBetterBlockAsm12B - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x000000a0, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBetterBlockAsm10B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBetterBlockAsm10B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBetterBlockAsm10B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 1(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm10B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x36, R11 - MOVL 24(SP)(R10*4), SI - MOVL 16408(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 16408(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm10B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm10B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm10B - -candidateS_match_encodeSnappyBetterBlockAsm10B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm10B - DECL CX - MOVL R8, SI - -candidate_match_encodeSnappyBetterBlockAsm10B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B - -match_extend_back_loop_encodeSnappyBetterBlockAsm10B: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBetterBlockAsm10B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B - -match_extend_back_end_encodeSnappyBetterBlockAsm10B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBetterBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm10B: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B - -matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm10B - -matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B - -matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B: - TESTL R8, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm10B - -matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeSnappyBetterBlockAsm10B - LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B - -match_nolit_end_encodeSnappyBetterBlockAsm10B: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeSnappyBetterBlockAsm10B - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm10B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B - -two_bytes_match_emit_encodeSnappyBetterBlockAsm10B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeSnappyBetterBlockAsm10B - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B - -one_byte_match_emit_encodeSnappyBetterBlockAsm10B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBetterBlockAsm10B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B - -memmove_long_match_emit_encodeSnappyBetterBlockAsm10B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B - CMPL R8, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm10B - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B: - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x34, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x36, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x36, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 16408(SP)(R11*4) - MOVL R15, 16408(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x36, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 16408(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm10B - -emit_remainder_encodeSnappyBetterBlockAsm10B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBetterBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm10B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B - -two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBetterBlockAsm10B - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000028, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBetterBlockAsm8B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBetterBlockAsm8B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBetterBlockAsm8B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x04, SI - LEAL 1(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm8B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x38, R11 - MOVL 24(SP)(R10*4), SI - MOVL 4120(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 4120(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm8B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm8B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm8B - -candidateS_match_encodeSnappyBetterBlockAsm8B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm8B - DECL CX - MOVL R8, SI - -candidate_match_encodeSnappyBetterBlockAsm8B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B - -match_extend_back_loop_encodeSnappyBetterBlockAsm8B: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBetterBlockAsm8B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B - -match_extend_back_end_encodeSnappyBetterBlockAsm8B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBetterBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm8B: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B - -matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm8B - -matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B - -matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B: - TESTL R8, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm8B - -matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeSnappyBetterBlockAsm8B - LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B - -match_nolit_end_encodeSnappyBetterBlockAsm8B: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeSnappyBetterBlockAsm8B - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm8B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B - -two_bytes_match_emit_encodeSnappyBetterBlockAsm8B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeSnappyBetterBlockAsm8B - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B - -one_byte_match_emit_encodeSnappyBetterBlockAsm8B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBetterBlockAsm8B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B - -memmove_long_match_emit_encodeSnappyBetterBlockAsm8B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm8B - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B: - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x36, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x38, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x38, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 4120(SP)(R11*4) - MOVL R15, 4120(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x38, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 4120(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm8B - -emit_remainder_encodeSnappyBetterBlockAsm8B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBetterBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm8B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B - -two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBetterBlockAsm8B - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func emitLiteral(dst []byte, lit []byte) int -// Requires: SSE2 -TEXT ·emitLiteral(SB), NOSPLIT, $0-56 - MOVQ lit_len+32(FP), DX - MOVQ dst_base+0(FP), AX - MOVQ lit_base+24(FP), CX - TESTQ DX, DX - JZ emit_literal_end_standalone_skip - MOVL DX, BX - LEAL -1(DX), SI - CMPL SI, $0x3c - JLT one_byte_standalone - CMPL SI, $0x00000100 - JLT two_bytes_standalone - CMPL SI, $0x00010000 - JLT three_bytes_standalone - CMPL SI, $0x01000000 - JLT four_bytes_standalone - MOVB $0xfc, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP memmove_long_standalone - -four_bytes_standalone: - MOVL SI, DI - SHRL $0x10, DI - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB DI, 3(AX) - ADDQ $0x04, BX - ADDQ $0x04, AX - JMP memmove_long_standalone - -three_bytes_standalone: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - JMP memmove_long_standalone - -two_bytes_standalone: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_standalone - JMP memmove_long_standalone - -one_byte_standalone: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, BX - ADDQ $0x01, AX - -memmove_standalone: - // genMemMoveShort - CMPQ DX, $0x03 - JB emit_lit_memmove_standalone_memmove_move_1or2 - JE emit_lit_memmove_standalone_memmove_move_3 - CMPQ DX, $0x08 - JB emit_lit_memmove_standalone_memmove_move_4through7 - CMPQ DX, $0x10 - JBE emit_lit_memmove_standalone_memmove_move_8through16 - CMPQ DX, $0x20 - JBE emit_lit_memmove_standalone_memmove_move_17through32 - JMP emit_lit_memmove_standalone_memmove_move_33through64 - -emit_lit_memmove_standalone_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(DX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(DX*1) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(DX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(DX*1) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(DX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(DX*1) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(DX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(DX*1) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(DX*1), X2 - MOVOU -16(CX)(DX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DX*1) - MOVOU X3, -16(AX)(DX*1) - JMP emit_literal_end_standalone - JMP emit_literal_end_standalone - -memmove_long_standalone: - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(DX*1), X2 - MOVOU -16(CX)(DX*1), X3 - MOVQ DX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_standalonelarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_standalonelarge_big_loop_back - -emit_lit_memmove_long_standalonelarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ DX, R8 - JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DX*1) - MOVOU X3, -16(AX)(DX*1) - JMP emit_literal_end_standalone - JMP emit_literal_end_standalone - -emit_literal_end_standalone_skip: - XORQ BX, BX - -emit_literal_end_standalone: - MOVQ BX, ret+48(FP) - RET - -// func emitRepeat(dst []byte, offset int, length int) int -TEXT ·emitRepeat(SB), NOSPLIT, $0-48 - XORQ BX, BX - MOVQ dst_base+0(FP), AX - MOVQ offset+24(FP), CX - MOVQ length+32(FP), DX - - // emitRepeat -emit_repeat_again_standalone: - MOVL DX, SI - LEAL -4(DX), DX - CMPL SI, $0x08 - JLE repeat_two_standalone - CMPL SI, $0x0c - JGE cant_repeat_two_offset_standalone - CMPL CX, $0x00000800 - JLT repeat_two_offset_standalone - -cant_repeat_two_offset_standalone: - CMPL DX, $0x00000104 - JLT repeat_three_standalone - CMPL DX, $0x00010100 - JLT repeat_four_standalone - CMPL DX, $0x0100ffff - JLT repeat_five_standalone - LEAL -16842747(DX), DX - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - ADDQ $0x05, BX - JMP emit_repeat_again_standalone - -repeat_five_standalone: - LEAL -65536(DX), DX - MOVL DX, CX - MOVW $0x001d, (AX) - MOVW DX, 2(AX) - SARL $0x10, CX - MOVB CL, 4(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_repeat_end - -repeat_four_standalone: - LEAL -256(DX), DX - MOVW $0x0019, (AX) - MOVW DX, 2(AX) - ADDQ $0x04, BX - ADDQ $0x04, AX - JMP gen_emit_repeat_end - -repeat_three_standalone: - LEAL -4(DX), DX - MOVW $0x0015, (AX) - MOVB DL, 2(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - JMP gen_emit_repeat_end - -repeat_two_standalone: - SHLL $0x02, DX - ORL $0x01, DX - MOVW DX, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_repeat_end - -repeat_two_offset_standalone: - XORQ SI, SI - LEAL 1(SI)(DX*4), DX - MOVB CL, 1(AX) - SARL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - -gen_emit_repeat_end: - MOVQ BX, ret+40(FP) - RET - -// func emitCopy(dst []byte, offset int, length int) int -TEXT ·emitCopy(SB), NOSPLIT, $0-48 - XORQ BX, BX - MOVQ dst_base+0(FP), AX - MOVQ offset+24(FP), CX - MOVQ length+32(FP), DX - - // emitCopy - CMPL CX, $0x00010000 - JL two_byte_offset_standalone - -four_bytes_loop_back_standalone: - CMPL DX, $0x40 - JLE four_bytes_remain_standalone - MOVB $0xff, (AX) - MOVL CX, 1(AX) - LEAL -64(DX), DX - ADDQ $0x05, BX - ADDQ $0x05, AX - CMPL DX, $0x04 - JL four_bytes_remain_standalone - - // emitRepeat -emit_repeat_again_standalone_emit_copy: - MOVL DX, SI - LEAL -4(DX), DX - CMPL SI, $0x08 - JLE repeat_two_standalone_emit_copy - CMPL SI, $0x0c - JGE cant_repeat_two_offset_standalone_emit_copy - CMPL CX, $0x00000800 - JLT repeat_two_offset_standalone_emit_copy - -cant_repeat_two_offset_standalone_emit_copy: - CMPL DX, $0x00000104 - JLT repeat_three_standalone_emit_copy - CMPL DX, $0x00010100 - JLT repeat_four_standalone_emit_copy - CMPL DX, $0x0100ffff - JLT repeat_five_standalone_emit_copy - LEAL -16842747(DX), DX - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - ADDQ $0x05, BX - JMP emit_repeat_again_standalone_emit_copy - -repeat_five_standalone_emit_copy: - LEAL -65536(DX), DX - MOVL DX, CX - MOVW $0x001d, (AX) - MOVW DX, 2(AX) - SARL $0x10, CX - MOVB CL, 4(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_copy_end - -repeat_four_standalone_emit_copy: - LEAL -256(DX), DX - MOVW $0x0019, (AX) - MOVW DX, 2(AX) - ADDQ $0x04, BX - ADDQ $0x04, AX - JMP gen_emit_copy_end - -repeat_three_standalone_emit_copy: - LEAL -4(DX), DX - MOVW $0x0015, (AX) - MOVB DL, 2(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - JMP gen_emit_copy_end - -repeat_two_standalone_emit_copy: - SHLL $0x02, DX - ORL $0x01, DX - MOVW DX, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -repeat_two_offset_standalone_emit_copy: - XORQ SI, SI - LEAL 1(SI)(DX*4), DX - MOVB CL, 1(AX) - SARL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - JMP four_bytes_loop_back_standalone - -four_bytes_remain_standalone: - TESTL DX, DX - JZ gen_emit_copy_end - MOVB $0x03, SI - LEAL -4(SI)(DX*4), DX - MOVB DL, (AX) - MOVL CX, 1(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_copy_end - -two_byte_offset_standalone: - CMPL DX, $0x40 - JLE two_byte_offset_short_standalone - MOVB $0xee, (AX) - MOVW CX, 1(AX) - LEAL -60(DX), DX - ADDQ $0x03, AX - ADDQ $0x03, BX - - // emitRepeat -emit_repeat_again_standalone_emit_copy_short: - MOVL DX, SI - LEAL -4(DX), DX - CMPL SI, $0x08 - JLE repeat_two_standalone_emit_copy_short - CMPL SI, $0x0c - JGE cant_repeat_two_offset_standalone_emit_copy_short - CMPL CX, $0x00000800 - JLT repeat_two_offset_standalone_emit_copy_short - -cant_repeat_two_offset_standalone_emit_copy_short: - CMPL DX, $0x00000104 - JLT repeat_three_standalone_emit_copy_short - CMPL DX, $0x00010100 - JLT repeat_four_standalone_emit_copy_short - CMPL DX, $0x0100ffff - JLT repeat_five_standalone_emit_copy_short - LEAL -16842747(DX), DX - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - ADDQ $0x05, BX - JMP emit_repeat_again_standalone_emit_copy_short - -repeat_five_standalone_emit_copy_short: - LEAL -65536(DX), DX - MOVL DX, CX - MOVW $0x001d, (AX) - MOVW DX, 2(AX) - SARL $0x10, CX - MOVB CL, 4(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_copy_end - -repeat_four_standalone_emit_copy_short: - LEAL -256(DX), DX - MOVW $0x0019, (AX) - MOVW DX, 2(AX) - ADDQ $0x04, BX - ADDQ $0x04, AX - JMP gen_emit_copy_end - -repeat_three_standalone_emit_copy_short: - LEAL -4(DX), DX - MOVW $0x0015, (AX) - MOVB DL, 2(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - JMP gen_emit_copy_end - -repeat_two_standalone_emit_copy_short: - SHLL $0x02, DX - ORL $0x01, DX - MOVW DX, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -repeat_two_offset_standalone_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(DX*4), DX - MOVB CL, 1(AX) - SARL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - JMP two_byte_offset_standalone - -two_byte_offset_short_standalone: - CMPL DX, $0x0c - JGE emit_copy_three_standalone - CMPL CX, $0x00000800 - JGE emit_copy_three_standalone - MOVB $0x01, SI - LEAL -16(SI)(DX*4), DX - MOVB CL, 1(AX) - SHRL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -emit_copy_three_standalone: - MOVB $0x02, SI - LEAL -4(SI)(DX*4), DX - MOVB DL, (AX) - MOVW CX, 1(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - -gen_emit_copy_end: - MOVQ BX, ret+40(FP) - RET - -// func emitCopyNoRepeat(dst []byte, offset int, length int) int -TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48 - XORQ BX, BX - MOVQ dst_base+0(FP), AX - MOVQ offset+24(FP), CX - MOVQ length+32(FP), DX - - // emitCopy - CMPL CX, $0x00010000 - JL two_byte_offset_standalone_snappy - -four_bytes_loop_back_standalone_snappy: - CMPL DX, $0x40 - JLE four_bytes_remain_standalone_snappy - MOVB $0xff, (AX) - MOVL CX, 1(AX) - LEAL -64(DX), DX - ADDQ $0x05, BX - ADDQ $0x05, AX - CMPL DX, $0x04 - JL four_bytes_remain_standalone_snappy - JMP four_bytes_loop_back_standalone_snappy - -four_bytes_remain_standalone_snappy: - TESTL DX, DX - JZ gen_emit_copy_end_snappy - MOVB $0x03, SI - LEAL -4(SI)(DX*4), DX - MOVB DL, (AX) - MOVL CX, 1(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_copy_end_snappy - -two_byte_offset_standalone_snappy: - CMPL DX, $0x40 - JLE two_byte_offset_short_standalone_snappy - MOVB $0xee, (AX) - MOVW CX, 1(AX) - LEAL -60(DX), DX - ADDQ $0x03, AX - ADDQ $0x03, BX - JMP two_byte_offset_standalone_snappy - -two_byte_offset_short_standalone_snappy: - CMPL DX, $0x0c - JGE emit_copy_three_standalone_snappy - CMPL CX, $0x00000800 - JGE emit_copy_three_standalone_snappy - MOVB $0x01, SI - LEAL -16(SI)(DX*4), DX - MOVB CL, 1(AX) - SHRL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end_snappy - -emit_copy_three_standalone_snappy: - MOVB $0x02, SI - LEAL -4(SI)(DX*4), DX - MOVB DL, (AX) - MOVW CX, 1(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - -gen_emit_copy_end_snappy: - MOVQ BX, ret+40(FP) - RET - -// func matchLen(a []byte, b []byte) int -TEXT ·matchLen(SB), NOSPLIT, $0-56 - MOVQ a_base+0(FP), AX - MOVQ b_base+24(FP), CX - MOVQ a_len+8(FP), DX - - // matchLen - XORL SI, SI - CMPL DX, $0x08 - JL matchlen_single_standalone - -matchlen_loopback_standalone: - MOVQ (AX)(SI*1), BX - XORQ (CX)(SI*1), BX - TESTQ BX, BX - JZ matchlen_loop_standalone - BSFQ BX, BX - SARQ $0x03, BX - LEAL (SI)(BX*1), SI - JMP gen_match_len_end - -matchlen_loop_standalone: - LEAL -8(DX), DX - LEAL 8(SI), SI - CMPL DX, $0x08 - JGE matchlen_loopback_standalone - -matchlen_single_standalone: - TESTL DX, DX - JZ gen_match_len_end - -matchlen_single_loopback_standalone: - MOVB (AX)(SI*1), BL - CMPB (CX)(SI*1), BL - JNE gen_match_len_end - LEAL 1(SI), SI - DECL DX - JNZ matchlen_single_loopback_standalone - -gen_match_len_end: - MOVQ SI, ret+48(FP) - RET diff --git a/vendor/github.com/klauspost/compress/s2/s2.go b/vendor/github.com/klauspost/compress/s2/s2.go deleted file mode 100644 index 89d69e965bf..00000000000 --- a/vendor/github.com/klauspost/compress/s2/s2.go +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright 2011 The Snappy-Go Authors. All rights reserved. -// Copyright (c) 2019 Klaus Post. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package s2 implements the S2 compression format. -// -// S2 is an extension of Snappy. Similar to Snappy S2 is aimed for high throughput, -// which is why it features concurrent compression for bigger payloads. -// -// Decoding is compatible with Snappy compressed content, -// but content compressed with S2 cannot be decompressed by Snappy. -// -// For more information on Snappy/S2 differences see README in: https://github.com/klauspost/compress/tree/master/s2 -// -// There are actually two S2 formats: block and stream. They are related, -// but different: trying to decompress block-compressed data as a S2 stream -// will fail, and vice versa. The block format is the Decode and Encode -// functions and the stream format is the Reader and Writer types. -// -// A "better" compression option is available. This will trade some compression -// speed -// -// The block format, the more common case, is used when the complete size (the -// number of bytes) of the original data is known upfront, at the time -// compression starts. The stream format, also known as the framing format, is -// for when that isn't always true. -// -// Blocks to not offer much data protection, so it is up to you to -// add data validation of decompressed blocks. -// -// Streams perform CRC validation of the decompressed data. -// Stream compression will also be performed on multiple CPU cores concurrently -// significantly improving throughput. -package s2 - -import ( - "bytes" - "hash/crc32" -) - -/* -Each encoded block begins with the varint-encoded length of the decoded data, -followed by a sequence of chunks. Chunks begin and end on byte boundaries. The -first byte of each chunk is broken into its 2 least and 6 most significant bits -called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag. -Zero means a literal tag. All other values mean a copy tag. - -For literal tags: - - If m < 60, the next 1 + m bytes are literal bytes. - - Otherwise, let n be the little-endian unsigned integer denoted by the next - m - 59 bytes. The next 1 + n bytes after that are literal bytes. - -For copy tags, length bytes are copied from offset bytes ago, in the style of -Lempel-Ziv compression algorithms. In particular: - - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12). - The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10 - of the offset. The next byte is bits 0-7 of the offset. - - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65). - The length is 1 + m. The offset is the little-endian unsigned integer - denoted by the next 2 bytes. - - For l == 3, the offset ranges in [0, 1<<32) and the length in - [1, 65). The length is 1 + m. The offset is the little-endian unsigned - integer denoted by the next 4 bytes. -*/ -const ( - tagLiteral = 0x00 - tagCopy1 = 0x01 - tagCopy2 = 0x02 - tagCopy4 = 0x03 -) - -const ( - checksumSize = 4 - chunkHeaderSize = 4 - magicChunk = "\xff\x06\x00\x00" + magicBody - magicChunkSnappy = "\xff\x06\x00\x00" + magicBodySnappy - magicBodySnappy = "sNaPpY" - magicBody = "S2sTwO" - - // maxBlockSize is the maximum size of the input to encodeBlock. - // - // For the framing format (Writer type instead of Encode function), - // this is the maximum uncompressed size of a block. - maxBlockSize = 4 << 20 - - // minBlockSize is the minimum size of block setting when creating a writer. - minBlockSize = 4 << 10 - - // Default block size - defaultBlockSize = 1 << 20 - - // maxSnappyBlockSize is the maximum snappy block size. - maxSnappyBlockSize = 1 << 16 - - obufHeaderLen = checksumSize + chunkHeaderSize -) - -const ( - chunkTypeCompressedData = 0x00 - chunkTypeUncompressedData = 0x01 - chunkTypePadding = 0xfe - chunkTypeStreamIdentifier = 0xff -) - -var crcTable = crc32.MakeTable(crc32.Castagnoli) - -// crc implements the checksum specified in section 3 of -// https://github.com/google/snappy/blob/master/framing_format.txt -func crc(b []byte) uint32 { - c := crc32.Update(0, crcTable, b) - return c>>15 | c<<17 + 0xa282ead8 -} - -// literalExtraSize returns the extra size of encoding n literals. -// n should be >= 0 and <= math.MaxUint32. -func literalExtraSize(n int64) int64 { - if n == 0 { - return 0 - } - switch { - case n < 60: - return 1 - case n < 1<<8: - return 2 - case n < 1<<16: - return 3 - case n < 1<<24: - return 4 - default: - return 5 - } -} - -type byter interface { - Bytes() []byte -} - -var _ byter = &bytes.Buffer{} diff --git a/vendor/github.com/klauspost/compress/snappy/.gitignore b/vendor/github.com/klauspost/compress/snappy/.gitignore deleted file mode 100644 index 042091d9b3b..00000000000 --- a/vendor/github.com/klauspost/compress/snappy/.gitignore +++ /dev/null @@ -1,16 +0,0 @@ -cmd/snappytool/snappytool -testdata/bench - -# These explicitly listed benchmark data files are for an obsolete version of -# snappy_test.go. -testdata/alice29.txt -testdata/asyoulik.txt -testdata/fireworks.jpeg -testdata/geo.protodata -testdata/html -testdata/html_x_4 -testdata/kppkn.gtb -testdata/lcet10.txt -testdata/paper-100k.pdf -testdata/plrabn12.txt -testdata/urls.10K diff --git a/vendor/github.com/klauspost/compress/snappy/AUTHORS b/vendor/github.com/klauspost/compress/snappy/AUTHORS deleted file mode 100644 index 52ccb5a934d..00000000000 --- a/vendor/github.com/klauspost/compress/snappy/AUTHORS +++ /dev/null @@ -1,18 +0,0 @@ -# This is the official list of Snappy-Go authors for copyright purposes. -# This file is distinct from the CONTRIBUTORS files. -# See the latter for an explanation. - -# Names should be added to this file as -# Name or Organization -# The email address is not required for organizations. - -# Please keep the list sorted. - -Amazon.com, Inc -Damian Gryski -Eric Buth -Google Inc. -Jan Mercl <0xjnml@gmail.com> -Klaus Post -Rodolfo Carvalho -Sebastien Binet diff --git a/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS b/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS deleted file mode 100644 index ea6524ddd02..00000000000 --- a/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS +++ /dev/null @@ -1,41 +0,0 @@ -# This is the official list of people who can contribute -# (and typically have contributed) code to the Snappy-Go repository. -# The AUTHORS file lists the copyright holders; this file -# lists people. For example, Google employees are listed here -# but not in AUTHORS, because Google holds the copyright. -# -# The submission process automatically checks to make sure -# that people submitting code are listed in this file (by email address). -# -# Names should be added to this file only after verifying that -# the individual or the individual's organization has agreed to -# the appropriate Contributor License Agreement, found here: -# -# http://code.google.com/legal/individual-cla-v1.0.html -# http://code.google.com/legal/corporate-cla-v1.0.html -# -# The agreement for individuals can be filled out on the web. -# -# When adding J Random Contributor's name to this file, -# either J's name or J's organization's name should be -# added to the AUTHORS file, depending on whether the -# individual or corporate CLA was used. - -# Names should be added to this file like so: -# Name - -# Please keep the list sorted. - -Alex Legg -Damian Gryski -Eric Buth -Jan Mercl <0xjnml@gmail.com> -Jonathan Swinney -Kai Backman -Klaus Post -Marc-Antoine Ruel -Nigel Tao -Rob Pike -Rodolfo Carvalho -Russ Cox -Sebastien Binet diff --git a/vendor/github.com/klauspost/compress/snappy/LICENSE b/vendor/github.com/klauspost/compress/snappy/LICENSE deleted file mode 100644 index 6050c10f4c8..00000000000 --- a/vendor/github.com/klauspost/compress/snappy/LICENSE +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2011 The Snappy-Go Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/klauspost/compress/snappy/README.md b/vendor/github.com/klauspost/compress/snappy/README.md deleted file mode 100644 index 7abf6dfc92c..00000000000 --- a/vendor/github.com/klauspost/compress/snappy/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# snappy - -The Snappy compression format in the Go programming language. - -This is a plug-in replacement for `github.com/golang/snappy`. - -It provides full replacement of the Snappy package. - -See [Snappy Compatibility](https://github.com/klauspost/compress/tree/master/s2#snappy-compatibility) in the S2 documentation. - -"Better" compression mode is used. For buffered streams concurrent compression is used. - -For more options use the [s2 package](https://pkg.go.dev/github.com/klauspost/compress/s2). - -# usage - -Replace imports `github.com/golang/snappy` with `github.com/klauspost/compress/snappy`. diff --git a/vendor/github.com/klauspost/compress/snappy/decode.go b/vendor/github.com/klauspost/compress/snappy/decode.go deleted file mode 100644 index 89f1fa23444..00000000000 --- a/vendor/github.com/klauspost/compress/snappy/decode.go +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2011 The Snappy-Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package snappy - -import ( - "io" - - "github.com/klauspost/compress/s2" -) - -var ( - // ErrCorrupt reports that the input is invalid. - ErrCorrupt = s2.ErrCorrupt - // ErrTooLarge reports that the uncompressed length is too large. - ErrTooLarge = s2.ErrTooLarge - // ErrUnsupported reports that the input isn't supported. - ErrUnsupported = s2.ErrUnsupported -) - -const ( - // maxBlockSize is the maximum size of the input to encodeBlock. It is not - // part of the wire format per se, but some parts of the encoder assume - // that an offset fits into a uint16. - // - // Also, for the framing format (Writer type instead of Encode function), - // https://github.com/google/snappy/blob/master/framing_format.txt says - // that "the uncompressed data in a chunk must be no longer than 65536 - // bytes". - maxBlockSize = 65536 -) - -// DecodedLen returns the length of the decoded block. -func DecodedLen(src []byte) (int, error) { - return s2.DecodedLen(src) -} - -// Decode returns the decoded form of src. The returned slice may be a sub- -// slice of dst if dst was large enough to hold the entire decoded block. -// Otherwise, a newly allocated slice will be returned. -// -// The dst and src must not overlap. It is valid to pass a nil dst. -// -// Decode handles the Snappy block format, not the Snappy stream format. -func Decode(dst, src []byte) ([]byte, error) { - return s2.Decode(dst, src) -} - -// NewReader returns a new Reader that decompresses from r, using the framing -// format described at -// https://github.com/google/snappy/blob/master/framing_format.txt -func NewReader(r io.Reader) *Reader { - return s2.NewReader(r, s2.ReaderMaxBlockSize(maxBlockSize)) -} - -// Reader is an io.Reader that can read Snappy-compressed bytes. -// -// Reader handles the Snappy stream format, not the Snappy block format. -type Reader = s2.Reader diff --git a/vendor/github.com/klauspost/compress/snappy/encode.go b/vendor/github.com/klauspost/compress/snappy/encode.go deleted file mode 100644 index e8bd72c1864..00000000000 --- a/vendor/github.com/klauspost/compress/snappy/encode.go +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2011 The Snappy-Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package snappy - -import ( - "io" - - "github.com/klauspost/compress/s2" -) - -// Encode returns the encoded form of src. The returned slice may be a sub- -// slice of dst if dst was large enough to hold the entire encoded block. -// Otherwise, a newly allocated slice will be returned. -// -// The dst and src must not overlap. It is valid to pass a nil dst. -// -// Encode handles the Snappy block format, not the Snappy stream format. -func Encode(dst, src []byte) []byte { - return s2.EncodeSnappyBetter(dst, src) -} - -// MaxEncodedLen returns the maximum length of a snappy block, given its -// uncompressed length. -// -// It will return a negative value if srcLen is too large to encode. -func MaxEncodedLen(srcLen int) int { - return s2.MaxEncodedLen(srcLen) -} - -// NewWriter returns a new Writer that compresses to w. -// -// The Writer returned does not buffer writes. There is no need to Flush or -// Close such a Writer. -// -// Deprecated: the Writer returned is not suitable for many small writes, only -// for few large writes. Use NewBufferedWriter instead, which is efficient -// regardless of the frequency and shape of the writes, and remember to Close -// that Writer when done. -func NewWriter(w io.Writer) *Writer { - return s2.NewWriter(w, s2.WriterSnappyCompat(), s2.WriterBetterCompression(), s2.WriterFlushOnWrite(), s2.WriterConcurrency(1)) -} - -// NewBufferedWriter returns a new Writer that compresses to w, using the -// framing format described at -// https://github.com/google/snappy/blob/master/framing_format.txt -// -// The Writer returned buffers writes. Users must call Close to guarantee all -// data has been forwarded to the underlying io.Writer. They may also call -// Flush zero or more times before calling Close. -func NewBufferedWriter(w io.Writer) *Writer { - return s2.NewWriter(w, s2.WriterSnappyCompat(), s2.WriterBetterCompression()) -} - -// Writer is an io.Writer that can write Snappy-compressed bytes. -// -// Writer handles the Snappy stream format, not the Snappy block format. -type Writer = s2.Writer diff --git a/vendor/github.com/klauspost/compress/snappy/snappy.go b/vendor/github.com/klauspost/compress/snappy/snappy.go deleted file mode 100644 index 398cdc95a01..00000000000 --- a/vendor/github.com/klauspost/compress/snappy/snappy.go +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2011 The Snappy-Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package snappy implements the Snappy compression format. It aims for very -// high speeds and reasonable compression. -// -// There are actually two Snappy formats: block and stream. They are related, -// but different: trying to decompress block-compressed data as a Snappy stream -// will fail, and vice versa. The block format is the Decode and Encode -// functions and the stream format is the Reader and Writer types. -// -// The block format, the more common case, is used when the complete size (the -// number of bytes) of the original data is known upfront, at the time -// compression starts. The stream format, also known as the framing format, is -// for when that isn't always true. -// -// The canonical, C++ implementation is at https://github.com/google/snappy and -// it only implements the block format. -package snappy - -/* -Each encoded block begins with the varint-encoded length of the decoded data, -followed by a sequence of chunks. Chunks begin and end on byte boundaries. The -first byte of each chunk is broken into its 2 least and 6 most significant bits -called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag. -Zero means a literal tag. All other values mean a copy tag. - -For literal tags: - - If m < 60, the next 1 + m bytes are literal bytes. - - Otherwise, let n be the little-endian unsigned integer denoted by the next - m - 59 bytes. The next 1 + n bytes after that are literal bytes. - -For copy tags, length bytes are copied from offset bytes ago, in the style of -Lempel-Ziv compression algorithms. In particular: - - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12). - The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10 - of the offset. The next byte is bits 0-7 of the offset. - - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65). - The length is 1 + m. The offset is the little-endian unsigned integer - denoted by the next 2 bytes. - - For l == 3, this tag is a legacy format that is no longer issued by most - encoders. Nonetheless, the offset ranges in [0, 1<<32) and the length in - [1, 65). The length is 1 + m. The offset is the little-endian unsigned - integer denoted by the next 4 bytes. -*/ diff --git a/vendor/modules.txt b/vendor/modules.txt index a40494ddbcc..35cb23dc02d 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -746,8 +746,6 @@ github.com/klauspost/compress/fse github.com/klauspost/compress/gzip github.com/klauspost/compress/huff0 github.com/klauspost/compress/internal/snapref -github.com/klauspost/compress/s2 -github.com/klauspost/compress/snappy github.com/klauspost/compress/zstd github.com/klauspost/compress/zstd/internal/xxhash # github.com/klauspost/cpuid v1.3.1 From 0a2a3e9c6773fa2926fdf57d3dc905fc5e910558 Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Tue, 21 Sep 2021 14:50:53 -0400 Subject: [PATCH 2/2] remove s2 Signed-off-by: Joe Elliott --- tempodb/encoding/streaming_block_test.go | 64 ++++++++++++------------ 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/tempodb/encoding/streaming_block_test.go b/tempodb/encoding/streaming_block_test.go index a7e14106a58..038ae469914 100644 --- a/tempodb/encoding/streaming_block_test.go +++ b/tempodb/encoding/streaming_block_test.go @@ -9,6 +9,7 @@ import ( "io/ioutil" "math/rand" "os" + "path" "sort" "testing" "time" @@ -301,7 +302,7 @@ func streamingBlock(t *testing.T, cfg *BlockConfig, w backend.Writer) (*Streamin return block, ids, reqs } -const benchDownsample = 200 +const benchDownsample = 1024 * 1024 func BenchmarkWriteGzip(b *testing.B) { benchmarkCompressBlock(b, backend.EncGZIP, benchDownsample, false) @@ -319,7 +320,6 @@ func BenchmarkWriteLZ41M(b *testing.B) { func BenchmarkWriteNone(b *testing.B) { benchmarkCompressBlock(b, backend.EncNone, benchDownsample, false) } - func BenchmarkWriteZstd(b *testing.B) { benchmarkCompressBlock(b, backend.EncZstd, benchDownsample, false) } @@ -339,7 +339,6 @@ func BenchmarkReadLZ41M(b *testing.B) { func BenchmarkReadNone(b *testing.B) { benchmarkCompressBlock(b, backend.EncNone, benchDownsample, true) } - func BenchmarkReadZstd(b *testing.B) { benchmarkCompressBlock(b, backend.EncZstd, benchDownsample, true) } @@ -380,13 +379,13 @@ func benchmarkCompressBlock(b *testing.B, encoding backend.Encoding, indexDownsa b.ResetTimer() } - originatingMeta := backend.NewBlockMeta(testTenantID, uuid.New(), "should_be_ignored", encoding, "") block, err := NewStreamingBlock(&BlockConfig{ IndexDownsampleBytes: indexDownsample, BloomFP: .05, Encoding: encoding, IndexPageSizeBytes: 10 * 1024 * 1024, - }, originatingMeta.BlockID, originatingMeta.TenantID, []*backend.BlockMeta{originatingMeta}, originatingMeta.TotalObjects) + BloomShardSizeBytes: 100000, + }, uuid.New(), meta.TenantID, []*backend.BlockMeta{meta}, meta.TotalObjects) require.NoError(b, err, "unexpected error completing block") ctx := context.Background() @@ -395,8 +394,7 @@ func benchmarkCompressBlock(b *testing.B, encoding backend.Encoding, indexDownsa if err != io.EOF { require.NoError(b, err) } - - if id == nil { + if err == io.EOF { break } @@ -416,30 +414,30 @@ func benchmarkCompressBlock(b *testing.B, encoding backend.Encoding, indexDownsa return } - // todo: restore read benchmarks - // b.ResetTimer() - - // file, err := os.Open(block.fullFilename()) - // require.NoError(b, err) - // pr, err := v2.NewDataReader(backend.NewContextReaderWithAllReader(file), encoding) - // require.NoError(b, err) - - // var tempBuffer []byte - // o := v2.NewObjectReaderWriter() - // for { - // tempBuffer, _, err = pr.NextPage(tempBuffer) - // if err == io.EOF { - // break - // } - // require.NoError(b, err) - - // bufferReader := bytes.NewReader(tempBuffer) - - // for { - // _, _, err = o.UnmarshalObjectFromReader(bufferReader) - // if err == io.EOF { - // break - // } - // } - // } + b.ResetTimer() + + fullFilename := path.Join(backendTmpDir, block.compactedMeta.TenantID, block.compactedMeta.BlockID.String(), "data") + file, err := os.Open(fullFilename) + require.NoError(b, err) + pr, err := v2.NewDataReader(backend.NewContextReaderWithAllReader(file), encoding) + require.NoError(b, err) + + var tempBuffer []byte + o := v2.NewObjectReaderWriter() + for { + tempBuffer, _, err = pr.NextPage(tempBuffer) + if err == io.EOF { + break + } + require.NoError(b, err) + + bufferReader := bytes.NewReader(tempBuffer) + + for { + _, _, err = o.UnmarshalObjectFromReader(bufferReader) + if err == io.EOF { + break + } + } + } }