Skip to content

Commit

Permalink
eliasfano32: optimise reverse iterator (#10395)
Browse files Browse the repository at this point in the history
- Reduces memory allocations for reverse iterator from 49681997 B/op to
144 B/op
- Adds support for reverse binary search Seek to reverse iterator (we
can use that in `FrozenInvertedIdxIter.advanceInFiles` for more
performance - will add in a subsequent
[PR](#10397))

```
➜ go test -bench=. -benchmem
goos: darwin
goarch: arm64
pkg: github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32
BenchmarkName/next_to_value_1_000_000-8                     1113           1091076 ns/op             144 B/op          1 allocs/op
BenchmarkName/seek_to_value_1_000_000-8                  2000680               601.1 ns/op           144 B/op          1 allocs/op
BenchmarkName/reverse_next_to_value_1_230-8                    8         129665120 ns/op             152 B/op          2 allocs/op
BenchmarkName/reverse_seek_to_value_1_230-8              1000000              1062 ns/op             152 B/op          2 allocs/op
BenchmarkName/naive_reverse_iterator-8                         8         135305406 ns/op        49681997 B/op         42 allocs/op
BenchmarkName/reverse_iterator-8                               8         127542469 ns/op             144 B/op          1 allocs/op
PASS
ok      github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32  12.937s
```
  • Loading branch information
taratorio authored May 18, 2024
1 parent 36b3311 commit 58f6327
Show file tree
Hide file tree
Showing 3 changed files with 400 additions and 78 deletions.
203 changes: 151 additions & 52 deletions erigon-lib/recsplit/eliasfano32/elias_fano.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package eliasfano32

import (
"encoding/binary"
"errors"
"fmt"
"io"
"math"
Expand All @@ -26,12 +27,12 @@ import (
"unsafe"

"github.com/ledgerwatch/erigon-lib/common/bitutil"
"github.com/ledgerwatch/erigon-lib/kv/iter"
)

// EliasFano algo overview https://www.antoniomallia.it/sorted-integers-compression-with-elias-fano-encoding.html
// P. Elias. Efficient storage and retrieval by content and address of static files. J. ACM, 21(2):246–260, 1974.
// Partitioned Elias-Fano Indexes http://groups.di.unipi.it/~ottavian/files/elias_fano_sigir14.pdf
// Quasi-Succinct Indices, Sebastiano Vigna https://arxiv.org/pdf/1206.4300

const (
log2q uint64 = 8
Expand All @@ -43,6 +44,8 @@ const (
superQSize uint64 = 1 + qPerSuperQ/2 // 1 + 64/2 = 33
)

var ErrEliasFanoIterExhausted = errors.New("elias fano iterator exhausted")

// EliasFano can be used to encode one monotone sequence
type EliasFano struct {
data []uint64
Expand Down Expand Up @@ -228,33 +231,52 @@ func Seek(data []byte, n uint64) (uint64, bool) {
return ef.Search(n)
}

// Search returns the value in the sequence, equal or greater than given value
func (ef *EliasFano) search(v uint64) (nextV uint64, nextI uint64, ok bool) {
func (ef *EliasFano) search(v uint64, reverse bool) (nextV uint64, nextI uint64, ok bool) {
if v == 0 {
if reverse {
return 0, 0, ef.Min() == 0
}
return ef.Min(), 0, true
}
if v == ef.Max() {
return ef.Max(), ef.count, true
}
if v > ef.Max() {
if reverse {
return ef.Max(), ef.count, true
}
return 0, 0, false
}

hi := v >> ef.l
i := sort.Search(int(ef.count+1), func(i int) bool {
if reverse {
return ef.upper(ef.count-uint64(i)) <= hi
}
return ef.upper(uint64(i)) >= hi
})
for j := uint64(i); j <= ef.count; j++ {
val, _, _, _, _ := ef.get(j)
if val >= v {
return val, j, true
if reverse {
for j := uint64(i); j <= ef.count; j++ {
idx := ef.count - j
val, _, _, _, _ := ef.get(idx)
if val <= v {
return val, idx, true
}
}
} else {
for j := uint64(i); j <= ef.count; j++ {
val, _, _, _, _ := ef.get(j)
if val >= v {
return val, j, true
}
}
}
return 0, 0, false
}

// Search returns the value in the sequence, equal or greater than given value
func (ef *EliasFano) Search(v uint64) (uint64, bool) {
n, _, ok := ef.search(v)
n, _, ok := ef.search(v, false /* reverse */)
return n, ok
}

Expand All @@ -271,20 +293,34 @@ func (ef *EliasFano) Count() uint64 {
}

func (ef *EliasFano) Iterator() *EliasFanoIter {
return &EliasFanoIter{ef: ef, upperMask: 1, upperStep: uint64(1) << ef.l, lowerBits: ef.lowerBits, upperBits: ef.upperBits, count: ef.count, l: ef.l, lowerBitsMask: ef.lowerBitsMask}
}
func (ef *EliasFano) ReverseIterator() *iter.ArrStream[uint64] {
//TODO: this is very un-optimal, need implement proper reverse-iterator
it := ef.Iterator()
var values []uint64
for it.HasNext() {
v, err := it.Next()
if err != nil {
panic(err)
}
values = append(values, v)
it := &EliasFanoIter{
ef: ef,
lowerBits: ef.lowerBits,
upperBits: ef.upperBits,
count: ef.count,
lowerBitsMask: ef.lowerBitsMask,
l: ef.l,
upperStep: uint64(1) << ef.l,
}
return iter.ReverseArray[uint64](values)

it.init()
return it
}

func (ef *EliasFano) ReverseIterator() *EliasFanoIter {
it := &EliasFanoIter{
ef: ef,
lowerBits: ef.lowerBits,
upperBits: ef.upperBits,
count: ef.count,
lowerBitsMask: ef.lowerBitsMask,
l: ef.l,
upperStep: uint64(1) << ef.l,
reverse: true,
}

it.init()
return it
}

type EliasFanoIter struct {
Expand All @@ -297,74 +333,105 @@ type EliasFanoIter struct {
lowerBitsMask uint64
l uint64
upperStep uint64
reverse bool

//fields of current value
upper uint64
upperIdx uint64

//fields of next value
idx uint64
lowerIdx uint64
upperMask uint64

itemsIterated uint64
}

func (efi *EliasFanoIter) Close() {}
func (efi *EliasFanoIter) HasNext() bool {
return efi.idx <= efi.count
return efi.itemsIterated <= efi.count
}

func (efi *EliasFanoIter) Reset() {
efi.upperMask = 1
efi.upperStep = uint64(1) << efi.l
efi.upperIdx = 0

efi.upper = 0
efi.upperIdx = 0
efi.lowerIdx = 0
efi.idx = 0
efi.upperMask = 0
efi.itemsIterated = 0
efi.init()
}

func (efi *EliasFanoIter) SeekDeprecated(n uint64) {
efi.Reset()
_, i, ok := efi.ef.search(n)
if !ok {
efi.idx = efi.count + 1
func (efi *EliasFanoIter) init() {
if efi.itemsIterated != 0 {
return
}
for j := uint64(0); j < i; j++ {
efi.increment()

if efi.reverse {
higherBitsMaxValue := efi.ef.u >> efi.l
lastUpperBitIdx := (efi.ef.count + 1) + higherBitsMaxValue
efi.upperMask = 1 << (lastUpperBitIdx % 64) >> 1 // last bit is always 0 delimiter, so we move >> 1
efi.upperIdx = lastUpperBitIdx / 64
efi.upper = higherBitsMaxValue << efi.l
efi.lowerIdx = efi.count * efi.l
} else {
efi.upperMask = 1
}
//fmt.Printf("seek: efi.upperMask(%d)=%d, upperIdx=%d, lowerIdx=%d, idx=%d\n", n, bits.TrailingZeros64(efi.upperMask), efi.upperIdx, efi.lowerIdx, efi.idx)
//fmt.Printf("seek: efi.upper=%d\n", efi.upper)
}

func (efi *EliasFanoIter) Seek(n uint64) {
//fmt.Printf("b seek2: efi.upperMask(%d)=%d, upperIdx=%d, lowerIdx=%d, idx=%d\n", n, bits.TrailingZeros64(efi.upperMask), efi.upperIdx, efi.lowerIdx, efi.idx)
//fmt.Printf("b seek2: efi.upperMask(%d)=%d, upperIdx=%d, lowerIdx=%d, itemsIterated=%d\n", n, bits.TrailingZeros64(efi.upperMask), efi.upperIdx, efi.lowerIdx, efi.itemsIterated)
//fmt.Printf("b seek2: efi.upper=%d\n", efi.upper)
efi.Reset()
nn, nextI, ok := efi.ef.search(n)
nn, nextI, ok := efi.ef.search(n, efi.reverse)
_ = nn
if !ok {
efi.idx = efi.count + 1
efi.itemsIterated = efi.count + 1
return
}
if nextI == 0 {
if nextI == 0 && !efi.reverse {
return
}
if nextI == efi.count && efi.reverse {
return
}

if efi.reverse {
efi.itemsIterated = efi.count - nextI

// fields of current value
v, _, sel, currWords, lower := efi.ef.get(nextI - 1) //TODO: search can return same info
efi.upper = v &^ (lower & efi.ef.lowerBitsMask)
efi.upperIdx = currWords
// fields of current value
v, _, sel, currWords, lower := efi.ef.get(nextI + 1)
efi.upper = v &^ (lower & efi.ef.lowerBitsMask)
efi.upperIdx = currWords

// fields of next value
efi.lowerIdx -= efi.itemsIterated * efi.l
efi.upperMask = 1 << (sel - 1)
} else {
efi.itemsIterated = nextI

// fields of next value
efi.lowerIdx = nextI * efi.l
efi.idx = nextI
efi.upperMask = 1 << (sel + 1)
// fields of current value
v, _, sel, currWords, lower := efi.ef.get(nextI - 1)
efi.upper = v &^ (lower & efi.ef.lowerBitsMask)
efi.upperIdx = currWords

//fmt.Printf("seek2: efi.upperMask(%d)=%d, upperIdx=%d, lowerIdx=%d, idx=%d\n", n, bits.TrailingZeros64(efi.upperMask), efi.upperIdx, efi.lowerIdx, efi.idx)
// fields of next value
efi.lowerIdx = nextI * efi.l
efi.upperMask = 1 << (sel + 1)
}

//fmt.Printf("seek2: efi.upperMask(%d)=%d, upperIdx=%d, lowerIdx=%d, itemsIterated=%d\n", n, bits.TrailingZeros64(efi.upperMask), efi.upperIdx, efi.lowerIdx, efi.itemsIterated)
//fmt.Printf("seek2: efi.upper=%d\n", efi.upper)
}

func (efi *EliasFanoIter) moveNext() {
if efi.reverse {
efi.decrement()
} else {
efi.increment()
}

efi.itemsIterated++
}

func (efi *EliasFanoIter) increment() {
if efi.upperMask == 0 {
efi.upperIdx++
Expand All @@ -380,16 +447,48 @@ func (efi *EliasFanoIter) increment() {
}
efi.upperMask <<= 1
efi.lowerIdx += efi.l
efi.idx++
}

func (efi *EliasFanoIter) decrement() {
if efi.upperMask == 0 {
if efi.upperIdx == 0 {
panic("decrement: unexpected efi.upperIdx underflow")
}
efi.upperIdx--
efi.upperMask = 1 << 63
}

for efi.upperBits[efi.upperIdx]&efi.upperMask == 0 {
if efi.upper < efi.upperStep {
panic("decrement: unexpected efi.upper underflow")
}
efi.upper -= efi.upperStep
efi.upperMask >>= 1
if efi.upperMask == 0 {
if efi.upperIdx == 0 {
panic("decrement: unexpected efi.upperIdx underflow")
}
efi.upperIdx--
efi.upperMask = 1 << 63
}
}

// note: there can be an underflow here after the last Next()
// but that is ok since we are protected from ErrEliasFanoIterExhausted
efi.lowerIdx -= efi.l
efi.upperMask >>= 1
}

func (efi *EliasFanoIter) Next() (uint64, error) {
if !efi.HasNext() {
return 0, ErrEliasFanoIterExhausted
}
idx64, shift := efi.lowerIdx/64, efi.lowerIdx%64
lower := efi.lowerBits[idx64] >> shift
if shift > 0 {
lower |= efi.lowerBits[idx64+1] << (64 - shift)
}
efi.increment()
efi.moveNext()
return efi.upper | (lower & efi.lowerBitsMask), nil
}

Expand Down
37 changes: 34 additions & 3 deletions erigon-lib/recsplit/eliasfano32/elias_fano_fuzz_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,49 @@ func FuzzSingleEliasFano(f *testing.F) {
ef.Build()

// Try to read from ef
for i := 0; i < count; i++ {
for i := range keys {
if ef.Get(uint64(i)) != keys[i] {
t.Fatalf("i %d: got %d, expected %d", i, ef.Get(uint64(i)), keys[i])
}
}

var i int
it := ef.Iterator()
for it.HasNext() {
it.Next()
v, err := it.Next()
if err != nil {
t.Fatalf("it.next: got err: %v", err)
}
if v != keys[i] {
t.Fatalf("it.next: got %d, expected %d", v, keys[i])
}
i++
}
if i != len(keys) {
t.Fatalf("it.len: got %d, expected %d", i, len(keys))
}

i--
rit := ef.ReverseIterator()
for rit.HasNext() {
v, err := rit.Next()
if err != nil {
t.Fatalf("rit.next: got err: %v", err)
}
if v != keys[i] {
t.Fatalf("rit.next: got %d, expected %d, i %d, len, %d", v, keys[i], i, len(keys))
}
i--
}
if i != -1 {
t.Fatalf("rit.len: got %d, expected %d", i, -1)
}

buf := bytes.NewBuffer(nil)
ef.Write(buf)
err := ef.Write(buf)
if err != nil {
t.Fatalf("write: got err: %v", err)
}
if ef.Max() != Max(buf.Bytes()) {
t.Fatalf("max: got %d, expected %d", ef.Max(), Max(buf.Bytes()))
}
Expand Down
Loading

0 comments on commit 58f6327

Please sign in to comment.