Combine memcached/redis with in-memory cache #5509
A custom implementation of multi-level caching might be good for this. We could have a new implementation of …
Hi @GiedriusS, if you are interested in using Redis' client-side caching, here is an example rueidis implementation similar to the `cacheutil` package:
```go
package cacheutil

import (
	"context"
	"net"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/rueian/rueidis"
)

// RueidisClient is a wrapper of rueidis.Client.
type RueidisClient struct {
	client rueidis.Client
	config RedisClientConfig
	logger log.Logger

	durationSet      prometheus.Observer
	durationSetMulti prometheus.Observer
	durationGetMulti prometheus.Observer
}

// NewRueidisClient makes a new RueidisClient.
func NewRueidisClient(logger log.Logger, name string, conf []byte, reg prometheus.Registerer) (*RueidisClient, error) {
	config, err := parseRedisClientConfig(conf)
	if err != nil {
		return nil, err
	}
	return NewRueidisClientWithConfig(logger, name, config, reg)
}

// NewRueidisClientWithConfig makes a new RueidisClient with the given config.
func NewRueidisClientWithConfig(logger log.Logger, name string, config RedisClientConfig,
	reg prometheus.Registerer) (*RueidisClient, error) {
	if err := config.validate(); err != nil {
		return nil, err
	}
	client, err := rueidis.NewClient(rueidis.ClientOption{
		InitAddress:      []string{config.Addr},
		Username:         config.Username,
		Password:         config.Password,
		SelectDB:         config.DB,
		Dialer:           net.Dialer{Timeout: config.DialTimeout},
		ConnWriteTimeout: config.WriteTimeout,
	})
	if err != nil {
		return nil, err
	}
	if reg != nil {
		reg = prometheus.WrapRegistererWith(prometheus.Labels{"name": name}, reg)
	}
	c := &RueidisClient{
		client: client,
		config: config,
		logger: logger,
	}
	duration := promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
		Name:    "thanos_redis_operation_duration_seconds",
		Help:    "Duration of operations against redis.",
		Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 3, 6, 10},
	}, []string{"operation"})
	c.durationSet = duration.WithLabelValues(opSet)
	c.durationSetMulti = duration.WithLabelValues(opSetMulti)
	c.durationGetMulti = duration.WithLabelValues(opGetMulti)
	return c, nil
}

// SetAsync implements RemoteCacheClient.
func (c *RueidisClient) SetAsync(ctx context.Context, key string, value []byte, ttl time.Duration) error {
	start := time.Now()
	if err := c.client.Do(ctx, c.client.B().Set().Key(key).Value(rueidis.BinaryString(value)).ExSeconds(int64(ttl.Seconds())).Build()).Error(); err != nil {
		level.Warn(c.logger).Log("msg", "failed to set item into redis", "err", err, "key", key, "value_size", len(value))
		return nil
	}
	c.durationSet.Observe(time.Since(start).Seconds())
	return nil
}

// SetMulti sets multiple keys and values.
func (c *RueidisClient) SetMulti(ctx context.Context, data map[string][]byte, ttl time.Duration) {
	if len(data) == 0 {
		return
	}
	start := time.Now()
	sets := make(rueidis.Commands, 0, len(data))
	ittl := int64(ttl.Seconds())
	for k, v := range data {
		sets = append(sets, c.client.B().Setex().Key(k).Seconds(ittl).Value(rueidis.BinaryString(v)).Build())
	}
	for _, resp := range c.client.DoMulti(ctx, sets...) {
		if err := resp.Error(); err != nil {
			level.Warn(c.logger).Log("msg", "failed to set multi items from redis", "err", err, "items", len(data))
			return
		}
	}
	c.durationSetMulti.Observe(time.Since(start).Seconds())
}

// GetMulti implements RemoteCacheClient.
func (c *RueidisClient) GetMulti(ctx context.Context, keys []string) map[string][]byte {
	if len(keys) == 0 {
		return nil
	}
	start := time.Now()
	results := make(map[string][]byte, len(keys))
	resps, err := rueidis.MGetCache(c.client, ctx, time.Hour, keys)
	if err != nil {
		level.Warn(c.logger).Log("msg", "failed to mget items from redis", "err", err, "items", len(resps))
	}
	for key, resp := range resps {
		if val, err := resp.ToString(); err == nil {
			results[key] = stringToBytes(val)
		}
	}
	c.durationGetMulti.Observe(time.Since(start).Seconds())
	return results
}

// Stop implements RemoteCacheClient.
func (c *RueidisClient) Stop() {
	c.client.Close()
}
```

Sadly, the …
@rueian I have tried it out, however I get a panic:

Mhm, how could we know what key slots our cache keys have?
Hi @GiedriusS, I guess you are connecting to a Redis Cluster, and unfortunately Redis Cluster does not allow an MGET command whose keys hash to different slots. I will see what I can do for this use case.
Yeah, I had to set up Redis Cluster because it handles distributing load according to key hashes for the client, if I understood correctly :/ I was under the initial impression that the client itself needs to do it, but it seems that in the Redis world, Redis Cluster takes care of that for the client.
Hi @GiedriusS, Redis Cluster nodes only check whether the incoming key slot belongs to them; the Redis client is responsible for sending commands to the right node. I have added a new …
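For context, a client can compute a key's slot itself: Redis Cluster assigns every key to one of 16384 slots via CRC16 (the XMODEM variant) of the key, honoring `{...}` hash tags so that related keys can be forced onto the same slot (and therefore into one MGET). A minimal sketch of the slot computation — illustrative only, not rueidis' internal code:

```go
package main

import (
	"fmt"
	"strings"
)

// crc16 implements the CRC16-XMODEM variant used by Redis Cluster
// (polynomial 0x1021, initial value 0, no input/output reflection).
func crc16(data []byte) uint16 {
	var crc uint16
	for _, b := range data {
		crc ^= uint16(b) << 8
		for i := 0; i < 8; i++ {
			if crc&0x8000 != 0 {
				crc = crc<<1 ^ 0x1021
			} else {
				crc <<= 1
			}
		}
	}
	return crc
}

// keySlot returns the Redis Cluster slot for key. If the key contains a
// non-empty {hash tag}, only the tag is hashed, so keys sharing a tag
// always land on the same slot.
func keySlot(key string) uint16 {
	if open := strings.IndexByte(key, '{'); open >= 0 {
		// Only a non-empty tag counts; "{}" hashes the whole key.
		if end := strings.IndexByte(key[open+1:], '}'); end > 0 {
			key = key[open+1 : open+1+end]
		}
	}
	return crc16([]byte(key)) % 16384
}

func main() {
	fmt.Println(keySlot("{user1000}.following") == keySlot("{user1000}.followers")) // prints true
}
```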
Use the new helper introduced [here](thanos-io#5509 (comment)). Signed-off-by: Giedrius Statkevičius <[email protected]>
Nice, it works fantastically! Thank you so much for your prompt responses and amazing library 🍻 Maybe you have a donation link somewhere? (: FYI, here's a graph showing network usage on my servers running Thanos: I can also see in the Redis dashboards that the servers are getting PTTL commands, which means that client-side caching works as intended 🎉

The only thing missing in rueidis, IMO, is the ability to use something other than the in-memory cache. Perhaps that could be exposed as an interface through client options so that we could try different cache providers, such as Badger? If you would be open to such a change, I could try implementing it. But anyway, that's for another time 😄
Happy to see it works! I don't have a donation link, but if you can share rueidis with your friends, that would be my honor.
Good idea! Currently, there is only an internal interface: But I think it wouldn't be very different if I exposed it publicly.
Please let me know if you have any further issues with rueidis. I am very happy to help. |
I have noticed a small deadlock in rueidis from the new MGet helper. Uploading a goroutine dump from my Thanos Store. I think it only occurs under heavy load. I'll try looking into it, but I'm uploading this in advance 🙏 I hope you'll have some time to take a look.
Hi @GiedriusS, I am looking into that, but I haven't found the cause yet. Could you also try removing the ReadTimeout and see if it happens again? While trying to find the cause of this deadlock, I fixed another critical bug in …
Hi @GiedriusS, bugs related to the ReadTimeout have been fixed in v0.0.68. Could you give it a try?
Thank you so much! Will take a look. I only got bitten by that problem twice, I think. A bit busy, but I'm also about to submit a proposal to Thanos for supporting multiple clients for the same caching systems, so that we can finally get rueidis into Thanos.
Hello 👋 Looks like there was no activity on this issue for the last two months.
I think we still want to implement a two-layer cache when using memcached?
We added a multi-level cache implementation in Cortex; like @fpetkovski said, it is just a wrapper: cortexproject/cortex#5451. We need this because we use memcached as our cache, which lacks the client-side caching support that Redis has.
Is your proposal related to a problem?
Our current Memcached & Redis implementations are good at reducing cost; however, they don't improve query durations as much, because each time a query comes in, Thanos Store has to retrieve each key from Memcached/Redis. This saturates the network. We could do better by caching some data in memory so that it wouldn't need to be retrieved every time.
Describe the solution you'd like
For Redis, we could use client-side caching: the server tracks requested keys and then sends us a notification if a key expires.
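Conceptually, the client keeps a local copy of each fetched key and drops it when the server pushes an invalidation message for it (the CLIENT TRACKING / RESP3 push mechanism). A toy illustration of just the local bookkeeping — the server side is out of scope and none of this is the rueidis implementation:

```go
package main

import (
	"fmt"
	"sync"
)

// trackedCache mimics the idea behind Redis client-side caching:
// values fetched from the server are kept locally and dropped when
// the server pushes an invalidation for the key.
type trackedCache struct {
	mu     sync.Mutex
	values map[string][]byte
}

func newTrackedCache() *trackedCache {
	return &trackedCache{values: map[string][]byte{}}
}

// Store records a value after it has been fetched from the server.
func (c *trackedCache) Store(key string, value []byte) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.values[key] = value
}

// Load returns the locally cached value, if the server has not
// invalidated it yet.
func (c *trackedCache) Load(key string) ([]byte, bool) {
	c.mu.Lock()
	defer c.mu.Unlock()
	v, ok := c.values[key]
	return v, ok
}

// Invalidate is what the connection's push-message reader would call
// upon receiving an invalidation message for key.
func (c *trackedCache) Invalidate(key string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	delete(c.values, key)
}

func main() {
	c := newTrackedCache()
	c.Store("series:1", []byte("chunks"))
	_, ok := c.Load("series:1")
	fmt.Println("cached:", ok)
	c.Invalidate("series:1") // server pushed an invalidation
	_, ok = c.Load("series:1")
	fmt.Println("cached after invalidation:", ok)
}
```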
For Memcached, we could save keys in memory during SetAsync(). The only downside: Memcached could expire some keys while we still have them in memory. In our case, since we use `--max-time` and `--min-time`, TTLs don't matter as much, because users only see data from Thanos Store after a few days have passed.

Describe alternatives you've considered
Groupcache - however, it is very slow due to contention on one `map` (link) and because retrieval happens key-by-key. I have tried implementing multi-key fetch in our thanos-community fork, but it is still not finished because of the complexity. As an alternative solution, we could implement this, which is somewhere in the middle.

Additional context
There are some Go libraries for implementing a multi-level cache however they don't support batching multiple Get requests into one (at least I haven't found anything).