diff --git a/items.c b/items.c index a743fb577f..7ce6b15216 100644 --- a/items.c +++ b/items.c @@ -398,6 +398,15 @@ char *do_item_cachedump(const unsigned int slabs_clsid, const unsigned int limit return buffer; } +void item_stats_evictions(uint64_t *evicted) { + int i; + mutex_lock(&cache_lock); + for (i = 0; i < LARGEST_ID; i++) { + evicted[i] = itemstats[i].evicted; + } + pthread_mutex_unlock(&cache_lock); +} + void do_item_stats(ADD_STAT add_stats, void *c) { int i; for (i = 0; i < LARGEST_ID; i++) { diff --git a/items.h b/items.h index fc7b85eab7..2ec142dba6 100644 --- a/items.h +++ b/items.h @@ -24,3 +24,4 @@ item *do_item_get(const char *key, const size_t nkey, const uint32_t hv); item *do_item_touch(const char *key, const size_t nkey, uint32_t exptime, const uint32_t hv); void item_stats_reset(void); extern pthread_mutex_t cache_lock; +void item_stats_evictions(uint64_t *evicted); diff --git a/memcached.c b/memcached.c index e89c555d8b..496ec13ce4 100644 --- a/memcached.c +++ b/memcached.c @@ -3189,6 +3189,26 @@ static void process_verbosity_command(conn *c, token_t *tokens, const size_t nto return; } +static void process_slabs_automove_command(conn *c, token_t *tokens, const size_t ntokens) { + unsigned int level; + + assert(c != NULL); + + set_noreply_maybe(c, tokens, ntokens); + + level = strtoul(tokens[2].value, NULL, 10); + if (level == 0) { + settings.slab_automove = false; + } else if (level == 1) { + settings.slab_automove = true; + } else { + out_string(c, "ERROR"); + return; + } + out_string(c, "OK"); + return; +} + static void process_command(conn *c, char *command) { token_t tokens[MAX_TOKENS]; @@ -3303,45 +3323,51 @@ static void process_command(conn *c, char *command) { conn_set_state(c, conn_closing); - } else if (ntokens == 5 && (strcmp(tokens[COMMAND_TOKEN].value, "slabs") == 0 && - strcmp(tokens[COMMAND_TOKEN + 1].value, "reassign") == 0)) { - int src, dst, rv; + } else if (strcmp(tokens[COMMAND_TOKEN].value, "slabs") == 0) { + if (ntokens 
== 5 && strcmp(tokens[COMMAND_TOKEN + 1].value, "reassign") == 0) { + int src, dst, rv; - if (settings.slab_reassign == false) { - out_string(c, "CLIENT_ERROR slab reassignment disabled"); - return; - } + if (settings.slab_reassign == false) { + out_string(c, "CLIENT_ERROR slab reassignment disabled"); + return; + } - src = strtol(tokens[2].value, NULL, 10); - dst = strtol(tokens[3].value, NULL, 10); + src = strtol(tokens[2].value, NULL, 10); + dst = strtol(tokens[3].value, NULL, 10); - if (errno == ERANGE) { - out_string(c, "CLIENT_ERROR bad command line format"); - return; - } + if (errno == ERANGE) { + out_string(c, "CLIENT_ERROR bad command line format"); + return; + } - rv = slabs_reassign(src, dst); - switch (rv) { - case REASSIGN_OK: - out_string(c, "OK"); - break; - case REASSIGN_RUNNING: - out_string(c, "BUSY"); - break; - case REASSIGN_BADCLASS: - out_string(c, "BADCLASS"); - break; - case REASSIGN_NOSPARE: - out_string(c, "NOSPARE"); - break; - case REASSIGN_DEST_NOT_FULL: - out_string(c, "NOTFULL"); - break; - case REASSIGN_SRC_NOT_SAFE: - out_string(c, "UNSAFE"); - break; + rv = slabs_reassign(src, dst); + switch (rv) { + case REASSIGN_OK: + out_string(c, "OK"); + break; + case REASSIGN_RUNNING: + out_string(c, "BUSY"); + break; + case REASSIGN_BADCLASS: + out_string(c, "BADCLASS"); + break; + case REASSIGN_NOSPARE: + out_string(c, "NOSPARE"); + break; + case REASSIGN_DEST_NOT_FULL: + out_string(c, "NOTFULL"); + break; + case REASSIGN_SRC_NOT_SAFE: + out_string(c, "UNSAFE"); + break; + } + return; + } else if (ntokens == 4 && + (strcmp(tokens[COMMAND_TOKEN + 1].value, "automove") == 0)) { + process_slabs_automove_command(c, tokens, ntokens); + } else { + out_string(c, "ERROR"); } - return; } else if ((ntokens == 3 || ntokens == 4) && (strcmp(tokens[COMMAND_TOKEN].value, "verbosity") == 0)) { process_verbosity_command(c, tokens, ntokens); } else { diff --git a/scripts/mc_slab_mover b/scripts/mc_slab_mover new file mode 100755 index 
0000000000..d7bd5e4839 --- /dev/null +++ b/scripts/mc_slab_mover @@ -0,0 +1,260 @@ +#! /usr/bin/perl +# See memcached for LICENSE +# Copyright 2011 Dormando (dormando@rydia.net) + +=head1 NAME + +mc_slab_mover -- example utility for slab page reassignment for memcached + +=head1 SYNOPSIS + + $ mc_slab_mover --host="127.0.0.1:11211" --verbose + $ mc_slab_mover --host="127.0.0.1:11211" --automove + $ mc_slab_mover --host="127.0.0.1:11211" --sleep=60 --loops=4 --automove + +=head1 DESCRIPTION + +This utility is an example implementation of an algorithm for reassigning +slab memory in a running memcached instance. If memcached's built-in +automover isn't working for you, you may use this script as an example +base and expand on it. We welcome modifications or alternatives on the +mailing list. + +=head1 ALGORITHM + +The default algorithm is simple, and may serve for a common case: over +time one slab may grow in use compared to others, and as evictions stop +in one slab and start in another it will reassign memory. + +If a slab has the most evictions three times in a row, it will pull a page +from a slab which has had zero evictions three times in a row. + +There are many traffic patterns where this does not work well. For example, if you +never use expirations and rely on the LRU (so all slabs always evict), +it will not be as likely to find source pages to move. + +=head1 OPTIONS + +=over + +=item --host="IP:PORT" + +The hostname to connect to. NOTE: If the connection to the host breaks, the script +will stop. + +=item --sleep=10 + +How long to wait, in seconds, between loops for gathering stats. + +=item --loops=3 + +How many loops to run before making a decision for a move. + +=item --verbose + +Prints a formatted dump of some common statistics per loop. + +=item --automove + +Enables the automover, which will attempt to move memory around if it finds +viable candidates. 
+ +=back + +=head1 AUTHOR + +Dormando (dormando@rydia.net) + +=head1 LICENSE + +Licensed for use and redistribution under the same terms as Memcached itself. + +=cut + +use warnings; +use strict; + +use IO::Socket::INET; + +use FindBin; +use Data::Dumper qw/Dumper/; +use Getopt::Long; + +my %opts = ('sleep' => 10, automove => 0, verbose => 0, loops => 3); +GetOptions( + "host=s" => \$opts{host}, + "sleep=i" => \$opts{'sleep'}, + "loops=i" => \$opts{loops}, + "automove" => \$opts{automove}, + "verbose" => \$opts{verbose}, + ) or usage(); + +die "Must specify at least --host='127.0.0.1:11211'" unless $opts{host}; +my $sock = IO::Socket::INET->new(PeerAddr => $opts{host}, + Timeout => 3); +die "$!\n" unless $sock; + +my %stats = (); +my %move = (winner => 0, wins => 0); + +$SIG{INT} = sub { + print "STATS: ", Dumper(\%stats), "\n"; + exit; +}; +$SIG{USR1} = sub { + print "STATS: ", Dumper(\%stats), "\n"; +}; +run(); + +sub usage { + print qq{Usage: + mc_slab_mover --host="127.0.0.1:11211" --verbose --automove + run `perldoc mc_slab_mover` for full information + +}; + exit 1; +} + +sub run { + my $slabs_before = grab_stats(); + + while (1) { + sleep $opts{'sleep'}; + my $slabs_after = grab_stats(); + + my ($totals, $sorted) = calc_results_evicted($slabs_before, $slabs_after); +# my ($totals, $sorted) = calc_results_numratio($slabs_before, $slabs_after); + + my $pct = sub { + my ($num, $divisor) = @_; + return 0 unless $divisor; + return (100 * $num / $divisor); # scale to a percentage: the formats below append a literal % + }; + if ($opts{verbose}) { + printf " %02s: %-8s (pct ) %-10s (pct ) %-6s (pct ) get_hits (pct ) cmd_set (pct )\n", + 'sb', 'evicted', 'items', 'pages'; + for my $slab (@$sorted) { + printf " %02d: %-8d (%.2f%%) %-10s (%.4f%%) %-6d (%.2f%%) %-8d (%.3f%%) %-7d (%.2f%%)\n", + $slab->{slab}, $slab->{evicted_d}, + $pct->($slab->{evicted_d}, $totals->{evicted_d}), + $slab->{number}, + $pct->($slab->{number}, $totals->{number}), + $slab->{total_pages}, + $pct->($slab->{total_pages}, $totals->{total_pages}), + $slab->{get_hits_d}, 
+ $pct->($slab->{get_hits_d}, $totals->{get_hits_d}), + $slab->{cmd_set_d}, + $pct->($slab->{cmd_set_d}, $totals->{cmd_set_d}); + } + } + + next unless @$sorted; + my $highest = $sorted->[-1]; + $stats{$highest->{slab}}++; + print " (winner: ", $highest->{slab}, " wins: ", $stats{$highest->{slab}}, ")\n"; + automove_basic($totals, $sorted) if ($opts{automove}); + + $slabs_before = $slabs_after; + } +} + +sub grab_stats { + my %slabs = (); + for my $stat (qw/items slabs/) { + print $sock "stats $stat\r\n"; + while (my $line = <$sock>) { + chomp $line; + last if ($line =~ m/^END/); + if ($line =~ m/^STAT (?:items:)?(\d+):(\S+) (\S+)/) { + my ($slab, $var, $val) = ($1, $2, $3); + $slabs{$slab}->{$var} = $val; + } + } + } + + return \%slabs; +} + +# Really stupid algo, same as the initial algo built into memcached. +# If a slab "wins" most evictions --loops times in a row (3 by default), pick a slab with +# more than 2 pages that has had 0 evictions just as many times in a row and move a page over. +sub automove_basic { + my ($totals, $sorted) = @_; + + my $source = 0; + my $dest = 0; + my $high = $sorted->[-1]; + return unless $high->{evicted_d} > 0; + if ($move{winner} == $high->{slab}) { + $move{wins}++; + $dest = $move{winner} if $move{wins} >= $opts{loops}; + } else { + $move{wins} = 1; + $move{winner} = $high->{slab}; + } + for my $slab (@$sorted) { + my $id = $slab->{slab}; + if ($slab->{evicted_d} == 0 && $slab->{total_pages} > 2) { + $move{zeroes}->{$id}++; + $source = $id if (!$source && $move{zeroes}->{$id} >= $opts{loops}); + } else { + delete $move{zeroes}->{$slab->{slab}} + if exists $move{zeroes}->{$slab->{slab}}; + } + } + + if ($source && $dest) { + print " slabs reassign $source $dest\n"; + print $sock "slabs reassign $source $dest\r\n"; + my $res = <$sock>; + print " RES: ", $res; + } elsif ($dest && !$source) { + print "FAIL: want to move memory to $dest but no valid source slab available\n"; + } +} + +# Rank slabs by eviction delta alone, ascending, so the busiest slab comes last. 
+sub calc_results_evicted { + my ($slabs, $totals) = calc_slabs(@_); + my @sorted = sort { $a->{evicted_d} <=> $b->{evicted_d} } values %$slabs; + return ($totals, \@sorted); +} + +# Weighted ratios of evictions vs total stored items +# Seems to fail as an experiment, but it tries to weight stats. +# In this case evictions in underused classes tend to get vastly inflated +sub calc_results_numratio { + my ($slabs, $totals) = calc_slabs(@_, sub { + my ($sb, $sa, $s) = @_; + if ($s->{evicted_d} && $s->{number}) { # a zero or missing item count would make the division die + $s->{numratio} = $s->{evicted_d} / $s->{number}; + } else { $s->{numratio} = 0; } + }); + my @sorted = sort { $a->{numratio} <=> $b->{numratio} } values %$slabs; + return ($totals, \@sorted); +} + +sub calc_slabs { + my ($slabs_before, $slabs_after, $code) = @_; + my %slabs = (); + my %totals = (); + for my $id (keys %$slabs_after) { + my $sb = $slabs_before->{$id}; + my $sa = $slabs_after->{$id}; + next unless ($sb && $sa); + my %slab = %$sa; + for my $key (keys %slab) { + # Add totals, plus "<key>_d" deltas (after minus before) for all-digit stats + if ($slab{$key} =~ m/^\d+$/) { + $totals{$key} += $slab{$key}; + $slab{$key . '_d'} = $sa->{$key} - $sb->{$key}; + $totals{$key . '_d'} += $sa->{$key} - $sb->{$key}; + } + } + # Optional caller-supplied hook; receives before, after and the merged record + $code->($sb, $sa, \%slab) if $code; + $slab{slab} = $id; + $slabs{$id} = \%slab; + } + return (\%slabs, \%totals); +} diff --git a/slabs.c b/slabs.c index eddd59eb04..10b12b9dcf 100644 --- a/slabs.c +++ b/slabs.c @@ -627,16 +627,83 @@ static void slab_rebalance_finish(void) { } } +/* Return 1 means a decision was reached. + * Move to its own thread (created/destroyed as needed) once automover is more + * complex. 
+ */ +static int slab_automove_decision(int *src, int *dst) { /* *src and *dst are written only on the path that returns 1 */ + static uint64_t evicted_old[POWER_LARGEST]; /* eviction counters saved by the previous run */ + static unsigned int slab_zeroes[POWER_LARGEST]; /* per class: consecutive runs with no new evictions and more than 2 pages */ + static unsigned int slab_winner = 0; /* class last seen with the highest eviction delta; 0 means none */ + static unsigned int slab_wins = 0; /* consecutive runs slab_winner has stayed on top */ + uint64_t evicted_new[POWER_LARGEST]; + uint64_t evicted_diff = 0; + uint64_t evicted_max = 0; + unsigned int highest_slab = 0; + unsigned int total_pages[POWER_LARGEST]; + int i; + int source = 0; + int dest = 0; + static rel_time_t next_run; /* earliest current_time at which the next evaluation may run */ + + /* Run less frequently than the slabmove tester: evaluate at most once per 10 units of current_time and return 0 (no decision) in between. */ + if (current_time >= next_run) { + next_run = current_time + 10; + } else { + return 0; + } + + item_stats_evictions(evicted_new); /* fetch the current per-class eviction counters */ + pthread_mutex_lock(&cache_lock); + for (i = POWER_SMALLEST; i < power_largest; i++) { + total_pages[i] = slabclass[i].slabs; + } + pthread_mutex_unlock(&cache_lock); + + /* Find a candidate source: the first class holding more than 2 pages whose eviction counter has not moved for 3+ consecutive runs. Any class failing that test has its streak reset and competes for the highest eviction delta instead. */ + for (i = POWER_SMALLEST; i < power_largest; i++) { + evicted_diff = evicted_new[i] - evicted_old[i]; + if (evicted_diff == 0 && total_pages[i] > 2) { + slab_zeroes[i]++; + if (source == 0 && slab_zeroes[i] >= 3) + source = i; + } else { + slab_zeroes[i] = 0; + if (evicted_diff > evicted_max) { + evicted_max = evicted_diff; + highest_slab = i; + } + } + evicted_old[i] = evicted_new[i]; + } + + /* Pick a valid destination: the class with the highest eviction delta qualifies once it has held that position for 3 consecutive runs; highest_slab stays 0 when no delta exceeded zero, and 0 never qualifies. */ + if (slab_winner != 0 && slab_winner == highest_slab) { + slab_wins++; + if (slab_wins >= 3) + dest = slab_winner; + } else { + slab_wins = 1; + slab_winner = highest_slab; + } + + if (source && dest) { + *src = source; + *dst = dest; + return 1; + } + return 0; +} + /* Slab rebalancer thread. * Does not use spinlocks since it is not timing sensitive. 
Burn less CPU and * go to sleep if locks are contended */ static void *slab_maintenance_thread(void *arg) { int was_busy = 0; + int src, dest; while (do_run_slab_thread) { - /* TODO: Call code to make a calculated decision */ - if (slab_rebalance_signal == 1) { if (slab_rebalance_start() < 0) { /* Handle errors with more specifity as required. */ @@ -646,6 +713,9 @@ static void *slab_maintenance_thread(void *arg) { } else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) { /* If we have a decision to continue, continue it */ was_busy = slab_rebalance_move(); + } else if (settings.slab_automove && slab_automove_decision(&src, &dest) == 1) { + /* Blind to the return codes. It will retry on its own */ + slabs_reassign(src, dest); } if (slab_rebal.done) {