From f4983b2068d13e5dc71fc075c35a085e904999cf Mon Sep 17 00:00:00 2001 From: dormando Date: Sun, 8 Jan 2012 20:27:07 -0800 Subject: [PATCH] Fix a race condition from 1.4.10 on item_remove Updates the slab mover for the new method. 1.4.10 lacks some crucial protection around item freeing and removal, resulting in some potential crashes. Moving the cache_lock around item_remove caused a 30% performance drop, so it's been reimplemented with GCC atomics. refcount of 1 now means an item is linked but has no reference, which allows us to test an atomic sub and fetch of 0 as a clear indicator of when to free an item. --- items.c | 44 ++++++++++++++------------------- slabs.c | 75 ++++++++++++++++++++++++++++++++++++++++----------------- 2 files changed, 71 insertions(+), 48 deletions(-) diff --git a/items.c b/items.c index 174b50ff85..faf43b5c4f 100644 --- a/items.c +++ b/items.c @@ -104,7 +104,7 @@ item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_tim rel_time_t oldest_live = settings.oldest_live; search = tails[id]; - if (search != NULL && search->refcount == 0) { + if (search != NULL && (__sync_add_and_fetch(&search->refcount, 1) == 2)) { if ((search->exptime != 0 && search->exptime < current_time) || (search->time < oldest_live)) { // dead by flush STATS_LOCK(); @@ -118,7 +118,6 @@ item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_tim itemstats[id].expired_unfetched++; } it = search; - it->refcount = 1; slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal); do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0)); /* Initialize the item block: */ @@ -143,15 +142,18 @@ item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_tim stats.evictions++; STATS_UNLOCK(); it = search; - it->refcount = 1; slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal); do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0)); /* Initialize the item block: */ it->slabs_clsid = 0; + } else { + __sync_sub_and_fetch(&search->refcount, 1); } } else { /* If the LRU is empty or locked, attempt to allocate memory */ it = slabs_alloc(ntotal, id); + if (search != NULL) + __sync_sub_and_fetch(&search->refcount, 1); } if (it == NULL) { @@ -163,10 +165,10 @@ item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_tim * free it anyway. */ if (search != NULL && - search->refcount != 0 && + search->refcount != 2 && search->time + TAIL_REPAIR_TIME < current_time) { itemstats[id].tailrepairs++; - search->refcount = 0; + search->refcount = 1; do_item_unlink_nolock(search, hash(ITEM_key(search), search->nkey, 0)); } pthread_mutex_unlock(&cache_lock); @@ -206,7 +208,6 @@ void item_free(item *it) { /* so slab size changer can tell later if item is already free or not */ clsid = it->slabs_clsid; it->slabs_clsid = 0; - it->it_flags |= ITEM_SLABBED; DEBUG_REFCNT(it, 'F'); slabs_free(it, ntotal, clsid); } @@ -272,6 +273,7 @@ static void item_unlink_q(item *it) { int do_item_link(item *it, const uint32_t hv) { MEMCACHED_ITEM_LINK(ITEM_key(it), it->nkey, it->nbytes); assert((it->it_flags & (ITEM_LINKED|ITEM_SLABBED)) == 0); + mutex_lock(&cache_lock); it->it_flags |= ITEM_LINKED; it->time = current_time; @@ -281,11 +283,11 @@ int do_item_link(item *it, const uint32_t hv) { stats.total_items += 1; STATS_UNLOCK(); - mutex_lock(&cache_lock); /* Allocate a new CAS ID on link. */ ITEM_set_cas(it, (settings.use_cas) ? get_cas_id() : 0); assoc_insert(it, hv); item_link_q(it); + __sync_add_and_fetch(&it->refcount, 1); pthread_mutex_unlock(&cache_lock); return 1; @@ -302,7 +304,7 @@ void do_item_unlink(item *it, const uint32_t hv) { STATS_UNLOCK(); assoc_delete(ITEM_key(it), it->nkey, hv); item_unlink_q(it); - if (it->refcount == 0) item_free(it); + do_item_remove(it); } pthread_mutex_unlock(&cache_lock); } @@ -318,7 +320,7 @@ void do_item_unlink_nolock(item *it, const uint32_t hv) { STATS_UNLOCK(); assoc_delete(ITEM_key(it), it->nkey, hv); item_unlink_q(it); - if (it->refcount == 0) item_free(it); + do_item_remove(it); } } @@ -326,15 +328,9 @@ void do_item_remove(item *it) { MEMCACHED_ITEM_REMOVE(ITEM_key(it), it->nkey, it->nbytes); assert((it->it_flags & ITEM_SLABBED) == 0); - mutex_lock(&cache_lock); - if (it->refcount != 0) { - it->refcount--; - DEBUG_REFCNT(it, '-'); - } - if (it->refcount == 0 && (it->it_flags & ITEM_LINKED) == 0) { + if (__sync_sub_and_fetch(&it->refcount, 1) == 0) { item_free(it); } - pthread_mutex_unlock(&cache_lock); } void do_item_update(item *it) { @@ -488,14 +484,14 @@ item *do_item_get(const char *key, const size_t nkey, const uint32_t hv) { mutex_lock(&cache_lock); item *it = assoc_find(key, nkey, hv); if (it != NULL) { - it->refcount++; + __sync_add_and_fetch(&it->refcount, 1); /* Optimization for slab reassignment. prevents popular items from * jamming in busy wait. Can only do this here to satisfy lock order * of item_lock, cache_lock, slabs_lock. */ if (slab_rebalance_signal && ((void *)it >= slab_rebal.slab_start && (void *)it < slab_rebal.slab_end)) { - it->refcount--; do_item_unlink_nolock(it, hv); + do_item_remove(it); it = NULL; } } @@ -514,19 +510,15 @@ item *do_item_get(const char *key, const size_t nkey, const uint32_t hv) { if (it != NULL) { if (settings.oldest_live != 0 && settings.oldest_live <= current_time && it->time <= settings.oldest_live) { - mutex_lock(&cache_lock); - it->refcount--; - do_item_unlink_nolock(it, hv); - pthread_mutex_unlock(&cache_lock); + do_item_unlink(it, hv); + do_item_remove(it); it = NULL; if (was_found) { fprintf(stderr, " -nuked by flush"); } } else if (it->exptime != 0 && it->exptime <= current_time) { - mutex_lock(&cache_lock); - it->refcount--; - do_item_unlink_nolock(it, hv); - pthread_mutex_unlock(&cache_lock); + do_item_unlink(it, hv); + do_item_remove(it); it = NULL; if (was_found) { fprintf(stderr, " -nuked by expire"); diff --git a/slabs.c b/slabs.c index 6029c34ca4..a2757d02bc 100644 --- a/slabs.c +++ b/slabs.c @@ -292,6 +292,7 @@ static void do_slabs_free(void *ptr, const size_t size, unsigned int id) { #endif it = (item *)ptr; + it->it_flags |= ITEM_SLABBED; it->prev = 0; it->next = p->slots; if (it->next) it->next->prev = it; @@ -521,6 +522,10 @@ static int slab_rebalance_start(void) { return 0; } +enum move_status { + MOVE_PASS=0, MOVE_DONE, MOVE_BUSY +}; + /* refcount == 0 is safe since nobody can incr while cache_lock is held. * refcount != 0 is impossible since flags/etc can be modified in other * threads. instead, note we found a busy one and bail. logic in do_item_get @@ -530,6 +535,8 @@ static int slab_rebalance_move(void) { slabclass_t *s_cls; int x; int was_busy = 0; + int refcount = 0; + enum move_status status = MOVE_PASS; pthread_mutex_lock(&cache_lock); pthread_mutex_lock(&slabs_lock); @@ -538,30 +545,54 @@ static int slab_rebalance_move(void) { for (x = 0; x < slab_bulk_check; x++) { item *it = slab_rebal.slab_pos; - if (it->refcount == 0) { - if (it->it_flags & ITEM_SLABBED) { - /* remove from freelist linked list */ - if (s_cls->slots == it) { - s_cls->slots = it->next; + status = MOVE_PASS; + if (it->slabs_clsid != 255) { + refcount = __sync_add_and_fetch(&it->refcount, 1); + if (refcount == 1) { /* item is unlinked, unused */ + if (it->it_flags & ITEM_SLABBED) { + /* remove from slab freelist */ + if (s_cls->slots == it) { + s_cls->slots = it->next; + } + if (it->next) it->next->prev = it->prev; + if (it->prev) it->prev->next = it->next; + s_cls->sl_curr--; + status = MOVE_DONE; + } else { + status = MOVE_BUSY; } - if (it->next) it->next->prev = it->prev; - if (it->prev) it->prev->next = it->next; - s_cls->sl_curr--; - } else if (it->it_flags != 0) { - it->refcount = 1; - /* Call unlink with refcount == 1 so it won't free */ - do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0)); - it->refcount = 0; - } - it->it_flags = 0; - it->slabs_clsid = 0; - } else { - if (settings.verbose > 2) { - fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n", - it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid); + } else if (refcount == 2) { /* item is linked but not busy */ + if ((it->it_flags & ITEM_LINKED) != 0) { + do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0)); + status = MOVE_DONE; + } else { + /* refcount == 1 + !ITEM_LINKED means the item is being + * uploaded to, or was just unlinked but hasn't been freed + * yet. Let it bleed off on its own and try again later */ + status = MOVE_BUSY; + } + } else { + if (settings.verbose > 2) { + fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n", + it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid); + } + status = MOVE_BUSY; } - slab_rebal.busy_items++; - was_busy++; + } + + switch (status) { + case MOVE_DONE: + it->refcount = 0; + it->it_flags = 0; + it->slabs_clsid = 255; + break; + case MOVE_BUSY: + slab_rebal.busy_items++; + was_busy++; + __sync_sub_and_fetch(&it->refcount, 1); + break; + case MOVE_PASS: + break; } slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;