Skip to content

Commit

Permalink
blk-mq: improve support for shared tags maps
Browse files Browse the repository at this point in the history
This adds support for active queue tracking, meaning that the
blk-mq tagging maintains a count of active users of a tag set.
This allows us to maintain a notion of fairness between users,
so that we can distribute the tag depth evenly without starving
some users while allowing others to try unfair deep queues.

If sharing of a tag set is detected, each hardware queue will
track the depth of its own queue. And if this exceeds the total
depth divided by the number of active queues, the user is actively
throttled down.

The active queue count is done lazily to avoid bouncing that data
between submitter and completer. Each hardware queue gets marked
active when it allocates its first tag, and gets marked inactive
when 1) the last tag is cleared, and 2) the queue timeout grace
period has passed.

Signed-off-by: Jens Axboe <[email protected]>
  • Loading branch information
axboe committed May 13, 2014
1 parent 1f236ab commit 0d2602c
Show file tree
Hide file tree
Showing 9 changed files with 236 additions and 27 deletions.
10 changes: 10 additions & 0 deletions block/blk-mq-sysfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,11 @@ static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
return blk_mq_tag_sysfs_show(hctx->tags, page);
}

static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page)
{
return sprintf(page, "%u\n", atomic_read(&hctx->nr_active));
}

static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
{
unsigned int i, first = 1;
Expand Down Expand Up @@ -267,6 +272,10 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
.attr = {.name = "dispatched", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_dispatched_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = {
.attr = {.name = "active", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_active_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
.attr = {.name = "pending", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_rq_list_show,
Expand All @@ -287,6 +296,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
&blk_mq_hw_sysfs_pending.attr,
&blk_mq_hw_sysfs_tags.attr,
&blk_mq_hw_sysfs_cpus.attr,
&blk_mq_hw_sysfs_active.attr,
NULL,
};

Expand Down
112 changes: 95 additions & 17 deletions block/blk-mq-tag.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,12 @@
#include "blk-mq.h"
#include "blk-mq-tag.h"

void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx,
bool reserved)
void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved)
{
int tag, zero = 0;

tag = blk_mq_get_tag(tags, hctx, &zero, __GFP_WAIT, reserved);
blk_mq_put_tag(tags, tag, &zero);
tag = blk_mq_get_tag(hctx, &zero, __GFP_WAIT, reserved);
blk_mq_put_tag(hctx, tag, &zero);
}

static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
Expand All @@ -40,6 +39,84 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
return bt_has_free_tags(&tags->bitmap_tags);
}

static inline void bt_index_inc(unsigned int *index)
{
*index = (*index + 1) & (BT_WAIT_QUEUES - 1);
}

/*
* If a previously inactive queue goes active, bump the active user count.
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
!test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
atomic_inc(&hctx->tags->active_queues);

return true;
}

/*
* If a previously busy queue goes inactive, potential waiters could now
* be allowed to queue. Wake them up and check.
*/
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
struct blk_mq_bitmap_tags *bt;
int i, wake_index;

if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return;

atomic_dec(&tags->active_queues);

/*
* Will only throttle depth on non-reserved tags
*/
bt = &tags->bitmap_tags;
wake_index = bt->wake_index;
for (i = 0; i < BT_WAIT_QUEUES; i++) {
struct bt_wait_state *bs = &bt->bs[wake_index];

if (waitqueue_active(&bs->wait))
wake_up(&bs->wait);

bt_index_inc(&wake_index);
}
}

/*
* For shared tag users, we track the number of currently active users
* and attempt to provide a fair share of the tag depth for each of them.
*/
static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
struct blk_mq_bitmap_tags *bt)
{
unsigned int depth, users;

if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
return true;
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return true;

/*
* Don't try dividing an ant
*/
if (bt->depth == 1)
return true;

users = atomic_read(&hctx->tags->active_queues);
if (!users)
return true;

/*
* Allow at least some tags
*/
depth = max((bt->depth + users - 1) / users, 4U);
return atomic_read(&hctx->nr_active) < depth;
}

static int __bt_get_word(struct blk_mq_bitmap *bm, unsigned int last_tag)
{
int tag, org_last_tag, end;
Expand Down Expand Up @@ -78,11 +155,15 @@ static int __bt_get_word(struct blk_mq_bitmap *bm, unsigned int last_tag)
* multiple users will tend to stick to different cachelines, at least
* until the map is exhausted.
*/
static int __bt_get(struct blk_mq_bitmap_tags *bt, unsigned int *tag_cache)
static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
unsigned int *tag_cache)
{
unsigned int last_tag, org_last_tag;
int index, i, tag;

if (!hctx_may_queue(hctx, bt))
return -1;

last_tag = org_last_tag = *tag_cache;
index = TAG_TO_INDEX(bt, last_tag);

Expand Down Expand Up @@ -117,11 +198,6 @@ static int __bt_get(struct blk_mq_bitmap_tags *bt, unsigned int *tag_cache)
return tag;
}

static inline void bt_index_inc(unsigned int *index)
{
*index = (*index + 1) & (BT_WAIT_QUEUES - 1);
}

static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
struct blk_mq_hw_ctx *hctx)
{
Expand All @@ -142,7 +218,7 @@ static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
DEFINE_WAIT(wait);
int tag;

tag = __bt_get(bt, last_tag);
tag = __bt_get(hctx, bt, last_tag);
if (tag != -1)
return tag;

Expand All @@ -156,7 +232,7 @@ static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
was_empty = list_empty(&wait.task_list);
prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);

tag = __bt_get(bt, last_tag);
tag = __bt_get(hctx, bt, last_tag);
if (tag != -1)
break;

Expand Down Expand Up @@ -200,14 +276,13 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
return tag;
}

unsigned int blk_mq_get_tag(struct blk_mq_tags *tags,
struct blk_mq_hw_ctx *hctx, unsigned int *last_tag,
unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag,
gfp_t gfp, bool reserved)
{
if (!reserved)
return __blk_mq_get_tag(tags, hctx, last_tag, gfp);
return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp);

return __blk_mq_get_reserved_tag(tags, gfp);
return __blk_mq_get_reserved_tag(hctx->tags, gfp);
}

static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
Expand Down Expand Up @@ -265,9 +340,11 @@ static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
bt_clear_tag(&tags->breserved_tags, tag);
}

void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag,
void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
unsigned int *last_tag)
{
struct blk_mq_tags *tags = hctx->tags;

if (tag >= tags->nr_reserved_tags) {
const int real_tag = tag - tags->nr_reserved_tags;

Expand Down Expand Up @@ -465,6 +542,7 @@ ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
res = bt_unused_tags(&tags->breserved_tags);

page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues));

return page - orig_page;
}
27 changes: 24 additions & 3 deletions block/blk-mq-tag.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ struct blk_mq_tags {
unsigned int nr_tags;
unsigned int nr_reserved_tags;

atomic_t active_queues;

struct blk_mq_bitmap_tags bitmap_tags;
struct blk_mq_bitmap_tags breserved_tags;

Expand All @@ -49,9 +51,9 @@ struct blk_mq_tags {
extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
extern void blk_mq_free_tags(struct blk_mq_tags *tags);

extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, bool reserved);
extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag, unsigned int *last_tag);
extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
extern void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved);
extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag);
extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
Expand All @@ -68,4 +70,23 @@ enum {
BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
};

extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);

static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
return false;

return __blk_mq_tag_busy(hctx);
}

static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
return;

__blk_mq_tag_idle(hctx);
}

#endif
Loading

0 comments on commit 0d2602c

Please sign in to comment.