Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tfw_sched_hash: implement Highest Random Weight (Rendezvous) hashing #58

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tempesta_fw/addr.c
Original file line number Diff line number Diff line change
Expand Up @@ -515,3 +515,4 @@ tfw_addr_sa_len(const TfwAddr *addr)
validate_addr(addr);
return (addr->family == AF_INET6) ? sizeof(addr->v6) : sizeof(addr->v4);
}
EXPORT_SYMBOL(tfw_addr_sa_len);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Commented many times: do not pollute space of exported symbols names. The function is pretty short and must be moved to .h file instead of being exported.

2 changes: 1 addition & 1 deletion tempesta_fw/sched/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ ifneq ($(NDEBUG), 1)
EXTRA_CFLAGS += -DDEBUG -O0 -g3
endif

obj-m = tfw_sched_dummy.o tfw_sched_http.o tfw_sched_rr.o
obj-m = tfw_sched_dummy.o tfw_sched_hash.o tfw_sched_http.o tfw_sched_rr.o
276 changes: 211 additions & 65 deletions tempesta_fw/sched/tfw_sched_hash.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,27 @@
* Tempesta FW
*
* Hash-based HTTP request scheduler.
*
* The scheduler computes hash of URI and Host header fields of a HTTP request
* and uses the hash value as an index in the array of servers added with
* the tfw_sched_add_srv() function. Therefore, requests with the same URI and
* Host are mapped to the same server (unless the list of servers is changed).
* and uses the hash value as a key for searching a TfwServer object.
* Hence, HTTP requests with the same URI and Host always go to the same server
* unless it is offline.
*
* Also, the scheduler utilizes the Rendezvous hashing (Highest Random Weight)
* method that allows to stick every HTTP message to its server, preserving
* this mapping when other servers go down or new servers added.
* The scheduler hashes not only HTTP requests, but also servers (TfwServer
* objects), and for each incoming HTTP request it searches for a best match
* among all server hashes. Each Host/URI hash has only one best matching
* TfwSrver hash which is chosen, and thus any request always goes to its home
* server unless it is offline.
*
* TODO:
* - Replace the hash function (currnlty djb2 is used).
* - Refactoring: there is simpilar logic in all scheduler modules related
* - Refactoring: there is simpler logic in all scheduler modules related
* to lists of TfwServer objects. The code should be extracted and re-used.
*
* Copyright (C) 2012-2014 NatSys Lab. ([email protected]).
* Copyright (C) 2015 Tempesta Technologies, Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by
Expand All @@ -29,127 +39,263 @@
* Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/
#include <linux/hash.h>
#include <linux/module.h>

#include "http_msg.h"
#include "log.h"
#include "sched.h"
#include "http_msg.h"

MODULE_AUTHOR(TFW_AUTHOR);
MODULE_DESCRIPTION("Tempesta hash-based scheduler");
MODULE_VERSION("0.0.1");
MODULE_VERSION("0.0.2");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I referenced a link to the versioning scheme in one of my previous reviews. Please fix the version.

MODULE_LICENSE("GPL");


#define BANNER "tfw_sched_hash: "
#define ERR(...) TFW_ERR(BANNER __VA_ARGS__)
#define LOG(...) TFW_LOG(BANNER __VA_ARGS__)
#define DBG(...) TFW_DBG(BANNER __VA_ARGS__)

/* TODO: change this to a global setting after merging sched/http. */
#define MAX_SERVERS_N 64
typedef struct {
TfwServer *srv;
unsigned long hash;
} TfwSrvHash;

/**
* Servers added to the scheduler are stored in this statically allocated array.
* Only writes are protected with the servers_write_lock, reads are lock-free.
* The list of TfwSrvHash implemented as array.
*
* The array is chosen instead of a linked list to allow easy binary search
* implementation and perhaps to improve spatial locality of TfwSrvHash objects.
*
* The RCU mechanism is used to protect the list from being modified,
* while the data is scanned by tfw_sched_hash_get_srv().
*
* @rcu is needed to make updates possible in an atomic context.
* That happens if a server connection is lost, so the disconnect callback is
* executed in a softirq context and a TfwServer is deleted from the scheduler.
*/
static TfwServer *servers[MAX_SERVERS_N];
static int servers_n;
static DEFINE_SPINLOCK(servers_write_lock);
typedef struct {
struct rcu_head rcu;
size_t n;
TfwSrvHash srv_hashes[];
} TfwSrvHashList;

static int
find_srv_idx(TfwServer *srv)
{
int i;
for (i = 0; i < servers_n; ++i) {
if (servers[i] == srv)
return i;
}
#define TFW_SRV_HASH_LIST_SIZE(n) \
({ \
/* We put two NULLs when the list is empty in order \
* to simplify the algorithm in tfw_sched_hash_get_srv() */ \
size_t _nulls = n ? 1 : 2; \
(sizeof(TfwSrvHashList) + ((n) + _nulls) * sizeof(TfwSrvHash)); \
})

return -1;
}
/**
* A NULL-terminated array of TfwSrvHash that stores all servers added to the
* scheduler via tfw_sched_hash_add_srv().
*
* Concurrent RCU updaters must be synchronized with servers_update_lock.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment seems outdated, there is no symbol with name "servers_update_lock"

*/
static TfwSrvHashList *tfw_srv_hash_list __rcu;
static DEFINE_SPINLOCK(srv_hashes_update_lock);

/**
* Find an appropriate TfwServer for the HTTP request @msg.
* The server is chosen based on the hash value of URI/Host fields of the @msg,
* so multiple requests to the same resource are mapped to the same server.
*
* Higest Random Weight hashing method is involved: for each message we
* calculate randomized weights as follows: (msg_hash ^ srv_hash), and pick a
* server with the highest weight.
* That sticks messages with certain Host/URI to certain IP addresses.
* A server always receives requests with some URI/Host values bound to it,
* and that holds if connection to the server is lost and then re-established,
* and if other servers go offline, and even if Tempesta FW is restarted.
*
* The drawbacks of HRW hashing are:
* - A weak hash function adds unfairness to the load balancing.
* There may be a case when a server pulls all load from all other servers.
* And such condition (if it occurs although it is quite improbable) is
* quite stable: it cannot be fixed by adding/removing servers and restarting
* the Tempesta FW instance.
* - For every HTTP request, we have to scan the list of all servers to find
* a matching one with the highest weight. That adds some overhead.
* Currently the linear search is used. We may switch to binary search, but
* benchmarks are required to prove that it will behave better on the small
* number of servers that we usually have.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have the benchmarks: https://github.com/krizhanovsky/NatSys-Lab/blob/master/array_scans.cc

The program is only 136 lines in length, so it requires almost the same time to check the performance case as to write the comment ;) Don't be lazy.

*/
static TfwServer *
tfw_sched_hash_get_srv(TfwMsg *msg)
{
TfwServer *srv;
int n;
TfwServer *best_srv;
TfwSrvHash *curr_srv_hash;
TfwSrvHashList *srv_hash_list;
unsigned long msg_hash, curr_weight, best_weight;

msg_hash = tfw_http_req_key_calc((TfwHttpReq *)msg);

unsigned long hash = tfw_http_req_key_calc((TfwHttpReq *)msg);
rcu_read_lock();
srv_hash_list = rcu_dereference(tfw_srv_hash_list);

do {
n = servers_n;
if (!n) {
ERR("No servers added to the scheduler\n");
return NULL;
/* 1. Set best = first element of the list. */
curr_srv_hash = &srv_hash_list->srv_hashes[0];
best_srv = curr_srv_hash->srv;
best_weight = msg_hash ^ curr_srv_hash->hash;

/* 2. Try to find a better one among 2nd...Nth elements of the list.
* We don't check for N == 0 here; the list is terminated with two NULL
* elements when it is empty, so we can avoid the extra branch here. */
while ((++curr_srv_hash)->srv) {
curr_weight = msg_hash ^ curr_srv_hash->hash;
if (curr_weight > best_weight) {
best_weight = curr_weight;
best_srv = curr_srv_hash->srv;
}
srv = servers[hash % n];
} while (!srv);
++curr_srv_hash;
}

rcu_read_unlock();

return srv;
return best_srv;
}

static unsigned long
tfw_sched_hash_calc_srv(TfwServer *srv)
{
unsigned long hash;
union {
TfwAddr addr;
unsigned char bytes[0];
} a;
size_t i, bytes_n;

/**
* Here we just cast the whole TfwAddr to an array of bytes
* and use the standard hash_long() to calculate the hash.
*
* That only works if the following invariants are held:
* - There are no gaps between structure fields.
* - No structure fields (e.g. sin6_flowinfo) are changed if we
* re-connect to the same server.
*/
tfw_server_get_addr(srv, &a.addr);
bytes_n = tfw_addr_sa_len(&a.addr);

hash = 0;
for (i = 0; i < bytes_n; ++i) {
hash = hash_long(hash ^ a.bytes[i], BITS_PER_LONG);
}

return hash;
}

static int
tfw_sched_hash_add_srv(TfwServer *srv)
{
int ret = 0;
TfwSrvHash *new_list_entry;
TfwSrvHashList *new_list, *old_list;
size_t new_list_size, old_list_size;

spin_lock_bh(&srv_hashes_update_lock);

old_list = tfw_srv_hash_list;
old_list_size = TFW_SRV_HASH_LIST_SIZE(old_list->n);
new_list_size = TFW_SRV_HASH_LIST_SIZE(old_list->n + 1);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Usually growing in exponentionaly is more preferred than realloc and copy the array for each server. Just keep in ming #76.


spin_lock_bh(&servers_write_lock);
if (servers_n >= MAX_SERVERS_N) {
ERR("Can't add a server to the scheduler - the list is full\n");
if (old_list->n >= TFW_SCHED_MAX_SERVERS) {
ERR("maximum number of servers reached\n");
ret = -ENOBUFS;
goto out;
}

new_list = kzalloc(new_list_size, GFP_ATOMIC);
if (!new_list) {
ret = -ENOMEM;
} else if (find_srv_idx(srv) >= 0) {
ERR("Can't add the server to the scheduler - already added\n");
ret = -EEXIST;
} else {
servers[servers_n] = srv;
++servers_n;
goto out;
}
spin_unlock_bh(&servers_write_lock);

memcpy(new_list, old_list, old_list_size);
new_list_entry = &new_list->srv_hashes[new_list->n++];
new_list_entry->srv = srv;
new_list_entry->hash = tfw_sched_hash_calc_srv(srv);

rcu_assign_pointer(tfw_srv_hash_list, new_list);
kfree_rcu(old_list, rcu);
out:
spin_unlock_bh(&srv_hashes_update_lock);
return ret;
}

static int
tfw_sched_hash_del_srv(TfwServer *srv)
{
int ret = 0;
int i;

spin_lock_bh(&servers_write_lock);
i = find_srv_idx(srv);
if (i < 0) {
ERR("Can't delete the server from the scheduler - not found\n");
ret = -ENOENT;
} else {
servers[i] = servers[servers_n - 1];
--servers_n;
servers[servers_n] = NULL;
TfwSrvHash *curr;
TfwSrvHashList *new_list, *old_list;
size_t new_list_size, old_list_size;

spin_lock_bh(&srv_hashes_update_lock);

old_list = tfw_srv_hash_list;
old_list_size = TFW_SRV_HASH_LIST_SIZE(old_list->n);
new_list_size = TFW_SRV_HASH_LIST_SIZE(old_list->n - 1);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually there is no need to shrink the list - ever 64K servers is just about 1MB of memory.


new_list = kzalloc(new_list_size, GFP_ATOMIC);
if (!new_list) {
ret = -ENOMEM;
goto out;
}

/* Copy entries to the new list excluding the one being deleted. */
for (curr = old_list->srv_hashes; curr->srv; ++curr) {
if (curr->srv != srv)
new_list->srv_hashes[new_list->n++] = *curr;
}
spin_unlock_bh(&servers_write_lock);
BUG_ON(new_list->n != (old_list->n - 1));

rcu_assign_pointer(tfw_srv_hash_list, new_list);
kfree_rcu(old_list, rcu);
out:
spin_unlock_bh(&srv_hashes_update_lock);
return ret;
}

static TfwScheduler tfw_sched_hash_mod = {
.name = "hash",
.get_srv = tfw_sched_hash_get_srv,
.add_srv = tfw_sched_hash_add_srv,
.del_srv = tfw_sched_hash_del_srv
};

int
tfw_sched_hash_init(void)
{
static TfwScheduler tfw_sched_hash_mod = {
.name = "hash",
.get_srv = tfw_sched_hash_get_srv,
.add_srv = tfw_sched_hash_add_srv,
.del_srv = tfw_sched_hash_del_srv
};
int r;

LOG("init\n");

return tfw_sched_register(&tfw_sched_hash_mod);
/* Allocate a dummy list to avoid NULL checks in other functions. */
tfw_srv_hash_list = kzalloc(TFW_SRV_HASH_LIST_SIZE(0), GFP_KERNEL);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why to allocate an empty array if you know that you'll insert some entries soon? Why not to allocate some adequate values, like 16?

if (!tfw_srv_hash_list) {
ERR("can't allocate empty server list\n");
}

r = tfw_sched_register(&tfw_sched_hash_mod);
if (r) {
ERR("can't register as a scheduler module of Tempesta FW\n");
kfree(tfw_srv_hash_list);
}

return r;
}
module_init(tfw_sched_hash_init);

void
tfw_sched_hash_exit(void)
{
/* At the moment of un-loading the module the list shall not be used. */
BUG_ON(tfw_srv_hash_list->n);

kfree(tfw_srv_hash_list);
tfw_sched_unregister();
}
module_exit(tfw_sched_hash_exit);