tempesta-tech · vdmit11 · Feb 20, 2015 · Mar 27, 2015 · krizhanovsky · Mar 28, 2015
diff --git a/tempesta_fw/addr.c b/tempesta_fw/addr.c
@@ -515,3 +515,4 @@ tfw_addr_sa_len(const TfwAddr *addr)
         validate_addr(addr);
         return (addr->family == AF_INET6) ? sizeof(addr->v6) : sizeof(addr->v4);
 }
+EXPORT_SYMBOL(tfw_addr_sa_len);
diff --git a/tempesta_fw/sched/Makefile b/tempesta_fw/sched/Makefile
@@ -23,4 +23,4 @@ ifneq ($(NDEBUG), 1)
 EXTRA_CFLAGS += -DDEBUG -O0 -g3
 endif
 
-obj-m = tfw_sched_dummy.o tfw_sched_http.o tfw_sched_rr.o
+obj-m = tfw_sched_dummy.o tfw_sched_hash.o tfw_sched_http.o tfw_sched_rr.o
diff --git a/tempesta_fw/sched/tfw_sched_hash.c b/tempesta_fw/sched/tfw_sched_hash.c
@@ -2,17 +2,27 @@
  *		Tempesta FW
  *
  * Hash-based HTTP request scheduler.
+ *
  * The scheduler computes hash of URI and Host header fields of a HTTP request
- * and uses the hash value as an index in the array of servers added with
- * the tfw_sched_add_srv() function. Therefore, requests with the same URI and
- * Host are mapped to the same server (unless the list of servers is changed).
+ * and uses the hash value as a key for searching a TfwServer object.
+ * Hence, HTTP requests with the same URI and Host always go to the same server
+ * unless it is offline.
+ *
+ * Also, the scheduler utilizes the Rendezvous hashing (Highest Random Weight)
+ * method that allows to stick every HTTP message to its server, preserving
+ * this mapping when other servers go down or new servers added.
+ * The scheduler hashes not only HTTP requests, but also servers (TfwServer
+ * objects), and for each incoming HTTP request it searches for a best match
+ * among all server hashes. Each Host/URI hash has only one best matching
+ * TfwSrver hash which is chosen, and thus any request always goes to its home
+ * server unless it is offline.
  *
  * TODO:
- *  - Replace the hash function (currnlty djb2 is used).
- *  - Refactoring: there is simpilar logic in all scheduler modules related
+ *  - Refactoring: there is simpler logic in all scheduler modules related
  *    to lists of TfwServer objects. The code should be extracted and re-used.
  *
  * Copyright (C) 2012-2014 NatSys Lab. ([email protected]).
+ * Copyright (C) 2015 Tempesta Technologies, Inc.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by
@@ -29,127 +39,263 @@
  * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  *
  */
+#include <linux/hash.h>
 #include <linux/module.h>
 
+#include "http_msg.h"
 #include "log.h"
 #include "sched.h"
-#include "http_msg.h"
 
 MODULE_AUTHOR(TFW_AUTHOR);
 MODULE_DESCRIPTION("Tempesta hash-based scheduler");
-MODULE_VERSION("0.0.1");
+MODULE_VERSION("0.0.2");
 MODULE_LICENSE("GPL");
 
-
 #define BANNER "tfw_sched_hash: "
 #define ERR(...) TFW_ERR(BANNER __VA_ARGS__)
 #define LOG(...) TFW_LOG(BANNER __VA_ARGS__)
 #define DBG(...) TFW_DBG(BANNER __VA_ARGS__)
 
-/* TODO: change this to a global setting after merging sched/http. */
-#define MAX_SERVERS_N 64
+typedef struct {
+	TfwServer *srv;
+	unsigned long hash;
+} TfwSrvHash;
 
 /**
- * Servers added to the scheduler are stored in this statically allocated array.
- * Only writes are protected with the servers_write_lock, reads are lock-free.
+ * The list of TfwSrvHash implemented as array.
+ *
+ * The array is chosen instead of a linked list to allow easy binary search
+ * implementation and perhaps to improve spatial locality of TfwSrvHash objects.
+ *
+ * The RCU mechanism is used to protect the list from being modified,
+ * while the data is scanned by tfw_sched_hash_get_srv().
+ *
+ * @rcu is needed to make updates possible in an atomic context.
+ * That happens if a server connection is lost, so the disconnect callback is
+ * executed in a softirq context and a TfwServer is deleted from the scheduler.
  */
-static TfwServer *servers[MAX_SERVERS_N];
-static int servers_n;
-static DEFINE_SPINLOCK(servers_write_lock);
+typedef struct {
+	struct rcu_head rcu;
+	size_t		n;
+	TfwSrvHash	srv_hashes[];
+} TfwSrvHashList;
 
-static int
-find_srv_idx(TfwServer *srv)
-{
-	int i;
-	for (i = 0; i < servers_n; ++i) {
-		if (servers[i] == srv)
-			return i;
-	}
+#define TFW_SRV_HASH_LIST_SIZE(n) 					\
+({									\
+	/* We put two NULLs when the list is empty in order		\
+	 * to simplify the algorithm in tfw_sched_hash_get_srv() */	\
+	size_t _nulls = n ? 1 : 2;					\
+	(sizeof(TfwSrvHashList) + ((n) + _nulls) * sizeof(TfwSrvHash));	\
+})
 
-	return -1;
-}
+/**
+ * A NULL-terminated array of TfwSrvHash that stores all servers added to the
+ * scheduler via tfw_sched_hash_add_srv().
+ *
+ * Concurrent RCU updaters must be synchronized with servers_update_lock.
+ */
+static TfwSrvHashList *tfw_srv_hash_list __rcu;
+static DEFINE_SPINLOCK(srv_hashes_update_lock);
 
+/**
+ * Find an appropriate TfwServer for the HTTP request @msg.
+ * The server is chosen based on the hash value of URI/Host fields of the @msg,
+ * so multiple requests to the same resource are mapped to the same server.
+ *
+ * Higest Random Weight hashing method is involved: for each message we
+ * calculate randomized weights as follows: (msg_hash ^ srv_hash), and pick a
+ * server with the highest weight.
+ * That sticks messages with certain Host/URI to certain IP addresses.
+ * A server always receives requests with some URI/Host values bound to it,
+ * and that holds if connection to the server is lost and then re-established,
+ * and if other servers go offline, and even if Tempesta FW is restarted.
+ *
+ * The drawbacks of HRW hashing are:
+ *  - A weak hash function adds unfairness to the load balancing.
+ *    There may be a case when a server pulls all load from all other servers.
+ *    And such condition (if it occurs although it is quite improbable) is
+ *    quite stable: it cannot be fixed by adding/removing servers and restarting
+ *    the Tempesta FW instance.
+ *  - For every HTTP request, we have to scan the list of all servers to find
+ *    a matching one with the highest weight. That adds some overhead.
+ *    Currently the linear search is used. We may switch to binary search, but
+ *    benchmarks are required to prove that it will behave better on the small
+ *    number of servers that we usually have.
+ */
 static TfwServer *
 tfw_sched_hash_get_srv(TfwMsg *msg)
 {
-	TfwServer *srv;
-	int n;
+	TfwServer *best_srv;
+	TfwSrvHash *curr_srv_hash;
+	TfwSrvHashList *srv_hash_list;
+	unsigned long msg_hash, curr_weight, best_weight;
+
+	msg_hash = tfw_http_req_key_calc((TfwHttpReq *)msg);
 
-	unsigned long hash = tfw_http_req_key_calc((TfwHttpReq *)msg);
+	rcu_read_lock();
+	srv_hash_list = rcu_dereference(tfw_srv_hash_list);
 
-	do {
-		n = servers_n;
-		if (!n) {
-			ERR("No servers added to the scheduler\n");
-			return NULL;
+	/* 1. Set best = first element of the list. */
+	curr_srv_hash = &srv_hash_list->srv_hashes[0];
+	best_srv = curr_srv_hash->srv;
+	best_weight = msg_hash ^ curr_srv_hash->hash;
+
+	/* 2. Try to find a better one among 2nd...Nth elements of the list.
+	 * We don't check for N == 0 here; the list is terminated with two NULL
+	 * elements when it is empty, so we can avoid the extra branch here. */
+	while ((++curr_srv_hash)->srv) {
+		curr_weight = msg_hash ^ curr_srv_hash->hash;
+		if (curr_weight > best_weight) {
+			best_weight = curr_weight;
+			best_srv = curr_srv_hash->srv;
 		}
-		srv = servers[hash % n];
-	} while (!srv);
+		++curr_srv_hash;
+	}
+
+	rcu_read_unlock();
 
-	return srv;
+	return best_srv;
+}
+
+static unsigned long
+tfw_sched_hash_calc_srv(TfwServer *srv)
+{
+	unsigned long hash;
+	union {
+		TfwAddr addr;
+		unsigned char bytes[0];
+	} a;
+	size_t i, bytes_n;
+
+	/**
+	 * Here we just cast the whole TfwAddr to an array of bytes
+	 * and use the standard hash_long() to calculate the hash.
+	 *
+	 * That only works if the following invariants are held:
+	 *  - There are no gaps between structure fields.
+	 *  - No structure fields (e.g. sin6_flowinfo) are changed if we
+	 *    re-connect to the same server.
+	 */
+	tfw_server_get_addr(srv, &a.addr);
+	bytes_n = tfw_addr_sa_len(&a.addr);
+
+	hash = 0;
+	for (i = 0; i < bytes_n; ++i) {
+		hash = hash_long(hash ^ a.bytes[i], BITS_PER_LONG);
+	}
+
+	return hash;
 }
 
 static int
 tfw_sched_hash_add_srv(TfwServer *srv)
 {
 	int ret = 0;
+	TfwSrvHash *new_list_entry;
+	TfwSrvHashList *new_list, *old_list;
+	size_t new_list_size, old_list_size;
+
+	spin_lock_bh(&srv_hashes_update_lock);
+
+	old_list = tfw_srv_hash_list;
+	old_list_size = TFW_SRV_HASH_LIST_SIZE(old_list->n);
+	new_list_size = TFW_SRV_HASH_LIST_SIZE(old_list->n + 1);
 
-	spin_lock_bh(&servers_write_lock);
-	if (servers_n >= MAX_SERVERS_N) {
-		ERR("Can't add a server to the scheduler - the list is full\n");
+	if (old_list->n >= TFW_SCHED_MAX_SERVERS) {
+		ERR("maximum number of servers reached\n");
+		ret = -ENOBUFS;
+		goto out;
+	}
+
+	new_list = kzalloc(new_list_size, GFP_ATOMIC);
+	if (!new_list) {
 		ret = -ENOMEM;
-	} else if (find_srv_idx(srv) >= 0) {
-		ERR("Can't add the server to the scheduler - already added\n");
-		ret = -EEXIST;
-	} else {
-		servers[servers_n] = srv;
-		++servers_n;
+		goto out;
 	}
-	spin_unlock_bh(&servers_write_lock);
 
+	memcpy(new_list, old_list, old_list_size);
+	new_list_entry = &new_list->srv_hashes[new_list->n++];
+	new_list_entry->srv = srv;
+	new_list_entry->hash = tfw_sched_hash_calc_srv(srv);
+
+	rcu_assign_pointer(tfw_srv_hash_list, new_list);
+	kfree_rcu(old_list, rcu);
+out:
+	spin_unlock_bh(&srv_hashes_update_lock);
 	return ret;
 }
 
 static int
 tfw_sched_hash_del_srv(TfwServer *srv)
 {
 	int ret = 0;
-	int i;
-
-	spin_lock_bh(&servers_write_lock);
-	i = find_srv_idx(srv);
-	if (i < 0) {
-		ERR("Can't delete the server from the scheduler - not found\n");
-		ret = -ENOENT;
-	} else {
-		servers[i] = servers[servers_n - 1];
-		--servers_n;
-		servers[servers_n] = NULL;
+	TfwSrvHash *curr;
+	TfwSrvHashList *new_list, *old_list;
+	size_t new_list_size, old_list_size;
+
+	spin_lock_bh(&srv_hashes_update_lock);
+
+	old_list = tfw_srv_hash_list;
+	old_list_size = TFW_SRV_HASH_LIST_SIZE(old_list->n);
+	new_list_size = TFW_SRV_HASH_LIST_SIZE(old_list->n - 1);
+
+	new_list = kzalloc(new_list_size, GFP_ATOMIC);
+	if (!new_list) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* Copy entries to the new list excluding the one being deleted. */
+	for (curr = old_list->srv_hashes; curr->srv; ++curr) {
+		if (curr->srv != srv)
+			new_list->srv_hashes[new_list->n++] = *curr;
 	}
-	spin_unlock_bh(&servers_write_lock);
+	BUG_ON(new_list->n != (old_list->n - 1));
 
+	rcu_assign_pointer(tfw_srv_hash_list, new_list);
+	kfree_rcu(old_list, rcu);
+out:
+	spin_unlock_bh(&srv_hashes_update_lock);
 	return ret;
 }
 
+static TfwScheduler tfw_sched_hash_mod = {
+	.name = "hash",
+	.get_srv = tfw_sched_hash_get_srv,
+	.add_srv = tfw_sched_hash_add_srv,
+	.del_srv = tfw_sched_hash_del_srv
+};
+
 int
 tfw_sched_hash_init(void)
 {
-	static TfwScheduler tfw_sched_hash_mod = {
-		.name = "hash",
-		.get_srv = tfw_sched_hash_get_srv,
-		.add_srv = tfw_sched_hash_add_srv,
-		.del_srv = tfw_sched_hash_del_srv
-	};
+	int r;
 
 	LOG("init\n");
 
-	return tfw_sched_register(&tfw_sched_hash_mod);
+	/* Allocate a dummy list to avoid NULL checks in other functions. */
+	tfw_srv_hash_list = kzalloc(TFW_SRV_HASH_LIST_SIZE(0), GFP_KERNEL);
+	if (!tfw_srv_hash_list) {
+		ERR("can't allocate empty server list\n");
+	}
+
+	r = tfw_sched_register(&tfw_sched_hash_mod);
+	if (r) {
+		ERR("can't register as a scheduler module of Tempesta FW\n");
+		kfree(tfw_srv_hash_list);
+	}
+
+	return r;
 }
 module_init(tfw_sched_hash_init);
 
 void
 tfw_sched_hash_exit(void)
 {
+	/* At the moment of un-loading the module the list shall not be used. */
+	BUG_ON(tfw_srv_hash_list->n);
+
+	kfree(tfw_srv_hash_list);
 	tfw_sched_unregister();
 }
 module_exit(tfw_sched_hash_exit);