dovecot-2.2: director: Detect if directors' hosts have become desynced by sending hosts_hash in SYNC parameter.

dovecot at dovecot.org dovecot at dovecot.org
Mon Oct 12 12:49:32 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/8f225e43e6e3
changeset: 19293:8f225e43e6e3
user:      Timo Sirainen <tss at iki.fi>
date:      Mon Oct 12 15:47:46 2015 +0300
description:
director: Detect if directors' hosts have become desynced by sending hosts_hash in SYNC parameter.
Also fix up such a situation by resending all HOSTs.

diffstat:

 src/director/director-connection.c |  47 ++++++++++++++++++++++++++++++++--
 src/director/director-host.h       |   4 ++
 src/director/director.c            |  51 ++++++++++++++++++++++++++-----------
 src/director/director.h            |   3 +-
 4 files changed, 84 insertions(+), 21 deletions(-)

diffs (234 lines):

diff -r bae8efd8b5b3 -r 8f225e43e6e3 src/director/director-connection.c
--- a/src/director/director-connection.c	Mon Oct 12 15:41:55 2015 +0300
+++ b/src/director/director-connection.c	Mon Oct 12 15:47:46 2015 +0300
@@ -228,7 +228,8 @@
 		dir->sync_seq++;
 		director_set_ring_unsynced(dir);
 		director_sync_send(dir, dir->self_host, dir->sync_seq,
-				   DIRECTOR_VERSION_MINOR, ioloop_time);
+				   DIRECTOR_VERSION_MINOR, ioloop_time,
+				   mail_hosts_hash(dir->mail_hosts));
 	}
 	director_connection_set_ping_timeout(conn);
 }
@@ -1243,7 +1244,7 @@
 director_connection_sync_host(struct director_connection *conn,
 			      struct director_host *host,
 			      uint32_t seq, unsigned int minor_version,
-			      unsigned int timestamp)
+			      unsigned int timestamp, unsigned int hosts_hash)
 {
 	struct director *dir = conn->dir;
 
@@ -1261,6 +1262,16 @@
 		   successfully connected to both directions */
 		i_assert(dir->left != NULL && dir->right != NULL);
 
+		if (hosts_hash != 0 &&
+		    hosts_hash != mail_hosts_hash(conn->dir->mail_hosts)) {
+			i_error("director(%s): Hosts unexpectedly changed during SYNC reply - resending"
+				"(seq=%u, old hosts_hash=%u, new hosts_hash=%u)",
+				conn->name, seq, hosts_hash,
+				mail_hosts_hash(dir->mail_hosts));
+			(void)director_resend_sync(dir);
+			return FALSE;
+		}
+
 		dir->ring_min_version = minor_version;
 		if (!dir->ring_handshaked) {
 			/* the ring is handshaked */
@@ -1311,10 +1322,32 @@
 			return FALSE;
 		}
 
+		if (hosts_hash != 0 &&
+		    hosts_hash != mail_hosts_hash(conn->dir->mail_hosts)) {
+			if (host->desynced_hosts_hash != hosts_hash) {
+				dir_debug("Ignore director %s stale SYNC request whose hosts don't match us "
+					  "(seq=%u, remote hosts_hash=%u, my hosts_hash=%u)",
+					  net_ip2addr(&host->ip), seq, hosts_hash,
+					  mail_hosts_hash(dir->mail_hosts));
+				host->desynced_hosts_hash = hosts_hash;
+				return FALSE;
+			}
+			/* we'll get here only if we received a SYNC twice
+			   with the same wrong hosts_hash. FIXME: this gets
+			   triggered unnecessarily sometimes if hosts are
+			   changing rapidly. */
+			i_error("director(%s): Director %s SYNC request hosts don't match us - resending hosts "
+				"(seq=%u, remote hosts_hash=%u, my hosts_hash=%u)",
+				conn->name, net_ip2addr(&host->ip), seq,
+				hosts_hash, mail_hosts_hash(dir->mail_hosts));
+			director_resend_hosts(dir);
+			return FALSE;
+		}
+		host->desynced_hosts_hash = 0;
 		if (dir->right != NULL) {
 			/* forward it to the connection on right */
 			director_sync_send(dir, host, seq, minor_version,
-					   timestamp);
+					   timestamp, hosts_hash);
 		}
 	}
 	return TRUE;
@@ -1328,6 +1361,7 @@
 	struct ip_addr ip;
 	in_port_t port;
 	unsigned int arg_count, seq, minor_version = 0, timestamp = ioloop_time;
+	unsigned int hosts_hash = 0;
 
 	arg_count = str_array_length(args);
 	if (arg_count < 3 ||
@@ -1344,13 +1378,18 @@
 		director_cmd_error(conn, "Invalid parameters");
 		return FALSE;
 	}
+	if (arg_count >= 6 && str_to_uint(args[5], &hosts_hash) < 0) {
+		director_cmd_error(conn, "Invalid parameters");
+		return FALSE;
+	}
 
 	/* find the originating director. if we don't see it, it was already
 	   removed and we can ignore this sync. */
 	host = director_host_lookup(dir, &ip, port);
 	if (host != NULL) {
 		if (!director_connection_sync_host(conn, host, seq,
-						   minor_version, timestamp))
+						   minor_version, timestamp,
+						   hosts_hash))
 			return TRUE;
 	}
 
diff -r bae8efd8b5b3 -r 8f225e43e6e3 src/director/director-host.h
--- a/src/director/director-host.h	Mon Oct 12 15:41:55 2015 +0300
+++ b/src/director/director-host.h	Mon Oct 12 15:47:46 2015 +0300
@@ -23,6 +23,10 @@
 	/* use these to avoid infinitely sending SYNCs for directors that
 	   aren't connected in the ring. */
 	unsigned int last_sync_seq, last_sync_seq_counter, last_sync_timestamp;
+	/* whenever we receive a SYNC with a stale hosts_hash, store it here.
+	   if it's already set and equals the received SYNC's hosts_hash,
+	   re-send our hosts to everybody in case they got out of sync. */
+	unsigned int desynced_hosts_hash;
 	/* Last time host was detected to be down */
 	time_t last_network_failure;
 	time_t last_protocol_failure;
diff -r bae8efd8b5b3 -r 8f225e43e6e3 src/director/director.c
--- a/src/director/director.c	Mon Oct 12 15:41:55 2015 +0300
+++ b/src/director/director.c	Mon Oct 12 15:47:46 2015 +0300
@@ -321,7 +321,7 @@
 
 void director_sync_send(struct director *dir, struct director_host *host,
 			uint32_t seq, unsigned int minor_version,
-			unsigned int timestamp)
+			unsigned int timestamp, unsigned int hosts_hash)
 {
 	string_t *str;
 
@@ -331,7 +331,8 @@
 	if (minor_version > 0 &&
 	    director_connection_get_minor_version(dir->right) > 0) {
 		/* only minor_version>0 supports extra parameters */
-		str_printfa(str, "\t%u\t%u", minor_version, timestamp);
+		str_printfa(str, "\t%u\t%u\t%u", minor_version,
+			    timestamp, hosts_hash);
 	}
 	str_append_c(str, '\n');
 	director_connection_send(dir->right, str_c(str));
@@ -349,7 +350,8 @@
 		/* send a new SYNC in case the previous one got dropped */
 		dir->self_host->last_sync_timestamp = ioloop_time;
 		director_sync_send(dir, dir->self_host, dir->sync_seq,
-				   DIRECTOR_VERSION_MINOR, ioloop_time);
+				   DIRECTOR_VERSION_MINOR, ioloop_time,
+				   mail_hosts_hash(dir->mail_hosts));
 		if (dir->to_sync != NULL)
 			timeout_reset(dir->to_sync);
 		return TRUE;
@@ -412,7 +414,8 @@
 		director_connection_set_synced(dir->left, FALSE);
 	director_connection_set_synced(dir->right, FALSE);
 	director_sync_send(dir, dir->self_host, dir->sync_seq,
-			   DIRECTOR_VERSION_MINOR, ioloop_time);
+			   DIRECTOR_VERSION_MINOR, ioloop_time,
+			   mail_hosts_hash(dir->mail_hosts));
 }
 
 void director_sync_freeze(struct director *dir)
@@ -515,21 +518,13 @@
 				     DIRECTOR_VERSION_RING_REMOVE, cmd);
 }
 
-void director_update_host(struct director *dir, struct director_host *src,
-			  struct director_host *orig_src,
-			  struct mail_host *host)
+static void
+director_send_host(struct director *dir, struct director_host *src,
+		   struct director_host *orig_src,
+		   struct mail_host *host)
 {
 	string_t *str;
 
-	/* update state in case this is the first mail host being added */
-	director_set_state_changed(dir);
-
-	dir_debug("Updating host %s vhost_count=%u "
-		  "down=%d last_updown_change=%ld (hosts_hash=%u)",
-		  net_ip2addr(&host->ip), host->vhost_count, host->down,
-		  (long)host->last_updown_change,
-		  mail_hosts_hash(dir->mail_hosts));
-
 	if (orig_src == NULL) {
 		orig_src = dir->self_host;
 		orig_src->last_seq++;
@@ -556,6 +551,30 @@
 	}
 	str_append_c(str, '\n');
 	director_update_send(dir, src, str_c(str));
+}
+
+void director_resend_hosts(struct director *dir)
+{
+	struct mail_host *const *hostp;
+
+	array_foreach(mail_hosts_get(dir->mail_hosts), hostp)
+		director_send_host(dir, dir->self_host, NULL, *hostp);
+}
+
+void director_update_host(struct director *dir, struct director_host *src,
+			  struct director_host *orig_src,
+			  struct mail_host *host)
+{
+	/* update state in case this is the first mail host being added */
+	director_set_state_changed(dir);
+
+	dir_debug("Updating host %s vhost_count=%u "
+		  "down=%d last_updown_change=%ld (hosts_hash=%u)",
+		  net_ip2addr(&host->ip), host->vhost_count, host->down,
+		  (long)host->last_updown_change,
+		  mail_hosts_hash(dir->mail_hosts));
+
+	director_send_host(dir, src, orig_src, host);
 
 	host->desynced = TRUE;
 	director_sync(dir);
diff -r bae8efd8b5b3 -r 8f225e43e6e3 src/director/director.h
--- a/src/director/director.h	Mon Oct 12 15:41:55 2015 +0300
+++ b/src/director/director.h	Mon Oct 12 15:47:46 2015 +0300
@@ -115,7 +115,7 @@
 void director_set_state_changed(struct director *dir);
 void director_sync_send(struct director *dir, struct director_host *host,
 			uint32_t seq, unsigned int minor_version,
-			unsigned int timestamp);
+			unsigned int timestamp, unsigned int hosts_hash);
 bool director_resend_sync(struct director *dir);
 
 void director_notify_ring_added(struct director_host *added_host,
@@ -126,6 +126,7 @@
 void director_update_host(struct director *dir, struct director_host *src,
 			  struct director_host *orig_src,
 			  struct mail_host *host) ATTR_NULL(3);
+void director_resend_hosts(struct director *dir);
 void director_remove_host(struct director *dir, struct director_host *src,
 			  struct director_host *orig_src,
 			  struct mail_host *host) ATTR_NULL(2, 3);


More information about the dovecot-cvs mailing list