From aae3e451faeb4ba752bf637ab5792980ee850a2c Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Sat, 29 Feb 2020 12:26:19 +0100 Subject: [PATCH 1/9] ctdb-daemon: ensure restart() callback is called in half-connected state If NODE_FLAGS_DISCONNECTED is set the node can be in half-connected state. With this change we ensure to restart the transport for this case. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme Reviewed-by: Martin Schwenke (cherry picked from commit 6a4fa0785fc83561939fa41617d526eb96c1af89) --- ctdb/server/ctdb_server.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/ctdb/server/ctdb_server.c b/ctdb/server/ctdb_server.c index 9724d1fe0a8..c5d92af0cfb 100644 --- a/ctdb/server/ctdb_server.c +++ b/ctdb/server/ctdb_server.c @@ -301,6 +301,12 @@ void ctdb_input_pkt(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) */ void ctdb_node_dead(struct ctdb_node *node) { + if (node->ctdb->methods == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Can not restart transport while shutting down daemon.\n")); + return; + } + + node->ctdb->methods->restart(node); if (node->flags & NODE_FLAGS_DISCONNECTED) { DEBUG(DEBUG_INFO,("%s: node %s is already marked disconnected: %u connected\n", node->ctdb->name, node->name, @@ -315,13 +321,6 @@ void ctdb_node_dead(struct ctdb_node *node) DEBUG(DEBUG_ERR,("%s: node %s is dead: %u connected\n", node->ctdb->name, node->name, node->ctdb->num_connected)); ctdb_daemon_cancel_controls(node->ctdb, node); - - if (node->ctdb->methods == NULL) { - DEBUG(DEBUG_ERR,(__location__ " Can not restart transport while shutting down daemon.\n")); - return; - } - - node->ctdb->methods->restart(node); } /* -- 2.24.1 From ee34f8f5ed8c149c30fc093048c2fc78af53d06a Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Sun, 1 Mar 2020 16:40:41 +1100 Subject: [PATCH 2/9] ctdb-daemon: more logical whitespace, debug modernisation BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Martin Schwenke Reviewed-by: Ralph Boehme (cherry picked from commit 15762a34559599cf908e30651a2d4c11560068ed) --- ctdb/server/ctdb_server.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ctdb/server/ctdb_server.c b/ctdb/server/ctdb_server.c index c5d92af0cfb..4b4c2e9896f 100644 --- a/ctdb/server/ctdb_server.c +++ b/ctdb/server/ctdb_server.c @@ -302,11 +302,11 @@ void ctdb_input_pkt(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) void ctdb_node_dead(struct ctdb_node *node) { if (node->ctdb->methods == NULL) { - DEBUG(DEBUG_ERR,(__location__ " Can not restart transport while shutting down daemon.\n")); + DBG_ERR("Can not restart transport while shutting down\n"); return; } - node->ctdb->methods->restart(node); + if (node->flags & NODE_FLAGS_DISCONNECTED) { DEBUG(DEBUG_INFO,("%s: node %s is already marked disconnected: %u connected\n", node->ctdb->name, node->name, -- 2.24.1 From 4d7b147b6d9c92a80466d857a14ce8d67829f091 Mon Sep 17 00:00:00 2001 From: Noel Power Date: Sat, 29 Feb 2020 15:49:28 +0000 Subject: [PATCH 3/9] ctdb-tcp: move free of inbound queue to TCP restart Since commit 77deaadca8e8dbc3c92ea16893099c72f6dc874e, a nodeA which had previously accepted a connection from nodeB (where nodeB dies e.g. as as result of fencing) when nodeB attempts to connect again after restarting is always rejected with ctdb_listen_event: Incoming queue active, rejecting connection from w.x.y.z messages. Consolidate dead node handling in the TCP restart handling. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Noel Power Reviewed-by: Ralph Boehme Reviewed-by: Martin Schwenke (cherry picked from commit 0ff1b78fc2f0491f9e11131d0040bdaba8873770) --- ctdb/tcp/tcp_init.c | 2 +- ctdb/tcp/tcp_io.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ctdb/tcp/tcp_init.c b/ctdb/tcp/tcp_init.c index 0eb9799ac4a..fe49022a1f6 100644 --- a/ctdb/tcp/tcp_init.c +++ b/ctdb/tcp/tcp_init.c @@ -121,7 +121,7 @@ static void ctdb_tcp_restart(struct ctdb_node *node) node->private_data, struct ctdb_tcp_node); DEBUG(DEBUG_NOTICE,("Tearing down connection to dead node :%d\n", node->pnn)); - + TALLOC_FREE(tnode->in_queue); ctdb_tcp_stop_connection(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, diff --git a/ctdb/tcp/tcp_io.c b/ctdb/tcp/tcp_io.c index 2d8ec0f7062..b1e44c8783d 100644 --- a/ctdb/tcp/tcp_io.c +++ b/ctdb/tcp/tcp_io.c @@ -75,7 +75,6 @@ void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args) return; failed: - TALLOC_FREE(tnode->in_queue); node->ctdb->upcalls->node_dead(node); TALLOC_FREE(data); -- 2.24.1 From 01d78d9502c9663c99cd8bf1d5d57c968fae04e0 Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Sat, 29 Feb 2020 12:13:12 +0100 Subject: [PATCH 4/9] ctdb-tcp: always call node_dead() upcall in ctdb_tcp_tnode_cb() ctdb_tcp_tnode_cb() is called when we receive data on the outgoing connection. This can happen when we get an EOF on the connection because the other side as closed. In this case data will be NULL. It would also be called if we received data from the peer. In this case data will not be NULL. The latter case is a fatal error though and we already call ctdb_tcp_stop_connection() for this case as well, which means even though the node is not fully connected anymore, by not calling the node_dead() upcall NODE_FLAGS_DISCONNECTED will not be set. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme Reviewed-by: Martin Schwenke (cherry picked from commit b83ef98c7466b2a81968555de83fb977bb6ca9f0) --- ctdb/tcp/tcp_connect.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 04897f44249..a2fd42fdac9 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -65,9 +65,7 @@ void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data) struct ctdb_tcp_node *tnode = talloc_get_type( node->private_data, struct ctdb_tcp_node); - if (data == NULL) { - node->ctdb->upcalls->node_dead(node); - } + node->ctdb->upcalls->node_dead(node); ctdb_tcp_stop_connection(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, -- 2.24.1 From 51fb08e1c33171eafdbda58f4800001073f7e894 Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Sat, 29 Feb 2020 12:28:20 +0100 Subject: [PATCH 5/9] ctdb-tcp: Remove redundant restart in ctdb_tcp_tnode_cb() The node dead upcall has already restarted the outgoing connection. There's no need to repeat it. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme Signed-off-by: Martin Schwenke (backported from commit ea37ecdcd5960311f54a7a5510b88a654da23daa) --- ctdb/tcp/tcp_connect.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index a2fd42fdac9..253bf741cbf 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -62,15 +62,9 @@ void ctdb_tcp_stop_connection(struct ctdb_node *node) void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data) { struct ctdb_node *node = talloc_get_type(private_data, struct ctdb_node); - struct ctdb_tcp_node *tnode = talloc_get_type( - node->private_data, struct ctdb_tcp_node); node->ctdb->upcalls->node_dead(node); - ctdb_tcp_stop_connection(node); - tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, - timeval_current_ofs(3, 0), - ctdb_tcp_node_connect, node); TALLOC_FREE(data); } -- 2.24.1 From 6e5465ea2382f81563d85b0993046825492eb19d Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Fri, 28 Feb 2020 11:36:00 +0100 Subject: [PATCH 6/9] ctdb-tcp: rename ctdb_tcp_stop_connection() to ctdb_tcp_stop_outgoing() No change in behaviour. This makes the code self-documenting. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme Reviewed-by: Martin Schwenke (backported from commit 1e2a967ff41cc29c3a0d7f61a46937c68fdb90ba) --- ctdb/tcp/ctdb_tcp.h | 2 +- ctdb/tcp/tcp_connect.c | 12 ++++++------ ctdb/tcp/tcp_init.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ctdb/tcp/ctdb_tcp.h b/ctdb/tcp/ctdb_tcp.h index daabad74297..095056e8544 100644 --- a/ctdb/tcp/ctdb_tcp.h +++ b/ctdb/tcp/ctdb_tcp.h @@ -48,7 +48,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, struct timeval t, void *private_data); void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args); void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data); -void ctdb_tcp_stop_connection(struct ctdb_node *node); +void ctdb_tcp_stop_outgoing(struct ctdb_node *node); #define CTDB_TCP_ALIGNMENT 8 diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 253bf741cbf..e242e0b8676 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -38,9 +38,9 @@ #include "ctdb_tcp.h" /* - stop any connecting (established or pending) to a node + stop any outgoing connection (established or pending) to a node */ -void ctdb_tcp_stop_connection(struct ctdb_node *node) +void ctdb_tcp_stop_outgoing(struct ctdb_node *node) { struct ctdb_tcp_node *tnode = talloc_get_type( node->private_data, struct ctdb_tcp_node); @@ -90,7 +90,7 @@ static void ctdb_node_connect_write(struct tevent_context *ev, ret = getsockopt(tnode->out_fd, SOL_SOCKET, SO_ERROR, &error, &len); if (ret != 0 || error != 0) { - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing(node); tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, timeval_current_ofs(1, 0), ctdb_tcp_node_connect, node); @@ -128,7 +128,7 @@ static void ctdb_node_connect_write(struct tevent_context *ev, node->name); if (tnode->out_queue == NULL) { DBG_ERR("Failed to set up outgoing queue\n"); - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing(node); tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, timeval_current_ofs(1, 0), @@ -168,7 +168,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, ctdb_sock_addr sock_out; int ret; - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing(node); sock_out = node->address; @@ -252,7 +252,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, return; failed: - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing(node); tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, timeval_current_ofs(1, 0), diff --git a/ctdb/tcp/tcp_init.c b/ctdb/tcp/tcp_init.c index fe49022a1f6..b44824d94ec 100644 --- a/ctdb/tcp/tcp_init.c +++ b/ctdb/tcp/tcp_init.c @@ -122,7 +122,7 @@ static void ctdb_tcp_restart(struct ctdb_node *node) DEBUG(DEBUG_NOTICE,("Tearing down connection to dead node :%d\n", node->pnn)); TALLOC_FREE(tnode->in_queue); - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, timeval_zero(), -- 2.24.1 From 60936534007cc3fdbace1cba48f1753738765d5e Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Sat, 29 Feb 2020 11:54:51 +0100 Subject: [PATCH 7/9] ctdb-tcp: add ctdb_tcp_stop_incoming() No change in behaviour. This makes the code self-documenting. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme Signed-off-by: Martin Schwenke (backported from commit 2c73dbafba50b28e72a8ec7b4382fae42fca6d17) --- ctdb/tcp/ctdb_tcp.h | 1 + ctdb/tcp/tcp_connect.c | 10 ++++++++++ ctdb/tcp/tcp_init.c | 2 +- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ctdb/tcp/ctdb_tcp.h b/ctdb/tcp/ctdb_tcp.h index 095056e8544..cb8d66fa5dc 100644 --- a/ctdb/tcp/ctdb_tcp.h +++ b/ctdb/tcp/ctdb_tcp.h @@ -49,6 +49,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args); void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data); void ctdb_tcp_stop_outgoing(struct ctdb_node *node); +void ctdb_tcp_stop_incoming(struct ctdb_node *node); #define CTDB_TCP_ALIGNMENT 8 diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index e242e0b8676..22f149b5b3c 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -54,6 +54,16 @@ void ctdb_tcp_stop_outgoing(struct ctdb_node *node) } } +/* + stop incoming connection to a node + */ +void ctdb_tcp_stop_incoming(struct ctdb_node *node) +{ + struct ctdb_tcp_node *tnode = talloc_get_type( + node->private_data, struct ctdb_tcp_node); + + TALLOC_FREE(tnode->in_queue); +} /* called when a complete packet has come in - should not happen on this socket diff --git a/ctdb/tcp/tcp_init.c b/ctdb/tcp/tcp_init.c index b44824d94ec..ea1edd418cf 100644 --- a/ctdb/tcp/tcp_init.c +++ b/ctdb/tcp/tcp_init.c @@ -121,7 +121,7 @@ static void ctdb_tcp_restart(struct ctdb_node *node) node->private_data, struct ctdb_tcp_node); DEBUG(DEBUG_NOTICE,("Tearing down connection to dead node :%d\n", node->pnn)); - TALLOC_FREE(tnode->in_queue); + ctdb_tcp_stop_incoming(node); ctdb_tcp_stop_outgoing(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, -- 2.24.1 From 79a11ca83a7d8b0db5487f94dfe994cf26f44fd5 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 6 Mar 2020 15:59:32 +1100 Subject: [PATCH 8/9] ctdb-tcp: Factor out function ctdb_tcp_start_outgoing() BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Amitay Isaacs Signed-off-by: Martin Schwenke (cherry picked from commit 3c8747fe29486a4f95308b335a5e3ec1807f62cb) --- ctdb/tcp/tcp_connect.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 22f149b5b3c..0cd22e1b4c9 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -164,11 +164,8 @@ static void ctdb_node_connect_write(struct tevent_context *ev, /* called when we should try and establish a tcp connection to a node */ -void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, - struct timeval t, void *private_data) +static void ctdb_tcp_start_outgoing(struct ctdb_node *node) { - struct ctdb_node *node = talloc_get_type(private_data, - struct ctdb_node); struct ctdb_tcp_node *tnode = talloc_get_type(node->private_data, struct ctdb_tcp_node); struct ctdb_context *ctdb = node->ctdb; @@ -178,8 +175,6 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, ctdb_sock_addr sock_out; int ret; - ctdb_tcp_stop_outgoing(node); - sock_out = node->address; tnode->out_fd = socket(sock_out.sa.sa_family, SOCK_STREAM, IPPROTO_TCP); @@ -270,6 +265,18 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, node); } +void ctdb_tcp_node_connect(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, + void *private_data) +{ + struct ctdb_node *node = talloc_get_type_abort(private_data, + struct ctdb_node); + + ctdb_tcp_stop_outgoing(node); + ctdb_tcp_start_outgoing(node); +} + /* called when we get contacted by another node currently makes no attempt to check if the connection is really from a ctdb -- 2.24.1 From 968419e13c6133d3f2b06756ec6f56fd40b71517 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 6 Mar 2020 16:11:23 +1100 Subject: [PATCH 9/9] ctdb-tcp: Do not stop outbound connection in ctdb_tcp_node_connect() The only place the outgoing connection needs to be stopped is when there is a timeout when waiting for the connection to become writable. Add a new function ctdb_tcp_node_connect_timeout() to handle this case. All of the other cases are attempts to establish a new outgoing connection (initial attempt, retry after an error or disconnect, ...) so drop stopping the connection in those cases. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Amitay Isaacs Signed-off-by: Martin Schwenke Autobuild-User(master): Martin Schwenke Autobuild-Date(master): Thu Mar 12 05:29:20 UTC 2020 on sn-devel-184 (cherry picked from commit 319c93f0c6a949545229b616dfbd4f51baf11171) --- ctdb/tcp/tcp_connect.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 0cd22e1b4c9..4192605d4ef 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -161,6 +161,11 @@ static void ctdb_node_connect_write(struct tevent_context *ev, } +static void ctdb_tcp_node_connect_timeout(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, + void *private_data); + /* called when we should try and establish a tcp connection to a node */ @@ -251,7 +256,7 @@ static void ctdb_tcp_start_outgoing(struct ctdb_node *node) tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, timeval_current_ofs(1, 0), - ctdb_tcp_node_connect, + ctdb_tcp_node_connect_timeout, node); return; @@ -273,6 +278,17 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct ctdb_node *node = talloc_get_type_abort(private_data, struct ctdb_node); + ctdb_tcp_start_outgoing(node); +} + +static void ctdb_tcp_node_connect_timeout(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, + void *private_data) +{ + struct ctdb_node *node = talloc_get_type_abort(private_data, + struct ctdb_node); + ctdb_tcp_stop_outgoing(node); ctdb_tcp_start_outgoing(node); } -- 2.24.1