From 046dec754aab2b3544047c01b1a5548f6a363458 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 19 Jan 2018 14:55:21 +1100 Subject: [PATCH 1/3] ctdb-recoverd: Drop unnecessary code This has already been done in update_flags(). BUG: https://bugzilla.samba.org/show_bug.cgi?id=14513 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 3ab52b528673e08caa66f00e963528c591a84fe1) --- ctdb/server/ctdb_recoverd.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index f825427e7a3..d42333b860d 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -2667,20 +2667,6 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, } } - /* - * Update node flags obtained from each active node. This ensure we have - * up-to-date information for all the nodes. - */ - for (j=0; jnum; j++) { - if (nodemap->nodes[j].pnn == ctdb->pnn) { - continue; - } - if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) { - continue; - } - nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags; - } - for (j=0; jnum; j++) { if (nodemap->nodes[j].pnn == ctdb->pnn) { continue; -- 2.28.0 From c962739485571db8486c705e7096bc806ef2e630 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Tue, 16 Jan 2018 15:15:51 +1100 Subject: [PATCH 2/3] ctdb-recoverd: Drop unnecessary and broken code update_flags() has already updated the recovery master's canonical node map, based on the flags from each remote node, and pushed out these flags to all nodes. If i == j then the node map has already been updated from this remote node's flags, so simply drop this case. Although update_flags() has updated flags for all nodes, it did not update each node map in remote_nodemaps[] to reflect this. This means that remote_nodemaps[] may contain inconsistent flags for some nodes so it should not be used to check consistency when i != j. Further, a meaningful difference in flags can only really occur if update_flags() failed. In that case this code is never reached. These observations combine to imply that this whole loop should be dropped. This leaves potential sub-second inconsistencies due to out-of-band healthy/unhealthy flag changes pushed via CTDB_SRVID_PUSH_NODE_FLAGS. These updates could be dropped (takeover run asks each node for available IPs rather than making centralised decisions based on node flags) but for now they will be fixed in the next iteration of main_loop(). BUG: https://bugzilla.samba.org/show_bug.cgi?id=14513 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 4b01f54041dee469971f244e64064eed46de2ed5) --- ctdb/server/ctdb_recoverd.c | 47 ------------------------------------- 1 file changed, 47 deletions(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index d42333b860d..856fcbb72c8 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -2667,53 +2667,6 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, } } - for (j=0; jnum; j++) { - if (nodemap->nodes[j].pnn == ctdb->pnn) { - continue; - } - if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) { - continue; - } - - /* verify the flags are consistent - */ - for (i=0; inum; i++) { - if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) { - continue; - } - - if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) { - DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n", - nodemap->nodes[j].pnn, - nodemap->nodes[i].pnn, - remote_nodemaps[j]->nodes[i].flags, - nodemap->nodes[i].flags)); - if (i == j) { - DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j)); - update_flags_on_all_nodes( - rec, - nodemap->nodes[i].pnn, - remote_nodemaps[j]->nodes[i].flags); - ctdb_set_culprit(rec, nodemap->nodes[j].pnn); - do_recovery(rec, mem_ctx, pnn, nodemap, - vnnmap); - return; - } else { - DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i)); - update_flags_on_all_nodes( - rec, - nodemap->nodes[i].pnn, - nodemap->nodes[i].flags); - ctdb_set_culprit(rec, nodemap->nodes[j].pnn); - do_recovery(rec, mem_ctx, pnn, nodemap, - vnnmap); - return; - } - } - } - } - - /* count how many active nodes there are */ num_lmasters = 0; for (i=0; inum; i++) { -- 2.28.0 From 58438c6523763124fb0a019e46584b4c68417273 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Wed, 30 Sep 2020 10:48:38 +1000 Subject: [PATCH 3/3] ctdb-tests: Strengthen node state checking in ctdb disable/enable test Check that the desired state is set on all nodes instead of just the test node. This ensures that node flags have correctly propagated across the cluster. RN: Fix remaining ctdb disable/enable bug BUG: https://bugzilla.samba.org/show_bug.cgi?id=14513 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs Autobuild-User(master): Amitay Isaacs Autobuild-Date(master): Tue Oct 6 04:32:06 UTC 2020 on sn-devel-184 (cherry picked from commit b68105b8f7c20692d23d457f2777edcf44f12bb8) Signed-off-by: Martin Schwenke --- ctdb/tests/INTEGRATION/failover/pubips.030.disable_enable.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ctdb/tests/INTEGRATION/failover/pubips.030.disable_enable.sh b/ctdb/tests/INTEGRATION/failover/pubips.030.disable_enable.sh index c0bb62d1991..0b8425b2556 100755 --- a/ctdb/tests/INTEGRATION/failover/pubips.030.disable_enable.sh +++ b/ctdb/tests/INTEGRATION/failover/pubips.030.disable_enable.sh @@ -21,10 +21,10 @@ select_test_node_and_ips echo "Disabling node $test_node" try_command_on_node 1 $CTDB disable -n $test_node -wait_until_node_has_status $test_node disabled +wait_until_node_has_status $test_node disabled 30 all wait_until_node_has_no_ips "$test_node" echo "Re-enabling node $test_node" try_command_on_node 1 $CTDB enable -n $test_node -wait_until_node_has_status $test_node enabled +wait_until_node_has_status $test_node enabled 30 all wait_until_node_has_some_ips "$test_node" -- 2.28.0