From 404d4cf2032a7e932b12932e4bac0b87c4df6141 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Mon, 21 Jan 2019 16:28:28 +1100 Subject: [PATCH 1/9] ctdb-recoverd: Free cluster mutex handler on failure to take lock If nested events occur while the file descriptor handler is still active then chaos can ensue. For example, if a node is banned and the lock is explicitly cancelled (e.g. due to election loss) then double-talloc-free()s abound. BUG: https://bugzilla.samba.org/show_bug.cgi?id=13800 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 621658cbed5d91d7096fc208bac2ff93a1880e7d) --- ctdb/server/ctdb_recoverd.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 578127a4514..a63021e4d8b 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -897,6 +897,16 @@ static void take_reclock_handler(char status, struct ctdb_recovery_lock_handle *s = (struct ctdb_recovery_lock_handle *) private_data; + s->locked = (status == '0') ; + + /* + * If unsuccessful then ensure the process has exited and that + * the file descriptor event handler has been cancelled + */ + if (! s->locked) { + TALLOC_FREE(s->h); + } + switch (status) { case '0': s->latency = latency; @@ -912,7 +922,6 @@ static void take_reclock_handler(char status, } s->done = true; - s->locked = (status == '0') ; } static void force_election(struct ctdb_recoverd *rec, -- 2.20.1 From ba24f974d9ecefe14a247ce01726b037cfdec878 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Mon, 21 Jan 2019 16:36:13 +1100 Subject: [PATCH 2/9] ctdb-recoverd: Clean up logging on failure to take recovery lock Add an explicit case for a timeout and clean up the other messages. BUG: https://bugzilla.samba.org/show_bug.cgi?id=13800 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 7e4aae6943291c3144c8a3ff97537e8d4c7dc7c9) --- ctdb/server/ctdb_recoverd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index a63021e4d8b..473e6cbe8cc 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -913,12 +913,15 @@ static void take_reclock_handler(char status, break; case '1': - DEBUG(DEBUG_ERR, - ("Unable to take recovery lock - contention\n")); + D_ERR("Unable to take recovery lock - contention\n"); + break; + + case '2': + D_ERR("Unable to take recovery lock - timeout\n"); break; default: - DEBUG(DEBUG_ERR, ("ERROR: when taking recovery lock\n")); + D_ERR("Unable to take recover lock - unknown error\n"); } s->done = true; -- 2.20.1 From 5efce3581b3bf5d8d5fd7af371c722234b28da57 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Thu, 10 Jan 2019 13:24:34 +1100 Subject: [PATCH 3/9] ctdb-recoverd: Make recoverd context available in recovery lock handle BUG: https://bugzilla.samba.org/show_bug.cgi?id=13800 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit c0fb62ed3954fc6e8667480aba92003fc270f257) --- ctdb/server/ctdb_recoverd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 473e6cbe8cc..c3beac7d18b 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -888,6 +888,7 @@ struct ctdb_recovery_lock_handle { bool locked; double latency; struct ctdb_cluster_mutex_handle *h; + struct ctdb_recoverd *rec; }; static void take_reclock_handler(char status, @@ -954,6 +955,8 @@ static bool ctdb_recovery_lock(struct ctdb_recoverd *rec) return false; }; + s->rec = rec; + h = ctdb_cluster_mutex(s, ctdb, ctdb->recovery_lock, -- 2.20.1 From 5a666176f415b10b7b15c4a036da1784a888fbca Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Thu, 10 Jan 2019 14:01:57 +1100 Subject: [PATCH 4/9] ctdb-recoverd: Ban node on unknown error when taking recovery lock We really shouldn't see unknown errors. They probably represent a misconfigured recovery lock or similar. BUG: https://bugzilla.samba.org/show_bug.cgi?id=13800 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 45a77d65b2e39b4af94da4ab99575f4ee08a7ebd) --- ctdb/server/ctdb_recoverd.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index c3beac7d18b..584d65d61a7 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -923,6 +923,17 @@ static void take_reclock_handler(char status, default: D_ERR("Unable to take recover lock - unknown error\n"); + + { + struct ctdb_recoverd *rec = s->rec; + struct ctdb_context *ctdb = rec->ctdb; + uint32_t pnn = ctdb_get_pnn(ctdb); + + D_ERR("Banning this node\n"); + ctdb_ban_node(rec, + pnn, + ctdb->tunable.recovery_ban_period); + } } s->done = true; -- 2.20.1 From aa3c6ec12677abf3c847810fd249acf2b8b5c609 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 22 Feb 2019 15:09:33 +1100 Subject: [PATCH 5/9] ctdb-recoverd: Time out attempt to take recovery lock after 120s Currently this will wait forever. It really needs a timeout in case the cluster filesystem (or other lock mechanism) is completely wedged. BUG: https://bugzilla.samba.org/show_bug.cgi?id=13800 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 13a1a4808935290dceb219daccd7aac3fda4e184) --- ctdb/server/ctdb_recoverd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 584d65d61a7..9b3559b2a92 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -971,7 +971,7 @@ static bool ctdb_recovery_lock(struct ctdb_recoverd *rec) h = ctdb_cluster_mutex(s, ctdb, ctdb->recovery_lock, - 0, + 120, take_reclock_handler, s, lost_reclock_handler, -- 2.20.1 From cb33080209e473aaf396ece15a2ce6215e0e3684 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Mon, 21 Jan 2019 12:13:08 +1100 Subject: [PATCH 6/9] ctdb-tests: Force test failure if local daemon setup fails BUG: https://bugzilla.samba.org/show_bug.cgi?id=13800 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit ce09d9c3e4c72ebec7a21686ae913398a5c9020f) --- ctdb/tests/simple/scripts/local_daemons.bash | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ctdb/tests/simple/scripts/local_daemons.bash b/ctdb/tests/simple/scripts/local_daemons.bash index cf6671757b3..5d07b1f6398 100644 --- a/ctdb/tests/simple/scripts/local_daemons.bash +++ b/ctdb/tests/simple/scripts/local_daemons.bash @@ -34,6 +34,9 @@ setup_ctdb () ${public_addresses:+-P} ${public_addresses} \ ${CTDB_USE_IPV6:+-6} \ ${TEST_SOCKET_WRAPPER_SO_PATH:+-S} ${TEST_SOCKET_WRAPPER_SO_PATH} + if [ $? -ne 0 ] ; then + exit 1 + fi local pnn for pnn in $(seq 0 $(($TEST_LOCAL_DAEMONS - 1))) ; do -- 2.20.1 From 737b578fd9c412364c9d82758324cdc7f4b4479b Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Mon, 21 Jan 2019 12:13:29 +1100 Subject: [PATCH 7/9] ctdb-tests: Add -R option for local daemons to use recovery lock command Under the covers, a command is always used. However, there is no way of testing of the code path where a command is explicitly configured. BUG: https://bugzilla.samba.org/show_bug.cgi?id=13800 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit e74f5243fcb7594939769c16f3c79ab167dd1227) --- ctdb/tests/local_daemons.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ctdb/tests/local_daemons.sh b/ctdb/tests/local_daemons.sh index 9329a60643c..07cf1e9b135 100755 --- a/ctdb/tests/local_daemons.sh +++ b/ctdb/tests/local_daemons.sh @@ -132,6 +132,7 @@ Options: -N Nodes file (default: automatically generated) -n Number of nodes (default: 3) -P Public addresses file (default: automatically generated) + -R Use a command for the recovery lock (default: use a file) -S Socket wrapper shared library to preload (default: none) -6 Generate IPv6 IPs for nodes, public addresses (default: IPv4) EOF @@ -145,17 +146,19 @@ local_daemons_setup () _nodes_file="" _num_nodes=3 _public_addresses_file="" + _recovery_lock_use_command=false _socket_wrapper="" _use_ipv6=false set -e - while getopts "FN:n:P:S:6h?" _opt ; do + while getopts "FN:n:P:RS:6h?" _opt ; do case "$_opt" in F) _disable_failover=true ;; N) _nodes_file="$OPTARG" ;; n) _num_nodes="$OPTARG" ;; P) _public_addresses_file="$OPTARG" ;; + R) _recovery_lock_use_command=true ;; S) _socket_wrapper="$OPTARG" ;; 6) _use_ipv6=true ;; \?|h) local_daemons_setup_usage ;; @@ -188,6 +191,11 @@ local_daemons_setup () $_use_ipv6 >"$_public_addresses_all" fi + _recovery_lock="${directory}/rec.lock" + if $_recovery_lock_use_command ; then + _recovery_lock="! ${CTDB_CLUSTER_MUTEX_HELPER} ${_recovery_lock}" + fi + if [ -n "$_socket_wrapper" ] ; then setup_socket_wrapper "$_socket_wrapper" fi @@ -221,7 +229,7 @@ local_daemons_setup () log level = INFO [cluster] - recovery lock = ${directory}/rec.lock + recovery lock = ${_recovery_lock} node address = ${_node_ip} [database] -- 2.20.1 From a1c548f3b314fdbd55293ff2249745fec429244c Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Mon, 21 Jan 2019 12:15:33 +1100 Subject: [PATCH 8/9] ctdb-tests: Add a test for configuring the recovery lock as a command BUG: https://bugzilla.samba.org/show_bug.cgi?id=13800 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit ebc082122fb34ffb8cbcafde9ad39bcc241d33ed) --- ctdb/tests/simple/01_ctdb_reclock_command.sh | 28 ++++++++++++++++++++ ctdb/tests/simple/scripts/local_daemons.bash | 3 +++ 2 files changed, 31 insertions(+) create mode 100755 ctdb/tests/simple/01_ctdb_reclock_command.sh diff --git a/ctdb/tests/simple/01_ctdb_reclock_command.sh b/ctdb/tests/simple/01_ctdb_reclock_command.sh new file mode 100755 index 00000000000..34f82e981a1 --- /dev/null +++ b/ctdb/tests/simple/01_ctdb_reclock_command.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +test_info() +{ + cat < Date: Mon, 21 Jan 2019 12:16:43 +1100 Subject: [PATCH 9/9] ctdb-cluster-mutex: Separate out command and file handling This code is difficult to read and there really is no common code between the 2 cases. For example, there is no need to split a filename into words. Separating each of the 2 cases into its own function makes the logic much easier to understand. BUG: https://bugzilla.samba.org/show_bug.cgi?id=13800 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs Autobuild-User(master): Amitay Isaacs Autobuild-Date(master): Mon Feb 25 03:40:16 CET 2019 on sn-devel-144 (cherry picked from commit c93430fe8fe530a55b9a04cf6cc660c3d420e333) --- ctdb/server/ctdb_cluster_mutex.c | 113 +++++++++++++++++++------------ 1 file changed, 71 insertions(+), 42 deletions(-) diff --git a/ctdb/server/ctdb_cluster_mutex.c b/ctdb/server/ctdb_cluster_mutex.c index 330d5fd1d90..2e3cb8112ad 100644 --- a/ctdb/server/ctdb_cluster_mutex.c +++ b/ctdb/server/ctdb_cluster_mutex.c @@ -118,72 +118,101 @@ static void cluster_mutex_handler(struct tevent_context *ev, static char cluster_mutex_helper[PATH_MAX+1] = ""; -static bool cluster_mutex_helper_args(TALLOC_CTX *mem_ctx, - const char *argstring, char ***argv) +static bool cluster_mutex_helper_args_file(TALLOC_CTX *mem_ctx, + const char *argstring, + char ***argv) { - int nargs, i, ret, n; - bool is_command = false; + bool ok; char **args = NULL; - char *strv = NULL; - char *t = NULL; - if (argstring != NULL && argstring[0] == '!') { - /* This is actually a full command */ - is_command = true; - t = discard_const(&argstring[1]); - } else { - is_command = false; - t = discard_const(argstring); + ok = ctdb_set_helper("cluster mutex helper", + cluster_mutex_helper, + sizeof(cluster_mutex_helper), + "CTDB_CLUSTER_MUTEX_HELPER", + CTDB_HELPER_BINDIR, + "ctdb_mutex_fcntl_helper"); + if (! ok) { + DBG_ERR("ctdb exiting with error: " + "Unable to set cluster mutex helper\n"); + exit(1); } - ret = strv_split(mem_ctx, &strv, t, " \t"); - if (ret != 0) { - DEBUG(DEBUG_ERR, - ("Unable to parse mutex helper string \"%s\" (%s)\n", - argstring, strerror(ret))); + + /* Array includes default helper, file and NULL */ + args = talloc_array(mem_ctx, char *, 3); + if (args == NULL) { + DBG_ERR("Memory allocation error\n"); return false; } - n = strv_count(strv); - args = talloc_array(mem_ctx, char *, n + (is_command ? 1 : 2)); + args[0] = cluster_mutex_helper; - if (args == NULL) { - DEBUG(DEBUG_ERR,(__location__ " out of memory\n")); + args[1] = talloc_strdup(args, argstring); + if (args[1] == NULL) { + DBG_ERR("Memory allocation error\n"); return false; } - nargs = 0; - - if (! is_command) { - if (!ctdb_set_helper("cluster mutex helper", - cluster_mutex_helper, - sizeof(cluster_mutex_helper), - "CTDB_CLUSTER_MUTEX_HELPER", - CTDB_HELPER_BINDIR, - "ctdb_mutex_fcntl_helper")) { - DEBUG(DEBUG_ERR,("ctdb exiting with error: %s\n", - __location__ - " Unable to set cluster mutex helper\n")); - exit(1); - } + args[2] = NULL; + + *argv = args; + return true; +} - args[nargs++] = cluster_mutex_helper; +static bool cluster_mutex_helper_args_cmd(TALLOC_CTX *mem_ctx, + const char *argstring, + char ***argv) +{ + int i, ret, n; + char **args = NULL; + char *strv = NULL; + char *t = NULL; + + ret = strv_split(mem_ctx, &strv, argstring, " \t"); + if (ret != 0) { + D_ERR("Unable to parse mutex helper command \"%s\" (%s)\n", + argstring, + strerror(ret)); + return false; } + n = strv_count(strv); + + /* Extra slot for NULL */ + args = talloc_array(mem_ctx, char *, n + 1); + if (args == NULL) { + DBG_ERR("Memory allocation error\n"); + return false; + } + + talloc_steal(args, strv); t = NULL; - for (i = 0; i < n; i++) { - /* Don't copy, just keep cmd_args around */ + for (i = 0 ; i < n; i++) { t = strv_next(strv, t); - args[nargs++] = t; + args[i] = t; } - /* Make sure last argument is NULL */ - args[nargs] = NULL; + args[n] = NULL; *argv = args; return true; } +static bool cluster_mutex_helper_args(TALLOC_CTX *mem_ctx, + const char *argstring, + char ***argv) +{ + bool ok; + + if (argstring != NULL && argstring[0] == '!') { + ok = cluster_mutex_helper_args_cmd(mem_ctx, &argstring[1], argv); + } else { + ok = cluster_mutex_helper_args_file(mem_ctx, argstring, argv); + } + + return ok; +} + struct ctdb_cluster_mutex_handle * ctdb_cluster_mutex(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb, -- 2.20.1