From f8b906b94c6fccbf8e347295982c2becdc2d890c Mon Sep 17 00:00:00 2001
From: Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
Date: Wed, 16 Jun 2021 17:35:19 +1200
Subject: [PATCH 1/3] torture: talloc_string_sub tests for utf-8 brevity

If we allow overly long UTF-8 sequences (in the tests, encoding '\0'
as 2, 3, or 4 bytes), it might be possible for bad strings to slip
through.

We fail. But wait for the next commit.

BUG: https://bugzilla.samba.org/show_bug.cgi?id=14684

Signed-off-by: Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
---
 lib/util/tests/str.c          | 58 +++++++++++++++++++++++++++++++++++
 selftest/knownfail.d/str-utf8 |  1 +
 2 files changed, 59 insertions(+)
 create mode 100644 selftest/knownfail.d/str-utf8

diff --git a/lib/util/tests/str.c b/lib/util/tests/str.c
index 93bf809f385..41a28366cf4 100644
--- a/lib/util/tests/str.c
+++ b/lib/util/tests/str.c
@@ -91,6 +91,52 @@ static bool test_talloc_string_sub_multiple(struct torture_context *tctx)
 	return true;
 }
 
+/*
+ * with these next three tests, the failure is that the pattern looks like
+ * "+++" because the \x.. bytes encode a zero byte in UTF-8. If we are not
+ * careful with these strings we will see crashes instead of failures.
+ */
+
+static bool test_talloc_string_sub_tricky_utf8_4(struct torture_context *tctx)
+{
+	const char string[] =  "++++--\xD8\xBB";
+	const char pattern[] = "+++\xF0\x80\x80\x80++";
+	const char replace[] = "...";
+
+	char *t = talloc_string_sub(tctx, string, pattern, replace);
+	torture_assert_str_equal(tctx, t, string,
+				 "should reject 4 byte NUL char");
+	talloc_free(t);
+	return true;
+}
+
+static bool test_talloc_string_sub_tricky_utf8_3(struct torture_context *tctx)
+{
+	const char string[] =  "++++--\xD8\xBB";
+	const char pattern[] = "+++\xE0\x80\x80++";
+	const char replace[] = "...";
+
+	char *t = talloc_string_sub(tctx, string, pattern, replace);
+	torture_assert_str_equal(tctx, t, string,
+				 "should reject 3 byte NUL char");
+	talloc_free(t);
+	return true;
+}
+
+static bool test_talloc_string_sub_tricky_utf8_2(struct torture_context *tctx)
+{
+	const char string[] =  "++++--\xD8\xBB";
+	const char pattern[] = "+++\xC0\x80++";
+	const char replace[] = "...";
+
+	char *t = talloc_string_sub(tctx, string, pattern, replace);
+	torture_assert_str_equal(tctx, t, string,
+				 "should reject 2 byte NUL char");
+	talloc_free(t);
+	return true;
+}
+
+
 
 
 struct torture_suite *torture_local_util_str(TALLOC_CTX *mem_ctx)
@@ -118,5 +164,17 @@ struct torture_suite *torture_local_util_str(TALLOC_CTX *mem_ctx)
 	torture_suite_add_simple_test(suite, "string_sub_talloc_multiple", 
 				      test_talloc_string_sub_multiple);
 
+	torture_suite_add_simple_test(suite,
+				      "test_talloc_string_sub_tricky_utf8_4",
+				      test_talloc_string_sub_tricky_utf8_4);
+
+	torture_suite_add_simple_test(suite,
+				      "test_talloc_string_sub_tricky_utf8_3",
+				      test_talloc_string_sub_tricky_utf8_3);
+
+	torture_suite_add_simple_test(suite,
+				      "test_talloc_string_sub_tricky_utf8_2",
+				      test_talloc_string_sub_tricky_utf8_2);
+
 	return suite;
 }
diff --git a/selftest/knownfail.d/str-utf8 b/selftest/knownfail.d/str-utf8
new file mode 100644
index 00000000000..b003ea8b097
--- /dev/null
+++ b/selftest/knownfail.d/str-utf8
@@ -0,0 +1 @@
+^samba4.local.str.+utf8_[234]
-- 
2.25.1


From 0ec7fd9dccb51ac680a25d3c0dbde11c5ce1a297 Mon Sep 17 00:00:00 2001
From: Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
Date: Thu, 8 Apr 2021 21:18:46 +1200
Subject: [PATCH 2/3] util/iconv: reject improperly packed UTF-8

If we allow a string that encodes say '\0' as a multi-byte sequence,
we are open to confusion where we mix NUL terminated strings with
sized data blobs, which is to say EVERYWHERE.

BUG: https://bugzilla.samba.org/show_bug.cgi?id=14684

Signed-off-by: Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
---
 lib/util/charset/iconv.c      | 32 +++++++++++++++++++++-----------
 selftest/knownfail.d/str-utf8 |  1 -
 2 files changed, 21 insertions(+), 12 deletions(-)
 delete mode 100644 selftest/knownfail.d/str-utf8

diff --git a/lib/util/charset/iconv.c b/lib/util/charset/iconv.c
index 1f2d49c0e27..43b3306b0de 100644
--- a/lib/util/charset/iconv.c
+++ b/lib/util/charset/iconv.c
@@ -832,6 +832,11 @@ static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 			}
 			uc[1] = (c[0]>>2) & 0x7;
 			uc[0] = (c[0]<<6) | (c[1]&0x3f);
+			if (uc[1] == 0 && uc[0] < 0x80) {
+				/* this should have been a single byte */
+				errno = EILSEQ;
+				goto error;
+			}
 			c  += 2;
 			in_left  -= 2;
 			out_left -= 2;
@@ -840,14 +845,24 @@ static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 		}
 
 		if ((c[0] & 0xf0) == 0xe0) {
+			unsigned int codepoint;
 			if (in_left < 3 ||
 			    (c[1] & 0xc0) != 0x80 ||
 			    (c[2] & 0xc0) != 0x80) {
 				errno = EILSEQ;
 				goto error;
 			}
-			uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
-			uc[0] = (c[1]<<6) | (c[2]&0x3f);
+			codepoint = ((c[2] & 0x3f)        |
+				     ((c[1] & 0x3f) << 6) |
+				     ((c[0] & 0x0f) << 12));
+
+			if (codepoint < 0x800) {
+				/* this should be a 1 or 2 byte sequence */
+				errno = EILSEQ;
+				goto error;
+			}
+			uc[0] = codepoint & 0xff;
+			uc[1] = codepoint >> 8;
 			c  += 3;
 			in_left  -= 3;
 			out_left -= 2;
@@ -870,15 +885,10 @@ static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 				((c[1]&0x3f)<<12) |
 				((c[0]&0x7)<<18);
 			if (codepoint < 0x10000) {
-				/* accept UTF-8 characters that are not
-				   minimally packed, but pack the result */
-				uc[0] = (codepoint & 0xFF);
-				uc[1] = (codepoint >> 8);
-				c += 4;
-				in_left -= 4;
-				out_left -= 2;
-				uc += 2;
-				continue;
+				/* reject UTF-8 characters that are not
+				   minimally packed */
+				errno = EILSEQ;
+				goto error;
 			}
 
 			codepoint -= 0x10000;
diff --git a/selftest/knownfail.d/str-utf8 b/selftest/knownfail.d/str-utf8
deleted file mode 100644
index b003ea8b097..00000000000
--- a/selftest/knownfail.d/str-utf8
+++ /dev/null
@@ -1 +0,0 @@
-^samba4.local.str.+utf8_[234]
-- 
2.25.1


From cdabd81c232b0cb26342b2f3a67ade3b6bb23c73 Mon Sep 17 00:00:00 2001
From: Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
Date: Thu, 8 Apr 2021 21:20:17 +1200
Subject: [PATCH 3/3] util/charset: warn loudly on unexpected E2BIG

Signed-off-by: Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
---
 lib/util/charset/convert_string.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/util/charset/convert_string.c b/lib/util/charset/convert_string.c
index 96a3af68d27..88b128be547 100644
--- a/lib/util/charset/convert_string.c
+++ b/lib/util/charset/convert_string.c
@@ -435,8 +435,8 @@ bool convert_string_talloc_handle(TALLOC_CTX *ctx, struct smb_iconv_handle *ic,
 				break;
 			case E2BIG:
 				reason = "output buffer is too small";
-				DBG_NOTICE("Conversion error: %s\n",
-					   reason);
+				DBG_ERR("Conversion error: %s\n",
+					reason);
 				break;
 			case EILSEQ:
 				reason="Illegal multibyte sequence";
-- 
2.25.1