From f8b906b94c6fccbf8e347295982c2becdc2d890c Mon Sep 17 00:00:00 2001 From: Douglas Bagnall Date: Wed, 16 Jun 2021 17:35:19 +1200 Subject: [PATCH 1/3] torture: talloc_string_sub tests for utf-8 brevity If we allow overly long UTF-8 sequences (in the tests, encoding '\0' as 2, 3, or 4 bytes), it might be possible for bad strings to slip through. We fail. But wait for the next commit. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14684 Signed-off-by: Douglas Bagnall --- lib/util/tests/str.c | 58 +++++++++++++++++++++++++++++++++++ selftest/knownfail.d/str-utf8 | 1 + 2 files changed, 59 insertions(+) create mode 100644 selftest/knownfail.d/str-utf8 diff --git a/lib/util/tests/str.c b/lib/util/tests/str.c index 93bf809f385..41a28366cf4 100644 --- a/lib/util/tests/str.c +++ b/lib/util/tests/str.c @@ -91,6 +91,52 @@ static bool test_talloc_string_sub_multiple(struct torture_context *tctx) return true; } +/* + * with these next three tests, the failure is that the pattern looks like + * "+++" because the \x.. bytes encode a zero byte in UTF-8. If we are not + * careful with these strings we will see crashes instead of failures. + */ + +static bool test_talloc_string_sub_tricky_utf8_4(struct torture_context *tctx) +{ + const char string[] = "++++--\xD8\xBB"; + const char pattern[] = "+++\xF0\x80\x80\x80++"; + const char replace[] = "..."; + + char *t = talloc_string_sub(tctx, string, pattern, replace); + torture_assert_str_equal(tctx, t, string, + "should reject 4 byte NUL char"); + talloc_free(t); + return true; +} + +static bool test_talloc_string_sub_tricky_utf8_3(struct torture_context *tctx) +{ + const char string[] = "++++--\xD8\xBB"; + const char pattern[] = "+++\xE0\x80\x80++"; + const char replace[] = "..."; + + char *t = talloc_string_sub(tctx, string, pattern, replace); + torture_assert_str_equal(tctx, t, string, + "should reject 3 byte NUL char"); + talloc_free(t); + return true; +} + +static bool test_talloc_string_sub_tricky_utf8_2(struct torture_context *tctx) +{ + const char string[] = "++++--\xD8\xBB"; + const char pattern[] = "+++\xC0\x80++"; + const char replace[] = "..."; + + char *t = talloc_string_sub(tctx, string, pattern, replace); + torture_assert_str_equal(tctx, t, string, + "should reject 2 byte NUL char"); + talloc_free(t); + return true; +} + + struct torture_suite *torture_local_util_str(TALLOC_CTX *mem_ctx) @@ -118,5 +164,17 @@ struct torture_suite *torture_local_util_str(TALLOC_CTX *mem_ctx) torture_suite_add_simple_test(suite, "string_sub_talloc_multiple", test_talloc_string_sub_multiple); + torture_suite_add_simple_test(suite, + "test_talloc_string_sub_tricky_utf8_4", + test_talloc_string_sub_tricky_utf8_4); + + torture_suite_add_simple_test(suite, + "test_talloc_string_sub_tricky_utf8_3", + test_talloc_string_sub_tricky_utf8_3); + + torture_suite_add_simple_test(suite, + "test_talloc_string_sub_tricky_utf8_2", + test_talloc_string_sub_tricky_utf8_2); + return suite; } diff --git a/selftest/knownfail.d/str-utf8 b/selftest/knownfail.d/str-utf8 new file mode 100644 index 00000000000..b003ea8b097 --- /dev/null +++ b/selftest/knownfail.d/str-utf8 @@ -0,0 +1 @@ +^samba4.local.str.+utf8_[234] -- 2.25.1 From 0ec7fd9dccb51ac680a25d3c0dbde11c5ce1a297 Mon Sep 17 00:00:00 2001 From: Douglas Bagnall Date: Thu, 8 Apr 2021 21:18:46 +1200 Subject: [PATCH 2/3] util/iconv: reject improperly packed UTF-8 If we allow a string that encodes say '\0' as a multi-byte sequence, we are open to confusion where we mix NUL terminated strings with sized data blobs, which is to say EVERYWHERE. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14684 Signed-off-by: Douglas Bagnall --- lib/util/charset/iconv.c | 32 +++++++++++++++++++++----------- selftest/knownfail.d/str-utf8 | 1 - 2 files changed, 21 insertions(+), 12 deletions(-) delete mode 100644 selftest/knownfail.d/str-utf8 diff --git a/lib/util/charset/iconv.c b/lib/util/charset/iconv.c index 1f2d49c0e27..43b3306b0de 100644 --- a/lib/util/charset/iconv.c +++ b/lib/util/charset/iconv.c @@ -832,6 +832,11 @@ static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft, } uc[1] = (c[0]>>2) & 0x7; uc[0] = (c[0]<<6) | (c[1]&0x3f); + if (uc[1] == 0 && uc[0] < 0x80) { + /* this should have been a single byte */ + errno = EILSEQ; + goto error; + } c += 2; in_left -= 2; out_left -= 2; @@ -840,14 +845,24 @@ static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft, } if ((c[0] & 0xf0) == 0xe0) { + unsigned int codepoint; if (in_left < 3 || (c[1] & 0xc0) != 0x80 || (c[2] & 0xc0) != 0x80) { errno = EILSEQ; goto error; } - uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF); - uc[0] = (c[1]<<6) | (c[2]&0x3f); + codepoint = ((c[2] & 0x3f) | + ((c[1] & 0x3f) << 6) | + ((c[0] & 0x0f) << 12)); + + if (codepoint < 0x800) { + /* this should be a 1 or 2 byte sequence */ + errno = EILSEQ; + goto error; + } + uc[0] = codepoint & 0xff; + uc[1] = codepoint >> 8; c += 3; in_left -= 3; out_left -= 2; @@ -870,15 +885,10 @@ static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft, ((c[1]&0x3f)<<12) | ((c[0]&0x7)<<18); if (codepoint < 0x10000) { - /* accept UTF-8 characters that are not - minimally packed, but pack the result */ - uc[0] = (codepoint & 0xFF); - uc[1] = (codepoint >> 8); - c += 4; - in_left -= 4; - out_left -= 2; - uc += 2; - continue; + /* reject UTF-8 characters that are not + minimally packed */ + errno = EILSEQ; + goto error; } codepoint -= 0x10000; diff --git a/selftest/knownfail.d/str-utf8 b/selftest/knownfail.d/str-utf8 deleted file mode 100644 index b003ea8b097..00000000000 --- a/selftest/knownfail.d/str-utf8 +++ /dev/null @@ -1 +0,0 @@ -^samba4.local.str.+utf8_[234] -- 2.25.1 From cdabd81c232b0cb26342b2f3a67ade3b6bb23c73 Mon Sep 17 00:00:00 2001 From: Douglas Bagnall Date: Thu, 8 Apr 2021 21:20:17 +1200 Subject: [PATCH 3/3] util/charset: warn loudly on unexpected E2BIG Signed-off-by: Douglas Bagnall --- lib/util/charset/convert_string.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/util/charset/convert_string.c b/lib/util/charset/convert_string.c index 96a3af68d27..88b128be547 100644 --- a/lib/util/charset/convert_string.c +++ b/lib/util/charset/convert_string.c @@ -435,8 +435,8 @@ bool convert_string_talloc_handle(TALLOC_CTX *ctx, struct smb_iconv_handle *ic, break; case E2BIG: reason = "output buffer is too small"; - DBG_NOTICE("Conversion error: %s\n", - reason); + DBG_ERR("Conversion error: %s\n", + reason); break; case EILSEQ: reason="Illegal multibyte sequence"; -- 2.25.1