From 01e2b16ec658717d377515d1c7c4d8ecaf4c88d1 Mon Sep 17 00:00:00 2001 From: John Thacker Date: Wed, 12 Oct 2022 23:11:56 -0400 Subject: [PATCH] wsutil/str_util: Add ws_utf8_truncate Add a convenience function to truncate a UTF-8 string to no more than certain length, while ensuring that the string ends with a complete character instead of a partial sequence (by truncating up to 3 additional bytes as necessary.) The common use case is when a valid UTF-8 string is copied into a buffer via snprintf, strlcpy, or strlcat and truncated, to fix up the end of the string and keep the string valid. The buffer holding the string must be large enough, and the string must be valid up to the point of truncation (aside from the possible partial sequence at the end). For speed, the function does not check those conditions. Ping #18412. --- wsutil/str_util.c | 15 +++++++++++++++ wsutil/str_util.h | 23 +++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/wsutil/str_util.c b/wsutil/str_util.c index f41e639bde..0d6e1694b8 100644 --- a/wsutil/str_util.c +++ b/wsutil/str_util.c @@ -1031,6 +1031,21 @@ format_text_chr(wmem_allocator_t *allocator, const char *string, size_t len, cha return wmem_strbuf_finalize(buf); } +char* +ws_utf8_truncate(char *string, size_t len) +{ + char* last_char; + + /* Cut the string at len bytes, which also ensures that it is null terminated */ + string[len] = '\0'; + last_char = g_utf8_find_prev_char(string, string + len); + if (last_char != NULL && g_utf8_get_char_validated(last_char, -1) == (gunichar)-2) { + /* The last UTF-8 character was truncated into a partial sequence; 
terminate the string where that sequence begins so that only complete characters remain. */ + *last_char = '\0'; + } + return string; +} + /* ASCII/EBCDIC conversion tables from * https://web.archive.org/web/20060813174742/http://www.room42.com/store/computer_center/code_tables.shtml */ diff --git a/wsutil/str_util.h b/wsutil/str_util.h index 9b2b1e295e..03bd76498c 100644 --- a/wsutil/str_util.h +++ b/wsutil/str_util.h @@ -288,6 +288,29 @@ WS_DLL_PUBLIC char *format_text_chr(wmem_allocator_t *allocator, const char *string, size_t len, char chr); +/** + * Truncate a UTF-8 string in place so that it is no larger than len bytes, + * ensuring that the string is null terminated and ends with a complete + * character instead of a partial sequence (e.g., possibly truncating up + * to 3 additional bytes if the terminal character is 4 bytes long). + * + * The buffer holding the string must be large enough (at least len + 1 + * including the null terminator), and the first len bytes of the buffer + * must be a valid UTF-8 string, except for possibly ending in a partial + * sequence or not being null terminated. This is a convenience function + * that for speed does not check either of those conditions. + * + * A common use case is when a valid UTF-8 string has been copied into a + * buffer of length len+1 via snprintf, strlcpy, or strlcat and truncated, + * to ensure that the final UTF-8 character is not a partial sequence. + * + * @param string A pointer to the string to truncate (modified in place) + * @param len The maximum length in bytes, excluding the null terminator + * @return The same pointer that was passed in as string + */ +WS_DLL_PUBLIC +char* ws_utf8_truncate(char *string, size_t len); + WS_DLL_PUBLIC void EBCDIC_to_ASCII(guint8 *buf, guint bytes);