wsutil/str_util: Add ws_utf8_truncate

Add a convenience function to truncate a UTF-8 string to no more than a certain length, while ensuring that the string ends with a complete character instead of a partial sequence (by truncating up to 3 additional bytes as necessary). The common use case is when a valid UTF-8 string is copied into a buffer via snprintf, strlcpy, or strlcat and truncated, to fix up the end of the string and keep the string valid. The buffer holding the string must be large enough, and the string must be valid up to the point of truncation (aside from the possible partial sequence at the end). For speed, the function does not check those conditions. Ping #18412.
2022-10-12 23:11:56 -04:00 · 2022-10-12 23:11:56 -04:00 · 01e2b16ec6
parent 87441e45d8
commit 01e2b16ec6
2 changed files with 38 additions and 0 deletions
--- a/wsutil/str_util.c
+++ b/wsutil/str_util.c
@ -1031,6 +1031,21 @@ format_text_chr(wmem_allocator_t *allocator, const char *string, size_t len, cha
    return wmem_strbuf_finalize(buf);
 }

+char*
+ws_utf8_truncate(char *string, size_t len)
+{
+    char* last_char;
+
+    /* Force a null terminator at byte offset len (the maximum length). */
+    string[len] = '\0';
+    last_char = g_utf8_find_prev_char(string, string + len);
+    if (last_char != NULL && g_utf8_get_char_validated(last_char, -1) == (gunichar)-2) {
+        /* (gunichar)-2 means a partial sequence: cut the string at its first byte. */
+        *last_char = '\0';
+    }
+    return string;
+}
+
 /* ASCII/EBCDIC conversion tables from
 * https://web.archive.org/web/20060813174742/http://www.room42.com/store/computer_center/code_tables.shtml
 */
--- a/wsutil/str_util.h
+++ b/wsutil/str_util.h
@ -288,6 +288,29 @@ WS_DLL_PUBLIC
 char *format_text_chr(wmem_allocator_t *allocator,
                        const char *string, size_t len, char chr);

+/**
+ * Truncate a UTF-8 string in place so that it is no longer than len bytes,
+ * ensuring that the string is null terminated and ends with a complete
+ * character instead of a partial sequence (e.g., possibly truncating up
+ * to 3 additional bytes if the terminal character is 4 bytes long).
+ *
+ * The buffer holding the string must be large enough (at least len + 1
+ * including the null terminator), and the first len bytes of the buffer
+ * must be a valid UTF-8 string, except for possibly ending in a partial
+ * sequence or not being null terminated. This is a convenience function
+ * that for speed does not check either of those conditions.
+ *
+ * A common use case is when a valid UTF-8 string has been copied into a
+ * buffer of length len+1 via snprintf, strlcpy, or strlcat and truncated,
+ * to ensure that the final UTF-8 character is not a partial sequence.
+ *
+ * @param string A pointer to the input string; it is modified in place
+ * @param len The maximum length, in bytes, to truncate to
+ * @return    The same pointer that was passed in as string
+ */
+WS_DLL_PUBLIC
+char* ws_utf8_truncate(char *string, size_t len);
+
 WS_DLL_PUBLIC
 void EBCDIC_to_ASCII(guint8 *buf, guint bytes);