From 98dee22519936530d5a6c835ae57f0e03ea10d89 Mon Sep 17 00:00:00 2001
From: Pierre Le Marre
Date: Thu, 12 Sep 2024 16:52:40 +0200
Subject: [PATCH] Add UTF-8 to UTF-32 decoding

Add internal functions to convert UTF-8 to UTF-32, with corresponding
tests. Ill-formed input (empty, truncated, overlong, surrogate or
out-of-range sequences) is rejected.
---
 meson.build         |   8 +++-
 src/utf8-decoding.c | 106 ++++++++++++++++++++++++++++++++++++++++++++
 src/utf8-decoding.h |  20 +++++++++
 test/utf8.c         |  43 ++++++++++++++++++
 4 files changed, 176 insertions(+), 1 deletion(-)
 create mode 100644 src/utf8-decoding.c
 create mode 100644 src/utf8-decoding.h

diff --git a/meson.build b/meson.build
index b0ae2e7b..21665434 100644
--- a/meson.build
+++ b/meson.build
@@ -753,7 +753,13 @@ test(
 )
 test(
     'utf8',
-    executable('test-utf8', 'test/utf8.c', dependencies: test_dep),
+    executable(
+        'test-utf8',
+        'test/utf8.c',
+        'src/utf8-decoding.c',
+        'src/utf8-decoding.h',
+        dependencies: test_dep
+    ),
     env: test_env,
 )
 test(
diff --git a/src/utf8-decoding.c b/src/utf8-decoding.c
new file mode 100644
index 00000000..e805ab28
--- /dev/null
+++ b/src/utf8-decoding.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright © 2024 Pierre Le Marre
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "config.h"
+
+#include "utf8-decoding.h"
+
+/* Array mapping the leading byte to the length of a UTF-8 sequence.
+ * A value of zero indicates that the byte can not begin a UTF-8 sequence. */
+static const uint8_t utf8_sequence_length_by_leading_byte[256] = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80-0x8F */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90-0x9F */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0-0xAF */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0-0xBF */
+    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0-0xCF */
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0-0xDF */
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0-0xEF */
+    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xF0-0xFF */
+};
+
+/* Length in bytes of the UTF-8 sequence starting at `s`, derived from its
+ * leading byte only; 0 if that byte cannot start a sequence.
+ * `s` must point to at least one readable byte. */
+uint8_t
+utf8_sequence_length(const char *s)
+{
+    return utf8_sequence_length_by_leading_byte[(unsigned char)s[0]];
+}
+
+/*
+ * Decode the UTF-8 sequence at the start of `s`.
+ *
+ * At most `max_size` bytes of `s` are read. On success, `size_out` is set to
+ * the length in bytes of the sequence and the code point is returned.
+ *
+ * On failure, `size_out` is set to 0 and INVALID_UTF8_CODE_POINT is returned.
+ * This happens if the input is empty or truncated, if the leading byte or a
+ * continuation byte is invalid, or if the sequence encodes an overlong form,
+ * a surrogate or a value above U+10FFFF.
+ */
+uint32_t
+utf8_next_code_point(const char *s, size_t max_size, size_t *size_out)
+{
+    /* Payload bits of the leading byte, indexed by sequence length */
+    static const uint8_t leading_mask[5] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07 };
+    /* Smallest code point requiring a sequence of the given length: lower
+     * values are overlong encodings */
+    static const uint32_t min_code_point[5] = { 0, 0, 0x80, 0x800, 0x10000 };
+    const unsigned char *bytes = (const unsigned char *)s;
+
+    *size_out = 0;
+
+    /* Check the size first, so that an empty input is never read */
+    if (!max_size)
+        return INVALID_UTF8_CODE_POINT;
+
+    const uint8_t len = utf8_sequence_length(s);
+    if (!len || len > max_size)
+        return INVALID_UTF8_CODE_POINT;
+
+    /* Handle leading byte */
+    uint32_t cp = (uint32_t)(bytes[0] & leading_mask[len]);
+
+    /* Process remaining bytes of the UTF-8 sequence */
+    for (size_t k = 1; k < len; k++) {
+        if ((bytes[k] & 0xc0) != 0x80)
+            return INVALID_UTF8_CODE_POINT;
+        cp = (cp << 6) | (bytes[k] & 0x3fu);
+    }
+
+    /* Reject overlong encodings, surrogates and out-of-range values */
+    if (cp < min_code_point[len] || cp > 0x10ffff ||
+        (cp >= 0xd800 && cp <= 0xdfff))
+        return INVALID_UTF8_CODE_POINT;
+
+    *size_out = len;
+    return cp;
+}
diff --git a/src/utf8-decoding.h b/src/utf8-decoding.h
new file mode 100644
index 00000000..b7e208d7
--- /dev/null
+++ b/src/utf8-decoding.h
@@ -0,0 +1,20 @@
+
+#ifndef UTF8_DECODING_H
+#define UTF8_DECODING_H
+
+#include "config.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* Check if a char is the start of a UTF-8 sequence */
+#define is_utf8_start(c) (((c) & 0xc0) != 0x80)
+#define INVALID_UTF8_CODE_POINT UINT32_MAX
+
+uint8_t
+utf8_sequence_length(const char *s);
+
+uint32_t
+utf8_next_code_point(const char *s, size_t max_size, size_t *size_out);
+
+#endif
diff --git a/test/utf8.c b/test/utf8.c
index e90dcdab..ba7e0dbc 100644
--- a/test/utf8.c
+++ b/test/utf8.c
@@ -32,6 +32,7 @@
 #include "src/keysym.h"
 #include "test.h"
 #include "utf8.h"
+#include "utf8-decoding.h"
 #include "utils.h"
 
 #define VALID(lit) assert(is_valid_utf8(lit, sizeof(lit)-1))
@@ -179,6 +180,47 @@ test_utf32_to_utf8(void)
     check_utf32_to_utf8(0xffffffff, 0, "");
 }
 
+/* Check roundtrip UTF-32 → UTF-8 → UTF-32 and rejection of ill-formed input */
+static void
+test_utf8_to_utf32(void)
+{
+    char buffer[XKB_KEYSYM_UTF8_MAX_SIZE];
+    for (uint32_t cp = 0; cp <= 0x10ffff; cp++) {
+        int length = utf32_to_utf8(cp, buffer) - 1;
+        /* Check surrogates */
+        if (cp >= 0xd800 && cp <= 0xdfff) {
+            assert(length == -1);
+        } else {
+            assert(length > 0);
+            size_t length2 = 0;
+            uint32_t cp2 = utf8_next_code_point(buffer, (size_t)length, &length2);
+            assert(cp2 != INVALID_UTF8_CODE_POINT && cp2 == cp &&
+                   length2 == (size_t)length);
+        }
+    }
+
+    /* Ill-formed input: empty, truncated, stray continuation byte, overlong
+     * encodings, surrogate and code point above U+10FFFF */
+    static const struct {
+        const char *bytes;
+        size_t size;
+    } invalid[] = {
+        { "", 0 },
+        { "\xc3\xa9", 1 },
+        { "\x80", 1 },
+        { "\xe0\x80\x80", 3 },
+        { "\xf0\x80\x80\x80", 4 },
+        { "\xed\xa0\x80", 3 },
+        { "\xf4\x90\x80\x80", 4 },
+    };
+    for (size_t k = 0; k < sizeof(invalid) / sizeof(invalid[0]); k++) {
+        size_t length = 1;
+        assert(utf8_next_code_point(invalid[k].bytes, invalid[k].size,
+                                    &length) == INVALID_UTF8_CODE_POINT);
+        assert(length == 0);
+    }
+}
+
 int
 main(void)
 {
@@ -186,6 +228,7 @@ main(void)
 
     test_is_valid_utf8();
     test_utf32_to_utf8();
+    test_utf8_to_utf32();
 
     return 0;
 }