diff --git a/src/compose/dump.h b/src/compose/dump.h
new file mode 100644
index 000000000..daeaf62d8
--- /dev/null
+++ b/src/compose/dump.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright © 2013 Pierre Le Marre
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+/* Ad-hoc escaping for UTF-8 string
+ *
+ * Note that it only escapes the strict minimum to get a valid Compose file.
+ * It also escapes hexadecimal digits after a hexadecimal escape. This is not
+ * strictly needed by the current implementation: "\x0abcg" parses as "␊bcg",
+ * but better be cautious than sorry and produce "\x0a\x62\x63g" instead.
+ * In the latter string there is no ambiguity and no need to know the maximum
+ * number of digits supported by the escape sequence.
+ */
+
+#ifndef COMPOSE_DUMP_H
+#define COMPOSE_DUMP_H
+
+#include "config.h"
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "src/utils.h"
+
+/**
+ * Escape a NUL-terminated UTF-8 string for use inside a double-quoted
+ * Compose string literal.
+ *
+ * ASCII bytes up to 0x10, DEL (0x7f) and hexadecimal digits directly
+ * following a hexadecimal escape are written as "\xNN"; '"' and '\' get a
+ * backslash prefix; all other bytes are copied unchanged.
+ *
+ * @param from  NUL-terminated string to escape. Must not be NULL.
+ *
+ * @returns A newly allocated string to be released with free(), or NULL if
+ * the output buffer cannot be allocated.
+ */
+static inline char *
+escape_utf8_string_literal(const char *from)
+{
+    const size_t length = strlen(from);
+    /* Longest escape is converting ASCII character to "\xNN" */
+    if (length > (SIZE_MAX - 1) / 4)
+        return NULL;
+    const size_t max_new_length = 4 * length + 1;
+    char *to = calloc(max_new_length, sizeof(*to));
+    if (!to)
+        return NULL;
+
+    size_t t = 0;
+    bool previous_hex_escape = false;
+    for (size_t f = 0; f < length;) {
+        const unsigned char c = (unsigned char) from[f];
+        if (c < 0x80) {
+            /* ASCII */
+            if (c <= 0x10 || c == 0x7f ||
+                (is_xdigit(from[f]) && previous_hex_escape))
+            {
+                /* Control character or hexadecimal digit following
+                 * a hexadecimal escape */
+                snprintf_safe(&to[t], 5, "\\x%02x", c);
+                t += 4;
+                previous_hex_escape = true;
+            } else if (c == '"' || c == '\\') {
+                /* Quote or backslash */
+                snprintf_safe(&to[t], 3, "\\%c", c);
+                t += 2;
+                previous_hex_escape = false;
+            } else {
+                /* Other characters */
+                to[t++] = from[f];
+                previous_hex_escape = false;
+            }
+            f++;
+            continue;
+        }
+
+        /* Multi-byte sequence: its length is given by the lead byte */
+        size_t nbytes;
+        if (c < 0xe0)
+            nbytes = 2;
+        else if (c < 0xf0)
+            nbytes = 3;
+        else
+            nbytes = 4;
+        /* Malformed input: never read past the terminating NUL */
+        if (nbytes > length - f)
+            nbytes = length - f;
+        memcpy(&to[t], &from[f], nbytes);
+        t += nbytes;
+        f += nbytes;
+        previous_hex_escape = false;
+    }
+    to[t++] = '\0';
+
+    /* Release the unused tail. If shrinking fails, the original block is
+     * still valid, so return it rather than leaking it. */
+    if (t < max_new_length) {
+        char *shrunk = realloc(to, t);
+        if (shrunk)
+            return shrunk;
+    }
+    return to;
+}
+
+#endif
diff --git a/src/compose/parser.c b/src/compose/parser.c
index ac11446a3..55a55eb03 100644
--- a/src/compose/parser.c
+++ b/src/compose/parser.c
@@ -468,6 +468,26 @@ resolve_modifier(const char *name)
     return XKB_MOD_INVALID;
 }
 
+/*
+ * For testing purposes: lex the first token of `string` and, if it is a
+ * string literal, return a strdup() copy of the value produced by the
+ * lexer. Otherwise print the input on stderr and return NULL.
+ */
+char *
+parse_string_literal(struct xkb_context *ctx, const char *string)
+{
+    struct scanner s;
+    union lvalue val;
+    scanner_init(&s, ctx, string, strlen(string), "(unamed)", NULL);
+    switch (lex(&s, &val)) {
+    case TOK_STRING:
+        return strdup(val.string.str);
+    default:
+        fprintf(stderr, "ERROR: %s\n", s.s);
+        return NULL;
+    }
+}
+
 static bool
 parse(struct xkb_compose_table *table, struct scanner *s,
       unsigned include_depth);
diff --git a/src/compose/parser.h b/src/compose/parser.h
index 487f1a9f8..8651ee6c7 100644
--- a/src/compose/parser.h
+++ b/src/compose/parser.h
@@ -27,6 +27,9 @@
 #define MAX_LHS_LEN 10
 #define MAX_INCLUDE_DEPTH 5
 
+char *
+parse_string_literal(struct xkb_context *ctx, const char *string);
+
 bool
 parse_string(struct xkb_compose_table *table,
              const char *string, size_t len,
diff --git a/test/compose.c b/test/compose.c
index 56bd889c0..8b8b59032 100644
--- a/test/compose.c
+++ b/test/compose.c
@@ -26,6 +26,9 @@
 #include "xkbcommon/xkbcommon-compose.h"
 
 #include "test.h"
+#include "src/utf8.h"
+#include "src/compose/parser.h"
+#include "src/compose/dump.h"
 
 static const char *
 compose_status_string(enum xkb_compose_status status)
@@ -769,13 +772,13 @@ test_traverse(struct xkb_context *ctx)
 }
 
 static void
-test_escape_sequences(struct xkb_context *ctx)
+test_decode_escape_sequences(struct xkb_context *ctx)
 {
     /* The following escape sequences should be ignored:
      * • \401 overflows
      * • \0 and \x0 produce NULL
      */
-    const char *table_string = " : \"\\401f\\x0o\\0o\" X\n";
+    const char table_string[] = " : \"\\401f\\x0o\\0o\" X\n";
 
     assert(test_compose_seq_buffer(ctx, table_string,
         XKB_KEY_o, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSING, "", XKB_KEY_NoSymbol,
@@ -783,6 +786,104 @@ test_escape_sequences(struct xkb_context *ctx)
         XKB_KEY_NoSymbol));
 }
 
+/*
+ * Return a random non-zero code point: ASCII only if `ascii` is true,
+ * otherwise drawn from one of the four UTF-8 length classes. The 3-byte
+ * class includes the surrogates, which have no UTF-8 encoding.
+ */
+static uint32_t
+random_non_null_unicode_char(bool ascii)
+{
+    if (ascii)
+        return 0x01 + (rand() % 0x7f);
+    switch (rand() % 5) {
+    case 0:
+        /* U+0080..U+07FF: 2 bytes in UTF-8 */
+        return 0x80 + (rand() % 0x780);
+    case 1:
+        /* U+0800..U+FFFF: 3 bytes in UTF-8 */
+        return 0x800 + (rand() % 0xf800);
+    case 2:
+        /* U+10000..U+10FFFF: 4 bytes in UTF-8 */
+        return 0x10000 + (rand() % 0x100000);
+    default:
+        /* NOTE: Higher probability for ASCII */
+        /* U+0001..U+007F: 1 byte in UTF-8 */
+        return 0x01 + (rand() % 0x7f);
+    }
+}
+
+/* Check that the output of escape_utf8_string_literal() is read back
+ * unchanged by the Compose string literal parser. */
+static void
+test_encode_escape_sequences(struct xkb_context *ctx)
+{
+    char *escaped;
+    escaped = escape_utf8_string_literal("");
+    assert(streq_not_null(escaped, ""));
+    free(escaped);
+
+    /* Test round-trip of random strings */
+#   define SAMPLE_SIZE 1000
+#   define MIN_CODE_POINT 0x0001
+#   define MAX_CODE_POINTS_COUNT 15
+    char buf[1 + MAX_CODE_POINTS_COUNT * 4];
+    for (int ascii = 1; ascii >= 0; ascii--) {
+        for (size_t s = 0; s < SAMPLE_SIZE; s++) {
+            /* Create the string */
+            size_t length = 1 + (rand() % MAX_CODE_POINTS_COUNT);
+            size_t c = 0;
+            for (size_t idx = 0; idx < length; idx++) {
+                uint32_t cp = random_non_null_unicode_char(ascii);
+                int count = utf32_to_utf8(cp, &buf[c]);
+                /* Handle invalid code point in UTF-8 */
+                if (!count)
+                    /* Use replacement character: � */
+                    count = utf32_to_utf8(0xfffd, &buf[c]);
+                c += count - 1;
+                assert(c <= sizeof(buf) - 1);
+            }
+            assert(buf[c] == '\0');
+            assert(strlen(buf) == c);
+            assert(is_valid_utf8(buf, c));
+            /* Escape the string */
+            escaped = escape_utf8_string_literal(buf);
+            if (!escaped)
+                break;
+            assert(is_valid_utf8(escaped, strlen(escaped)));
+            char *string_literal = asprintf_safe("\"%s\"", escaped);
+            if (!string_literal) {
+                free(escaped);
+                break;
+            }
+            /* Unescape the string */
+            char *unescaped = parse_string_literal(ctx, string_literal);
+            assert(streq_not_null(buf, unescaped));
+            free(unescaped);
+            free(string_literal);
+            free(escaped);
+        }
+    }
+#   undef SAMPLE_SIZE
+#   undef MIN_CODE_POINT
+#   undef MAX_CODE_POINTS_COUNT
+
+    const char table_string[] =
+        "<a> : \"\\x0abcg\\\"x\" A\n"
+        "<b> : \"éxyz\" B\n"
+        "<c> : \"€xyz\" C\n"
+        "<d> : \"✨xyz\" D\n"
+        "<e> : \"✨\\x0aé\\x0a€x\\\"\" E\n";
+
+    assert(test_compose_seq_buffer(ctx, table_string,
+        XKB_KEY_a, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "\x0a""bcg\"x", XKB_KEY_A,
+        XKB_KEY_b, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "éxyz", XKB_KEY_B,
+        XKB_KEY_c, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "€xyz", XKB_KEY_C,
+        XKB_KEY_d, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "✨xyz", XKB_KEY_D,
+        XKB_KEY_e, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "✨\x0aé\x0a€x\"", XKB_KEY_E,
+        XKB_KEY_NoSymbol));
+}
+
 int
 main(int argc, char *argv[])
 {
@@ -818,7 +919,8 @@ main(int argc, char *argv[])
     test_include(ctx);
     test_override(ctx);
     test_traverse(ctx);
-    test_escape_sequences(ctx);
+    test_decode_escape_sequences(ctx);
+    test_encode_escape_sequences(ctx);
 
     xkb_context_unref(ctx);
     return 0;
diff --git a/tools/compile-compose.c b/tools/compile-compose.c
index dd20f0a0e..42f3f9e18 100644
--- a/tools/compile-compose.c
+++ b/tools/compile-compose.c
@@ -25,12 +25,12 @@
 
 #include
 #include
-#include
-#include
+#include
 
 #include "xkbcommon/xkbcommon.h"
 #include "xkbcommon/xkbcommon-keysyms.h"
 #include "xkbcommon/xkbcommon-compose.h"
+#include "src/compose/dump.h"
 
 static void
 usage(FILE *fp, char *progname)
@@ -69,10 +69,17 @@ print_compose_table_entry(struct xkb_compose_table_entry *entry)
             printf(" ");
         }
     }
-    printf(":");
+    printf(" : ");
     const char *utf8 = xkb_compose_table_entry_utf8(entry);
     if (*utf8 != '\0') {
-        printf(" \"%s\"", utf8);
+        char *escaped = escape_utf8_string_literal(utf8);
+        if (!escaped) {
+            fprintf(stderr, "ERROR: Cannot escape the string: allocation error\n");
+            printf("\"\"");
+        } else {
+            printf(" \"%s\"", escaped);
+            free(escaped);
+        }
     }
     const xkb_keysym_t keysym = xkb_compose_table_entry_keysym(entry);
     if (keysym != XKB_KEY_NoSymbol) {