From adef3743dc3aa4b266815b54dcf7a96a3e4f0297 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Sun, 29 Oct 2023 07:20:29 +0100 Subject: [PATCH 1/3] Compose: skip heading UTF-8 encoded BOM (U+FEFF) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Leading BOM is legal and is used as a signature — an indication that an otherwise unmarked text file is in UTF-8. See: https://www.unicode.org/faq/utf_bom.html#bom5 for further details. --- src/compose/parser.c | 4 ++++ test/compose.c | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/src/compose/parser.c b/src/compose/parser.c index 5545a33f8..6740f21b2 100644 --- a/src/compose/parser.c +++ b/src/compose/parser.c @@ -534,6 +534,10 @@ parse(struct xkb_compose_table *table, struct scanner *s, production.mods = 0; production.modmask = 0; + /* Skip UTF-8 encoded BOM (U+FEFF) */ + /* See: https://www.unicode.org/faq/utf_bom.html#bom5 */ + scanner_str(s, "\xef\xbb\xbf", 3); + /* fallthrough */ initial_eol: diff --git a/test/compose.c b/test/compose.c index 8c633d704..d7192f67c 100644 --- a/test/compose.c +++ b/test/compose.c @@ -172,6 +172,16 @@ test_compose_seq_buffer(struct xkb_context *ctx, const char *buffer, ...) 
return ok; } +static void +test_compose_utf8_bom(struct xkb_context *ctx) +{ + const char *buffer = "\xef\xbb\xbf<A> : X"; + assert(test_compose_seq_buffer(ctx, buffer, + XKB_KEY_A, XKB_COMPOSE_FEED_ACCEPTED, XKB_COMPOSE_COMPOSED, "X", XKB_KEY_X, + XKB_KEY_NoSymbol)); +} + + static void test_seqs(struct xkb_context *ctx) { @@ -723,6 +733,7 @@ main(int argc, char *argv[]) unsetenv("XLOCALEDIR"); #endif + test_compose_utf8_bom(ctx); test_seqs(ctx); test_conflicting(ctx); test_XCOMPOSEFILE(ctx); From 8d6681759e5f063f849dcfb8a7dcd497c688d1e0 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Sun, 29 Oct 2023 07:31:34 +0100 Subject: [PATCH 2/3] xkbcomp: skip heading UTF-8 encoded BOM (U+FEFF) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Leading BOM is legal and is used as a signature — an indication that an otherwise unmarked text file is in UTF-8. See: https://www.unicode.org/faq/utf_bom.html#bom5 for further details. --- src/xkbcomp/scanner.c | 5 +++++ test/buffercomp.c | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/src/xkbcomp/scanner.c b/src/xkbcomp/scanner.c index 57babbb20..8bff375df 100644 --- a/src/xkbcomp/scanner.c +++ b/src/xkbcomp/scanner.c @@ -199,6 +199,11 @@ XkbParseString(struct xkb_context *ctx, const char *string, size_t len, { struct scanner scanner; scanner_init(&scanner, ctx, string, len, file_name, NULL); + + /* Skip UTF-8 encoded BOM (U+FEFF) */ + /* See: https://www.unicode.org/faq/utf_bom.html#bom5 */ + scanner_str(&scanner, "\xef\xbb\xbf", 3); + return parse(ctx, &scanner, map); } diff --git a/test/buffercomp.c b/test/buffercomp.c index 9a7603654..b9b5e9d8d 100644 --- a/test/buffercomp.c +++ b/test/buffercomp.c @@ -78,6 +78,18 @@ main(int argc, char *argv[]) keymap = test_compile_buffer(ctx, "", 0); assert(!keymap); + /* Accept UTF-8 encoded BOM (U+FEFF) */ + const char *bom = + "\xef\xbb\xbfxkb_keymap {" + " xkb_keycodes { include \"evdev\" };" + " xkb_types { include \"complete\" 
};" + " xkb_compat { include \"complete\" };" + " xkb_symbols { include \"pc\" };" + "};"; + keymap = test_compile_buffer(ctx, bom, strlen(bom)); + assert(keymap); + xkb_keymap_unref(keymap); + /* Make sure we can recompile our output for a normal keymap from rules. */ keymap = test_compile_rules(ctx, NULL, NULL, "ru,ca,de,us", ",multix,neo,intl", NULL); From a1cc8062362f2d7b06f257c9f1feba768d52fe1a Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Sun, 29 Oct 2023 07:44:39 +0100 Subject: [PATCH 3/3] rules: skip heading UTF-8 encoded BOM (U+FEFF) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Leading BOM is legal and is used as a signature — an indication that an otherwise unmarked text file is in UTF-8. See: https://www.unicode.org/faq/utf_bom.html#bom5 for further details. --- src/xkbcomp/rules.c | 4 ++++ test/data/rules/utf-8_with_bom | 22 ++++++++++++++++++++++ test/rules-file.c | 12 ++++++++++++ 3 files changed, 38 insertions(+) create mode 100644 test/data/rules/utf-8_with_bom diff --git a/src/xkbcomp/rules.c b/src/xkbcomp/rules.c index f5d9c4960..daa4f3dec 100644 --- a/src/xkbcomp/rules.c +++ b/src/xkbcomp/rules.c @@ -1099,6 +1099,10 @@ read_rules_file(struct xkb_context *ctx, scanner_init(&scanner, matcher->ctx, string, size, path, NULL); + /* Skip UTF-8 encoded BOM (U+FEFF) */ + /* See: https://www.unicode.org/faq/utf_bom.html#bom5 */ + scanner_str(&scanner, "\xef\xbb\xbf", 3); + ret = matcher_match(matcher, &scanner, include_depth, string, size, path); unmap_file(string, size); diff --git a/test/data/rules/utf-8_with_bom b/test/data/rules/utf-8_with_bom new file mode 100644 index 000000000..a3c3a73ab --- /dev/null +++ b/test/data/rules/utf-8_with_bom @@ -0,0 +1,22 @@ +// NOTE: this file is encoded in UTF-8 with a leading BOM (U+FEFF) +! model = keycodes + my_model = my_keycodes + * = default_keycodes + +! layout variant = symbols + my_layout my_variant = my_symbols+extra_variant + +! 
layout = symbols + my_layout = my_symbols + * = default_symbols + +! model = types + my_model = my_types + * = default_types + +! model = compat + my_model = my_compat + * = default_compat + +! option = compat + my_option = |some:compat diff --git a/test/rules-file.c b/test/rules-file.c index d217ba960..302aa6878 100644 --- a/test/rules-file.c +++ b/test/rules-file.c @@ -94,6 +94,18 @@ main(int argc, char *argv[]) ctx = test_get_context(0); assert(ctx); + struct test_data test_utf_8_with_bom = { + .rules = "utf-8_with_bom", + + .model = "my_model", .layout = "my_layout", .variant = "my_variant", + .options = "my_option", + + .keycodes = "my_keycodes", .types = "my_types", + .compat = "my_compat|some:compat", + .symbols = "my_symbols+extra_variant", + }; + assert(test_rules(ctx, &test_utf_8_with_bom)); + struct test_data test1 = { .rules = "simple",