From f7ff5159619cda9bcb54e90b6a4ed5db36fc626f Mon Sep 17 00:00:00 2001 From: Ivan Komarov Date: Fri, 23 Feb 2024 16:15:30 +0100 Subject: [PATCH] `*scanf()` fixes to make TeX work (#1109) * Fix reading the same symbol twice when using `{f,}scanf()` PR #924 appears to use `unget()` subtly incorrectly when parsing floating point numbers. The rest of the code only uses `unget()` immediately followed by `goto Done;` to return back the symbol that can't possibly belong to the directive we're processing. With floating-point, however, the ungot characters could very well be valid for the *next* directive, so we will essentially read them twice. It can't be seen in `sscanf()` tests because `unget()` is a no-op there, but the test I added for `fscanf()` fails like this: ... EXPECT_EQ(0xDEAD, i1) need 57005 (or 0xdead) = got 908973 (or 0x000ddead) ... EXPECT_EQ(0xBEEF, i2) need 48879 (or 0xbeef) = got 769775 (or 0x000bbeef) This means we read 0xDDEAD instead of 0xDEAD and 0xBBEEF instead of 0xBEEF. I checked that both musl and glibc read 0xDEAD/0xBEEF, as expected. Fix the failing test by removing the unneeded `unget()` calls. * Don't read invalid floating-point numbers in `*scanf()` Currently, we just ignore any errors from `strtod()`. They can happen either because no valid float can be parsed at all, or because the state machine recognizes only a prefix of a valid floating-point number. Fix this by making sure `strtod()` parses everything we recognized, provided it's non-empty. This requires to pop the last character off the FP buffer, which is supposed to be parsed by the next `*scanf()` directive. * Make `%c` parsing in `*scanf()` respect the C standard Currently, `%c`-style directives always succeed even if there are actually fewer characters in the input than requested. Before the fix, the added test fails like this: ... EXPECT_EQ(2, sscanf("ab", "%c %c %c", &c2, &c3, &c4)) need 2 (or 0x02 or '\2' or ENOENT) = got 3 (or 0x03 or '\3' or ESRCH) ... EXPECT_EQ(0, sscanf("abcd", "%5c", s2)) need 0 (or 0x0 or '\0') = got 1 (or 0x01 or '\1' or EPERM) musl and glibc pass this test. --- libc/stdio/vcscanf.c | 40 +++++++++++++++++++++++++---------- test/libc/stdio/fscanf_test.c | 33 +++++++++++++++++++++++++++++ test/libc/stdio/sscanf_test.c | 38 +++++++++++++++++++++++---------- 3 files changed, 89 insertions(+), 22 deletions(-) create mode 100644 test/libc/stdio/fscanf_test.c diff --git a/libc/stdio/vcscanf.c b/libc/stdio/vcscanf.c index cb890edb490..f58f21e74eb 100644 --- a/libc/stdio/vcscanf.c +++ b/libc/stdio/vcscanf.c @@ -50,6 +50,12 @@ } \ c; \ }) +#define UNBUFFER \ + ({ \ + if (c != -1) { \ + fpbuf[--fpbufcur] = '\0'; \ + } \ + }) /** * String / file / stream decoder. @@ -369,10 +375,11 @@ int __vcscanf(int callback(void *), // } } while ((c = BUFFER) != -1 && c != ')'); if (c == ')') { - c = BUFFER; + c = READ; } goto GotFloatingPointNumber; } else { + UNBUFFER; goto GotFloatingPointNumber; } } else { @@ -410,9 +417,7 @@ int __vcscanf(int callback(void *), // goto Done; } } else { - if (c != -1 && unget) { - unget(c, arg); - } + UNBUFFER; goto GotFloatingPointNumber; } } else { @@ -465,13 +470,24 @@ int __vcscanf(int callback(void *), // Continue: continue; Break: - if (c != -1 && unget) { - unget(c, arg); - } + UNBUFFER; break; } while ((c = BUFFER) != -1); GotFloatingPointNumber: - fp = strtod((char *)fpbuf, NULL); + /* An empty buffer can't be a valid float; don't even bother parsing. */ + bool valid = fpbufcur > 0; + if (valid) { + char *ep; + fp = strtod((char *)fpbuf, &ep); + /* We should have parsed the whole buffer. */ + valid = ep == (char *)fpbuf + fpbufcur; + } + free(fpbuf); + fpbuf = NULL; + fpbufcur = fpbufsize = 0; + if (!valid) { + goto Done; + } if (!discard) { ++items; void *out = va_arg(va, void *); @@ -481,9 +497,6 @@ int __vcscanf(int callback(void *), // *(double *)out = (double)fp; } } - free(fpbuf); - fpbuf = NULL; - fpbufcur = fpbufsize = 0; continue; ReportConsumed: n_ptr = va_arg(va, int *); @@ -537,6 +550,11 @@ int __vcscanf(int callback(void *), // if (!j && c == -1 && !items) { items = -1; goto Done; + } else if (rawmode && j != width) { + /* The C standard says that %c "matches a sequence of characters of + * **exactly** the number specified by the field width". If we have + * fewer characters, what we've just read is invalid. */ + goto Done; } else if (!rawmode && j < bufsize) { if (charbytes == sizeof(char)) { buf[j] = '\0'; diff --git a/test/libc/stdio/fscanf_test.c b/test/libc/stdio/fscanf_test.c new file mode 100644 index 00000000000..accfb4bdbd9 --- /dev/null +++ b/test/libc/stdio/fscanf_test.c @@ -0,0 +1,33 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2024 Ivan Komarov │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/math.h" +#include "libc/stdio/stdio.h" +#include "libc/testlib/testlib.h" + +TEST(fscanf, test_readAfterFloat) { + FILE *f = fmemopen("infDEAD-.125e-2BEEF", 19, "r"); + float f1 = 666.666f, f2 = f1; + int i1 = 666, i2 = i1; + EXPECT_EQ(4, fscanf(f, "%f%x%f%x", &f1, &i1, &f2, &i2)); + EXPECT_TRUE(isinf(f1)); + EXPECT_EQ(0xDEAD, i1); + EXPECT_EQ(-0.125e-2f, f2); + EXPECT_EQ(0xBEEF, i2); + fclose(f); +} diff --git a/test/libc/stdio/sscanf_test.c b/test/libc/stdio/sscanf_test.c index 571617715d5..40e35f1c42d 100644 --- a/test/libc/stdio/sscanf_test.c +++ b/test/libc/stdio/sscanf_test.c @@ -69,9 +69,17 @@ TEST(sscanf, testNonDirectiveCharacterMatching) { } TEST(sscanf, testCharacter) { - char c = 0; - EXPECT_EQ(1, sscanf("a", "%c", &c)); - EXPECT_EQ('a', c); + char c1 = 0, c2 = c1, c3 = c2, c4 = c3; + char s1[32] = {0}, s2[32] = {0}; + EXPECT_EQ(1, sscanf("a", "%c", &c1)); + EXPECT_EQ(2, sscanf("ab", "%c %c %c", &c2, &c3, &c4)); + EXPECT_EQ(1, sscanf("abcde", "%5c", s1)); + EXPECT_EQ(0, sscanf("abcd", "%5c", s2)); + + EXPECT_EQ('a', c1); + EXPECT_EQ('a', c2); + EXPECT_EQ('b', c3); + EXPECT_STREQ("abcde", &s1[0]); } TEST(sscanf, testStringBuffer) { @@ -394,6 +402,20 @@ TEST(sscanf, floating_point_infinity_double_precision) { EXPECT_TRUE(isinf(g)); } +TEST(sscanf, floating_point_invalid) { + float dummy; + EXPECT_EQ(0, sscanf("junk", "%f", &dummy)); + EXPECT_EQ(0, sscanf("e9", "%f", &dummy)); + EXPECT_EQ(0, sscanf("-e9", "%f", &dummy)); +} + +TEST(sscanf, floating_point_invalid_double_precision) { + double dummy; + EXPECT_EQ(0, sscanf("junk", "%lf", &dummy)); + EXPECT_EQ(0, sscanf("e9", "%lf", &dummy)); + EXPECT_EQ(0, sscanf("-e9", "%lf", &dummy)); +} + TEST(sscanf, floating_point_documentation_examples) { float a = 666.666f, b = a, c = b, d = c, e = d, f = e, g = f, h = g, i = h, j = i; @@ -401,7 +423,7 @@ TEST(sscanf, floating_point_documentation_examples) { EXPECT_EQ(2, sscanf("111.11 -2.22", "%f %f", &a, &b)); EXPECT_EQ(3, sscanf("Nan nan(2) inF", "%f %f %f", &c, &d, &e)); EXPECT_EQ( - 5, sscanf("0X1.BC70A3D70A3D7P+6 1.18973e+4932zzz -0.0000000123junk junk", + 2, sscanf("0X1.BC70A3D70A3D7P+6 1.18973e+4932zzz -0.0000000123junk junk", "%f %f %f %f %f", &f, &g, &h, &i, &j)); EXPECT_EQ(111.11f, a); @@ -411,9 +433,6 @@ TEST(sscanf, floating_point_documentation_examples) { EXPECT_TRUE(isinf(e)); EXPECT_EQ(0X1.BC70A3D70A3D7P+6f, f); EXPECT_TRUE(isinf(g)); - EXPECT_EQ(-0.0000000123f, h); - EXPECT_EQ(.0f, i); - EXPECT_EQ(.0f, j); } TEST(sscanf, floating_point_documentation_examples_double_precision) { @@ -423,7 +442,7 @@ TEST(sscanf, floating_point_documentation_examples_double_precision) { EXPECT_EQ(2, sscanf("111.11 -2.22", "%lf %lf", &a, &b)); EXPECT_EQ(3, sscanf("Nan nan(2) inF", "%lf %lf %lf", &c, &d, &e)); EXPECT_EQ( - 5, sscanf("0X1.BC70A3D70A3D7P+6 1.18973e+4932zzz -0.0000000123junk junk", + 2, sscanf("0X1.BC70A3D70A3D7P+6 1.18973e+4932zzz -0.0000000123junk junk", "%lf %lf %lf %lf %lf", &f, &g, &h, &i, &j)); EXPECT_EQ(111.11, a); @@ -433,9 +452,6 @@ TEST(sscanf, floating_point_documentation_examples_double_precision) { EXPECT_TRUE(isinf(e)); EXPECT_EQ(0X1.BC70A3D70A3D7P+6, f); EXPECT_TRUE(isinf(g)); - EXPECT_EQ(-0.0000000123, h); - EXPECT_EQ(.0, i); - EXPECT_EQ(.0, j); } TEST(sscanf, luplus) {