From f7ff5159619cda9bcb54e90b6a4ed5db36fc626f Mon Sep 17 00:00:00 2001
From: Ivan Komarov <Ivan.Komarov@dfyz.info>
Date: Fri, 23 Feb 2024 16:15:30 +0100
Subject: [PATCH] `*scanf()` fixes to make TeX work (#1109)

* Fix reading the same symbol twice when using `{f,}scanf()`

PR #924 appears to use `unget()` subtly incorrectly when parsing
floating point numbers. The rest of the code only uses `unget()`
immediately followed by `goto Done;` to return back the symbol that
can't possibly belong to the directive we're processing.

With floating-point, however, the ungot characters could very well
be valid for the *next* directive, so we will essentially read them
twice. It can't be seen in `sscanf()` tests because `unget()` is a
no-op there, but the test I added for `fscanf()` fails like this:

        ...
        EXPECT_EQ(0xDEAD, i1)
                need 57005 (or 0xdead) =
                 got 908973 (or 0x000ddead)
        ...
        EXPECT_EQ(0xBEEF, i2)
                need 48879 (or 0xbeef) =
                 got 769775 (or 0x000bbeef)

This means we read 0xDDEAD instead of 0xDEAD and 0xBBEEF instead of
0xBEEF. I checked that both musl and glibc read 0xDEAD/0xBEEF, as
expected.

Fix the failing test by removing the unneeded `unget()` calls.

* Don't read invalid floating-point numbers in `*scanf()`

Currently, we just ignore any errors from `strtod()`. They can
happen either because no valid float can be parsed at all, or
because the state machine recognizes only a prefix of a valid
floating-point number.

Fix this by making sure `strtod()` parses everything we recognized,
provided it's non-empty. This requires to pop the last character
off the FP buffer, which is supposed to be parsed by the next
`*scanf()` directive.

* Make `%c` parsing in `*scanf()` respect the C standard

Currently, `%c`-style directives always succeed even if there
are actually fewer characters in the input than requested.

Before the fix, the added test fails like this:

        ...
        EXPECT_EQ(2, sscanf("ab", "%c %c %c", &c2, &c3, &c4))
                need 2 (or 0x02 or '\2' or ENOENT) =
                 got 3 (or 0x03 or '\3' or ESRCH)
        ...
        EXPECT_EQ(0, sscanf("abcd", "%5c", s2))
                need 0 (or 0x0 or '\0') =
                 got 1 (or 0x01 or '\1' or EPERM)

musl and glibc pass this test.
---
 libc/stdio/vcscanf.c          | 40 +++++++++++++++++++++++++----------
 test/libc/stdio/fscanf_test.c | 33 +++++++++++++++++++++++++++++
 test/libc/stdio/sscanf_test.c | 38 +++++++++++++++++++++++----------
 3 files changed, 89 insertions(+), 22 deletions(-)
 create mode 100644 test/libc/stdio/fscanf_test.c

diff --git a/libc/stdio/vcscanf.c b/libc/stdio/vcscanf.c
index cb890edb490..f58f21e74eb 100644
--- a/libc/stdio/vcscanf.c
+++ b/libc/stdio/vcscanf.c
@@ -50,6 +50,12 @@
     }                                         \
     c;                                        \
   })
+#define UNBUFFER                \
+  ({                            \
+    if (c != -1) {              \
+      fpbuf[--fpbufcur] = '\0'; \
+    }                           \
+  })
 
 /**
  * String / file / stream decoder.
@@ -369,10 +375,11 @@ int __vcscanf(int callback(void *),    //
                   }
                 } while ((c = BUFFER) != -1 && c != ')');
                 if (c == ')') {
-                  c = BUFFER;
+                  c = READ;
                 }
                 goto GotFloatingPointNumber;
               } else {
+                UNBUFFER;
                 goto GotFloatingPointNumber;
               }
             } else {
@@ -410,9 +417,7 @@ int __vcscanf(int callback(void *),    //
                   goto Done;
                 }
               } else {
-                if (c != -1 && unget) {
-                  unget(c, arg);
-                }
+                UNBUFFER;
                 goto GotFloatingPointNumber;
               }
             } else {
@@ -465,13 +470,24 @@ int __vcscanf(int callback(void *),    //
         Continue:
           continue;
         Break:
-          if (c != -1 && unget) {
-            unget(c, arg);
-          }
+          UNBUFFER;
           break;
         } while ((c = BUFFER) != -1);
       GotFloatingPointNumber:
-        fp = strtod((char *)fpbuf, NULL);
+        /* An empty buffer can't be a valid float; don't even bother parsing. */
+        bool valid = fpbufcur > 0;
+        if (valid) {
+          char *ep;
+          fp = strtod((char *)fpbuf, &ep);
+          /* We should have parsed the whole buffer. */
+          valid = ep == (char *)fpbuf + fpbufcur;
+        }
+        free(fpbuf);
+        fpbuf = NULL;
+        fpbufcur = fpbufsize = 0;
+        if (!valid) {
+          goto Done;
+        }
         if (!discard) {
           ++items;
           void *out = va_arg(va, void *);
@@ -481,9 +497,6 @@ int __vcscanf(int callback(void *),    //
             *(double *)out = (double)fp;
           }
         }
-        free(fpbuf);
-        fpbuf = NULL;
-        fpbufcur = fpbufsize = 0;
         continue;
       ReportConsumed:
         n_ptr = va_arg(va, int *);
@@ -537,6 +550,11 @@ int __vcscanf(int callback(void *),    //
               if (!j && c == -1 && !items) {
                 items = -1;
                 goto Done;
+              } else if (rawmode && j != width) {
+                /* The C standard says that %c "matches a sequence of characters of
+                 * **exactly** the number specified by the field width". If we have
+                 * fewer characters, what we've just read is invalid. */
+                goto Done;
               } else if (!rawmode && j < bufsize) {
                 if (charbytes == sizeof(char)) {
                   buf[j] = '\0';
diff --git a/test/libc/stdio/fscanf_test.c b/test/libc/stdio/fscanf_test.c
new file mode 100644
index 00000000000..accfb4bdbd9
--- /dev/null
+++ b/test/libc/stdio/fscanf_test.c
@@ -0,0 +1,33 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2024 Ivan Komarov                                                  │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/math.h"
+#include "libc/stdio/stdio.h"
+#include "libc/testlib/testlib.h"
+
+TEST(fscanf, test_readAfterFloat) {
+  FILE *f = fmemopen("infDEAD-.125e-2BEEF", 19, "r");
+  float f1 = 666.666f, f2 = f1;
+  int i1 = 666, i2 = i1;
+  EXPECT_EQ(4, fscanf(f, "%f%x%f%x", &f1, &i1, &f2, &i2));
+  EXPECT_TRUE(isinf(f1));
+  EXPECT_EQ(0xDEAD, i1);
+  EXPECT_EQ(-0.125e-2f, f2);
+  EXPECT_EQ(0xBEEF, i2);
+  fclose(f);
+}
diff --git a/test/libc/stdio/sscanf_test.c b/test/libc/stdio/sscanf_test.c
index 571617715d5..40e35f1c42d 100644
--- a/test/libc/stdio/sscanf_test.c
+++ b/test/libc/stdio/sscanf_test.c
@@ -69,9 +69,17 @@ TEST(sscanf, testNonDirectiveCharacterMatching) {
 }
 
 TEST(sscanf, testCharacter) {
-  char c = 0;
-  EXPECT_EQ(1, sscanf("a", "%c", &c));
-  EXPECT_EQ('a', c);
+  char c1 = 0, c2 = c1, c3 = c2, c4 = c3;
+  char s1[32] = {0}, s2[32] = {0};
+  EXPECT_EQ(1, sscanf("a", "%c", &c1));
+  EXPECT_EQ(2, sscanf("ab", "%c %c %c", &c2, &c3, &c4));
+  EXPECT_EQ(1, sscanf("abcde", "%5c", s1));
+  EXPECT_EQ(0, sscanf("abcd", "%5c", s2));
+
+  EXPECT_EQ('a', c1);
+  EXPECT_EQ('a', c2);
+  EXPECT_EQ('b', c3);
+  EXPECT_STREQ("abcde", &s1[0]);
 }
 
 TEST(sscanf, testStringBuffer) {
@@ -394,6 +402,20 @@ TEST(sscanf, floating_point_infinity_double_precision) {
   EXPECT_TRUE(isinf(g));
 }
 
+TEST(sscanf, floating_point_invalid) {
+  float dummy;
+  EXPECT_EQ(0, sscanf("junk", "%f", &dummy));
+  EXPECT_EQ(0, sscanf("e9", "%f", &dummy));
+  EXPECT_EQ(0, sscanf("-e9", "%f", &dummy));
+}
+
+TEST(sscanf, floating_point_invalid_double_precision) {
+  double dummy;
+  EXPECT_EQ(0, sscanf("junk", "%lf", &dummy));
+  EXPECT_EQ(0, sscanf("e9", "%lf", &dummy));
+  EXPECT_EQ(0, sscanf("-e9", "%lf", &dummy));
+}
+
 TEST(sscanf, floating_point_documentation_examples) {
   float a = 666.666f, b = a, c = b, d = c, e = d, f = e, g = f, h = g, i = h,
         j = i;
@@ -401,7 +423,7 @@ TEST(sscanf, floating_point_documentation_examples) {
   EXPECT_EQ(2, sscanf("111.11 -2.22", "%f %f", &a, &b));
   EXPECT_EQ(3, sscanf("Nan nan(2) inF", "%f %f %f", &c, &d, &e));
   EXPECT_EQ(
-      5, sscanf("0X1.BC70A3D70A3D7P+6 1.18973e+4932zzz -0.0000000123junk junk",
+      2, sscanf("0X1.BC70A3D70A3D7P+6 1.18973e+4932zzz -0.0000000123junk junk",
                 "%f %f %f %f %f", &f, &g, &h, &i, &j));
 
   EXPECT_EQ(111.11f, a);
@@ -411,9 +433,6 @@ TEST(sscanf, floating_point_documentation_examples) {
   EXPECT_TRUE(isinf(e));
   EXPECT_EQ(0X1.BC70A3D70A3D7P+6f, f);
   EXPECT_TRUE(isinf(g));
-  EXPECT_EQ(-0.0000000123f, h);
-  EXPECT_EQ(.0f, i);
-  EXPECT_EQ(.0f, j);
 }
 
 TEST(sscanf, floating_point_documentation_examples_double_precision) {
@@ -423,7 +442,7 @@ TEST(sscanf, floating_point_documentation_examples_double_precision) {
   EXPECT_EQ(2, sscanf("111.11 -2.22", "%lf %lf", &a, &b));
   EXPECT_EQ(3, sscanf("Nan nan(2) inF", "%lf %lf %lf", &c, &d, &e));
   EXPECT_EQ(
-      5, sscanf("0X1.BC70A3D70A3D7P+6 1.18973e+4932zzz -0.0000000123junk junk",
+      2, sscanf("0X1.BC70A3D70A3D7P+6 1.18973e+4932zzz -0.0000000123junk junk",
                 "%lf %lf %lf %lf %lf", &f, &g, &h, &i, &j));
 
   EXPECT_EQ(111.11, a);
@@ -433,9 +452,6 @@ TEST(sscanf, floating_point_documentation_examples_double_precision) {
   EXPECT_TRUE(isinf(e));
   EXPECT_EQ(0X1.BC70A3D70A3D7P+6, f);
   EXPECT_TRUE(isinf(g));
-  EXPECT_EQ(-0.0000000123, h);
-  EXPECT_EQ(.0, i);
-  EXPECT_EQ(.0, j);
 }
 
 TEST(sscanf, luplus) {