From 42f4926e8caa6bb003f7dcdddb48b2b87c3c11a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Koz=C5=82owski?=
Date: Mon, 2 Oct 2023 02:22:05 +0200
Subject: [PATCH] Begin work on a new scanner

---
 .../smithyql/parser/v2/scanner.scala          | 166 +++++++++++++++++
 .../smithyql/parser/v2/ScannerTests.scala     | 171 ++++++++++++++++++
 2 files changed, 337 insertions(+)
 create mode 100644 modules/parser/src/main/scala/playground/smithyql/parser/v2/scanner.scala
 create mode 100644 modules/parser/src/test/scala/playground/smithyql/parser/v2/ScannerTests.scala

diff --git a/modules/parser/src/main/scala/playground/smithyql/parser/v2/scanner.scala b/modules/parser/src/main/scala/playground/smithyql/parser/v2/scanner.scala
new file mode 100644
index 00000000..1cf4799d
--- /dev/null
+++ b/modules/parser/src/main/scala/playground/smithyql/parser/v2/scanner.scala
@@ -0,0 +1,166 @@
+package playground.smithyql.parser.v2.scanner
+
+import cats.kernel.Eq
+import cats.syntax.all.*
+
+case class Token(
+  kind: TokenKind,
+  text: String,
+) {
+  def width: Int = text.length
+}
+
+object Token {
+  implicit val eq: Eq[Token] = Eq.fromUniversalEquals
+}
+
+sealed trait TokenKind extends Product with Serializable {
+
+  def apply(
+    text: String
+  ): Token = Token(this, text)
+
+}
+
+object TokenKind {
+  case object KW_IMPORT extends TokenKind
+  case object DOT extends TokenKind
+  case object COMMA extends TokenKind
+  case object HASH extends TokenKind
+  case object LB extends TokenKind
+  case object RB extends TokenKind
+  case object LBR extends TokenKind
+  case object RBR extends TokenKind
+  case object EQ extends TokenKind
+  case object SPACE extends TokenKind
+  case object NEWLINE extends TokenKind
+  case object IDENT extends TokenKind
+  case object COMMENT extends TokenKind
+  case object Error extends TokenKind
+
+  implicit val eq: Eq[TokenKind] = Eq.fromUniversalEquals
+}
+
+object Scanner {
+
+  /** Entrypoint to scanning text into tokens.
+    *
+    * Always produces an output that can be rendered back to the original text.
+    */
+  def scan(
+    s: String
+  ): List[Token] = {
+    var remaining = s
+    var tokens = List.empty[Token]
+    def add(
+      tok: Token
+    ) = tokens ::= tok
+
+    def readSimple(
+      token: Char,
+      tok: TokenKind,
+    ): PartialFunction[Char, Unit] = { case `token` =>
+      add(tok(token.toString))
+      remaining = remaining.tail
+    }
+
+    def simpleTokens(
+      pairings: (
+        Char,
+        TokenKind,
+      )*
+    ): PartialFunction[Char, Unit] = pairings
+      .map(readSimple.tupled)
+      .reduce(_ orElse _)
+
+    val readOne: PartialFunction[Char, Unit] = simpleTokens(
+      '.' -> TokenKind.DOT,
+      ',' -> TokenKind.COMMA,
+      '#' -> TokenKind.HASH,
+      '[' -> TokenKind.LB,
+      ']' -> TokenKind.RB,
+      '{' -> TokenKind.LBR,
+      '}' -> TokenKind.RBR,
+      '=' -> TokenKind.EQ,
+    ).orElse {
+      case letter if letter.isLetter =>
+        val (letters, rest) = remaining.span(ch => ch.isLetterOrDigit || ch == '_')
+        add(TokenKind.IDENT(letters))
+        remaining = rest
+    }
+
+    // split "whitespace" string into chains of contiguous newlines OR whitespace characters.
+    def whitespaceChains(
+      whitespace: String
+    ): List[Token] = {
+      val isNewline = (ch: Char) => ch == '\n'
+
+      if (whitespace.isEmpty)
+        Nil
+      else if (isNewline(whitespace.head)) {
+        val (nl, rest) = whitespace.span(isNewline)
+        TokenKind.NEWLINE(nl) :: whitespaceChains(rest)
+      } else {
+        val (wsp, rest) = whitespace.span(!isNewline(_))
+        TokenKind.SPACE(wsp) :: whitespaceChains(rest)
+      }
+    }
+
+    def eatWhitespace(
+    ) = {
+      val (wsp, rest) = remaining.span(ch => ch.isWhitespace)
+      if (wsp.isEmpty())
+        false
+      else {
+        whitespaceChains(wsp).foreach(add)
+        remaining = rest
+
+        true
+      }
+    }
+
+    def eatComments(
+    ) =
+      if (!remaining.startsWith("//"))
+        false
+      else {
+        while (remaining.startsWith("//")) {
+          val (comment, rest) = remaining.span(_ != '\n')
+          add(TokenKind.COMMENT(comment))
+          remaining = rest
+        }
+
+        true
+      }
+
+    def eatErrors(
+    ) = {
+      // todo: bug: even if the next character starts a multi-char token, this will consider it an error.
+      // instead, we should rework "readOne" to consume arbitrary constant-length tokens, and also include the possibility that `rest` has comments or whitespace.
+      val (failures, rest) = remaining.span(!readOne.isDefinedAt(_))
+      remaining = rest
+      if (failures.nonEmpty) {
+        add(TokenKind.Error(failures))
+        true
+      } else
+        false
+    }
+
+    while (remaining.nonEmpty) {
+      val last = remaining
+
+      readOne.applyOrElse[Char, Any](
+        remaining.head,
+        (_: Char) =>
+          // nothing matched. Eat whitespace and see if the rest is an error
+          eatWhitespace() || eatComments() || eatErrors(),
+      )
+
+      if (remaining == last)
+        sys.error(s"no progress in the last run! remaining string: $remaining")
+    }
+
+    tokens.reverse
+  }
+
+}
diff --git a/modules/parser/src/test/scala/playground/smithyql/parser/v2/ScannerTests.scala b/modules/parser/src/test/scala/playground/smithyql/parser/v2/ScannerTests.scala
new file mode 100644
index 00000000..a2cb582f
--- /dev/null
+++ b/modules/parser/src/test/scala/playground/smithyql/parser/v2/ScannerTests.scala
@@ -0,0 +1,171 @@
+package playground.smithyql.parser.v2
+
+import cats.effect.IO
+import cats.implicits._
+import org.scalacheck.Arbitrary
+import org.scalacheck.Gen
+import playground.smithyql.parser.v2.scanner.Scanner
+import playground.smithyql.parser.v2.scanner.Token
+import playground.smithyql.parser.v2.scanner.TokenKind
+import weaver._
+import weaver.scalacheck.Checkers
+
+import Scanner.scan
+
+object ScannerTests extends SimpleIOSuite with Checkers {
+
+  def arbTests(
+    name: TestName
+  )(
+    withArb: Arbitrary[String] => IO[Expectations]
+  ): Unit = {
+
+    val sampleStringGen = Gen.oneOf(
+      Gen.alphaStr,
+      Gen.alphaNumStr,
+      Gen.asciiPrintableStr,
+      Gen.identifier,
+      Gen.oneOf(List(' ', '\n', '\t', '\r', '\f', '\b')).map(_.toString),
+    )
+
+    val arbString: Arbitrary[String] = Arbitrary {
+      Gen.listOf(sampleStringGen).map(_.mkString)
+    }
+
+    test(name)(withArb(Arbitrary.arbString))
+    test(name.copy(name = name.name + " (prepared input)"))(withArb(arbString))
+  }
+
+  arbTests("Any string input scans successfully") { implicit arbString =>
+    forall { (s: String) =>
+      scan(s): Unit
+      success
+    }
+  }
+
+  arbTests("Scanning is lossless") { implicit arbString =>
+    forall { (s: String) =>
+      assert.eql(scan(s).foldMap(_.text), s)
+    }
+  }
+
+  private def scanTest(
+    input: String,
+    explicitName: String = "",
+  )(
+    expected: List[Token]
+  ): Unit =
+    pureTest(
+      if (explicitName.nonEmpty)
+        explicitName
+      else
+        "Scan string: " + sanitize(input)
+    ) {
+      assert.eql(expected, scan(input))
+    }
+
+  private def sanitize(
+    text: String
+  ) = text.replace(" ", "·").replace("\n", "↵")
+
+  scanTest("{")(List(TokenKind.LBR("{")))
+  scanTest("}")(List(TokenKind.RBR("}")))
+  scanTest("[")(List(TokenKind.LB("[")))
+  scanTest("]")(List(TokenKind.RB("]")))
+  scanTest(".")(List(TokenKind.DOT(".")))
+  scanTest(",")(List(TokenKind.COMMA(",")))
+  scanTest("#")(List(TokenKind.HASH("#")))
+  scanTest("=")(List(TokenKind.EQ("=")))
+  scanTest("a")(List(TokenKind.IDENT("a")))
+
+  // idents
+  scanTest("abcdef")(List(TokenKind.IDENT("abcdef")))
+
+  scanTest(
+    "hello_world"
+  )(
+    List(
+      TokenKind.IDENT("hello_world")
+    )
+  )
+
+  scanTest(
+    "helloworld123"
+  )(
+    List(
+      TokenKind.IDENT("helloworld123")
+    )
+  )
+
+  // whitespace
+  scanTest(" ")(List(TokenKind.SPACE(" ")))
+  scanTest("\n")(List(TokenKind.NEWLINE("\n")))
+
+  // contiguous whitespace of all kinds
+  // notably newlines are grouped together separately from other whitespace
+  scanTest(" \r \r \n\n")(List(TokenKind.SPACE(" \r \r "), TokenKind.NEWLINE("\n\n")))
+  scanTest(" \n\n \n ")(
+    List(
+      TokenKind.SPACE(" "),
+      TokenKind.NEWLINE("\n\n"),
+      TokenKind.SPACE(" "),
+      TokenKind.NEWLINE("\n"),
+      TokenKind.SPACE(" "),
+    )
+  )
+
+  // comments
+  scanTest("// hello 123 foo bar --")(List(TokenKind.COMMENT("// hello 123 foo bar --")))
+
+  scanTest(
+    explicitName = "Scan multiple line-comments",
+    input =
+      """//hello
+        |//world""".stripMargin,
+  )(
+    List(
+      TokenKind.COMMENT("//hello"),
+      TokenKind.NEWLINE("\n"),
+      TokenKind.COMMENT("//world"),
+    )
+  )
+
+  scanTest(
+    "hello world //this is a comment"
+  )(
+    List(
+      TokenKind.IDENT("hello"),
+      TokenKind.SPACE(" "),
+      TokenKind.IDENT("world"),
+      TokenKind.SPACE(" "),
+      TokenKind.COMMENT("//this is a comment"),
+    )
+  )
+
+  // errors
+
+  scanTest(
+    explicitName = "Error tokens for input that doesn't match any other token",
+    input = "🤷*%$^@-+?",
+  )(List(TokenKind.Error("🤷*%$^@-+?")))
+
+  scanTest(
+    explicitName = "Error tokens mixed between other tokens",
+    input = "hello@world-this?is=an