Skip to content

Commit

Permalink
Begin work on a new scanner
Browse files Browse the repository at this point in the history
  • Loading branch information
kubukoz committed Oct 2, 2023
1 parent ef8880e commit 42f4926
Show file tree
Hide file tree
Showing 2 changed files with 337 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
package playground.smithyql.parser.v2.scanner

import cats.kernel.Eq
import cats.syntax.all.*

/** A single lexeme produced by the scanner: its kind plus the exact source text
  * it was read from. Keeping the raw text makes scanning lossless — concatenating
  * the `text` of all tokens reproduces the input.
  */
case class Token(
  kind: TokenKind,
  text: String,
) {

  /** Number of characters this token spans in the source. */
  def width: Int = text.size

}

object Token {
// Token is pure immutable data (a case class), so universal equality is sound here.
implicit val eq: Eq[Token] = Eq.fromUniversalEquals
}

/** The category of a [[Token]].
  *
  * Also acts as a token factory: `kind(text)` builds a [[Token]] of this kind
  * carrying the given source text.
  */
sealed trait TokenKind extends Product with Serializable {

  /** Wraps `text` in a [[Token]] of this kind. */
  def apply(
    text: String
  ): Token = Token(kind = this, text = text)

}

object TokenKind {
// NOTE(review): `Error` breaks the SCREAMING_CASE convention of its siblings —
// consider renaming (e.g. ERROR) before external callers depend on it.

// "import" keyword — not yet emitted by the scanner visible in this file; TODO confirm intended use
case object KW_IMPORT extends TokenKind
// '.'
case object DOT extends TokenKind
// ','
case object COMMA extends TokenKind
// '#'
case object HASH extends TokenKind
// '[' (left bracket)
case object LB extends TokenKind
// ']' (right bracket)
case object RB extends TokenKind
// '{' (left brace)
case object LBR extends TokenKind
// '}' (right brace)
case object RBR extends TokenKind
// '='
case object EQ extends TokenKind
// a contiguous run of non-newline whitespace
case object SPACE extends TokenKind
// a contiguous run of '\n' characters (kept separate from SPACE)
case object NEWLINE extends TokenKind
// letter followed by letters, digits or underscores
case object IDENT extends TokenKind
// a line comment starting with "//", up to (not including) the newline
case object COMMENT extends TokenKind
// input that matched no other token; preserved verbatim so scanning stays lossless
case object Error extends TokenKind

// all kinds are case objects, so universal equality is sound
implicit val eq: Eq[TokenKind] = Eq.fromUniversalEquals
}

object Scanner {

  /** Entrypoint to scanning text into tokens.
    *
    * Always produces an output that can be rendered back to the original text:
    * characters that match no token are emitted as Error tokens rather than
    * dropped, and all whitespace/comments are kept.
    */
  def scan(
    s: String
  ): List[Token] = {
    var remaining = s
    // accumulated in reverse (List prepend is O(1)); reversed once at the end
    var tokens = List.empty[Token]

    def add(
      tok: Token
    ) = tokens ::= tok

    // Matches exactly one constant single-character token and consumes it.
    def readSimple(
      token: Char,
      tok: TokenKind,
    ): PartialFunction[Char, Unit] = { case `token` =>
      add(tok(token.toString))
      remaining = remaining.tail
    }

    def simpleTokens(
      pairings: (
        Char,
        TokenKind,
      )*
    ): PartialFunction[Char, Unit] = pairings
      .map(readSimple.tupled)
      .reduce(_ orElse _)

    // Reads a single-char constant token, or an identifier (letter, then
    // letters/digits/underscores).
    val readOne: PartialFunction[Char, Unit] = simpleTokens(
      '.' -> TokenKind.DOT,
      ',' -> TokenKind.COMMA,
      '#' -> TokenKind.HASH,
      '[' -> TokenKind.LB,
      ']' -> TokenKind.RB,
      '{' -> TokenKind.LBR,
      '}' -> TokenKind.RBR,
      '=' -> TokenKind.EQ,
    ).orElse {
      case letter if letter.isLetter =>
        val (letters, rest) = remaining.span(ch => ch.isLetterOrDigit || ch == '_')
        add(TokenKind.IDENT(letters))
        remaining = rest
    }

    // split "whitespace" string into chains of contiguous newlines OR whitespace characters.
    def whitespaceChains(
      whitespace: String
    ): List[Token] = {
      val isNewline = (ch: Char) => ch == '\n'

      if (whitespace.isEmpty)
        Nil
      else if (isNewline(whitespace.head)) {
        val (nl, rest) = whitespace.span(isNewline)
        TokenKind.NEWLINE(nl) :: whitespaceChains(rest)
      } else {
        val (wsp, rest) = whitespace.span(!isNewline(_))
        TokenKind.SPACE(wsp) :: whitespaceChains(rest)
      }
    }

    // Consumes leading whitespace (if any) into SPACE/NEWLINE chains.
    // Returns whether any progress was made.
    def eatWhitespace(
    ) = {
      val (wsp, rest) = remaining.span(ch => ch.isWhitespace)
      if (wsp.isEmpty)
        false
      else {
        whitespaceChains(wsp).foreach(add)
        remaining = rest

        true
      }
    }

    // Consumes consecutive "//" line comments. The terminating newline is NOT
    // part of the comment token; it's picked up by eatWhitespace on a later pass.
    def eatComments(
    ) =
      if (!remaining.startsWith("//"))
        false
      else {
        while (remaining.startsWith("//")) {
          val (comment, rest) = remaining.span(_ != '\n')
          add(TokenKind.COMMENT(comment))
          remaining = rest
        }

        true
      }

    // Consumes unrecognized input into a single Error token.
    // The span stops as soon as something valid can start again: a `readOne`
    // token, whitespace, or a "//" comment. (Previously only `readOne` chars
    // ended the span, so an Error token would swallow trailing whitespace and
    // comment openers — the bug noted in the original todo.)
    // todo: if multi-char constant tokens other than comments are added to
    // `readOne`, this must learn to stop at their prefixes as well.
    def eatErrors(
    ) = {
      val failureLength = remaining
        .tails
        .takeWhile { rest =>
          rest.nonEmpty &&
          !readOne.isDefinedAt(rest.head) &&
          !rest.head.isWhitespace &&
          !rest.startsWith("//")
        }
        .size

      if (failureLength > 0) {
        val (failures, rest) = remaining.splitAt(failureLength)
        add(TokenKind.Error(failures))
        remaining = rest
        true
      } else
        false
    }

    while (remaining.nonEmpty) {
      val last = remaining

      readOne.applyOrElse[Char, Any](
        remaining.head,
        (_: Char) =>
          // nothing matched. Eat whitespace and see if the rest is an error
          eatWhitespace() || eatComments() || eatErrors(),
      )

      // Safety net: every iteration must consume at least one character,
      // otherwise we'd loop forever.
      if (remaining == last)
        sys.error(s"no progress in the last run! remaining string: $remaining")
    }

    tokens.reverse
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
package playground.smithyql.parser.v2

import cats.effect.IO
import cats.implicits._
import org.scalacheck.Arbitrary
import org.scalacheck.Gen
import playground.smithyql.parser.v2.scanner.Scanner
import playground.smithyql.parser.v2.scanner.Token
import playground.smithyql.parser.v2.scanner.TokenKind
import weaver._
import weaver.scalacheck.Checkers

import Scanner.scan

object ScannerTests extends SimpleIOSuite with Checkers {

// Registers the given property twice: once with ScalaCheck's stock
// Arbitrary[String], and once with a "prepared" generator biased towards
// scanner-relevant input (identifiers, alphanumerics, whitespace characters).
def arbTests(
name: TestName
)(
withArb: Arbitrary[String] => IO[Expectations]
): Unit = {

val sampleStringGen = Gen.oneOf(
Gen.alphaStr,
Gen.alphaNumStr,
Gen.asciiPrintableStr,
Gen.identifier,
Gen.oneOf(List(' ', '\n', '\t', '\r', '\f', '\b')).map(_.toString),
)

val arbString: Arbitrary[String] = Arbitrary {
Gen.listOf(sampleStringGen).map(_.mkString)
}

// Note: `Arbitrary.arbString` below is ScalaCheck's default instance — it is
// deliberately NOT the local `arbString` defined above.
test(name)(withArb(Arbitrary.arbString))
test(name.copy(name = name.name + " (prepared input)"))(withArb(arbString))
}

// scan must never throw, regardless of input
arbTests("Any string input scans successfully") { implicit arbString =>
forall { (s: String) =>
scan(s): Unit
success
}
}

// concatenating the text of all tokens reproduces the input exactly
arbTests("Scanning is lossless") { implicit arbString =>
forall { (s: String) =>
assert.eql(scan(s).foldMap(_.text), s)
}
}

// Registers a single example-based test asserting the exact token list
// produced for `input`. A custom name can be given via `explicitName`.
private def scanTest(
input: String,
explicitName: String = "",
)(
expected: List[Token]
): Unit =
pureTest(
if (explicitName.nonEmpty)
explicitName
else
"Scan string: " + sanitize(input)
) {
assert.eql(expected, scan(input))
}

// Makes whitespace visible in test names: spaces become middle dots, newlines
// are removed.
// NOTE(review): two inputs differing only by newlines would produce colliding
// test names — worth confirming the framework tolerates that.
private def sanitize(
text: String
) = text.replace(" ", "·").replace("\n", "")

// single-character tokens
scanTest("{")(List(TokenKind.LBR("{")))
scanTest("}")(List(TokenKind.RBR("}")))
scanTest("[")(List(TokenKind.LB("[")))
scanTest("]")(List(TokenKind.RB("]")))
scanTest(".")(List(TokenKind.DOT(".")))
scanTest(",")(List(TokenKind.COMMA(",")))
scanTest("#")(List(TokenKind.HASH("#")))
scanTest("=")(List(TokenKind.EQ("=")))
scanTest("a")(List(TokenKind.IDENT("a")))

// idents
scanTest("abcdef")(List(TokenKind.IDENT("abcdef")))

scanTest(
"hello_world"
)(
List(
TokenKind.IDENT("hello_world")
)
)

scanTest(
"helloworld123"
)(
List(
TokenKind.IDENT("helloworld123")
)
)

// whitespace
scanTest(" ")(List(TokenKind.SPACE(" ")))
scanTest("\n")(List(TokenKind.NEWLINE("\n")))

// contiguous whitespace of all kinds
// notably newlines are grouped together separately from other whitespace
scanTest(" \r \r \n\n")(List(TokenKind.SPACE(" \r \r "), TokenKind.NEWLINE("\n\n")))
scanTest(" \n\n \n ")(
List(
TokenKind.SPACE(" "),
TokenKind.NEWLINE("\n\n"),
TokenKind.SPACE(" "),
TokenKind.NEWLINE("\n"),
TokenKind.SPACE(" "),
)
)

// comments
scanTest("// hello 123 foo bar --")(List(TokenKind.COMMENT("// hello 123 foo bar --")))

// the newline between the comments is its own token, not part of either comment
scanTest(
explicitName = "Scan multiple line-comments",
input =
"""//hello
|//world""".stripMargin,
)(
List(
TokenKind.COMMENT("//hello"),
TokenKind.NEWLINE("\n"),
TokenKind.COMMENT("//world"),
)
)

scanTest(
"hello world //this is a comment"
)(
List(
TokenKind.IDENT("hello"),
TokenKind.SPACE(" "),
TokenKind.IDENT("world"),
TokenKind.SPACE(" "),
TokenKind.COMMENT("//this is a comment"),
)
)

// errors

scanTest(
explicitName = "Error tokens for input that doesn't match any other token",
input = "🤷*%$^@-+?",
)(List(TokenKind.Error("🤷*%$^@-+?")))

scanTest(
explicitName = "Error tokens mixed between other tokens",
input = "hello@world-this?is=an<example",
)(
List(
TokenKind.IDENT("hello"),
TokenKind.Error("@"),
TokenKind.IDENT("world"),
TokenKind.Error("-"),
TokenKind.IDENT("this"),
TokenKind.Error("?"),
TokenKind.IDENT("is"),
TokenKind.EQ("="),
TokenKind.IDENT("an"),
TokenKind.Error("<"),
TokenKind.IDENT("example"),
)
)

}

0 comments on commit 42f4926

Please sign in to comment.