Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

V2 Parser: Add scanner #340

Merged
merged 22 commits into from
Oct 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion modules/ast/src/test/scala/playground/Assertions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ object Assertions extends Expectations.Helpers {
val stringWithResets = d.show()(conf).linesWithSeparators.map(Console.RESET + _).mkString

failure(
s"Diff failed:\n${Console.RESET}(${conf.right("expected")}, ${conf.left("actual")})\n\n" + stringWithResets
s"Diff failed:\n${Console.RESET}(${conf.left("expected")}, ${conf.right("actual")})\n\n" + stringWithResets
)
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
package playground.smithyql.parser.v2.scanner

import cats.kernel.Eq
import cats.parse.Numbers
import cats.syntax.all.*

import scala.annotation.nowarn

/** A single lexical token: a [[TokenKind]] paired with the exact source text
  * it covers. Concatenating the `text` of all scanned tokens reproduces the
  * original input (see the guarantee on [[Scanner.scan]]).
  */
case class Token(
  kind: TokenKind,
  text: String,
) {
  /** Number of source characters this token spans. */
  def width: Int = text.length
}

object Token {
  // Universal equality is sound here: Token is a case class of immutable fields.
  implicit val eq: Eq[Token] = Eq.fromUniversalEquals
}

/** The lexical category of a token.
  *
  * `apply` is a convenience for building a [[Token]] of this kind from its
  * source text, e.g. `TokenKind.IDENT("foo")`.
  */
sealed trait TokenKind extends Product with Serializable {

  def apply(
    text: String
  ): Token = Token(this, text)

}

object TokenKind {
  // Keywords and literals.
  case object KW_USE extends TokenKind
  case object KW_SERVICE extends TokenKind
  case object KW_BOOLEAN extends TokenKind
  case object LIT_NUMBER extends TokenKind
  case object LIT_STRING extends TokenKind
  case object KW_NULL extends TokenKind

  // Punctuation and delimiters: LB/RB are square brackets, LBR/RBR are braces.
  case object DOT extends TokenKind
  case object COMMA extends TokenKind
  case object HASH extends TokenKind
  case object LB extends TokenKind
  case object RB extends TokenKind
  case object LBR extends TokenKind
  case object RBR extends TokenKind
  case object COLON extends TokenKind
  case object EQ extends TokenKind
  // Trivia: whitespace runs, newline runs, and line comments.
  case object SPACE extends TokenKind
  case object NEWLINE extends TokenKind
  case object IDENT extends TokenKind
  case object COMMENT extends TokenKind
  // Catch-all for input no reader recognizes; the scanner never drops text.
  // NOTE(review): `Error` breaks the UPPER_SNAKE convention of the other
  // kinds — renaming to ERROR would be clearer, but callers reference
  // TokenKind.Error, so it would be a breaking rename.
  case object Error extends TokenKind

  implicit val eq: Eq[TokenKind] = Eq.fromUniversalEquals
}

object Scanner {

  /** Entrypoint to scanning text into tokens.
    *
    * Always produces an output that can be rendered back to the original text:
    * every character of the input — including whitespace, comments and
    * unrecognized garbage — ends up in the `text` of some token.
    */
  def scan(
    s: String
  ): List[Token] = {
    // Mutable cursor state: `remaining` is the unconsumed suffix of the input,
    // `tokens` accumulates output in reverse (prepend, reverse at the end).
    var remaining = s
    var tokens = List.empty[Token]
    def add(
      tok: Token
    ) = tokens ::= tok

    // Matches when `remaining` starts with the literal `token`: consumes it
    // and emits a token of the given kind.
    def readSimple(
      token: String,
      tok: TokenKind,
    ): PartialFunction[Unit, Unit] = {
      case _ if remaining.startsWith(token) =>
        // fix: `token` is already a String — the previous `token.toString` was
        // a redundant identity call.
        add(tok(token))
        remaining = remaining.drop(token.length())
    }

    // Folds several literal readers into a single partial function. The first
    // matching pairing wins, so order matters for overlapping prefixes.
    def simpleTokens(
      pairings: (
        String,
        TokenKind,
      )*
    ): PartialFunction[Unit, Unit] = pairings.map(readSimple.tupled).reduce(_.orElse(_))

    // Identifiers that are reserved words scan to dedicated keyword kinds.
    val keywords = Map(
      "use" -> TokenKind.KW_USE,
      "service" -> TokenKind.KW_SERVICE,
      "null" -> TokenKind.KW_NULL,
      "true" -> TokenKind.KW_BOOLEAN,
      "false" -> TokenKind.KW_BOOLEAN,
    )

    // Reads an identifier or keyword: a leading letter followed by letters,
    // digits or underscores.
    val readIdent: PartialFunction[Unit, Unit] = {
      case _ if remaining.head.isLetter =>
        val (letters, rest) = remaining.span(ch => ch.isLetterOrDigit || ch == '_')

        keywords.get(letters) match {
          case Some(kind) =>
            // we matched a keyword, return it.
            add(kind(letters))

          case None =>
            // normal ident
            add(TokenKind.IDENT(letters))
        }

        remaining = rest
    }

    // Single-character punctuation and delimiters.
    val readPunctuation: PartialFunction[Unit, Unit] = simpleTokens(
      "." -> TokenKind.DOT,
      "," -> TokenKind.COMMA,
      "#" -> TokenKind.HASH,
      "[" -> TokenKind.LB,
      "]" -> TokenKind.RB,
      "{" -> TokenKind.LBR,
      "}" -> TokenKind.RBR,
      ":" -> TokenKind.COLON,
      "=" -> TokenKind.EQ,
    )

    // Reads a double-quoted string literal. There is no escape handling: the
    // literal ends at the first closing quote. An unterminated literal (EOF
    // before the closing quote) is still emitted as LIT_STRING, preserving
    // the "renders back to the input" guarantee.
    val readStringLiteral: PartialFunction[Unit, Unit] = {
      case _ if remaining.startsWith("\"") =>
        val (str, rest) = remaining.tail.span(_ != '\"')
        if (rest.isEmpty) { // hit EOF
          add(TokenKind.LIT_STRING("\"" + str))
          remaining = rest
        } else {
          add(TokenKind.LIT_STRING("\"" + str + "\""))
          remaining = rest.tail
        }
    }

    val readNumberLiteral: PartialFunction[Unit, Unit] = {
      // I love this language
      // Extractor matching when `remaining` starts with a JSON number,
      // yielding (rest-of-input, matched-number-text).
      object jsonNumber {
        def unapply(
          @nowarn("cat=unused")
          unused: Unit
        ): Option[
          (
            String,
            String,
          )
        ] =
          // For now, we're using the cats-parse implementation simply because it's consistent with the current implementation
          // and we can rewrite this later on when we drop support for the other parser
          // and no longer need cats-parse.
          Numbers.jsonNumber.parse(remaining).toOption
      }

      { case jsonNumber(rest, num) =>
        add(TokenKind.LIT_NUMBER(num.toString))
        remaining = rest
      }
    }

    // readOne and friends are all partial functions: this is the current implementation of lookahead.
    // it's not great, but it kinda works.
    val readOne: PartialFunction[Unit, Unit] = readIdent
      .orElse(readPunctuation)
      .orElse(readStringLiteral)
      .orElse(readNumberLiteral)

    // split "whitespace" string into chains of contiguous newlines OR whitespace characters.
    def whitespaceChains(
      whitespace: String
    ): List[Token] = {
      val isNewline = (ch: Char) => ch == '\n'

      if (whitespace.isEmpty)
        Nil
      else if (isNewline(whitespace.head)) {
        val (nl, rest) = whitespace.span(isNewline)
        TokenKind.NEWLINE(nl) :: whitespaceChains(rest)
      } else {
        val (wsp, rest) = whitespace.span(!isNewline(_))
        TokenKind.SPACE(wsp) :: whitespaceChains(rest)
      }
    }

    // Consumes leading whitespace (if any) into SPACE/NEWLINE tokens.
    // Returns whether any progress was made.
    def eatWhitespace(
    ) = {
      val (wsp, rest) = remaining.span(ch => ch.isWhitespace)
      if (wsp.isEmpty())
        false
      else {
        whitespaceChains(wsp).foreach(add)
        remaining = rest

        true
      }
    }

    // Consumes consecutive line comments ("//" up to end of line), emitting
    // one COMMENT token per line. Returns whether any progress was made.
    def eatComments(
    ) =
      if (!remaining.startsWith("//"))
        false
      else {
        while (remaining.startsWith("//")) {
          val (comment, rest) = remaining.span(_ != '\n')
          add(TokenKind.COMMENT(comment))
          remaining = rest
        }

        true
      }

    // Fallback: consumes characters no reader recognizes into a single Error
    // token, stopping as soon as `readOne` would match again.
    def eatErrors(
    ) = {
      // todo: bug: even if the next character starts a multi-char token, this will consider it an error.
      // instead, we should rework "readOne" to consume arbitrary constant-length tokens, and also include the possibility that `rest` has comments or whitespace.
      // NOTE: the predicate deliberately side-effects on `remaining`. `span`
      // iterates over the snapshot of `remaining` taken at this call, while
      // the cursor is advanced one character per failed match, so `readOne`
      // always looks at the not-yet-consumed suffix.
      val (failures, _) = remaining.span { _ =>
        if (readOne.isDefinedAt(()))
          // this will match. stop!
          false
        else {
          // didn't match. We need to move the cursor manually here
          remaining = remaining.tail
          true
        }
      }

      if (failures.nonEmpty) {
        add(TokenKind.Error(failures))
        true
      } else
        false
    }

    // Main loop: each pass consumes at least one character via exactly one of
    // the steps below (`||` short-circuits, so only the first successful step
    // runs per iteration). `readOne.lift(())` runs the reader's side effects
    // when it matches.
    while (remaining.nonEmpty) {
      val last = remaining

      readOne.lift(()).isDefined ||
        eatWhitespace() ||
        eatComments() ||
        eatErrors(): Unit

      // last-effort sanity check
      if (remaining == last)
        sys.error(s"no progress in the last run! remaining string: $remaining")
    }

    tokens.reverse
  }

}
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ import io.circe.Decoder
import io.circe.syntax._
import playground.Assertions._
import playground.smithyql._
import playground.smithyql.parser.v2.scanner.Scanner
import playground.smithyql.parser.v2.scanner.TokenKind
import weaver._

import java.nio.file
Expand Down Expand Up @@ -52,11 +54,30 @@ trait ParserSuite extends SimpleIOSuite {
}
}
}

validTokensTest(testCase, trimWhitespace)
}

  /** Registers a test asserting that the v2 scanner tokenizes the test case's
    * input without producing any Error tokens.
    *
    * @param testCase source of the input text
    * @param trimWhitespace whether to trim the input before scanning
    */
  private def validTokensTest(
    testCase: TestCase,
    trimWhitespace: Boolean,
  ) =
    test(testCase.name + " (v2 scanner)") {
      testCase.readInput(trimWhitespace).map { input =>
        val scanned = Scanner.scan(input)

        val errors = scanned.filter(_.kind == TokenKind.Error)
        // non-empty inputs should parse to non-empty outputs
        assert(input.isEmpty || scanned.nonEmpty) &&
          assert(errors.isEmpty)
      }
    }

// invalidTokens: a flag that tells the suite whether the file should contain invalid tokens.
def loadNegativeParserTests[Alg[_[_]]: SourceParser](
prefix: String,
trimWhitespace: Boolean = false,
invalidTokens: Boolean,
): Unit = loadTestCases("", List("negative", prefix)).foreach { testCase =>
test(testCase.name) {
testCase.readInput(trimWhitespace).map { input =>
Expand All @@ -66,6 +87,10 @@ trait ParserSuite extends SimpleIOSuite {
}
}
}

if (!invalidTokens)
validTokensTest(testCase, trimWhitespace)

}

private def readText(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ import playground.smithyql.Prelude
import playground.smithyql.parser.ParserSuite

object PreludeParserNegativeTests extends ParserSuite {
  // These inputs fail to parse as a Prelude, but every token in them is
  // individually valid — hence invalidTokens = false, which also runs the
  // v2 scanner "no Error tokens" check against each case.
  loadNegativeParserTests[Prelude]("prelude", trimWhitespace = true, invalidTokens = false)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package playground.smithyql.parser.v2

import com.softwaremill.diffx.Diff
import playground.smithyql.parser.v2.scanner.Token
import playground.smithyql.parser.v2.scanner.TokenKind

/** Diffx instances for the v2 scanner's token types, so test assertions can
  * render readable structural diffs of scanned token lists.
  */
object Diffs {

  implicit val tokenKindDiff: Diff[TokenKind] = Diff.derived
  implicit val tokenDiff: Diff[Token] = Diff.derived

}
Loading