-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
337 additions
and
0 deletions.
There are no files selected for viewing
166 changes: 166 additions & 0 deletions
166
modules/parser/src/main/scala/playground/smithyql/parser/v2/scanner.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
package playground.smithyql.parser.v2.scanner | ||
|
||
import cats.kernel.Eq | ||
import cats.syntax.all.* | ||
|
||
/** A single lexical token: a [[TokenKind]] paired with the exact source text
  * it was scanned from. Retaining the raw text keeps scanning lossless —
  * concatenating the `text` of all tokens reproduces the input verbatim.
  */
case class Token(
  kind: TokenKind,
  text: String,
) {
  /** Number of characters this token occupies in the source. */
  def width: Int = text.size
}
|
||
object Token {
  // Tokens are plain immutable data, so universal (case-class) equality is
  // the correct notion of equivalence for the cats Eq instance.
  implicit val eq: Eq[Token] = Eq.fromUniversalEquals[Token]
}
|
||
/** The category of a [[Token]]. Sealed so that pattern matches over kinds are
  * checked for exhaustiveness by the compiler.
  */
sealed trait TokenKind extends Product with Serializable {

  /** Convenience constructor: pairs this kind with the given source text. */
  def apply(
    text: String
  ): Token = Token(kind = this, text = text)

}
|
||
object TokenKind {
  // Keywords.
  case object KW_IMPORT extends TokenKind

  // Single-character punctuation.
  case object DOT extends TokenKind
  case object COMMA extends TokenKind
  case object HASH extends TokenKind
  case object LB extends TokenKind
  case object RB extends TokenKind
  case object LBR extends TokenKind
  case object RBR extends TokenKind
  case object EQ extends TokenKind

  // Trivia: whitespace runs (newlines grouped separately) and line comments.
  case object SPACE extends TokenKind
  case object NEWLINE extends TokenKind
  case object COMMENT extends TokenKind

  // Identifiers.
  case object IDENT extends TokenKind

  // Characters that couldn't start any token.
  // NOTE(review): naming is inconsistent with the SCREAMING_CASE siblings,
  // but renaming would break existing callers, so it is kept as-is.
  case object Error extends TokenKind

  // Case objects are singletons, so universal equality suffices.
  implicit val eq: Eq[TokenKind] = Eq.fromUniversalEquals[TokenKind]
}
|
||
object Scanner {

  /** Entrypoint to scanning text into tokens.
    *
    * Always produces an output that can be rendered back to the original
    * text: concatenating the `text` of all returned tokens equals the input.
    * Unrecognized characters are collected into `Error` tokens rather than
    * failing the scan.
    */
  def scan(
    s: String
  ): List[Token] = {
    // Mutable cursor over the not-yet-scanned suffix of the input.
    var remaining = s
    // Tokens are prepended (O(1)) and reversed once at the end.
    var tokens = List.empty[Token]
    def add(
      tok: Token
    ) = tokens ::= tok

    // Recognizer for a single-character token: on match, emits the token and
    // advances the cursor by one character.
    def readSimple(
      token: Char,
      tok: TokenKind,
    ): PartialFunction[Char, Unit] = { case `token` =>
      add(tok(token.toString))
      remaining = remaining.tail
    }

    // Combines several single-character recognizers into one.
    def simpleTokens(
      pairings: (
        Char,
        TokenKind,
      )*
    ): PartialFunction[Char, Unit] = pairings
      .map(readSimple.tupled)
      .reduce(_ orElse _)

    // All punctuation tokens, plus identifiers: a letter followed by any run
    // of letters, digits, or underscores.
    val readOne: PartialFunction[Char, Unit] = simpleTokens(
      '.' -> TokenKind.DOT,
      ',' -> TokenKind.COMMA,
      '#' -> TokenKind.HASH,
      '[' -> TokenKind.LB,
      ']' -> TokenKind.RB,
      '{' -> TokenKind.LBR,
      '}' -> TokenKind.RBR,
      '=' -> TokenKind.EQ,
    ).orElse {
      case letter if letter.isLetter =>
        val (letters, rest) = remaining.span(ch => ch.isLetterOrDigit || ch == '_')
        add(TokenKind.IDENT(letters))
        remaining = rest
    }

    // split "whitespace" string into chains of contiguous newlines OR whitespace characters.
    def whitespaceChains(
      whitespace: String
    ): List[Token] = {
      val isNewline = (ch: Char) => ch == '\n'

      if (whitespace.isEmpty)
        Nil
      else if (isNewline(whitespace.head)) {
        val (nl, rest) = whitespace.span(isNewline)
        TokenKind.NEWLINE(nl) :: whitespaceChains(rest)
      } else {
        val (wsp, rest) = whitespace.span(!isNewline(_))
        TokenKind.SPACE(wsp) :: whitespaceChains(rest)
      }
    }

    // Consumes a run of whitespace, if any. Returns whether progress was made.
    def eatWhitespace(
    ) = {
      val (wsp, rest) = remaining.span(ch => ch.isWhitespace)
      if (wsp.isEmpty)
        false
      else {
        whitespaceChains(wsp).foreach(add)
        remaining = rest

        true
      }
    }

    // Consumes one or more consecutive line comments (each runs to end of
    // line; the newline itself is left for eatWhitespace).
    def eatComments(
    ) =
      if (!remaining.startsWith("//"))
        false
      else {
        while (remaining.startsWith("//")) {
          val (comment, rest) = remaining.span(_ != '\n')
          add(TokenKind.COMMENT(comment))
          remaining = rest
        }

        true
      }

    // Consumes a run of characters that can't start any token, emitting a
    // single Error token. The run stops at whitespace or at a "//" comment
    // start so that trivia following garbage is still tokenized normally
    // (previously it was swallowed into the Error token).
    // todo: bug: even if the next character starts a multi-char token, this will consider it an error.
    // instead, we should rework "readOne" to consume arbitrary constant-length tokens.
    def eatErrors(
    ) = {
      val failures = new StringBuilder
      // Guaranteed to advance on the first iteration: eatErrors is only
      // reached after eatWhitespace/eatComments/readOne all declined, so the
      // head is neither whitespace, a comment start, nor a valid token start.
      while (
        remaining.nonEmpty &&
        !readOne.isDefinedAt(remaining.head) &&
        !remaining.head.isWhitespace &&
        !remaining.startsWith("//")
      ) {
        failures.append(remaining.head)
        remaining = remaining.tail
      }
      if (failures.nonEmpty) {
        add(TokenKind.Error(failures.result()))
        true
      } else
        false
    }

    while (remaining.nonEmpty) {
      val last = remaining

      readOne.applyOrElse[Char, Any](
        remaining.head,
        (_: Char) =>
          // nothing matched. Eat whitespace and see if the rest is an error
          eatWhitespace() || eatComments() || eatErrors(),
      )

      // Safety net: every iteration must consume at least one character,
      // otherwise the scanner would loop forever.
      if (remaining == last)
        sys.error(s"no progress in the last run! remaining string: $remaining")
    }

    tokens.reverse
  }

}
171 changes: 171 additions & 0 deletions
171
modules/parser/src/test/scala/playground/smithyql/parser/v2/ScannerTests.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
package playground.smithyql.parser.v2 | ||
|
||
import cats.effect.IO | ||
import cats.implicits._ | ||
import org.scalacheck.Arbitrary | ||
import org.scalacheck.Gen | ||
import playground.smithyql.parser.v2.scanner.Scanner | ||
import playground.smithyql.parser.v2.scanner.Token | ||
import playground.smithyql.parser.v2.scanner.TokenKind | ||
import weaver._ | ||
import weaver.scalacheck.Checkers | ||
|
||
import Scanner.scan | ||
|
||
object ScannerTests extends SimpleIOSuite with Checkers {

  // Registers the given test twice: once with scalacheck's default arbitrary
  // strings, and once with a generator biased towards scanner-relevant input
  // (identifiers, alphanumerics, printable ASCII, and whitespace characters).
  def arbTests(
    name: TestName
  )(
    withArb: Arbitrary[String] => IO[Expectations]
  ): Unit = {

    val sampleStringGen = Gen.oneOf(
      Gen.alphaStr,
      Gen.alphaNumStr,
      Gen.asciiPrintableStr,
      Gen.identifier,
      Gen.oneOf(List(' ', '\n', '\t', '\r', '\f', '\b')).map(_.toString),
    )

    val arbString: Arbitrary[String] = Arbitrary {
      Gen.listOf(sampleStringGen).map(_.mkString)
    }

    test(name)(withArb(Arbitrary.arbString))
    test(name.copy(name = name.name + " (prepared input)"))(withArb(arbString))
  }

  // Property: the scanner never throws, whatever the input.
  arbTests("Any string input scans successfully") { implicit arbString =>
    forall { (s: String) =>
      scan(s): Unit
      success
    }
  }

  // Property: concatenating the text of all tokens reproduces the input.
  arbTests("Scanning is lossless") { implicit arbString =>
    forall { (s: String) =>
      assert.eql(scan(s).foldMap(_.text), s)
    }
  }

  // Registers a pure test asserting the exact token list produced for `input`.
  // The test name defaults to the (whitespace-sanitized) input itself.
  private def scanTest(
    input: String,
    explicitName: String = "",
  )(
    expected: List[Token]
  ): Unit =
    pureTest(
      if (explicitName.nonEmpty)
        explicitName
      else
        "Scan string: " + sanitize(input)
    ) {
      assert.eql(expected, scan(input))
    }

  // Makes whitespace visible in generated test names.
  private def sanitize(
    text: String
  ) = text.replace(" ", "·").replace("\n", "↵")

  // single-character tokens
  scanTest("{")(List(TokenKind.LBR("{")))
  scanTest("}")(List(TokenKind.RBR("}")))
  scanTest("[")(List(TokenKind.LB("[")))
  scanTest("]")(List(TokenKind.RB("]")))
  scanTest(".")(List(TokenKind.DOT(".")))
  scanTest(",")(List(TokenKind.COMMA(",")))
  scanTest("#")(List(TokenKind.HASH("#")))
  scanTest("=")(List(TokenKind.EQ("=")))
  scanTest("a")(List(TokenKind.IDENT("a")))

  // idents
  scanTest("abcdef")(List(TokenKind.IDENT("abcdef")))

  scanTest(
    "hello_world"
  )(
    List(
      TokenKind.IDENT("hello_world")
    )
  )

  scanTest(
    "helloworld123"
  )(
    List(
      TokenKind.IDENT("helloworld123")
    )
  )

  // whitespace
  scanTest(" ")(List(TokenKind.SPACE(" ")))
  scanTest("\n")(List(TokenKind.NEWLINE("\n")))

  // contiguous whitespace of all kinds
  // notably newlines are grouped together separately from other whitespace
  scanTest("  \r \r ")(List(TokenKind.SPACE("  \r \r ")))
  scanTest(" \n\n \n ")(
    List(
      TokenKind.SPACE(" "),
      TokenKind.NEWLINE("\n\n"),
      TokenKind.SPACE(" "),
      TokenKind.NEWLINE("\n"),
      TokenKind.SPACE(" "),
    )
  )

  // comments
  scanTest("// hello 123 foo bar --")(List(TokenKind.COMMENT("// hello 123 foo bar --")))

  scanTest(
    explicitName = "Scan multiple line-comments",
    input =
      """//hello
        |//world""".stripMargin,
  )(
    List(
      TokenKind.COMMENT("//hello"),
      TokenKind.NEWLINE("\n"),
      TokenKind.COMMENT("//world"),
    )
  )

  scanTest(
    "hello world //this is a comment"
  )(
    List(
      TokenKind.IDENT("hello"),
      TokenKind.SPACE(" "),
      TokenKind.IDENT("world"),
      TokenKind.SPACE(" "),
      TokenKind.COMMENT("//this is a comment"),
    )
  )

  // errors

  scanTest(
    explicitName = "Error tokens for input that doesn't match any other token",
    input = "🤷*%$^@-+?",
  )(List(TokenKind.Error("🤷*%$^@-+?")))

  scanTest(
    explicitName = "Error tokens mixed between other tokens",
    input = "hello@world-this?is=an<example",
  )(
    List(
      TokenKind.IDENT("hello"),
      TokenKind.Error("@"),
      TokenKind.IDENT("world"),
      TokenKind.Error("-"),
      TokenKind.IDENT("this"),
      TokenKind.Error("?"),
      TokenKind.IDENT("is"),
      TokenKind.EQ("="),
      TokenKind.IDENT("an"),
      TokenKind.Error("<"),
      TokenKind.IDENT("example"),
    )
  )

}