From ba5a324935a38484e132f7df8e030df6f6e8aa76 Mon Sep 17 00:00:00 2001 From: Wilhelm Behncke Date: Fri, 11 Aug 2023 11:34:09 +0200 Subject: [PATCH] TASK: Prepare Lexer interface for parser use cases --- .../Lexer/CharacterStream/CharacterStream.php | 24 ++ .../CharacterStreamSnapshot.php | 36 ++ src/Language/Lexer/CharacterStream/Cursor.php | 21 ++ .../Lexer/CharacterStream/CursorSnapshot.php | 37 ++ src/Language/Lexer/Lexer.php | 343 +++++++++++++----- src/Language/Lexer/LexerException.php | 37 ++ src/Language/Lexer/Matcher/Matcher.php | 20 +- .../Lexer/Matcher/Optional/Optional.php | 43 +++ src/Language/Lexer/Token/TokenType.php | 7 +- test/Unit/Language/Lexer/LexerTest.php | 136 ++++--- 10 files changed, 547 insertions(+), 157 deletions(-) create mode 100644 src/Language/Lexer/CharacterStream/CharacterStreamSnapshot.php create mode 100644 src/Language/Lexer/CharacterStream/CursorSnapshot.php create mode 100644 src/Language/Lexer/Matcher/Optional/Optional.php diff --git a/src/Language/Lexer/CharacterStream/CharacterStream.php b/src/Language/Lexer/CharacterStream/CharacterStream.php index df8a7c4..cf13037 100644 --- a/src/Language/Lexer/CharacterStream/CharacterStream.php +++ b/src/Language/Lexer/CharacterStream/CharacterStream.php @@ -24,6 +24,9 @@ use PackageFactory\ComponentEngine\Parser\Source\Position; +/** + * @internal + */ final class CharacterStream { private int $byte; @@ -81,4 +84,25 @@ public function getPreviousPosition(): Position { return $this->cursor->getPreviousPosition(); } + + public function makeSnapshot(): CharacterStreamSnapshot + { + return new CharacterStreamSnapshot( + byte: $this->byte, + cursor: $this->cursor->makeSnapshot(), + characterUnderCursor: $this->characterUnderCursor + ); + } + + public function restoreSnapshot(CharacterStreamSnapshot $snapshot): void + { + $this->byte = $snapshot->byte; + $this->cursor->restoreSnapshot($snapshot->cursor); + $this->characterUnderCursor = $snapshot->characterUnderCursor; + } + + public function getRest(): string + { + return $this->characterUnderCursor . substr($this->source, $this->byte); + } } diff --git a/src/Language/Lexer/CharacterStream/CharacterStreamSnapshot.php b/src/Language/Lexer/CharacterStream/CharacterStreamSnapshot.php new file mode 100644 index 0000000..e101b5f --- /dev/null +++ b/src/Language/Lexer/CharacterStream/CharacterStreamSnapshot.php @@ -0,0 +1,36 @@ +. + */ + +declare(strict_types=1); + +namespace PackageFactory\ComponentEngine\Language\Lexer\CharacterStream; + +/** + * @internal + */ +final class CharacterStreamSnapshot +{ + public function __construct( + public readonly int $byte, + public readonly CursorSnapshot $cursor, + public readonly ?string $characterUnderCursor = null + ) { + } +} diff --git a/src/Language/Lexer/CharacterStream/Cursor.php b/src/Language/Lexer/CharacterStream/Cursor.php index d2f5c48..f2bf00b 100644 --- a/src/Language/Lexer/CharacterStream/Cursor.php +++ b/src/Language/Lexer/CharacterStream/Cursor.php @@ -24,6 +24,9 @@ use PackageFactory\ComponentEngine\Parser\Source\Position; +/** + * @internal + */ final class Cursor { private int $currentLineNumber = 0; @@ -58,4 +61,22 @@ public function getPreviousPosition(): Position return new Position($this->previousLineNumber, $this->previousColumnNumber); } + + public function makeSnapshot(): CursorSnapshot + { + return new CursorSnapshot( + currentLineNumber: $this->currentLineNumber, + currentColumnNumber: $this->currentColumnNumber, + previousLineNumber: $this->previousLineNumber, + previousColumnNumber: $this->previousColumnNumber + ); + } + + public function restoreSnapshot(CursorSnapshot $snapshot): void + { + $this->currentLineNumber = $snapshot->currentLineNumber; + $this->currentColumnNumber = $snapshot->currentColumnNumber; + $this->previousLineNumber = $snapshot->previousLineNumber; + $this->previousColumnNumber = $snapshot->previousColumnNumber; + } } diff --git a/src/Language/Lexer/CharacterStream/CursorSnapshot.php b/src/Language/Lexer/CharacterStream/CursorSnapshot.php new file mode 100644 index 0000000..eadc09b --- /dev/null +++ b/src/Language/Lexer/CharacterStream/CursorSnapshot.php @@ -0,0 +1,37 @@ +. + */ + +declare(strict_types=1); + +namespace PackageFactory\ComponentEngine\Language\Lexer\CharacterStream; + +/** + * @internal + */ +final class CursorSnapshot +{ + public function __construct( + public readonly int $currentLineNumber, + public readonly int $currentColumnNumber, + public readonly int $previousLineNumber, + public readonly int $previousColumnNumber + ) { + } +} diff --git a/src/Language/Lexer/Lexer.php b/src/Language/Lexer/Lexer.php index 8993183..3a83ff1 100644 --- a/src/Language/Lexer/Lexer.php +++ b/src/Language/Lexer/Lexer.php @@ -22,6 +22,7 @@ namespace PackageFactory\ComponentEngine\Language\Lexer; +use LogicException; use PackageFactory\ComponentEngine\Language\Lexer\CharacterStream\CharacterStream; use PackageFactory\ComponentEngine\Language\Lexer\Matcher\Matcher; use PackageFactory\ComponentEngine\Language\Lexer\Matcher\Result; @@ -33,135 +34,253 @@ final class Lexer { + private readonly TokenTypes $TOKEN_TYPES_SPACE; + private readonly TokenTypes $TOKEN_TYPES_SPACE_AND_COMMENTS; + private readonly CharacterStream $characterStream; - private ?Position $startPosition = null; + private Position $startPosition; private int $offset = 0; private string $buffer = ''; private ?TokenType $tokenTypeUnderCursor = null; private ?Token $tokenUnderCursor = null; - private ?LexerException $latestError = null; public function __construct(string $source) { + $this->TOKEN_TYPES_SPACE = TokenTypes::from( + TokenType::SPACE, + TokenType::END_OF_LINE + ); + $this->TOKEN_TYPES_SPACE_AND_COMMENTS = TokenTypes::from( + TokenType::SPACE, + TokenType::END_OF_LINE, + TokenType::COMMENT + ); + $this->characterStream = new CharacterStream($source); + $this->startPosition = Position::zero(); } - public function read(TokenType $tokenType): void + public function getTokenTypeUnderCursor(): TokenType { - assert($this->latestError === null); - $this->startPosition = $this->characterStream->getCurrentPosition(); + assert($this->tokenTypeUnderCursor !== null); - if ($this->characterStream->isEnd()) { - throw $this->latestError = LexerException::becauseOfUnexpectedEndOfSource( - expectedTokenTypes: TokenTypes::from($tokenType), - affectedRangeInSource: $this->startPosition->toRange() + return $this->tokenTypeUnderCursor; + } + + public function getTokenUnderCursor(): Token + { + return $this->tokenUnderCursor ??= new Token( + rangeInSource: Range::from($this->startPosition, $this->getEndPosition()), + type: $this->getTokenTypeUnderCursor(), + value: $this->buffer + ); + } + + public function isEnd(): bool + { + return $this->characterStream->isEnd(); + } + + public function assertIsEnd(): void + { + if (!$this->isEnd()) { + throw LexerException::becauseOfUnexpectedExceedingSource( + affectedRangeInSource: $this->characterStream->getCurrentPosition()->toRange(), + exceedingCharacter: $this->characterStream->current() ?? '' ); } + } - $this->tokenTypeUnderCursor = null; - $this->tokenUnderCursor = null; - $this->offset = 0; - $this->buffer = ''; + public function getStartPosition(): Position + { - while (true) { - $character = $this->characterStream->current(); - $result = Matcher::for($tokenType)->match($character, $this->offset); + return $this->startPosition; + } - if ($result === Result::KEEP) { - $this->offset++; - $this->buffer .= $character; - $this->characterStream->next(); - continue; - } + public function getEndPosition(): Position + { - if ($result === Result::SATISFIED) { - $this->tokenTypeUnderCursor = $tokenType; - break; - } + return $this->characterStream->getPreviousPosition(); + } - if ($result === Result::CANCEL) { - throw $this->latestError = LexerException::becauseOfUnexpectedCharacterSequence( - expectedTokenTypes: TokenTypes::from($tokenType), - affectedRangeInSource: Range::from( - $this->startPosition, - $this->characterStream->getCurrentPosition() - ), - actualCharacterSequence: $this->buffer . $character - ); - } + public function read(TokenType $tokenType): void + { + + if ($this->characterStream->isEnd()) { + throw LexerException::becauseOfUnexpectedEndOfSource( + expectedTokenTypes: TokenTypes::from($tokenType), + affectedRangeInSource: $this->characterStream->getCurrentPosition()->toRange() + ); + } + + if ($this->extract($tokenType)) { + $this->tokenTypeUnderCursor = $tokenType; + return; } + + throw LexerException::becauseOfUnexpectedCharacterSequence( + expectedTokenTypes: TokenTypes::from($tokenType), + affectedRangeInSource: Range::from( + $this->startPosition, + $this->characterStream->getCurrentPosition() + ), + actualCharacterSequence: $this->buffer . $this->characterStream->current() + ); } public function readOneOf(TokenTypes $tokenTypes): void { - assert($this->latestError === null); - $this->startPosition = $this->characterStream->getCurrentPosition(); if ($this->characterStream->isEnd()) { - throw $this->latestError = LexerException::becauseOfUnexpectedEndOfSource( + throw LexerException::becauseOfUnexpectedEndOfSource( expectedTokenTypes: $tokenTypes, - affectedRangeInSource: $this->startPosition->toRange() + affectedRangeInSource: $this->characterStream->getCurrentPosition()->toRange() ); } - $this->tokenTypeUnderCursor = null; - $this->tokenUnderCursor = null; - $this->offset = 0; - $this->buffer = ''; + $foundTokenType = $this->extractOneOf($tokenTypes); + if ($foundTokenType === null) { + throw LexerException::becauseOfUnexpectedCharacterSequence( + expectedTokenTypes: $tokenTypes, + affectedRangeInSource: Range::from( + $this->startPosition, + $this->characterStream->getPreviousPosition() + ), + actualCharacterSequence: $this->buffer + ); + } - $tokenTypeCandidates = $tokenTypes->items; - while (count($tokenTypeCandidates)) { - $character = $this->characterStream->current(); + $this->tokenTypeUnderCursor = $foundTokenType; + } - $nextTokenTypeCandidates = []; - foreach ($tokenTypeCandidates as $tokenType) { - $result = Matcher::for($tokenType)->match($character, $this->offset); + public function probe(TokenType $tokenType): bool + { - if ($result === Result::KEEP) { - $nextTokenTypeCandidates[] = $tokenType; - continue; - } + if ($this->characterStream->isEnd()) { + return false; + } - if ($result === Result::SATISFIED) { - $this->tokenTypeUnderCursor = $tokenType; - return; - } - } + $snapshot = $this->characterStream->makeSnapshot(); - $this->offset++; - $this->buffer .= $character; - $tokenTypeCandidates = $nextTokenTypeCandidates; - $this->characterStream->next(); + if ($tokenType = $this->extract($tokenType)) { + $this->tokenTypeUnderCursor = $tokenType; + return true; } - throw $this->latestError = LexerException::becauseOfUnexpectedCharacterSequence( - expectedTokenTypes: $tokenTypes, - affectedRangeInSource: Range::from( - $this->startPosition, - $this->characterStream->getPreviousPosition() - ), - actualCharacterSequence: $this->buffer - ); + $this->characterStream->restoreSnapshot($snapshot); + return false; + } + + public function probeOneOf(TokenTypes $tokenTypes): bool + { + if ($this->characterStream->isEnd()) { + return false; + } + + $snapshot = $this->characterStream->makeSnapshot(); + + if ($tokenType = $this->extractOneOf($tokenTypes)) { + $this->tokenTypeUnderCursor = $tokenType; + return true; + } + + $this->characterStream->restoreSnapshot($snapshot); + return false; + } + + public function peek(TokenType $tokenType): bool + { + if ($this->characterStream->isEnd()) { + return false; + } + + $snapshot = $this->characterStream->makeSnapshot(); + $result = $this->extract($tokenType) !== null; + $this->characterStream->restoreSnapshot($snapshot); + + return $result; + } + + public function peekOneOf(TokenTypes $tokenTypes): ?TokenType + { + if ($this->characterStream->isEnd()) { + return null; + } + + $snapshot = $this->characterStream->makeSnapshot(); + $foundTokenType = $this->extractOneOf($tokenTypes); + $this->characterStream->restoreSnapshot($snapshot); + + return $foundTokenType; + } + + public function expect(TokenType $tokenType): void + { + if ($this->characterStream->isEnd()) { + throw LexerException::becauseOfUnexpectedEndOfSource( + expectedTokenTypes: TokenTypes::from($tokenType), + affectedRangeInSource: $this->characterStream->getCurrentPosition()->toRange() + ); + } + + $snapshot = $this->characterStream->makeSnapshot(); + if ($this->extract($tokenType) === null) { + throw LexerException::becauseOfUnexpectedCharacterSequence( + expectedTokenTypes: TokenTypes::from($tokenType), + affectedRangeInSource: Range::from( + $this->startPosition, + $this->characterStream->getPreviousPosition() + ), + actualCharacterSequence: $this->buffer + ); + } + + $this->characterStream->restoreSnapshot($snapshot); + } + + public function expectOneOf(TokenTypes $tokenTypes): TokenType + { + if ($this->characterStream->isEnd()) { + throw LexerException::becauseOfUnexpectedEndOfSource( + expectedTokenTypes: $tokenTypes, + affectedRangeInSource: $this->characterStream->getCurrentPosition()->toRange() + ); + } + + $snapshot = $this->characterStream->makeSnapshot(); + $foundTokenType = $this->extractOneOf($tokenTypes); + if ($foundTokenType === null) { + throw LexerException::becauseOfUnexpectedCharacterSequence( + expectedTokenTypes: $tokenTypes, + affectedRangeInSource: Range::from( + $this->startPosition, + $this->characterStream->getPreviousPosition() + ), + actualCharacterSequence: $this->buffer + ); + } + + $this->characterStream->restoreSnapshot($snapshot); + + return $foundTokenType; } public function skipSpace(): void { - assert($this->latestError === null); - $this->skip(TokenType::SPACE, TokenType::END_OF_LINE); + $this->skipAnyOf($this->TOKEN_TYPES_SPACE); } public function skipSpaceAndComments(): void { - assert($this->latestError === null); - $this->skip(TokenType::SPACE, TokenType::END_OF_LINE, TokenType::COMMENT); + $this->skipAnyOf($this->TOKEN_TYPES_SPACE_AND_COMMENTS); } - private function skip(TokenType ...$tokenTypes): void + private function skipAnyOf(TokenTypes $tokenTypes): void { while (true) { $character = $this->characterStream->current(); - foreach ($tokenTypes as $tokenType) { + foreach ($tokenTypes->items as $tokenType) { $matcher = Matcher::for($tokenType); if ($matcher->match($character, 0) === Result::KEEP) { @@ -174,24 +293,66 @@ private function skip(TokenType ...$tokenTypes): void } } - public function getTokenUnderCursor(): Token + private function extract(TokenType $tokenType): ?TokenType { - assert($this->latestError === null); - assert($this->startPosition !== null); - assert($this->tokenTypeUnderCursor !== null); + $this->startPosition = $this->characterStream->getCurrentPosition(); + $this->tokenUnderCursor = null; + $this->offset = 0; + $this->buffer = ''; - return $this->tokenUnderCursor ??= new Token( - rangeInSource: Range::from( - $this->startPosition, - $this->characterStream->getPreviousPosition() - ), - type: $this->tokenTypeUnderCursor, - value: $this->buffer - ); + while (true) { + $character = $this->characterStream->current(); + $result = Matcher::for($tokenType)->match($character, $this->offset); + + if ($result === Result::SATISFIED) { + return $tokenType; + } + + if ($result === Result::CANCEL) { + return null; + } + + $this->offset++; + $this->buffer .= $character; + $this->characterStream->next(); + } } - public function isEnd(): bool + private function extractOneOf(TokenTypes $tokenTypes): ?TokenType { - return $this->characterStream->isEnd(); + $this->startPosition = $this->characterStream->getCurrentPosition(); + $this->tokenUnderCursor = null; + $this->offset = 0; + $this->buffer = ''; + + $tokenTypeCandidates = $tokenTypes->items; + while (count($tokenTypeCandidates)) { + $character = $this->characterStream->current(); + + $nextTokenTypeCandidates = []; + foreach ($tokenTypeCandidates as $tokenType) { + $result = Matcher::for($tokenType)->match($character, $this->offset); + + if ($result === Result::SATISFIED) { + return $tokenType; + } + + if ($result === Result::KEEP) { + $nextTokenTypeCandidates[] = $tokenType; + } + } + + $this->offset++; + $this->buffer .= $character; + $tokenTypeCandidates = $nextTokenTypeCandidates; + $this->characterStream->next(); + } + + return null; + } + + public function dumpRest(): string + { + return $this->characterStream->getRest(); } } diff --git a/src/Language/Lexer/LexerException.php b/src/Language/Lexer/LexerException.php index 85e823f..99a0bbe 100644 --- a/src/Language/Lexer/LexerException.php +++ b/src/Language/Lexer/LexerException.php @@ -22,6 +22,7 @@ namespace PackageFactory\ComponentEngine\Language\Lexer; +use PackageFactory\ComponentEngine\Language\Lexer\Token\Token; use PackageFactory\ComponentEngine\Language\Lexer\Token\TokenTypes; use PackageFactory\ComponentEngine\Language\Util\DebugHelper; use PackageFactory\ComponentEngine\Parser\Source\Range; @@ -33,6 +34,13 @@ private function __construct( string $message, public readonly Range $affectedRangeInSource ) { + $message = sprintf( + '[%s:%s] %s', + $affectedRangeInSource->start->lineNumber, + $affectedRangeInSource->start->columnNumber, + $message + ); + parent::__construct($message, $code); } @@ -65,4 +73,33 @@ public static function becauseOfUnexpectedCharacterSequence( affectedRangeInSource: $affectedRangeInSource ); } + + public static function becauseOfUnexpectedToken( + TokenTypes $expectedTokenTypes, + Token $actualToken + ): self { + return new self( + code: 1691575769, + message: sprintf( + 'Unexpected token "%s" was encountered. Expected %s instead.', + DebugHelper::describeToken($actualToken), + DebugHelper::describeTokenTypes($expectedTokenTypes) + ), + affectedRangeInSource: $actualToken->rangeInSource + ); + } + + public static function becauseOfUnexpectedExceedingSource( + Range $affectedRangeInSource, + string $exceedingCharacter + ): self { + return new self( + code: 1691675396, + message: sprintf( + 'Expected source to end, but found exceeding character "%s".', + $exceedingCharacter + ), + affectedRangeInSource: $affectedRangeInSource + ); + } } diff --git a/src/Language/Lexer/Matcher/Matcher.php b/src/Language/Lexer/Matcher/Matcher.php index af6249b..88e8d69 100644 --- a/src/Language/Lexer/Matcher/Matcher.php +++ b/src/Language/Lexer/Matcher/Matcher.php @@ -26,6 +26,7 @@ use PackageFactory\ComponentEngine\Language\Lexer\Matcher\Exact\Exact; use PackageFactory\ComponentEngine\Language\Lexer\Matcher\Fixed\Fixed; use PackageFactory\ComponentEngine\Language\Lexer\Matcher\Not\Not; +use PackageFactory\ComponentEngine\Language\Lexer\Matcher\Optional\Optional; use PackageFactory\ComponentEngine\Language\Lexer\Matcher\Sequence\Sequence; use PackageFactory\ComponentEngine\Language\Lexer\Token\TokenType; @@ -40,7 +41,10 @@ final public static function for(TokenType $tokenType): self { return self::$instancesByTokenType[$tokenType->value] ??= match ($tokenType) { TokenType::COMMENT => - new Sequence(new Exact('#'), new Not(new Exact("\n"))), + new Sequence( + new Exact('#'), + new Optional(new Not(new Exact("\n"))) + ), TokenType::KEYWORD_FROM => new Exact('from'), @@ -70,7 +74,7 @@ final public static function for(TokenType $tokenType): self TokenType::STRING_LITERAL_DELIMITER => new Exact('"'), TokenType::STRING_LITERAL_CONTENT => - new Not(new Characters('"\\' . "\n")), + new Not(new Characters('"\\')), TokenType::INTEGER_BINARY => new Sequence(new Exact('0b'), new Characters('01')), @@ -147,12 +151,16 @@ final public static function for(TokenType $tokenType): self new Exact('&&'), TokenType::SYMBOL_BOOLEAN_OR => new Exact('||'), - TokenType::SYMBOL_STRICT_EQUALs => + TokenType::SYMBOL_STRICT_EQUALS => new Exact('==='), - TokenType::SYMBOL_NOT_EQUALs => + TokenType::SYMBOL_NOT_EQUALS => new Exact('!=='), + TokenType::SYMBOL_GREATER_THAN => + new Exact('>'), TokenType::SYMBOL_GREATER_THAN_OR_EQUAL => new Exact('>='), + TokenType::SYMBOL_LESS_THAN => + new Exact('<'), TokenType::SYMBOL_LESS_THAN_OR_EQUAL => new Exact('<='), TokenType::SYMBOL_ARROW_SINGLE => @@ -161,13 +169,15 @@ final public static function for(TokenType $tokenType): self new Exact('?.'), TokenType::SYMBOL_NULLISH_COALESCE => new Exact('??'), + TokenType::SYMBOL_CLOSE_TAG => + new Exact(' new Characters( 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789' ), TokenType::TEXT => - new Not(new Characters('<{}>')), + new Not(new Characters('<{}>' . " \t\n")), TokenType::SPACE => new Characters(" \t"), diff --git a/src/Language/Lexer/Matcher/Optional/Optional.php b/src/Language/Lexer/Matcher/Optional/Optional.php new file mode 100644 index 0000000..20de392 --- /dev/null +++ b/src/Language/Lexer/Matcher/Optional/Optional.php @@ -0,0 +1,43 @@ +. + */ + +declare(strict_types=1); + +namespace PackageFactory\ComponentEngine\Language\Lexer\Matcher\Optional; + +use PackageFactory\ComponentEngine\Language\Lexer\Matcher\Result; +use PackageFactory\ComponentEngine\Language\Lexer\Matcher\Matcher; + +final class Optional extends Matcher +{ + public function __construct(private readonly Matcher $innerMatcher) + { + } + + public function match(?string $character, int $offset): Result + { + $result = $this->innerMatcher->match($character, $offset); + if ($offset === 0 && $result === Result::CANCEL) { + return Result::SATISFIED; + } + + return $result; + } +} diff --git a/src/Language/Lexer/Token/TokenType.php b/src/Language/Lexer/Token/TokenType.php index 93bd0c3..3fdbd7e 100644 --- a/src/Language/Lexer/Token/TokenType.php +++ b/src/Language/Lexer/Token/TokenType.php @@ -75,13 +75,16 @@ enum TokenType: string case SYMBOL_PIPE = 'SYMBOL_PIPE'; case SYMBOL_BOOLEAN_AND = 'SYMBOL_BOOLEAN_AND'; case SYMBOL_BOOLEAN_OR = 'SYMBOL_BOOLEAN_OR'; - case SYMBOL_STRICT_EQUALs = 'SYMBOL_STRICT_EQUALs'; - case SYMBOL_NOT_EQUALs = 'SYMBOL_NOT_EQUALs'; + case SYMBOL_STRICT_EQUALS = 'SYMBOL_STRICT_EQUALS'; + case SYMBOL_NOT_EQUALS = 'SYMBOL_NOT_EQUALS'; + case SYMBOL_GREATER_THAN = 'SYMBOL_GREATER_THAN'; case SYMBOL_GREATER_THAN_OR_EQUAL = 'SYMBOL_GREATER_THAN_OR_EQUAL'; + case SYMBOL_LESS_THAN = 'SYMBOL_LESS_THAN'; case SYMBOL_LESS_THAN_OR_EQUAL = 'SYMBOL_LESS_THAN_OR_EQUAL'; case SYMBOL_ARROW_SINGLE = 'SYMBOL_ARROW_SINGLE'; case SYMBOL_OPTCHAIN = 'SYMBOL_OPTCHAIN'; case SYMBOL_NULLISH_COALESCE = 'SYMBOL_NULLISH_COALESCE'; + case SYMBOL_CLOSE_TAG = 'SYMBOL_CLOSE_TAG'; case WORD = 'WORD'; case TEXT = 'TEXT'; diff --git a/test/Unit/Language/Lexer/LexerTest.php b/test/Unit/Language/Lexer/LexerTest.php index 9cb8a03..da12dfc 100644 --- a/test/Unit/Language/Lexer/LexerTest.php +++ b/test/Unit/Language/Lexer/LexerTest.php @@ -22,7 +22,6 @@ namespace PackageFactory\ComponentEngine\Test\Unit\Language\Lexer; -use AssertionError; use PackageFactory\ComponentEngine\Language\Lexer\Lexer; use PackageFactory\ComponentEngine\Language\Lexer\LexerException; use PackageFactory\ComponentEngine\Language\Lexer\Token\Token; @@ -64,6 +63,8 @@ protected function assertThrowsLexerException(callable $fn, LexerException $expe */ public static function singleTokenExamples(): iterable { + yield ($source = '#') => + [$source, TokenType::COMMENT]; yield ($source = '# This is a comment') => [$source, TokenType::COMMENT]; yield ($source = '# ๐ŸŒต๐Ÿ†šโŒš๏ธ: Multi-byte characters are not a problem inside a comment.') => @@ -170,9 +171,9 @@ public static function singleTokenExamples(): iterable yield ($source = '||') => [$source, TokenType::SYMBOL_BOOLEAN_OR]; yield ($source = '===') => - [$source, TokenType::SYMBOL_STRICT_EQUALs]; + [$source, TokenType::SYMBOL_STRICT_EQUALS]; yield ($source = '!==') => - [$source, TokenType::SYMBOL_NOT_EQUALs]; + [$source, TokenType::SYMBOL_NOT_EQUALS]; yield ($source = '>=') => [$source, TokenType::SYMBOL_GREATER_THAN_OR_EQUAL]; yield ($source = '<=') => @@ -191,9 +192,9 @@ public static function singleTokenExamples(): iterable yield ($source = '1245ValidWord') => [$source, TokenType::WORD]; - yield ($source = 'Just some text. Nothing special.') => + yield ($source = 'JustSomeText.TextTerminates-Only-At??Space//Characters.') => [$source, TokenType::TEXT]; - yield ($source = '๐ŸŒต๐Ÿ†šโŒš๏ธ: Multi-byte characters are not a problem inside of text.') => + yield ($source = '๐ŸŒต๐Ÿ†šโŒš๏ธ') => [$source, TokenType::TEXT]; yield ($source = ' ') => @@ -218,11 +219,63 @@ public static function singleTokenExamples(): iterable * @param TokenType $expectedTokenType * @return void */ - public function readsSingleToken(string $source, TokenType $expectedTokenType): void + public function readSavesTokenOfGivenTypeIfMatchIsFound(string $source, TokenType $expectedTokenType): void { $lexer = new Lexer($source); $lexer->read($expectedTokenType); + $this->assertEquals( + $expectedTokenType, + $lexer->getTokenTypeUnderCursor() + ); + + $this->assertEquals( + new Position(0, 0), + $lexer->getStartPosition() + ); + + $this->assertEquals( + new Position(0, \mb_strlen($source) - 1), + $lexer->getEndPosition() + ); + + $this->assertEquals( + new Token( + rangeInSource: self::range([0, 0], [0, \mb_strlen($source) - 1]), + type: $expectedTokenType, + value: $source + ), + $lexer->getTokenUnderCursor() + ); + } + + /** + * @dataProvider singleTokenExamples + * @test + * @param string $source + * @param TokenType $expectedTokenType + * @return void + */ + public function readOneOfSavesTokenOfGivenTypeIfMatchIsFound(string $source, TokenType $expectedTokenType): void + { + $lexer = new Lexer($source); + $lexer->readOneOf(TokenTypes::from($expectedTokenType)); + + $this->assertEquals( + $expectedTokenType, + $lexer->getTokenTypeUnderCursor() + ); + + $this->assertEquals( + new Position(0, 0), + $lexer->getStartPosition() + ); + + $this->assertEquals( + new Position(0, \mb_strlen($source) - 1), + $lexer->getEndPosition() + ); + $this->assertEquals( new Token( rangeInSource: self::range([0, 0], [0, \mb_strlen($source) - 1]), @@ -422,8 +475,8 @@ public static function multipleTokensExamples(): iterable ]; $source = << inside. + ThisIsSomeText-with-expressions{} + line-breaks, spaces andTags<>inside. AFX; yield $source => [ $source, @@ -431,18 +484,23 @@ public static function multipleTokensExamples(): iterable TokenType::TEXT, TokenType::BRACKET_CURLY_OPEN, TokenType::BRACKET_CURLY_CLOSE, + TokenType::SPACE, TokenType::END_OF_LINE, TokenType::BRACKET_ANGLE_OPEN, TokenType::BRACKET_ANGLE_CLOSE ), - new Token(self::range([0, 0], [0, 34]), TokenType::TEXT, 'This is some text with expressions '), - new Token(self::range([0, 35], [0, 35]), TokenType::BRACKET_CURLY_OPEN, '{'), - new Token(self::range([0, 36], [0, 36]), TokenType::BRACKET_CURLY_CLOSE, '}'), - new Token(self::range([0, 37], [0, 37]), TokenType::END_OF_LINE, "\n"), - new Token(self::range([1, 0], [1, 8]), TokenType::TEXT, 'and tags '), - new Token(self::range([1, 9], [1, 9]), TokenType::BRACKET_ANGLE_OPEN, '<'), - new Token(self::range([1, 10], [1, 10]), TokenType::BRACKET_ANGLE_CLOSE, '>'), - new Token(self::range([1, 11], [1, 18]), TokenType::TEXT, ' inside.'), + new Token(self::range([0, 0], [0, 30]), TokenType::TEXT, 'ThisIsSomeText-with-expressions'), + new Token(self::range([0, 31], [0, 31]), TokenType::BRACKET_CURLY_OPEN, '{'), + new Token(self::range([0, 32], [0, 32]), TokenType::BRACKET_CURLY_CLOSE, '}'), + new Token(self::range([0, 33], [0, 33]), TokenType::END_OF_LINE, "\n"), + new Token(self::range([1, 0], [1, 11]), TokenType::TEXT, 'line-breaks,'), + new Token(self::range([1, 12], [1, 14]), TokenType::SPACE, ' '), + new Token(self::range([1, 15], [1, 20]), TokenType::TEXT, 'spaces'), + new Token(self::range([1, 21], [1, 23]), TokenType::SPACE, ' '), + new Token(self::range([1, 24], [1, 30]), TokenType::TEXT, 'andTags'), + new Token(self::range([1, 31], [1, 31]), TokenType::BRACKET_ANGLE_OPEN, '<'), + new Token(self::range([1, 32], [1, 32]), TokenType::BRACKET_ANGLE_CLOSE, '>'), + new Token(self::range([1, 33], [1, 39]), TokenType::TEXT, 'inside.'), ]; } @@ -453,7 +511,7 @@ public static function multipleTokensExamples(): iterable * @param Token ...$expectedTokens * @return void */ - public function readsMultipleTokens( + public function testReadOneOfWithMultipleTokenTypes( string $source, TokenTypes $tokenTypes, Token ...$expectedTokens @@ -504,7 +562,6 @@ public static function failingSingleTokenExamples(): iterable yield from $example(TokenType::STRING_LITERAL_DELIMITER, '\'', '\''); yield from $example(TokenType::STRING_LITERAL_CONTENT, '"', '"'); - yield from $example(TokenType::STRING_LITERAL_CONTENT, "\n", "\n"); yield from $example(TokenType::STRING_LITERAL_CONTENT, '\\', '\\'); yield from $example(TokenType::INTEGER_BINARY, '001001', '00'); @@ -564,8 +621,8 @@ public static function failingSingleTokenExamples(): iterable yield from $example(TokenType::SYMBOL_PIPE, '๐ŸŒต', '๐ŸŒต'); yield from $example(TokenType::SYMBOL_BOOLEAN_AND, 'ยงยง', 'ยง'); yield from $example(TokenType::SYMBOL_BOOLEAN_OR, '//', '/'); - yield from $example(TokenType::SYMBOL_STRICT_EQUALs, '!==', '!'); - yield from $example(TokenType::SYMBOL_NOT_EQUALs, '===', '='); + yield from $example(TokenType::SYMBOL_STRICT_EQUALS, '!==', '!'); + yield from $example(TokenType::SYMBOL_NOT_EQUALS, '===', '='); yield from $example(TokenType::SYMBOL_GREATER_THAN_OR_EQUAL, '=>', '='); yield from $example(TokenType::SYMBOL_LESS_THAN_OR_EQUAL, '=<', '='); yield from $example(TokenType::SYMBOL_ARROW_SINGLE, '=>', '='); @@ -875,45 +932,6 @@ public function skipsSpaceAndComments(): void ); } - /** - * @return iterable - */ - public static function illegalOperationsAfterFailureExamples(): iterable - { - yield [fn (Lexer $lexer) => $lexer->read(TokenType::KEYWORD_IMPORT)]; - yield [ - fn (Lexer $lexer) => $lexer->readOneOf( - TokenTypes::from( - TokenType::KEYWORD_IMPORT, - TokenType::KEYWORD_NULL, - TokenType::SYMBOL_ARROW_SINGLE, - TokenType::BRACKET_ANGLE_CLOSE, - ) - ) - ]; - yield [fn (Lexer $lexer) => $lexer->skipSpace()]; - yield [fn (Lexer $lexer) => $lexer->skipSpaceAndComments()]; - yield [fn (Lexer $lexer) => $lexer->getTokenUnderCursor()]; - } - - /** - * @dataProvider illegalOperationsAfterFailureExamples - * @test - * @param callable $operation - * @return void - */ - public function cannotBeReusedAfterFailure(callable $operation): void - { - $lexer = new Lexer('import'); - try { - $lexer->read(TokenType::SYMBOL_BOOLEAN_AND); - } catch (LexerException $e) { - } - - $this->expectException(AssertionError::class); - $operation($lexer); - } - /** * @test */