Skip to content

Commit

Permalink
feat: add symbols and identifier lexer
Browse files Browse the repository at this point in the history
  • Loading branch information
vighnesh153 committed Feb 7, 2024
1 parent 03b6f0b commit 30d0bce
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,91 @@ class Lexer private constructor(
}
}

fun Lexer.readNextCharacter() {
currentCharacter = if (peekIndex >= input.length)
Char.MIN_VALUE
else
input[peekIndex]
currentIndex = peekIndex
peekIndex += 1
/**
 * Lexes and returns the next token from the input.
 *
 * Single-character symbols consume exactly their own character. Identifiers
 * are consumed by [readIdentifier] and returned without a trailing advance
 * (see comment in the else-branch). Any other character produces an ILLEGAL
 * token and is skipped.
 */
fun Lexer.nextToken(): Token {
    skipWhitespace()

    // todo: add row and column number in the token

    // Builds a token for a single-character symbol from its TokenType.
    fun symbolToken(tokenType: TokenType) =
        Token(tokenType = tokenType, tokenLiteral = tokenType.value)

    val t = when (currentCharacter) {
        '=' -> symbolToken(TokenType.EQUALS)
        ',' -> symbolToken(TokenType.COMMA)
        '+' -> symbolToken(TokenType.PLUS)
        ';' -> symbolToken(TokenType.SEMICOLON)
        '@' -> symbolToken(TokenType.AT_SIGN)

        '-' -> symbolToken(TokenType.MINUS)
        '*' -> symbolToken(TokenType.ASTERISK)
        '/' -> symbolToken(TokenType.FORWARD_SLASH)
        '\\' -> symbolToken(TokenType.BACK_SLASH)
        '%' -> symbolToken(TokenType.MODULUS)
        '!' -> symbolToken(TokenType.BANG)
        '&' -> symbolToken(TokenType.AMPERSAND)
        '|' -> symbolToken(TokenType.VERTICAL_BAR)
        '^' -> symbolToken(TokenType.CARET)
        '?' -> symbolToken(TokenType.QUESTION)
        ':' -> symbolToken(TokenType.COLON)
        '.' -> symbolToken(TokenType.DOT)
        '~' -> symbolToken(TokenType.TILDE)
        '\'' -> symbolToken(TokenType.SINGLE_QUOTE)
        '"' -> symbolToken(TokenType.DOUBLE_QUOTE)
        '`' -> symbolToken(TokenType.BACKTICK)
        '(' -> symbolToken(TokenType.LEFT_PARENTHESIS)
        ')' -> symbolToken(TokenType.RIGHT_PARENTHESIS)
        '{' -> symbolToken(TokenType.LEFT_CURLY_BRACE)
        '}' -> symbolToken(TokenType.RIGHT_CURLY_BRACE)
        '[' -> symbolToken(TokenType.LEFT_SQUARE_BRACKET)
        ']' -> symbolToken(TokenType.RIGHT_SQUARE_BRACKET)

        '<' -> symbolToken(TokenType.LEFT_ANGLE_BRACKET)
        '>' -> symbolToken(TokenType.RIGHT_ANGLE_BRACKET)
        Char.MIN_VALUE -> Token.EOF
        else -> {
            if (currentCharacter.isAcceptableIdentifierStart()) {
                // Bug fix: readIdentifier() already leaves currentCharacter on the
                // first character AFTER the identifier, so we must return here.
                // Falling through to the readNextCharacter() call below would
                // silently drop that character (e.g. the ';' in "abc;").
                return Token(
                    tokenType = TokenType.IDENTIFIER,
                    tokenLiteral = readIdentifier()
                )
            }
            // todo: numeric literals (integer/float/double) are not lexed yet;
            // for now any remaining character (including digits) is ILLEGAL.
            Token(tokenType = TokenType.ILLEGAL, tokenLiteral = "")
        }
    }

    readNextCharacter()

    return t
}

// NOTE(review): stale diff residue — this is the pre-commit stub of nextToken().
// The full implementation above has this exact signature, so keeping both
// declarations will not compile; presumably only the commit-page scrape
// retained this removed hunk. Confirm against the actual repository state.
fun Lexer.nextToken(): Token {
    return Token(tokenType = TokenType.EOF, tokenLiteral = "")
}
/** True when this character is the underscore character `_`. */
fun Char.isUnderscore(): Boolean {
    return this == '_'
}
/** True when this character may begin an identifier: a letter or an underscore. */
fun Char.isAcceptableIdentifierStart(): Boolean = this == '_' || this.isLetter()
/** True when this character may appear after the first character of an identifier: letter, digit, or underscore. */
fun Char.isAcceptableIdentifierNonStart(): Boolean = this == '_' || this.isLetterOrDigit()

/**
 * Consumes an identifier (`_` or letter, followed by letters/digits/underscores)
 * starting at the current character and returns its text. On return,
 * currentCharacter rests on the first character that is not part of the
 * identifier — callers must not advance again before inspecting it.
 */
fun Lexer.readIdentifier(): String {
    if (!currentCharacter.isAcceptableIdentifierStart()) {
        throw Error("You should not attempt to read an identifier which doesn't start with '_' or a letter")
    }
    val identifierStart = currentIndex
    while (currentCharacter.isAcceptableIdentifierNonStart()) {
        readNextCharacter()
    }
    return input.substring(identifierStart, currentIndex)
}

/** Returns the character at peekIndex without consuming it, or Char.MIN_VALUE at end of input. */
fun Lexer.peekCharacter(): Char =
    if (peekIndex < input.length) input[peekIndex] else Char.MIN_VALUE

/** Advances the lexer past any run of spaces, tabs, newlines and carriage returns. */
fun Lexer.skipWhitespace() {
    while (currentCharacter in " \t\n\r") {
        readNextCharacter()
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package interpreters.javalang.common.lexer

/**
 * Consumes one character: currentCharacter becomes the character at peekIndex
 * (Char.MIN_VALUE once past the end of input) and both cursors move forward
 * by one position.
 */
fun Lexer.readNextCharacter() {
    val nextIndex = peekIndex
    currentCharacter = if (nextIndex < input.length) input[nextIndex] else Char.MIN_VALUE
    currentIndex = nextIndex
    peekIndex = nextIndex + 1
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,8 @@ package interpreters.javalang.common.tokens
/**
 * A single lexical token: its [TokenType] plus the literal text it was lexed from.
 */
data class Token(
    val tokenType: TokenType,
    val tokenLiteral: String
) {
    companion object {
        // Shared EOF sentinel so the lexer does not allocate a fresh token per EOF.
        val EOF = Token(tokenType = TokenType.EOF, tokenLiteral = "")
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ enum class TokenType(val value: String) {
ASTERISK_EQUALS("*="),
FORWARD_SLASH("/"),
FORWARD_SLASH_EQUALS("/="),
// Fixed: the back-slash literal is "\\" — the original "/" was a copy/paste
// of FORWARD_SLASH's value, so BACK_SLASH tokens carried the wrong character.
BACK_SLASH("\\"),
MODULUS("%"),
MODULUS_EQUALS("%="),
BANG("!"),
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package interpreters.javalang.common.lexer

import interpreters.javalang.common.tokens.Token
import interpreters.javalang.common.tokens.TokenType
import kotlin.test.Test
import kotlin.test.assertEquals
Expand All @@ -8,15 +9,78 @@ class LexerTest {
@Test
fun lexerNextToken() {
val input = """
5,;@
,;@+-*/\%!&|^?:.~'"`
(){}[]<>
abc _aa a123 __11 _
""".trimIndent()

val expectedTokens = listOf(
ExpectedToken(id = 0, tokenType = TokenType.INTEGER_LITERAL, tokenLiteral = "5"),
ExpectedToken(id = 1, tokenType = TokenType.COMMA, tokenLiteral = ","),
ExpectedToken(id = 2, tokenType = TokenType.SEMICOLON, tokenLiteral = ";"),
ExpectedToken(id = 3, tokenType = TokenType.AT_SIGN, tokenLiteral = "@"),
ExpectedToken(id = 4, tokenType = TokenType.EOF, tokenLiteral = ""),
ExpectedToken(id = 0, tokenType = TokenType.COMMA, tokenLiteral = TokenType.COMMA.value),
ExpectedToken(id = 1, tokenType = TokenType.SEMICOLON, tokenLiteral = TokenType.SEMICOLON.value),
ExpectedToken(id = 2, tokenType = TokenType.AT_SIGN, tokenLiteral = TokenType.AT_SIGN.value),
ExpectedToken(id = 3, tokenType = TokenType.PLUS, tokenLiteral = TokenType.PLUS.value),
ExpectedToken(id = 4, tokenType = TokenType.MINUS, tokenLiteral = TokenType.MINUS.value),
ExpectedToken(id = 5, tokenType = TokenType.ASTERISK, tokenLiteral = TokenType.ASTERISK.value),
ExpectedToken(id = 6, tokenType = TokenType.FORWARD_SLASH, tokenLiteral = TokenType.FORWARD_SLASH.value),
ExpectedToken(id = 7, tokenType = TokenType.BACK_SLASH, tokenLiteral = TokenType.BACK_SLASH.value),
ExpectedToken(id = 8, tokenType = TokenType.MODULUS, tokenLiteral = TokenType.MODULUS.value),
ExpectedToken(id = 9, tokenType = TokenType.BANG, tokenLiteral = TokenType.BANG.value),
ExpectedToken(id = 10, tokenType = TokenType.AMPERSAND, tokenLiteral = TokenType.AMPERSAND.value),
ExpectedToken(id = 11, tokenType = TokenType.VERTICAL_BAR, tokenLiteral = TokenType.VERTICAL_BAR.value),
ExpectedToken(id = 12, tokenType = TokenType.CARET, tokenLiteral = TokenType.CARET.value),
ExpectedToken(id = 13, tokenType = TokenType.QUESTION, tokenLiteral = TokenType.QUESTION.value),
ExpectedToken(id = 14, tokenType = TokenType.COLON, tokenLiteral = TokenType.COLON.value),
ExpectedToken(id = 15, tokenType = TokenType.DOT, tokenLiteral = TokenType.DOT.value),
ExpectedToken(id = 16, tokenType = TokenType.TILDE, tokenLiteral = TokenType.TILDE.value),
ExpectedToken(id = 17, tokenType = TokenType.SINGLE_QUOTE, tokenLiteral = TokenType.SINGLE_QUOTE.value),
ExpectedToken(id = 18, tokenType = TokenType.DOUBLE_QUOTE, tokenLiteral = TokenType.DOUBLE_QUOTE.value),
ExpectedToken(id = 19, tokenType = TokenType.BACKTICK, tokenLiteral = TokenType.BACKTICK.value),
ExpectedToken(
id = 20,
tokenType = TokenType.LEFT_PARENTHESIS,
tokenLiteral = TokenType.LEFT_PARENTHESIS.value
),
ExpectedToken(
id = 21,
tokenType = TokenType.RIGHT_PARENTHESIS,
tokenLiteral = TokenType.RIGHT_PARENTHESIS.value
),
ExpectedToken(
id = 22,
tokenType = TokenType.LEFT_CURLY_BRACE,
tokenLiteral = TokenType.LEFT_CURLY_BRACE.value
),
ExpectedToken(
id = 23,
tokenType = TokenType.RIGHT_CURLY_BRACE,
tokenLiteral = TokenType.RIGHT_CURLY_BRACE.value
),
ExpectedToken(
id = 24,
tokenType = TokenType.LEFT_SQUARE_BRACKET,
tokenLiteral = TokenType.LEFT_SQUARE_BRACKET.value
),
ExpectedToken(
id = 25,
tokenType = TokenType.RIGHT_SQUARE_BRACKET,
tokenLiteral = TokenType.RIGHT_SQUARE_BRACKET.value
),
ExpectedToken(
id = 26,
tokenType = TokenType.LEFT_ANGLE_BRACKET,
tokenLiteral = TokenType.LEFT_ANGLE_BRACKET.value
),
ExpectedToken(
id = 27,
tokenType = TokenType.RIGHT_ANGLE_BRACKET,
tokenLiteral = TokenType.RIGHT_ANGLE_BRACKET.value
),
ExpectedToken(id = 28, tokenType = TokenType.IDENTIFIER, tokenLiteral = "abc"),
ExpectedToken(id = 29, tokenType = TokenType.IDENTIFIER, tokenLiteral = "_aa"),
ExpectedToken(id = 30, tokenType = TokenType.IDENTIFIER, tokenLiteral = "a123"),
ExpectedToken(id = 31, tokenType = TokenType.IDENTIFIER, tokenLiteral = "__11"),
ExpectedToken(id = 32, tokenType = TokenType.IDENTIFIER, tokenLiteral = "_"),
ExpectedToken(id = 33, tokenType = Token.EOF.tokenType, tokenLiteral = Token.EOF.tokenLiteral),
)

// In the expectedTokens, if ids are not unique, throw error
Expand Down

0 comments on commit 30d0bce

Please sign in to comment.