Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tokenize whitespace #1570

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions Ast/include/Luau/Lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ struct Lexeme

Comment,
BlockComment,
Whitespace,

Attribute,

Expand Down Expand Up @@ -100,7 +101,7 @@ struct Lexeme
public:
union
{
const char* data; // String, Number, Comment
const char* data; // String, Number, Comment, Whitespace
const char* name; // Name
unsigned int codepoint; // BrokenUnicode
};
Expand Down Expand Up @@ -156,6 +157,7 @@ class Lexer
Lexer(const char* buffer, std::size_t bufferSize, AstNameTable& names, Position startPosition = {0, 0});

void setSkipComments(bool skip);
void setSkipWhitespace(bool skip);
void setReadNames(bool read);

const Location& previousLocation() const
Expand All @@ -164,7 +166,7 @@ class Lexer
}

const Lexeme& next();
const Lexeme& next(bool skipComments, bool updatePrevLocation);
const Lexeme& next(bool skipComments, bool skipWhitespace, bool updatePrevLocation);
void nextline();

Lexeme lookahead();
Expand Down Expand Up @@ -228,6 +230,7 @@ class Lexer
AstNameTable& names;

bool skipComments;
bool skipWhitespace;
bool readNames;

enum class BraceType
Expand Down
35 changes: 27 additions & 8 deletions Ast/src/Lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <limits.h>

LUAU_FASTFLAGVARIABLE(LexerResumesFromPosition2)
LUAU_FASTFLAGVARIABLE(LuauLexerTokenizesWhitespace)
namespace Luau
{

Expand Down Expand Up @@ -36,7 +37,7 @@ Lexeme::Lexeme(const Location& location, Type type, const char* data, size_t siz
{
LUAU_ASSERT(
type == RawString || type == QuotedString || type == InterpStringBegin || type == InterpStringMid || type == InterpStringEnd ||
type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment
type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment || type == Whitespace
);
}

Expand All @@ -53,7 +54,7 @@ unsigned int Lexeme::getLength() const
{
LUAU_ASSERT(
type == RawString || type == QuotedString || type == InterpStringBegin || type == InterpStringMid || type == InterpStringEnd ||
type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment
type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment || type == Whitespace
);

return length;
Expand Down Expand Up @@ -316,6 +317,7 @@ Lexer::Lexer(const char* buffer, size_t bufferSize, AstNameTable& names, Positio
)
, names(names)
, skipComments(false)
, skipWhitespace(true)
, readNames(true)
{
}
Expand All @@ -325,31 +327,39 @@ void Lexer::setSkipComments(bool skip)
skipComments = skip;
}

void Lexer::setSkipWhitespace(bool skip)
{
skipWhitespace = skip;
}

void Lexer::setReadNames(bool read)
{
readNames = read;
}

const Lexeme& Lexer::next()
{
return next(this->skipComments, true);
return next(this->skipComments, this->skipWhitespace, true);
}

const Lexeme& Lexer::next(bool skipComments, bool updatePrevLocation)
const Lexeme& Lexer::next(bool skipComments, bool skipWhitespace, bool updatePrevLocation)
{
// in skipComments mode we reject valid comments
do
{
// consume whitespace before the token
while (isSpace(peekch()))
consumeAny();
if (!FFlag::LuauLexerTokenizesWhitespace)
{
// consume whitespace before the token
while (isSpace(peekch()))
consumeAny();
}

if (updatePrevLocation)
prevLocation = lexeme.location;

lexeme = readNext();
updatePrevLocation = false;
} while (skipComments && (lexeme.type == Lexeme::Comment || lexeme.type == Lexeme::BlockComment));
} while ((skipComments && (lexeme.type == Lexeme::Comment || lexeme.type == Lexeme::BlockComment)) || (skipWhitespace && lexeme.type == Lexeme::Whitespace));

return lexeme;
}
Expand Down Expand Up @@ -967,6 +977,15 @@ Lexeme Lexer::readNext()

return Lexeme(Location(start, position()), name.second, name.first.value);
}
else if (FFlag::LuauLexerTokenizesWhitespace && isSpace(peekch()))
{
size_t startOffset = offset;

while (isSpace(peekch()))
consumeAny();

return Lexeme(Location(start, position()), Lexeme::Whitespace, &buffer[startOffset], offset - startOffset);
}
else if (peekch() & 0x80)
{
return readUtf8Error();
Expand Down
8 changes: 4 additions & 4 deletions Ast/src/Parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3572,13 +3572,13 @@ AstTypeError* Parser::reportMissingTypeError(const Location& parseErrorLocation,

void Parser::nextLexeme()
{
Lexeme::Type type = lexer.next(/* skipComments= */ false, true).type;
Lexeme::Type type = lexer.next(/* skipComments= */ false, /* skipWhitespace= */ false, true).type;

while (type == Lexeme::BrokenComment || type == Lexeme::Comment || type == Lexeme::BlockComment)
while (type == Lexeme::BrokenComment || type == Lexeme::Comment || type == Lexeme::BlockComment || type == Lexeme::Whitespace)
{
const Lexeme& lexeme = lexer.current();

if (options.captureComments)
if (options.captureComments && type != Lexeme::Whitespace)
commentLocations.push_back(Comment{lexeme.type, lexeme.location});

// Subtlety: Broken comments are weird because we record them as comments AND pass them to the parser as a lexeme.
Expand All @@ -3598,7 +3598,7 @@ void Parser::nextLexeme()
hotcomments.push_back({hotcommentHeader, lexeme.location, std::string(text + 1, text + end)});
}

type = lexer.next(/* skipComments= */ false, /* updatePrevLocation= */ false).type;
type = lexer.next(/* skipComments= */ false, /* skipWhitespace= */ false, /* updatePrevLocation= */ false).type;
}
}

Expand Down
64 changes: 64 additions & 0 deletions tests/Lexer.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

using namespace Luau;

LUAU_FASTFLAG(LuauLexerTokenizesWhitespace)

TEST_SUITE_BEGIN("LexerTests");

TEST_CASE("broken_string_works")
Expand Down Expand Up @@ -242,4 +244,66 @@ TEST_CASE("string_interpolation_with_unicode_escape")
CHECK_EQ(lexer.next().type, Lexeme::Eof);
}

// With whitespace tokenization enabled and skipping turned off, every run of
// spaces between tokens must come back as its own Whitespace lexeme.
TEST_CASE("lexer_tokenizes_whitespace")
{
    ScopedFastFlag sff{FFlag::LuauLexerTokenizesWhitespace, true};

    const std::string testInput = "local x = 1";
    Luau::Allocator alloc;
    AstNameTable table(alloc);
    Lexer lexer(testInput.c_str(), testInput.size(), table);
    lexer.setSkipWhitespace(false);

    // Rebuilds the exact source text a data-carrying lexeme covers.
    const auto textOf = [](const Lexeme& lexeme)
    {
        return std::string(lexeme.data, lexeme.getLength());
    };

    // "local", " ", "x"
    CHECK_EQ(lexer.next().type, Lexeme::ReservedLocal);
    CHECK_EQ(lexer.next().type, Lexeme::Whitespace);
    CHECK_EQ(lexer.next().type, Lexeme::Name);

    // The gap between the name and '=' is a single space.
    const Lexeme gapBeforeEquals = lexer.next();
    CHECK_EQ(gapBeforeEquals.type, Lexeme::Whitespace);
    CHECK_EQ(textOf(gapBeforeEquals), std::string(" "));

    CHECK_EQ(lexer.next().type, '=');

    // The gap between '=' and the number is a single space as well.
    const Lexeme gapAfterEquals = lexer.next();
    CHECK_EQ(gapAfterEquals.type, Lexeme::Whitespace);
    CHECK_EQ(textOf(gapAfterEquals), std::string(" "));

    CHECK_EQ(lexer.next().type, Lexeme::Number);
    CHECK_EQ(lexer.next().type, Lexeme::Eof);
}

// Whitespace runs that span line breaks must be returned as a single
// Whitespace lexeme whose text includes the newlines and the indentation
// that follows them.
TEST_CASE("lexer_tokenizes_multiline_whitespace")
{
    ScopedFastFlag sff{FFlag::LuauLexerTokenizesWhitespace, true};

    // The input is spelled with explicit escapes rather than a raw string so
    // that the significant whitespace (two newlines + one space before `y`,
    // one newline + one space at the end) is visible here and cannot be
    // altered by re-indenting this file. It matches the expectations below
    // character for character.
    const std::string testInput = "local x\n\n y = 2\n ";
    Luau::Allocator alloc;
    AstNameTable table(alloc);
    Lexer lexer(testInput.c_str(), testInput.size(), table);
    lexer.setSkipWhitespace(false);

    CHECK_EQ(lexer.next().type, Lexeme::ReservedLocal);
    CHECK_EQ(lexer.next().type, Lexeme::Whitespace);
    CHECK_EQ(lexer.next().type, Lexeme::Name);

    // Blank line plus the indentation of the next statement is one lexeme.
    auto multilineSpace = lexer.next();
    CHECK_EQ(multilineSpace.type, Lexeme::Whitespace);
    CHECK_EQ(std::string(multilineSpace.data, multilineSpace.getLength()), std::string("\n\n "));

    CHECK_EQ(lexer.next().type, Lexeme::Name);
    CHECK_EQ(lexer.next().type, Lexeme::Whitespace);
    CHECK_EQ(lexer.next().type, '=');
    CHECK_EQ(lexer.next().type, Lexeme::Whitespace);
    CHECK_EQ(lexer.next().type, Lexeme::Number);

    // Trailing newline and trailing space are reported before Eof.
    auto multilineSpace2 = lexer.next();
    CHECK_EQ(multilineSpace2.type, Lexeme::Whitespace);
    CHECK_EQ(std::string(multilineSpace2.data, multilineSpace2.getLength()), std::string("\n "));

    CHECK_EQ(lexer.next().type, Lexeme::Eof);
}

TEST_SUITE_END();
Loading