Skip to content

Commit

Permalink
lexer: emit comment and template statement block tokens
Browse files Browse the repository at this point in the history
Tweak the token stream reported by the lexer in order to make it more useful
for alternative, non-compilation downstream parse processes such as code
intelligence gathering within a language server implementation.

 - Instead of silently discarding source code comments in the lexing phase,
   emit TK_COMMENT tokens, which are useful e.g. for parsing type annotations
   and other structured information.

 - Do not silently discard TK_LSTM tokens but report them to downstream
   parsers instead.

 - Do not silently emit TK_RSTM tokens as TK_SCOL but report them as-is to
   downstream parsers.

 - Adjust the byte code compiler to properly deal with the changed token
   reporting by discarding incoming TK_COMMENT and TK_LSTM tokens and by
   remapping read TK_RSTM tokens to the TK_SCOL type.

Signed-off-by: Jo-Philipp Wich <[email protected]>
  • Loading branch information
jow- committed Sep 23, 2024
1 parent 328a50f commit 855854f
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 8 deletions.
12 changes: 11 additions & 1 deletion compiler.c
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,17 @@ uc_compiler_parse_advance(uc_compiler_t *compiler)
compiler->parser->prev = compiler->parser->curr;

while (true) {
compiler->parser->curr = *uc_lexer_next_token(&compiler->parser->lex);
uc_token_t *tok = uc_lexer_next_token(&compiler->parser->lex);

if (tok->type == TK_COMMENT || tok->type == TK_LSTM) {
ucv_put(tok->uv);
continue;
}
else if (tok->type == TK_RSTM) {
tok->type = TK_SCOL;
}

compiler->parser->curr = *tok;

if (compiler->parser->curr.type != TK_ERROR)
break;
Expand Down
2 changes: 2 additions & 0 deletions include/ucode/lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,12 +121,14 @@ typedef enum {
TK_EXPORT,

TK_EOF,
TK_COMMENT,
TK_ERROR
} uc_tokentype_t;

typedef enum {
UC_LEX_IDENTIFY_BLOCK,
UC_LEX_BLOCK_EXPRESSION_EMIT_TAG,
UC_LEX_BLOCK_STATEMENT_EMIT_TAG,
UC_LEX_BLOCK_COMMENT,
UC_LEX_IDENTIFY_TOKEN,
UC_LEX_PLACEHOLDER_START,
Expand Down
26 changes: 19 additions & 7 deletions lexer.c
Original file line number Diff line number Diff line change
Expand Up @@ -174,16 +174,23 @@ emit_buffer(uc_lexer_t *lex, ssize_t pos, int type, const char *strip_trailing_c
static uc_token_t *
parse_comment(uc_lexer_t *lex, int kind)
{
size_t off = lex->source->off - 1;
int ch;

uc_vector_push(&lex->buffer, '/');

while (true) {
ch = next_char(lex);

uc_vector_push(&lex->buffer, ch);

if (kind == '/' && (ch == '\n' || ch == EOF))
break;

if (kind == '*' && ch == '*' && check_char(lex, '/'))
if (kind == '*' && ch == '*' && check_char(lex, '/')) {
uc_vector_push(&lex->buffer, '/');
break;
}

if (ch == EOF) {
lex->state = UC_LEX_EOF;
Expand All @@ -192,7 +199,7 @@ parse_comment(uc_lexer_t *lex, int kind)
}
}

return NULL;
return emit_buffer(lex, off, TK_COMMENT, NULL);
}

static void
Expand Down Expand Up @@ -957,8 +964,7 @@ lex_step(uc_lexer_t *lex)

/* found start of statement block */
case '%':
lex->state = UC_LEX_IDENTIFY_TOKEN;
lex->block = STATEMENTS;
lex->state = UC_LEX_BLOCK_STATEMENT_EMIT_TAG;

if (check_char(lex, '-'))
strip = " \n\t\v\f\r";
Expand Down Expand Up @@ -1019,18 +1025,24 @@ lex_step(uc_lexer_t *lex)
return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block"));
}

tok = emit_op(lex, lex->lastoff, TK_COMMENT, NULL);

lex->lastoff = lex->source->off;
lex->state = UC_LEX_IDENTIFY_BLOCK;

continue;

return tok;

case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG:
lex->state = UC_LEX_IDENTIFY_TOKEN;
lex->block = EXPRESSION;

return emit_op(lex, lex->source->off - 2, TK_LEXP, NULL);

case UC_LEX_BLOCK_STATEMENT_EMIT_TAG:
lex->state = UC_LEX_IDENTIFY_TOKEN;
lex->block = STATEMENTS;

return emit_op(lex, lex->source->off - 2, TK_LSTM, NULL);

case UC_LEX_IDENTIFY_TOKEN:
do { tok = lex_find_token(lex); } while (tok == NULL);
Expand All @@ -1049,7 +1061,7 @@ lex_step(uc_lexer_t *lex)
lex->state = UC_LEX_IDENTIFY_BLOCK;
lex->block = NONE;

tok = emit_op(lex, -2, TK_SCOL, NULL);
tok = emit_op(lex, -2, TK_RSTM, NULL);
}

/* found end of expression block */
Expand Down

0 comments on commit 855854f

Please sign in to comment.