Skip to content

Commit

Permalink
Add multiple language support, markdown
Browse files Browse the repository at this point in the history
- return end positions, we'll need them for markdown
  parsing
- add markdown
- add markdown_inline
  • Loading branch information
gaborcsardi committed Oct 20, 2024
1 parent 61ad03e commit c38ecc0
Show file tree
Hide file tree
Showing 23 changed files with 153,756 additions and 1,625 deletions.
8 changes: 6 additions & 2 deletions R/scan-deps.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ scan_path_deps <- function(path) {
cache <- get_deps_cache_path(hash)
if (file.exists(cache)) {
deps <- readRDS(cache)
deps$path <- path
if (!is.null(deps) && nrow(deps) > 0) {
deps$path <- path
}
return(deps)
}

Expand All @@ -38,7 +40,9 @@ scan_path_deps <- function(path) {

# save it to the cache, but anonymize it first. If no deps, save NULL
deps_no_path <- deps
if (!is.null(deps_no_path)) deps_no_path$path <- ""
if (!is.null(deps_no_path) && nrow(deps_no_path) > 0) {
deps_no_path$path <- ""
}
dir.create(dirname(cache), showWarnings = FALSE, recursive = TRUE)
saveRDS(deps_no_path, cache)

Expand Down
22 changes: 15 additions & 7 deletions R/tree-sitter.R
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
s_expr <- function(code) {
ts_languages <- c(R = 0L, markdown = 1L, "markdown-inline" = 2L)

s_expr <- function(code, language = c("R", "markdown", "markdown-inline")) {
language <- ts_languages[match.arg(language)]
if (is.character(code)) code <- charToRaw(paste(code, collapse = "\n"))
call_with_cleanup(c_s_expr, code)
call_with_cleanup(c_s_expr, code, language)
}

code_query <- function(code = NULL, query, file = NULL) {
code_query <- function(code = NULL, query, file = NULL,
language = c("R", "markdown", "markdown-inline")) {
language <- ts_languages[match.arg(language)]
qlen <- nchar(query, type = "bytes") + 1L # + \n
qbeg <- c(1L, cumsum(qlen))
qnms <- names(query) %||% rep(NA_character_, length(query))
query1 <- paste0(query, "\n", collapse = "")

if (!is.null(code)) {
if (is.character(code)) code <- charToRaw(paste(code, collapse = "\n"))
res <- call_with_cleanup(c_code_query, code, query1)
res <- call_with_cleanup(c_code_query, code, query1, language)
} else {
res <- call_with_cleanup(c_code_query_path, file, query1)
res <- call_with_cleanup(c_code_query_path, file, query1, language)
}

qorig <- as.integer(cut(res[[1]][[3]], breaks = qbeg, include.lowest = TRUE))
Expand All @@ -30,8 +35,11 @@ code_query <- function(code = NULL, query, file = NULL) {
pattern = viapply(res[[2]], "[[", 1L),
match = viapply(res[[2]], "[[", 2L),
start_byte = viapply(res[[2]], "[[", 6L),
start_row = viapply(res[[2]], "[[", 7L),
start_column = viapply(res[[2]], "[[", 8L),
end_byte = viapply(res[[2]], "[[", 7L),
start_row = viapply(res[[2]], "[[", 8L),
start_column = viapply(res[[2]], "[[", 9L),
end_row = viapply(res[[2]], "[[", 10L),
end_column = viapply(res[[2]], "[[", 11L),
name = vcapply(res[[2]], "[[", 4L),
code = vcapply(res[[2]], "[[", 5L)
)
Expand Down
6 changes: 5 additions & 1 deletion src/Makevars
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@ PKG_CFLAGS = \
tree-sitter-files = \
tree-sitter/lib/src/lib.o \
tree-sitter/r/parser.o \
tree-sitter/r/scanner.o
tree-sitter/r/scanner.o \
tree-sitter/markdown/parser.o \
tree-sitter/markdown/scanner.o \
tree-sitter/markdown-inline/parser.o \
tree-sitter/markdown-inline/scanner.o

lib-files = \
init.o cleancall.o tree-sitter.o
Expand Down
6 changes: 3 additions & 3 deletions src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ SEXP s_expr(SEXP input);

static const R_CallMethodDef callMethods[] = {
CLEANCALL_METHOD_RECORD,
{ "code_query", (DL_FUNC) &code_query, 2 },
{ "code_query_path", (DL_FUNC) &code_query_path, 2 },
{ "s_expr", (DL_FUNC) &s_expr, 1 },
{ "code_query", (DL_FUNC) &code_query, 3 },
{ "code_query_path", (DL_FUNC) &code_query_path, 3 },
{ "s_expr", (DL_FUNC) &s_expr, 2 },
{ NULL, NULL, 0 }
};

Expand Down
70 changes: 50 additions & 20 deletions src/tree-sitter.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,51 @@
#include "cleancall.h"

#include "tree_sitter/api.h"
extern const TSLanguage* tree_sitter_r(void);
extern const TSLanguage *tree_sitter_r(void);
extern const TSLanguage *tree_sitter_markdown(void);
extern const TSLanguage *tree_sitter_markdown_inline(void);

static void r_free(void *data) {
free(data);
}

static const TSLanguage *rlang = NULL;
static const TSLanguage *r_lang = NULL;
static const TSLanguage *markdown_lang = NULL;
static const TSLanguage *markdown_inline_lang = NULL;

SEXP s_expr(SEXP input) {
if (rlang == NULL) {
rlang = tree_sitter_r();
/* Integer codes that select which tree-sitter grammar to use.
 * The R side passes one of these as an integer (see `ts_languages` in
 * R/tree-sitter.R: R = 0, markdown = 1, markdown-inline = 2), so the
 * numeric values here must stay in sync with that vector. */
enum ts_language_t {
/* R grammar; explicitly 0 so the implicit numbering below yields 1 and 2 */
TS_LANGUAGE_R = 0,
/* Markdown grammar (tree_sitter_markdown) */
TS_LANGUAGE_MARKDOWN,
/* Markdown inline grammar (tree_sitter_markdown_inline) */
TS_LANGUAGE_MARKDOWN_INLINE
};

/* Map an integer language code to its tree-sitter grammar.
 *
 * Each grammar is created on first use and cached in a file-level static
 * pointer (r_lang, markdown_lang, markdown_inline_lang), so the
 * tree_sitter_*() constructor runs at most once per grammar.
 *
 * @param code One of the TS_LANGUAGE_* values.
 * @return The TSLanguage for `code`. Does not return for an unknown code:
 *   an R error that reports the offending value is raised instead.
 */
static const TSLanguage *get_language(int code) {
  switch (code) {
  case TS_LANGUAGE_R:
    if (r_lang == NULL) {
      r_lang = tree_sitter_r();
    }
    return r_lang;
  case TS_LANGUAGE_MARKDOWN:
    if (markdown_lang == NULL) {
      markdown_lang = tree_sitter_markdown();
    }
    return markdown_lang;
  case TS_LANGUAGE_MARKDOWN_INLINE:
    if (markdown_inline_lang == NULL) {
      markdown_inline_lang = tree_sitter_markdown_inline();
    }
    return markdown_inline_lang;
  default:
    /* Rf_error() performs a long jump back into R and never returns. */
    Rf_error("Unknown tree-sitter language code: %d", code);
  }
}

SEXP s_expr(SEXP input, SEXP rlanguage) {
const TSLanguage *language = get_language(INTEGER(rlanguage)[0]);
TSParser *parser = NULL;
parser = ts_parser_new();
if (!ts_parser_set_language(parser, rlang)) {
if (!ts_parser_set_language(parser, language)) {
Rf_error("Failed to set R language, internal error.");
}
r_call_on_exit((cleanup_fn_t) ts_parser_delete, parser);
Expand Down Expand Up @@ -321,14 +350,11 @@ bool check_predicates(const struct query_match_t *qm) {
return true;
}

SEXP code_query_c(const char *c_input, uint32_t length, SEXP pattern) {
if (rlang == NULL) {
rlang = tree_sitter_r();
}

SEXP code_query_c(const char *c_input, uint32_t length, SEXP pattern, SEXP rlanguage) {
const TSLanguage *language = get_language(INTEGER(rlanguage)[0]);
TSParser *parser = NULL;
parser = ts_parser_new();
if (!ts_parser_set_language(parser, rlang)) {
if (!ts_parser_set_language(parser, language)) {
Rf_error("Failed to set R language, internal error.");
}
r_call_on_exit((cleanup_fn_t) ts_parser_delete, parser);
Expand All @@ -337,7 +363,7 @@ SEXP code_query_c(const char *c_input, uint32_t length, SEXP pattern) {
uint32_t error_offset;
TSQueryError error_type;
TSQuery *query = ts_query_new(
rlang,
language,
cpattern,
strlen(cpattern),
&error_offset,
Expand Down Expand Up @@ -439,7 +465,7 @@ SEXP code_query_c(const char *c_input, uint32_t length, SEXP pattern) {

// collect the results
for (uint16_t cc = 0; cc < match.capture_count; cc++) {
SEXP res1 = PROTECT(Rf_allocVector(VECSXP, 8));
SEXP res1 = PROTECT(Rf_allocVector(VECSXP, 11));
SET_VECTOR_ELT(result_captures, residx++, res1);
UNPROTECT(1);

Expand Down Expand Up @@ -468,9 +494,13 @@ SEXP code_query_c(const char *c_input, uint32_t length, SEXP pattern) {
CE_UTF8
)));
SET_VECTOR_ELT(res1, 5, Rf_ScalarInteger(start_byte + 1));
SET_VECTOR_ELT(res1, 6, Rf_ScalarInteger(end_byte + 1));
TSPoint start_point = ts_node_start_point(node);
SET_VECTOR_ELT(res1, 6, Rf_ScalarInteger(start_point.row + 1));
SET_VECTOR_ELT(res1, 7, Rf_ScalarInteger(start_point.column + 1));
SET_VECTOR_ELT(res1, 7, Rf_ScalarInteger(start_point.row + 1));
SET_VECTOR_ELT(res1, 8, Rf_ScalarInteger(start_point.column + 1));
TSPoint end_point = ts_node_end_point(node);
SET_VECTOR_ELT(res1, 9, Rf_ScalarInteger(end_point.row + 1));
SET_VECTOR_ELT(res1, 10, Rf_ScalarInteger(end_point.column + 1));
}
}

Expand All @@ -482,13 +512,13 @@ SEXP code_query_c(const char *c_input, uint32_t length, SEXP pattern) {
return result;
}

SEXP code_query(SEXP input, SEXP pattern) {
SEXP code_query(SEXP input, SEXP pattern, SEXP rlanguage) {
const char *c_input = (const char*) RAW(input);
uint32_t length = Rf_length(input);
return code_query_c(c_input, length, pattern);
return code_query_c(c_input, length, pattern, rlanguage);
}

SEXP code_query_path(SEXP path, SEXP pattern) {
SEXP code_query_path(SEXP path, SEXP pattern, SEXP rlanguage) {
const char *cpath = CHAR(STRING_ELT(path, 0));
FILE *fp = fopen(cpath, "rb");
if (fp == NULL) {
Expand All @@ -512,5 +542,5 @@ SEXP code_query_path(SEXP path, SEXP pattern) {
}
fclose(fp);

return code_query_c(buf, file_size, pattern);
return code_query_c(buf, file_size, pattern, rlanguage);
}
6,268 changes: 6,268 additions & 0 deletions src/tree-sitter/markdown-inline/grammar.json

Large diffs are not rendered by default.

Loading

0 comments on commit c38ecc0

Please sign in to comment.