diff --git a/R/register.R b/R/register.R
index a7a1863e0..2e4c963d1 100644
--- a/R/register.R
+++ b/R/register.R
@@ -1,3 +1,13 @@
+# Re-encode a character vector as UTF-8. Elements that are not valid UTF-8
+# come back as NA; genuine NA inputs stay NA.
+maybe_encode_to_utf_8 <- function(a) {
+  iconv(enc2utf8(a), from = "UTF-8", to = "UTF-8")
+}
+
+is_not_na <- function(x) {
+  !is.na(x)
+}
+
 # helper to clean up non-utf and posixlt vectors
 encode_values <- function(value) {
   if (!is.null(names(value))) {
@@ -5,7 +15,19 @@ encode_values <- function(value) {
   }
 
   is_character <- vapply(value, is.character, logical(1))
+  # A value is invalid only when re-encoding turns a non-NA string into NA;
+  # genuine NA_character_ entries must still be accepted.
+  has_invalid_utf8 <- vapply(
+    value[is_character],
+    function(x) any(is_not_na(x) & is.na(maybe_encode_to_utf_8(x))),
+    logical(1)
+  )
+  if (any(has_invalid_utf8)) {
+    stop("not all values in the provided df are valid utf8")
+  }
+
   value[is_character] <- lapply(value[is_character], enc2utf8)
+
   is_factor <- vapply(value, is.factor, logical(1))
   value[is_factor] <- lapply(value[is_factor], function(x) {
     levels(x) <- enc2utf8(levels(x))
diff --git a/src/scan.cpp b/src/scan.cpp
index 14bc7581c..d53241c35 100644
--- a/src/scan.cpp
+++ b/src/scan.cpp
@@ -1,8 +1,8 @@
+#include "duckdb/main/client_context.hpp"
+#include "duckdb/third_party/utf8proc/include/utf8proc.hpp"
 #include "rapi.hpp"
 #include "typesr.hpp"
 
-#include "duckdb/main/client_context.hpp"
-
 using namespace duckdb;
 using namespace cpp11;
 
@@ -145,7 +145,6 @@ void AppendAnyColumnSegment(const RType &rtype, bool experimental, data_ptr_t co
 	}
 	case RType::STRING: {
 		auto data_ptr = (SEXP *)coldata_ptr;
-
 		if (experimental) {
 			D_ASSERT(v.GetType().id() == LogicalTypeId::POINTER);
 			AppendColumnSegment(data_ptr, sexp_offset, v, this_count);
diff --git a/tests/testthat/test_strings.R b/tests/testthat/test_strings.R
new file mode 100644
index 000000000..4d0fe27f2
--- /dev/null
+++ b/tests/testthat/test_strings.R
@@ -0,0 +1,22 @@
+library(duckdb)
+
+test_that("Invalid unicode produces an error", {
+  # this doesn't throw an error on old releases of R.
+  skip_if(getRversion() <= "4.2.3")
+  con <- DBI::dbConnect(duckdb::duckdb())
+
+  my_df <- structure(list(no_municipio_esc = "Est\xe2ncia", no_municipio_prova = "Est\xe2ncia"), row.names = 16L, class = "data.frame")
+  expect_error(dbWriteTable(con, "my_table", my_df))
+
+  # an invalid string must also be caught when it follows a non-character column.
+  mixed_df <- structure(list(id = 1L, name = "Est\xe2ncia"), row.names = 1L, class = "data.frame")
+  expect_error(dbWriteTable(con, "mixed_table", mixed_df))
+
+  # missing strings are not invalid UTF-8 and must still be writable.
+  na_df <- data.frame(id = 1:2, name = c("a", NA), stringsAsFactors = FALSE)
+  expect_error(dbWriteTable(con, "na_table", na_df), NA)
+
+  # test that the connection still works.
+  dbWriteTable(con, "myTable", data.frame(a = c(1, 2, 3), b = c(4, 5, 6)))
+  DBI::dbDisconnect(con, shutdown = TRUE)
+})