diff --git a/DESCRIPTION b/DESCRIPTION index 3a5a1d6..0f45569 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -53,7 +53,8 @@ Imports: writexl, lifecycle, R.utils, - readr + readr, + nanoparquet Suggests: datasets, bit64, diff --git a/NEWS.md b/NEWS.md index 90d04a1..92aee45 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,8 @@ # rio 1.1.1.999 (development) * Fix lintr issues #434 (h/t @bisaloo Hugo Gruson) +* Drop support for R < 4.0.0 see #436 +* Add support for parquet in the import tier using `nanoparquet` see rio 1.0.1 below. Bug fixes diff --git a/R/export.R b/R/export.R index 5eea904..54765ae 100644 --- a/R/export.R +++ b/R/export.R @@ -32,7 +32,7 @@ #' \item Weka Attribute-Relation File Format (.arff), using [foreign::write.arff()] #' \item Fixed-width format data (.fwf), using [utils::write.table()] with `row.names = FALSE`, `quote = FALSE`, and `col.names = FALSE` #' \item [CSVY](https://github.com/csvy) (CSV with a YAML metadata header) using [data.table::fwrite()]. -#' \item Apache Arrow Parquet (.parquet), using [arrow::write_parquet()] +#' \item Apache Arrow Parquet (.parquet), using [nanoparquet::write_parquet()] #' \item Feather R/Python interchange format (.feather), using [arrow::write_feather()] #' \item Fast storage (.fst), using [fst::write.fst()] #' \item JSON (.json), using [jsonlite::toJSON()]. In this case, `x` can be a variety of R objects, based on class mapping conventions in this paper: [https://arxiv.org/abs/1403.2805](https://arxiv.org/abs/1403.2805). diff --git a/R/export_methods.R b/R/export_methods.R index 1d07a10..cebde95 100644 --- a/R/export_methods.R +++ b/R/export_methods.R @@ -282,7 +282,7 @@ export_delim <- function(file, x, fwrite = lifecycle::deprecated(), sep = "\t", #' @export .export.rio_parquet <- function(file, x, ...) { - .docall(arrow::write_parquet, ..., args = list(x = x, sink = file)) + .docall(nanoparquet::write_parquet, ..., args = list(x = x, file = file)) } #' @export diff --git a/R/import.R b/R/import.R index 81277fc..9465498 100644 --- a/R/import.R +++ b/R/import.R @@ -42,7 +42,7 @@ #' \item Fortran data (no recognized extension), using [utils::read.fortran()] #' \item Fixed-width format data (.fwf), using a faster version of [utils::read.fwf()] that requires a `widths` argument and by default in rio has `stringsAsFactors = FALSE` #' \item [CSVY](https://github.com/csvy) (CSV with a YAML metadata header) using [data.table::fread()]. -#' \item Apache Arrow Parquet (.parquet), using [arrow::read_parquet()] +#' \item Apache Arrow Parquet (.parquet), using [nanoparquet::read_parquet()] #' \item Feather R/Python interchange format (.feather), using [arrow::read_feather()] #' \item Fast storage (.fst), using [fst::read.fst()] #' \item JSON (.json), using [jsonlite::fromJSON()] diff --git a/R/import_methods.R b/R/import_methods.R index b4add3a..6bac032 100644 --- a/R/import_methods.R +++ b/R/import_methods.R @@ -413,8 +413,8 @@ extract_html_row <- function(x, empty_value) { #' @export .import.rio_parquet <- function(file, which = 1, ...) { - .check_pkg_availability("arrow") - .docall(arrow::read_parquet, ..., args = list(file = file, as_data_frame = TRUE)) + #.check_pkg_availability("arrow") + .docall(nanoparquet::read_parquet, ..., args = list(file = file, options = nanoparquet::parquet_options(class = "data.frame"))) } #' @export diff --git a/R/sysdata.rda b/R/sysdata.rda index 3007435..dbdb0b4 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/README.md b/README.md index 149cba6..0487f54 100644 --- a/README.md +++ b/README.md @@ -133,6 +133,7 @@ The full list of supported formats is below: | Gzip | gz / gzip | base | base | Default | | | Zip files | zip | utils | utils | Default | | | Ambiguous file format | dat | data.table | | Default | Attempt as delimited text data | +| Apache Arrow (Parquet) | parquet | nanoparquet | nanoparquet | Default | | | CSVY (CSV + YAML metadata header) | csvy | data.table | data.table | Default | | | Comma-separated data | csv | data.table | data.table | Default | | | Comma-separated data (European) | csv2 | data.table | data.table | Default | | @@ -159,7 +160,6 @@ The full list of supported formats is below: | Text Representations of R Objects | dump | base | base | Default | | | Weka Attribute-Relation File Format | arff / weka | foreign | foreign | Default | | | XBASE database files | dbf | foreign | foreign | Default | | -| Apache Arrow (Parquet) | parquet | arrow | arrow | Suggest | | | Clipboard | clipboard | clipr | clipr | Suggest | default is tsv | | EViews | eviews / wf1 | hexView | | Suggest | | | Fast Storage | fst | fst | fst | Suggest | | diff --git a/data-raw/single.json b/data-raw/single.json index 0af42e4..6aacab4 100644 --- a/data-raw/single.json +++ b/data-raw/single.json @@ -2,10 +2,10 @@ { "input": "parquet", "format": "parquet", - "type": "suggest", + "type": "import", "format_name": "Apache Arrow (Parquet)", - "import_function": "arrow::read_parquet", - "export_function": "arrow::write_parquet", + "import_function": "nanoparquet::read_parquet", + "export_function": "nanoparquet::write_parquet", "note": "" }, { diff --git a/man/export.Rd b/man/export.Rd index 82dc1c3..454274a 100644 --- a/man/export.Rd +++ b/man/export.Rd @@ -48,7 +48,7 @@ object (not just a data frame). \item Weka Attribute-Relation File Format (.arff), using \code{\link[foreign:write.arff]{foreign::write.arff()}} \item Fixed-width format data (.fwf), using \code{\link[utils:write.table]{utils::write.table()}} with \code{row.names = FALSE}, \code{quote = FALSE}, and \code{col.names = FALSE} \item \href{https://github.com/csvy}{CSVY} (CSV with a YAML metadata header) using \code{\link[data.table:fwrite]{data.table::fwrite()}}. -\item Apache Arrow Parquet (.parquet), using \code{\link[arrow:write_parquet]{arrow::write_parquet()}} +\item Apache Arrow Parquet (.parquet), using \code{\link[nanoparquet:write_parquet]{nanoparquet::write_parquet()}} \item Feather R/Python interchange format (.feather), using \code{\link[arrow:write_feather]{arrow::write_feather()}} \item Fast storage (.fst), using \code{\link[fst:write_fst]{fst::write.fst()}} \item JSON (.json), using \code{\link[jsonlite:fromJSON]{jsonlite::toJSON()}}. In this case, \code{x} can be a variety of R objects, based on class mapping conventions in this paper: \url{https://arxiv.org/abs/1403.2805}. diff --git a/man/import.Rd b/man/import.Rd index 9079b68..3dd10cf 100644 --- a/man/import.Rd +++ b/man/import.Rd @@ -65,7 +65,7 @@ object (not just a data frame). \item Fortran data (no recognized extension), using \code{\link[utils:read.fortran]{utils::read.fortran()}} \item Fixed-width format data (.fwf), using a faster version of \code{\link[utils:read.fwf]{utils::read.fwf()}} that requires a \code{widths} argument and by default in rio has \code{stringsAsFactors = FALSE} \item \href{https://github.com/csvy}{CSVY} (CSV with a YAML metadata header) using \code{\link[data.table:fread]{data.table::fread()}}. -\item Apache Arrow Parquet (.parquet), using \code{\link[arrow:read_parquet]{arrow::read_parquet()}} +\item Apache Arrow Parquet (.parquet), using \code{\link[nanoparquet:read_parquet]{nanoparquet::read_parquet()}} \item Feather R/Python interchange format (.feather), using \code{\link[arrow:read_feather]{arrow::read_feather()}} \item Fast storage (.fst), using \code{\link[fst:write_fst]{fst::read.fst()}} \item JSON (.json), using \code{\link[jsonlite:fromJSON]{jsonlite::fromJSON()}} diff --git a/man/rio.Rd b/man/rio.Rd index 4cb2c39..dfb749a 100644 --- a/man/rio.Rd +++ b/man/rio.Rd @@ -61,6 +61,7 @@ Other contributors: \item Nathan Day \email{nathancday@gmail.com} [contributor] \item Bill Denney \email{wdenney@humanpredictions.com} (\href{https://orcid.org/0000-0002-5759-428X}{ORCID}) [contributor] \item Alex Bokov \email{alex.bokov@gmail.com} (\href{https://orcid.org/0000-0002-0511-9815}{ORCID}) [contributor] + \item Hugo Gruson (\href{https://orcid.org/0000-0002-4094-1476}{ORCID}) [contributor] } }