From 4d7e446237558f4980f040e2afa6d481b4f90410 Mon Sep 17 00:00:00 2001 From: Jonas Benn Date: Thu, 3 Oct 2024 15:38:12 +0200 Subject: [PATCH] Add `sheetreader` as community extension --- extensions/sheetreader/description.yml | 75 ++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 extensions/sheetreader/description.yml diff --git a/extensions/sheetreader/description.yml b/extensions/sheetreader/description.yml new file mode 100644 index 0000000..d1c7068 --- /dev/null +++ b/extensions/sheetreader/description.yml @@ -0,0 +1,75 @@ +extension: + name: sheetreader + description: Fast XLSX file importer + version: 0.1.0 + language: C++ + build: cmake + excluded_platforms: windows_amd64_rtools + license: MIT + maintainers: + - freddie-freeloader + +repo: + github: polydbms/sheetreader-duckdb + ref: 4c9a97acd678f192d16bd711d93e4883a9ced7bb + +docs: + hello_world: | + -- Create table from XLSX file & use default values for parameters + CREATE TABLE data AS FROM sheetreader('data.xlsx'); + + -- Example usage of available named parameters + CREATE TABLE data2 AS FROM sheetreader( + 'data2.xlsx', + sheet_index=1, + threads=16, + skip_rows=0, + has_header=TRUE, + types=[BOOLEAN,VARCHAR], + coerce_to_string=TRUE, + force_types=TRUE + ); + + + extended_description: | + ## About SheetReader + + `sheetreader` is an extension that allows reading XLSX files into DuckDB tables with SheetReader, our blazingly fast XLSX parser (https://github.com/polydbms/sheetreader-core). + + ## Usage + + ### Parameters + + | Name | Description | Type | Default | + |:----|:-----------|:----:|:-------| + | `sheet_index` | Index of the sheet to read. Starts at 1. | `INTEGER` | `1` | + | `sheet_name` | Name of the sheet to read.
Only one of `sheet_index` or `sheet_name` can be set. | `VARCHAR` | `""` | + | `threads` | Number of threads to use while parsing | `INTEGER` | Half of available cores; minimum 1 | + | `skip_rows` | Number of rows to skip | `INTEGER` | `0` | + | `has_header` | Force the first row to be treated as the header row.
| `BOOLEAN` | `false` | + | `types` | List of types for all columns | `LIST(VARCHAR)` | Uses types determined by the first & second row (after skipped rows) | + | `coerce_to_string` | Coerce all cells in columns of type `VARCHAR` to string (i.e. `VARCHAR`). | `BOOLEAN` | `false` | + | `force_types` | Use `types` even if they are not compatible with the types determined by the first/second row.
Cells that are not of the column type are set to `NULL`, or coerced to string if `coerce_to_string` is set. | `BOOLEAN` | `false` | + + ## Paper + + SheetReader was published in the [Information Systems Journal](https://www.sciencedirect.com/science/article/abs/pii/S0306437923000194) + + ``` + @article{DBLP:journals/is/GavriilidisHZM23, + author = {Haralampos Gavriilidis and + Felix Henze and + Eleni Tzirita Zacharatou and + Volker Markl}, + title = {SheetReader: Efficient Specialized Spreadsheet Parsing}, + journal = {Inf. Syst.}, + volume = {115}, + pages = {102183}, + year = {2023}, + url = {https://doi.org/10.1016/j.is.2023.102183}, + doi = {10.1016/J.IS.2023.102183}, + timestamp = {Mon, 26 Jun 2023 20:54:32 +0200}, + biburl = {https://dblp.org/rec/journals/is/GavriilidisHZM23.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} + } + ```