Merge pull request #8 from TidierOrg/adds-arrow/parquet
drizk1 authored Apr 15, 2024
2 parents 6b88722 + 0d31094 commit 2f9199d
Showing 11 changed files with 291 additions and 88 deletions.
4 changes: 3 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -4,17 +4,18 @@ authors = ["Daniel Rizk <[email protected]> and contributors"]
version = "0.1.0"

[deps]
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
Parquet2 = "98572fba-bba0-415d-956f-fa77e587d26d"
ReadStatTables = "52522f7a-9570-4e34-8ac6-c005c74d4b84"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
XLSX = "fdbf4ff8-1666-58a4-91e7-1b58723a45e0"

[compat]
julia = "1.9"
CSV = "0.10"
DataFrames = "1.5"
Dates = "1.9"
@@ -23,6 +24,7 @@ HTTP = "1.10"
ReadStatTables = "0.3"
Reexport = "0.2, 1"
XLSX = "0.10"
julia = "1.9"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
4 changes: 3 additions & 1 deletion README.md
@@ -9,7 +9,7 @@

TidierFiles.jl is a 100% Julia implementation of the readr, haven, readxl, and writexl R packages.

Powered by the CSV.jl, XLSX.jl and ReadStatTables.jl packages, TidierFiles.jl aims to bring a consistent interface to the reading and writing of tabular data, including a consistent syntax to read files locally versus from the web and consistent keyword arguments across data formats.
Powered by the CSV.jl, XLSX.jl, ReadStatTables.jl, Arrow.jl, and Parquet2.jl packages, TidierFiles.jl aims to bring a consistent interface to the reading and writing of tabular data, including a consistent syntax to read files locally versus from the web and consistent keyword arguments across data formats.


Currently supported file types:
@@ -22,6 +22,8 @@ Currently supported file types:
- `read_sav` and `write_sav` (.sav and .por)
- `read_sas` and `write_sas` (.sas7bdat and .xpt)
- `read_dta` and `write_dta` (.dta)
- `read_arrow` and `write_arrow`
- `read_parquet` and `write_parquet`

# Examples

19 changes: 19 additions & 0 deletions docs/examples/UserGuide/Arrow.jl
@@ -0,0 +1,19 @@
# Arrow file reading and writing is powered by Arrow.jl
# ## `read_arrow`
# `read_arrow(path; skip=0, n_max=Inf, col_select=nothing)`

# This function reads an Arrow (.arrow) file into a DataFrame. The arguments are:

# - `path`: The path to the .arrow file.
# - `skip`: Number of initial rows to skip before reading data. Default is 0.
# - `n_max`: Maximum number of rows to read. Default is `Inf` (read all rows).
# - `col_select`: Optional vector of symbols or strings to select which columns to load. Default is `nothing` (load all columns).

# ## `write_arrow`
# `write_arrow(df, path)`

# This function writes a DataFrame to an Arrow (.arrow) file. The arguments are:

# - `df`: The DataFrame to be written to a file.
# - `path`: The path where the .arrow file will be created. If a file at this path already exists, it will be overwritten.
# - Additional keyword arguments for writing Arrow files are not outlined here, but are passed through to `Arrow.write`; refer to the Arrow.jl [documentation](https://arrow.apache.org/julia/stable/manual/#Arrow.write) for details.
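The `skip`/`n_max` windowing described above can be sketched in plain Julia. This is an illustrative helper (`row_window` is hypothetical, not a TidierFiles function), shown only to clarify the semantics:

```julia
# Illustrative sketch of the skip/n_max row-window semantics:
# skip `skip` leading rows, then take at most `n_max` rows,
# never running past the end of the table.
function row_window(nrows::Integer, skip::Real, n_max::Real)
    start = Int(skip) + 1
    stop = isinf(n_max) ? nrows : start + Int(n_max) - 1
    return start:min(stop, nrows)
end

row_window(10, 0, Inf)   # all rows: 1:10
row_window(10, 2, 3)     # skip 2, take 3: 3:5
```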
20 changes: 20 additions & 0 deletions docs/examples/UserGuide/parquet.jl
@@ -0,0 +1,20 @@
# Parquet file reading and writing is powered by Parquet2.jl
# ## `read_parquet`
# `read_parquet(path; col_names=true, skip=0, n_max=Inf, col_select=nothing)`

# This function reads a Parquet (.parquet) file into a DataFrame. The arguments are:

# - `path`: The path to the .parquet file.
# - `col_names`: Indicates if the first row of the file is used as column names. Default is `true`.
# - `skip`: Number of initial rows to skip before reading data. Default is 0.
# - `n_max`: Maximum number of rows to read. Default is `Inf` (read all rows).
# - `col_select`: Optional vector of symbols or strings to select which columns to load. Default is `nothing` (load all columns).

# ## `write_parquet`
# `write_parquet(df, path)`

# This function writes a DataFrame to a Parquet (.parquet) file. The arguments are:

# - `df`: The DataFrame to be written to a file.
# - `path`: The path where the .parquet file will be created. If a file at this path already exists, it will be overwritten.
# - Additional keyword arguments for writing Parquet files are not outlined here, but are passed through to `Parquet2.writefile`; refer to the Parquet2.jl [documentation](https://expandingman.gitlab.io/Parquet2.jl/#Writing-Data) for details.
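Since `col_select` accepts both symbols and strings, entries can be normalized to strings before selection. A minimal sketch (the `normalize_cols` helper is hypothetical, shown only to illustrate the conversion):

```julia
# Normalize a col_select vector of Symbols and/or Strings to Strings,
# mirroring the conversion performed before column selection.
normalize_cols(cols) = [c isa Symbol ? string(c) : c for c in cols]

normalize_cols([:a, "b", :c])   # → ["a", "b", "c"]
```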
2 changes: 2 additions & 0 deletions docs/mkdocs.yml
@@ -119,4 +119,6 @@ nav:
- "Delimited Files": "examples/generated/UserGuide/delim.md"
- "Excel Files": "examples/generated/UserGuide/xl.md"
- "Stats Files": "examples/generated/UserGuide/stats.md"
- "Arrow Files": "examples/generated/UserGuide/Arrow.md"
- "Parquet Files": "examples/generated/UserGuide/parquet.md"
- "Reference" : "reference.md"
6 changes: 5 additions & 1 deletion docs/src/index.md
@@ -1,10 +1,12 @@
# TidierFiles.jl

<img src="/assets/logo.png" align="right" style="padding-left:10px;" width="150"/>

## What is TidierFiles.jl?

TidierFiles.jl is a 100% Julia implementation of the readr, haven, readxl, and writexl R packages.

Powered by the CSV.jl, XLSX.jl and ReadStatTables.jl packages, TidierFiles.jl aims to bring a consistent interface to the reading and writing of tabular data, including a consistent syntax to read files locally versus from the web and consistent keyword arguments across data formats.
Powered by the CSV.jl, XLSX.jl, ReadStatTables.jl, Arrow.jl, and Parquet2.jl packages, TidierFiles.jl aims to bring a consistent interface to the reading and writing of tabular data, including a consistent syntax to read files locally versus from the web and consistent keyword arguments across data formats.


Currently supported file types:
@@ -17,6 +19,8 @@ Currently supported file types:
- `read_sav` and `write_sav` (.sav and .por)
- `read_sas` and `write_sas` (.sas7bdat and .xpt)
- `read_dta` and `write_dta` (.dta)
- `read_arrow` and `write_arrow`
- `read_parquet` and `write_parquet`

# Examples

6 changes: 5 additions & 1 deletion src/TidierFiles.jl
@@ -7,18 +7,22 @@ using Dates #bc XLSX type parsing does not seem to be working so i made some aut
using HTTP
using ReadStatTables
using Reexport
using Parquet2
using Arrow

@reexport using DataFrames: DataFrame

export read_csv, write_csv, read_tsv, write_tsv, read_table, write_table, read_delim, read_xlsx, write_xlsx,
read_fwf, write_fwf, fwf_empty, fwf_positions, fwf_positions, read_sav, read_sas, read_dta, write_sav, write_sas,
write_dta
write_dta, read_arrow, write_arrow, read_parquet, write_parquet


include("docstrings.jl")
include("fwf.jl")
include("xlfiles.jl")
include("statsfiles.jl")
include("parquet_files.jl")
include("arrow_files.jl")

"""
$docstring_read_csv
73 changes: 73 additions & 0 deletions src/arrow_files.jl
@@ -0,0 +1,73 @@
"""
$docstring_read_arrow
"""
function read_arrow(data_file;
col_select=nothing,
skip=0,
n_max=Inf)
# Determine if the file is a local file or a URL
if startswith(data_file, "http://") || startswith(data_file, "https://")
# Fetch the content from the URL
response = HTTP.get(data_file)

# Ensure the request was successful
if response.status != 200
error("Failed to fetch the Arrow file: HTTP status code ", response.status)
end

# Use the content fetched from the URL as an IOBuffer for reading
file_to_read = IOBuffer(response.body)
else
# Use the local file path
file_to_read = data_file
end

# Load the Arrow file into a DataFrame directly
df = DataFrame(Arrow.Table(file_to_read); copycols=false)

# Apply column selection if specified
if !isnothing(col_select)
df = select(df, col_select) # Use the select function for safe column selection
end

# Apply row limit and skip if specified
if !isinf(n_max) || skip > 0
start_row = skip + 1
end_row = !isinf(n_max) ? start_row + n_max - 1 : nrow(df)
df = df[start_row:min(end_row, nrow(df)), :]
end

return df
end

"""
$docstring_write_arrow
"""
function write_arrow(tbl, file::String; append=false, compress=:lz4, alignment=8,
dictencode=false, dictencodenested=false, denseunions=true,
largelists=false, maxdepth=6, num_threads=Threads.nthreads())


# Prepare keyword arguments for Arrow.write
write_options = Dict(
:compress => compress,
:alignment => alignment,
:dictencode => dictencode,
:dictencodenested => dictencodenested,
:denseunions => denseunions,
:largelists => largelists,
:maxdepth => maxdepth,
:ntasks => num_threads
)

# Write the data to file
if append
# Open the file in append mode and write
open(file, "a") do io
Arrow.write(io, tbl; write_options..., file=true)
end
else
# Write directly to file, creating or overwriting by default
Arrow.write(file, tbl; write_options...)
end
end
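The `write_options` `Dict` above is forwarded to `Arrow.write` via keyword splatting. The pattern can be shown in isolation (`demo` is a hypothetical stand-in function, not part of Arrow.jl):

```julia
# A Symbol-keyed Dict splatted into keyword arguments, as write_arrow
# does with its write_options Dict when calling Arrow.write.
demo(; alignment = 0, dictencode = true) = (alignment, dictencode)

opts = Dict(:alignment => 8, :dictencode => false)
demo(; opts...)   # → (8, false)
```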
88 changes: 86 additions & 2 deletions src/docstrings.jl
@@ -319,7 +319,7 @@ const docstring_write_xlsx =
write_xlsx(x; path, overwrite)
Write a DataFrame, or multiple DataFrames, to an Excel file.
#Arguments
# Arguments
-`x`: The data to write. Can be a single Pair{String, DataFrame} for writing one sheet, or a Tuple of such pairs for writing multiple sheets. The String in each pair specifies the sheet name, and the DataFrame is the data to write to that sheet.
-`path`: The path to the Excel file where the data will be written.
-`overwrite`: Defaults to false. Whether to overwrite an existing file. If false, an error is thrown when attempting to write to an existing file.
@@ -525,4 +525,88 @@ julia> write_dta(df, "test.dta")
1 │ sav 10.1
2 │ por 10.2
```
"""
"""

const docstring_write_arrow =
"""
write_arrow(df, path)
Write a DataFrame to an Arrow (.arrow) file.
# Arguments
-`df`: The DataFrame to be written to a file.
-`path`: The path where the .arrow file will be created. If a file at this path already exists, it will be overwritten.
# Examples
```jldoctest
julia> df = DataFrame(AA=["Arr", "ow"], AB=[10.1, 10.2]);
julia> write_arrow(df, "test.arrow");
```
"""

const docstring_read_arrow =
"""
read_arrow(path; skip=0, n_max=Inf, col_select=nothing)
Read an Arrow file (.arrow) into a DataFrame.
# Arguments
-`path`: The path to the .arrow file, or a URL starting with http:// or https://.
-`skip`: Number of initial rows to skip before reading data. Default is 0.
-`n_max`: Maximum number of rows to read. Default is Inf (read all rows).
-`col_select`: Optional vector of symbols or strings to select which columns to load. Default is nothing (load all columns).
# Examples
```jldoctest
julia> df = DataFrame(AA=["Arr", "ow"], AB=[10.1, 10.2]);
julia> write_arrow(df, "test.arrow");
julia> read_arrow("test.arrow")
2×2 DataFrame
Row │ AA AB
│ String Float64
─────┼─────────────────
1 │ Arr 10.1
2 │ ow 10.2
```
"""

const docstring_write_parquet =
"""
write_parquet(df, path)
Write a DataFrame to a Parquet (.parquet) file.
# Arguments
-`df`: The DataFrame to be written to a file.
-`path`: The path where the .parquet file will be created. If a file at this path already exists, it will be overwritten.
# Examples
```jldoctest
julia> df = DataFrame(AA=["Par", "quet"], AB=[10.1, 10.2]);
julia> write_parquet(df, "test.parquet");
```
"""

const docstring_read_parquet =
"""
read_parquet(path; col_names=true, skip=0, n_max=Inf, col_select=nothing)
Read a Parquet file (.parquet) into a DataFrame.
# Arguments
-`path`: The path to the .parquet file, or a URL starting with http:// or https://.
-`col_names`: Indicates if the first row of the file is used as column names. Default is true.
-`skip`: Number of initial rows to skip before reading data. Default is 0.
-`n_max`: Maximum number of rows to read. Default is Inf (read all rows).
-`col_select`: Optional vector of symbols or strings to select which columns to load. Default is nothing (load all columns).
# Examples
```jldoctest
julia> df = DataFrame(AA=["Par", "quet"], AB=[10.1, 10.2]);
julia> write_parquet(df, "test.parquet");
julia> read_parquet("test.parquet")
2×2 DataFrame
Row │ AA AB
│ String Float64
─────┼─────────────────
1 │ Par 10.1
2 │ quet 10.2
```
"""

75 changes: 75 additions & 0 deletions src/parquet_files.jl
@@ -0,0 +1,75 @@
"""
$docstring_read_parquet
"""
function read_parquet(data_file;
col_select=nothing,
skip=0,
n_max=Inf,
col_names=true) # Handle column names display
# Determine if the file is a local file or a URL
if startswith(data_file, "http://") || startswith(data_file, "https://")
# Fetch the content from the URL
response = HTTP.get(data_file)

# Ensure the request was successful
if response.status != 200
error("Failed to fetch the Parquet file: HTTP status code ", response.status)
end

# Use the content fetched from the URL as an IOBuffer for reading
file_to_read = IOBuffer(response.body)
else
# Use the local file path
file_to_read = data_file
end

# Open the dataset
ds = Parquet2.Dataset(file_to_read)
df = DataFrame(ds; copycols=false) # Load the entire dataset initially

# Apply column selection if provided
if !isnothing(col_select)
# Ensure column names are in the correct format
col_select = [typeof(c) === Symbol ? string(c) : c for c in col_select]
df = select(df, col_select)
end

# Apply skip and limit
if skip > 0 || !isinf(n_max)
start_idx = max(1, skip + 1)
end_idx = !isinf(n_max) ? start_idx + n_max - 1 : nrow(df)
df = df[start_idx:min(end_idx, nrow(df)), :]
end

# If column names should not be used as headers
if !col_names
# Prepend the original column names as the first data row
header_row = DataFrame([[c] for c in names(df)], names(df))
df = vcat(header_row, df)
# Rename columns to generic names
rename!(df, Symbol.(:Column, 1:ncol(df)))
end

return df
end

"""
$docstring_write_parquet
"""
function write_parquet(data, filename::String; buffer::Union{IO, Nothing}=nothing,
npages::Union{Int, Dict}=1,
compression_codec::Union{Symbol, Dict}=Dict(),
column_metadata::Union{Dict, Pair}=Dict(),
metadata::Dict=Dict())
# Choose the appropriate method to write data based on `buffer` presence
if isnothing(buffer)
# Write directly to file with options
Parquet2.writefile(filename, data; npages=npages, compression_codec=compression_codec,
column_metadata=column_metadata, metadata=metadata)
else
# Write to the provided buffer
Parquet2.writefile(buffer, data; npages=npages, compression_codec=compression_codec,
column_metadata=column_metadata, metadata=metadata)
end
end
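The URL-versus-local dispatch at the top of both readers can be sketched as a small predicate (`is_remote` is a hypothetical helper; the actual functions fetch remote content with `HTTP.get` and wrap it in an `IOBuffer`):

```julia
# Predicate mirroring the URL-vs-local check in read_arrow/read_parquet:
# remote paths are fetched over HTTP, local paths are opened directly.
is_remote(path::AbstractString) =
    startswith(path, "http://") || startswith(path, "https://")

is_remote("https://example.com/x.parquet")   # true
is_remote("data/x.parquet")                  # false
```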