-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #8 from TidierOrg/adds-arrow/parquet
- Loading branch information
Showing
11 changed files
with
291 additions
and
88 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,17 +4,18 @@ authors = ["Daniel Rizk <[email protected]> and contributors"] | |
version = "0.1.0" | ||
|
||
[deps] | ||
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" | ||
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" | ||
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" | ||
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" | ||
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" | ||
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" | ||
Parquet2 = "98572fba-bba0-415d-956f-fa77e587d26d" | ||
ReadStatTables = "52522f7a-9570-4e34-8ac6-c005c74d4b84" | ||
Reexport = "189a3867-3050-52da-a836-e630ba90ab69" | ||
XLSX = "fdbf4ff8-1666-58a4-91e7-1b58723a45e0" | ||
|
||
[compat] | ||
julia = "1.9" | ||
CSV = "0.10" | ||
DataFrames = "1.5" | ||
Dates = "1.9" | ||
|
@@ -23,6 +24,7 @@ HTTP = "1.10" | |
ReadStatTables = "0.3" | ||
Reexport = "0.2, 1" | ||
XLSX = "0.10" | ||
julia = "1.9" | ||
|
||
[extras] | ||
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Arrow file reading and writing is powered by Arrow.jl | ||
# ## `read_arrow` | ||
# read_arrow(path; skip=0, n_max=Inf, col_select=nothing) | ||
|
||
# This function reads an Arrow (.arrow) file into a DataFrame. The arguments are: | ||
|
||
# - `path`: The path to the .arrow file. | ||
# - `skip`: Number of initial rows to skip before reading data. Default is 0. | ||
# - `n_max`: Maximum number of rows to read. Default is `Inf` (read all rows). | ||
# - `col_select`: Optional vector of symbols or strings to select which columns to load. Default is `nothing` (load all columns). | ||
|
||
# ## `write_arrow` | ||
# `write_arrow(df, path)` | ||
|
||
# This function writes a DataFrame to an Arrow (.arrow) file. The arguments are: | ||
|
||
# - `df`: The DataFrame to be written to a file. | ||
# - `path`: The path where the .arrow file will be created. If a file at this path already exists, it will be overwritten. | ||
# - Additional arguments for writing arrow files are not outlined here, but should be available through the same interface of `Arrow.write`. Refer to Arrow.jl [documentation](https://arrow.apache.org/julia/stable/manual/#Arrow.write) at their page for further explanation. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Parquet file reading and writing is powered by Parquet2.jl | ||
# ## `read_parquet` | ||
# read_parquet(path; col_names=true, skip=0, n_max=Inf, col_select=nothing) | ||
|
||
# This function reads a Parquet (.parquet) file into a DataFrame. The arguments are: | ||
|
||
# - `path`: The path to the .parquet file. | ||
# - `col_names`: Indicates if the first row of the file is used as column names. Default is `true`. | ||
# - `skip`: Number of initial rows to skip before reading data. Default is 0. | ||
# - `n_max`: Maximum number of rows to read. Default is `Inf` (read all rows). | ||
# - `col_select`: Optional vector of symbols or strings to select which columns to load. Default is `nothing` (load all columns). | ||
|
||
# ## `write_parquet` | ||
# `write_parquet(df, path)` | ||
|
||
# This function writes a DataFrame to a Parquet (.parquet) file. The arguments are: | ||
|
||
# - `df`: The DataFrame to be written to a file. | ||
# - `path`: The path where the .parquet file will be created. If a file at this path already exists, it will be overwritten. | ||
# - Additional arguments for writing parquet files are not outlined here, but should be available through the same interface of `Parquet2.writefile`. Refer to [documentation](https://expandingman.gitlab.io/Parquet2.jl/#Writing-Data) at their page for further explanation. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
"""
$docstring_read_arrow
"""
function read_arrow(data_file;
    col_select=nothing,
    skip=0,
    n_max=Inf)
    # Determine whether the source is a URL or a local file path.
    if startswith(data_file, "http://") || startswith(data_file, "https://")
        # Fetch the content from the URL.
        response = HTTP.get(data_file)

        # Guard the status explicitly so the error message names the file kind.
        if response.status != 200
            error("Failed to fetch the Arrow file: HTTP status code ", response.status)
        end

        # Read from the downloaded bytes via an in-memory buffer.
        file_to_read = IOBuffer(response.body)
    else
        # Use the local file path directly.
        file_to_read = data_file
    end

    # Materialize the Arrow table as a DataFrame. copycols=false avoids
    # copying the (immutable) Arrow-backed columns.
    df = DataFrame(Arrow.Table(file_to_read); copycols=false)

    # Apply column selection if specified.
    if !isnothing(col_select)
        df = select(df, col_select)
    end

    # Apply row skip and row limit if specified.
    if !isinf(n_max) || skip > 0
        start_row = skip + 1
        # Convert n_max to Int so the index range stays integer-typed.
        # (Previously `start_row + n_max - 1` produced a Float64 index and
        # broke row indexing whenever a finite float n_max was passed.)
        end_row = isinf(n_max) ? nrow(df) : start_row + Int(n_max) - 1
        df = df[start_row:min(end_row, nrow(df)), :]
    end

    return df
end
|
||
"""
$docstring_write_arrow
"""
function write_arrow(tbl, file::String; append=false, compress=:lz4, alignment=8,
    dictencode=false, dictencodenested=false, denseunions=true,
    largelists=false, maxdepth=6, num_threads=Threads.nthreads())
    # Keyword options forwarded to Arrow.write.
    # NOTE(fix): `compress` was previously accepted but silently ignored
    # (the option line was commented out and referenced an undefined name).
    # Arrow.write accepts :lz4 / :zstd symbols for its `compress` keyword.
    write_options = Dict(
        :compress => compress,
        :alignment => alignment,
        :dictencode => dictencode,
        :dictencodenested => dictencodenested,
        :denseunions => denseunions,
        :largelists => largelists,
        :maxdepth => maxdepth,
        :ntasks => num_threads
    )

    if append
        # Open the file in append mode and write.
        # NOTE(review): appending to the Arrow *file* format like this assumes
        # downstream readers tolerate concatenated file payloads — confirm.
        open(file, "a") do io
            Arrow.write(io, tbl; write_options..., file=true)
        end
    else
        # Write directly to the path, creating or overwriting the file.
        Arrow.write(file, tbl; write_options...)
    end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
"""
$docstring_read_parquet
"""
function read_parquet(data_file;
    col_select=nothing,
    skip=0,
    n_max=Inf,
    col_names=true)
    # Determine whether the source is a URL or a local file path.
    if startswith(data_file, "http://") || startswith(data_file, "https://")
        # Fetch the content from the URL.
        response = HTTP.get(data_file)

        # Guard the status explicitly so the error message names the file kind.
        if response.status != 200
            error("Failed to fetch the Parquet file: HTTP status code ", response.status)
        end

        # Read from the downloaded bytes via an in-memory buffer.
        file_to_read = IOBuffer(response.body)
    else
        # Use the local file path directly.
        file_to_read = data_file
    end

    # Open the dataset and load it as a DataFrame.
    ds = Parquet2.Dataset(file_to_read)
    df = DataFrame(ds; copycols=false)

    # Apply column selection if provided.
    if !isnothing(col_select)
        # Parquet2 column names are strings; normalize any Symbols.
        col_select = [c isa Symbol ? string(c) : c for c in col_select]
        df = select(df, col_select)
    end

    # Apply row skip and row limit if specified.
    if skip > 0 || !isinf(n_max)
        start_idx = max(1, skip + 1)
        # Convert n_max to Int so the index range stays integer-typed
        # (a finite float n_max would otherwise break row indexing).
        end_idx = isinf(n_max) ? nrow(df) : start_idx + Int(n_max) - 1
        df = df[start_idx:min(end_idx, nrow(df)), :]
    end

    # When col_names=false, demote the header into the first data row and
    # use generic column names, mirroring the CSV-reader behavior.
    # NOTE(fix): the previous implementation called `transpose(names(df))`,
    # which errors (`transpose` is undefined for String), and built a
    # one-column frame that could not be vcat'ed with an n-column frame.
    if !col_names
        header_row = DataFrame([name => [name] for name in names(df)])
        # vcat promotes column element types so String headers can join
        # columns of any type.
        df = vcat(header_row, df)
        rename!(df, Symbol.(:Column, 1:ncol(df)))
    end

    return df
end
|
||
"""
$docstring_write_parquet
"""
function write_parquet(data, filename::String; buffer::Union{IO, Nothing}=nothing,
    npages::Union{Int, Dict}=1,
    compression_codec::Union{Symbol, Dict}=Dict(),
    column_metadata::Union{Dict, Pair}=Dict(),
    metadata::Dict=Dict())
    # Pick the destination once: the provided IO buffer when given,
    # otherwise the file path. Parquet2.writefile dispatches on either.
    destination = isnothing(buffer) ? filename : buffer
    return Parquet2.writefile(destination, data;
        npages=npages,
        compression_codec=compression_codec,
        column_metadata=column_metadata,
        metadata=metadata)
end
Oops, something went wrong.