diff --git a/Project.toml b/Project.toml
index aa8e1be..505cb3d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,17 +4,18 @@ authors = ["Daniel Rizk and contributors"]
 version = "0.1.0"
 
 [deps]
+Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
+Parquet2 = "98572fba-bba0-415d-956f-fa77e587d26d"
 ReadStatTables = "52522f7a-9570-4e34-8ac6-c005c74d4b84"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 XLSX = "fdbf4ff8-1666-58a4-91e7-1b58723a45e0"
 
 [compat]
-julia = "1.9"
 CSV = "0.10"
 DataFrames = "1.5"
 Dates = "1.9"
@@ -23,6 +24,7 @@ HTTP = "1.10"
 ReadStatTables = "0.3"
 Reexport = "0.2, 1"
 XLSX = "0.10"
+julia = "1.9"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/README.md b/README.md
index 52840e5..b4c9f15 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
 
 TidierFiles.jl is a 100% Julia implementation of the readr, haven, readxl, and writexl R packages.
 
-Powered by the CSV.jl, XLSX.jl and ReadStatTables.jl packages, TidierFiles.jl aims to bring a consistent interface to the reading and writing of tabular data, including a consistent syntax to read files locally versus from the web and consistent keyword arguments across data formats.
+Powered by the CSV.jl, XLSX.jl, ReadStatTables.jl, Arrow.jl, and Parquet2.jl packages, TidierFiles.jl aims to bring a consistent interface to the reading and writing of tabular data, including a consistent syntax to read files locally versus from the web and consistent keyword arguments across data formats.
 
 Currently supported file types:
 
@@ -22,6 +22,8 @@ Currently supported file types:
 - `read_sav` and `write_sav` (.sav and .por)
 - `read_sas` and `write_sas` (.sas7bdat and .xpt)
 - `read_dta` and `write_dta` (.dta)
+- `read_arrow` and `write_arrow` (.arrow)
+- `read_parquet` and `write_parquet` (.parquet)
 
 # Examples
diff --git a/docs/examples/UserGuide/Arrow.jl b/docs/examples/UserGuide/Arrow.jl
new file mode 100644
index 0000000..d04dd7e
--- /dev/null
+++ b/docs/examples/UserGuide/Arrow.jl
@@ -0,0 +1,19 @@
+# Arrow file reading and writing is powered by Arrow.jl
+# ## `read_arrow`
+# `read_arrow(path; skip=0, n_max=Inf, col_select=nothing)`
+
+# This function reads an Arrow (.arrow) file into a DataFrame. The arguments are:
+
+# - `path`: The path to the .arrow file.
+# - `skip`: Number of initial rows to skip before reading data. Default is 0.
+# - `n_max`: Maximum number of rows to read. Default is `Inf` (read all rows).
+# - `col_select`: Optional vector of symbols or strings to select which columns to load. Default is `nothing` (load all columns).
+
+# ## `write_arrow`
+# `write_arrow(df, path)`
+
+# This function writes a DataFrame to an Arrow (.arrow) file. The arguments are:
+
+# - `df`: The DataFrame to be written to a file.
+# - `path`: The path where the .arrow file will be created. If a file at this path already exists, it will be overwritten.
+# - Additional keyword arguments for writing Arrow files are not outlined here, but are available through the same interface as `Arrow.write`. Refer to the Arrow.jl [documentation](https://arrow.apache.org/julia/stable/manual/#Arrow.write) for further explanation.
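+
+# As a quick usage sketch (a minimal example, assuming a writable working
+# directory; `test.arrow` is an illustrative path):
+
+# ```julia
+# using TidierFiles
+#
+# df = DataFrame(AA=["Arr", "ow"], AB=[10.1, 10.2])
+# write_arrow(df, "test.arrow")
+# read_arrow("test.arrow"; n_max=1) # reads back only the first row
+# ```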
\ No newline at end of file
diff --git a/docs/examples/UserGuide/parquet.jl b/docs/examples/UserGuide/parquet.jl
new file mode 100644
index 0000000..d636dd6
--- /dev/null
+++ b/docs/examples/UserGuide/parquet.jl
@@ -0,0 +1,20 @@
+# Parquet file reading and writing is powered by Parquet2.jl
+# ## `read_parquet`
+# `read_parquet(path; col_names=true, skip=0, n_max=Inf, col_select=nothing)`
+
+# This function reads a Parquet (.parquet) file into a DataFrame. The arguments are:
+
+# - `path`: The path to the .parquet file.
+# - `col_names`: Indicates if the first row of the file is used as column names. Default is `true`.
+# - `skip`: Number of initial rows to skip before reading data. Default is 0.
+# - `n_max`: Maximum number of rows to read. Default is `Inf` (read all rows).
+# - `col_select`: Optional vector of symbols or strings to select which columns to load. Default is `nothing` (load all columns).
+
+# ## `write_parquet`
+# `write_parquet(df, path)`
+
+# This function writes a DataFrame to a Parquet (.parquet) file. The arguments are:
+
+# - `df`: The DataFrame to be written to a file.
+# - `path`: The path where the .parquet file will be created. If a file at this path already exists, it will be overwritten.
+# - Additional keyword arguments for writing Parquet files are not outlined here, but are available through the same interface as `Parquet2.writefile`. Refer to the Parquet2.jl [documentation](https://expandingman.gitlab.io/Parquet2.jl/#Writing-Data) for further explanation.
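+
+# As a quick usage sketch (a minimal example, assuming a writable working
+# directory; `test.parquet` is an illustrative path):
+
+# ```julia
+# using TidierFiles
+#
+# df = DataFrame(AA=["Par", "quet"], AB=[10.1, 10.2])
+# write_parquet(df, "test.parquet")
+# read_parquet("test.parquet"; col_select=[:AA]) # loads only the AA column
+# ```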
\ No newline at end of file
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index c6f3a86..e3917f2 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -119,4 +119,6 @@ nav:
     - "Delimited Files": "examples/generated/UserGuide/delim.md"
     - "Excel Files": "examples/generated/UserGuide/xl.md"
     - "Stats Files": "examples/generated/UserGuide/stats.md"
+    - "Arrow Files": "examples/generated/UserGuide/Arrow.md"
+    - "Parquet Files": "examples/generated/UserGuide/parquet.md"
     - "Reference" : "reference.md"
\ No newline at end of file
diff --git a/docs/src/index.md b/docs/src/index.md
index cbeb60a..169c788 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,10 +1,12 @@
 # TidierFiles.jl
 
+
+
 ## What is TidierFiles.jl?
 
 TidierFiles.jl is a 100% Julia implementation of the readr, haven, readxl, and writexl R packages.
 
-Powered by the CSV.jl, XLSX.jl and ReadStatTables.jl packages, TidierFiles.jl aims to bring a consistent interface to the reading and writing of tabular data, including a consistent syntax to read files locally versus from the web and consistent keyword arguments across data formats.
+Powered by the CSV.jl, XLSX.jl, ReadStatTables.jl, Arrow.jl, and Parquet2.jl packages, TidierFiles.jl aims to bring a consistent interface to the reading and writing of tabular data, including a consistent syntax to read files locally versus from the web and consistent keyword arguments across data formats.
 
 Currently supported file types:
 
@@ -17,6 +19,8 @@ Currently supported file types:
 - `read_sav` and `write_sav` (.sav and .por)
 - `read_sas` and `write_sas` (.sas7bdat and .xpt)
 - `read_dta` and `write_dta` (.dta)
+- `read_arrow` and `write_arrow` (.arrow)
+- `read_parquet` and `write_parquet` (.parquet)
 
 # Examples
diff --git a/src/TidierFiles.jl b/src/TidierFiles.jl
index ead47f9..00dbc28 100644
--- a/src/TidierFiles.jl
+++ b/src/TidierFiles.jl
@@ -7,18 +7,22 @@ using Dates #bc XLSX type parsing does not seem to be working so i made some aut
 using HTTP
 using ReadStatTables
 using Reexport
+using Parquet2
+using Arrow
 
 @reexport using DataFrames: DataFrame
 
 export read_csv, write_csv, read_tsv, write_tsv, read_table, write_table, read_delim, read_xlsx,
     write_xlsx, read_fwf, write_fwf, fwf_empty, fwf_positions, fwf_positions, read_sav, read_sas, read_dta, write_sav, write_sas,
-    write_dta
+    write_dta, read_arrow, write_arrow, read_parquet, write_parquet
 
 include("docstrings.jl")
 include("fwf.jl")
 include("xlfiles.jl")
 include("statsfiles.jl")
+include("parquet_files.jl")
+include("arrow_files.jl")
 
 """
 $docstring_read_csv
diff --git a/src/arrow_files.jl b/src/arrow_files.jl
new file mode 100644
index 0000000..7cd4da7
--- /dev/null
+++ b/src/arrow_files.jl
@@ -0,0 +1,73 @@
+"""
+$docstring_read_arrow
+"""
+function read_arrow(data_file;
+                    col_select=nothing,
+                    skip=0,
+                    n_max=Inf)
+    # Determine if the file is a local file or a URL
+    if startswith(data_file, "http://") || startswith(data_file, "https://")
+        # Fetch the content from the URL
+        response = HTTP.get(data_file)
+
+        # Ensure the request was successful
+        if response.status != 200
+            error("Failed to fetch the Arrow file: HTTP status code ", response.status)
+        end
+
+        # Use the content fetched from the URL as an IOBuffer for reading
+        file_to_read = IOBuffer(response.body)
+    else
+        # Use the local file path
+        file_to_read = data_file
+    end
+
+    # Load the Arrow file into a DataFrame directly
+    df = DataFrame(Arrow.Table(file_to_read); copycols=false)
+
+    # Apply column selection if specified
+    if !isnothing(col_select)
+        df = select(df, col_select) # Use the select function for safe column selection
+    end
+
+    # Apply row limit and skip if specified
+    if !isinf(n_max) || skip > 0
+        start_row = skip + 1
+        end_row = !isinf(n_max) ? start_row + n_max - 1 : nrow(df)
+        df = df[start_row:min(end_row, nrow(df)), :]
+    end
+
+    return df
+end
+
+"""
+$docstring_write_arrow
+"""
+function write_arrow(tbl, file::String; append=false, compress=:lz4, alignment=8,
+                     dictencode=false, dictencodenested=false, denseunions=true,
+                     largelists=false, maxdepth=6, num_threads=Threads.nthreads())
+    # Prepare keyword arguments for Arrow.write
+    write_options = Dict(
+        :compress => compress, # pass the requested codec through (:lz4 by default)
+        :alignment => alignment,
+        :dictencode => dictencode,
+        :dictencodenested => dictencodenested,
+        :denseunions => denseunions,
+        :largelists => largelists,
+        :maxdepth => maxdepth,
+        :ntasks => num_threads
+    )
+
+    # Write the data to file
+    if append
+        # Open the file in append mode and write
+        open(file, "a") do io
+            Arrow.write(io, tbl; write_options..., file=true)
+        end
+    else
+        # Write directly to file, creating or overwriting by default
+        Arrow.write(file, tbl; write_options...)
+    end
+end
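+
+# A minimal usage sketch (the paths, URL, and `compress` choice below are
+# illustrative only, not defaults):
+#
+#   write_arrow(df, "data.arrow"; compress=:zstd)
+#   read_arrow("https://example.com/data.arrow"; col_select=["AA"])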
\ No newline at end of file
diff --git a/src/docstrings.jl b/src/docstrings.jl
index f08637f..ad1df95 100644
--- a/src/docstrings.jl
+++ b/src/docstrings.jl
@@ -319,7 +319,7 @@ const docstring_write_xlsx =
 """
     write_xlsx(x; path, overwrite)
 Write a DataFrame, or multiple DataFrames, to an Excel file.
-#Arguments
+# Arguments
 -`x`: The data to write. Can be a single Pair{String, DataFrame} for writing one sheet, or a Tuple of such pairs for writing multiple sheets. The String in each pair specifies the sheet name, and the DataFrame is the data to write to that sheet.
 -`path`: The path to the Excel file where the data will be written.
 -`overwrite`: Defaults to false. Whether to overwrite an existing file. If false, an error is thrown when attempting to write to an existing file.
@@ -525,4 +525,88 @@ julia> write_dta(df, "test.dta")
    1 │ sav        10.1
    2 │ por        10.2
 ```
-"""
\ No newline at end of file
+"""
+
+const docstring_write_arrow =
+"""
+    write_arrow(df, path)
+Write a DataFrame to an Arrow (.arrow) file.
+# Arguments
+-`df`: The DataFrame to be written to a file.
+-`path`: The path where the .arrow file will be created. If a file at this path already exists, it will be overwritten.
+# Examples
+```jldoctest
+julia> df = DataFrame(AA=["Arr", "ow"], AB=[10.1, 10.2]);
+
+julia> write_arrow(df, "test.arrow");
+```
+"""
+
+const docstring_read_arrow =
+"""
+    read_arrow(path; skip=0, n_max=Inf, col_select=nothing)
+Read an Arrow (.arrow) file into a DataFrame.
+# Arguments
+-`path`: The path to the .arrow file.
+-`skip`: Number of initial rows to skip before reading data. Default is 0.
+-`n_max`: Maximum number of rows to read. Default is Inf (read all rows).
+-`col_select`: Optional vector of symbols or strings to select which columns to load.
+# Examples
+```jldoctest
+julia> df = DataFrame(AA=["Arr", "ow"], AB=[10.1, 10.2]);
+
+julia> write_arrow(df, "test.arrow");
+
+julia> read_arrow("test.arrow")
+2×2 DataFrame
+ Row │ AA      AB
+     │ String  Float64
+─────┼─────────────────
+   1 │ Arr        10.1
+   2 │ ow         10.2
+```
+"""
+
+const docstring_write_parquet =
+"""
+    write_parquet(df, path)
+Write a DataFrame to a Parquet (.parquet) file.
+# Arguments
+-`df`: The DataFrame to be written to a file.
+-`path`: The path where the .parquet file will be created. If a file at this path already exists, it will be overwritten.
+# Examples
+```jldoctest
+julia> df = DataFrame(AA=["Par", "quet"], AB=[10.1, 10.2]);
+
+julia> write_parquet(df, "test.parquet");
+```
+"""
+
+const docstring_read_parquet =
+"""
+    read_parquet(path; col_names=true, skip=0, n_max=Inf, col_select=nothing)
+Read a Parquet (.parquet) file into a DataFrame.
+# Arguments
+-`path`: The path to the .parquet file.
+-`col_names`: Indicates if the first row of the file is used as column names. Default is true.
+-`skip`: Number of initial rows to skip before reading data. Default is 0.
+-`n_max`: Maximum number of rows to read. Default is Inf (read all rows).
+-`col_select`: Optional vector of symbols or strings to select which columns to load.
+# Examples
+```jldoctest
+julia> df = DataFrame(AA=["Par", "quet"], AB=[10.1, 10.2]);
+
+julia> write_parquet(df, "test.parquet");
+
+julia> read_parquet("test.parquet")
+2×2 DataFrame
+ Row │ AA      AB
+     │ String  Float64
+─────┼─────────────────
+   1 │ Par        10.1
+   2 │ quet       10.2
+```
+"""
+
diff --git a/src/parquet_files.jl b/src/parquet_files.jl
new file mode 100644
index 0000000..84b002c
--- /dev/null
+++ b/src/parquet_files.jl
@@ -0,0 +1,75 @@
+"""
+$docstring_read_parquet
+"""
+function read_parquet(data_file;
+                      col_select=nothing,
+                      skip=0,
+                      n_max=Inf,
+                      col_names=true) # Whether to keep the stored header as column names
+    # Determine if the file is a local file or a URL
+    if startswith(data_file, "http://") || startswith(data_file, "https://")
+        # Fetch the content from the URL
+        response = HTTP.get(data_file)
+
+        # Ensure the request was successful
+        if response.status != 200
+            error("Failed to fetch the Parquet file: HTTP status code ", response.status)
+        end
+
+        # Use the content fetched from the URL as an IOBuffer for reading
+        file_to_read = IOBuffer(response.body)
+    else
+        # Use the local file path
+        file_to_read = data_file
+    end
+
+    # Open the dataset
+    ds = Parquet2.Dataset(file_to_read)
+    df = DataFrame(ds; copycols=false) # Load the entire dataset initially
+
+    # Apply column selection if provided
+    if !isnothing(col_select)
+        # Parquet2 column names are strings, so normalize any symbols
+        col_select = [c isa Symbol ? string(c) : c for c in col_select]
+        df = select(df, col_select)
+    end
+
+    # Apply skip and limit
+    if skip > 0 || !isinf(n_max)
+        start_idx = max(1, skip + 1)
+        end_idx = !isinf(n_max) ? start_idx + n_max - 1 : nrow(df)
+        df = df[start_idx:min(end_idx, nrow(df)), :]
+    end
+
+    # If the stored column names should be treated as data rather than headers
+    if !col_names
+        # Demote the current header to a one-row DataFrame and prepend it
+        header_row = DataFrame(permutedims(names(df)), names(df))
+        df = vcat(header_row, df) # vcat promotes column types as needed
+        # Rename columns to generic names
+        rename!(df, Symbol.(:Column, 1:ncol(df)))
+    end
+
+    return df
+end
+
+"""
+$docstring_write_parquet
+"""
+function write_parquet(data, filename::String; buffer::Union{IO, Nothing}=nothing,
+                       npages::Union{Int, Dict}=1,
+                       compression_codec::Union{Symbol, Dict}=Dict(),
+                       column_metadata::Union{Dict, Pair}=Dict(),
+                       metadata::Dict=Dict())
+    # Choose the write target based on whether a `buffer` was provided
+    if isnothing(buffer)
+        # Write directly to file with options
+        Parquet2.writefile(filename, data; npages=npages, compression_codec=compression_codec,
+                           column_metadata=column_metadata, metadata=metadata)
+    else
+        # Write to the provided buffer
+        Parquet2.writefile(buffer, data; npages=npages, compression_codec=compression_codec,
+                           column_metadata=column_metadata, metadata=metadata)
+    end
+end
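+
+# A minimal usage sketch (the path and codec are illustrative; `:zstd` assumes
+# the corresponding codec is available to Parquet2 in your environment):
+#
+#   write_parquet(df, "data.parquet"; compression_codec=:zstd)
+#   read_parquet("data.parquet"; skip=1, n_max=10)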
path="multi_sheet_report.xlsx") - -XLSX.writetable("/Users/danielrizk/Downloads/report.xlsx", sheets) - - -XLSX.writetable("/Users/danielrizk/Downloads/report.xlsx", "REPORT_A" => df1, "REPORT_B" => df2) - -csv_path ="/Users/danielrizk/Downloads/TidierDB.jl/mtcars.csv" -read_csv(csv_path) - - -df = DataFrame(integers=[1, 2, 3, 4], strings=["This", "Package makes", "File reading/writing", "even smoother"], floats=[10.2, 20.3, 30.4, 40.5], dates=[Date(2018,2,20), Date(2018,2,21), Date(2018,2,22), Date(2018,2,23)], times=[Dates.Time(19,10), Dates.Time(19,20), Dates.Time(19,30), Dates.Time(19,40)], datetimes=[Dates.DateTime(2018,5,20,19,10), Dates.DateTime(2018,5,20,19,20), Dates.DateTime(2018,5,20,19,30), Dates.DateTime(2018,5,20,19,40)]) -df1 = DataFrames.DataFrame(COL1=[10,20,30], COL2=["First", "Second", "Third"]) - -mtcarsastsv = read_csv(csv_path, col_names = true) -write_tsv(mtcarsastsv, "/Users/danielrizk/Downloads/mtcars.tsv" ) -read_tsv("/Users/danielrizk/Downloads/mtcars.tsv", num_threads = 5) -write_csv(mtcars, "/Users/danielrizk/Downloads/mtcars.csv") - -read_csv("/Users/danielrizk/Downloads/mtcars.csv", col_names = true, num_threads = 5, missingstring = ["4"]) -read_delim("/Users/danielrizk/Downloads/mtcars.tsv", delim = "\t") -read_delim("/Users/danielrizk/Downloads/mtcars.csv", delim = ",") - -read_csv("/Users/danielrizk/Downloads/mtcars.tsv") -df = DataFrame(integers=[1, 2, 3, 4], strings=["This", "Package makes", "File reading/writing", "even smoother"], floats=[10.2, 20.3, 30.4, 40.5], dates=[Date(2018,2,20), Date(2018,2,21), Date(2018,2,22), Date(2018,2,23)], times=[Dates.Time(19,10), Dates.Time(19,20), Dates.Time(19,30), Dates.Time(19,40)], datetimes=[Dates.DateTime(2018,5,20,19,10), Dates.DateTime(2018,5,20,19,20), Dates.DateTime(2018,5,20,19,30), Dates.DateTime(2018,5,20,19,40)]) -write_csv(df, "/Users/danielrizk/Downloads/testing.csv" , col_names= true, num_threads = 2) -read_csv("/Users/danielrizk/Downloads/testing.csv", missingstring=["40.5", "10.2"]) - -tsv_path = "/Users/danielrizk/Downloads/pythonsratch/UPDATED_NLP_COURSE/TextFiles/moviereviews.tsv" - - -read_excel("https://freetestdata.com/wp-content/uploads/2021/09/Free_Test_Data_100KB_XLSX.xlsx") -read_tsv("/Users/danielrizk/opt/anaconda3/pkgs/gensim-4.1.2-py39he9d5cce_0/lib/python3.9/site-packages/gensim/test/test_data/wordsim353.tsv") -read_tsv(tsv_path,col_names = false) - - -read_fwf("/Users/danielrizk/Downloads/fwftest.txt") -read_table("/Users/danielrizk/Downloads/fwftest.txt", col_names= false) - -read_csv("https://github.com/tidyverse/readr/raw/main/inst/extdata/mtcars.csv", skip = 4, missingstring = ["1"]) -read_tsv("https://github.com/tidyverse/readr/raw/main/inst/extdata/mtcars.csv", skip = 4, missingstring = ["1"]) -read_delim("https://github.com/tidyverse/readr/raw/main/inst/extdata/mtcars.csv", skip = 4, missingstring = ["1"]) - -write_table(df, "/Users/danielrizk/Downloads/fwftest2.txt") -read_table( "/Users/danielrizk/Downloads/fwftest2.txt") - - -read_sas("/Users/danielrizk/Downloads/naws_all.sas7bdat", skip = 10, n_max=44 ) - -read_sav("/Users/danielrizk/Downloads/naws_all.sav", skip = 10, n_max=44) - -read_dta("/Users/danielrizk/Downloads/naws_all.dta", skip = 15, n_max=44, num_threads = 10) - -writestat("/Users/danielrizk/Downloads/test.dta", df) -using ReadStatTables -read_dta("https://www.dol.gov/sites/dolgov/files/ETA/naws/pdfs/NAWS_EPA.zip") - -readstat -using HTTP -col_names = ["Name", "Age", "ID", "Position", "Salary"] -df2 -widths_colnames = fwf_empty(path, num_lines=4, col_names = 
["Name", "Age", "ID", "Position", "Salary"]) -read_fwf(path, fwf_empty(path, num_lines=4, col_names = ["Name", "Age", "ID", "Position", "Salary"]), skip_to=3, n_max=3) - -read_fwf("testing_files/fwftest.txt", fwf_empty("testing_files/fwftest.txt", num_lines= 4)) -fwf_empty("testing_files/fwftest.txt") -df = DataFrames.DataFrame(AA=["sav", "por"], AB=[10.1, 10.2]); - -write_sav(df2 , "/Users/danielrizk/Downloads/test2.sav") -write_sav(df2 , "/Users/danielrizk/Downloads/test2.por") - - -read_dta( "/Users/danielrizk/Downloads/test2.dta") \ No newline at end of file