From 685107fcfdcff594e4fe122203a6c5f6796a7865 Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Mon, 11 Nov 2024 19:50:45 -0500 Subject: [PATCH 1/4] adds `col_types` and all args for csv.jl based readers --- src/TidierFiles.jl | 170 ++++++++++++++++++++++++++------------------- 1 file changed, 97 insertions(+), 73 deletions(-) diff --git a/src/TidierFiles.jl b/src/TidierFiles.jl index f75bc03..08c5b4c 100644 --- a/src/TidierFiles.jl +++ b/src/TidierFiles.jl @@ -31,6 +31,7 @@ $docstring_read_csv """ function read_csv(files; delim=',', + decimal='.', col_names=true, skip=0, n_max=Inf, @@ -39,7 +40,9 @@ function read_csv(files; missingstring="", escape_double=true, ntasks::Int = Threads.nthreads(), - num_threads::Union{Int, Nothing}=nothing) + num_threads::Union{Int, Nothing}=nothing, + col_types = nothing, + kwargs...) # Catch any other keyword arguments # Normalize input to always be a vector of files file_list = (typeof(files) <: AbstractString) ? [files] : files @@ -53,22 +56,25 @@ function read_csv(files; # Calculate skipto and header skipto = skip + (col_names === true ? 1 : 0) - # Prepare CSV reading options - read_options = ( - delim = delim, - header = col_names === true ? 1 : 0, - skipto = skipto + 1, - footerskip = 0, - select = col_select, - limit = limit, - comment = comment, - missingstring = missingstring, - escapechar = escape_double ? '"' : '\\', - quotechar = '"', - normalizenames = false, - ntasks = effective_ntasks > 1 + read_options = Dict( + :delim => delim, + :decimal => decimal, + :header => col_names === true ? 1 : 0, + :skipto => skipto + 1, + :footerskip => 0, + :select => col_select, + :limit => limit, + :comment => comment, + :missingstring => missingstring, + :escapechar => escape_double ? 
'"' : '\\', + :quotechar => '"', + :ntasks => effective_ntasks > 1, + :types => col_types ) + # Merge additional keyword arguments into the read_options dictionary + merge!(read_options, kwargs) + # Initialize an empty DataFrame final_df = DataFrame() @@ -94,22 +100,24 @@ function read_csv(files; return final_df end + """ $docstring_read_delim """ function read_delim(files; - delim='\t', - decimal='.', - col_names=true, - skip=0, - n_max=Inf, - groupmark=nothing, - col_select=nothing, - comment=nothing, - missingstring="", - escape_double=true, - ntasks::Int = Threads.nthreads(), - num_threads::Union{Int, Nothing}=nothing) + delim=',', + decimal='.', + col_names=true, + skip=0, + n_max=Inf, + col_select=nothing, + comment=nothing, + missingstring="", + escape_double=true, + ntasks::Int = Threads.nthreads(), + num_threads::Union{Int, Nothing}=nothing, + col_types = nothing, + kwargs...) # Normalize input to always be a vector of files file_list = (typeof(files) <: AbstractString) ? [files] : files @@ -124,22 +132,23 @@ function read_delim(files; skipto = skip + (col_names === true ? 1 : 0) # Prepare CSV reading options - read_options = ( - delim = delim, - decimal = decimal, - header = col_names === true ? 1 : 0, - skipto = skipto + 1, - select = col_select, - groupmark = groupmark, - footerskip = 0, - limit = limit, - comment = comment, - missingstring = missingstring, - escapechar = escape_double ? '"' : '\\', - quotechar = '"', - normalizenames = false, - ntasks = effective_ntasks > 1 + read_options = Dict( + :delim => delim, + :decimal => decimal, + :header => col_names === true ? 1 : 0, + :skipto => skipto + 1, + :select => col_select, + :footerskip => 0, + :limit => limit, + :comment => comment, + :missingstring => missingstring, + :escapechar => escape_double ? 
'"' : '\\', + :quotechar => '"', + :normalizenames => false, + :ntasks => effective_ntasks > 1, + :types => col_types ) + merge!(read_options, kwargs) # Initialize an empty DataFrame final_df = DataFrame() @@ -172,6 +181,7 @@ $docstring_read_tsv """ function read_tsv(files; delim='\t', + decimal='.', col_names=true, skip=0, n_max=Inf, @@ -180,7 +190,10 @@ function read_tsv(files; missingstring="", escape_double=true, ntasks::Int = Threads.nthreads(), - num_threads::Union{Int, Nothing}=nothing) + num_threads::Union{Int, Nothing}=nothing, + col_types = nothing, + groupmark=nothing, + kwargs...) # Normalize input to always be a vector of files file_list = (typeof(files) <: AbstractString) ? [files] : files @@ -194,22 +207,27 @@ function read_tsv(files; # Calculate skipto and header skipto = skip + (col_names === true ? 1 : 0) - # Prepare CSV reading options - read_options = ( - delim = delim, - header = col_names === true ? 1 : 0, - skipto = skipto + 1, - footerskip = 0, - limit = limit, - select = col_select, - comment = comment, - missingstring = missingstring, - escapechar = escape_double ? '"' : '\\', - quotechar = '"', - normalizenames = false, - ntasks = effective_ntasks > 1 + read_options = Dict( + :delim => delim, + :decimal => decimal, + :header => col_names === true ? 1 : 0, + :skipto => skipto + 1, + :select => col_select, + :groupmark => groupmark, + :footerskip => 0, + :limit => limit, + :comment => comment, + :missingstring => missingstring, + :escapechar => escape_double ? 
'"' : '\\', + :quotechar => '"', + :normalizenames => false, + :ntasks => effective_ntasks > 1, + :types => col_types ) + # Merge additional keyword arguments into the read_options dictionary + merge!(read_options, kwargs) + # Initialize an empty DataFrame final_df = DataFrame() @@ -251,7 +269,10 @@ function read_csv2(files; missingstring="", escape_double=true, ntasks::Int = Threads.nthreads(), - num_threads::Union{Int, Nothing}=nothing) + num_threads::Union{Int, Nothing}=nothing, + col_types = nothing, + kwargs... + ) # Normalize input to always be a vector of files file_list = (typeof(files) <: AbstractString) ? [files] : files @@ -266,22 +287,25 @@ function read_csv2(files; skipto = skip + (col_names === true ? 1 : 0) # Prepare CSV reading options - read_options = ( - delim = delim, - decimal = decimal, - header = col_names === true ? 1 : 0, - groupmark = groupmark, - skipto = skipto + 1, - footerskip = 0, - select = col_select, - limit = limit, - comment = comment, - missingstring = missingstring, - escapechar = escape_double ? '"' : '\\', - quotechar = '"', - normalizenames = false, - ntasks = effective_ntasks > 1 + read_options = Dict( + :delim => delim, + :decimal => decimal, + :header => col_names === true ? 1 : 0, + :skipto => skipto + 1, + :select => col_select, + :groupmark => groupmark, + :footerskip => 0, + :limit => limit, + :comment => comment, + :missingstring => missingstring, + :escapechar => escape_double ? 
'"' : '\\', + :quotechar => '"', + :normalizenames => false, + :ntasks => effective_ntasks > 1, + :types => col_types ) + + merge!(read_options, kwargs) # Initialize an empty DataFrame final_df = DataFrame() From d3b51a2cfcdf744478dd0e7f72afd6c074ad0176 Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Tue, 12 Nov 2024 18:23:11 -0500 Subject: [PATCH 2/4] docstring fix, missing string now missing value --- Project.toml | 2 +- README.md | 2 +- docs/examples/UserGuide/delim.jl | 28 ++++++------ docs/src/index.md | 6 +-- src/TidierFiles.jl | 32 +++++++------- src/docstrings.jl | 73 +++++++++++++++++++------------- src/xlfiles.jl | 6 +-- 7 files changed, 82 insertions(+), 67 deletions(-) diff --git a/Project.toml b/Project.toml index 5db264f..43c98cd 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierFiles" uuid = "8ae5e7a9-bdd3-4c93-9cc3-9df4d5d947db" authors = ["Daniel Rizk and contributors"] -version = "0.1.5" +version = "0.1.6" [deps] Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" diff --git a/README.md b/README.md index e3d4830..d38ba71 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ read_csv("testing.csv", missingstring=["40.5", "10.2"]) The file reading functions include the following keyword arguments: - `path` -- `missingstring` +- `missing_value` - `col_names` - `col_select` - `num_threads` diff --git a/docs/examples/UserGuide/delim.jl b/docs/examples/UserGuide/delim.jl index d4232de..baae173 100644 --- a/docs/examples/UserGuide/delim.jl +++ b/docs/examples/UserGuide/delim.jl @@ -4,15 +4,15 @@ using TidierFiles # ## read_csv/tsv/delim -read_csv("https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testing_files/csvtest.csv", skip = 2, n_max = 3, col_select = ["ID", "Score"], missingstring = ["4"]) +read_csv("https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testing_files/csvtest.csv", skip = 2, n_max = 3, col_select = ["ID", "Score"], missing_value = ["4"]) -#read_csv(file; delim=',', col_names=true, 
skip=0, n_max=Inf, comment=nothing, missingstring="", col_select=nothing, escape_double=true, col_types=nothing, num_threads=1) +#read_csv(file; delim=',', col_names=true, skip=0, n_max=Inf, comment=nothing, missing_value="", col_select=nothing, escape_double=true, col_types=nothing, num_threads=1) -#read_tsv(file; delim='\t', col_names=true, skip=0, n_max=Inf, comment=nothing, missingstring="", col_select=nothing, escape_double=true, col_types=nothing, num_threads=Threads.nthreads()) +#read_tsv(file; delim='\t', col_names=true, skip=0, n_max=Inf, comment=nothing, missing_value="", col_select=nothing, escape_double=true, col_types=nothing, num_threads=Threads.nthreads()) -#read_delim(file; delim='\t', decimal = '.', groupmark = nothing col_names=true, skip=0, n_max=Inf, comment=nothing, missingstring="", col_select=nothing, escape_double=true, col_types=nothing, num_threads=Threads.nthreads()) +#read_delim(file; delim='\t', decimal = '.', groupmark = nothing, col_names=true, skip=0, n_max=Inf, comment=nothing, missing_value="", col_select=nothing, escape_double=true, col_types=nothing, num_threads=Threads.nthreads()) -#read_csv2(file; delim=';', decimal = ',', col_names=true, skip=0, n_max=Inf, comment=nothing, missingstring="", col_select=nothing, escape_double=true, col_types=nothing, num_threads=Threads.nthreads()) +#read_csv2(file; delim=';', decimal = ',', col_names=true, skip=0, n_max=Inf, comment=nothing, missing_value="", col_select=nothing, escape_double=true, col_types=nothing, num_threads=Threads.nthreads()) #These functions read a delimited file (CSV, TSV, or custom delimiter) into a DataFrame. The arguments are: @@ -22,27 +22,27 @@ read_csv("https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testin # - `skip`: Number of lines to skip before reading data. Default is 0. # - `n_max`: Maximum number of rows to read. Default is `Inf` (read all rows). # - `comment`: Character indicating comment lines to ignore. Default is `nothing`. 
-# - `missingstring`: String(s) representing missing values. Default is `""`. +# - `missing_value`: String(s) representing missing values. Default is `""`. # - `col_select`: Optional vector of symbols or strings to select columns to load. Default is `nothing`. # - `groupmark`: A symbol that separates groups of digits Default is `nothing`. # - `decimal`: An ASCII Char argument that is used when parsing float values. Default is '.'. # - `escape_double`: Interpret two consecutive quote characters as a single quote. Default is `true`. -# - `col_types`: Optional specification of column types. Default is `nothing` (types are inferred). +# - `col_types`: Optional specification of column types using a Dict. Default is `nothing` (types are inferred). # - `num_threads`: Number of threads to use for parallel execution. Default is 1 for `read_csv` and the number of available threads for `read_tsv` and `read_delim`. # The functions return a DataFrame containing the parsed data from the file. # ## `write_csv` and `write_tsv` -# write_csv(x, file; missingstring="", append=false, col_names=true, eol="\n", num_threads=Threads.nthreads()) +# write_csv(x, file; missing_value="", append=false, col_names=true, eol="\n", num_threads=Threads.nthreads()) -# write_tsv(x, file; missingstring="", append=false, col_names=true, eol="\n", num_threads=Threads.nthreads()) +# write_tsv(x, file; missing_value="", append=false, col_names=true, eol="\n", num_threads=Threads.nthreads()) # These functions write a DataFrame to a CSV or TSV file. The arguments are: # - `x`: The DataFrame to write. # - `file`: The path to the output file. -# - `missingstring`: The string to represent missing values. Default is an empty string. +# - `missing_value`: The string to represent missing values. Default is an empty string. # - `append`: Whether to append to an existing file. Default is `false`. # - `col_names`: Whether to write column names as the first line. Default is `true`. 
# - `eol`: The end-of-line character. Default is `"\n"`. @@ -50,7 +50,7 @@ read_csv("https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testin # ## `read_table` -# read_table(file; col_names=true, skip=0, n_max=Inf, comment=nothing, col_select=nothing, missingstring="", num_threads) +# read_table(file; col_names=true, skip=0, n_max=Inf, comment=nothing, col_select=nothing, missing_value="", num_threads) # This function reads a table from a whitespace-delimited file into a DataFrame. The arguments are: @@ -60,18 +60,18 @@ read_csv("https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testin # - `n_max`: Maximum number of lines to read. Default is `Inf` (read all lines). # - `comment`: Character or string indicating comment lines to ignore. Default is `nothing`. # - `col_select`: Optional vector of symbols or strings to select columns to load. Default is `nothing`. -# - `missingstring`: The string representing missing values. Default is `""`. +# - `missing_value`: The string representing missing values. Default is `""`. # - `num_threads`: The number of threads to use for writing. Default is the number of available threads. # ## `write_table` -# write_table(x, file; delim='\t', missingstring="", append=false, col_names=true, eol="\n", num_threads=Threads.nthreads()) +# write_table(x, file; delim='\t', missing_value="", append=false, col_names=true, eol="\n", num_threads=Threads.nthreads()) # This function writes a DataFrame to a file with customizable delimiter and options. The arguments are: # - `x`: The DataFrame to write. # - `file`: The path to the output file. # - `delim`: The field delimiter. Default is `'\t'` (tab-separated). -# - `missingstring`: The string to represent missing values. Default is `""`. +# - `missing_value`: The string to represent missing values. Default is `""`. # - `append`: Whether to append to an existing file. Default is `false`. # - `col_names`: Whether to write column names as the first line. Default is `true`. 
# - `eol`: The end-of-line character. Default is `"\n"`. diff --git a/docs/src/index.md b/docs/src/index.md index b905a7b..24830b1 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -45,7 +45,7 @@ df = DataFrame( write_csv(df, "testing.csv" , col_names = true) -read_csv("testing.csv", missingstring=["40.5", "10.2"]) +read_csv("testing.csv", missing_value=["40.5", "10.2"]) ``` ``` @@ -61,7 +61,7 @@ read_csv("testing.csv", missingstring=["40.5", "10.2"]) The file reading functions include the following keyword arguments: - `path` -- `missingstring` +- `missing_value` - `col_names` - `col_select` - `num_threads` @@ -72,7 +72,7 @@ The file reading functions include the following keyword arguments: The path can be a file available either locally or on the web. ```julia -read_csv("https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testing_files/csvtest.csv", skip = 2, n_max = 3, col_select = ["ID", "Score"], missingstring = ["4"]) +read_csv("https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testing_files/csvtest.csv", skip = 2, n_max = 3, col_select = ["ID", "Score"], missing_value = ["4"]) ``` ``` 3×2 DataFrame diff --git a/src/TidierFiles.jl b/src/TidierFiles.jl index 08c5b4c..a7a5961 100644 --- a/src/TidierFiles.jl +++ b/src/TidierFiles.jl @@ -37,7 +37,7 @@ function read_csv(files; n_max=Inf, col_select=nothing, comment=nothing, - missingstring="", + missing_value="", escape_double=true, ntasks::Int = Threads.nthreads(), num_threads::Union{Int, Nothing}=nothing, @@ -65,7 +65,7 @@ function read_csv(files; :select => col_select, :limit => limit, :comment => comment, - :missingstring => missingstring, + :missingstring => missing_value, :escapechar => escape_double ? 
'"' : '\\', :quotechar => '"', :ntasks => effective_ntasks > 1, @@ -112,7 +112,7 @@ function read_delim(files; n_max=Inf, col_select=nothing, comment=nothing, - missingstring="", + missing_value="", escape_double=true, ntasks::Int = Threads.nthreads(), num_threads::Union{Int, Nothing}=nothing, @@ -141,7 +141,7 @@ function read_delim(files; :footerskip => 0, :limit => limit, :comment => comment, - :missingstring => missingstring, + :missingstring => missing_value, :escapechar => escape_double ? '"' : '\\', :quotechar => '"', :normalizenames => false, @@ -187,7 +187,7 @@ function read_tsv(files; n_max=Inf, col_select=nothing, comment=nothing, - missingstring="", + missing_value="", escape_double=true, ntasks::Int = Threads.nthreads(), num_threads::Union{Int, Nothing}=nothing, @@ -217,7 +217,7 @@ function read_tsv(files; :footerskip => 0, :limit => limit, :comment => comment, - :missingstring => missingstring, + :missingstring => missing_value, :escapechar => escape_double ? '"' : '\\', :quotechar => '"', :normalizenames => false, @@ -266,7 +266,7 @@ function read_csv2(files; n_max=Inf, col_select=nothing, comment=nothing, - missingstring="", + missing_value="", escape_double=true, ntasks::Int = Threads.nthreads(), num_threads::Union{Int, Nothing}=nothing, @@ -297,7 +297,7 @@ function read_csv2(files; :footerskip => 0, :limit => limit, :comment => comment, - :missingstring => missingstring, + :missingstring => missing_value, :escapechar => escape_double ? '"' : '\\', :quotechar => '"', :normalizenames => false, @@ -348,7 +348,7 @@ function read_table(file; n_max=Inf, comment=nothing, col_select=nothing, - missingstring="", + missing_value="", kwargs...) 
# Open the file and preprocess the lines processed_lines, header = open(file, "r") do io @@ -391,7 +391,7 @@ function read_table(file; df = CSV.File(IOBuffer(join(processed_lines, "\n")); delim=' ', header=header_option, # Pass correct header - missingstring=missingstring, + missingstring=missing_value, select=col_select, kwargs...) |> DataFrame @@ -404,7 +404,7 @@ $docstring_write_csv function write_csv( x::DataFrame, file::String; - missingstring::String = "NA", + missing_value::String = "NA", append::Bool = false, col_names::Bool = true, eol::String = "\n", @@ -416,7 +416,7 @@ function write_csv( x, append = append, header = col_names && !append, - missingstring = missingstring, + missingstring = missing_value, newline = eol, threaded = num_threads > 1 ) end @@ -427,7 +427,7 @@ $docstring_write_tsv function write_tsv( x::DataFrame, file::String; - missingstring::String = "", + missing_value::String = "", append::Bool = false, col_names::Bool = true, eol::String = "\n", @@ -440,7 +440,7 @@ function write_tsv( delim = '\t', # Use tab as the delimiter for TSV append = append, header = col_names && !append, - missingstring = missingstring, + missingstring = missing_value, newline = eol, threaded = num_threads > 1) end @@ -452,7 +452,7 @@ function write_table( x::DataFrame, file::String; delim::Char = '\t', # Default to TSV, but allow flexibility - missingstring::String = "", + missing_value::String = "", append::Bool = false, col_names::Bool = true, eol::String = "\n", @@ -465,7 +465,7 @@ function write_table( delim = delim, # Flexible delimiter based on argument append = append, header = col_names && !append, - missingstring = missingstring, + missingstring = missing_value, newline = eol, threaded = num_threads > 1) end diff --git a/src/docstrings.jl b/src/docstrings.jl index 622f02f..814650c 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -1,19 +1,21 @@ const docstring_read_csv = """ read_csv(file; delim=',',col_names=true, skip=0, n_max=Inf, - 
comment=nothing, missingstring="", col_select, escape_double=true, col_types=nothing, num_threads = 1) + comment=nothing, missing_value="", col_select, escape_double=true, col_types=nothing, num_threads = 1) Reads a CSV file or URL into a DataFrame, with options to specify delimiter, column names, and other CSV parsing options. # Arguments - `file`: Path or vector of paths to the CSV file or a URL to a CSV file. - `delim`: The character delimiting fields in the file. Default is ','. +- `decimal`: Character argument for what character decimal should be. Default is `.` - `col_names`: Indicates if the first row of the CSV is used as column names. Can be true, false, or an array of strings. Default is true. - `skip`: Number of initial lines to skip before reading data. Default is 0. - `n_max`: Maximum number of rows to read. Default is Inf (read all rows). - `col_select`: Optional vector of symbols or strings to select which columns to load. +- `col_types`: Optional Dict to allow for column type specification - `comment`: Character that starts a comment line. Lines beginning with this character are ignored. Default is nothing (no comment lines). -- `missingstring`: String that represents missing values in the CSV. Default is "", can be set to a vector of multiple items. +- `missing_value`: String that represents missing values in the CSV. Default is "", can be set to a vector of multiple items. - `escape_double`: Indicates whether to interpret two consecutive quote characters as a single quote in the data. Default is true. - `num_threads`: specifies the number of concurrent tasks or threads to use for processing, allowing for parallel execution. 
Defaults to 1 # Examples @@ -22,33 +24,44 @@ julia> df = DataFrame(ID = 1:5, Name = ["Alice", "Bob", "Charlie", "David", "Eva julia> write_csv(df, "csvtest.csv"); -julia> read_csv("csvtest.csv", skip = 2, n_max = 3, missingstring = ["95", "Charlie"]) +julia> read_csv("csvtest.csv", skip = 2, n_max = 3, missing_value = ["95", "Charlie"]) 3×3 DataFrame - Row │ ID Name Score - │ Int64 String7 Int64? + Row │ ID Name Score + │ Int64 String7? Int64? +─────┼────────────────────────── + 1 │ 3 missing 77 + 2 │ 4 David 85 + 3 │ 5 Eva missing + +julia> read_csv("csvtest.csv", skip = 2, n_max = 3, col_types = Dict(:ID => Float64)) +3×3 DataFrame + Row │ ID Name Score + │ Float64 String7 Int64 ─────┼───────────────────────── - 1 │ 3 missing 77 - 2 │ 4 David 85 - 3 │ 5 Eva missing + 1 │ 3.0 Charlie 77 + 2 │ 4.0 David 85 + 3 │ 5.0 Eva 95 ``` """ const docstring_read_tsv = """ read_tsv(file; delim='\t',col_names=true, skip=0, n_max=Inf, - comment=nothing, missingstring="", col_select, escape_double=true, col_types=nothing) + comment=nothing, missing_value="", col_select, escape_double=true, col_types=nothing) Reads a TSV file or URL into a DataFrame, with options to specify delimiter, column names, and other CSV parsing options. # Arguments - `file`: Path or vector of paths to the TSV file or a URL to a TSV file. - `delim`: The character delimiting fields in the file. Default is ','. +- `decimal`: Character argument for what character decimal should be. Default is `.` - `col_names`: Indicates if the first row of the CSV is used as column names. Can be true, false, or an array of strings. Default is true. - `skip`: Number of initial lines to skip before reading data. Default is 0. - `n_max`: Maximum number of rows to read. Default is Inf (read all rows). - `col_select`: Optional vector of symbols or strings to select which columns to load. - `comment`: Character that starts a comment line. Lines beginning with this character are ignored. Default is nothing (no comment lines). 
-- `missingstring`: String that represents missing values in the CSV. Default is "", can be set to a vector of multiple items. +- `col_types`: Optional Dict to allow for column type specification +- `missing_value`: String that represents missing values in the CSV. Default is "", can be set to a vector of multiple items. - `escape_double`: Indicates whether to interpret two consecutive quote characters as a single quote in the data. Default is true. - `num_threads`: specifies the number of concurrent tasks or threads to use for processing, allowing for parallel execution. Default is the number of available threads. @@ -58,33 +71,35 @@ julia> df = DataFrame(ID = 1:5, Name = ["Alice", "Bob", "Charlie", "David", "Eva julia> write_tsv(df, "tsvtest.tsv"); -julia> read_tsv("tsvtest.tsv", skip = 2, n_max = 3, missingstring = ["Charlie"]) +julia> read_tsv("tsvtest.tsv", skip = 2, n_max = 3, missing_value = ["Charlie"]) 3×3 DataFrame - Row │ ID Name Score - │ Int64 String7 Int64 -─────┼─────────────────────── - 1 │ 3 missing 77 - 2 │ 4 David 85 - 3 │ 5 Eva 95 + Row │ ID Name Score + │ Int64 String7? Int64 +─────┼──────────────────────── + 1 │ 3 missing 77 + 2 │ 4 David 85 + 3 │ 5 Eva 95 ``` """ const docstring_read_delim = """ read_delim(file; delim='\t',col_names=true, skip=0, n_max=Inf, - comment=nothing, missingstring="", col_select, escape_double=true, col_types=nothing) + comment=nothing, missing_value="", col_select, escape_double=true, col_types=nothing) Reads a delimited file or URL into a DataFrame, with options to specify delimiter, column names, and other CSV parsing options. # Arguments - `file`: Path or vector of paths to the CSV file or a URL to a CSV file. - `delim`: The character delimiting fields in the file. Default is ','. +- `decimal`: Character argument for what character decimal should be. Default is `.` - `col_names`: Indicates if the first row of the CSV is used as column names. Can be true, false, or an array of strings. Default is true. 
- `skip`: Number of initial lines to skip before reading data. Default is 0. - `n_max`: Maximum number of rows to read. Default is Inf (read all rows). - `col_select`: Optional vector of symbols or strings to select which columns to load. - `comment`: Character that starts a comment line. Lines beginning with this character are ignored. Default is nothing (no comment lines). -- `missingstring`: String that represents missing values in the CSV. Default is "", can be set to a vector of multiple items. +- `col_types`: Optional Dict to allow for column type specification +- `missing_value`: String that represents missing values in the CSV. Default is "", can be set to a vector of multiple items. - `escape_double`: Indicates whether to interpret two consecutive quote characters as a single quote in the data. Default is true. - `col_types`: An optional specification of column types, can be a single type applied to all columns, or a collection of types with one for each column. Default is nothing (types are inferred). - `num_threads`: specifies the number of concurrent tasks or threads to use for processing, allowing for parallel execution. Default is the number of available threads. @@ -178,13 +193,13 @@ julia> fwf_empty(path, num_lines=4, col_names = ["Name", "Age", "ID", "Position" const docstring_write_csv = """ - write_csv(DataFrame, filepath; na = "", append = false, col_names = true, missingstring, eol = "\n", num_threads = Threads.nthreads()) + write_csv(DataFrame, filepath; na = "", append = false, col_names = true, missing_value, eol = "\n", num_threads = Threads.nthreads()) Write a DataFrame to a CSV (comma-separated values) file. # Arguments - `x`: The DataFrame to write to the CSV file. - `file`: The path to the output CSV file. -- `missingstring`: = "": The string to represent missing values in the output file. Default is an empty string. +- `missing_value`: = "": The string to represent missing values in the output file. Default is an empty string. 
- `append`: Whether to append to the file if it already exists. Default is false. - `col_names`: = true: Whether to write column names as the first line of the file. Default is true. - `eol`: = "\n": The end-of-line character to use in the output file. Default is the newline character. @@ -200,13 +215,13 @@ julia> write_csv(df, "csvtest.csv"); const docstring_write_tsv = """ - write_tsv(DataFrame, filepath; na = "", append = false, col_names = true, missingstring, eol = "\n", num_threads = Threads.nthreads()) + write_tsv(DataFrame, filepath; na = "", append = false, col_names = true, missing_value, eol = "\n", num_threads = Threads.nthreads()) Write a DataFrame to a TSV (tab-separated values) file. # Arguments - `x`: The DataFrame to write to the TSV file. - `file`: The path to the output TSV file. -- `missingstring`: = "": The string to represent missing values in the output file. Default is an empty string. +- `missing_value`: = "": The string to represent missing values in the output file. Default is an empty string. - `append`: Whether to append to the file if it already exists. Default is false. - `col_names`: = true: Whether to write column names as the first line of the file. Default is true. - `eol`: = "\n": The end-of-line character to use in the output file. Default is the newline character. @@ -222,7 +237,7 @@ julia> write_tsv(df, "tsvtest.tsv"); const docstring_read_table = """ - read_table(file; col_names=true, skip=0, n_max=Inf, comment=nothing, col_select, missingstring="", kwargs...) + read_table(file; col_names=true, skip=0, n_max=Inf, comment=nothing, col_select, missing_value="", kwargs...) Read a table from a file where columns are separated by any amount of whitespace, processing it into a DataFrame. @@ -233,7 +248,7 @@ Read a table from a file where columns are separated by any amount of whitespace - `n_max`: The maximum number of lines to read from the file, after skipping. Inf means read all lines. 
- `col_select`: Optional vector of symbols or strings to select which columns to load. - `comment`: A character or string indicating the start of a comment. Lines starting with this character are ignored. -- `missingstring`: The string that represents missing values in the table. +- `missing_value`: The string that represents missing values in the table. - `kwargs`: Additional keyword arguments passed to CSV.File. # Examples ```jldoctest @@ -262,7 +277,7 @@ Write a DataFrame to a file, allowing for customization of the delimiter and oth - `x`: The DataFrame to write to a file. - `file`: The path to the file where the DataFrame will be written. -delim: Character to use as the field delimiter. The default is tab ('\t'), making it a TSV (tab-separated values) file by default, but can be changed to accommodate other formats. -- `missingstring`: The string to represent missing data in the output file. +- `missing_value`: The string to represent missing data in the output file. - `append`: Whether to append to the file if it already exists. If false, the file will be overwritten. - `col_names`: Whether to write column names as the first line of the file. If appending to an existing file with append = true, column names will not be written regardless of this parameter's value. - `eol`: The end-of-line character to use in the file. Defaults to "\n". @@ -278,7 +293,7 @@ julia> write_table(df, "tabletest.txt"); const docstring_read_xlsx = """ - read_xlsx(path; sheet, range, col_names, col_types, missingstring, trim_ws, skip, n_max, guess_max) + read_xlsx(path; sheet, range, col_names, col_types, missing_value, trim_ws, skip, n_max, guess_max) Read data from an Excel file into a DataFrame. # Arguments @@ -287,7 +302,7 @@ Read data from an Excel file into a DataFrame. - `range`: Specifies a specific range of cells to be read from the sheet. If nothing, the entire sheet is read. 
- `col_names`: Indicates whether the first row of the specified range should be treated as column names. If false, columns will be named automatically. - `col_types`: Allows specifying column types explicitly. Can be a single type applied to all columns, a list or a dictionary mapping column names or indices to types. If nothing, types will be inferred. -- `missingstring`: The value or vector that represents missing values in the Excel file. +- `missing_value`: The value or vector that represents missing values in the Excel file. - `trim_ws`: Whether to trim leading and trailing whitespace from cells in the Excel file. - `skip`: Number of rows to skip at the beginning of the sheet or range before reading data. - `n_max`: The maximum number of rows to read from the sheet or range, after skipping. Inf means read all available rows. @@ -303,7 +318,7 @@ julia> df2 = DataFrame(AA=["aa", "bb"], AB=[10.1, 10.2]); julia> write_xlsx(("REPORT_A" => df, "REPORT_B" => df2); path="xlsxtest.xlsx", overwrite = true); -julia> read_xlsx("xlsxtest.xlsx", sheet = "REPORT_A", skip = 1, n_max = 4, missingstring = [2]) +julia> read_xlsx("xlsxtest.xlsx", sheet = "REPORT_A", skip = 1, n_max = 4, missing_value = [2]) 3×3 DataFrame Row │ integers strings floats │ Int64? String? Float64? 
diff --git a/src/xlfiles.jl b/src/xlfiles.jl index dfde3e5..cc0c0e7 100644 --- a/src/xlfiles.jl +++ b/src/xlfiles.jl @@ -46,7 +46,7 @@ function read_xlsx( sheet = nothing, range = nothing, col_names = true, - missingstring = "", + missing_value = "", trim_ws = true, skip = 0, n_max = Inf @@ -87,8 +87,8 @@ function read_xlsx( end # Replace missing strings with `missing` if applicable - if !isempty(missingstring) - for missing_value in missingstring + if !isempty(missing_value) + for missing_value in missing_value for col in names(data) data[!, col] = replace(data[!, col], missing_value => missing) end From 87c0a3f142d8bd632608bbce1096bf9fafcaac1e Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Tue, 12 Nov 2024 19:50:17 -0500 Subject: [PATCH 3/4] final doc update --- docs/examples/UserGuide/delim.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/examples/UserGuide/delim.jl b/docs/examples/UserGuide/delim.jl index baae173..0d0ad05 100644 --- a/docs/examples/UserGuide/delim.jl +++ b/docs/examples/UserGuide/delim.jl @@ -29,7 +29,7 @@ read_csv("https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testin # - `escape_double`: Interpret two consecutive quote characters as a single quote. Default is `true`. # - `col_types`: Optional specification of column types using a Dict. Default is `nothing` (types are inferred). # - `num_threads`: Number of threads to use for parallel execution. Default is 1 for `read_csv` and the number of available threads for `read_tsv` and `read_delim`. - +# - `kwargs`: Any additional CSV.jl keyword argument can be passed to any of the above functions; it is forwarded to the underlying CSV.jl reader. # The functions return a DataFrame containing the parsed data from the file. 
# ## `write_csv` and `write_tsv` From c190c366e1cd4d01e7712d5bf9ead589d2b5fb5e Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Tue, 12 Nov 2024 20:04:05 -0500 Subject: [PATCH 4/4] one last docstring change --- src/docstrings.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/docstrings.jl b/src/docstrings.jl index 814650c..c0c469f 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -101,7 +101,6 @@ Reads a delimited file or URL into a DataFrame, with options to specify delimite - `col_types`: Optional Dict to allow for column type specification - `missing_value`: String that represents missing values in the CSV. Default is "", can be set to a vector of multiple items. - `escape_double`: Indicates whether to interpret two consecutive quote characters as a single quote in the data. Default is true. -- `col_types`: An optional specification of column types, can be a single type applied to all columns, or a collection of types with one for each column. Default is nothing (types are inferred). - `num_threads`: specifies the number of concurrent tasks or threads to use for processing, allowing for parallel execution. Default is the number of available threads. # Examples @@ -302,7 +301,7 @@ Read data from an Excel file into a DataFrame. - `range`: Specifies a specific range of cells to be read from the sheet. If nothing, the entire sheet is read. - `col_names`: Indicates whether the first row of the specified range should be treated as column names. If false, columns will be named automatically. - `col_types`: Allows specifying column types explicitly. Can be a single type applied to all columns, a list or a dictionary mapping column names or indices to types. If nothing, types will be inferred. -- `missing_value`: The value or vector that represents missing values in the Excel file. +- `missing_value`: The value or vector that represents missing values in the Excel file. 
Unlike the CSV.jl-based functions, the values do not all need to be written as strings. - `trim_ws`: Whether to trim leading and trailing whitespace from cells in the Excel file. - `skip`: Number of rows to skip at the beginning of the sheet or range before reading data. - `n_max`: The maximum number of rows to read from the sheet or range, after skipping. Inf means read all available rows.