Skip to content

Commit

Permalink
adds decimal option support
Browse files Browse the repository at this point in the history
  • Loading branch information
drizk1 committed Jul 6, 2024
1 parent 2dc574c commit e6df846
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 3 deletions.
8 changes: 6 additions & 2 deletions docs/examples/UserGuide/delim.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ read_csv("https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testin

#read_tsv(file; delim='\t', col_names=true, skip=0, n_max=Inf, comment=nothing, missingstring="", col_select=nothing, escape_double=true, col_types=nothing, num_threads=Threads.nthreads())

#read_delim(file; delim='\t', col_names=true, skip=0, n_max=Inf, comment=nothing, missingstring="", col_select=nothing, escape_double=true, col_types=nothing, num_threads=Threads.nthreads())
#read_delim(file; delim='\t', decimal = '.', groupmark = nothing, col_names=true, skip=0, n_max=Inf, comment=nothing, missingstring="", col_select=nothing, escape_double=true, col_types=nothing, num_threads=Threads.nthreads())

#read_csv2(file; delim=';', decimal = ',', groupmark = nothing, col_names=true, skip=0, n_max=Inf, comment=nothing, missingstring="", col_select=nothing, escape_double=true, col_types=nothing, num_threads=Threads.nthreads())

#These functions read a delimited file (CSV, TSV, or custom delimiter) into a DataFrame. The arguments are:

Expand All @@ -22,13 +24,15 @@ read_csv("https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testin
# - `comment`: Character indicating comment lines to ignore. Default is `nothing`.
# - `missingstring`: String(s) representing missing values. Default is `""`.
# - `col_select`: Optional vector of symbols or strings to select columns to load. Default is `nothing`.
# - `groupmark`: A symbol that separates groups of digits. Default is `nothing`.
# - `decimal`: An ASCII `Char` used as the decimal mark when parsing float values. Default is `'.'`.
# - `escape_double`: Interpret two consecutive quote characters as a single quote. Default is `true`.
# - `col_types`: Optional specification of column types. Default is `nothing` (types are inferred).
# - `num_threads`: Number of threads to use for parallel execution. Default is 1 for `read_csv` and the number of available threads for `read_tsv` and `read_delim`.

# The functions return a DataFrame containing the parsed data from the file.

# ## `write_csv` and # ## `write_tsv`
# ## `write_csv` and `write_tsv`

# write_csv(x, file; missingstring="", append=false, col_names=true, eol="\n", num_threads=Threads.nthreads())

Expand Down
82 changes: 81 additions & 1 deletion src/TidierFiles.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ using Arrow

export read_csv, write_csv, read_tsv, write_tsv, read_table, write_table, read_delim, read_xlsx, write_xlsx,
read_fwf, write_fwf, fwf_empty, fwf_positions, fwf_positions, read_sav, read_sas, read_dta, write_sav, write_sas,
write_dta, read_arrow, write_arrow, read_parquet, write_parquet
write_dta, read_arrow, write_arrow, read_parquet, write_parquet, read_csv2


include("docstrings.jl")
Expand Down Expand Up @@ -93,9 +93,11 @@ $docstring_read_delim
"""
function read_delim(file;
delim='\t',
decimal = '.',
col_names=true,
skip=0,
n_max=Inf,
groupmark=nothing,
col_select=nothing,
comment=nothing,
missingstring="",
Expand All @@ -115,9 +117,11 @@ function read_delim(file;
# Prepare arguments for CSV.read, including the effective number of tasks to use
read_options = (
delim = delim,
decimal = decimal,
header = col_names === true ? 1 : 0,
skipto = skipto + 1,
select = col_select,
groupmark=groupmark,
footerskip = 0,
limit = limit,
comment = comment,
Expand Down Expand Up @@ -207,6 +211,82 @@ function read_tsv(file;
return df
end


#"""
#$docstring_read_csv2
#"""
"""
    read_csv2(file; delim=';', decimal=',', col_names=true, groupmark=nothing,
              skip=0, n_max=Inf, col_select=nothing, comment=nothing,
              missingstring="", escape_double=true, col_types=nothing,
              ntasks=Threads.nthreads(), num_threads=nothing)

Read a delimited file into a `DataFrame`, defaulting to `;` as the field delimiter
and `,` as the decimal mark.

# Arguments
- `file`: A local path, or a string starting with `http://` or `https://`, in which
  case the content is downloaded with `HTTP.get` and parsed from memory. Any
  non-string value is handed to `CSV.File` unchanged.

# Keywords
- `delim`: Field delimiter. Default is `';'`.
- `decimal`: Character used as the decimal mark. Default is `','`.
- `col_names`: When exactly `true`, line 1 is used as the header; any other value
  means the file is read without a header row. Default is `true`.
- `groupmark`: Digit-group separator forwarded to `CSV.File`. Default is `nothing`.
- `skip`: Offset added to the first data line. Data starts at line `skip + 2` when
  `col_names` is `true` and at line `skip + 1` otherwise. Default is `0`.
- `n_max`: Maximum number of rows to read; `Inf` means no limit. Default is `Inf`.
- `col_select`: Column selection forwarded as `select`. Default is `nothing`.
- `comment`: Comment marker forwarded to `CSV.File`. Default is `nothing`.
- `missingstring`: String(s) representing missing values. Default is an empty string.
- `escape_double`: When `true` the double quote is used as the escape character;
  when `false` a backslash is used. Default is `true`.
- `col_types`: Column type specification forwarded as `types`. Default is `nothing`.
- `ntasks`: Task count used when `num_threads` is not given.
- `num_threads`: Overrides `ntasks` when not `nothing`. Default is `nothing`.

# Returns
A `DataFrame` built from the parsed file.

# Throws
An `ErrorException` when a URL download answers with a status other than 200.
"""
function read_csv2(file;
                   delim=';',
                   decimal=',',
                   col_names=true,
                   groupmark=nothing,
                   skip=0,
                   n_max=Inf,
                   col_select=nothing,
                   comment=nothing,
                   missingstring="",
                   escape_double=true,
                   col_types=nothing,
                   ntasks::Int=Threads.nthreads(),
                   num_threads::Union{Int, Nothing}=nothing)
    # `num_threads` takes precedence over `ntasks` when both are supplied.
    effective_ntasks = isnothing(num_threads) ? ntasks : num_threads

    # `limit = nothing` (rather than `Inf`) is what is passed on to mean "no row limit".
    limit = isinf(n_max) ? nothing : Int(n_max)

    # The header, when present, is always taken from line 1; `skip` only moves the
    # first data line.
    has_header = col_names === true
    first_data_row = skip + (has_header ? 2 : 1)

    read_options = (
        delim = delim,
        decimal = decimal,
        header = has_header ? 1 : 0,
        groupmark = groupmark,
        skipto = first_data_row,
        footerskip = 0,
        select = col_select,
        limit = limit,
        comment = comment,
        missingstring = missingstring,
        escapechar = escape_double ? '"' : '\\',
        quotechar = '"',
        normalizenames = false,
        # NOTE(review): this forwards a Bool, not the task count, so the requested
        # number of threads is presumably never honoured by CSV.File. Kept as-is to
        # preserve current parsing behaviour (notably exact `n_max`); confirm intent.
        ntasks = effective_ntasks > 1,
        types = col_types,
    )

    # Only strings can be URLs; the type guard keeps IO objects and path types from
    # failing in `startswith` before they ever reach CSV.File.
    is_url = file isa AbstractString &&
             (startswith(file, "http://") || startswith(file, "https://"))

    if is_url
        response = HTTP.get(file)

        if response.status != 200
            error("Failed to fetch the CSV file: HTTP status code ", response.status)
        end

        # Parse straight from the downloaded bytes.
        df = CSV.File(IOBuffer(response.body); read_options...) |> DataFrame
    else
        df = CSV.File(file; read_options...) |> DataFrame
    end

    return df
end








"""
$docstring_read_table
"""
Expand Down

0 comments on commit e6df846

Please sign in to comment.