adds decimal option support

TidierOrg · Jul 6, 2024 · e6df846 · e6df846
1 parent 2dc574c
commit e6df846
Show file tree

Hide file tree

Showing 2 changed files with 87 additions and 3 deletions.
diff --git a/docs/examples/UserGuide/delim.jl b/docs/examples/UserGuide/delim.jl
@@ -10,7 +10,9 @@ read_csv("https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testin
 
 #read_tsv(file; delim='\t', col_names=true, skip=0, n_max=Inf, comment=nothing, missingstring="", col_select=nothing, escape_double=true, col_types=nothing, num_threads=Threads.nthreads())
 
-#read_delim(file; delim='\t', col_names=true, skip=0, n_max=Inf, comment=nothing, missingstring="", col_select=nothing, escape_double=true, col_types=nothing, num_threads=Threads.nthreads())
+#read_delim(file; delim='\t', decimal='.', groupmark=nothing, col_names=true, skip=0, n_max=Inf, comment=nothing, missingstring="", col_select=nothing, escape_double=true, col_types=nothing, num_threads=Threads.nthreads())
+
+#read_csv2(file; delim=';', decimal=',', groupmark=nothing, col_names=true, skip=0, n_max=Inf, comment=nothing, missingstring="", col_select=nothing, escape_double=true, col_types=nothing, num_threads=Threads.nthreads())
 
 #These functions read a delimited file (CSV, TSV, or custom delimiter) into a DataFrame. The arguments are:
 
@@ -22,13 +24,15 @@ read_csv("https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testin
 # - `comment`: Character indicating comment lines to ignore. Default is `nothing`.
 # - `missingstring`: String(s) representing missing values. Default is `""`.
 # - `col_select`: Optional vector of symbols or strings to select columns to load. Default is `nothing`.
+# - `groupmark`: A character that separates groups of digits (for example, the thousands separator in `1,000`). Default is `nothing`.
+# - `decimal`: An ASCII `Char` used as the decimal mark when parsing float values. Default is `'.'` (`','` for `read_csv2`).
 # - `escape_double`: Interpret two consecutive quote characters as a single quote. Default is `true`.
 # - `col_types`: Optional specification of column types. Default is `nothing` (types are inferred).
 # - `num_threads`: Number of threads to use for parallel execution. Default is 1 for `read_csv` and the number of available threads for `read_tsv` and `read_delim`.
 
 # The functions return a DataFrame containing the parsed data from the file.
 
-# ## `write_csv` and # ## `write_tsv`
+# ## `write_csv` and `write_tsv`
 
 # write_csv(x, file; missingstring="", append=false, col_names=true, eol="\n", num_threads=Threads.nthreads())
 

diff --git a/src/TidierFiles.jl b/src/TidierFiles.jl
@@ -14,7 +14,7 @@ using Arrow
 
 export read_csv, write_csv, read_tsv, write_tsv, read_table, write_table, read_delim, read_xlsx, write_xlsx, 
  read_fwf, write_fwf, fwf_empty, fwf_positions, fwf_positions, read_sav, read_sas, read_dta, write_sav, write_sas, 
- write_dta, read_arrow, write_arrow, read_parquet, write_parquet
+ write_dta, read_arrow, write_arrow, read_parquet, write_parquet, read_csv2
 
 
 include("docstrings.jl")
@@ -93,9 +93,11 @@ $docstring_read_delim
 """
 function read_delim(file;
                   delim='\t',
+                  decimal = '.',
                   col_names=true,
                   skip=0,
                   n_max=Inf,
+                  groupmark=nothing,
                   col_select=nothing,
                   comment=nothing,
                   missingstring="",
@@ -115,9 +117,11 @@ function read_delim(file;
     # Prepare arguments for CSV.read, including the effective number of tasks to use
     read_options = (
         delim = delim,
+        decimal = decimal,
         header = col_names === true ? 1 : 0,
         skipto = skipto + 1,
         select = col_select,
+        groupmark=groupmark,
         footerskip = 0,
         limit = limit,
         comment = comment,
@@ -207,6 +211,82 @@ function read_tsv(file;
     return df
 end
 
+
+# TODO: move this docstring into docstrings.jl as `docstring_read_csv2`, matching how
+# `read_delim` and `read_table` are documented.
+"""
+    read_csv2(file; delim=';', decimal=',', col_names=true, groupmark=nothing, skip=0,
+              n_max=Inf, col_select=nothing, comment=nothing, missingstring="",
+              escape_double=true, ntasks=Threads.nthreads(), num_threads=nothing)
+
+Read a delimited file into a `DataFrame`, defaulting to `;` as the field delimiter and
+`,` as the decimal mark.
+
+# Arguments
+- `file`: Path to a local file, or a URL starting with `http://` or `https://`.
+- `delim`: Field delimiter. Default is `';'`.
+- `decimal`: Character used as the decimal mark when parsing floats. Default is `','`.
+- `col_names`: When `true`, row 1 is used as the header; any other value passes
+  `header = 0` to CSV.jl. Default is `true`.
+- `groupmark`: Digit-grouping character forwarded to CSV.jl. Default is `nothing`.
+- `skip`: Offset added to the first parsed data row: parsing starts at row `skip + 2`
+  when `col_names` is `true`, and at row `skip + 1` otherwise. Default is `0`.
+- `n_max`: Maximum number of rows to read. Default is `Inf` (no limit).
+- `col_select`: Columns to load, forwarded as CSV.jl's `select`. Default is `nothing`.
+- `comment`: Marker for comment lines to ignore. Default is `nothing`.
+- `missingstring`: String(s) representing missing values. Default is `""`.
+- `escape_double`: When `true`, a double quote is used as the escape character;
+  otherwise a backslash is used. Default is `true`.
+- `ntasks`: Number of parsing tasks. Default is `Threads.nthreads()`.
+- `num_threads`: Optional alias for `ntasks`; takes precedence when provided.
+
+A single parsing task is used whenever `n_max` is finite, so that the row limit is exact.
+
+# Returns
+A `DataFrame` containing the parsed data.
+
+# Throws
+An error if `file` is a URL and the response status is not 200.
+"""
+function read_csv2(file;
+                  delim=';',
+                  decimal = ',',
+                  col_names=true,
+                  groupmark=nothing,
+                  skip=0,
+                  n_max=Inf,
+                  col_select=nothing,
+                  comment=nothing,
+                  missingstring="",
+                  escape_double=true,
+                  ntasks::Int = Threads.nthreads(),  # Default ntasks value
+                  num_threads::Union{Int, Nothing}=nothing) # Optional num_threads
+
+    # `num_threads` wins over `ntasks` when it is provided
+    requested_tasks = something(num_threads, ntasks)
+
+    # Convert n_max from Inf to nothing for compatibility with CSV.File's limit argument
+    limit = isinf(n_max) ? nothing : Int(n_max)
+
+    # CSV.File expects an integer task count. The previous `effective_ntasks > 1`
+    # produced a Bool (0 or 1), which silently disabled parallel parsing.
+    # With a row limit, stay on one task: CSV.jl documents that `limit` may not be
+    # exact when several tasks parse the file.
+    effective_ntasks = isnothing(limit) ? max(requested_tasks, 1) : 1
+
+    # Data starts one row later when the first row is consumed as the header
+    has_header = col_names === true
+    first_data_row = skip + (has_header ? 2 : 1)
+
+    # Keyword arguments forwarded to CSV.File
+    read_options = (
+        delim = delim,
+        decimal = decimal,
+        header = has_header ? 1 : 0,
+        groupmark = groupmark,
+        skipto = first_data_row,
+        footerskip = 0,
+        select = col_select,
+        limit = limit,
+        comment = comment,
+        missingstring = missingstring,
+        escapechar = escape_double ? '"' : '\\',
+        quotechar = '"',
+        normalizenames = false,
+        ntasks = effective_ntasks,
+    )
+
+    # Check if the file is a URL and read accordingly
+    if startswith(file, "http://") || startswith(file, "https://")
+        # Fetch the content from the URL
+        response = HTTP.get(file)
+
+        # Ensure the request was successful
+        if response.status != 200
+            error("Failed to fetch the CSV file: HTTP status code ", response.status)
+        end
+
+        # Parse the downloaded bytes from an in-memory buffer
+        df = CSV.File(IOBuffer(response.body); read_options...) |> DataFrame
+    else
+        # Parse directly from the local file
+        df = CSV.File(file; read_options...) |> DataFrame
+    end
+
+    return df
+end
+
+
+
+
+
+
+
+
 """
 $docstring_read_table
 """