diff --git a/Project.toml b/Project.toml index 8ae89eb..d5bfb95 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierFiles" uuid = "8ae5e7a9-bdd3-4c93-9cc3-9df4d5d947db" authors = ["Daniel Rizk and contributors"] -version = "0.1.3" +version = "0.1.4" [deps] Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" @@ -11,6 +11,7 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" Parquet2 = "98572fba-bba0-415d-956f-fa77e587d26d" +RData = "df47a6cb-8c03-5eed-afd8-b6050d6c41da" ReadStatTables = "52522f7a-9570-4e34-8ac6-c005c74d4b84" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" XLSX = "fdbf4ff8-1666-58a4-91e7-1b58723a45e0" @@ -26,6 +27,7 @@ Parquet2 = "0.2" ReadStatTables = "0.3" Reexport = "0.2, 1" XLSX = "0.10" +RData = "1.0" julia = "1.9" [extras] diff --git a/README.md b/README.md index 3f16adb..135b24f 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,10 @@ Currently supported file types: - `read_dta` and `write_dta` (.dta) - `read_arrow` and `write_arrow` - `read_parquet` and `write_parquet` +- `read_rdata` (.rdata and .rds) + +Agnostic read and write functions that detect the type and dispatch the appropriate function. +- `read_file` and `write_file` # Examples diff --git a/docs/examples/UserGuide/r_files.jl b/docs/examples/UserGuide/r_files.jl new file mode 100644 index 0000000..ec4b112 --- /dev/null +++ b/docs/examples/UserGuide/r_files.jl @@ -0,0 +1,15 @@ +# Reading .rds and .rdata files is made possible via RData.jl. There is currently no write support, nor is there url support. + +# To read the file, simply pass the path to `read_rdata` or `read_file`. There is a small difference between .rds and .rdata files. +# .rdata files will contain a dict of the table name and the data frame. There can be multiple entries in one .rdata file. To access the data frame, you must pass the name of the dict to the object. 
+ +# ```julia +# using TidierFiles +# file = read_rdata("path.rdata") # or read_file("path.rdata") +# df = file["entry_name"] +# ``` + +# This is in contrast to .rds files which will contain one data frame. +# ```julia +# df = read_rdata("path.rds") +# ``` \ No newline at end of file diff --git a/docs/examples/UserGuide/xl.jl b/docs/examples/UserGuide/xl.jl index 4cea74b..ef74633 100644 --- a/docs/examples/UserGuide/xl.jl +++ b/docs/examples/UserGuide/xl.jl @@ -23,4 +23,12 @@ # - `x`: The data to write. Can be a single `Pair{String, DataFrame}` for writing one sheet, or a `Tuple` of such pairs for writing multiple sheets. The `String` in each pair specifies the sheet name, and the `DataFrame` is the data to write to that sheet. # - `path`: The path to the output Excel file. -# - `overwrite`: Whether to overwrite an existing file. Default is `false`. \ No newline at end of file +# - `overwrite`: Whether to overwrite an existing file. Default is `false`. + +# ## Writing to a specific sheet +# The example below demonstrates how to write to specific sheets in a file. +# The string in each `Pair` is the sheet name, it can be new or preexisting. The second component is the dataframe to be written to that sheet. +# In this example, two sheets, "REPORT_A" and "REPORT_C" are being written to with `df` and `df2` respectively. 
+# ```julia +# write_xlsx(("REPORT_A" => df, "REPORT_C" => df2); path = "/Users/danielrizk/Downloads/xlsxtest2.xlsx", overwrite = true) +# ``` \ No newline at end of file diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index a1db7a1..7774286 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -121,4 +121,5 @@ nav: - "Stats Files": "examples/generated/UserGuide/stats.md" - "Arrow Files": "examples/generated/UserGuide/Arrow.md" - "Parquet Files": "examples/generated/UserGuide/parquet.md" + - "R Data Files": "examples/generated/UserGuide/r_files.md" - "Reference" : "reference.md" \ No newline at end of file diff --git a/docs/src/index.md b/docs/src/index.md index 9ac5db8..c8bd492 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -21,6 +21,10 @@ Currently supported file types: - `read_dta` and `write_dta` (.dta) - `read_arrow` and `write_arrow` - `read_parquet` and `write_parquet` +- `read_rdata` (.rdata and .rds) + +Agnostic read and write functions that detect the type and dispatch the appropriate function. 
+- `read_file` and `write_file` # Examples diff --git a/src/TidierFiles.jl b/src/TidierFiles.jl index d9255fa..44c0bce 100644 --- a/src/TidierFiles.jl +++ b/src/TidierFiles.jl @@ -9,12 +9,13 @@ using ReadStatTables using Reexport using Parquet2 using Arrow +using RData @reexport using DataFrames: DataFrame export read_csv, write_csv, read_tsv, write_tsv, read_table, write_table, read_delim, read_xlsx, write_xlsx, read_fwf, write_fwf, fwf_empty, fwf_positions, fwf_positions, read_sav, read_sas, read_dta, write_sav, write_sas, - write_dta, read_arrow, write_arrow, read_parquet, write_parquet, read_csv2 + write_dta, read_arrow, write_arrow, read_parquet, write_parquet, read_csv2, read_file, write_file, read_rdata include("docstrings.jl") @@ -23,6 +24,7 @@ include("xlfiles.jl") include("statsfiles.jl") include("parquet_files.jl") include("arrow_files.jl") +include("r_data.jl") """ $docstring_read_csv @@ -444,4 +446,6 @@ function write_table( threaded = num_threads > 1) end +include("gen_fxn.jl") + end \ No newline at end of file diff --git a/src/docstrings.jl b/src/docstrings.jl index 17b4cc6..b6e3426 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -6,16 +6,16 @@ const docstring_read_csv = Reads a CSV file or URL into a DataFrame, with options to specify delimiter, column names, and other CSV parsing options. # Arguments -`file`: Path or vector of paths to the CSV file or a URL to a CSV file. -`delim`: The character delimiting fields in the file. Default is ','. -`col_names`: Indicates if the first row of the CSV is used as column names. Can be true, false, or an array of strings. Default is true. -`skip`: Number of initial lines to skip before reading data. Default is 0. -`n_max`: Maximum number of rows to read. Default is Inf (read all rows). --`col_select`: Optional vector of symbols or strings to select which columns to load. -`comment`: Character that starts a comment line. Lines beginning with this character are ignored. 
Default is nothing (no comment lines). -`missingstring`: String that represents missing values in the CSV. Default is "", can be set to a vector of multiple items. -`escape_double`: Indicates whether to interpret two consecutive quote characters as a single quote in the data. Default is true. -`num_threads`: specifies the number of concurrent tasks or threads to use for processing, allowing for parallel execution. Defaults to 1 +- `file`: Path or vector of paths to the CSV file or a URL to a CSV file. +- `delim`: The character delimiting fields in the file. Default is ','. +- `col_names`: Indicates if the first row of the CSV is used as column names. Can be true, false, or an array of strings. Default is true. +- `skip`: Number of initial lines to skip before reading data. Default is 0. +- `n_max`: Maximum number of rows to read. Default is Inf (read all rows). +- `col_select`: Optional vector of symbols or strings to select which columns to load. +- `comment`: Character that starts a comment line. Lines beginning with this character are ignored. Default is nothing (no comment lines). +- `missingstring`: String that represents missing values in the CSV. Default is "", can be set to a vector of multiple items. +- `escape_double`: Indicates whether to interpret two consecutive quote characters as a single quote in the data. Default is true. +- `num_threads`: specifies the number of concurrent tasks or threads to use for processing, allowing for parallel execution. Defaults to 1 # Examples ```jldoctest julia> df = DataFrame(ID = 1:5, Name = ["Alice", "Bob", "Charlie", "David", "Eva"], Score = [88, 92, 77, 85, 95]); @@ -41,16 +41,16 @@ const docstring_read_tsv = Reads a TSV file or URL into a DataFrame, with options to specify delimiter, column names, and other CSV parsing options. # Arguments -`file`: Path or vector of paths to the TSV file or a URL to a TSV file. -`delim`: The character delimiting fields in the file. Default is ','. 
-`col_names`: Indicates if the first row of the CSV is used as column names. Can be true, false, or an array of strings. Default is true. -`skip`: Number of initial lines to skip before reading data. Default is 0. -`n_max`: Maximum number of rows to read. Default is Inf (read all rows). --`col_select`: Optional vector of symbols or strings to select which columns to load. -`comment`: Character that starts a comment line. Lines beginning with this character are ignored. Default is nothing (no comment lines). -`missingstring`: String that represents missing values in the CSV. Default is "", can be set to a vector of multiple items. -`escape_double`: Indicates whether to interpret two consecutive quote characters as a single quote in the data. Default is true. -`num_threads`: specifies the number of concurrent tasks or threads to use for processing, allowing for parallel execution. Default is the number of available threads. +- `file`: Path or vector of paths to the TSV file or a URL to a TSV file. +- `delim`: The character delimiting fields in the file. Default is ','. +- `col_names`: Indicates if the first row of the CSV is used as column names. Can be true, false, or an array of strings. Default is true. +- `skip`: Number of initial lines to skip before reading data. Default is 0. +- `n_max`: Maximum number of rows to read. Default is Inf (read all rows). +- `col_select`: Optional vector of symbols or strings to select which columns to load. +- `comment`: Character that starts a comment line. Lines beginning with this character are ignored. Default is nothing (no comment lines). +- `missingstring`: String that represents missing values in the CSV. Default is "", can be set to a vector of multiple items. +- `escape_double`: Indicates whether to interpret two consecutive quote characters as a single quote in the data. Default is true. +- `num_threads`: specifies the number of concurrent tasks or threads to use for processing, allowing for parallel execution. 
Default is the number of available threads. # Examples ```jldoctest @@ -77,17 +77,17 @@ const docstring_read_delim = Reads a delimited file or URL into a DataFrame, with options to specify delimiter, column names, and other CSV parsing options. # Arguments -`file`: Path or vector of paths to the CSV file or a URL to a CSV file. -`delim`: The character delimiting fields in the file. Default is ','. -`col_names`: Indicates if the first row of the CSV is used as column names. Can be true, false, or an array of strings. Default is true. -`skip`: Number of initial lines to skip before reading data. Default is 0. -`n_max`: Maximum number of rows to read. Default is Inf (read all rows). --`col_select`: Optional vector of symbols or strings to select which columns to load. -`comment`: Character that starts a comment line. Lines beginning with this character are ignored. Default is nothing (no comment lines). -`missingstring`: String that represents missing values in the CSV. Default is "", can be set to a vector of multiple items. -`escape_double`: Indicates whether to interpret two consecutive quote characters as a single quote in the data. Default is true. -`col_types`: An optional specification of column types, can be a single type applied to all columns, or a collection of types with one for each column. Default is nothing (types are inferred). -`num_threads`: specifies the number of concurrent tasks or threads to use for processing, allowing for parallel execution. Default is the number of available threads. +- `file`: Path or vector of paths to the CSV file or a URL to a CSV file. +- `delim`: The character delimiting fields in the file. Default is ','. +- `col_names`: Indicates if the first row of the CSV is used as column names. Can be true, false, or an array of strings. Default is true. +- `skip`: Number of initial lines to skip before reading data. Default is 0. +- `n_max`: Maximum number of rows to read. Default is Inf (read all rows). 
+- `col_select`: Optional vector of symbols or strings to select which columns to load. +- `comment`: Character that starts a comment line. Lines beginning with this character are ignored. Default is nothing (no comment lines). +- `missingstring`: String that represents missing values in the CSV. Default is "", can be set to a vector of multiple items. +- `escape_double`: Indicates whether to interpret two consecutive quote characters as a single quote in the data. Default is true. +- `col_types`: An optional specification of column types, can be a single type applied to all columns, or a collection of types with one for each column. Default is nothing (types are inferred). +- `num_threads`: specifies the number of concurrent tasks or threads to use for processing, allowing for parallel execution. Default is the number of available threads. # Examples ```jldoctest @@ -227,14 +227,14 @@ const docstring_read_table = Read a table from a file where columns are separated by any amount of whitespace, processing it into a DataFrame. # Arguments --`file`: The path to the file to read. --`col_names`=true: Indicates whether the first non-skipped line should be treated as column names. If false, columns are named automatically. --`skip`: Number of lines at the beginning of the file to skip before processing starts. --`n_max`: The maximum number of lines to read from the file, after skipping. Inf means read all lines. --`col_select`: Optional vector of symbols or strings to select which columns to load. --`comment`: A character or string indicating the start of a comment. Lines starting with this character are ignored. --`missingstring`: The string that represents missing values in the table. --`kwargs`: Additional keyword arguments passed to CSV.File. +- `file`: The path to the file to read. +- `col_names`=true: Indicates whether the first non-skipped line should be treated as column names. If false, columns are named automatically. 
+- `skip`: Number of lines at the beginning of the file to skip before processing starts. +- `n_max`: The maximum number of lines to read from the file, after skipping. Inf means read all lines. +- `col_select`: Optional vector of symbols or strings to select which columns to load. +- `comment`: A character or string indicating the start of a comment. Lines starting with this character are ignored. +- `missingstring`: The string that represents missing values in the table. +- `kwargs`: Additional keyword arguments passed to CSV.File. # Examples ```jldoctest julia> df = DataFrame(ID = 1:5, Name = ["Alice", "Bob", "Charlie", "David", "Eva"], Score = [88, 92, 77, 85, 95]); @@ -259,14 +259,14 @@ const docstring_write_table = Write a DataFrame to a file, allowing for customization of the delimiter and other options. # Arguments --`x`: The DataFrame to write to a file. --`file`: The path to the file where the DataFrame will be written. +- `x`: The DataFrame to write to a file. +- `file`: The path to the file where the DataFrame will be written. -delim: Character to use as the field delimiter. The default is tab ('\t'), making it a TSV (tab-separated values) file by default, but can be changed to accommodate other formats. --`missingstring`: The string to represent missing data in the output file. --`append`: Whether to append to the file if it already exists. If false, the file will be overwritten. --`col_names`: Whether to write column names as the first line of the file. If appending to an existing file with append = true, column names will not be written regardless of this parameter's value. --`eol`: The end-of-line character to use in the file. Defaults to "\n". --`num_threads`: Number of threads to use for writing the file. Uses the number of available Julia threads by default. +- `missingstring`: The string to represent missing data in the output file. +- `append`: Whether to append to the file if it already exists. If false, the file will be overwritten. 
+- `col_names`: Whether to write column names as the first line of the file. If appending to an existing file with append = true, column names will not be written regardless of this parameter's value. +- `eol`: The end-of-line character to use in the file. Defaults to "\n". +- `num_threads`: Number of threads to use for writing the file. Uses the number of available Julia threads by default. # Examples ```jldoctest @@ -282,16 +282,16 @@ const docstring_read_xlsx = Read data from an Excel file into a DataFrame. # Arguments --`path`: The path to the Excel file to be read. --`sheet`: Specifies the sheet to be read. Can be either the name of the sheet as a string or its index as an integer. If nothing, the first sheet is read. --`range`: Specifies a specific range of cells to be read from the sheet. If nothing, the entire sheet is read. --`col_names`: Indicates whether the first row of the specified range should be treated as column names. If false, columns will be named automatically. --`col_types`: Allows specifying column types explicitly. Can be a single type applied to all columns, a list or a dictionary mapping column names or indices to types. If nothing, types will be inferred. --`missingstring`: The value or vector that represents missing values in the Excel file. --`trim_ws`: Whether to trim leading and trailing whitespace from cells in the Excel file. --`skip`: Number of rows to skip at the beginning of the sheet or range before reading data. --`n_max`: The maximum number of rows to read from the sheet or range, after skipping. Inf means read all available rows. --`guess_max`: The maximum number of rows to scan for type guessing and column names detection. Only relevant if col_types is nothing or col_names is true. If nothing, a default heuristic is used. +- `path`: The path to the Excel file to be read. +- `sheet`: Specifies the sheet to be read. Can be either the name of the sheet as a string or its index as an integer. If nothing, the first sheet is read. 
+- `range`: Specifies a specific range of cells to be read from the sheet. If nothing, the entire sheet is read. +- `col_names`: Indicates whether the first row of the specified range should be treated as column names. If false, columns will be named automatically. +- `col_types`: Allows specifying column types explicitly. Can be a single type applied to all columns, a list or a dictionary mapping column names or indices to types. If nothing, types will be inferred. +- `missingstring`: The value or vector that represents missing values in the Excel file. +- `trim_ws`: Whether to trim leading and trailing whitespace from cells in the Excel file. +- `skip`: Number of rows to skip at the beginning of the sheet or range before reading data. +- `n_max`: The maximum number of rows to read from the sheet or range, after skipping. Inf means read all available rows. +- `guess_max`: The maximum number of rows to scan for type guessing and column names detection. Only relevant if col_types is nothing or col_names is true. If nothing, a default heuristic is used. # Examples ```jldoctest @@ -317,12 +317,12 @@ julia> read_xlsx("xlsxtest.xlsx", sheet = "REPORT_A", skip = 1, n_max = 4, missi const docstring_write_xlsx = """ write_xlsx(x; path, overwrite) -Write a DataFrame, or multiple DataFrames, to an Excel file. +Write a DataFrame, or multiple DataFrames, to an Excel file. Specific sheets on can be specified for each dataframe. # Arguments --`x`: The data to write. Can be a single Pair{String, DataFrame} for writing one sheet, or a Tuple of such pairs for writing multiple sheets. The String in each pair specifies the sheet name, and the DataFrame is the data to write to that sheet. --`path`: The path to the Excel file where the data will be written. --`overwrite`: Defaults to false. Whether to overwrite an existing file. If false, an error is thrown when attempting to write to an existing file. +- `x`: The data to write. 
Can be a single Pair{String, DataFrame} for writing one sheet, or a Tuple of such pairs for writing multiple sheets. The String in each pair specifies the sheet name, and the DataFrame is the data to write to that sheet. +- `path`: The path to the Excel file where the data will be written. +- `overwrite`: Defaults to false. Whether to overwrite an existing file. If false, an error is thrown when attempting to write to an existing file. # Examples ```jldoctest @@ -343,11 +343,11 @@ const docstring_read_dta = Read data from a Stata (.dta) file into a DataFrame, supporting both local and remote sources. # Arguments --`filepath`: The path to the .dta file or a URL pointing to such a file. If a URL is provided, the file will be downloaded and then read. +- `filepath`: The path to the .dta file or a URL pointing to such a file. If a URL is provided, the file will be downloaded and then read. `encoding`: Optional; specifies the encoding of the input file. If not provided, defaults to the package's or function's default. `col_select`: Optional; allows specifying a subset of columns to read. This can be a vector of column names or indices. If nothing, all columns are read. -skip=0: Number of rows at the beginning of the file to skip before reading. -n_max=Inf: Maximum number of rows to read from the file, after skipping. If Inf, read all available rows. +- `skip=0`: Number of rows at the beginning of the file to skip before reading. +- `n_max=Inf`: Maximum number of rows to read from the file, after skipping. If Inf, read all available rows. `num_threads`: specifies the number of concurrent tasks or threads to use for processing, allowing for parallel execution. Defaults to 1 # Examples @@ -372,11 +372,11 @@ const docstring_read_sas = Read data from a SAS (.sas7bdat and .xpt) file into a DataFrame, supporting both local and remote sources. # Arguments --`filepath`: The path to the .dta file or a URL pointing to such a file. 
If a URL is provided, the file will be downloaded and then read. +- `filepath`: The path to the .dta file or a URL pointing to such a file. If a URL is provided, the file will be downloaded and then read. `encoding`: Optional; specifies the encoding of the input file. If not provided, defaults to the package's or function's default. `col_select`: Optional; allows specifying a subset of columns to read. This can be a vector of column names or indices. If nothing, all columns are read. -skip=0: Number of rows at the beginning of the file to skip before reading. -n_max=Inf: Maximum number of rows to read from the file, after skipping. If Inf, read all available rows. +- `skip=0`: Number of rows at the beginning of the file to skip before reading. +- `n_max=Inf`: Maximum number of rows to read from the file, after skipping. If Inf, read all available rows. `num_threads`: specifies the number of concurrent tasks or threads to use for processing, allowing for parallel execution. Defaults to 1 # Examples @@ -410,12 +410,12 @@ const docstring_read_sav = Read data from a SPSS (.sav and .por) file into a DataFrame, supporting both local and remote sources. # Arguments --`filepath`: The path to the .sav or .por file or a URL pointing to such a file. If a URL is provided, the file will be downloaded and then read. -`encoding`: Optional; specifies the encoding of the input file. If not provided, defaults to the package's or function's default. -`col_select`: Optional; allows specifying a subset of columns to read. This can be a vector of column names or indices. If nothing, all columns are read. -skip=0: Number of rows at the beginning of the file to skip before reading. -n_max=Inf: Maximum number of rows to read from the file, after skipping. If Inf, read all available rows. -`num_threads`: specifies the number of concurrent tasks or threads to use for processing, allowing for parallel execution. 
Defaults to 1 +- `filepath`: The path to the .sav or .por file or a URL pointing to such a file. If a URL is provided, the file will be downloaded and then read. +- `encoding`: Optional; specifies the encoding of the input file. If not provided, defaults to the package's or function's default. +- `col_select`: Optional; allows specifying a subset of columns to read. This can be a vector of column names or indices. If nothing, all columns are read. +- `skip=0`: Number of rows at the beginning of the file to skip before reading. +- `n_max=Inf`: Maximum number of rows to read from the file, after skipping. If Inf, read all available rows. +- `num_threads`: specifies the number of concurrent tasks or threads to use for processing, allowing for parallel execution. Defaults to 1 # Examples ```jldoctest @@ -448,9 +448,9 @@ const docstring_write_sav = write_sav(df, path) Write a DataFrame to a SPSS (.sav or .por) file. -Arguments --`df`: The DataFrame to be written to a file. --`path`: String as path where the .dta file will be created. If a file at this path already exists, it will be overwritten. +# Arguments +- `df`: The DataFrame to be written to a file. +- `path`: String as path where the .sav or .por file will be created. If a file at this path already exists, it will be overwritten. # Examples ```jldoctest @@ -478,9 +478,9 @@ const docstring_write_sas = write_sas(df, path) Write a DataFrame to a SAS (.sas7bdat or .xpt) file. -Arguments --`df`: The DataFrame to be written to a file. --`path`: String as path where the .dta file will be created. If a file at this path already exists, it will be overwritten. +# Arguments +- `df`: The DataFrame to be written to a file. +- `path`: String as path where the .sas7bdat or .xpt file will be created. If a file at this path already exists, it will be overwritten. # Examples ```jldoctest @@ -509,9 +509,9 @@ const docstring_write_dta = write_dta(df, path) Write a DataFrame to a Stata (.dta) file. 
-Arguments --`df`: The DataFrame to be written to a file. --`path`: String as path where the .dta file will be created. If a file at this path already exists, it will be overwritten. +# Arguments +- `df`: The DataFrame to be written to a file. +- `path`: String as path where the .dta file will be created. If a file at this path already exists, it will be overwritten. # Examples ```jldoctest @@ -531,9 +531,9 @@ const docstring_write_arrow = """ write_arrow(df, path) Write a DataFrame to an Arrow (.arrow) file. -Arguments --`df`: The DataFrame to be written to a file. --`path`: String as path where the .dta file will be created. If a file at this path already exists, it will be overwritten. +# Arguments +- `df`: The DataFrame to be written to a file. +- `path`: String as path where the .dta file will be created. If a file at this path already exists, it will be overwritten. # Examples ```jldoctest julia> df = DataFrame(AA=["Arr", "ow"], AB=[10.1, 10.2]); @@ -546,12 +546,12 @@ const docstring_read_arrow = """ read_arrow(df, path) Read an Arrow file (.arrow) to a DataFrame. -Arguments --`df`: The DataFrame to be written to a file. --`path`: String as path where the .dta file will be created. If a file at this path already exists, it will be overwritten. -`skip`: Number of initial lines to skip before reading data. Default is 0. -`n_max`: Maximum number of rows to read. Default is Inf (read all rows). --`col_select`: Optional vector of symbols or strings to select which columns to load. +# Arguments +- `df`: The DataFrame to be written to a file. +- `path`: String as path where the .dta file will be created. If a file at this path already exists, it will be overwritten. +- `skip`: Number of initial lines to skip before reading data. Default is 0. +- `n_max`: Maximum number of rows to read. Default is Inf (read all rows). +- `col_select`: Optional vector of symbols or strings to select which columns to load. 
# Examples ```jldoctest julia> df = DataFrame(AA=["Arr", "ow"], AB=[10.1, 10.2]); @@ -572,9 +572,9 @@ const docstring_write_parquet = """ write_parquet(df, ) Write a DataFrame to an Parquet (.parquet) file. -Arguments --`df`: The DataFrame to be written to a file. --`path`: String as path where the .dta file will be created. If a file at this path already exists, it will be overwritten. +# Arguments +- `df`: The DataFrame to be written to a file. +- `path`: String as path where the .parquet file will be created. If a file at this path already exists, it will be overwritten. # Examples ```jldoctest @@ -588,12 +588,12 @@ const docstring_read_parquet = """ read_parquet(path) Read a Paquet File (.parquet) to a DataFrame. -Arguments --`path`: Path or vector of paths or URLs to parquet file to be read -`col_names`: Indicates if the first row of the CSV is used as column names. Can be true, false, or an array of strings. Default is true. -`skip`: Number of initial lines to skip before reading data. Default is 0. -`n_max`: Maximum number of rows to read. Default is Inf (read all rows). --`col_select`: Optional vector of symbols or strings to select which columns to load. +# Arguments +- `path`: Path or vector of paths or URLs to parquet file to be read +- `col_names`: Indicates if the first row of the CSV is used as column names. Can be true, false, or an array of strings. Default is true. +- `skip`: Number of initial lines to skip before reading data. Default is 0. +- `n_max`: Maximum number of rows to read. Default is Inf (read all rows). +- `col_select`: Optional vector of symbols or strings to select which columns to load. # Examples ```jldoctest @@ -611,3 +611,67 @@ julia> read_parquet("test.parquet") ``` """ +const docstring_read_file = +""" + read_file(path; args) +Generic file reader that automatically detects type and dispatches the appropriate read function. 
+ +# Arguments +- `path` : a string with the file path to read +- `args` : additional arguments supported for that specific file type are given as they normally would be +# Examples +```jldoctest +julia> df = DataFrame(ID = 1:5, Name = ["Alice", "Bob", "Charlie", "David", "Eva"], Score = [88, 92, 77, 85, 95]); + +julia> write_parquet(df, "test.parquet"); + +julia> read_file("test.parquet") +5×3 DataFrame + Row │ ID Name Score + │ Int64 String Int64 +─────┼─────────────────────── + 1 │ 1 Alice 88 + 2 │ 2 Bob 92 + 3 │ 3 Charlie 77 + 4 │ 4 David 85 + 5 │ 5 Eva 95 +``` +""" + +const docstring_write_file = +""" + write_file(df, path; args) +Generic file writer that automatically detects type and dispatches the appropriate write function. + +# Arguments +- `df` : Data frame to be exported +- `path` : a string with the file path for the location of resulting file +- `args` : additional arguments supported for that specific file type are given as they normally would be + +# Examples +```jldoctest +julia> df = DataFrame(ID = 1:5, Name = ["Alice", "Bob", "Charlie", "David", "Eva"], Score = [88, 92, 77, 85, 95]); + +julia> write_file(df, "test.parquet"); + +julia> read_file("test.parquet") +5×3 DataFrame + Row │ ID Name Score + │ Int64 String Int64 +─────┼─────────────────────── + 1 │ 1 Alice 88 + 2 │ 2 Bob 92 + 3 │ 3 Charlie 77 + 4 │ 4 David 85 + 5 │ 5 Eva 95 +``` +""" + +const docstring_read_rdata = +""" + read_rdata(path) +Read `.rdata` and `.rds` files as DataFrame. `.rdata` files will result in a `Dict`. Dataframes can then be selected with `result["name"]` + +# Arguments +- `path`: A string with the file location. This does not yet support reading from URLs. +""" \ No newline at end of file diff --git a/src/gen_fxn.jl b/src/gen_fxn.jl new file mode 100644 index 0000000..088b8f3 --- /dev/null +++ b/src/gen_fxn.jl @@ -0,0 +1,79 @@ +""" +$docstring_read_file +""" +function read_file(filepath::String, args...; kwargs...) 
+ ext = lowercase(splitext(filepath)[2]) + if ext == ".csv" + return read_csv(filepath, args...; kwargs...) + elseif ext == ".tsv" + return read_tsv(filepath, args...; kwargs...) + elseif ext == ".xlsx" + return read_xlsx(filepath, args...; kwargs...) + elseif ext == ".txt" || ext == ".dat" + return read_delim(filepath, args...; kwargs...) + elseif ext == ".fwf" + return read_fwf(filepath, args...; kwargs...) + elseif ext == ".sav" || ext == ".por" + return read_sav(filepath, args...; kwargs...) + elseif ext == ".sas7bdat" || ext == ".xpt" + return read_sas(filepath, args...; kwargs...) + elseif ext == ".dta" + return read_dta(filepath, args...; kwargs...) + elseif ext == ".arrow" + return read_arrow(filepath, args...; kwargs...) + elseif ext == ".parquet" + return read_parquet(filepath, args...; kwargs...) + elseif ext == ".rds" || ext == ".RData" || ext == ".rdata" + return RData.load(filepath) + else + error("Unsupported file format: $ext") + end +end + + +""" +$docstring_write_file +""" +function write_file(data::DataFrame,path::String, args...; kwargs...) + ext = lowercase(splitext(path)[2]) + if ext == ".xlsx" + sheet_name = get(kwargs, :sheet_name, "Sheet1") + return write_xlsx((sheet_name => data); path=path, overwrite=get(kwargs, :overwrite, false)) + elseif ext == ".csv" + return write_csv(data, path, args...; kwargs...) + elseif ext == ".tsv" + return write_tsv(data, path, args...; kwargs...) + elseif ext == ".txt" || ext == ".dat" + return write_delim(data, path, args...; kwargs...) + elseif ext == ".sav" || ext == ".por" + return write_sav(data, path, args...; kwargs...) + elseif ext == ".sas7bdat" || ext == ".xpt" + return write_sas(data, path, args...; kwargs...) + elseif ext == ".dta" + return write_dta(data, path, args...; kwargs...) + elseif ext == ".arrow" + return write_arrow(data, path, args...; kwargs...) + elseif ext == ".parquet" + return write_parquet(data, path, args...; kwargs...) 
+ else + error("Unsupported file format: $ext") + end +end + +function write_file(x::Tuple{Vararg{Pair{String, DataFrame}}}; path::String, overwrite::Bool=false) + ext = lowercase(splitext(path)[2]) + if ext == ".xlsx" + return write_xlsx(x; path=path, overwrite=overwrite) + else + error("Unsupported file format for multiple DataFrames: $ext") + end +end + +function write_file(x::Pair{String, DataFrame}; path::String, overwrite::Bool=false) + ext = lowercase(splitext(path)[2]) + if ext == ".xlsx" + return write_xlsx((x,); path=path, overwrite=overwrite) + else + error("Unsupported file format for a single DataFrame: $ext") + end +end \ No newline at end of file diff --git a/src/r_data.jl b/src/r_data.jl new file mode 100644 index 0000000..e4ad2c6 --- /dev/null +++ b/src/r_data.jl @@ -0,0 +1,6 @@ +""" +$docstring_read_rdata +""" +function read_rdata(file::String) + return RData.load(file) +end \ No newline at end of file