From de00fd9b9bbe69ada88a013f9dfffaaa94970ca1 Mon Sep 17 00:00:00 2001 From: Lawrie Date: Thu, 11 Feb 2021 10:17:57 +1100 Subject: [PATCH] v3.0.0, using Julia 1.6 and TOML.jl instead of YAML.jl --- Manifest.toml | 162 ++++++++++++++++++++--------- Project.toml | 17 ++- README.md | 61 +++++------ scripts/compare.jl | 2 +- src/Schemata.jl | 2 +- src/compare/inmemory_table.jl | 3 +- src/compare/ondisk_table.jl | 2 +- src/readwrite.jl | 62 ++++++++++- src/types.jl | 89 ++++++++++++---- test/schemata/fever.toml | 12 +++ test/schemata/fever.yaml | 35 ------- test/schemata/row_constraints.toml | 12 +++ test/schemata/row_constraints.yaml | 29 ------ test/test_inmemory_tables.jl | 15 +-- test/test_ondisk_tables.jl | 2 +- 15 files changed, 316 insertions(+), 189 deletions(-) create mode 100644 test/schemata/fever.toml delete mode 100644 test/schemata/fever.yaml create mode 100644 test/schemata/row_constraints.toml delete mode 100644 test/schemata/row_constraints.yaml diff --git a/Manifest.toml b/Manifest.toml index b675ee6..254cc1e 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -1,42 +1,53 @@ # This file is machine-generated - editing it directly is not advised +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" [[CSV]] -deps = ["CategoricalArrays", "DataFrames", "Dates", "FilePathsBase", "Mmap", "Parsers", "PooledArrays", "Tables", "Unicode", "WeakRefStrings"] -git-tree-sha1 = "52a8e60c7822f53d57e4403b7f2811e7e1bdd32b" +deps = ["Dates", "Mmap", "Parsers", "PooledArrays", "SentinelArrays", "Tables", "Unicode"] +git-tree-sha1 = "1f79803452adf73e2d3fc84785adb7aaca14db36" uuid = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" -version = "0.6.2" +version = "0.8.3" [[CategoricalArrays]] -deps = ["DataAPI", "Future", "JSON", "Missings", "Printf", "Statistics", "Unicode"] -git-tree-sha1 = "a6c17353ee38ddab30e73dcfaa1107752de724ec" +deps = ["DataAPI", "Future", "JSON", "Missings", "Printf", "Statistics", "StructTypes", "Unicode"] +git-tree-sha1 = "99809999c8ee01fa89498480b147f7394ea5450f" uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" -version = "0.8.1" +version = "0.9.2" [[Compat]] deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "054993b6611376ddb40203e973e954fd9d1d1902" +git-tree-sha1 = "919c7f3151e79ff196add81d7f4e45d91bbf420b" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.12.0" +version = "3.25.0" + +[[Crayons]] +git-tree-sha1 = "3f71217b538d7aaee0b69ab47d9b7724ca8afa0d" +uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" +version = "4.0.4" [[DataAPI]] -git-tree-sha1 = "176e23402d80e7743fc26c19c681bfb11246af32" +git-tree-sha1 = "8ab70b4de35bb3b8cc19654f6b893cf5164f8ee8" uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" -version = "1.3.0" +version = "1.5.1" [[DataFrames]] -deps = ["CategoricalArrays", "Compat", "DataAPI", "Future", "InvertedIndices", "IteratorInterfaceExtensions", "Missings", "PooledArrays", "Printf", "REPL", "Reexport", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] -git-tree-sha1 = "02f08ae77249b7f6d4186b081a016fb7454c616f" +deps = ["CategoricalArrays", "Compat", "DataAPI", "Future", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrettyTables", "Printf", "REPL", "Reexport", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] +git-tree-sha1 = "b0db5579803eabb33f1274ca7ca2f472fdfb7f2a" uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -version = "0.21.2" +version = "0.22.5" [[DataStructures]] -deps = ["InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "be680f1ad03c0a03796aa3fda5a2180df7f83b46" +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "4437b64df1e0adccc3e5d1adbc3ac741095e4677" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.17.18" +version = "0.18.9" [[DataValueInterfaces]] git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" @@ -55,11 +66,15 @@ uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" deps = ["Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" -[[FilePathsBase]] -deps = ["Dates", "LinearAlgebra", "Printf", "Test", "UUIDs"] -git-tree-sha1 = "923fd3b942a11712435682eaa95cc8518c428b2c" -uuid = "48062228-2e41-5def-b9a4-89aafe57970f" -version = "0.8.0" +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[Formatting]] +deps = ["Printf"] +git-tree-sha1 = "8339d61043228fdd3eb658d86c926cb282ae72a8" +uuid = "59287772-0a20-5a39-b81b-1366585eb4c0" +version = "0.4.2" [[Future]] deps = ["Random"] @@ -82,14 +97,26 @@ version = "1.0.0" [[JSON]] deps = ["Dates", "Mmap", "Parsers", "Unicode"] -git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e" +git-tree-sha1 = "81690084b6198a2e1da36fcfda16eeca9f9f24e4" uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -version = "0.21.0" +version = "0.21.1" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" [[LibGit2]] -deps = ["Printf"] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + [[Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" @@ -104,42 +131,58 @@ uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + [[Missings]] deps = ["DataAPI"] -git-tree-sha1 = "de0a5ce9e5289f27df672ffabef4d1e5861247d5" +git-tree-sha1 = "f8c673ccc215eb50fcadb285f522420e29e69e1c" uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" -version = "0.4.3" +version = "0.4.5" [[Mmap]] uuid = "a63ad114-7e13-5084-954f-fe012c677804" +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + [[OrderedCollections]] -git-tree-sha1 = "12ce190210d278e12644bcadf5b21cbdcf225cd3" +git-tree-sha1 = "d45739abcfc03b51f6a42712894a593f74c80a23" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.2.0" +version = "1.3.3" [[Parsers]] -deps = ["Dates", "Test"] -git-tree-sha1 = "eb3e09940c0d7ae01b01d9291ebad7b081c844d3" +deps = ["Dates"] +git-tree-sha1 = "50c9a9ed8c714945e01cd53a21007ed3865ed714" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "1.0.5" +version = "1.0.15" [[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[PooledArrays]] deps = ["DataAPI"] -git-tree-sha1 = "b1333d4eced1826e15adbdf01a4ecaccca9d353c" +git-tree-sha1 = "0e8f5c428a41a81cd71f76d76f2fc3415fe5a676" uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" -version = "0.5.3" +version = "1.1.0" + +[[PrettyTables]] +deps = ["Crayons", "Formatting", "Markdown", "Reexport", "Tables"] +git-tree-sha1 = "42126c4e2677cdc664baea004c98cc60a664fe40" +uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" +version = "0.11.0" [[Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" [[REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets"] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" [[Random]] @@ -147,14 +190,19 @@ deps = ["Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [[Reexport]] -deps = ["Pkg"] -git-tree-sha1 = "7b1d07f411bc8ddb7977ec7f377b97b158514fe0" +git-tree-sha1 = "57d8440b0c7d98fc4f889e478e80f268d534c9d5" uuid = "189a3867-3050-52da-a836-e630ba90ab69" -version = "0.2.0" +version = "1.0.0" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +[[SentinelArrays]] +deps = ["Dates", "Random"] +git-tree-sha1 = "6ccde405cf0759eba835eb613130723cb8f10ff9" +uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c" +version = "1.2.16" + [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" @@ -179,6 +227,16 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +[[StructTypes]] +deps = ["Dates", "UUIDs"] +git-tree-sha1 = "65a43f5218197bc7091b76bc273a5e323a1d7b0d" +uuid = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" +version = "1.2.3" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + [[TableTraits]] deps = ["IteratorInterfaceExtensions"] git-tree-sha1 = "b1ad568ba658d8cbb3b892ed5380a6f3e781a81e" @@ -187,12 +245,16 @@ version = "1.0.0" [[Tables]] deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "TableTraits", "Test"] -git-tree-sha1 = "c45dcc27331febabc20d86cb3974ef095257dcf3" +git-tree-sha1 = "a716dde43d57fa537a19058d044b495301ba6565" uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" -version = "1.0.4" +version = "1.3.2" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" [[Test]] -deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[UUIDs]] @@ -202,14 +264,10 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" -[[WeakRefStrings]] -deps = ["DataAPI", "Random", "Test"] -git-tree-sha1 = "28807f85197eaad3cbd2330386fac1dcb9e7e11d" -uuid = "ea10d353-3f73-51f8-a26c-33c1cb351aa5" -version = "0.6.2" - -[[YAML]] -deps = ["Base64", "Dates", "Printf"] -git-tree-sha1 = "c5e2eaa5ce818c5277388377d592eb4c81f27c00" -uuid = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" -version = "0.4.0" +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" diff --git a/Project.toml b/Project.toml index e74677a..9f56db5 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Schemata" uuid = "b4d66a32-c6c0-5461-b6fa-34bb9cecaf85" authors = ["Jock Lawrie "] -version = "2.0.8" +version = "3.0.0" [deps] CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" @@ -9,17 +9,16 @@ CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Parsers = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" -YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" [compat] -CSV = "0.6.2" -CategoricalArrays = "0.8.1" -DataFrames = "0.21.2" -Parsers = "1.0.5" -Tables = "1.0.4" -YAML = "0.4.0" -julia = "1" +CSV = "0.8.3" +CategoricalArrays = "0.9.2" +DataFrames = "0.22.5" +Parsers = "1.0.15" +Tables = "1.3.2" +julia = "1.6" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/README.md b/README.md index 1fd949b..5da3ebc 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ It exists independently of any particular data set, and therefore can be constru This package facilitates 3 use cases: -1. Read/write a schema from/to a yaml file. +1. Read/write a schema from/to a [TOML](https://toml.io/en/v1.0.0) file. 2. Compare a data set to a schema and list the non-compliance issues. @@ -19,28 +19,29 @@ Indeed the 3 use cases listed above can be carried out without writing any Julia # Usage -A `TableSchema` looks like this `yaml` file: - -```YAML -name: mytable -description: "My table" -primarykey: patientid # A column name or a vector of column names -columns: - - patientid: {description: Patient ID, datatype: UInt, iscategorical: false, isrequired: true, isunique: true, validvalues: UInt} - - age: {description: Age (years), datatype: Int, iscategorical: false, isrequired: true, isunique: false, validvalues: "0:120"} - - dose: {description: Dose size, datatype: String, iscategorical: true, isrequired: true, isunique: false, - validvalues: ["small", "medium", "large"] - - fever: {description: Had fever, datatype: Bool, iscategorical: true, isrequired: true, isunique: false, validvalues: Bool} +A `TableSchema` looks like this `TOML` file: + +```toml +name = "mytable" +description = "My table" +primarykey = "patientid" # A column name or a vector of column names +columns = [ + {name = "patientid", description = "Patient ID", datatype = "UInt", validvalues = "UInt", iscategorical = false, isrequired = true, isunique = true}, + {name = "age", description = "Age (years)", datatype = "Int", validvalues = "Int", iscategorical = false, isrequired = true, isunique = false}, + {name = "dose", description = "Dose size", datatype = "String", validvalues = ["small", "medium", "large"], iscategorical = true, isrequired = true, isunique = false}, + {name = "fever", description = "Had fever", datatype = "Bool", validvalues = "Bool", iscategorical = true, isrequired = true, isunique = false} +] ``` A `Schema` contains 1 or more `TableSchema`. For example: -```YAML -name: fever -description: "Fever schema" -tables: - table1: *table1_schema - table2: *table2_schema +```TOML +name = "fever" +description = "Fever schema" + +[tables] +table1 = "table1_schema" +table2 = "table2_schema" ``` For tables that fit into memory, usage is as follows: @@ -49,7 +50,7 @@ For tables that fit into memory, usage is as follows: # Read in a schema using Schemata -schema = readschema(joinpath(dirname(pathof(Schemata)), "..", "test/schemata/fever.yaml")) +schema = readschema(joinpath(dirname(pathof(Schemata)), "..", "test/schemata/fever.toml")) ts = schema.tables[:mytable] # TableSchema for mytable # Construct/import a table (any object that satisfies the Tables.jl interface) @@ -152,16 +153,16 @@ We often want to ensure that certain relationships hold between variables within For example, we might require that a person's marriage date is after his/her birth date. We can achieve this by specifying one or more intra-row constraints in a `TableSchema` as follows: -```yaml -name: intrarow_constraints_demo -description: "Table with intra-row constraints" -primarykey: id -intrarow_constraints: - birth date before marriage date: "r[:dob] < r[:date_of_marriage]" -columns: - - id: {description: ID, datatype: UInt, iscategorical: false, isrequired: true, isunique: true, validvalues: UInt} - - dob: {description: Date of birth, datatype: Date, iscategorical: false, isrequired: true, isunique: false, validvalues: Date} - - date_of_marriage: {description: Date of marriage, datatype: Date, iscategorical: false, isrequired: false, isunique: false, validvalues: Date} +```toml +name = "intrarow_constraints_demo" +description = "Table with intra-row constraints" +primarykey = "patientid" +intrarow_constraints = {"birth date before marriage date" = "r[:dob] < r[:date_of_marriage]"} +columns = [ + {name="patientid", description = "Patient ID", datatype = "UInt", validvalues = "UInt", iscategorical = false, isrequired = true, isunique = true}, + {name="dob", description = "Date of birth", datatype = "Date", validvalues = "Date", iscategorical = false, isrequired = true, isunique = false}, + {name="date_of_marriage", description = "Date of marriage", datatype = "Date", validvalues = "Date", iscategorical = false, isrequired = false, isunique = false} +] ``` Each constraint is specified as a key-value pair, where the key is a description of the constraint and diff --git a/scripts/compare.jl b/scripts/compare.jl index 2e029ed..e7fbc35 100644 --- a/scripts/compare.jl +++ b/scripts/compare.jl @@ -1,7 +1,7 @@ #= Run this script as follows: $ cd /path/to/Schemata.jl - $ /path/to/julia scripts/compare.jl /path/to/config.yaml /path/to/inputdata sorted_by_primarykey + $ /path/to/julia scripts/compare.jl /path/to/config.toml /path/to/inputdata sorted_by_primarykey The 3rd argument, sorted_by_primarykey is either "true" or "false". If "true" the compare function assumes that your table is sorted by its primary key, which enables a faster comparison to the schema to be made. diff --git a/src/Schemata.jl b/src/Schemata.jl index 9393207..112460f 100644 --- a/src/Schemata.jl +++ b/src/Schemata.jl @@ -2,7 +2,7 @@ module Schemata export Schema, TableSchema, ColumnSchema, # types compare, # core function - readschema # read schema from config file + readschema, writeschema # read/write schema from/to config file include("handle_validvalues.jl") include("types.jl") diff --git a/src/compare/inmemory_table.jl b/src/compare/inmemory_table.jl index 3917fb7..49bf5b1 100644 --- a/src/compare/inmemory_table.jl +++ b/src/compare/inmemory_table.jl @@ -3,6 +3,7 @@ Compare an in-memory table to a table schema. """ module inmemory_table +using CategoricalArrays using DataFrames using Tables @@ -66,7 +67,7 @@ function compare(tableschema::TableSchema, indata, sorted_by_primarykey::Bool) # Column-level checks for (colname, colschema) in colname2colschema !colschema.iscategorical && continue - categorical!(outdata, colname) + transform!(outdata, colname => categorical, renamecols=false) end datacols_match_schemacols!(issues_in, tableschema, Set(propertynames(indata))) # By construction this issue doesn't exist for outdata compare_datatypes!(issues_in, indata, colname2colschema) diff --git a/src/compare/ondisk_table.jl b/src/compare/ondisk_table.jl index f15a08e..3eafde6 100644 --- a/src/compare/ondisk_table.jl +++ b/src/compare/ondisk_table.jl @@ -37,7 +37,7 @@ function compare(tableschema::TableSchema, input_data_file::String, output_data_ colissues_in = issues_in[:columnissues] colissues_out = issues_out[:columnissues] CSV.write(output_data_file, init_outdata(tableschema, 0); delim=delim_outdata) # Write column headers to disk - csvrows = CSV.Rows(input_data_file; reusebuffer=true, use_mmap=true) + csvrows = CSV.Rows(input_data_file; reusebuffer=true) for inputrow in csvrows # Parse inputrow into outputrow according to ColumnSchema i_outdata += 1 diff --git a/src/readwrite.jl b/src/readwrite.jl index f0eaf00..0b3ea26 100644 --- a/src/readwrite.jl +++ b/src/readwrite.jl @@ -1,15 +1,71 @@ module readwrite -export readschema +export readschema, writeschema -using YAML +using TOML using ..types "Returns either a Schema or a TableSchema, depending on the contents of the config file." function readschema(filename::String) - d = YAML.load_file(filename) + d = TOML.parsefile(filename) haskey(d, "columns") && return TableSchema(d) # Config is for a TableSchema Schema(d) # Config is for a Schema end +writeschema(outfile::String, schema::Schema) = toml_to_file(outfile, schema_to_dict(schema)) +writeschema(outfile::String, tableschema::TableSchema) = toml_to_file(outfile, tableschema_to_dict(tableschema)) + +function toml_to_file(outfile::String, d) + io = open(outfile, "w") + TOML.print(io, d) + close(io) +end + +################################################################################ +# Utils + +function schema_to_dict(schema::Schema) + result = Dict{String, Any}() + result["name"] = String(schema.name) + result["description"] = schema.description + result["tables"] = Dict(String(tablename) => tableschema_to_dict(tableschema) for (tablename, tableschema) in schema.tables) + result +end + +function tableschema_to_dict(tableschema::TableSchema) + result = Dict{String, Any}() + result["name"] = String(tableschema.name) + result["description"] = tableschema.description + result["primarykey"] = String.(tableschema.primarykey) + columns = Dict{String, Any}[] # colname => colschema + for colname in tableschema.columnorder + push!(columns, colschema_to_dict(tableschema.colname2colschema[colname])) + end + result["columns"] = columns + if !isempty(tableschema.intrarow_constraints) + result["intrarow_constraints"] = Dict(msg => func_as_supplied for (func_as_supplied, f, msg) in tableschema.intrarow_constraints) + end + result +end + +function colschema_to_dict(colschema::ColumnSchema) + result = Dict{String, Any}() + result["name"] = string(colschema.name) + result["description"] = colschema.description + result["datatype"] = string(colschema.datatype) + result["iscategorical"] = colschema.iscategorical + result["isrequired"] = colschema.isrequired + result["isunique"] = colschema.isunique + result["validvalues"] = format_validvalues(colschema.validvalues, colschema.valueorder) + if !isnothing(colschema.parser_as_supplied) + result["parser"] = colschema.parser_as_supplied + end + result +end + +format_validvalues(vv::DataType, valueorder) = string(vv) +format_validvalues(vv::AbstractRange, valueorder) = string(vv) +format_validvalues(vv::Set, valueorder::Nothing) = sort!([x for x in vv]) +format_validvalues(vv::Set, valueorder::Vector) = valueorder + end diff --git a/src/types.jl b/src/types.jl index c9880ae..9ad72c2 100644 --- a/src/types.jl +++ b/src/types.jl @@ -6,6 +6,7 @@ using Dates using Parsers import Base.parse # For extending Base.parse to Base.parse(s::ColumnSchema, val) +import Base.== using ..handle_validvalues @@ -19,20 +20,21 @@ mutable struct ColumnSchema validvalues::Union{DataType, <:AbstractRange, <:Set} # Either the full range of the data type or a user-supplied restriction. valueorder::Union{DataType, <:AbstractRange, <:Vector, Nothing} # If iscategorical, valueorder specifies the ordering of categories. Else nothing. parser::Function # outputvalue = parser(inputvalue) + parser_as_supplied::Union{Dict, Nothing} # Internal use only; for writing the parser to disk in writeschema. - function ColumnSchema(name, description, datatype, iscategorical, isrequired, isunique, validvalues, valueorder, parser) + function ColumnSchema(name, description, datatype, iscategorical, isrequired, isunique, validvalues, valueorder, parser, parser_as_supplied) # Ensure eltyp and validvalues are consistent with each other tp_validvals = get_datatype(validvalues) datatype != tp_validvals && error("Column :$(name). Type of valid values ($(tp_validvals)) does not match that of eltype ($(datatype)).") - new(Symbol(name), description, datatype, iscategorical, isrequired, isunique, validvalues, valueorder, parser) + new(Symbol(name), description, datatype, iscategorical, isrequired, isunique, validvalues, valueorder, parser, parser_as_supplied) end end -function ColumnSchema(name, description, datatype, iscategorical, isrequired, isunique, validvalues) +function ColumnSchema(name, description, datatype, iscategorical, isrequired, isunique, validvalues, parser_as_supplied=nothing) valueorder = iscategorical ? validvalues : nothing validvalues = validvalues isa Vector ? Set(validvalues) : validvalues parser = constructparser(nothing, nothing, nothing, datatype) - ColumnSchema(name, description, datatype, iscategorical, isrequired, isunique, validvalues, valueorder, parser) + ColumnSchema(name, description, datatype, iscategorical, isrequired, isunique, validvalues, valueorder, parser, parser_as_supplied) end function ColumnSchema(d::Dict) @@ -53,18 +55,21 @@ function ColumnSchema(d::Dict) valueorder = parse_validvalues(parser, datatype, d["validvalues"]) validvalues = valueorder isa Vector ? Set(valueorder) : valueorder valueorder = iscategorical ? valueorder : nothing - ColumnSchema(name, description, datatype, iscategorical, isrequired, isunique, validvalues, valueorder, parser) + parser_as_supplied = haskey(d, "parser") ? d["parser"] : nothing + ColumnSchema(name, description, datatype, iscategorical, isrequired, isunique, validvalues, valueorder, parser, parser_as_supplied) end function constructparser(func, args, kwargs, returntype) # Special cases - if ((func == Date) || (isnothing(func) && returntype == Date)) && !isnothing(args) && length(args) == 1 - df = DateFormat(args[1]) - return (x) -> try Date(x, df) catch e missing end + if (isnothing(func) && returntype === Date) || (!isnothing(func) && func isa DataType && func === Date) + if !isnothing(args) && length(args) == 1 + df = DateFormat(args[1]) + return (x) -> try Date(x, df) catch e missing end + end end # General cases - if func isa DataType || isnothing(func) + if isnothing(func) || func isa DataType opts = isnothing(kwargs) ? Parsers.Options() : Parsers.Options(kwargs...) function closure(val) len = val isa IO ? 0 : sizeof(val) # Use default pos=1 @@ -81,6 +86,19 @@ function constructparser(func, args, kwargs, returntype) error("Invalid specification of the parser.") end +function ==(cs1::ColumnSchema, cs2::ColumnSchema) + cs1.name !== cs2.name && return false + cs1.description != cs2.description && return false + cs1.datatype !== cs2.datatype && return false + cs1.iscategorical !== cs2.iscategorical && return false + cs1.isrequired !== cs2.isrequired && return false + cs1.isunique !== cs2.isunique && return false + cs1.validvalues != cs2.validvalues && return false + cs1.valueorder != cs2.valueorder && return false + cs1.parser_as_supplied != cs2.parser_as_supplied && return false + true +end + ################################################################################ struct TableSchema name::Symbol @@ -113,24 +131,18 @@ function TableSchema(d::Dict) name = Symbol(d["name"]) description = d["description"] pk = d["primarykey"] # String or Vector{String} - primarykey = typeof(pk) == String ? [Symbol(pk)] : [Symbol(colname) for colname in pk] + primarykey = pk isa String ? [Symbol(pk)] : [Symbol(colname) for colname in pk] columns = d["columns"] columnorder = fill(Symbol("x"), size(columns, 1)) colname2colschema = Dict{Symbol, ColumnSchema}() - i = 0 - for colname2schema in columns - for (colname, colschema) in colname2schema - i += 1 - columnorder[i] = Symbol(colname) - colschema["name"] = columnorder[i] - colname2colschema[columnorder[i]] = ColumnSchema(colschema) - end + for (i, d2) in enumerate(columns) + columnorder[i] = Symbol(d2["name"]) + colname2colschema[columnorder[i]] = ColumnSchema(d2) end intrarow_constraints = construct_intrarow_constraints(d) TableSchema(name, description, colname2colschema, columnorder, primarykey, intrarow_constraints) end - function construct_intrarow_constraints(d::Dict) !haskey(d, "intrarow_constraints") && return Tuple{String, Function}[] d = d["intrarow_constraints"] @@ -146,6 +158,33 @@ function construct_intrarow_constraints(d::Dict) result end +function ==(ts1::TableSchema, ts2::TableSchema) + ts1.name !== ts2.name && return false + ts1.description != ts2.description && return false + length(ts1.colname2colschema) != length(ts2.colname2colschema) && return false + for (colname1, colschema1) in ts1.colname2colschema + !haskey(ts2.colname2colschema, colname1) && return false + colschema2 = ts2.colname2colschema[colname1] + colschema1 != colschema2 && return false + end + length(ts1.columnorder) != length(ts2.columnorder) && return false + for (i, colname1) in enumerate(ts1.columnorder) + colname2 = ts2.columnorder[i] + colname1 !== colname2 && return false + end + length(ts1.primarykey) != length(ts2.primarykey) && return false + for (i, colname1) in enumerate(ts1.primarykey) + colname2 = ts2.primarykey[i] + colname1 !== colname2 && return false + end + length(ts1.intrarow_constraints) != length(ts2.intrarow_constraints) && return false + for (i, cname_func1) in enumerate(ts1.intrarow_constraints) + cname_func2 = ts2.intrarow_constraints[i] + cname_func1[1] != cname_func2[1] && return false + cname_func1[2] != cname_func2[2] && return false + end + true +end ################################################################################ struct Schema @@ -169,4 +208,16 @@ function Schema(d::Dict) Schema(name, description, tables) end +function ==(s1::Schema, s2::Schema) + s1.name !== s2.name && return false + s1.description != s2.description && return false + length(s1.tables) != length(s2.tables) && return false + for (tablename1, tableschema1) in s1.tables + !haskey(s2.tables, tablename1) && return false + tableschema2 = s2.tables[tablename1] + tableschema1 != tableschema2 && return false + end + true +end + end diff --git a/test/schemata/fever.toml b/test/schemata/fever.toml new file mode 100644 index 0000000..ef3cd31 --- /dev/null +++ b/test/schemata/fever.toml @@ -0,0 +1,12 @@ +name = "fever" +description = "Fever schema" + +[tables.mytable] +description = "My table" +primarykey = "patientid" +columns = [ + {name = "patientid", description = "Patient ID", datatype = "UInt", validvalues = "UInt", iscategorical = false, isrequired = true, isunique = true}, + {name = "age", description = "Age (years)", datatype = "Int", validvalues = "Int", iscategorical = false, isrequired = true, isunique = false}, + {name = "dose", description = "Dose size", datatype = "String", validvalues = ["small", "medium", "large"], iscategorical = true, isrequired = true, isunique = false}, + {name = "fever", description = "Had fever", datatype = "Bool", validvalues = "Bool", iscategorical = true, isrequired = true, isunique = false} +] \ No newline at end of file diff --git a/test/schemata/fever.yaml b/test/schemata/fever.yaml deleted file mode 100644 index 7a74efb..0000000 --- a/test/schemata/fever.yaml +++ /dev/null @@ -1,35 +0,0 @@ -name: fever -description: "Fever schema" -tables: - mytable: - description: "My table" - primarykey: patientid - columns: - - patientid: - description: Patient ID - datatype: UInt - iscategorical: false - isrequired: true - isunique: true - validvalues: UInt - - age: - description: Age (years) - datatype: Int - iscategorical: false - isrequired: true - isunique: false - validvalues: Int - - dose: - description: Dose size - datatype: String - iscategorical: true - isrequired: true - isunique: false - validvalues: ["small", "medium", "large"] - - fever: - description: Had fever - datatype: Bool - iscategorical: true - isrequired: true - isunique: false - validvalues: Bool diff --git a/test/schemata/row_constraints.toml b/test/schemata/row_constraints.toml new file mode 100644 index 0000000..e33f1e7 --- /dev/null +++ b/test/schemata/row_constraints.toml @@ -0,0 +1,12 @@ +name = "constraints" +description = "Date constraints" + +[tables.dates] +description = "Table with date constraints" +primarykey = "patientid" +intrarow_constraints = {"birth date before marriage date" = "r[:dob] < r[:date_of_marriage]"} +columns = [ + {name="patientid", description = "Patient ID", datatype = "UInt", validvalues = "UInt", iscategorical = false, isrequired = true, isunique = true}, + {name="dob", description = "Date of birth", datatype = "Date", validvalues = "Date", iscategorical = false, isrequired = true, isunique = false}, + {name="date_of_marriage", description = "Date of marriage", datatype = "Date", validvalues = "Date", iscategorical = false, isrequired = false, isunique = false} +] \ No newline at end of file diff --git a/test/schemata/row_constraints.yaml b/test/schemata/row_constraints.yaml deleted file mode 100644 index aae681c..0000000 --- a/test/schemata/row_constraints.yaml +++ /dev/null @@ -1,29 +0,0 @@ -name: constraints -description: "Date constraints" -tables: - dates: - description: "Table with date constraints" - primarykey: patientid - intrarow_constraints: {birth date before marriage date: "r[:dob] < r[:date_of_marriage]"} - columns: - - patientid: - description: Patient ID - datatype: UInt - iscategorical: false - isrequired: true - isunique: true - validvalues: UInt - - dob: - description: Date of birth - datatype: Date - iscategorical: false - isrequired: true - isunique: false - validvalues: Date - - date_of_marriage: - description: Date of marriage - datatype: Date - iscategorical: false - isrequired: false - isunique: false - validvalues: Date diff --git a/test/test_inmemory_tables.jl b/test/test_inmemory_tables.jl index 6fd497c..7dba625 100644 --- a/test/test_inmemory_tables.jl +++ b/test/test_inmemory_tables.jl @@ -37,7 +37,7 @@ outdata, issues_in, issues_out = compare(ts, tbl) @test size(issues_in, 1) == 4 # Modify data to comply with the schema -categorical!(tbl, [:dose, :fever]) # Ensure :dose and :fever contain categorical data +transform!(tbl, [:dose, :fever] .=> categorical, renamecols=false) # Ensure :dose and :fever contain categorical data outdata, issues_in, issues_out = compare(ts, tbl) @test size(issues_in, 1) == 2 @test size(issues_out, 1) == 0 @@ -69,10 +69,11 @@ push!(schema.tables[:mytable].columnorder, :zipcode) @test schema.tables[:mytable].colname2colschema[:zipcode] == zipcode # Write the updated schema to disk -#schemafile = joinpath(dirname(pathof(Schemata)), "..", "test/schemata/fever_updated.yaml") -#writeschema(schemafile, schema) -#schema_from_disk = readschema(schemafile) -#@test schema == schema_from_disk +schemafile = joinpath(dirname(pathof(Schemata)), "..", "test/schemata/fever_updated.toml") +writeschema(schemafile, schema) +schema_from_disk = readschema(schemafile) +rm(schemafile) +@test schema == schema_from_disk # Add a corresponding (non-compliant) column to the data tbl[!, :zipcode] = ["11111", "22222", "33333", "NULL"]; # CSV file was supplied with "NULL" values, forcing eltype to be String. @@ -103,7 +104,7 @@ end my_zdt_custom_parser(dttm::DateTime, tz::String) = ZonedDateTime(dttm, TimeZone(tz)) -# Dict for ColumnSchema constructor, obtained after reading yaml +# Dict for ColumnSchema constructor, obtained after reading toml d = Dict("name" => "zdt", "description" => "Test custom parser for TimeZones.ZonedDateTime", "datatype" => "ZonedDateTime", "iscategorical" => false, "isrequired" => true, "isunique" => true, @@ -150,7 +151,7 @@ outdata, issues_in, issues_out = compare(ts, tbl); ################################################################################ # Test intra-row constraints function test_row_constraints() - filename = joinpath(dirname(pathof(Schemata)), "..", "test/schemata/row_constraints.yaml") + filename = joinpath(dirname(pathof(Schemata)), "..", "test/schemata/row_constraints.toml") schema = readschema(filename) d = DataFrame( patientid = UInt.([1,2,3]), diff --git a/test/test_ondisk_tables.jl b/test/test_ondisk_tables.jl index 193dae5..c384f5a 100644 --- a/test/test_ondisk_tables.jl +++ b/test/test_ondisk_tables.jl @@ -60,7 +60,7 @@ issues_in = DataFrame(CSV.File(issues_infile; delim='\t')) ################################################################################ # Test intra-row constraints function test_row_constraints() - filename = joinpath(dirname(pathof(Schemata)), "..", "test/schemata/row_constraints.yaml") + filename = joinpath(dirname(pathof(Schemata)), "..", "test/schemata/row_constraints.toml") schema = readschema(filename) indata = DataFrame( patientid = UInt.([1,2,3]),