Skip to content

Commit

Permalink
Merge pull request #1447 from mlaily/inline-schemas
Browse files Browse the repository at this point in the history
  • Loading branch information
cartermp authored Aug 8, 2022
2 parents 6757c40 + 071b030 commit d0a5f47
Show file tree
Hide file tree
Showing 204 changed files with 24,766 additions and 428 deletions.
27 changes: 18 additions & 9 deletions docs/tutorials/JsonAnonymizer.fsx
Original file line number Diff line number Diff line change
Expand Up @@ -68,18 +68,25 @@ type JsonAnonymizer(?propertiesToSkip, ?valuesToSkip) =
let randomize (str:string) =
String(str.ToCharArray() |> Array.map getRandomChar)

/// Returns true when `typ` is an `InferedType.Primitive` whose underlying
/// .NET type equals `testType`; every other inferred type yields false.
let isType testType typ =
    match typ with
    | Runtime.StructuralTypes.InferedType.Primitive (primitiveType, _, _, _) when primitiveType = testType -> true
    | _ -> false

let rec anonymize json =
match json with
| JsonValue.String s when valuesToSkip.Contains s -> json
| JsonValue.String s ->
let typ =
Runtime.StructuralInference.inferPrimitiveType
CultureInfo.InvariantCulture s

( if typ = typeof<Guid> then Guid.NewGuid().ToString()
elif typ = typeof<Runtime.StructuralTypes.Bit0> ||
typ = typeof<Runtime.StructuralTypes.Bit1> then s
elif typ = typeof<DateTime> then s
Runtime.StructuralInference.defaultUnitsOfMeasureProvider
Runtime.StructuralInference.InferenceMode'.ValuesOnly
CultureInfo.InvariantCulture s None

( if typ |> isType typeof<Guid> then Guid.NewGuid().ToString()
elif typ |> isType typeof<Runtime.StructuralTypes.Bit0> ||
typ |> isType typeof<Runtime.StructuralTypes.Bit1> then s
elif typ |> isType typeof<DateTime> then s
else
let prefix, s =
if s.StartsWith "http://" then
Expand All @@ -92,9 +99,11 @@ type JsonAnonymizer(?propertiesToSkip, ?valuesToSkip) =
| JsonValue.Number d ->
let typ =
Runtime.StructuralInference.inferPrimitiveType
CultureInfo.InvariantCulture (d.ToString())
if typ = typeof<Runtime.StructuralTypes.Bit0> ||
typ = typeof<Runtime.StructuralTypes.Bit1> then json
Runtime.StructuralInference.defaultUnitsOfMeasureProvider
Runtime.StructuralInference.InferenceMode'.ValuesOnly
CultureInfo.InvariantCulture (d.ToString()) None
if typ |> isType typeof<Runtime.StructuralTypes.Bit0> ||
typ |> isType typeof<Runtime.StructuralTypes.Bit1> then json
else d.ToString() |> randomize |> Decimal.Parse |> JsonValue.Number
| JsonValue.Float f ->
f.ToString()
Expand Down
399 changes: 316 additions & 83 deletions src/CommonRuntime/StructuralInference.fs

Large diffs are not rendered by default.

28 changes: 16 additions & 12 deletions src/CommonRuntime/StructuralTypes.fs
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,11 @@ type InferedTypeTag =
/// to generate nicer types!
[<CustomEquality; NoComparison; RequireQualifiedAccess>]
type InferedType =
| Primitive of typ: Type * unit: option<System.Type> * optional: bool
| Primitive of typ: Type * unit: option<System.Type> * optional: bool * shouldOverrideOnMerge: bool
| Record of name: string option * fields: InferedProperty list * optional: bool
| Json of typ: InferedType * optional: bool
| Collection of order: InferedTypeTag list * types: Map<InferedTypeTag, InferedMultiplicity * InferedType>
| Heterogeneous of types: Map<InferedTypeTag, InferedType>
| Heterogeneous of types: Map<InferedTypeTag, InferedType> * containsOptional: bool
| Null
| Top

Expand All @@ -86,16 +86,17 @@ type InferedType =
member x.EnsuresHandlesMissingValues allowEmptyValues =
match x with
| Null
| Heterogeneous _
| Heterogeneous(containsOptional = true)
| Primitive(optional = true)
| Record(optional = true)
| Json(optional = true) -> x
| Primitive (typ, _, false) when
| Primitive (typ, _, false, _) when
allowEmptyValues
&& InferedType.CanHaveEmptyValues typ
->
x
| Primitive (typ, unit, false) -> Primitive(typ, unit, true)
| Heterogeneous (map, false) -> Heterogeneous(map, true)
| Primitive (typ, unit, false, overrideOnMerge) -> Primitive(typ, unit, true, overrideOnMerge)
| Record (name, props, false) -> Record(name, props, true)
| Json (typ, false) -> Json(typ, true)
| Collection (order, types) ->
Expand All @@ -106,12 +107,15 @@ type InferedType =
Collection(order, typesR)
| Top -> failwith "EnsuresHandlesMissingValues: unexpected InferedType.Top"

member x.DropOptionality() =
member x.GetDropOptionality() =
match x with
| Primitive (typ, unit, true) -> Primitive(typ, unit, false)
| Record (name, props, true) -> Record(name, props, false)
| Json (typ, true) -> Json(typ, false)
| _ -> x
| Primitive (typ, unit, true, overrideOnMerge) -> Primitive(typ, unit, false, overrideOnMerge), true
| Record (name, props, true) -> Record(name, props, false), true
| Json (typ, true) -> Json(typ, false), true
| Heterogeneous (map, true) -> Heterogeneous(map, false), true
| _ -> x, false

member x.DropOptionality() = x.GetDropOptionality() |> fst

// We need to implement custom equality that returns 'true' when
// values reference the same object (to support recursive types)
Expand All @@ -121,11 +125,11 @@ type InferedType =
if y :? InferedType then
match x, y :?> InferedType with
| a, b when Object.ReferenceEquals(a, b) -> true
| Primitive (t1, ot1, b1), Primitive (t2, ot2, b2) -> t1 = t2 && ot1 = ot2 && b1 = b2
| Primitive (t1, ot1, b1, x1), Primitive (t2, ot2, b2, x2) -> t1 = t2 && ot1 = ot2 && b1 = b2 && x1 = x2
| Record (s1, pl1, b1), Record (s2, pl2, b2) -> s1 = s2 && pl1 = pl2 && b1 = b2
| Json (t1, o1), Json (t2, o2) -> t1 = t2 && o1 = o2
| Collection (o1, t1), Collection (o2, t2) -> o1 = o2 && t1 = t2
| Heterogeneous (m1), Heterogeneous (m2) -> m1 = m2
| Heterogeneous (m1, o1), Heterogeneous (m2, o2) -> m1 = m2 && o1 = o2
| Null, Null
| Top, Top -> true
| _ -> false
Expand Down
160 changes: 69 additions & 91 deletions src/Csv/CsvInference.fs
Original file line number Diff line number Diff line change
Expand Up @@ -10,46 +10,34 @@ open FSharp.Data.Runtime
open FSharp.Data.Runtime.StructuralTypes
open FSharp.Data.Runtime.StructuralInference

/// The schema may be set explicitly. This table specifies the mapping
/// from the names that users can use to the types used.
let private nameToType =
[ "int", (typeof<int>, TypeWrapper.None)
"int64", (typeof<int64>, TypeWrapper.None)
"bool", (typeof<bool>, TypeWrapper.None)
"float", (typeof<float>, TypeWrapper.None)
"decimal", (typeof<decimal>, TypeWrapper.None)
"date", (typeof<DateTime>, TypeWrapper.None)
"datetimeoffset", (typeof<DateTimeOffset>, TypeWrapper.None)
"timespan", (typeof<TimeSpan>, TypeWrapper.None)
"guid", (typeof<Guid>, TypeWrapper.None)
"string", (typeof<String>, TypeWrapper.None)
"int?", (typeof<int>, TypeWrapper.Nullable)
"int64?", (typeof<int64>, TypeWrapper.Nullable)
"bool?", (typeof<bool>, TypeWrapper.Nullable)
"float?", (typeof<float>, TypeWrapper.Nullable)
"decimal?", (typeof<decimal>, TypeWrapper.Nullable)
"date?", (typeof<DateTime>, TypeWrapper.Nullable)
"datetimeoffset?", (typeof<DateTimeOffset>, TypeWrapper.Nullable)
"timespan?", (typeof<TimeSpan>, TypeWrapper.Nullable)
"guid?", (typeof<Guid>, TypeWrapper.Nullable)
"int option", (typeof<int>, TypeWrapper.Option)
"int64 option", (typeof<int64>, TypeWrapper.Option)
"bool option", (typeof<bool>, TypeWrapper.Option)
"float option", (typeof<float>, TypeWrapper.Option)
"decimal option", (typeof<decimal>, TypeWrapper.Option)
"date option", (typeof<DateTime>, TypeWrapper.Option)
"datetimeoffset option", (typeof<DateTimeOffset>, TypeWrapper.Option)
"timespan option", (typeof<TimeSpan>, TypeWrapper.Option)
"guid option", (typeof<Guid>, TypeWrapper.Option)
"string option", (typeof<string>, TypeWrapper.Option) ]
/// Lookup table from the type names users may write in a CSV schema to the
/// (type, wrapper) pair they denote. It starts with every entry of
/// StructuralInference.nameToType and adds the CSV-only "?" (nullable) and
/// " option" spellings.
let private nameToTypeForCsv =
    // Types that accept the nullable "?" suffix
    let valueTypes =
        [ "int", typeof<int>
          "int64", typeof<int64>
          "bool", typeof<bool>
          "float", typeof<float>
          "decimal", typeof<decimal>
          "date", typeof<DateTime>
          "datetimeoffset", typeof<DateTimeOffset>
          "timespan", typeof<TimeSpan>
          "guid", typeof<Guid> ]

    // string has no "?" spelling; it only gets the " option" one
    let optionTypes = valueTypes @ [ "string", typeof<string> ]

    seq {
        for KeyValue (name, entry) in StructuralInference.nameToType do
            yield name, entry

        for name, typ in valueTypes do
            yield name + "?", (typ, TypeWrapper.Nullable)

        for name, typ in optionTypes do
            yield name + " option", (typ, TypeWrapper.Option)
    }
    |> dict

/// Lazily constructed regex for text of the shape `name(type)`: the part
/// before the parentheses is captured as `name`, the part inside as `type`.
let private nameAndTypeRegex =
    lazy
        (let options = RegexOptions.Compiled ||| RegexOptions.RightToLeft
         Regex(@"^(?<name>.+)\((?<type>.+)\)$", options))

/// Lazily constructed regex for text of the shape `type<unit>`: the part
/// before the angle brackets is captured as `type`, the part inside as `unit`.
let private typeAndUnitRegex =
    lazy
        (let options = RegexOptions.Compiled ||| RegexOptions.RightToLeft
         Regex(@"^(?<type>.+)<(?<unit>.+)>$", options))

let private overrideByNameRegex =
lazy
Regex(
Expand All @@ -65,56 +53,15 @@ type private SchemaParseResult =
| FullByName of property: PrimitiveInferedProperty * originalName: string
| Rename of name: string * originalName: string

/// Turns a (succeeded, value) pair, such as the one a TryGetValue call
/// returns, into Some value on success and None otherwise.
let private asOption (succeeded, value) =
    if succeeded then Some value else None

/// <summary>
/// Parses the type specification of a single schema column.
/// Accepted forms: <c>type|measure|type&lt;measure&gt;</c>
/// Returns a pair (type option, unit option); fails when the text has the
/// <c>type&lt;measure&gt;</c> form with a known type but an unknown measure.
/// </summary>
let private parseTypeAndUnit unitsOfMeasureProvider str =
    // Lower-cases the name and looks it up in the schema type table
    let lookupType (name: string) =
        match nameToType.TryGetValue(name.ToLowerInvariant()) with
        | true, entry -> Some entry
        | false, _ -> None

    let m = typeAndUnitRegex.Value.Match(str)

    if not m.Success then
        // Not a full type<unit>: the text is either a type or a unit
        match lookupType str with
        | Some _ as typ ->
            // Just type
            typ, None
        | None ->
            // Just unit (or nothing)
            None, StructuralInference.parseUnitOfMeasure unitsOfMeasureProvider str
    else
        // type<unit> case, both type and unit have to be valid
        match lookupType (m.Groups.["type"].Value.TrimEnd()) with
        | None -> None, None
        | Some _ as typ ->
            let unitName = m.Groups.["unit"].Value.Trim()

            match StructuralInference.parseUnitOfMeasure unitsOfMeasureProvider unitName with
            | None -> failwithf "Invalid unit of measure %s" unitName
            | unit -> typ, unit

/// Parse schema specification for column. This can either be a name
/// with type or just type: name (typeInfo)|typeInfo.
/// If forSchemaOverride is set to true, only Full or Name is returned
/// (if we succeed we override the inferred schema, otherwise, we just
/// override the header name)
let private parseSchemaItem unitsOfMeasureProvider str forSchemaOverride =
let parseTypeAndUnit =
StructuralInference.parseTypeAndUnit unitsOfMeasureProvider nameToTypeForCsv

let name, typ, unit, isOverrideByName, originalName =
let m = overrideByNameRegex.Value.Match str

Expand All @@ -123,7 +70,7 @@ let private parseSchemaItem unitsOfMeasureProvider str forSchemaOverride =
let originalName = m.Groups.["name"].Value.TrimEnd()
let newName = m.Groups.["newName"].Value.Trim()
let typeAndUnit = m.Groups.["type"].Value.Trim()
let typ, unit = parseTypeAndUnit unitsOfMeasureProvider typeAndUnit
let typ, unit = parseTypeAndUnit typeAndUnit

if typ.IsNone && typeAndUnit <> "" then
failwithf "Invalid type: %s" typeAndUnit
Expand All @@ -136,11 +83,11 @@ let private parseSchemaItem unitsOfMeasureProvider str forSchemaOverride =
// name (type|measure|type<measure>)
let name = m.Groups.["name"].Value.TrimEnd()
let typeAndUnit = m.Groups.["type"].Value.Trim()
let typ, unit = parseTypeAndUnit unitsOfMeasureProvider typeAndUnit
let typ, unit = parseTypeAndUnit typeAndUnit
name, typ, unit, false, ""
elif forSchemaOverride then
// type|type<measure>
let typ, unit = parseTypeAndUnit unitsOfMeasureProvider str
let typ, unit = parseTypeAndUnit str

match typ, unit with
| None, _ -> str, None, None, false, ""
Expand All @@ -162,18 +109,26 @@ let private parseSchemaItem unitsOfMeasureProvider str forSchemaOverride =
| None, Some _ when forSchemaOverride -> SchemaParseResult.Name str
| None, Some unit -> SchemaParseResult.NameAndUnit(name, unit)

let internal inferCellType preferOptionals missingValues cultureInfo unit (value: string) =
let internal inferCellType
unitsOfMeasureProvider
preferOptionals
missingValues
inferenceMode
cultureInfo
unit
(value: string)
=
// Explicit missing values (NaN, NA, empty string, etc.) are treated as float unless preferOptionals is set to true
if Array.exists (value.Trim() |> (=)) missingValues then
if preferOptionals then
InferedType.Null
else
InferedType.Primitive(typeof<float>, unit, false)
InferedType.Primitive(typeof<float>, unit, false, false)
// If there's only whitespace between commas, treat it as a missing value and not as a string
elif String.IsNullOrWhiteSpace value then
InferedType.Null
else
getInferedTypeFromString cultureInfo value unit
StructuralInference.getInferedTypeFromString unitsOfMeasureProvider inferenceMode cultureInfo value unit

let internal parseHeaders headers numberOfColumns schema unitsOfMeasureProvider =

Expand Down Expand Up @@ -282,9 +237,11 @@ let internal inferType
(rows: seq<_>)
inferRows
missingValues
inferenceMode
cultureInfo
assumeMissingValues
preferOptionals
unitsOfMeasureProvider
=

// If we have no data, generate one empty row with empty strings,
Expand Down Expand Up @@ -328,7 +285,15 @@ let internal inferType
let typ =
match schema with
| Some _ -> InferedType.Null // this will be ignored, so just return anything
| None -> inferCellType preferOptionals missingValues cultureInfo unit value
| None ->
inferCellType
unitsOfMeasureProvider
preferOptionals
missingValues
inferenceMode
cultureInfo
unit
value

{ Name = name; Type = typ } ]

Expand Down Expand Up @@ -377,7 +342,7 @@ let internal getFields preferOptionals inferedType schema =
field.Name, field.Name

match field.Type with
| InferedType.Primitive (typ, unit, optional) ->
| InferedType.Primitive (typ, unit, optional, _) ->

// Transform the types as described above
let typ, typWrapper =
Expand Down Expand Up @@ -420,11 +385,23 @@ let internal inferColumnTypes
rows
inferRows
missingValues
inferenceMode
cultureInfo
assumeMissingValues
preferOptionals
unitsOfMeasureProvider
=
inferType headerNamesAndUnits schema rows inferRows missingValues cultureInfo assumeMissingValues preferOptionals
inferType
headerNamesAndUnits
schema
rows
inferRows
missingValues
inferenceMode
cultureInfo
assumeMissingValues
preferOptionals
unitsOfMeasureProvider
||> getFields preferOptionals

type CsvFile with
Expand All @@ -442,14 +419,13 @@ type CsvFile with
(
inferRows,
missingValues,
inferenceMode,
cultureInfo,
schema,
assumeMissingValues,
preferOptionals,
[<Optional>] ?unitsOfMeasureProvider
unitsOfMeasureProvider
) =
let unitsOfMeasureProvider =
defaultArg unitsOfMeasureProvider defaultUnitsOfMeasureProvider

let headerNamesAndUnits, schema =
parseHeaders x.Headers x.NumberOfColumns schema unitsOfMeasureProvider
Expand All @@ -460,6 +436,8 @@ type CsvFile with
(x.Rows |> Seq.map (fun row -> row.Columns))
inferRows
missingValues
inferenceMode
cultureInfo
assumeMissingValues
preferOptionals
unitsOfMeasureProvider
Loading

0 comments on commit d0a5f47

Please sign in to comment.