From 6969dc26007f2d5031f08852e0be7066a09e9a75 Mon Sep 17 00:00:00 2001 From: Sergey Prokhorov Date: Fri, 19 Jul 2019 17:57:27 +0200 Subject: [PATCH 01/13] .avdl lexer+parser draft --- .gitignore | 2 + rebar.config | 2 +- src/avro_idl_lexer.xrl | 77 +++++++++++ src/avro_idl_parser.yrl | 180 ++++++++++++++++++++++++++ test/data/empty_protocol.avdl | 6 + test/data/full_protocol.avdl | 33 +++++ test/data/protocol_with_typedefs.avdl | 26 ++++ 7 files changed, 325 insertions(+), 1 deletion(-) create mode 100644 src/avro_idl_lexer.xrl create mode 100644 src/avro_idl_parser.yrl create mode 100644 test/data/empty_protocol.avdl create mode 100644 test/data/full_protocol.avdl create mode 100644 test/data/protocol_with_typedefs.avdl diff --git a/.gitignore b/.gitignore index 919cc29..8909176 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,5 @@ out/ _build rebar.lock *.crashdump +src/avro_idl_lexer.erl +src/avro_idl_parser.erl diff --git a/rebar.config b/rebar.config index 38c47a6..9f49d6c 100644 --- a/rebar.config +++ b/rebar.config @@ -1,6 +1,6 @@ %% -*- mode:erlang -*- {erl_opts, [ debug_info - , warnings_as_errors +% , warnings_as_errors , {d,'NOTEST'} ]}. {eunit_opts, [verbose]}. diff --git a/src/avro_idl_lexer.xrl b/src/avro_idl_lexer.xrl new file mode 100644 index 0000000..eb6c1ba --- /dev/null +++ b/src/avro_idl_lexer.xrl @@ -0,0 +1,77 @@ +%% @doc Avro IDL lexer +%% https://avro.apache.org/docs/current/idl.html + +Definitions. + +Rules. + +[\s\t\n\r]+ : skip_token. + +"[^\"]+" : {token, {string_v, TokenLine, unescape(TokenChars, $\")}}. + +`[^\`]+` : {token, {id, TokenLine, unescape(TokenChars, $`)}}. + +//[^\r\n]* : {token, {comment_v, TokenLine, unescape_line_comment(TokenChars)}}. + +/\*(.|[\r\n])*\*/ : {token, {comment_v, TokenLine, unescape_multiline_comment(TokenChars)}}. + +\{ : {token, {'{', TokenLine}}. +\} : {token, {'}', TokenLine}}. +\( : {token, {'(', TokenLine}}. +\) : {token, {')', TokenLine}}. +\[ : {token, {'[', TokenLine}}. +\] : {token, {']', TokenLine}}. +< : {token, {'<', TokenLine}}. +> : {token, {'>', TokenLine}}. +; : {token, {';', TokenLine}}. +\, : {token, {',', TokenLine}}. + + +%% Default values (json) += : {token, {'=', TokenLine}}. +%% TODO: better float regexp +[+-]?[0-9]+\.[0-9]+ : {token, {float_v, TokenLine, list_to_float(TokenChars)}}. +[+-]?[0-9]+ : {token, {integer_v, TokenLine, list_to_integer(TokenChars)}}. +true|false : {token, {bool_v, TokenLine, list_to_atom(TokenChars)}}. +%% TODO: null?/:(for maps)/???... + +%% === Datatype IDs === + +%% primitive; FIXME: 'null' can be used in both primitive and data! +int|long|string|boolean|float|double|bytes|null : {token, {primitive_t, TokenLine, list_to_atom(TokenChars)}}. + +%% complex +record|enum|array|map|fixed|union : {token, {list_to_atom(TokenChars ++ "_t"), TokenLine}}. + +%% Logical +decimal|date|time_ms|timestamp_ms : {token, {logical_t, TokenLine, list_to_atom(TokenChars)}}. + +%% keywords +error|throws|oneway|void|import|idl|protocol|schema : {token, {list_to_atom(TokenChars ++ "_k"), TokenLine}}. + +%% === Constructs === + +@[a-zA-Z0-9_-]+ : {token, {annotation_v, TokenLine, unescape_annotation(TokenChars)}}. + +[A-Za-z_][A-Za-z_0-9]* : {token, {id, TokenLine, TokenChars}}. + +Erlang code. + + +unescape(Token, Char) -> + string:trim(Token, both, [Char]). + +unescape_line_comment("//" ++ Comment) -> + Comment. + +%% TODO: cleanup +unescape_multiline_comment("/**" ++ Comment0) -> + %% Drop closing "*/" + Len = length(Comment0), + lists:sublist(Comment0, Len - 2); +unescape_multiline_comment("/*" ++ Comment0) -> + Len = length(Comment0), + lists:sublist(Comment0, Len - 2). + +unescape_annotation("@" ++ Annotation) -> + Annotation. diff --git a/src/avro_idl_parser.yrl b/src/avro_idl_parser.yrl new file mode 100644 index 0000000..f2d3070 --- /dev/null +++ b/src/avro_idl_parser.yrl @@ -0,0 +1,180 @@ +%% @doc Avro IDL parser +%% https://avro.apache.org/docs/current/idl.html + +Header "%% Hello". + +Terminals id string_v comment_v float_v integer_v bool_v annotation_v + primitive_t logical_t + '{' '}' '(' ')' '[' ']' '<' '>' ';' ',' '=' + record_t enum_t array_t map_t fixed_t union_t + protocol_k error_k throws_k oneway_k void_k import_k idl_k schema_k. + +Nonterminals + protocol typedefs + decorator decorator_value string array_of_strings array_of_strings_tail + typedef_tail typedef + import import_file_type + primitive + enum enum_variants + union + record + fixed + array + map + error + data + array_of_data array_of_data_tail. + +Rootsymbol protocol. + + +protocol -> + protocol_k id '{' '}' : + {protocol, value_of('$2'), []}. +protocol -> + protocol_k id '{' typedef typedef_tail : + {protocol, value_of('$2'), ['$4' | '$5']}. +protocol -> + decorator protocol : + {decorated, '$1', '$2'}. % todo: embed into protocol? + + +%% == Decorator == +decorator -> + annotation_v '(' decorator_value ')' : + {decorator, value_of('$1'), '$3'}. + +%% Maybe can just use `data` instead of `decorator_value`? +decorator_value -> + string : + '$1'. +decorator_value -> + array_of_strings : + '$1'. + +string -> + string_v : + value_of('$1'). + +array_of_strings -> + '[' ']' : + []. +array_of_strings -> + '[' string array_of_strings_tail : + ['$2' | '$3']. + +array_of_strings_tail -> + ']' : + []. +array_of_strings_tail -> + ',' string array_of_strings_tail : + ['$2' | '$3']. + + +%% == Type definitions (inside protocol or record) == + +typedef_tail -> + '}' : + []. +typedef_tail -> + typedef typedef_tail : + ['$1' | '$2']. + +%% TODO: generalize to 'type' name (= value)(;) +typedef -> import : '$1'. +typedef -> primitive : '$1'. +typedef -> enum : '$1'. +typedef -> union : '$1'. +typedef -> record : '$1'. +typedef -> fixed : '$1'. +typedef -> array : '$1'. +typedef -> map : '$1'. +typedef -> error : '$1'. +%% typedef -> function : '$1'. % TODO + +%% -- Import def + +import -> + import_k import_file_type string_v ';' : + {import, '$2', value_of('$3')}. + +import_file_type -> idl_k : idl. +import_file_type -> protocol_k : protocol. +import_file_type -> schema_k : schema. + +%% -- Primitive typedef +primitive -> + primitive_t id ';' : + {primitive, value_of('$2'), value_of('$1'), undefined}. +primitive -> + primitive_t id '=' data ';' : + {primitive, value_of('$2'), value_of('$1'), '$4'}. + +%% -- Enum typedef +enum -> + enum_t id '{' id enum_variants : + {enum, value_of('$2'), [value_of('$4') | '$5']}. + +enum_variants -> + '}' : + []. +enum_variants -> + ',' id enum_variants : [value_of('$2') | '$3']. + + +union -> union_t : '$1'. %TODO +record -> record_t : '$1'. %TODO + +%% -- Fixed typedef +fixed -> + fixed_t id '(' integer_v ')' ';': + {fixed, '$2', value_of('$4'), undefined}. +fixed -> + fixed_t id '(' integer_v ')' '=' data ';' : + {fixed, '$2', value_of('$4'), '$6'}. + +%% -- Array typedef +array -> + array_t '<' primitive_t '>' id ';' : + {array, value_of('$5'), value_of('$3'), undefined}. %FIXME: not just primitives! +array -> + array_t '<' primitive_t '>' id '=' data ';' : + {array, value_of('$5'), value_of('$3'), '$7'}. + +%% -- Map typedef +map -> + map_t '<' primitive_t '>' id ';' : + {map, '$5', value_of('$3'), undefined}. %FIXME: not just primitives!; defaults! + +%% -- Error typedef +error -> + error_k : '$1'. %TODO + +%% == Data (JSON) for default values +data -> string_v : value_of('$1'). +data -> integer_v : value_of('$1'). +data -> float_v : value_of('$1'). +data -> bool_v : value_of('$1'). +data -> array_of_data : '$1'. + +array_of_data -> + '[' ']' : + []. +array_of_data -> + '[' data array_of_data_tail : + ['$2' | '$3']. + +array_of_data_tail -> + ']' : + []. +array_of_data_tail -> + ',' data array_of_data_tail : + ['$2' | '$3']. + +Erlang code. + +value_of(Token) -> + try element(3, Token) + catch error:badarg -> + error({badarg, Token}) + end. diff --git a/test/data/empty_protocol.avdl b/test/data/empty_protocol.avdl new file mode 100644 index 0000000..20082c9 --- /dev/null +++ b/test/data/empty_protocol.avdl @@ -0,0 +1,6 @@ +@deco1("wasd") +@deco2(["abc", "def"]) +@deco3([]) +protocol MyProto { + +} diff --git a/test/data/full_protocol.avdl b/test/data/full_protocol.avdl new file mode 100644 index 0000000..9d07aa9 --- /dev/null +++ b/test/data/full_protocol.avdl @@ -0,0 +1,33 @@ +/** + * An example protocol in Avro IDL + */ +@namespace("org.apache.avro.test") +protocol Simple { + @aliases(["org.foo.KindOf"]) + enum Kind { + FOO, + BAR, // the bar enum value + BAZ + } + fixed MD5(16); + record TestRecord { + @order("ignore") + string name = "default name"; + int amount = -1; + @order("descending") + Kind kind; + MD5 hash; + union { MD5, null} @aliases(["hash"]) nullableHash; + array arrayOfLongs; + } + error TestError { + string message; + } + string hello(string greeting); + TestRecord echo(TestRecord `record`); + int add(int arg1, int arg2); + bytes echoBytes(bytes data); + void `error`() throws TestError; + void ping() oneway; +} + diff --git a/test/data/protocol_with_typedefs.avdl b/test/data/protocol_with_typedefs.avdl new file mode 100644 index 0000000..75ca25e --- /dev/null +++ b/test/data/protocol_with_typedefs.avdl @@ -0,0 +1,26 @@ + +protocol MyProto { + import idl "foo.avdl"; + import protocol "bar.avpr"; + import schema "baz.avsc"; + + enum MyEnum1 { + VAR11, + VAR12, + VAR13 + } + enum MyEnum2 { + VAR21, + VAR22, + VAR23 + } + + int my_int; + string my_string = "wasd"; + float my_float = 12.34; + boolean my_bool = false; + + array my_int_array; + array my_int_array_def = [1, 2, 3]; + array my_str_array_def = ["123", "456", "cdf"]; +} From 0bafe7eeefb3fdcb651aa6074bf24983a788c1b7 Mon Sep 17 00:00:00 2001 From: Sergey Prokhorov Date: Mon, 22 Jul 2019 15:59:00 +0200 Subject: [PATCH 02/13] More progress on parser Seems the only things left are some of annotations and docstrings --- src/avro_idl_lexer.xrl | 20 ++- src/avro_idl_parser.yrl | 209 ++++++++++++++++++-------- test/data/protocol_with_typedefs.avdl | 40 ++++- 3 files changed, 193 insertions(+), 76 deletions(-) diff --git a/src/avro_idl_lexer.xrl b/src/avro_idl_lexer.xrl index eb6c1ba..7e98033 100644 --- a/src/avro_idl_lexer.xrl +++ b/src/avro_idl_lexer.xrl @@ -7,12 +7,15 @@ Rules. [\s\t\n\r]+ : skip_token. +%% TODO: escaped double quotes inside strings "[^\"]+" : {token, {string_v, TokenLine, unescape(TokenChars, $\")}}. `[^\`]+` : {token, {id, TokenLine, unescape(TokenChars, $`)}}. //[^\r\n]* : {token, {comment_v, TokenLine, unescape_line_comment(TokenChars)}}. +%% `/**` is a docstring for the following object +/\*\*(.|[\r\n])*\*/ : {token, {doc_v, TokenLine, unescape_multiline_comment(TokenChars)}}. /\*(.|[\r\n])*\*/ : {token, {comment_v, TokenLine, unescape_multiline_comment(TokenChars)}}. \{ : {token, {'{', TokenLine}}. @@ -26,26 +29,29 @@ Rules. ; : {token, {';', TokenLine}}. \, : {token, {',', TokenLine}}. +%% Null can be in both values and primitive types +null : {token, {null, TokenLine}}. %% Default values (json) = : {token, {'=', TokenLine}}. -%% TODO: better float regexp +%% TODO: better float regexp; +%% XXX: is it safe to use list_to_float? seems float syntax is used for decimal defaults as well [+-]?[0-9]+\.[0-9]+ : {token, {float_v, TokenLine, list_to_float(TokenChars)}}. [+-]?[0-9]+ : {token, {integer_v, TokenLine, list_to_integer(TokenChars)}}. true|false : {token, {bool_v, TokenLine, list_to_atom(TokenChars)}}. -%% TODO: null?/:(for maps)/???... +\: : {token, {':', TokenLine}}. %% === Datatype IDs === -%% primitive; FIXME: 'null' can be used in both primitive and data! -int|long|string|boolean|float|double|bytes|null : {token, {primitive_t, TokenLine, list_to_atom(TokenChars)}}. +%% primitive +int|long|string|boolean|float|double|bytes : {token, {primitive_t, TokenLine, list_to_atom(TokenChars)}}. %% complex record|enum|array|map|fixed|union : {token, {list_to_atom(TokenChars ++ "_t"), TokenLine}}. %% Logical -decimal|date|time_ms|timestamp_ms : {token, {logical_t, TokenLine, list_to_atom(TokenChars)}}. - +date|time_ms|timestamp_ms : {token, {logical_t, TokenLine, list_to_atom(TokenChars)}}. +decimal : {token, {decimal_t, TokenLine}}. %% keywords error|throws|oneway|void|import|idl|protocol|schema : {token, {list_to_atom(TokenChars ++ "_k"), TokenLine}}. @@ -54,6 +60,8 @@ error|throws|oneway|void|import|idl|protocol|schema : {token, {list_to_atom(Toke @[a-zA-Z0-9_-]+ : {token, {annotation_v, TokenLine, unescape_annotation(TokenChars)}}. [A-Za-z_][A-Za-z_0-9]* : {token, {id, TokenLine, TokenChars}}. +%% namespaced will only be allowed in data type spec +[A-Za-z_][A-Za-z_0-9\.]+[A-Za-z_0-9] : {token, {ns_id, TokenLine, TokenChars}}. Erlang code. diff --git a/src/avro_idl_parser.yrl b/src/avro_idl_parser.yrl index f2d3070..b873c3a 100644 --- a/src/avro_idl_parser.yrl +++ b/src/avro_idl_parser.yrl @@ -1,29 +1,32 @@ %% @doc Avro IDL parser %% https://avro.apache.org/docs/current/idl.html +%% XXX: all `comment_v` tockens should be filtered-out before parsing! +%% TODO: docstrings +%% TODO: better annotations support Header "%% Hello". -Terminals id string_v comment_v float_v integer_v bool_v annotation_v - primitive_t logical_t - '{' '}' '(' ')' '[' ']' '<' '>' ';' ',' '=' +Terminals id ns_id null string_v doc_v float_v integer_v bool_v annotation_v + primitive_t logical_t decimal_t + '{' '}' '(' ')' '[' ']' '<' '>' ';' ',' '=' ':' record_t enum_t array_t map_t fixed_t union_t protocol_k error_k throws_k oneway_k void_k import_k idl_k schema_k. Nonterminals - protocol typedefs - decorator decorator_value string array_of_strings array_of_strings_tail - typedef_tail typedef + protocol + annotation annotation_value string array_of_strings array_of_strings_tail + declaration declaration_tail import import_file_type - primitive + record record_field record_tail + type error + decimal enum enum_variants - union - record + union union_tail fixed array map - error - data - array_of_data array_of_data_tail. + function fun_return fun_arguments fun_argument fun_extra + data array_of_data array_of_data_tail map_of_data map_of_data_tail. Rootsymbol protocol. @@ -32,23 +35,23 @@ protocol -> protocol_k id '{' '}' : {protocol, value_of('$2'), []}. protocol -> - protocol_k id '{' typedef typedef_tail : + protocol_k id '{' declaration declaration_tail : {protocol, value_of('$2'), ['$4' | '$5']}. protocol -> - decorator protocol : - {decorated, '$1', '$2'}. % todo: embed into protocol? + annotation protocol : + {annotated, '$1', '$2'}. -%% == Decorator == -decorator -> - annotation_v '(' decorator_value ')' : - {decorator, value_of('$1'), '$3'}. +%% == Annotation == +annotation -> + annotation_v '(' annotation_value ')' : + {annotation, value_of('$1'), '$3'}. %% Maybe can just use `data` instead of `decorator_value`? -decorator_value -> +annotation_value -> string : '$1'. -decorator_value -> +annotation_value -> array_of_strings : '$1'. @@ -71,26 +74,22 @@ array_of_strings_tail -> ['$2' | '$3']. -%% == Type definitions (inside protocol or record) == +%% == Protocol definitions == -typedef_tail -> +declaration_tail -> '}' : []. -typedef_tail -> - typedef typedef_tail : +declaration_tail -> + declaration declaration_tail : ['$1' | '$2']. -%% TODO: generalize to 'type' name (= value)(;) -typedef -> import : '$1'. -typedef -> primitive : '$1'. -typedef -> enum : '$1'. -typedef -> union : '$1'. -typedef -> record : '$1'. -typedef -> fixed : '$1'. -typedef -> array : '$1'. -typedef -> map : '$1'. -typedef -> error : '$1'. -%% typedef -> function : '$1'. % TODO +declaration -> import : '$1'. +declaration -> enum : '$1'. +declaration -> fixed : '$1'. +declaration -> error : '$1'. +declaration -> record : '$1'. +declaration -> function : '$1'. + %% -- Import def @@ -102,14 +101,6 @@ import_file_type -> idl_k : idl. import_file_type -> protocol_k : protocol. import_file_type -> schema_k : schema. -%% -- Primitive typedef -primitive -> - primitive_t id ';' : - {primitive, value_of('$2'), value_of('$1'), undefined}. -primitive -> - primitive_t id '=' data ';' : - {primitive, value_of('$2'), value_of('$1'), '$4'}. - %% -- Enum typedef enum -> enum_t id '{' id enum_variants : @@ -121,34 +112,112 @@ enum_variants -> enum_variants -> ',' id enum_variants : [value_of('$2') | '$3']. - -union -> union_t : '$1'. %TODO -record -> record_t : '$1'. %TODO - %% -- Fixed typedef fixed -> fixed_t id '(' integer_v ')' ';': - {fixed, '$2', value_of('$4'), undefined}. -fixed -> - fixed_t id '(' integer_v ')' '=' data ';' : - {fixed, '$2', value_of('$4'), '$6'}. + {fixed, value_of('$2'), value_of('$4')}. + +%% -- Error typedef +error -> + error_k id '{' record_field record_tail : + {error, value_of('$2'), ['$4' | '$5']}. + + +%% -- Record + +record -> + record_t id '{' record_field record_tail : + {record, value_of('$2'), ['$4' | '$5']}. +record -> + annotation record : + {annotated, '$1', '$2'}. + +record_tail -> + '}' : + []. +record_tail -> + record_field record_tail : + ['$1' | '$2']. + +record_field -> + type id ';' : + {field, value_of('$2'), '$1', undefined}. +record_field -> + type id '=' data ';' : + {field, value_of('$2'), '$1', '$4'}. + +type -> primitive_t : value_of('$1'). +type -> logical_t : value_of('$1'). +type -> null : null. +type -> id : {custom, value_of('$1')}. +type -> ns_id : {custom, value_of('$1')}. +type -> decimal : '$1'. +type -> union : '$1'. +type -> array : '$1'. +type -> map : '$1'. + +%% -- Decimal +decimal -> + decimal_t '(' integer_v ',' integer_v ')' : %decimal(precision, scale) + {decimal, value_of('$3'), value_of('$5')}. % + +%% -- Union +union -> + union_t '{' type union_tail : + {union, ['$3' | '$4']}. + +union_tail -> + '}' : + []. +union_tail -> + ',' type union_tail : + ['$2' | '$3']. %% -- Array typedef array -> - array_t '<' primitive_t '>' id ';' : - {array, value_of('$5'), value_of('$3'), undefined}. %FIXME: not just primitives! -array -> - array_t '<' primitive_t '>' id '=' data ';' : - {array, value_of('$5'), value_of('$3'), '$7'}. + array_t '<' primitive_t '>' : + {array, value_of('$3')}. %FIXME: not just primitives! %% -- Map typedef map -> - map_t '<' primitive_t '>' id ';' : - {map, '$5', value_of('$3'), undefined}. %FIXME: not just primitives!; defaults! + map_t '<' primitive_t '>' : + {map, value_of('$3')}. %FIXME: not just primitives! + +%% == Function (message) definitions + +function -> + fun_return id '(' fun_arguments ')' fun_extra ';' : + {function, value_of('$2'), '$4', '$6'}. + +fun_return -> type : '$1'. +fun_return -> void_k : void. + +fun_arguments -> + '$empty' : + []. +fun_arguments -> + fun_argument : + ['$1']. +fun_arguments -> + fun_argument ',' fun_arguments : + ['$1' | '$3']. + +fun_argument -> + type id : + {arg, value_of('$2'), '$1', undefined}. +fun_argument -> + type id '=' data : + {arg, value_of('$2'), '$1', '$4'}. + +fun_extra -> + '$empty' : undefined. +fun_extra -> + throws_k id : + {throws, value_of('$2')}. +fun_extra -> + oneway_k : + oneway. -%% -- Error typedef -error -> - error_k : '$1'. %TODO %% == Data (JSON) for default values data -> string_v : value_of('$1'). @@ -156,6 +225,8 @@ data -> integer_v : value_of('$1'). data -> float_v : value_of('$1'). data -> bool_v : value_of('$1'). data -> array_of_data : '$1'. +data -> null : null. +data -> map_of_data : '$1'. array_of_data -> '[' ']' : @@ -171,6 +242,20 @@ array_of_data_tail -> ',' data array_of_data_tail : ['$2' | '$3']. +map_of_data -> + '{' '}' : + #{}. +map_of_data -> + '{' string_v ':' data map_of_data_tail : + ('$5')#{value_of('$2') => '$4'}. + +map_of_data_tail -> + '}' : + #{}. +map_of_data_tail -> + ',' string_v ':' data map_of_data_tail: + ('$5')#{value_of('$2') => '$4'}. + Erlang code. value_of(Token) -> diff --git a/test/data/protocol_with_typedefs.avdl b/test/data/protocol_with_typedefs.avdl index 75ca25e..b428d2d 100644 --- a/test/data/protocol_with_typedefs.avdl +++ b/test/data/protocol_with_typedefs.avdl @@ -1,4 +1,4 @@ - +@namespace("org.erlang.www") protocol MyProto { import idl "foo.avdl"; import protocol "bar.avpr"; @@ -14,13 +14,37 @@ protocol MyProto { VAR22, VAR23 } + fixed MyFix(10); + record MyRec { + int my_int; + string my_string = "wasd"; + float my_float = 12.34; + boolean my_bool = false; + MyFix my_custom; + union {boolean, null} my_union = null; + date my_date = 123456; + decimal(5, 2) my_decimal = 1222; + + array my_int_array; + array my_int_array_def = [1, 2, 3]; + array my_str_array_def = ["123", "456", "cdf"]; + + map my_map = {"a": 1.23, "b": 45.67}; + } + @namespace("org.erlang.ftp") + record MyAnnotated { + org.erlang.www.MyError `error`; + } + error MyError { + MyEnum2 code; + string description; + } + + float mul(int arg1, float arg2 = 1.0); + + MyFix append(bytes arg1, string arg2 = "tail") throws MyError; - int my_int; - string my_string = "wasd"; - float my_float = 12.34; - boolean my_bool = false; + void gen_server_cast(map opts) oneway; - array my_int_array; - array my_int_array_def = [1, 2, 3]; - array my_str_array_def = ["123", "456", "cdf"]; + MyEnum1 ping(); } From 93ffe1d09714d879269c1ed2bf618f122c166229 Mon Sep 17 00:00:00 2001 From: Sergey Prokhorov Date: Mon, 22 Jul 2019 17:59:12 +0200 Subject: [PATCH 03/13] More strict lexical rule for namespaced ID --- src/avro_idl_lexer.xrl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/avro_idl_lexer.xrl b/src/avro_idl_lexer.xrl index 7e98033..f288882 100644 --- a/src/avro_idl_lexer.xrl +++ b/src/avro_idl_lexer.xrl @@ -59,9 +59,9 @@ error|throws|oneway|void|import|idl|protocol|schema : {token, {list_to_atom(Toke @[a-zA-Z0-9_-]+ : {token, {annotation_v, TokenLine, unescape_annotation(TokenChars)}}. -[A-Za-z_][A-Za-z_0-9]* : {token, {id, TokenLine, TokenChars}}. +[A-Za-z_][A-Za-z0-9_]* : {token, {id, TokenLine, TokenChars}}. %% namespaced will only be allowed in data type spec -[A-Za-z_][A-Za-z_0-9\.]+[A-Za-z_0-9] : {token, {ns_id, TokenLine, TokenChars}}. +[A-Za-z_][A-Za-z0-9_]+(\.[A-Za-z_][A-Za-z0-9_]+)+ : {token, {ns_id, TokenLine, TokenChars}}. Erlang code. From afd1aa82248ecebece38df5f33841da64b654bcc Mon Sep 17 00:00:00 2001 From: Sergey Prokhorov Date: Tue, 23 Jul 2019 11:29:32 +0200 Subject: [PATCH 04/13] Use `string:strip` to be compatible with OTP19 --- src/avro_idl_lexer.xrl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/avro_idl_lexer.xrl b/src/avro_idl_lexer.xrl index f288882..6c54997 100644 --- a/src/avro_idl_lexer.xrl +++ b/src/avro_idl_lexer.xrl @@ -67,7 +67,7 @@ Erlang code. unescape(Token, Char) -> - string:trim(Token, both, [Char]). + string:strip(Token, both, Char). unescape_line_comment("//" ++ Comment) -> Comment. From e23b925cdfd78b6fa4ebdec14d2f8b8dd6f59088 Mon Sep 17 00:00:00 2001 From: Sergey Prokhorov Date: Wed, 24 Jul 2019 16:08:49 +0200 Subject: [PATCH 05/13] Don't ignore function return type --- src/avro_idl_parser.yrl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/avro_idl_parser.yrl b/src/avro_idl_parser.yrl index b873c3a..0ff9afa 100644 --- a/src/avro_idl_parser.yrl +++ b/src/avro_idl_parser.yrl @@ -187,7 +187,7 @@ map -> function -> fun_return id '(' fun_arguments ')' fun_extra ';' : - {function, value_of('$2'), '$4', '$6'}. + {function, value_of('$2'), '$4', '$1', '$6'}. fun_return -> type : '$1'. fun_return -> void_k : void. From c74d28cf69a67d19e5542ba6083cd1684c44fddf Mon Sep 17 00:00:00 2001 From: Sergey Prokhorov Date: Sun, 8 Mar 2020 03:44:10 +0100 Subject: [PATCH 06/13] Full annotations support; parser tests added --- src/avro_idl_parser.yrl | 65 ++++++++++++------ src/idl.hrl | 41 ++++++++++++ test/avro_idl_parse_tests.erl | 121 ++++++++++++++++++++++++++++++++++ test/data/annotations.avdl | 23 +++++++ test/data/empty_protocol.avdl | 7 +- 5 files changed, 232 insertions(+), 25 deletions(-) create mode 100644 src/idl.hrl create mode 100644 test/avro_idl_parse_tests.erl create mode 100644 test/data/annotations.avdl diff --git a/src/avro_idl_parser.yrl b/src/avro_idl_parser.yrl index 0ff9afa..75e48b1 100644 --- a/src/avro_idl_parser.yrl +++ b/src/avro_idl_parser.yrl @@ -14,10 +14,10 @@ Terminals id ns_id null string_v doc_v float_v integer_v bool_v annotation_v Nonterminals protocol - annotation annotation_value string array_of_strings array_of_strings_tail + annotations annotation annotation_value string array_of_strings array_of_strings_tail declaration declaration_tail import import_file_type - record record_field record_tail + record record_field record_field_name record_tail type error decimal enum enum_variants @@ -33,19 +33,26 @@ Rootsymbol protocol. protocol -> protocol_k id '{' '}' : - {protocol, value_of('$2'), []}. + #protocol{name = value_of('$2')}. protocol -> protocol_k id '{' declaration declaration_tail : - {protocol, value_of('$2'), ['$4' | '$5']}. + #protocol{name = value_of('$2'), definitions = ['$4' | '$5']}. protocol -> - annotation protocol : - {annotated, '$1', '$2'}. + annotations protocol : + ('$2')#protocol{annotations = '$1'}. %% == Annotation == +annotations -> + annotation : + ['$1']. +annotations -> + annotation annotations : + ['$1' | '$2']. + annotation -> annotation_v '(' annotation_value ')' : - {annotation, value_of('$1'), '$3'}. + #annotation{name = value_of('$1'), value = '$3'}. %% Maybe can just use `data` instead of `decorator_value`? annotation_value -> @@ -104,7 +111,10 @@ import_file_type -> schema_k : schema. %% -- Enum typedef enum -> enum_t id '{' id enum_variants : - {enum, value_of('$2'), [value_of('$4') | '$5']}. + #enum{name = value_of('$2'), variants = [value_of('$4') | '$5']}. +enum -> + annotations enum : + ('$2')#enum{annotations = '$1'}. enum_variants -> '}' : @@ -115,22 +125,27 @@ enum_variants -> %% -- Fixed typedef fixed -> fixed_t id '(' integer_v ')' ';': - {fixed, value_of('$2'), value_of('$4')}. + #fixed{name = value_of('$2'), size = value_of('$4')}. +fixed -> + annotations fixed : + ('$2')#fixed{annotations = '$1'}. %% -- Error typedef error -> error_k id '{' record_field record_tail : - {error, value_of('$2'), ['$4' | '$5']}. - + #error{name = value_of('$2'), fields = ['$4' | '$5']}. +error -> + annotations error : + ('$2')#error{annotations = '$1'}. %% -- Record record -> record_t id '{' record_field record_tail : - {record, value_of('$2'), ['$4' | '$5']}. + #record{name = value_of('$2'), fields = ['$4' | '$5']}. record -> - annotation record : - {annotated, '$1', '$2'}. + annotations record : + ('$2')#record{annotations = '$1'}. record_tail -> '}' : @@ -140,11 +155,22 @@ record_tail -> ['$1' | '$2']. record_field -> - type id ';' : - {field, value_of('$2'), '$1', undefined}. + type record_field_name ';' : + #field{name = element(1, '$2'), annotations = element(2, '$2'), type = '$1'}. record_field -> - type id '=' data ';' : - {field, value_of('$2'), '$1', '$4'}. + type record_field_name '=' data ';' : + #field{name = element(1, '$2'), annotations = element(2, '$2'), + type = '$1', default = '$4'}. +record_field -> + annotations record_field : + ('$2')#field{annotations = '$1' ++ ('$2')#field.annotations}. + +record_field_name -> + id : + {value_of('$1'), []}. +record_field_name -> + annotations id : + {value_of('$2'), '$1'}. type -> primitive_t : value_of('$1'). type -> logical_t : value_of('$1'). @@ -187,7 +213,7 @@ map -> function -> fun_return id '(' fun_arguments ')' fun_extra ';' : - {function, value_of('$2'), '$4', '$1', '$6'}. + #function{name = value_of('$2'), arguments = '$4', return = '$1', extra = '$6'}. fun_return -> type : '$1'. fun_return -> void_k : void. @@ -257,6 +283,7 @@ map_of_data_tail -> ('$5')#{value_of('$2') => '$4'}. Erlang code. +-include("idl.hrl"). value_of(Token) -> try element(3, Token) diff --git a/src/idl.hrl b/src/idl.hrl new file mode 100644 index 0000000..cd37ed0 --- /dev/null +++ b/src/idl.hrl @@ -0,0 +1,41 @@ +-record(protocol, + {name, + annotations = [], + definitions = []}). + +-record(annotation, + {name, + value}). + +-record(enum, + {name, + annotations = [], + variants = []}). + +-record(fixed, + {name, + annotations = [], + size}). + +-record(error, + {name, + annotations = [], + fields = []}). + +-record(record, + {name, + annotations = [], + fields = []}). + +-record(field, + {name, + annotations = [], + type, + default}). + +-record(function, + {name, + %% annotations = [], + arguments = [], + return, + extra}). diff --git a/test/avro_idl_parse_tests.erl b/test/avro_idl_parse_tests.erl new file mode 100644 index 0000000..998751c --- /dev/null +++ b/test/avro_idl_parse_tests.erl @@ -0,0 +1,121 @@ +%% coding: latin-1 +%%%------------------------------------------------------------------- +%%% Copyright (c) 2013-2018 Klarna AB +%%% +%%% This file is provided to you under the Apache License, +%%% Version 2.0 (the "License"); you may not use this file +%%% except in compliance with the License. You may obtain +%%% a copy of the License at +%%% +%%% http://www.apache.org/licenses/LICENSE-2.0 +%%% +%%% Unless required by applicable law or agreed to in writing, +%%% software distributed under the License is distributed on an +%%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%%% KIND, either express or implied. See the License for the +%%% specific language governing permissions and limitations +%%% under the License. +%%% +%%%------------------------------------------------------------------- +-module(avro_idl_parse_tests). + +-include("../src/idl.hrl"). +-include_lib("eunit/include/eunit.hrl"). + +parse_empty_protocol_test() -> + ?assertEqual( + #protocol{name = "MyProto"}, + parse_idl("empty_protocol")). + +parse_annotations_test() -> + ?assertEqual( + #protocol{ + name = "MyProto", + annotations = + [#annotation{name = "version", + value = "1.0"}, + #annotation{name = "aliases", + value = ["ns.Proto1", "ns.Proto2"]} + ], + definitions = + [#enum{name = "MyEnum", + annotations = + [#annotation{name = "namespace", + value = "enums"}], + variants = ["A", "B", "C"]}, + #fixed{name = "MyFixed", + annotations = + [#annotation{name = "namespace", + value = "fixeds"}], + size = 16}, + #error{name = "MyError", + annotations = + [#annotation{name = "namespace", + value = "errors"}], + fields = + [#field{name = "my_err_field", + annotations = + [#annotation{name = "order", + value = "ignore"}], + type = string}]}, + #record{name = "MyRecord", + annotations = + [#annotation{name = "namespace", + value = "records"}], + fields = + [#field{name = "my_record_field", + annotations = + [#annotation{name = "order", + value = "ignore"}, + #annotation{name = "aliases", + value = ["my_alias"]}], + type = string}]}] + }, + parse_idl("annotations")). + +full_protocol_test() -> + ?assertMatch( + #protocol{name = "Simple", + definitions = + [#enum{name = "Kind"}, + #fixed{name = "MD5"}, + #record{name = "TestRecord"}, + #error{name = "TestError"}, + #function{name = "hello"}, + #function{name = "echo"}, + #function{name = "add"}, + #function{name = "echoBytes"}, + #function{name = "error"}, + #function{name = "ping"}]}, + parse_idl("full_protocol")). + +protocol_with_typedeffs_test() -> + ?assertMatch( + #protocol{name = "MyProto", + definitions = + [{import, idl, "foo.avdl"}, + {import, protocol, "bar.avpr"}, + {import, schema, "baz.avsc"}, + #enum{name = "MyEnum1"}, + #enum{name = "MyEnum2"}, + #fixed{name = "MyFix"}, + #record{name = "MyRec"}, + #record{name = "MyAnnotated"}, + #error{name = "MyError"}, + #function{name = "mul"}, + #function{name = "append"}, + #function{name = "gen_server_cast"}, + #function{name = "ping"}]}, + parse_idl("protocol_with_typedefs")). + +parse_idl(Name) -> + File = "test/data/" ++ Name ++ ".avdl", + {ok, B} = file:read_file(File), + {ok, T, _} = avro_idl_lexer:string(binary_to_list(B)), + NoComments = lists:filter( + fun({doc_v, _, _}) -> false; + ({comment_v, _, _}) -> false; + (_) -> true + end, T), + {ok, Tree} = avro_idl_parser:parse(NoComments), + Tree. diff --git a/test/data/annotations.avdl b/test/data/annotations.avdl new file mode 100644 index 0000000..a94a705 --- /dev/null +++ b/test/data/annotations.avdl @@ -0,0 +1,23 @@ +@version("1.0") +@aliases(["ns.Proto1", "ns.Proto2"]) +protocol MyProto { + @namespace("enums") + enum MyEnum { + A, B, C + } + + @namespace("fixeds") + fixed MyFixed(16); + + @namespace("errors") + error MyError { + @order("ignore") + string my_err_field; + } + + @namespace("records") + record MyRecord { + @order("ignore") + string @aliases(["my_alias"]) my_record_field; + } +} diff --git a/test/data/empty_protocol.avdl b/test/data/empty_protocol.avdl index 20082c9..8c2e382 100644 --- a/test/data/empty_protocol.avdl +++ b/test/data/empty_protocol.avdl @@ -1,6 +1 @@ -@deco1("wasd") -@deco2(["abc", "def"]) -@deco3([]) -protocol MyProto { - -} +protocol MyProto {} From 788f725463d93738c8f6dab430a9e16582490f14 Mon Sep 17 00:00:00 2001 From: Sergey Prokhorov Date: Tue, 10 Mar 2020 03:21:37 +0100 Subject: [PATCH 07/13] Add support for docstrings; rename `annotations` field to `meta` --- rebar.config | 4 ++- src/avro_idl_lexer.xrl | 38 ++++++++++++++++++++--- src/avro_idl_parser.yrl | 57 ++++++++++++++++++++--------------- src/idl.hrl | 14 ++++----- test/avro_idl_parse_tests.erl | 52 +++++++++++++++++++------------- test/data/annotations.avdl | 15 ++++++++- 6 files changed, 121 insertions(+), 59 deletions(-) diff --git a/rebar.config b/rebar.config index 9f49d6c..c5a0be6 100644 --- a/rebar.config +++ b/rebar.config @@ -3,7 +3,7 @@ % , warnings_as_errors , {d,'NOTEST'} ]}. -{eunit_opts, [verbose]}. +%% {eunit_opts, [verbose]}. {xref_checks, [ undefined_function_calls , deprecated_function_calls ]}. @@ -15,3 +15,5 @@ {cover_opts, [verbose]}. {cover_enabled, true}. {cover_export_enabled, true}. + +%% {yrl_opts, [{verbose, true}]}. diff --git a/src/avro_idl_lexer.xrl b/src/avro_idl_lexer.xrl index 6c54997..274a1ee 100644 --- a/src/avro_idl_lexer.xrl +++ b/src/avro_idl_lexer.xrl @@ -14,10 +14,6 @@ Rules. //[^\r\n]* : {token, {comment_v, TokenLine, unescape_line_comment(TokenChars)}}. -%% `/**` is a docstring for the following object -/\*\*(.|[\r\n])*\*/ : {token, {doc_v, TokenLine, unescape_multiline_comment(TokenChars)}}. -/\*(.|[\r\n])*\*/ : {token, {comment_v, TokenLine, unescape_multiline_comment(TokenChars)}}. - \{ : {token, {'{', TokenLine}}. \} : {token, {'}', TokenLine}}. \( : {token, {'(', TokenLine}}. @@ -63,9 +59,41 @@ error|throws|oneway|void|import|idl|protocol|schema : {token, {list_to_atom(Toke %% namespaced will only be allowed in data type spec [A-Za-z_][A-Za-z0-9_]+(\.[A-Za-z_][A-Za-z0-9_]+)+ : {token, {ns_id, TokenLine, TokenChars}}. -Erlang code. +%% https://blog.ostermiller.org/finding-comments-in-source-code-using-regular-expressions/ +%% `/** .. */` is a docstring for the following object +(/\*\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/) : {token, {doc_v, TokenLine, unescape_multiline_comment(TokenChars)}}. +%% `/* .. */` is just a comment +(/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/) : {token, {comment_v, TokenLine, unescape_multiline_comment(TokenChars)}}. +Erlang code. +-export([preprocess/2]). + +%% Api helpers + +-spec preprocess(Tokens, [drop_comments | trim_doc]) -> Tokens when + Tokens :: [tuple()]. +preprocess(Tokens, Actions) -> + lists:foldl(fun do_preprocess/2, Tokens, Actions). + +do_preprocess(drop_comments, T) -> + lists:filter( + fun({comment_v, _, _}) -> false; + (_) -> true + end, T); +do_preprocess(trim_doc, T) -> + lists:map( + fun({doc_v, Loc, Val}) -> + {doc_v, Loc, trim_doc(Val)}; + (Tok) -> Tok + end, T). + +trim_doc(Doc) -> + re:replace(Doc, "^[\\s\\*]*((?U).*)[\\s]*$", "\\1", + [global, multiline, {return, list}]). + +%% Lexer internal helpers + unescape(Token, Char) -> string:strip(Token, both, Char). diff --git a/src/avro_idl_parser.yrl b/src/avro_idl_parser.yrl index 75e48b1..1e9e38f 100644 --- a/src/avro_idl_parser.yrl +++ b/src/avro_idl_parser.yrl @@ -1,8 +1,6 @@ %% @doc Avro IDL parser -%% https://avro.apache.org/docs/current/idl.html -%% XXX: all `comment_v` tockens should be filtered-out before parsing! -%% TODO: docstrings -%% TODO: better annotations support +%% https://avro.apache.org/docs/1.9.2/idl.html +%% XXX: all `comment_v` tokens should be filtered-out before parsing! Header "%% Hello". @@ -14,7 +12,7 @@ Terminals id ns_id null string_v doc_v float_v integer_v bool_v annotation_v Nonterminals protocol - annotations annotation annotation_value string array_of_strings array_of_strings_tail + meta meta_item annotation annotation_value string array_of_strings array_of_strings_tail declaration declaration_tail import import_file_type record record_field record_field_name record_tail @@ -30,6 +28,10 @@ Nonterminals Rootsymbol protocol. +%% There are 2 shift/reduce conflicts expected due to ambiguity in +%% meta / meta_item that is automatically correctly resolved with shift. +%% See https://www.gnu.org/software/bison/manual/html_node/Shift_002fReduce.html +Expect 2. protocol -> protocol_k id '{' '}' : @@ -38,18 +40,25 @@ protocol -> protocol_k id '{' declaration declaration_tail : #protocol{name = value_of('$2'), definitions = ['$4' | '$5']}. protocol -> - annotations protocol : - ('$2')#protocol{annotations = '$1'}. + meta protocol : + ('$2')#protocol{meta = '$1'}. %% == Annotation == -annotations -> - annotation : +meta -> + meta_item : ['$1']. -annotations -> - annotation annotations : +meta -> + meta_item meta : ['$1' | '$2']. +meta_item -> + annotation : + '$1'. +meta_item -> + doc_v : + {doc, value_of('$1')}. + annotation -> annotation_v '(' annotation_value ')' : #annotation{name = value_of('$1'), value = '$3'}. @@ -113,8 +122,8 @@ enum -> enum_t id '{' id enum_variants : #enum{name = value_of('$2'), variants = [value_of('$4') | '$5']}. enum -> - annotations enum : - ('$2')#enum{annotations = '$1'}. + meta enum : + ('$2')#enum{meta = '$1'}. enum_variants -> '}' : @@ -127,16 +136,16 @@ fixed -> fixed_t id '(' integer_v ')' ';': #fixed{name = value_of('$2'), size = value_of('$4')}. fixed -> - annotations fixed : - ('$2')#fixed{annotations = '$1'}. + meta fixed : + ('$2')#fixed{meta = '$1'}. %% -- Error typedef error -> error_k id '{' record_field record_tail : #error{name = value_of('$2'), fields = ['$4' | '$5']}. error -> - annotations error : - ('$2')#error{annotations = '$1'}. + meta error : + ('$2')#error{meta = '$1'}. %% -- Record @@ -144,8 +153,8 @@ record -> record_t id '{' record_field record_tail : #record{name = value_of('$2'), fields = ['$4' | '$5']}. record -> - annotations record : - ('$2')#record{annotations = '$1'}. + meta record : + ('$2')#record{meta = '$1'}. record_tail -> '}' : @@ -156,20 +165,20 @@ record_tail -> record_field -> type record_field_name ';' : - #field{name = element(1, '$2'), annotations = element(2, '$2'), type = '$1'}. + #field{name = element(1, '$2'), meta = element(2, '$2'), type = '$1'}. record_field -> type record_field_name '=' data ';' : - #field{name = element(1, '$2'), annotations = element(2, '$2'), + #field{name = element(1, '$2'), meta = element(2, '$2'), type = '$1', default = '$4'}. record_field -> - annotations record_field : - ('$2')#field{annotations = '$1' ++ ('$2')#field.annotations}. + meta record_field : + ('$2')#field{meta = '$1' ++ ('$2')#field.meta}. record_field_name -> id : {value_of('$1'), []}. record_field_name -> - annotations id : + meta id : {value_of('$2'), '$1'}. type -> primitive_t : value_of('$1'). diff --git a/src/idl.hrl b/src/idl.hrl index cd37ed0..7b3a9e6 100644 --- a/src/idl.hrl +++ b/src/idl.hrl @@ -1,6 +1,6 @@ -record(protocol, {name, - annotations = [], + meta = [], definitions = []}). -record(annotation, @@ -9,33 +9,33 @@ -record(enum, {name, - annotations = [], + meta = [], variants = []}). -record(fixed, {name, - annotations = [], + meta = [], size}). -record(error, {name, - annotations = [], + meta = [], fields = []}). -record(record, {name, - annotations = [], + meta = [], fields = []}). -record(field, {name, - annotations = [], + meta = [], type, default}). -record(function, {name, - %% annotations = [], + %% meta = [], arguments = [], return, extra}). diff --git a/test/avro_idl_parse_tests.erl b/test/avro_idl_parse_tests.erl index 998751c..cbca2df 100644 --- a/test/avro_idl_parse_tests.erl +++ b/test/avro_idl_parse_tests.erl @@ -31,42 +31,52 @@ parse_annotations_test() -> ?assertEqual( #protocol{ name = "MyProto", - annotations = - [#annotation{name = "version", + meta = + [{doc, "My protocol"}, + {doc, "No, really\nIt's some multiline doc\n" + "bullet points will be stripped\nso no unordered lists"}, + #annotation{name = "version", value = "1.0"}, #annotation{name = "aliases", value = ["ns.Proto1", "ns.Proto2"]} ], definitions = [#enum{name = "MyEnum", - annotations = - [#annotation{name = "namespace", + meta = + [{doc, "My enum"}, + #annotation{name = "namespace", value = "enums"}], variants = ["A", "B", "C"]}, #fixed{name = "MyFixed", - annotations = - [#annotation{name = "namespace", + meta = + [{doc, "My Fixed"}, + #annotation{name = "namespace", value = "fixeds"}], size = 16}, #error{name = "MyError", - annotations = - [#annotation{name = "namespace", + meta = + [{doc, "My Error"}, + #annotation{name = "namespace", value = "errors"}], fields = [#field{name = "my_err_field", - annotations = - [#annotation{name = "order", + meta = + [{doc, "My Err Field"}, + #annotation{name = "order", value = "ignore"}], type = string}]}, #record{name = "MyRecord", - annotations = - [#annotation{name = "namespace", + meta = + [{doc, "My Record"}, + #annotation{name = "namespace", value = "records"}], fields = [#field{name = "my_record_field", - annotations = - [#annotation{name = "order", + meta = + [{doc, "My Rec Field Type"}, + #annotation{name = "order", value = "ignore"}, + {doc, "My Rec Field"}, #annotation{name = "aliases", value = ["my_alias"]}], type = string}]}] @@ -76,6 +86,9 @@ parse_annotations_test() -> full_protocol_test() -> ?assertMatch( #protocol{name = "Simple", + meta = + [{doc, "An example protocol in Avro IDL"}, + #annotation{}], definitions = [#enum{name = "Kind"}, #fixed{name = "MD5"}, @@ -111,11 +124,8 @@ protocol_with_typedeffs_test() -> parse_idl(Name) -> File = "test/data/" ++ Name ++ ".avdl", {ok, B} = file:read_file(File), - {ok, T, _} = avro_idl_lexer:string(binary_to_list(B)), - NoComments = lists:filter( - fun({doc_v, _, _}) -> false; - ({comment_v, _, _}) -> false; - (_) -> true - end, T), - {ok, Tree} = avro_idl_parser:parse(NoComments), + {ok, T0, _} = avro_idl_lexer:string(binary_to_list(B)), + %% ?debugFmt("Name: ~p~nTokens:~n~p", [Name, T0]), + T = avro_idl_lexer:preprocess(T0, [drop_comments, trim_doc]), + {ok, Tree} = avro_idl_parser:parse(T), Tree. diff --git a/test/data/annotations.avdl b/test/data/annotations.avdl index a94a705..09c6db6 100644 --- a/test/data/annotations.avdl +++ b/test/data/annotations.avdl @@ -1,23 +1,36 @@ +/** My protocol */ +/** No, really + * It's some multiline doc + * + * * bullet points will be stripped + * * so no unordered lists + */ @version("1.0") @aliases(["ns.Proto1", "ns.Proto2"]) protocol MyProto { + /** My enum */ @namespace("enums") enum MyEnum { A, B, C } + /** My Fixed */ @namespace("fixeds") fixed MyFixed(16); + /** My Error */ @namespace("errors") error MyError { + /** My Err Field */ @order("ignore") string my_err_field; } + /** My Record */ @namespace("records") record MyRecord { + /** My Rec Field Type */ @order("ignore") - string @aliases(["my_alias"]) my_record_field; + string /** My Rec Field */@aliases(["my_alias"]) my_record_field; } } From e8c8739275b94ed08632f171f2d9c49f7c446a5f Mon Sep 17 00:00:00 2001 From: Sergey Prokhorov Date: Tue, 10 Mar 2020 19:13:37 +0100 Subject: [PATCH 08/13] IDL to avpr/avsc converter; support multiple `throw` types --- rebar.config | 2 +- src/avro_idl.erl | 173 ++++++++++++++++++++++++++ src/avro_idl_parser.yrl | 17 ++- src/idl.hrl | 6 +- test/avro_idl_parse_tests.erl | 77 +++++++----- test/avro_idl_tests.erl | 125 +++++++++++++++++++ test/data/annotations.avdl | 3 + test/data/protocol_with_typedefs.avdl | 4 +- 8 files changed, 369 insertions(+), 38 deletions(-) create mode 100644 src/avro_idl.erl create mode 100644 test/avro_idl_tests.erl diff --git a/rebar.config b/rebar.config index c5a0be6..b792ee1 100644 --- a/rebar.config +++ b/rebar.config @@ -16,4 +16,4 @@ {cover_enabled, true}. {cover_export_enabled, true}. -%% {yrl_opts, [{verbose, true}]}. +{yrl_opts, [{verbose, true}]}. diff --git a/src/avro_idl.erl b/src/avro_idl.erl new file mode 100644 index 0000000..122b3eb --- /dev/null +++ b/src/avro_idl.erl @@ -0,0 +1,173 @@ +-module(avro_idl). + +-export([new_context/1, + str_to_avpr/2, + protocol_to_avpr/2, + typedecl_to_avsc/2]). +-include("idl.hrl"). + +-record(st, {cwd}). + +new_context(Cwd) -> + #st{cwd = Cwd}. + +str_to_avpr(String, Cwd) -> + str_to_avpr(String, Cwd, [drop_comments, trim_doc]). + +str_to_avpr(String, Cwd, Opts) -> + {ok, T0, _} = avro_idl_lexer:string(String), + T = avro_idl_lexer:preprocess(T0, Opts), + {ok, Tree} = avro_idl_parser:parse(T), + protocol_to_avpr(Tree, new_context(Cwd)). + +protocol_to_avpr(#protocol{name = Name, + meta = Meta, + definitions = Defs0}, St) -> + Defs = process_imports(Defs0, St), + {Types, Messages} = + lists:partition(fun(#function{}) -> false; + (_) -> true + end, Defs), + Protocol0 = + #{protocol => Name, + types => + lists:map( + fun(Type) -> + typedecl_to_avsc(Type, St) + end, Types), + messages => + lists:map( + fun(Message) -> + message_to_avsc(Message, St) + end, Messages) + }, + meta(Protocol0, Meta). + +process_imports(Defs, _St) -> + %% TODO + lists:filter(fun({import, _, _}) -> false; + (_) -> true + end, Defs). + +typedecl_to_avsc(#enum{name = Name, meta = Meta, variants = Vars}, _St) -> + meta( + #{type => enum, + name => Name, + variants => Vars + }, + Meta); +typedecl_to_avsc(#fixed{name = Name, meta = Meta, size = Size}, _St) -> + meta( + #{type => fixed, + name => Name, + size => Size}, + Meta); +typedecl_to_avsc(#error{name = Name, meta = Meta, fields = Fields}, St) -> + meta( + #{type => error, + name => Name, + fields => [field_to_avsc(Field, St) || Field <- Fields]}, + Meta); +typedecl_to_avsc(#record{name = Name, meta = Meta, fields = Fields}, St) -> + meta( + #{type => record, + name => Name, + fields => [field_to_avsc(Field, St) || Field <- Fields]}, + Meta). + +field_to_avsc(#field{name = Name, meta = Meta, + type = Type, default = Default}, St) -> + meta( + default( + #{name => Name, + type => type_to_avsc(Type, St)}, + Default), % TODO: maybe validate default matches type + Meta). + +message_to_avsc(#function{name = Name, meta = Meta, + arguments = Args, return = Return, + extra = Extra}, St) -> + %% TODO: arguments can just reuse `#field{}` + ArgsSchema = + [default( + #{name => ArgName, + type => type_to_avsc(Type, St)}, + Default) + || {arg, ArgName, Type, Default} <- Args], + Schema0 = + #{name => Name, + request => ArgsSchema, + response => type_to_avsc(Return, St)}, + Schema1 = case Extra of + undefined -> Schema0; + oneway -> + Schema0#{'one-way' => true}; + {throws, ThrowsTypes} -> + %% Throws = [type_to_avsc(TType, St) + %% || TType <- ThrowsTypes], + Schema0#{error => ThrowsTypes} + end, + meta(Schema1, Meta). + + +type_to_avsc(void, _St) -> + null; +type_to_avsc(null, _St) -> + null; +type_to_avsc(T, _St) when T == int; + T == long; + T == string; + T == boolean; + T == float; + T == double; + T == bytes -> + T; +type_to_avsc({decimal, Precision, Scale}, _St) -> + #{type => bytes, + 'logicalType' => "decimal", + precision => Precision, + scale => Scale}; +type_to_avsc(date, _St) -> + #{type => int, + 'logicalType' => "date"}; +type_to_avsc(time_ms, _St) -> + #{type => int, + 'logicalType' => "time-millis"}; +type_to_avsc(timestamp_ms, _St) -> + #{type => long, + 'logicalType' => "timestamp-millis"}; +type_to_avsc({custom, Id}, _St) -> + Id; +type_to_avsc({union, Types}, St) -> + [type_to_avsc(Type, St) || Type <- Types]; +type_to_avsc({array, Of}, St) -> + #{type => array, + items => type_to_avsc(Of, St)}; +type_to_avsc({map, ValType}, St) -> + #{type => map, + values => type_to_avsc(ValType, St)}. + +meta(Schema, Meta) -> + {Docs, Annotations} = + lists:partition( + fun({doc, _}) -> true; + (#annotation{}) -> false + end, Meta), + Schema1 = case Docs of + [] -> Schema; + _ -> + DocStrings = [S || {doc, S} <- Docs], + Schema#{"doc" => lists:flatten(lists:join( + "\n", DocStrings))} + end, + lists:foldl( + fun(#annotation{name = Name, value = Value}, Schema2) -> + maps:is_key(Name, Schema2) andalso + error({duplicate_annotation, Name, Value, Schema2}), + Schema2#{Name => Value} + end, Schema1, Annotations). + +default(Obj, undefined) -> + Obj; +default(Obj, Default) -> + Obj#{default => Default}. diff --git a/src/avro_idl_parser.yrl b/src/avro_idl_parser.yrl index 1e9e38f..d2c33b0 100644 --- a/src/avro_idl_parser.yrl +++ b/src/avro_idl_parser.yrl @@ -23,7 +23,7 @@ Nonterminals fixed array map - function fun_return fun_arguments fun_argument fun_extra + function fun_return fun_arguments fun_argument fun_extra throws data array_of_data array_of_data_tail map_of_data map_of_data_tail. Rootsymbol protocol. @@ -111,7 +111,7 @@ declaration -> function : '$1'. import -> import_k import_file_type string_v ';' : - {import, '$2', value_of('$3')}. + #import{type = '$2', file_path = value_of('$3')}. import_file_type -> idl_k : idl. import_file_type -> protocol_k : protocol. @@ -223,6 +223,9 @@ map -> function -> fun_return id '(' fun_arguments ')' fun_extra ';' : #function{name = value_of('$2'), arguments = '$4', return = '$1', extra = '$6'}. +function -> + doc_v function : + ('$2')#function{meta = [{doc, value_of('$1')}]}. fun_return -> type : '$1'. fun_return -> void_k : void. @@ -247,12 +250,18 @@ fun_argument -> fun_extra -> '$empty' : undefined. fun_extra -> - throws_k id : - {throws, value_of('$2')}. + throws_k id throws : + {throws, [value_of('$2') | '$3']}. fun_extra -> oneway_k : oneway. +throws -> + '$empty' : + []. +throws -> + ',' id throws: + [value_of('$2') | '$3']. %% == Data (JSON) for default values data -> string_v : value_of('$1'). diff --git a/src/idl.hrl b/src/idl.hrl index 7b3a9e6..e5a9b8f 100644 --- a/src/idl.hrl +++ b/src/idl.hrl @@ -7,6 +7,10 @@ {name, value}). +-record(import, + {type, + file_path}). + -record(enum, {name, meta = [], @@ -35,7 +39,7 @@ -record(function, {name, - %% meta = [], + meta = [], arguments = [], return, extra}). diff --git a/test/avro_idl_parse_tests.erl b/test/avro_idl_parse_tests.erl index cbca2df..09128fb 100644 --- a/test/avro_idl_parse_tests.erl +++ b/test/avro_idl_parse_tests.erl @@ -1,22 +1,4 @@ -%% coding: latin-1 -%%%------------------------------------------------------------------- -%%% Copyright (c) 2013-2018 Klarna AB -%%% -%%% This file is provided to you under the Apache License, -%%% Version 2.0 (the "License"); you may not use this file -%%% except in compliance with the License. You may obtain -%%% a copy of the License at -%%% -%%% http://www.apache.org/licenses/LICENSE-2.0 -%%% -%%% Unless required by applicable law or agreed to in writing, -%%% software distributed under the License is distributed on an -%%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -%%% KIND, either express or implied. See the License for the -%%% specific language governing permissions and limitations -%%% under the License. -%%% -%%%------------------------------------------------------------------- +%% @doc Tests for IDL lexer + parser -module(avro_idl_parse_tests). -include("../src/idl.hrl"). @@ -79,7 +61,12 @@ parse_annotations_test() -> {doc, "My Rec Field"}, #annotation{name = "aliases", value = ["my_alias"]}], - type = string}]}] + type = string}]}, + #function{name = "hello", + meta = [{doc, "My Fun"}], + arguments = [], + return = string, + extra = undefined}] }, parse_idl("annotations")). @@ -102,23 +89,53 @@ full_protocol_test() -> #function{name = "ping"}]}, parse_idl("full_protocol")). -protocol_with_typedeffs_test() -> +protocol_with_typedefs_test() -> ?assertMatch( #protocol{name = "MyProto", definitions = - [{import, idl, "foo.avdl"}, - {import, protocol, "bar.avpr"}, - {import, schema, "baz.avsc"}, + [#import{type = idl, file_path = "foo.avdl"}, + #import{type = protocol, file_path = "bar.avpr"}, + #import{type = schema, file_path = "baz.avsc"}, #enum{name = "MyEnum1"}, #enum{name = "MyEnum2"}, #fixed{name = "MyFix"}, - #record{name = "MyRec"}, - #record{name = "MyAnnotated"}, + #record{name = "MyRec", + fields = + [#field{name = "my_int", type = int}, + #field{name = "my_string", type = string}, + #field{name = "my_float", type = float}, + #field{name = "my_bool", type = boolean, + default = false}, + #field{name = "my_custom", + type = {custom, "MyFix"}}, + #field{name = "my_union", + type = {union, [boolean, null]}, + default = null}, + #field{name = "my_date", + type = date}, + #field{name = "my_decimal", + type = {decimal, 5, 2}}, + #field{name = "my_int_array", + type = {array, int}}, + #field{}, + #field{}, + #field{name = "my_map", + type = {map, float}} + ]}, + #record{name = "MyAnnotated", + fields = + [#field{ + name = "error", + type = {custom, + "org.erlang.www.MyError"}} + ]}, #error{name = "MyError"}, - #function{name = "mul"}, - #function{name = "append"}, - #function{name = "gen_server_cast"}, - #function{name = "ping"}]}, + #function{name = "div", + extra = {throws, ["DivisionByZero"]}}, + #function{name = "append", + extra = {throws, ["MyError", "TheirError"]}}, + #function{name = "gen_server_cast", extra = oneway}, + #function{name = "ping", extra = undefined}]}, parse_idl("protocol_with_typedefs")). parse_idl(Name) -> diff --git a/test/avro_idl_tests.erl b/test/avro_idl_tests.erl new file mode 100644 index 0000000..f90475e --- /dev/null +++ b/test/avro_idl_tests.erl @@ -0,0 +1,125 @@ +%% @doc Tests for IDL converter / loader +-module(avro_idl_tests). + +-include("../src/idl.hrl"). +-include_lib("eunit/include/eunit.hrl"). + + +empty_protocol_avpr_test() -> + ?assertEqual( + #{protocol => "MyProto", + types => [], + messages => []}, + idl_to_avpr("empty_protocol")). + + +annotations_avpr_test() -> + ?assertEqual( + #{"doc" => ("My protocol\nNo, really\nIt's some multiline doc\n" + "bullet points will be stripped\nso no unordered lists"), + "version" => "1.0", + "aliases" => ["ns.Proto1", "ns.Proto2"], + protocol => "MyProto", + types => + [#{"doc" => "My enum", + "namespace" => "enums", + type => enum, + name => "MyEnum", + variants => ["A", "B", "C"]}, + #{"doc" => "My Fixed", + "namespace" => "fixeds", + type => fixed, + name => "MyFixed", + size => 16}, + #{"doc" => "My Error", + "namespace" => "errors", + type => error, + name => "MyError", + fields => + [#{"doc" => "My Err Field", + "order" => "ignore", + type => string, + name => "my_err_field"}]}, + #{"doc" => "My Record", + "namespace" => "records", + type => record, + name => "MyRecord", + fields => + [#{"doc" => "My Rec Field Type\nMy Rec Field", + "order" => "ignore", + "aliases" => ["my_alias"], + type => string, + name => "my_record_field"}]}], + messages => + [#{"doc" => "My Fun", + name => "hello", + request => [], + response => string}] + }, + idl_to_avpr("annotations")). + + +full_protocol_avpr_test() -> + ?assertMatch( + #{}, + idl_to_avpr("full_protocol")). + + +protocol_with_typedefs_avpr_test() -> + ?assertMatch( + #{"namespace" := "org.erlang.www", + protocol := "MyProto", + types := + [#{name := "MyEnum1"}, + #{name := "MyEnum2", + type := enum, + variants := ["VAR21", "VAR22", "VAR23"]}, + #{name := "MyFix", + type := fixed, + size := 10}, + #{name := "MyRec", + fields := + [#{type := int}, + #{type := string}, + #{type := float}, + #{type := boolean}, + #{type := "MyFix"}, + #{type := [boolean, null]}, + #{type := #{type := int, 'logicalType' := "date"}}, + #{type := #{type := bytes, precision := 5, scale := 2}}, + #{type := #{type := array, items := int}}, + #{type := #{type := array, items := int}}, + #{type := #{type := array, items := string}}, + #{type := #{type := map, values := float}}] + }, + #{name := "MyAnnotated", + "namespace" := "org.erlang.ftp", + fields := + [#{name := "error", + type := "org.erlang.www.MyError"}]}, + #{name := "MyError", + fields := + [#{type := "MyEnum2"}, + #{type := string}]}], + messages := + [#{name := "div"}, + #{name := "append"}, + #{name := "gen_server_cast"}, + #{name := "ping"}]}, + idl_to_avpr("protocol_with_typedefs")). + +%% Helpers + +idl_to_avpr(Name) -> + ProtocolTree = parse_idl(Name), + avro_idl:protocol_to_avpr(ProtocolTree, + avro_idl:new_context("")). + +parse_idl(Name) -> + File = "test/data/" ++ Name ++ ".avdl", + {ok, B} = file:read_file(File), + {ok, T0, _} = avro_idl_lexer:string(binary_to_list(B)), + %% ?debugFmt("Name: ~p~nTokens:~n~p", [Name, T0]), + T = avro_idl_lexer:preprocess(T0, [drop_comments, trim_doc]), + {ok, Tree} = avro_idl_parser:parse(T), + Tree. diff --git a/test/data/annotations.avdl b/test/data/annotations.avdl index 09c6db6..e081f1f 100644 --- a/test/data/annotations.avdl +++ b/test/data/annotations.avdl @@ -33,4 +33,7 @@ protocol MyProto { @order("ignore") string /** My Rec Field */@aliases(["my_alias"]) my_record_field; } + + /** My Fun */ + string hello(); } diff --git a/test/data/protocol_with_typedefs.avdl b/test/data/protocol_with_typedefs.avdl index b428d2d..e9c035a 100644 --- a/test/data/protocol_with_typedefs.avdl +++ b/test/data/protocol_with_typedefs.avdl @@ -40,9 +40,9 @@ protocol MyProto { string description; } - float mul(int arg1, float arg2 = 1.0); + float div(int arg1, float arg2 = 1.0) throws DivisionByZero; - MyFix append(bytes arg1, string arg2 = "tail") throws MyError; + MyFix append(bytes arg1, string arg2 = "tail") throws MyError, TheirError; void gen_server_cast(map opts) oneway; From 795be3852c706329ba5e75fb8dae9d1d653e7c84 Mon Sep 17 00:00:00 2001 From: Sergey Prokhorov Date: Sat, 14 Mar 2020 22:41:30 +0100 Subject: [PATCH 09/13] Better test coverage --- test/avro_idl_parse_tests.erl | 4 ++++ test/avro_idl_tests.erl | 22 ++++++++++++---------- test/data/annotations.avdl | 10 ++++++---- test/data/protocol_with_typedefs.avdl | 2 ++ 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/test/avro_idl_parse_tests.erl b/test/avro_idl_parse_tests.erl index 09128fb..24fece5 100644 --- a/test/avro_idl_parse_tests.erl +++ b/test/avro_idl_parse_tests.erl @@ -113,6 +113,10 @@ protocol_with_typedefs_test() -> default = null}, #field{name = "my_date", type = date}, + #field{name = "my_time", + type = time_ms}, + #field{name = "my_timestamp", + type = timestamp_ms}, #field{name = "my_decimal", type = {decimal, 5, 2}}, #field{name = "my_int_array", diff --git a/test/avro_idl_tests.erl b/test/avro_idl_tests.erl index f90475e..148392f 100644 --- a/test/avro_idl_tests.erl +++ b/test/avro_idl_tests.erl @@ -86,6 +86,9 @@ protocol_with_typedefs_avpr_test() -> #{type := "MyFix"}, #{type := [boolean, null]}, #{type := #{type := int, 'logicalType' := "date"}}, + #{type := #{type := int, 'logicalType' := "time-millis"}}, + #{type := #{type := long, + 'logicalType' := "timestamp-millis"}}, #{type := #{type := bytes, precision := 5, scale := 2}}, #{type := #{type := array, items := int}}, #{type := #{type := array, items := int}}, @@ -108,18 +111,17 @@ protocol_with_typedefs_avpr_test() -> #{name := "ping"}]}, idl_to_avpr("protocol_with_typedefs")). + +duplicate_annotation_test() -> + ?assertError( + {duplicate_annotation, "my_decorator", _, _}, + avro_idl:str_to_avpr( + "@my_decorator(\"a\") @my_decorator(\"b\") protocol MyProto{}", "") + ). + %% Helpers idl_to_avpr(Name) -> - ProtocolTree = parse_idl(Name), - avro_idl:protocol_to_avpr(ProtocolTree, - avro_idl:new_context("")). - -parse_idl(Name) -> File = "test/data/" ++ Name ++ ".avdl", {ok, B} = file:read_file(File), - {ok, T0, _} = avro_idl_lexer:string(binary_to_list(B)), - %% ?debugFmt("Name: ~p~nTokens:~n~p", [Name, T0]), - T = avro_idl_lexer:preprocess(T0, [drop_comments, trim_doc]), - {ok, Tree} = avro_idl_parser:parse(T), - Tree. + avro_idl:str_to_avpr(binary_to_list(B), ""). diff --git a/test/data/annotations.avdl b/test/data/annotations.avdl index e081f1f..8bdcdc3 100644 --- a/test/data/annotations.avdl +++ b/test/data/annotations.avdl @@ -10,10 +10,12 @@ protocol MyProto { /** My enum */ @namespace("enums") - enum MyEnum { + enum MyEnum { // my inline comment A, B, C } - + /* + * My multiline comment + */ /** My Fixed */ @namespace("fixeds") fixed MyFixed(16); @@ -23,14 +25,14 @@ protocol MyProto { error MyError { /** My Err Field */ @order("ignore") - string my_err_field; + string my_err_field; // other inline } /** My Record */ @namespace("records") record MyRecord { /** My Rec Field Type */ - @order("ignore") + @order("ignore") /* other multiline */ string /** My Rec Field */@aliases(["my_alias"]) my_record_field; } diff --git a/test/data/protocol_with_typedefs.avdl b/test/data/protocol_with_typedefs.avdl index e9c035a..73f25ec 100644 --- a/test/data/protocol_with_typedefs.avdl +++ b/test/data/protocol_with_typedefs.avdl @@ -23,6 +23,8 @@ protocol MyProto { MyFix my_custom; union {boolean, null} my_union = null; date my_date = 123456; + time_ms my_time = 23456; + timestamp_ms my_timestamp = 3456; decimal(5, 2) my_decimal = 1222; array my_int_array; From 8142a7109c5d5c435dcafee9a0149612883e9159 Mon Sep 17 00:00:00 2001 From: Sergey Prokhorov Date: Sat, 14 Mar 2020 22:42:15 +0100 Subject: [PATCH 10/13] Optimize doc strip preprocessor by pre-compiling regexp --- src/avro_idl.erl | 6 ++++++ src/avro_idl_lexer.xrl | 9 ++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/avro_idl.erl b/src/avro_idl.erl index 122b3eb..7507cea 100644 --- a/src/avro_idl.erl +++ b/src/avro_idl.erl @@ -1,3 +1,6 @@ +%%% @doc APIs to work with Avro IDL format +%%% +%%% See [https://avro.apache.org/docs/1.9.2/idl.html] -module(avro_idl). -export([new_context/1, @@ -45,6 +48,9 @@ protocol_to_avpr(#protocol{name = Name, process_imports(Defs, _St) -> %% TODO + %% https://avro.apache.org/docs/1.9.2/spec.html#names + %% when importing definitions from avdl or avpr, copy namespaces from + %% protocol to definitions, if not specified lists:filter(fun({import, _, _}) -> false; (_) -> true end, Defs). diff --git a/src/avro_idl_lexer.xrl b/src/avro_idl_lexer.xrl index 274a1ee..d1cd5df 100644 --- a/src/avro_idl_lexer.xrl +++ b/src/avro_idl_lexer.xrl @@ -82,16 +82,15 @@ do_preprocess(drop_comments, T) -> (_) -> true end, T); do_preprocess(trim_doc, T) -> + {ok, Re} = re:compile("^[\\s\\*]*((?U).*)[\\s]*$", [multiline]), lists:map( fun({doc_v, Loc, Val}) -> - {doc_v, Loc, trim_doc(Val)}; + Stripped = re:replace(Val, Re, "\\1", + [global, {return, list}]), + {doc_v, Loc, Stripped}; (Tok) -> Tok end, T). -trim_doc(Doc) -> - re:replace(Doc, "^[\\s\\*]*((?U).*)[\\s]*$", "\\1", - [global, multiline, {return, list}]). - %% Lexer internal helpers unescape(Token, Char) -> From 2b2406b60728462b4e0e9725adcc8959179aa894 Mon Sep 17 00:00:00 2001 From: Sergey Prokhorov Date: Sat, 14 Mar 2020 23:15:04 +0100 Subject: [PATCH 11/13] Allow any type inside array and map; fix single-letter namespaced IDs --- src/avro_idl_lexer.xrl | 2 +- src/avro_idl_parser.yrl | 8 +++--- test/avro_idl_parse_tests.erl | 46 ++++++++++++++++++++++++++++++++++- test/avro_idl_tests.erl | 21 ++++++++++++++++ 4 files changed, 71 insertions(+), 6 deletions(-) diff --git a/src/avro_idl_lexer.xrl b/src/avro_idl_lexer.xrl index d1cd5df..6650125 100644 --- a/src/avro_idl_lexer.xrl +++ b/src/avro_idl_lexer.xrl @@ -57,7 +57,7 @@ error|throws|oneway|void|import|idl|protocol|schema : {token, {list_to_atom(Toke [A-Za-z_][A-Za-z0-9_]* : {token, {id, TokenLine, TokenChars}}. %% namespaced will only be allowed in data type spec -[A-Za-z_][A-Za-z0-9_]+(\.[A-Za-z_][A-Za-z0-9_]+)+ : {token, {ns_id, TokenLine, TokenChars}}. +[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)+ : {token, {ns_id, TokenLine, TokenChars}}. %% https://blog.ostermiller.org/finding-comments-in-source-code-using-regular-expressions/ %% `/** .. */` is a docstring for the following object diff --git a/src/avro_idl_parser.yrl b/src/avro_idl_parser.yrl index d2c33b0..2a8d0cd 100644 --- a/src/avro_idl_parser.yrl +++ b/src/avro_idl_parser.yrl @@ -210,13 +210,13 @@ union_tail -> %% -- Array typedef array -> - array_t '<' primitive_t '>' : - {array, value_of('$3')}. %FIXME: not just primitives! + array_t '<' type '>' : + {array, '$3'}. %% -- Map typedef map -> - map_t '<' primitive_t '>' : - {map, value_of('$3')}. %FIXME: not just primitives! + map_t '<' type '>' : + {map, '$3'}. %% == Function (message) definitions diff --git a/test/avro_idl_parse_tests.erl b/test/avro_idl_parse_tests.erl index 24fece5..e65ce31 100644 --- a/test/avro_idl_parse_tests.erl +++ b/test/avro_idl_parse_tests.erl @@ -142,10 +142,54 @@ protocol_with_typedefs_test() -> #function{name = "ping", extra = undefined}]}, parse_idl("protocol_with_typedefs")). +array_types_test() -> + Probes = + [{int, "int"}, + {{decimal, 1, 2}, "decimal(1, 2)"}, + {null, "null"}, + {{custom, "MyType"}, "MyType"}, + {{custom, "my_ns.MyType"}, "my_ns.MyType"}, + {{union, [int, null]}, "union{int, null}"}, + {{array, int}, "array"}, + {{map, int}, "map"}], + lists:foreach( + fun({ExpectType, IdlType}) -> + test_field_type({array, ExpectType}, "array<" ++ IdlType ++ ">") + end, Probes). + +map_types_test() -> + Probes = + [{int, "int"}, + {{custom, "MyType"}, "MyType"}, + {{array, int}, "array"}, + {{map, int}, "map"}], + lists:foreach( + fun({ExpectType, IdlType}) -> + test_field_type({map, ExpectType}, "map<" ++ IdlType ++ ">") + end, Probes). + +%% Helpers + +test_field_type(ExpectType, IdlType) -> + Idl = ("protocol P {" + " record R { " ++ IdlType ++ " f; }" + "}"), + #protocol{ + definitions = + [#record{ + fields = + [#field{type = Type}]}]} = parse_str(Idl), + ?assertEqual(ExpectType, Type).%% , % ?assertEqual/3 only OTP-20+ + %% #{proto => Idl, + %% type => IdlType}). + parse_idl(Name) -> File = "test/data/" ++ Name ++ ".avdl", {ok, B} = file:read_file(File), - {ok, T0, _} = avro_idl_lexer:string(binary_to_list(B)), + parse_str(binary_to_list(B)). + +parse_str(Str) -> + {ok, T0, _} = avro_idl_lexer:string(Str), %% ?debugFmt("Name: ~p~nTokens:~n~p", [Name, T0]), T = avro_idl_lexer:preprocess(T0, [drop_comments, trim_doc]), {ok, Tree} = avro_idl_parser:parse(T), diff --git a/test/avro_idl_tests.erl b/test/avro_idl_tests.erl index 148392f..ec94b7e 100644 --- a/test/avro_idl_tests.erl +++ b/test/avro_idl_tests.erl @@ -119,6 +119,27 @@ duplicate_annotation_test() -> "@my_decorator(\"a\") @my_decorator(\"b\") protocol MyProto{}", "") ). +nested_complex_types_test() -> + ?assertEqual( + #{protocol => "P", + messages => [], + types => + [#{type => record, + name => "R", + fields => + [#{name => "f", + type => + #{type => array, + items => + #{type => map, + values => [null, "ns.T"]} + } + } + ]}]}, + avro_idl:str_to_avpr( + "protocol P { record R { array> f; }}", "") + ). + %% Helpers idl_to_avpr(Name) -> From c3bea118e83b30da1c9718d83e877a0376d345da Mon Sep 17 00:00:00 2001 From: Sergey Prokhorov Date: Sun, 15 Mar 2020 01:47:11 +0100 Subject: [PATCH 12/13] Use binaries for avpr JSON-map representation --- include/erlavro.hrl | 1 + src/avro_idl.erl | 113 +++++++++-------- src/avro_idl_lexer.xrl | 4 +- src/avro_idl_parser.yrl | 11 +- test/avro_idl_parse_tests.erl | 2 + test/avro_idl_tests.erl | 227 +++++++++++++++++++--------------- 6 files changed, 201 insertions(+), 157 deletions(-) diff --git a/include/erlavro.hrl b/include/erlavro.hrl index c3a331b..76984bf 100644 --- a/include/erlavro.hrl +++ b/include/erlavro.hrl @@ -37,6 +37,7 @@ -define(AVRO_MAP, <<"map">>). -define(AVRO_UNION, <<"union">>). -define(AVRO_FIXED, <<"fixed">>). +-define(AVRO_ERROR, <<"error">>). % idl -define(IS_AVRO_PRIMITIVE_NAME(N), (N =:= ?AVRO_NULL orelse diff --git a/src/avro_idl.erl b/src/avro_idl.erl index 7507cea..78a611c 100644 --- a/src/avro_idl.erl +++ b/src/avro_idl.erl @@ -1,6 +1,10 @@ %%% @doc APIs to work with Avro IDL format %%% -%%% See [https://avro.apache.org/docs/1.9.2/idl.html] +%%% This module allows to convert .avdl format to .avpr and .avsc as well +%%% as create Avro encoders and decoders. +%%% @end +%%% @reference See [https://avro.apache.org/docs/current/idl.html] +%%% @author Sergey Prokhhorov -module(avro_idl). -export([new_context/1, @@ -8,6 +12,7 @@ protocol_to_avpr/2, typedecl_to_avsc/2]). -include("idl.hrl"). +-include("erlavro.hrl"). -record(st, {cwd}). @@ -32,13 +37,13 @@ protocol_to_avpr(#protocol{name = Name, (_) -> true end, Defs), Protocol0 = - #{protocol => Name, - types => + #{<<"protocol">> => b(Name), + <<"types">> => lists:map( fun(Type) -> typedecl_to_avsc(Type, St) end, Types), - messages => + <<"messages">> => lists:map( fun(Message) -> message_to_avsc(Message, St) @@ -57,36 +62,36 @@ process_imports(Defs, _St) -> typedecl_to_avsc(#enum{name = Name, meta = Meta, variants = Vars}, _St) -> meta( - #{type => enum, - name => Name, - variants => Vars + #{<<"type">> => ?AVRO_ENUM, + <<"name">> => b(Name), + <<"variants">> => lists:map(fun b/1, Vars) }, Meta); typedecl_to_avsc(#fixed{name = Name, meta = Meta, size = Size}, _St) -> meta( - #{type => fixed, - name => Name, - size => Size}, + #{<<"type">> => ?AVRO_FIXED, + <<"name">> => b(Name), + <<"size">> => Size}, Meta); typedecl_to_avsc(#error{name = Name, meta = Meta, fields = Fields}, St) -> meta( - #{type => error, - name => Name, - fields => [field_to_avsc(Field, St) || Field <- Fields]}, + #{<<"type">> => ?AVRO_ERROR, + <<"name">> => b(Name), + <<"fields">> => [field_to_avsc(Field, St) || Field <- Fields]}, Meta); typedecl_to_avsc(#record{name = Name, meta = Meta, fields = Fields}, St) -> meta( - #{type => record, - name => Name, - fields => [field_to_avsc(Field, St) || Field <- Fields]}, + #{<<"type">> => ?AVRO_RECORD, + <<"name">> => b(Name), + <<"fields">> => [field_to_avsc(Field, St) || Field <- Fields]}, Meta). field_to_avsc(#field{name = Name, meta = Meta, type = Type, default = Default}, St) -> meta( default( - #{name => Name, - type => type_to_avsc(Type, St)}, + #{<<"name">> => b(Name), + <<"type">> => type_to_avsc(Type, St)}, Default), % TODO: maybe validate default matches type Meta). @@ -96,30 +101,28 @@ message_to_avsc(#function{name = Name, meta = Meta, %% TODO: arguments can just reuse `#field{}` ArgsSchema = [default( - #{name => ArgName, - type => type_to_avsc(Type, St)}, + #{<<"name">> => b(ArgName), + <<"type">> => type_to_avsc(Type, St)}, Default) || {arg, ArgName, Type, Default} <- Args], Schema0 = - #{name => Name, - request => ArgsSchema, - response => type_to_avsc(Return, St)}, + #{<<"name">> => b(Name), + <<"request">> => ArgsSchema, + <<"response">> => type_to_avsc(Return, St)}, Schema1 = case Extra of undefined -> Schema0; oneway -> - Schema0#{'one-way' => true}; + Schema0#{<<"one-way">> => true}; {throws, ThrowsTypes} -> - %% Throws = [type_to_avsc(TType, St) - %% || TType <- ThrowsTypes], - Schema0#{error => ThrowsTypes} + Schema0#{<<"error">> => lists:map(fun b/1, ThrowsTypes)} end, meta(Schema1, Meta). type_to_avsc(void, _St) -> - null; + ?AVRO_NULL; type_to_avsc(null, _St) -> - null; + ?AVRO_NULL; type_to_avsc(T, _St) when T == int; T == long; T == string; @@ -127,31 +130,31 @@ type_to_avsc(T, _St) when T == int; T == float; T == double; T == bytes -> - T; + atom_to_binary(T, utf8); type_to_avsc({decimal, Precision, Scale}, _St) -> - #{type => bytes, - 'logicalType' => "decimal", - precision => Precision, - scale => Scale}; + #{<<"type">> => ?AVRO_BYTES, + <<"logicalType">> => <<"decimal">>, + <<"precision">> => Precision, + <<"scale">> => Scale}; type_to_avsc(date, _St) -> - #{type => int, - 'logicalType' => "date"}; + #{<<"type">> => ?AVRO_INT, + <<"logicalType">> => <<"date">>}; type_to_avsc(time_ms, _St) -> - #{type => int, - 'logicalType' => "time-millis"}; + #{<<"type">> => ?AVRO_INT, + <<"logicalType">> => <<"time-millis">>}; type_to_avsc(timestamp_ms, _St) -> - #{type => long, - 'logicalType' => "timestamp-millis"}; + #{<<"type">> => ?AVRO_LONG, + <<"logicalType">> => <<"timestamp-millis">>}; type_to_avsc({custom, Id}, _St) -> - Id; + b(Id); type_to_avsc({union, Types}, St) -> [type_to_avsc(Type, St) || Type <- Types]; type_to_avsc({array, Of}, St) -> - #{type => array, - items => type_to_avsc(Of, St)}; + #{<<"type">> => ?AVRO_ARRAY, + <<"items">> => type_to_avsc(Of, St)}; type_to_avsc({map, ValType}, St) -> - #{type => map, - values => type_to_avsc(ValType, St)}. + #{<<"type">> => ?AVRO_MAP, + <<"values">> => type_to_avsc(ValType, St)}. meta(Schema, Meta) -> {Docs, Annotations} = @@ -163,17 +166,27 @@ meta(Schema, Meta) -> [] -> Schema; _ -> DocStrings = [S || {doc, S} <- Docs], - Schema#{"doc" => lists:flatten(lists:join( - "\n", DocStrings))} + Schema#{<<"doc">> => b(lists:join( + "\n", DocStrings))} end, lists:foldl( fun(#annotation{name = Name, value = Value}, Schema2) -> - maps:is_key(Name, Schema2) andalso + BName = b(Name), + BVal = case Value of + [] -> <<>>; + [C | _] when is_integer(C) -> b(Value); + _ -> + [b(Str) || Str <- Value] + end, + maps:is_key(BName, Schema2) andalso error({duplicate_annotation, Name, Value, Schema2}), - Schema2#{Name => Value} + Schema2#{BName => BVal} end, Schema1, Annotations). default(Obj, undefined) -> Obj; default(Obj, Default) -> - Obj#{default => Default}. + Obj#{<<"default">> => Default}. + +b(Str) when is_list(Str) -> + unicode:characters_to_binary(Str). diff --git a/src/avro_idl_lexer.xrl b/src/avro_idl_lexer.xrl index 6650125..4ec393c 100644 --- a/src/avro_idl_lexer.xrl +++ b/src/avro_idl_lexer.xrl @@ -1,5 +1,7 @@ %% @doc Avro IDL lexer -%% https://avro.apache.org/docs/current/idl.html +%% @end +%% @reference See [https://avro.apache.org/docs/current/idl.html] +%% @author Sergey Prokhhorov Definitions. diff --git a/src/avro_idl_parser.yrl b/src/avro_idl_parser.yrl index 2a8d0cd..147fa09 100644 --- a/src/avro_idl_parser.yrl +++ b/src/avro_idl_parser.yrl @@ -1,8 +1,9 @@ -%% @doc Avro IDL parser -%% https://avro.apache.org/docs/1.9.2/idl.html -%% XXX: all `comment_v` tokens should be filtered-out before parsing! - -Header "%% Hello". +Header "%%% @doc Avro IDL parser +%%% +%%% XXX: all `comment_v' tokens should be filtered-out before parsing! +%%% @end +%%% @reference See [https://avro.apache.org/docs/current/idl.html] +%%% @author Sergey Prokhhorov ". Terminals id ns_id null string_v doc_v float_v integer_v bool_v annotation_v primitive_t logical_t decimal_t diff --git a/test/avro_idl_parse_tests.erl b/test/avro_idl_parse_tests.erl index e65ce31..3119dd7 100644 --- a/test/avro_idl_parse_tests.erl +++ b/test/avro_idl_parse_tests.erl @@ -1,4 +1,6 @@ %% @doc Tests for IDL lexer + parser +%% @end +%% @author Sergey Prokhhorov -module(avro_idl_parse_tests). -include("../src/idl.hrl"). diff --git a/test/avro_idl_tests.erl b/test/avro_idl_tests.erl index ec94b7e..ceaba01 100644 --- a/test/avro_idl_tests.erl +++ b/test/avro_idl_tests.erl @@ -1,62 +1,71 @@ %% @doc Tests for IDL converter / loader +%% @end +%% @author Sergey Prokhhorov -module(avro_idl_tests). -include("../src/idl.hrl"). -include_lib("eunit/include/eunit.hrl"). - +-include("erlavro.hrl"). empty_protocol_avpr_test() -> ?assertEqual( - #{protocol => "MyProto", - types => [], - messages => []}, + #{<<"protocol">> => <<"MyProto">>, + <<"types">> => [], + <<"messages">> => []}, idl_to_avpr("empty_protocol")). annotations_avpr_test() -> + Proto = idl_to_avpr("annotations"), + ?assertEqual( + #{<<"doc">> => + <<"My protocol\nNo, really\nIt's some multiline doc\n" + "bullet points will be stripped\nso no unordered lists">>, + <<"version">> => <<"1.0">>, + <<"aliases">> => [<<"ns.Proto1">>, <<"ns.Proto2">>], + <<"protocol">> => <<"MyProto">> + }, + maps:without([<<"types">>, <<"messages">>], Proto) + ), + #{<<"types">> := Types, + <<"messages">> := Messages} = Proto, + ?assertEqual( + [#{<<"doc">> => <<"My enum">>, + <<"namespace">> => <<"enums">>, + <<"type">> => ?AVRO_ENUM, + <<"name">> => <<"MyEnum">>, + <<"variants">> => [<<"A">>, <<"B">>, <<"C">>]}, + #{<<"doc">> => <<"My Fixed">>, + <<"namespace">> => <<"fixeds">>, + <<"type">> => ?AVRO_FIXED, + <<"name">> => <<"MyFixed">>, + <<"size">> => 16}, + #{<<"doc">> => <<"My Error">>, + <<"namespace">> => <<"errors">>, + <<"type">> => ?AVRO_ERROR, + <<"name">> => <<"MyError">>, + <<"fields">> => + [#{<<"doc">> => <<"My Err Field">>, + <<"order">> => <<"ignore">>, + <<"type">> => ?AVRO_STRING, + <<"name">> => <<"my_err_field">>}]}, + #{<<"doc">> => <<"My Record">>, + <<"namespace">> => <<"records">>, + <<"type">> => ?AVRO_RECORD, + <<"name">> => <<"MyRecord">>, + <<"fields">> => + [#{<<"doc">> => <<"My Rec Field Type\nMy Rec Field">>, + <<"order">> => <<"ignore">>, + <<"aliases">> => [<<"my_alias">>], + <<"type">> => ?AVRO_STRING, + <<"name">> => <<"my_record_field">>}]}], + Types), ?assertEqual( - #{"doc" => ("My protocol\nNo, really\nIt's some multiline doc\n" - "bullet points will be stripped\nso no unordered lists"), - "version" => "1.0", - "aliases" => ["ns.Proto1", "ns.Proto2"], - protocol => "MyProto", - types => - [#{"doc" => "My enum", - "namespace" => "enums", - type => enum, - name => "MyEnum", - variants => ["A", "B", "C"]}, - #{"doc" => "My Fixed", - "namespace" => "fixeds", - type => fixed, - name => "MyFixed", - size => 16}, - #{"doc" => "My Error", - "namespace" => "errors", - type => error, - name => "MyError", - fields => - [#{"doc" => "My Err Field", - "order" => "ignore", - type => string, - name => "my_err_field"}]}, - #{"doc" => "My Record", - "namespace" => "records", - type => record, - name => "MyRecord", - fields => - [#{"doc" => "My Rec Field Type\nMy Rec Field", - "order" => "ignore", - "aliases" => ["my_alias"], - type => string, - name => "my_record_field"}]}], - messages => - [#{"doc" => "My Fun", - name => "hello", - request => [], - response => string}] - }, - idl_to_avpr("annotations")). + [#{<<"doc">> => <<"My Fun">>, + <<"name">> => <<"hello">>, + <<"request">> => [], + <<"response">> => ?AVRO_STRING}], + Messages). full_protocol_avpr_test() -> @@ -66,50 +75,66 @@ full_protocol_avpr_test() -> protocol_with_typedefs_avpr_test() -> + Proto = idl_to_avpr("protocol_with_typedefs"), + ?assertMatch( + #{<<"namespace">> := <<"org.erlang.www">>, + <<"protocol">> := <<"MyProto">>, + <<"types">> := _, + <<"messages">> := _}, + Proto), + #{<<"types">> := Types, + <<"messages">> := Messages} = Proto, + ?assertMatch( + [#{<<"name">> := <<"MyEnum1">>}, + #{<<"name">> := <<"MyEnum2">>, + <<"type">> := ?AVRO_ENUM, + <<"variants">> := [<<"VAR21">>, <<"VAR22">>, <<"VAR23">>]}, + #{<<"name">> := <<"MyFix">>, + <<"type">> := ?AVRO_FIXED, + <<"size">> := 10}, + #{<<"name">> := <<"MyRec">>, + <<"fields">> := + [#{<<"type">> := ?AVRO_INT}, + #{<<"type">> := ?AVRO_STRING}, + #{<<"type">> := ?AVRO_FLOAT}, + #{<<"type">> := ?AVRO_BOOLEAN}, + #{<<"type">> := <<"MyFix">>}, + #{<<"type">> := [?AVRO_BOOLEAN, ?AVRO_NULL]}, + #{<<"type">> := #{<<"type">> := ?AVRO_INT, + <<"logicalType">> := <<"date">>}}, + #{<<"type">> := #{<<"type">> := ?AVRO_INT, + <<"logicalType">> := <<"time-millis">>}}, + #{<<"type">> := #{<<"type">> := ?AVRO_LONG, + <<"logicalType">> := <<"timestamp-millis">>}}, + #{<<"type">> := #{<<"type">> := ?AVRO_BYTES, + <<"precision">> := 5, + <<"scale">> := 2}}, + #{<<"type">> := #{<<"type">> := ?AVRO_ARRAY, + <<"items">> := ?AVRO_INT}}, + #{<<"type">> := #{<<"type">> := ?AVRO_ARRAY, + <<"items">> := ?AVRO_INT}}, + #{<<"type">> := #{<<"type">> := ?AVRO_ARRAY, + <<"items">> := ?AVRO_STRING}}, + #{<<"type">> := #{<<"type">> := ?AVRO_MAP, + <<"values">> := ?AVRO_FLOAT}}] + }, + #{<<"name">> := <<"MyAnnotated">>, + <<"namespace">> := <<"org.erlang.ftp">>, + <<"fields">> := + [#{<<"name">> := <<"error">>, + <<"type">> := <<"org.erlang.www.MyError">>}]}, + #{<<"name">> := <<"MyError">>, + <<"fields">> := + [#{<<"type">> := <<"MyEnum2">>}, + #{<<"type">> := ?AVRO_STRING}]}], + Types), ?assertMatch( - #{"namespace" := "org.erlang.www", - protocol := "MyProto", - types := - [#{name := "MyEnum1"}, - #{name := "MyEnum2", - type := enum, - variants := ["VAR21", "VAR22", "VAR23"]}, - #{name := "MyFix", - type := fixed, - size := 10}, - #{name := "MyRec", - fields := - [#{type := int}, - #{type := string}, - #{type := float}, - #{type := boolean}, - #{type := "MyFix"}, - #{type := [boolean, null]}, - #{type := #{type := int, 'logicalType' := "date"}}, - #{type := #{type := int, 'logicalType' := "time-millis"}}, - #{type := #{type := long, - 'logicalType' := "timestamp-millis"}}, - #{type := #{type := bytes, precision := 5, scale := 2}}, - #{type := #{type := array, items := int}}, - #{type := #{type := array, items := int}}, - #{type := #{type := array, items := string}}, - #{type := #{type := map, values := float}}] - }, - #{name := "MyAnnotated", - "namespace" := "org.erlang.ftp", - fields := - [#{name := "error", - type := "org.erlang.www.MyError"}]}, - #{name := "MyError", - fields := - [#{type := "MyEnum2"}, - #{type := string}]}], - messages := - [#{name := "div"}, - #{name := "append"}, - #{name := "gen_server_cast"}, - #{name := "ping"}]}, - idl_to_avpr("protocol_with_typedefs")). + [#{<<"name">> := <<"div">>}, + #{<<"name">> := <<"append">>, + <<"error">> := [<<"MyError">>, <<"TheirError">>]}, + #{<<"name">> := <<"gen_server_cast">>, <<"one-way">> := true}, + #{<<"name">> := <<"ping">>}], + Messages). duplicate_annotation_test() -> @@ -121,18 +146,18 @@ duplicate_annotation_test() -> nested_complex_types_test() -> ?assertEqual( - #{protocol => "P", - messages => [], - types => - [#{type => record, - name => "R", - fields => - [#{name => "f", - type => - #{type => array, - items => - #{type => map, - values => [null, "ns.T"]} + #{<<"protocol">> => <<"P">>, + <<"messages">> => [], + <<"types">> => + [#{<<"type">> => ?AVRO_RECORD, + <<"name">> => <<"R">>, + <<"fields">> => + [#{<<"name">> => <<"f">>, + <<"type">> => + #{<<"type">> => ?AVRO_ARRAY, + <<"items">> => + #{<<"type">> => ?AVRO_MAP, + <<"values">> => [?AVRO_NULL, <<"ns.T">>]} } } ]}]}, From 0cbf415ba7116932f7521655e9e558c237769aaf Mon Sep 17 00:00:00 2001 From: Sergey Prokhorov Date: Sun, 15 Mar 2020 03:21:12 +0100 Subject: [PATCH 13/13] Load .avdl to erlavro internal format Now it's possible to use basic .avdl to encode/decode avro --- src/avro.erl | 9 ++++++--- src/avro_idl.erl | 22 ++++++++++++++++++++-- src/avro_idl_lexer.xrl | 2 +- src/avro_idl_parser.yrl | 4 ++-- src/avro_json_decoder.erl | 18 +++++++++++++++--- test/avro_idl_parse_tests.erl | 2 +- test/avro_idl_tests.erl | 24 +++++++++++++++++------- 7 files changed, 62 insertions(+), 19 deletions(-) diff --git a/src/avro.erl b/src/avro.erl index 8c7437e..38faecd 100644 --- a/src/avro.erl +++ b/src/avro.erl @@ -143,7 +143,8 @@ -type crc64_fingerprint() :: avro_fingerprint:crc64(). %% @doc Decode JSON format avro schema into `erlavro' internals. --spec decode_schema(binary()) -> avro_type(). +%% @param JSON: JSON binary or erlang `map()' json representation +-spec decode_schema(binary() | map() | [map()]) -> avro_type(). decode_schema(JSON) -> avro_json_decoder:decode_schema(JSON). %% @doc Make type lookup function from type definition. @@ -176,7 +177,8 @@ make_lkup_fun(AssignedName, Type) -> %% * allow_type_redefine: `boolean()' %% This option is to allow one type being defined more than once. %% @end --spec decode_schema(binary(), proplists:proplist()) -> avro_type(). +-spec decode_schema(binary() | map() | [map()], proplists:proplist()) -> + avro_type(). decode_schema(JSON, Options) -> avro_json_decoder:decode_schema(JSON, Options). @@ -269,7 +271,8 @@ make_decoder(Schema, Options) -> %% takes only one `binary()' input arg. -spec make_simple_decoder(avro_type() | binary(), codec_options()) -> simple_decoder(). -make_simple_decoder(JSON, Options) when is_binary(JSON) -> +make_simple_decoder(JSON, Options) when is_binary(JSON); + is_map(JSON) -> make_simple_decoder(decode_schema(JSON), Options); make_simple_decoder(Type, Options) when ?IS_TYPE_RECORD(Type) -> Lkup = make_lkup_fun(Type), diff --git a/src/avro_idl.erl b/src/avro_idl.erl index 78a611c..a1b9533 100644 --- a/src/avro_idl.erl +++ b/src/avro_idl.erl @@ -4,9 +4,10 @@ %%% as create Avro encoders and decoders. %%% @end %%% @reference See [https://avro.apache.org/docs/current/idl.html] -%%% @author Sergey Prokhhorov +%%% @author Sergey Prokhorov -module(avro_idl). +-export([decode_schema/2]). -export([new_context/1, str_to_avpr/2, protocol_to_avpr/2, @@ -16,6 +17,23 @@ -record(st, {cwd}). +decode_schema(SchemaStr, Cwd) -> + Protocol = str_to_avpr(SchemaStr, Cwd), + #{<<"types">> := Types0} = Protocol, + Types1 = lists:filter( + fun(#{<<"type">> := TName}) ->TName =/= <<"error">> end, Types0), + Ns = maps:get(<<"namespace">>, Protocol, ?AVRO_NS_GLOBAL), + Types = lists:map( + fun(T) -> + case maps:is_key(<<"namespace">>, T) of + false -> + T#{<<"namespace">> => Ns}; + true -> + T + end + end, Types1), + avro:decode_schema(Types, [{ignore_bad_default_values, true}]). + new_context(Cwd) -> #st{cwd = Cwd}. @@ -64,7 +82,7 @@ typedecl_to_avsc(#enum{name = Name, meta = Meta, variants = Vars}, _St) -> meta( #{<<"type">> => ?AVRO_ENUM, <<"name">> => b(Name), - <<"variants">> => lists:map(fun b/1, Vars) + <<"symbols">> => lists:map(fun b/1, Vars) }, Meta); typedecl_to_avsc(#fixed{name = Name, meta = Meta, size = Size}, _St) -> diff --git a/src/avro_idl_lexer.xrl b/src/avro_idl_lexer.xrl index 4ec393c..4d4d545 100644 --- a/src/avro_idl_lexer.xrl +++ b/src/avro_idl_lexer.xrl @@ -1,7 +1,7 @@ %% @doc Avro IDL lexer %% @end %% @reference See [https://avro.apache.org/docs/current/idl.html] -%% @author Sergey Prokhhorov +%% @author Sergey Prokhorov Definitions. diff --git a/src/avro_idl_parser.yrl b/src/avro_idl_parser.yrl index 147fa09..a76ac6b 100644 --- a/src/avro_idl_parser.yrl +++ b/src/avro_idl_parser.yrl @@ -3,7 +3,7 @@ Header "%%% @doc Avro IDL parser %%% XXX: all `comment_v' tokens should be filtered-out before parsing! %%% @end %%% @reference See [https://avro.apache.org/docs/current/idl.html] -%%% @author Sergey Prokhhorov ". +%%% @author Sergey Prokhorov ". Terminals id ns_id null string_v doc_v float_v integer_v bool_v annotation_v primitive_t logical_t decimal_t @@ -120,7 +120,7 @@ import_file_type -> schema_k : schema. %% -- Enum typedef enum -> - enum_t id '{' id enum_variants : + enum_t id '{' id enum_variants : % TODO: add support for default #enum{name = value_of('$2'), variants = [value_of('$4') | '$5']}. enum -> meta enum : diff --git a/src/avro_json_decoder.erl b/src/avro_json_decoder.erl index 3749206..248e9e5 100644 --- a/src/avro_json_decoder.erl +++ b/src/avro_json_decoder.erl @@ -47,7 +47,7 @@ %%%_* APIs ===================================================================== %% @doc Decode JSON format avro schema into erlavro internals. --spec decode_schema(binary()) -> avro_type(). +-spec decode_schema(binary() | map() | [map()]) -> avro_type(). decode_schema(JSON) -> decode_schema(JSON, _Opts = []). @@ -66,7 +66,7 @@ decode_schema(JSON) -> %% * allow_type_redefine: `boolean()' %% This option is to allow one type being defined more than once. %% @end --spec decode_schema(binary(), sc_opts()) -> avro_type(). +-spec decode_schema(binary() | map() | [map()], sc_opts()) -> avro_type(). decode_schema(JSON, Opts) when is_list(Opts) -> %% Parse JSON first Type = parse_schema(decode_json(JSON)), @@ -511,9 +511,21 @@ do_parse_union_ex(ValueTypeName, Value, UnionType, %% 'map' is a better option, but we have to keep it backward compatible. %% 'proplist' is not an option because otherwise there is no way to tell %% apart 'object' and 'array'. --spec decode_json(binary()) -> json_value(). +-spec decode_json(binary() | map() | [map()]) -> json_value(). +decode_json(Parsed) when is_map(Parsed); + is_list(Parsed) -> + map_to_tuple(Parsed); decode_json(JSON) -> jsone:decode(JSON, [{object_format, tuple}]). +%% recursively convert map to json-tuple format +map_to_tuple(Map) when is_map(Map) -> + {[{K, map_to_tuple(V)} + || {K, V} <- maps:to_list(Map)]}; +map_to_tuple(Array) when is_list(Array) -> + lists:map(fun map_to_tuple/1, Array); +map_to_tuple(Other) -> + Other. + %% Filter out non-custom properties. -spec filter_custom_props([{binary(), json_value()}], [name()]) -> [custom_prop()]. diff --git a/test/avro_idl_parse_tests.erl b/test/avro_idl_parse_tests.erl index 3119dd7..074f510 100644 --- a/test/avro_idl_parse_tests.erl +++ b/test/avro_idl_parse_tests.erl @@ -1,6 +1,6 @@ %% @doc Tests for IDL lexer + parser %% @end -%% @author Sergey Prokhhorov +%% @author Sergey Prokhorov -module(avro_idl_parse_tests). -include("../src/idl.hrl"). diff --git a/test/avro_idl_tests.erl b/test/avro_idl_tests.erl index ceaba01..56d0e68 100644 --- a/test/avro_idl_tests.erl +++ b/test/avro_idl_tests.erl @@ -1,6 +1,6 @@ %% @doc Tests for IDL converter / loader %% @end -%% @author Sergey Prokhhorov +%% @author Sergey Prokhorov -module(avro_idl_tests). -include("../src/idl.hrl"). @@ -34,7 +34,7 @@ annotations_avpr_test() -> <<"namespace">> => <<"enums">>, <<"type">> => ?AVRO_ENUM, <<"name">> => <<"MyEnum">>, - <<"variants">> => [<<"A">>, <<"B">>, <<"C">>]}, + <<"symbols">> => [<<"A">>, <<"B">>, <<"C">>]}, #{<<"doc">> => <<"My Fixed">>, <<"namespace">> => <<"fixeds">>, <<"type">> => ?AVRO_FIXED, @@ -88,7 +88,7 @@ protocol_with_typedefs_avpr_test() -> [#{<<"name">> := <<"MyEnum1">>}, #{<<"name">> := <<"MyEnum2">>, <<"type">> := ?AVRO_ENUM, - <<"variants">> := [<<"VAR21">>, <<"VAR22">>, <<"VAR23">>]}, + <<"symbols">> := [<<"VAR21">>, <<"VAR22">>, <<"VAR23">>]}, #{<<"name">> := <<"MyFix">>, <<"type">> := ?AVRO_FIXED, <<"size">> := 10}, @@ -137,14 +137,14 @@ protocol_with_typedefs_avpr_test() -> Messages). -duplicate_annotation_test() -> +duplicate_annotation_avpr_test() -> ?assertError( {duplicate_annotation, "my_decorator", _, _}, avro_idl:str_to_avpr( "@my_decorator(\"a\") @my_decorator(\"b\") protocol MyProto{}", "") ). -nested_complex_types_test() -> +nested_complex_types_avr_test() -> ?assertEqual( #{<<"protocol">> => <<"P">>, <<"messages">> => [], @@ -165,9 +165,19 @@ nested_complex_types_test() -> "protocol P { record R { array> f; }}", "") ). +full_protocol_load_test() -> + Schema = read_schema("full_protocol"), + DecSchema = avro_idl:decode_schema(Schema, ""), + _EncSchema = avro:encode_schema(DecSchema). + %% ?debugFmt("~n~p~n~s", [DecSchema, EncSchema]). + %% Helpers -idl_to_avpr(Name) -> +read_schema(Name) -> File = "test/data/" ++ Name ++ ".avdl", {ok, B} = file:read_file(File), - avro_idl:str_to_avpr(binary_to_list(B), ""). + binary_to_list(B). + +idl_to_avpr(Name) -> + Schema = read_schema(Name), + avro_idl:str_to_avpr(Schema, "").