klarna · seriyps · Jul 19, 2019 · Jul 22, 2019 · Jul 22, 2019 · Jul 23, 2019
diff --git a/.gitignore b/.gitignore
@@ -12,3 +12,5 @@ out/
 _build
 rebar.lock
 *.crashdump
+src/avro_idl_lexer.erl
+src/avro_idl_parser.erl
diff --git a/include/erlavro.hrl b/include/erlavro.hrl
@@ -37,6 +37,7 @@
 -define(AVRO_MAP,     <<"map">>).
 -define(AVRO_UNION,   <<"union">>).
 -define(AVRO_FIXED,   <<"fixed">>).
+-define(AVRO_ERROR,   <<"error">>).             % idl
 
 -define(IS_AVRO_PRIMITIVE_NAME(N),
         (N =:= ?AVRO_NULL    orelse

diff --git a/rebar.config b/rebar.config
@@ -1,9 +1,9 @@
 %% -*- mode:erlang -*-
 {erl_opts,             [ debug_info
-                       , warnings_as_errors
+%                       , warnings_as_errors
                        , {d,'NOTEST'}
                        ]}.
-{eunit_opts,           [verbose]}.
+%% {eunit_opts,           [verbose]}.
 {xref_checks,          [ undefined_function_calls
                        , deprecated_function_calls
                        ]}.
@@ -15,3 +15,5 @@
 {cover_opts, [verbose]}.
 {cover_enabled, true}.
 {cover_export_enabled, true}.
+
+{yrl_opts, [{verbose, true}]}.
diff --git a/src/avro.erl b/src/avro.erl
@@ -143,7 +143,8 @@
 -type crc64_fingerprint() :: avro_fingerprint:crc64().
 
 %% @doc Decode JSON format avro schema into `erlavro' internals.
--spec decode_schema(binary()) -> avro_type().
+%% @param JSON: JSON binary or erlang `map()' json representation
+-spec decode_schema(binary() | map() | [map()]) -> avro_type().
 decode_schema(JSON) -> avro_json_decoder:decode_schema(JSON).
 
 %% @doc Make type lookup function from type definition.
@@ -176,7 +177,8 @@ make_lkup_fun(AssignedName, Type) ->
 %%  * allow_type_redefine: `boolean()'
 %%     This option is to allow one type being defined more than once.
 %% @end
--spec decode_schema(binary(), proplists:proplist()) -> avro_type().
+-spec decode_schema(binary() | map() | [map()], proplists:proplist()) ->
+                       avro_type().
 decode_schema(JSON, Options) ->
   avro_json_decoder:decode_schema(JSON, Options).
 
@@ -269,7 +271,8 @@ make_decoder(Schema, Options) ->
 %% takes only one `binary()' input arg.
 -spec make_simple_decoder(avro_type() | binary(), codec_options()) ->
         simple_decoder().
-make_simple_decoder(JSON, Options) when is_binary(JSON) ->
+make_simple_decoder(JSON, Options) when is_binary(JSON);
+                                        is_map(JSON) ->
   make_simple_decoder(decode_schema(JSON), Options);
 make_simple_decoder(Type, Options) when ?IS_TYPE_RECORD(Type) ->
   Lkup = make_lkup_fun(Type),

diff --git a/src/avro_idl.erl b/src/avro_idl.erl
@@ -0,0 +1,210 @@
+%%% @doc APIs to work with Avro IDL format
+%%%
+%%% This module allows converting .avdl format to .avpr and .avsc, as well
+%%% as creating Avro encoders and decoders.
+%%% @end
+%%% @reference See [https://avro.apache.org/docs/current/idl.html]
+%%% @author Sergey Prokhorov <[email protected]>
+-module(avro_idl).
+
+-export([decode_schema/2]).
+-export([new_context/1,
+         str_to_avpr/2,
+         protocol_to_avpr/2,
+         typedecl_to_avsc/2]).
+-include("idl.hrl").
+-include("erlavro.hrl").
+
+-record(st, {cwd}).
+
+%% @doc Parse Avro IDL source and decode its type declarations.
+%%
+%% The IDL text is converted to a protocol map with `str_to_avpr/2'.
+%% Declarations whose `type' is `error' are dropped, every remaining
+%% declaration without a `namespace' key receives the protocol's namespace
+%% (or `?AVRO_NS_GLOBAL' when the protocol has none), and the resulting list
+%% is passed to `avro:decode_schema/2' with `ignore_bad_default_values'
+%% enabled.
+%% @end
+decode_schema(SchemaStr, Cwd) ->
+    Protocol = str_to_avpr(SchemaStr, Cwd),
+    #{<<"types">> := Types0} = Protocol,
+    %% Error declarations are emitted with ?AVRO_ERROR by typedecl_to_avsc/2;
+    %% filter on the same macro so producer and filter cannot drift apart.
+    Types1 = lists:filter(
+               fun(#{<<"type">> := TName}) -> TName =/= ?AVRO_ERROR end,
+               Types0),
+    Ns = maps:get(<<"namespace">>, Protocol, ?AVRO_NS_GLOBAL),
+    Types = lists:map(
+              fun(T) ->
+                      %% An explicit per-type namespace always wins.
+                      case maps:is_key(<<"namespace">>, T) of
+                          false -> T#{<<"namespace">> => Ns};
+                          true -> T
+                      end
+              end, Types1),
+    avro:decode_schema(Types, [{ignore_bad_default_values, true}]).
+
+%% @doc Build the conversion context record that is threaded through the
+%% `*_to_avsc' / `*_to_avpr' functions. It only carries `Cwd'.
+%% @end
+new_context(Cwd) ->
+    Context = #st{cwd = Cwd},
+    Context.
+
+%% @doc Convert IDL source text to a protocol map using the default token
+%% preprocessing steps (`drop_comments' and `trim_doc').
+%% @end
+str_to_avpr(String, Cwd) ->
+    DefaultOpts = [drop_comments, trim_doc],
+    str_to_avpr(String, Cwd, DefaultOpts).
+
+%% Lex, preprocess and parse IDL source text, then convert the parse tree
+%% to a protocol map. `Opts' is the list of preprocessing actions handed to
+%% avro_idl_lexer:preprocess/2. Lexer or parser failures surface as badmatch.
+str_to_avpr(String, Cwd, Opts) ->
+    {ok, RawTokens, _} = avro_idl_lexer:string(String),
+    Tokens = avro_idl_lexer:preprocess(RawTokens, Opts),
+    {ok, Tree} = avro_idl_parser:parse(Tokens),
+    Context = new_context(Cwd),
+    protocol_to_avpr(Tree, Context).
+
+%% @doc Convert a parsed `#protocol{}' into a protocol map.
+%%
+%% Definitions are first passed through `process_imports/2', then split
+%% into messages (`#function{}' records) and type declarations (everything
+%% else). The result holds the `protocol' name, the converted `types' and
+%% `messages' lists, plus whatever `meta/2' adds from the protocol's `Meta'.
+%% @end
+protocol_to_avpr(#protocol{name = Name,
+                           meta = Meta,
+                           definitions = Defs0}, St) ->
+    Defs = process_imports(Defs0, St),
+    IsTypeDecl = fun(#function{}) -> false;
+                    (_) -> true
+                 end,
+    {TypeDecls, MessageDecls} = lists:partition(IsTypeDecl, Defs),
+    ProtocolName = b(Name),
+    TypeSchemas = [typedecl_to_avsc(Decl, St) || Decl <- TypeDecls],
+    MessageSchemas = [message_to_avsc(Decl, St) || Decl <- MessageDecls],
+    Protocol = #{<<"protocol">> => ProtocolName,
+                 <<"types">> => TypeSchemas,
+                 <<"messages">> => MessageSchemas},
+    meta(Protocol, Meta).
+
+%% Remove `{import, _, _}' entries from the definition list. Imports are
+%% not resolved yet, they are only discarded.
+process_imports(Defs, _St) ->
+    %% TODO
+    %% https://avro.apache.org/docs/1.9.2/spec.html#names
+    %% when importing definitions from avdl or avpr, copy namespaces from
+    %% protocol to definitions, if not specified
+    [Def || Def <- Defs, not is_import(Def)].
+
+%% True only for 3-tuples tagged `import'.
+is_import({import, _, _}) -> true;
+is_import(_) -> false.
+
+%% @doc Convert one parsed type declaration (`#enum{}', `#fixed{}',
+%% `#error{}' or `#record{}') into its schema map, including doc and
+%% annotations taken from the declaration's `meta'.
+%% @end
+typedecl_to_avsc(#enum{name = Name, meta = Meta, variants = Vars}, _St) ->
+    EnumSchema = #{<<"type">> => ?AVRO_ENUM,
+                   <<"name">> => b(Name),
+                   <<"symbols">> => lists:map(fun b/1, Vars)},
+    meta(EnumSchema, Meta);
+typedecl_to_avsc(#fixed{name = Name, meta = Meta, size = Size}, _St) ->
+    FixedSchema = #{<<"type">> => ?AVRO_FIXED,
+                    <<"name">> => b(Name),
+                    <<"size">> => Size},
+    meta(FixedSchema, Meta);
+typedecl_to_avsc(#error{name = Name, meta = Meta, fields = Fields}, St) ->
+    fields_decl_to_avsc(?AVRO_ERROR, Name, Meta, Fields, St);
+typedecl_to_avsc(#record{name = Name, meta = Meta, fields = Fields}, St) ->
+    fields_decl_to_avsc(?AVRO_RECORD, Name, Meta, Fields, St).
+
+%% Shared body for declarations that consist of a name and a field list
+%% (records and errors); only the `type' tag differs.
+fields_decl_to_avsc(TypeTag, Name, Meta, Fields, St) ->
+    Schema = #{<<"type">> => TypeTag,
+               <<"name">> => b(Name),
+               <<"fields">> => [field_to_avsc(Field, St) || Field <- Fields]},
+    meta(Schema, Meta).
+
+%% Convert a parsed `#field{}' into a field map: `name' and `type' always,
+%% `default' only when one was given, then doc/annotations from `Meta'.
+field_to_avsc(#field{name = Name, meta = Meta,
+                     type = Type, default = Default}, St) ->
+    Base = #{<<"name">> => b(Name),
+             <<"type">> => type_to_avsc(Type, St)},
+    %% TODO: maybe validate default matches type
+    WithDefault = default(Base, Default),
+    meta(WithDefault, Meta).
+
+%% Convert a parsed message declaration (`#function{}') into a message map.
+%%
+%% The result always holds the message `name', its `request' parameter list
+%% and its `response' type. Depending on the `extra' field it additionally
+%% holds `one-way' (for `oneway') or `errors' (for a `throws' clause).
+%% Doc and annotations from `Meta' are merged in last.
+message_to_avsc(#function{name = Name, meta = Meta,
+                          arguments = Args, return = Return,
+                          extra = Extra}, St) ->
+    %% TODO: arguments can just reuse `#field{}`
+    ArgsSchema =
+        [default(
+           #{<<"name">> => b(ArgName),
+             <<"type">> => type_to_avsc(Type, St)},
+           Default)
+         || {arg, ArgName, Type, Default} <- Args],
+    Schema0 =
+        #{<<"name">> => b(Name),
+          <<"request">> => ArgsSchema,
+          <<"response">> => type_to_avsc(Return, St)},
+    Schema1 = case Extra of
+                  undefined -> Schema0;
+                  oneway ->
+                      Schema0#{<<"one-way">> => true};
+                  {throws, ThrowsTypes} ->
+                      %% The Avro protocol specification names this message
+                      %% attribute "errors" (plural), not "error".
+                      Schema0#{<<"errors">> => lists:map(fun b/1, ThrowsTypes)}
+              end,
+    meta(Schema1, Meta).
+
+
+%% Translate a parsed IDL type reference into its schema form: a binary
+%% type name, a map for logical/array/map types, or a list for unions.
+%% `void' and `null' both map to ?AVRO_NULL. Any other term raises
+%% function_clause.
+type_to_avsc(void, _St) -> ?AVRO_NULL;
+type_to_avsc(null, _St) -> ?AVRO_NULL;
+%% Primitive types are spelled exactly like their atoms.
+type_to_avsc(int, _St) -> <<"int">>;
+type_to_avsc(long, _St) -> <<"long">>;
+type_to_avsc(string, _St) -> <<"string">>;
+type_to_avsc(boolean, _St) -> <<"boolean">>;
+type_to_avsc(float, _St) -> <<"float">>;
+type_to_avsc(double, _St) -> <<"double">>;
+type_to_avsc(bytes, _St) -> <<"bytes">>;
+%% Logical types: an underlying type plus a `logicalType' tag.
+type_to_avsc({decimal, Precision, Scale}, _St) ->
+    #{<<"type">> => ?AVRO_BYTES,
+      <<"logicalType">> => <<"decimal">>,
+      <<"precision">> => Precision,
+      <<"scale">> => Scale};
+type_to_avsc(date, _St) ->
+    logical(?AVRO_INT, <<"date">>);
+type_to_avsc(time_ms, _St) ->
+    logical(?AVRO_INT, <<"time-millis">>);
+type_to_avsc(timestamp_ms, _St) ->
+    logical(?AVRO_LONG, <<"timestamp-millis">>);
+%% Reference to a named type.
+type_to_avsc({custom, Id}, _St) ->
+    b(Id);
+%% Composite types recurse into their member types.
+type_to_avsc({union, Types}, St) ->
+    [type_to_avsc(Member, St) || Member <- Types];
+type_to_avsc({array, Of}, St) ->
+    Items = type_to_avsc(Of, St),
+    #{<<"type">> => ?AVRO_ARRAY, <<"items">> => Items};
+type_to_avsc({map, ValType}, St) ->
+    Values = type_to_avsc(ValType, St),
+    #{<<"type">> => ?AVRO_MAP, <<"values">> => Values}.
+
+%% Map for a parameterless logical type on top of `BaseType'.
+logical(BaseType, LogicalName) ->
+    #{<<"type">> => BaseType,
+      <<"logicalType">> => LogicalName}.
+
+%% Merge a declaration's metadata into `Schema'.
+%%
+%% `Meta' may only contain `{doc, String}' tuples and `#annotation{}'
+%% records; anything else raises function_clause. All doc strings are joined
+%% with "\n" under the `doc' key. Each annotation becomes a key named after
+%% it; an annotation whose name is already a key of the schema raises
+%% `{duplicate_annotation, Name, Value, Schema}'.
+meta(Schema, Meta) ->
+    IsDoc = fun({doc, _}) -> true;
+               (#annotation{}) -> false
+            end,
+    {Docs, Annotations} = lists:partition(IsDoc, Meta),
+    WithDoc = add_doc(Schema, Docs),
+    lists:foldl(fun add_annotation/2, WithDoc, Annotations).
+
+%% Attach the joined doc strings, or leave the schema alone if there are none.
+add_doc(Schema, []) ->
+    Schema;
+add_doc(Schema, Docs) ->
+    DocStrings = [S || {doc, S} <- Docs],
+    Schema#{<<"doc">> => b(lists:join("\n", DocStrings))}.
+
+%% Add one annotation; the value is converted before the duplicate check.
+add_annotation(#annotation{name = Name, value = Value}, Schema) ->
+    BName = b(Name),
+    BVal = annotation_value(Value),
+    case maps:is_key(BName, Schema) of
+        true -> error({duplicate_annotation, Name, Value, Schema});
+        false -> Schema#{BName => BVal}
+    end.
+
+%% An empty value becomes <<>>, a flat string becomes one binary, and
+%% anything else is treated as a list of strings.
+annotation_value([]) ->
+    <<>>;
+annotation_value([C | _] = Str) when is_integer(C) ->
+    b(Str);
+annotation_value(Strings) ->
+    [b(Str) || Str <- Strings].
+
+%% Store `Default' under the `default' key of `Obj', unless it is the atom
+%% `undefined' (meaning: no default was given), in which case `Obj' is
+%% returned untouched.
+default(Obj, Default) ->
+    case Default of
+        undefined -> Obj;
+        _ -> maps:put(<<"default">>, Default, Obj)
+    end.
+
+%% Convert a character list (chardata) to a UTF-8 binary. Only lists are
+%% accepted; anything else raises function_clause.
+b(Chars) when is_list(Chars) ->
+    unicode:characters_to_binary(Chars, unicode, unicode).
diff --git a/src/avro_idl_lexer.xrl b/src/avro_idl_lexer.xrl
@@ -0,0 +1,114 @@
+%% @doc Avro IDL lexer
+%% @end
+%% @reference See [https://avro.apache.org/docs/current/idl.html]
+%% @author Sergey Prokhorov <[email protected]>
+
+Definitions.
+
+Rules.
+
+[\s\t\n\r]+ : skip_token.
+
+%% TODO: escaped double quotes inside strings
+%% The body is `*' (not `+') so that the empty string literal is a valid token
+"[^\"]*" : {token, {string_v, TokenLine, unescape(TokenChars, $\")}}.
+
+`[^\`]+` : {token, {id, TokenLine, unescape(TokenChars, $`)}}.
+
+//[^\r\n]* : {token, {comment_v, TokenLine, unescape_line_comment(TokenChars)}}.
+
+\{ : {token, {'{', TokenLine}}.
+\} : {token, {'}', TokenLine}}.
+\( : {token, {'(', TokenLine}}.
+\) : {token, {')', TokenLine}}.
+\[ : {token, {'[', TokenLine}}.
+\] : {token, {']', TokenLine}}.
+<  : {token, {'<', TokenLine}}.
+>  : {token, {'>', TokenLine}}.
+;  : {token, {';', TokenLine}}.
+\, : {token, {',', TokenLine}}.
+
+%% Null can be in both values and primitive types
+null : {token, {null, TokenLine}}.
+
+%% Default values (json)
+= : {token, {'=', TokenLine}}.
+%% TODO: better float regexp;
+%% XXX: is it safe to use list_to_float? seems float syntax is used for decimal defaults as well
+[+-]?[0-9]+\.[0-9]+ : {token, {float_v, TokenLine, list_to_float(TokenChars)}}.
+[+-]?[0-9]+         : {token, {integer_v, TokenLine, list_to_integer(TokenChars)}}.
+true|false          : {token, {bool_v, TokenLine, list_to_atom(TokenChars)}}.
+\:                  : {token, {':', TokenLine}}.
+
+%% === Datatype IDs ===
+
+%% primitive
+int|long|string|boolean|float|double|bytes : {token, {primitive_t, TokenLine, list_to_atom(TokenChars)}}.
+
+%% complex
+record|enum|array|map|fixed|union : {token, {list_to_atom(TokenChars ++ "_t"), TokenLine}}.
+
+%% Logical
+date|time_ms|timestamp_ms : {token, {logical_t, TokenLine, list_to_atom(TokenChars)}}.
+decimal : {token, {decimal_t, TokenLine}}.
+%% keywords
+error|throws|oneway|void|import|idl|protocol|schema : {token, {list_to_atom(TokenChars ++ "_k"), TokenLine}}.
+
+%% === Constructs ===
+
+@[a-zA-Z0-9_-]+ : {token, {annotation_v, TokenLine, unescape_annotation(TokenChars)}}.
+
+[A-Za-z_][A-Za-z0-9_]* : {token, {id, TokenLine, TokenChars}}.
+%% namespaced will only be allowed in data type spec
+[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)+ : {token, {ns_id, TokenLine, TokenChars}}.
+
+%% https://blog.ostermiller.org/finding-comments-in-source-code-using-regular-expressions/
+%% `/** .. */` is a docstring for the following object
+(/\*\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/)  : {token, {doc_v, TokenLine, unescape_multiline_comment(TokenChars)}}.
+%% `/* .. */` is just a comment
+(/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/) : {token, {comment_v, TokenLine, unescape_multiline_comment(TokenChars)}}.
+
+
+Erlang code.
+-export([preprocess/2]).
+
+%% Api helpers
+
+%% @doc Apply the given preprocessing actions to a token list, one after
+%% another in list order. Supported actions are `drop_comments' and
+%% `trim_doc'.
+%% @end
+-spec preprocess(Tokens, [drop_comments | trim_doc]) -> Tokens when
+      Tokens :: [tuple()].
+preprocess(Tokens, Actions) ->
+    ApplyAction = fun(Action, Acc) -> do_preprocess(Action, Acc) end,
+    lists:foldl(ApplyAction, Tokens, Actions).
+
+%% Run a single preprocessing action over the token list.
+%%  * drop_comments: remove every `{comment_v, _, _}' token
+%%  * trim_doc: rewrite the text of every `{doc_v, _, _}' token with the
+%%    regular expression below, applied line by line (multiline mode)
+do_preprocess(drop_comments, Tokens) ->
+    [Tok || Tok <- Tokens, not is_comment_token(Tok)];
+do_preprocess(trim_doc, Tokens) ->
+    {ok, Re} = re:compile("^[\\s\\*]*((?U).*)[\\s]*$", [multiline]),
+    Trim = fun(Text) ->
+                   re:replace(Text, Re, "\\1", [global, {return, list}])
+           end,
+    [case Tok of
+         {doc_v, Loc, Val} -> {doc_v, Loc, Trim(Val)};
+         _ -> Tok
+     end || Tok <- Tokens].
+
+%% True only for 3-tuples tagged `comment_v'.
+is_comment_token({comment_v, _, _}) -> true;
+is_comment_token(_) -> false.
+
+%% Lexer internal helpers
+
+%% Remove the delimiter `Char' that encloses a quoted token (string
+%% literal or backtick identifier).
+%%
+%% Exactly one opening and one closing character are removed. The lexer
+%% rules only produce tokens of the form Char ++ Body ++ Char, so a token
+%% that does not start with `Char' is a programming error and raises
+%% function_clause.
+unescape([Char | Rest], Char) ->
+    %% Rest = Body ++ [Char]; drop just the single closing delimiter.
+    lists:droplast(Rest).
+
+%% Drop the leading "//" of a line comment token and return the rest.
+unescape_line_comment([$/, $/ | Comment]) ->
+    Comment.
+
+%% Return the text between the delimiters of a `/* .. */' comment or a
+%% `/** .. */' docstring token.
+%%
+%% The closing "*/" is removed before the opening marker is inspected.
+%% Doing it the other way round breaks on the empty comment "/**/": its
+%% closing "*/" shares the second `*' with what looks like a "/**" opener,
+%% leaving a single character to cut two from.
+unescape_multiline_comment("/*" ++ Rest) ->
+    %% Rest always ends with "*/" because the lexer rules require it.
+    Body = lists:sublist(Rest, length(Rest) - 2),
+    case Body of
+        "*" ++ Doc -> Doc;   % docstring opener "/**"
+        _ -> Body            % plain comment opener "/*"
+    end.
+
+%% Drop the leading "@" of an annotation token and return its name.
+unescape_annotation([$@ | Name]) ->
+    Name.