Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Idl parser #90

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@ out/
_build
rebar.lock
*.crashdump
src/avro_idl_lexer.erl
src/avro_idl_parser.erl
1 change: 1 addition & 0 deletions include/erlavro.hrl
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
-define(AVRO_MAP, <<"map">>).
-define(AVRO_UNION, <<"union">>).
-define(AVRO_FIXED, <<"fixed">>).
-define(AVRO_ERROR, <<"error">>). % idl

-define(IS_AVRO_PRIMITIVE_NAME(N),
(N =:= ?AVRO_NULL orelse
Expand Down
6 changes: 4 additions & 2 deletions rebar.config
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
%% -*- mode:erlang -*-
{erl_opts, [ debug_info
, warnings_as_errors
% , warnings_as_errors
, {d,'NOTEST'}
]}.
{eunit_opts, [verbose]}.
%% {eunit_opts, [verbose]}.
{xref_checks, [ undefined_function_calls
, deprecated_function_calls
]}.
Expand All @@ -15,3 +15,5 @@
{cover_opts, [verbose]}.
{cover_enabled, true}.
{cover_export_enabled, true}.

{yrl_opts, [{verbose, true}]}.
9 changes: 6 additions & 3 deletions src/avro.erl
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,8 @@
-type crc64_fingerprint() :: avro_fingerprint:crc64().

%% @doc Decode JSON format avro schema into `erlavro' internals.
-spec decode_schema(binary()) -> avro_type().
%% @param JSON: JSON binary or erlang `map()' json representation
-spec decode_schema(binary() | map() | [map()]) -> avro_type().
decode_schema(JSON) -> avro_json_decoder:decode_schema(JSON).

%% @doc Make type lookup function from type definition.
Expand Down Expand Up @@ -176,7 +177,8 @@ make_lkup_fun(AssignedName, Type) ->
%% * allow_type_redefine: `boolean()'
%% This option is to allow one type being defined more than once.
%% @end
-spec decode_schema(binary(), proplists:proplist()) -> avro_type().
-spec decode_schema(binary() | map() | [map()], proplists:proplist()) ->
avro_type().
decode_schema(JSON, Options) ->
avro_json_decoder:decode_schema(JSON, Options).

Expand Down Expand Up @@ -269,7 +271,8 @@ make_decoder(Schema, Options) ->
%% takes only one `binary()' input arg.
-spec make_simple_decoder(avro_type() | binary(), codec_options()) ->
simple_decoder().
make_simple_decoder(JSON, Options) when is_binary(JSON) ->
make_simple_decoder(JSON, Options) when is_binary(JSON);
is_map(JSON) ->
make_simple_decoder(decode_schema(JSON), Options);
make_simple_decoder(Type, Options) when ?IS_TYPE_RECORD(Type) ->
Lkup = make_lkup_fun(Type),
Expand Down
210 changes: 210 additions & 0 deletions src/avro_idl.erl
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
%%% @doc APIs to work with Avro IDL format
%%%
%%% This module allows converting the .avdl format to .avpr and .avsc, as
%%% well as creating Avro encoders and decoders.
%%% @end
%%% @reference See [https://avro.apache.org/docs/current/idl.html]
%%% @author Sergey Prokhorov <[email protected]>
-module(avro_idl).

-export([decode_schema/2]).
-export([new_context/1,
str_to_avpr/2,
protocol_to_avpr/2,
typedecl_to_avsc/2]).
-include("idl.hrl").
-include("erlavro.hrl").

-record(st, {cwd}).

%% @doc Parse Avro IDL source and decode its type declarations into
%% `erlavro' types.
%%
%% The IDL text is converted to a protocol map with `str_to_avpr/2'.
%% Declarations whose `type' is `error' are dropped, and every remaining
%% declaration lacking a `namespace' key receives the protocol namespace
%% (`?AVRO_NS_GLOBAL' when the protocol has none). The resulting list is
%% passed to `avro:decode_schema/2' with `ignore_bad_default_values' enabled.
decode_schema(SchemaStr, Cwd) ->
  #{<<"types">> := AllTypes} = Avpr = str_to_avpr(SchemaStr, Cwd),
  IsNotError = fun(#{<<"type">> := TypeName}) -> TypeName =/= <<"error">> end,
  DataTypes = lists:filter(IsNotError, AllTypes),
  ProtocolNs = maps:get(<<"namespace">>, Avpr, ?AVRO_NS_GLOBAL),
  %% maps:merge/2 prefers values from its second argument, so a namespace
  %% declared on the type itself always wins over the protocol-level one.
  Qualified = [maps:merge(#{<<"namespace">> => ProtocolNs}, Type)
               || Type <- DataTypes],
  avro:decode_schema(Qualified, [{ignore_bad_default_values, true}]).

%% @doc Create a conversion context (`#st{}') holding `Cwd' in its `cwd'
%% field.
new_context(Cwd) -> #st{cwd = Cwd}.

%% @doc Convert Avro IDL source text to a protocol map, using the default
%% token preprocessing actions `drop_comments' and `trim_doc'.
str_to_avpr(String, Cwd) ->
  DefaultActions = [drop_comments, trim_doc],
  str_to_avpr(String, Cwd, DefaultActions).

%% Lex, preprocess and parse IDL source, then convert the parsed tree with
%% protocol_to_avpr/2 using a fresh context for Cwd.
%% Opts is the list of actions handed to avro_idl_lexer:preprocess/2.
%% A lexer or parser result other than an ok-tuple fails with badmatch.
str_to_avpr(String, Cwd, Opts) ->
  {ok, RawTokens, _} = avro_idl_lexer:string(String),
  Tokens = avro_idl_lexer:preprocess(RawTokens, Opts),
  {ok, ProtocolTree} = avro_idl_parser:parse(Tokens),
  Context = new_context(Cwd),
  protocol_to_avpr(ProtocolTree, Context).

%% @doc Convert a parsed `#protocol{}' tree to a protocol map.
%%
%% Import definitions are removed by `process_imports/2'. The remaining
%% definitions are split into `#function{}' entries, converted with
%% `message_to_avsc/2' and stored under `messages', and everything else,
%% converted with `typedecl_to_avsc/2' and stored under `types'. The
%% protocol's own docs and annotations are applied last by `meta/2'.
protocol_to_avpr(#protocol{name = Name, meta = Meta, definitions = Defs0},
                 St) ->
  Defs = process_imports(Defs0, St),
  IsTypeDecl = fun(#function{}) -> false;
                  (_) -> true
               end,
  {TypeDecls, Functions} = lists:partition(IsTypeDecl, Defs),
  ProtocolName = b(Name),
  TypeSchemas = [typedecl_to_avsc(Decl, St) || Decl <- TypeDecls],
  MessageSchemas = [message_to_avsc(Function, St) || Function <- Functions],
  Avpr = #{<<"protocol">> => ProtocolName,
           <<"types">> => TypeSchemas,
           <<"messages">> => MessageSchemas},
  meta(Avpr, Meta).

%% Drop every {import, _, _} definition and keep the rest in their original
%% order. Imports are not resolved yet; the context argument is unused.
process_imports(Defs, _St) ->
  %% TODO
  %% https://avro.apache.org/docs/1.9.2/spec.html#names
  %% when importing definitions from avdl or avpr, copy namespaces from
  %% protocol to definitions, if not specified
  IsImport = fun({import, _, _}) -> true;
                (_) -> false
             end,
  {_Imports, Remaining} = lists:partition(IsImport, Defs),
  Remaining.

%% @doc Convert a parsed named-type declaration to its schema map.
%%
%% Handles `#enum{}', `#fixed{}', `#error{}' and `#record{}'; any other term
%% fails with `function_clause'. Names and enum symbols are converted to
%% binaries, fields go through `field_to_avsc/2', and the declaration's
%% docs and annotations are merged in by `meta/2'.
typedecl_to_avsc(#enum{name = Name, meta = Meta, variants = Vars}, _St) ->
  EnumName = b(Name),
  Symbols = [b(Var) || Var <- Vars],
  Enum = #{<<"type">> => ?AVRO_ENUM,
           <<"name">> => EnumName,
           <<"symbols">> => Symbols},
  meta(Enum, Meta);
typedecl_to_avsc(#fixed{name = Name, meta = Meta, size = Size}, _St) ->
  Fixed = #{<<"type">> => ?AVRO_FIXED,
            <<"name">> => b(Name),
            <<"size">> => Size},
  meta(Fixed, Meta);
typedecl_to_avsc(#error{name = Name, meta = Meta, fields = Fields}, St) ->
  ErrorName = b(Name),
  FieldSchemas = lists:map(fun(Field) -> field_to_avsc(Field, St) end, Fields),
  Error = #{<<"type">> => ?AVRO_ERROR,
            <<"name">> => ErrorName,
            <<"fields">> => FieldSchemas},
  meta(Error, Meta);
typedecl_to_avsc(#record{name = Name, meta = Meta, fields = Fields}, St) ->
  RecordName = b(Name),
  FieldSchemas = lists:map(fun(Field) -> field_to_avsc(Field, St) end, Fields),
  Record = #{<<"type">> => ?AVRO_RECORD,
             <<"name">> => RecordName,
             <<"fields">> => FieldSchemas},
  meta(Record, Meta).

%% Convert a parsed #field{} to a schema field map with "name" and "type"
%% keys, plus "default" when the field declares one (see default/2), then
%% apply the field's docs and annotations via meta/2.
field_to_avsc(#field{name = Name, meta = Meta,
                     type = Type, default = Default}, St) ->
  FieldName = b(Name),
  FieldType = type_to_avsc(Type, St),
  Base = #{<<"name">> => FieldName, <<"type">> => FieldType},
  %% TODO: maybe validate default matches type
  WithDefault = default(Base, Default),
  meta(WithDefault, Meta).

%% Convert a parsed #function{} (a protocol message) to its schema map.
%%
%% The result has "name", "request" (one map per argument, carrying
%% "default" only when the argument declares one) and "response" keys.
%% Depending on the extra field:
%%   undefined             - nothing is added;
%%   oneway                - "one-way" => true is added;
%%   {throws, ThrowsTypes} - "errors" is added with the declared error type
%%                           names as binaries.
%% The Avro protocol specification names the declared-errors key "errors"
%% (plural), so that spelling is emitted here.
%% Docs and annotations from the meta field are applied last via meta/2.
message_to_avsc(#function{name = Name, meta = Meta,
                          arguments = Args, return = Return,
                          extra = Extra}, St) ->
  %% TODO: arguments can just reuse `#field{}`
  ArgsSchema =
    [default(
       #{<<"name">> => b(ArgName),
         <<"type">> => type_to_avsc(Type, St)},
       Default)
     || {arg, ArgName, Type, Default} <- Args],
  Schema0 =
    #{<<"name">> => b(Name),
      <<"request">> => ArgsSchema,
      <<"response">> => type_to_avsc(Return, St)},
  Schema1 =
    case Extra of
      undefined ->
        Schema0;
      oneway ->
        Schema0#{<<"one-way">> => true};
      {throws, ThrowsTypes} ->
        Schema0#{<<"errors">> => lists:map(fun b/1, ThrowsTypes)}
    end,
  meta(Schema1, Meta).


%% Convert a parsed type reference to its schema representation:
%%   void / null             - the null type name;
%%   primitive type atoms    - the atom's name as a binary;
%%   {decimal, P, S}, date,
%%   time_ms, timestamp_ms   - a map with "type" and "logicalType";
%%   {custom, Id}            - the referenced name as a binary;
%%   {union, Types}          - a list of converted member types;
%%   {array, T} / {map, T}   - a map with "items" / "values".
%% Any other term fails with function_clause.
type_to_avsc(T, _St) when T =:= void orelse T =:= null ->
  ?AVRO_NULL;
type_to_avsc(T, _St) when T =:= int orelse
                          T =:= long orelse
                          T =:= string orelse
                          T =:= boolean orelse
                          T =:= float orelse
                          T =:= double orelse
                          T =:= bytes ->
  atom_to_binary(T, utf8);
type_to_avsc({decimal, Precision, Scale}, _St) ->
  #{<<"type">> => ?AVRO_BYTES,
    <<"logicalType">> => <<"decimal">>,
    <<"precision">> => Precision,
    <<"scale">> => Scale};
type_to_avsc(date, _St) ->
  #{<<"type">> => ?AVRO_INT, <<"logicalType">> => <<"date">>};
type_to_avsc(time_ms, _St) ->
  #{<<"type">> => ?AVRO_INT, <<"logicalType">> => <<"time-millis">>};
type_to_avsc(timestamp_ms, _St) ->
  #{<<"type">> => ?AVRO_LONG, <<"logicalType">> => <<"timestamp-millis">>};
type_to_avsc({custom, Id}, _St) ->
  b(Id);
type_to_avsc({union, Types}, St) ->
  [type_to_avsc(Member, St) || Member <- Types];
type_to_avsc({array, Of}, St) ->
  Items = type_to_avsc(Of, St),
  #{<<"type">> => ?AVRO_ARRAY, <<"items">> => Items};
type_to_avsc({map, ValType}, St) ->
  Values = type_to_avsc(ValType, St),
  #{<<"type">> => ?AVRO_MAP, <<"values">> => Values}.

%% Merge parsed metadata into a schema map.
%%
%% Meta is a list of {doc, String} tuples and #annotation{} records; any
%% other element fails with function_clause. All doc strings are joined with
%% newlines and stored under "doc". Each annotation is stored under its name
%% as a binary key: an empty value becomes an empty binary, a character list
%% becomes one binary, and anything else is treated as a list of strings and
%% becomes a list of binaries.
%% Raises {duplicate_annotation, Name, Value, Schema} when an annotation
%% name is already a key of the schema built so far.
meta(Schema, Meta) ->
  IsDoc = fun({doc, _}) -> true;
             (#annotation{}) -> false
          end,
  {Docs, Annotations} = lists:partition(IsDoc, Meta),
  WithDoc =
    case [Text || {doc, Text} <- Docs] of
      [] ->
        Schema;
      DocStrings ->
        Schema#{<<"doc">> => b(lists:join("\n", DocStrings))}
    end,
  AddAnnotation =
    fun(#annotation{name = Name, value = Value}, Acc) ->
        Key = b(Name),
        Encoded =
          case Value of
            [] -> <<>>;
            [C | _] when is_integer(C) -> b(Value);
            _ -> [b(Str) || Str <- Value]
          end,
        case maps:is_key(Key, Acc) of
          true -> error({duplicate_annotation, Name, Value, Acc});
          false -> Acc#{Key => Encoded}
        end
    end,
  lists:foldl(AddAnnotation, WithDoc, Annotations).

%% Store Default under the "default" key of Obj, unless Default is the atom
%% undefined, in which case Obj is returned unchanged.
default(Obj, Default) ->
  case Default of
    undefined -> Obj;
    _ -> maps:put(<<"default">>, Default, Obj)
  end.

%% Convert a character list (flat or nested chardata) to a binary using
%% unicode:characters_to_binary/1. Non-list arguments fail with
%% function_clause.
b(Chars) when is_list(Chars) ->
  unicode:characters_to_binary(Chars).
114 changes: 114 additions & 0 deletions src/avro_idl_lexer.xrl
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
%% @doc Avro IDL lexer
%% @end
%% @reference See [https://avro.apache.org/docs/current/idl.html]
%% @author Sergey Prokhorov <[email protected]>

Definitions.

Rules.

[\s\t\n\r]+ : skip_token.

%% Double-quoted string literal. The body uses `*' rather than `+' so that
%% the empty literal (two adjacent double quotes) is lexed as an empty
%% string_v token instead of being rejected by the lexer.
%% TODO: escaped double quotes inside strings
"[^\"]*" : {token, {string_v, TokenLine, unescape(TokenChars, $\")}}.

`[^\`]+` : {token, {id, TokenLine, unescape(TokenChars, $`)}}.

//[^\r\n]* : {token, {comment_v, TokenLine, unescape_line_comment(TokenChars)}}.

\{ : {token, {'{', TokenLine}}.
\} : {token, {'}', TokenLine}}.
\( : {token, {'(', TokenLine}}.
\) : {token, {')', TokenLine}}.
\[ : {token, {'[', TokenLine}}.
\] : {token, {']', TokenLine}}.
< : {token, {'<', TokenLine}}.
> : {token, {'>', TokenLine}}.
; : {token, {';', TokenLine}}.
\, : {token, {',', TokenLine}}.

%% Null can be in both values and primitive types
null : {token, {null, TokenLine}}.

%% Default values (json)
= : {token, {'=', TokenLine}}.
%% TODO: better float regexp;
%% XXX: is it safe to use list_to_float? seems float syntax is used for decimal defaults as well
[+-]?[0-9]+\.[0-9]+ : {token, {float_v, TokenLine, list_to_float(TokenChars)}}.
[+-]?[0-9]+ : {token, {integer_v, TokenLine, list_to_integer(TokenChars)}}.
true|false : {token, {bool_v, TokenLine, list_to_atom(TokenChars)}}.
\: : {token, {':', TokenLine}}.

%% === Datatype IDs ===

%% primitive
int|long|string|boolean|float|double|bytes : {token, {primitive_t, TokenLine, list_to_atom(TokenChars)}}.

%% complex
record|enum|array|map|fixed|union : {token, {list_to_atom(TokenChars ++ "_t"), TokenLine}}.

%% Logical
date|time_ms|timestamp_ms : {token, {logical_t, TokenLine, list_to_atom(TokenChars)}}.
decimal : {token, {decimal_t, TokenLine}}.
%% keywords
error|throws|oneway|void|import|idl|protocol|schema : {token, {list_to_atom(TokenChars ++ "_k"), TokenLine}}.

%% === Constructs ===

@[a-zA-Z0-9_-]+ : {token, {annotation_v, TokenLine, unescape_annotation(TokenChars)}}.

[A-Za-z_][A-Za-z0-9_]* : {token, {id, TokenLine, TokenChars}}.
%% namespaced will only be allowed in data type spec
[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)+ : {token, {ns_id, TokenLine, TokenChars}}.

%% https://blog.ostermiller.org/finding-comments-in-source-code-using-regular-expressions/
%% `/** .. */` is a docstring for the following object
(/\*\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/) : {token, {doc_v, TokenLine, unescape_multiline_comment(TokenChars)}}.
%% `/* .. */` is just a comment
(/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/) : {token, {comment_v, TokenLine, unescape_multiline_comment(TokenChars)}}.


Erlang code.
-export([preprocess/2]).

%% Api helpers

%% Apply each preprocessing action to the token list, in the order given.
%% The output of one action is the input of the next; an empty action list
%% returns the tokens unchanged.
-spec preprocess(Tokens, [drop_comments | trim_doc]) -> Tokens when
    Tokens :: [tuple()].
preprocess(Tokens, []) ->
    Tokens;
preprocess(Tokens, [Action | Rest]) ->
    preprocess(do_preprocess(Action, Tokens), Rest).

%% Apply a single preprocessing action to a token list.
%%
%% drop_comments removes every {comment_v, _, _} token.
%% trim_doc rewrites the text of every {doc_v, Loc, Text} token: on each
%% line the leading run of whitespace and `*' characters and the trailing
%% whitespace are removed. Other tokens pass through untouched.
%%
%% The pattern is compiled with the `unicode' option because token text is
%% a list of code points; without it re:replace/4 raises badarg as soon as
%% a doc comment contains a character above 255.
do_preprocess(drop_comments, T) ->
    lists:filter(
      fun({comment_v, _, _}) -> false;
         (_) -> true
      end, T);
do_preprocess(trim_doc, T) ->
    {ok, Re} = re:compile("^[\\s\\*]*((?U).*)[\\s]*$", [multiline, unicode]),
    lists:map(
      fun({doc_v, Loc, Val}) ->
              Stripped = re:replace(Val, Re, "\\1",
                                    [global, {return, list}]),
              {doc_v, Loc, Stripped};
         (Tok) -> Tok
      end, T).

%% Lexer internal helpers

%% Remove the enclosing delimiter Char from a quoted token, i.e. exactly one
%% Char at the start and one at the end.
%%
%% Only the outer pair is removed, so a delimiter character that is part of
%% the token body is preserved (the obsolete string:strip/3 removed every
%% leading and trailing occurrence). A token that does not start and end
%% with Char is a lexer bug and fails with function_clause or badmatch.
unescape([Char | Rest], Char) ->
    [Char | RevBody] = lists:reverse(Rest),
    lists:reverse(RevBody).

%% Return the text of a line comment without its leading "//" marker.
unescape_line_comment([$/, $/ | Text]) ->
    Text.

%% Return the body of a block comment without its delimiters.
%%
%% The closing "*/" is removed first and only then the opening marker:
%% "/**" when the remaining text still starts with a second `*', otherwise
%% "/*". Doing it in this order makes the empty comment "/**/" yield an
%% empty body; matching "/**" up front left a one-character remainder and
%% asked lists:sublist/2 for a negative length, which crashed the lexer.
unescape_multiline_comment("/*" ++ Rest) ->
    %% Every token handed over by the lexer rules ends with "*/".
    Body = lists:sublist(Rest, length(Rest) - 2),
    case Body of
        "*" ++ DocBody -> DocBody;
        _ -> Body
    end.

%% Return an annotation token's name without its leading at-sign.
unescape_annotation([$@ | Name]) ->
    Name.
Loading