Skip to content

Commit

Permalink
hotfix: get rid of the path for fully qualified names.
Browse files Browse the repository at this point in the history
This is not complete normalization feature but rather more a fix for the fully qualified paths
vs simple name references. This fix not manage the various way a type can be
expressed with a partial path.

Long explanation below:

If we accept the specifications from buf as correct, we can look at how a type
reference is defined here: https://protobuf.com/docs/language-spec#type-references.

The main problem with the previous implementation is that the current parser can't tell
if a fully-qualified type reference in dot notation is the same as one identified by name
alone aka simple name notation (https://protobuf.com/docs/language-spec#fully-qualified-references).

The fix do not consider all the different ways users can define a relative reference, schemas
 with different way of expressing a relative reference even if normalized,
 for now will keep being considered different (https://protobuf.com/docs/language-spec#relative-references).

Right now, our logic removes the `.package_name` (+ the Message scope) part
 before comparing field modifications (in fact it re-writes the schema using the simple name notation).

Even though the TypeTree (trie data structure) could help resolve
relative references (https://protobuf.com/docs/language-spec#relative-references),
we don't want to add this feature in the python implementation now because it might cause new bugs due to the
non-trivial behaviour of the protoc compiler.

We plan to properly normalize the protobuf schemas later.

We'll use protobuf descriptors (after the compilation and linking step)
to gather type references already resolved, and we will threaten all the protobuf
using always the fully qualified names.
Properly handling all path variations means reimplementing the protoc compiler behavior,
we prefer relying on the already processed proto descriptor.

So, for now, we'll only implement a normalization for the fully-qualified references
in dot notation and by simple name alone
  • Loading branch information
eliax1996 committed Jul 3, 2024
1 parent c6a0948 commit 0916518
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 16 deletions.
32 changes: 20 additions & 12 deletions karapace/protobuf/proto_normalizations.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,15 @@ class NormalizedOneOfElement(OneOfElement):
groups: Sequence[NormalizedGroupElement]


def type_field_element_with_sorted_options(type_field: FieldElement) -> NormalizedFieldElement:
def type_field_element_with_sorted_options(type_field: FieldElement, scope: str) -> NormalizedFieldElement:
sorted_options = None if type_field.options is None else list(sorted(type_field.options, key=sort_by_name))
field_type_normalized = (type_field.element_type
.removeprefix(".") # getting rid of the optional `.` before the package name
.removeprefix(f"{scope}."))
return NormalizedFieldElement(
location=type_field.location,
label=type_field.label,
element_type=type_field.element_type,
element_type=field_type_normalized,
name=type_field.name,
default_value=type_field.default_value,
json_name=type_field.json_name,
Expand All @@ -119,13 +122,14 @@ def enum_constant_element_with_sorted_options(enum_constant: EnumConstantElement
)


def enum_element_with_sorted_options(enum_element: EnumElement) -> NormalizedEnumElement:
def enum_element_with_sorted_options(enum_element: EnumElement, scope: str) -> NormalizedEnumElement:
sorted_options = None if enum_element.options is None else list(sorted(enum_element.options, key=sort_by_name))
constants_with_sorted_options = (
None
if enum_element.constants is None
else [enum_constant_element_with_sorted_options(constant) for constant in enum_element.constants]
)
# todo: strip scope.
return NormalizedEnumElement(
location=enum_element.location,
name=enum_element.name,
Expand Down Expand Up @@ -163,10 +167,11 @@ def one_ofs_with_sorted_options(one_ofs: OneOfElement) -> NormalizedOneOfElement
)


def message_element_with_sorted_options(message_element: MessageElement) -> NormalizedMessageElement:
def message_element_with_sorted_options(message_element: MessageElement, scope: str) -> NormalizedMessageElement:
message_scope = f"{scope}.{message_element.name}"
sorted_options = None if message_element.options is None else list(sorted(message_element.options, key=sort_by_name))
sorted_nested_types = [type_element_with_sorted_options(nested_type) for nested_type in message_element.nested_types]
sorted_fields = [type_field_element_with_sorted_options(field) for field in message_element.fields]
sorted_nested_types = [type_element_with_sorted_options(nested_type, scope) for nested_type in message_element.nested_types]
sorted_fields = [type_field_element_with_sorted_options(field, scope) for field in message_element.fields]
sorted_one_ofs = [one_ofs_with_sorted_options(one_of) for one_of in message_element.one_ofs]

return NormalizedMessageElement(
Expand All @@ -183,25 +188,28 @@ def message_element_with_sorted_options(message_element: MessageElement) -> Norm
)


def type_element_with_sorted_options(type_element: TypeElement) -> NormalizedTypeElement:
def type_element_with_sorted_options(type_element: TypeElement, scope: str) -> NormalizedTypeElement:
sorted_nested_types: list[TypeElement] = []

parent_name = type_element.name
message_scope = f"{scope}.{parent_name}"

for nested_type in type_element.nested_types:
if isinstance(nested_type, EnumElement):
sorted_nested_types.append(enum_element_with_sorted_options(nested_type))
sorted_nested_types.append(enum_element_with_sorted_options(nested_type, message_scope))
elif isinstance(nested_type, MessageElement):
sorted_nested_types.append(message_element_with_sorted_options(nested_type))
sorted_nested_types.append(message_element_with_sorted_options(nested_type, message_scope))
else:
raise ValueError("Unknown type element") # tried with assert_never but it did not work

# doing it here since the subtypes do not declare the nested_types property
type_element.nested_types = sorted_nested_types

if isinstance(type_element, EnumElement):
return enum_element_with_sorted_options(type_element)
return enum_element_with_sorted_options(type_element, scope)

if isinstance(type_element, MessageElement):
return message_element_with_sorted_options(type_element)
return message_element_with_sorted_options(type_element, scope)

raise ValueError("Unknown type element") # tried with assert_never but it did not work

Expand Down Expand Up @@ -251,7 +259,7 @@ def service_element_with_sorted_options(service_element: ServiceElement) -> Norm

def normalize(proto_file_element: ProtoFileElement) -> NormalizedProtoFileElement:
sorted_types: Sequence[NormalizedTypeElement] = [
type_element_with_sorted_options(type_element) for type_element in proto_file_element.types
type_element_with_sorted_options(type_element, proto_file_element.package_name) for type_element in proto_file_element.types
]
sorted_options = list(sorted(proto_file_element.options, key=sort_by_name))
sorted_services: Sequence[NormalizedServiceElement] = [
Expand Down
85 changes: 81 additions & 4 deletions tests/unit/protobuf/test_protobuf_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,16 @@
Copyright (c) 2024 Aiven Ltd
See LICENSE for details
"""
from typing import Final

from karapace.protobuf.compare_result import CompareResult
from karapace.protobuf.location import Location
from karapace.protobuf.location import Location, DEFAULT_LOCATION
from karapace.protobuf.proto_normalizations import normalize
from karapace.protobuf.proto_parser import ProtoParser

import pytest

location: Location = Location("some/folder", "file.proto")
LOCATION: Final[Location] = Location("somefolder", "file.proto")


# this would be a good case for using a property based test with a well-formed message generator
Expand Down Expand Up @@ -511,10 +513,85 @@
),
)
def test_differently_ordered_options_normalizes_equally(ordered_schema: str, unordered_schema: str) -> None:
ordered_proto = ProtoParser.parse(location, ordered_schema)
unordered_proto = ProtoParser.parse(location, unordered_schema)
ordered_proto = ProtoParser.parse(LOCATION, ordered_schema)
unordered_proto = ProtoParser.parse(LOCATION, unordered_schema)

result = CompareResult()
normalize(ordered_proto).compare(normalize(unordered_proto), result)
assert result.is_compatible()
assert normalize(ordered_proto).to_schema() == normalize(unordered_proto).to_schema()



PROTO_WITH_FULLY_QUALIFIED_PATHS = """\
syntax = "proto3";
package my.awesome.customer.v1;
import "my/awesome/customer/v1/nested_value.proto";
import "google/protobuf/timestamp.proto";
option csharp_namespace = "my.awesome.customer.V1";
option go_package = "github.com/customer/api/my/awesome/customer/v1;dspv1";
option java_multiple_files = true;
option java_outer_classname = "EventValueProto";
option java_package = "com.my.awesome.customer.v1";
option objc_class_prefix = "TDD";
option php_metadata_namespace = "My\\Awesome\\Customer\\V1";
option php_namespace = "My\\Awesome\\Customer\\V1";
option ruby_package = "My::Awesome::Customer::V1";
message EventValue {
.my.awesome.customer.v1.NestedValue nested_value = 1;
.google.protobuf.Timestamp created_at = 2;
}
"""



PROTO_WITH_SIMPLE_NAMES = """\
syntax = "proto3";
package my.awesome.customer.v1;
import "my/awesome/customer/v1/nested_value.proto";
import "google/protobuf/timestamp.proto";
option csharp_namespace = "my.awesome.customer.V1";
option go_package = "github.com/customer/api/my/awesome/customer/v1;dspv1";
option java_multiple_files = true;
option java_outer_classname = "EventValueProto";
option java_package = "com.my.awesome.customer.v1";
option objc_class_prefix = "TDD";
option php_metadata_namespace = "My\\Awesome\\Customer\\V1";
option php_namespace = "My\\Awesome\\Customer\\V1";
option ruby_package = "My::Awesome::Customer::V1";
message EventValue {
NestedValue nested_value = 1;
google.protobuf.Timestamp created_at = 2;
}"""


def test_full_path_and_simple_names_are_equal() -> None:
"""
If we consider the specifications provided by buf as valid, we can read the definition of a type reference there: https://protobuf.com/docs/language-spec#type-references.
The current issue in the previous definition its the inability of the current parser to determine the equivalente between a fully-qualified type reference (in a dot notation) and a fully-qualified reference identified
by the name alone. (https://protobuf.com/docs/language-spec#fully-qualified-references).
The current test do not assert anything about the various equal definition that a user can use to define a relative reference (https://protobuf.com/docs/language-spec#relative-references).
The current resolution logic its to strip away the `.package_name` reference from the definition before starting the comparison for the field modifications.
Even if the `TypeTree` (trie datastructure) can enable the implementation of the relative reference algorithm resolution (https://protobuf.com/docs/language-spec#relative-references).
We wanna avoid implementing such a feature due to the risk of introducing new bugs in the current implementation.
The plan its to introduce a step of normalization for the protobuf schemas later, we will use the profile descriptors to gather for each type the reference
and once obtained the fully qualified type reference directly from the proto descriptor the "lingua franca" of the protobuf format.
Implementing properly all the variations of the path resolution means implementing the same behaviour of the protoc compiler, the mechanism should rely
on an already processed entity as the proto descriptor.
Due to this the current solution its to implement "just" the fully-qualified in dot notation and the fully-qualified reference identified by the name alone.
"""

fully_qualitifed_simple_name_notation = ProtoParser.parse(LOCATION, PROTO_WITH_SIMPLE_NAMES)
fully_qualitifed_dot_notation = ProtoParser.parse(LOCATION, PROTO_WITH_FULLY_QUALIFIED_PATHS)
result = CompareResult()
normalize(fully_qualitifed_dot_notation).compare(normalize(fully_qualitifed_simple_name_notation), result)
assert result.is_compatible()
assert normalize(fully_qualitifed_dot_notation).to_schema() == normalize(fully_qualitifed_simple_name_notation).to_schema()

0 comments on commit 0916518

Please sign in to comment.