Added support to other naming strategy, refactored different unrelate…

…d stuff and added a couple of tests
Aiven-Open · Nov 3, 2023 · af7f346 · af7f346
1 parent 101d69e
commit af7f346
Show file tree

Hide file tree

Showing 9 changed files with 170 additions and 98 deletions.
diff --git a/README.rst b/README.rst
@@ -460,7 +460,7 @@ Keys to take special care are the ones needed to configure Kafka and advertised_
      - ``runtime``
      - Runtime directory for the ``protoc`` protobuf schema parser and code generator
    * - ``name_strategy``
-     - ``topic_name``
+     - ``topic_name``, ``record_name``, ``topic_record_name``
      - Name strategy to use when storing schemas from the kafka rest proxy service
    * - ``name_strategy_validation``
      - ``true``

diff --git a/karapace/config.py b/karapace/config.py
@@ -158,19 +158,31 @@ class InvalidConfiguration(Exception):
     pass
 
 
+class StrEnum(str, Enum):
+    def __str__(self) -> str:
+        return str(self.value)
+
+
 @unique
 class ElectionStrategy(Enum):
     highest = "highest"
     lowest = "lowest"
 
 
 @unique
-class NameStrategy(Enum):
+class NameStrategy(StrEnum):
     topic_name = "topic_name"
     record_name = "record_name"
     topic_record_name = "topic_record_name"
 
 
+@unique
+class SubjectType(StrEnum):
+    key = "key"
+    value = "value"
+    partition = "partition"
+
+
 def parse_env_value(value: str) -> str | int | bool:
     # we only have ints, strings and bools in the config
     try:
@@ -273,8 +285,10 @@ def validate_config(config: Config) -> None:
     try:
         NameStrategy(name_strategy)
     except ValueError:
-        valid_strategies = [strategy.value for strategy in NameStrategy]
-        raise InvalidConfiguration(f"Invalid name strategy: {name_strategy}, valid values are {valid_strategies}") from None
+        valid_strategies = list(NameStrategy)
+        raise InvalidConfiguration(
+            f"Invalid default name strategy: {name_strategy}, valid values are {valid_strategies}"
+        ) from None
 
     if config["rest_authorization"] and config["sasl_bootstrap_uri"] is None:
         raise InvalidConfiguration(

diff --git a/karapace/kafka_rest_apis/__init__.py b/karapace/kafka_rest_apis/__init__.py
@@ -13,7 +13,7 @@
     TopicAuthorizationFailedError,
     UnknownTopicOrPartitionError,
 )
-from karapace.config import Config, create_client_ssl_context
+from karapace.config import Config, create_client_ssl_context, NameStrategy, SubjectType
 from karapace.errors import InvalidSchema
 from karapace.kafka_rest_apis.admin import KafkaRestAdminClient
 from karapace.kafka_rest_apis.authentication import (
@@ -28,7 +28,13 @@
 from karapace.rapu import HTTPRequest, JSON_CONTENT_TYPE
 from karapace.schema_models import TypedSchema, ValidatedTypedSchema
 from karapace.schema_type import SchemaType
-from karapace.serialization import InvalidMessageSchema, InvalidPayload, SchemaRegistrySerializer, SchemaRetrievalError
+from karapace.serialization import (
+    get_subject_name,
+    InvalidMessageSchema,
+    InvalidPayload,
+    SchemaRegistrySerializer,
+    SchemaRetrievalError,
+)
 from karapace.typing import SchemaId, Subject
 from karapace.utils import convert_to_int, json_encode, KarapaceKafkaClient
 from typing import Callable, Dict, List, Optional, Tuple, Union
@@ -39,7 +45,7 @@
 import logging
 import time
 
-RECORD_KEYS = ["key", "value", "partition"]
+SUBJECT_VALID_POSTFIX = [SubjectType.key, SubjectType.value]
 PUBLISH_KEYS = {"records", "value_schema", "value_schema_id", "key_schema", "key_schema_id"}
 RECORD_CODES = [42201, 42202]
 KNOWN_FORMATS = {"json", "avro", "protobuf", "binary"}
@@ -439,6 +445,7 @@ def __init__(
 
         self._async_producer_lock = asyncio.Lock()
         self._async_producer: Optional[AIOKafkaProducer] = None
+        self.naming_strategy = NameStrategy(self.config["name_strategy"])
 
     def __str__(self) -> str:
         return f"UserRestProxy(username={self.config['sasl_plain_username']})"
@@ -759,7 +766,7 @@ async def get_schema_id(
         self,
         data: dict,
         topic: str,
-        prefix: str,
+        subject_type: SubjectType,
         schema_type: SchemaType,
     ) -> SchemaId:
         """
@@ -770,21 +777,27 @@ async def get_schema_id(
         """
         log.debug("[resolve schema id] Retrieving schema id for %r", data)
         schema_id: Union[SchemaId, None] = (
-            SchemaId(int(data[f"{prefix}_schema_id"])) if f"{prefix}_schema_id" in data else None
+            SchemaId(int(data[f"{subject_type}_schema_id"])) if f"{subject_type}_schema_id" in data else None
         )
-        schema_str = data.get(f"{prefix}_schema")
+        schema_str = data.get(f"{subject_type}_schema")
 
         if schema_id is None and schema_str is None:
             raise InvalidSchema()
 
         if schema_id is None:
             parsed_schema = ValidatedTypedSchema.parse(schema_type, schema_str)
-            subject_name = self.serializer.get_subject_name(topic, parsed_schema, prefix, schema_type)
+
+            subject_name = get_subject_name(
+                topic,
+                parsed_schema,
+                subject_type,
+                self.naming_strategy,
+            )
             schema_id = await self._query_schema_id_from_cache_or_registry(parsed_schema, schema_str, subject_name)
         else:
 
             def subject_not_included(schema: TypedSchema, subjects: List[Subject]) -> bool:
-                subject = self.serializer.get_subject_name(topic, schema, prefix, schema_type)
+                subject = get_subject_name(topic, schema, subject_type, self.naming_strategy)
                 return subject not in subjects
 
             parsed_schema, valid_subjects = await self._query_schema_and_subjects(
@@ -833,7 +846,9 @@ async def _query_schema_id_from_cache_or_registry(
             )
         return schema_id
 
-    async def validate_schema_info(self, data: dict, prefix: str, content_type: str, topic: str, schema_type: str):
+    async def validate_schema_info(
+        self, data: dict, subject_type: SubjectType, content_type: str, topic: str, schema_type: str
+    ):
         try:
             schema_type = SCHEMA_MAPPINGS[schema_type]
         except KeyError:
@@ -848,7 +863,7 @@ async def validate_schema_info(self, data: dict, prefix: str, content_type: str,
 
         # will do in place updates of id keys, since calling these twice would be expensive
         try:
-            data[f"{prefix}_schema_id"] = await self.get_schema_id(data, topic, prefix, schema_type)
+            data[f"{subject_type}_schema_id"] = await self.get_schema_id(data, topic, subject_type, schema_type)
         except InvalidPayload:
             log.exception("Unable to retrieve schema id")
             KafkaRest.r(
@@ -863,16 +878,17 @@ async def validate_schema_info(self, data: dict, prefix: str, content_type: str,
             KafkaRest.r(
                 body={
                     "error_code": RESTErrorCodes.SCHEMA_RETRIEVAL_ERROR.value,
-                    "message": f"Error when registering schema. format = {schema_type.value}, subject = {topic}-{prefix}",
+                    "message": f"Error when registering schema."
+                    f"format = {schema_type.value}, subject = {topic}-{subject_type}",
                 },
                 content_type=content_type,
                 status=HTTPStatus.REQUEST_TIMEOUT,
             )
         except InvalidSchema:
-            if f"{prefix}_schema" in data:
-                err = f'schema = {data[f"{prefix}_schema"]}'
+            if f"{subject_type}_schema" in data:
+                err = f'schema = {data[f"{subject_type}_schema"]}'
             else:
-                err = f'schema_id = {data[f"{prefix}_schema_id"]}'
+                err = f'schema_id = {data[f"{subject_type}_schema_id"]}'
             KafkaRest.r(
                 body={
                     "error_code": RESTErrorCodes.INVALID_DATA.value,
@@ -1002,26 +1018,26 @@ async def validate_publish_request_format(self, data: dict, formats: dict, conte
                     status=HTTPStatus.BAD_REQUEST,
                 )
             convert_to_int(r, "partition", content_type)
-            if set(r.keys()).difference(RECORD_KEYS):
+            if set(r.keys()).difference({subject_type.value for subject_type in SubjectType}):
                 KafkaRest.unprocessable_entity(
                     message="Invalid request format",
                     content_type=content_type,
                     sub_code=RESTErrorCodes.HTTP_UNPROCESSABLE_ENTITY.value,
                 )
         # disallow missing id and schema for any key/value list that has at least one populated element
         if formats["embedded_format"] in {"avro", "jsonschema", "protobuf"}:
-            for prefix, code in zip(RECORD_KEYS, RECORD_CODES):
-                if self.all_empty(data, prefix):
+            for subject_type, code in zip(SUBJECT_VALID_POSTFIX, RECORD_CODES):
+                if self.all_empty(data, subject_type):
                     continue
-                if not self.is_valid_schema_request(data, prefix):
+                if not self.is_valid_schema_request(data, subject_type):
                     KafkaRest.unprocessable_entity(
-                        message=f"Request includes {prefix}s and uses a format that requires schemas "
-                        f"but does not include the {prefix}_schema or {prefix}_schema_id fields",
+                        message=f"Request includes {subject_type}s and uses a format that requires schemas "
+                        f"but does not include the {subject_type}_schema or {subject_type.value}_schema_id fields",
                         content_type=content_type,
                         sub_code=code,
                     )
                 try:
-                    await self.validate_schema_info(data, prefix, content_type, topic, formats["embedded_format"])
+                    await self.validate_schema_info(data, subject_type, content_type, topic, formats["embedded_format"])
                 except InvalidMessageSchema as e:
                     KafkaRest.unprocessable_entity(
                         message=str(e),

diff --git a/karapace/schema_reader.py b/karapace/schema_reader.py
@@ -8,6 +8,7 @@
 
 from avro.schema import Schema as AvroSchema
 from contextlib import closing, ExitStack
+from enum import Enum
 from jsonschema.validators import Draft7Validator
 from kafka import KafkaConsumer, TopicPartition
 from kafka.admin import KafkaAdminClient, NewTopic
@@ -58,6 +59,13 @@
 METRIC_SUBJECT_DATA_SCHEMA_VERSIONS_GAUGE: Final = "karapace_schema_reader_subject_data_schema_versions"
 
 
+class MessageType(Enum):
+    config = "CONFIG"
+    schema = "SCHEMA"
+    delete_subject = "DELETE_SUBJECT"
+    no_operation = "NOOP"
+
+
 def _create_consumer_from_config(config: Config) -> KafkaConsumer:
     # Group not set on purpose, all consumers read the same data
     session_timeout_ms = config["session_timeout_ms"]
@@ -522,14 +530,25 @@ def _handle_msg_schema(self, key: dict, value: dict | None) -> None:
                 self.database.insert_referenced_by(subject=ref.subject, version=ref.version, schema_id=schema_id)
 
     def handle_msg(self, key: dict, value: dict | None) -> None:
-        if key["keytype"] == "CONFIG":
-            self._handle_msg_config(key, value)
-        elif key["keytype"] == "SCHEMA":
-            self._handle_msg_schema(key, value)
-        elif key["keytype"] == "DELETE_SUBJECT":
-            self._handle_msg_delete_subject(key, value)
-        elif key["keytype"] == "NOOP":  # for spec completeness
-            pass
+        if "keytype" in key:
+            try:
+                message_type = MessageType(key["keytype"])
+
+                if message_type == MessageType.config:
+                    self._handle_msg_config(key, value)
+                elif message_type == MessageType.schema:
+                    self._handle_msg_schema(key, value)
+                elif message_type == MessageType.delete_subject:
+                    self._handle_msg_delete_subject(key, value)
+                elif message_type == MessageType.no_operation:
+                    pass
+            except ValueError:
+                LOG.error("The message %s-%s has been discarded because the %s is not managed", key, value, key["keytype"])
+
+        else:
+            LOG.error(
+                "The message %s-%s has been discarded because doesn't contain the `keytype` key in the key", key, value
+            )
 
     def remove_referenced_by(
         self,

diff --git a/karapace/serialization.py b/karapace/serialization.py
@@ -9,13 +9,15 @@
 from google.protobuf.message import DecodeError
 from jsonschema import ValidationError
 from karapace.client import Client
+from karapace.config import NameStrategy
 from karapace.dependency import Dependency
 from karapace.errors import InvalidReferences
+from karapace.kafka_rest_apis import SubjectType
 from karapace.protobuf.exception import ProtobufTypeException
 from karapace.protobuf.io import ProtobufDatumReader, ProtobufDatumWriter
 from karapace.schema_models import InvalidSchema, ParsedTypedSchema, SchemaType, TypedSchema, ValidatedTypedSchema
 from karapace.schema_references import LatestVersionReference, Reference, reference_from_mapping
-from karapace.typing import ResolvedVersion, SchemaId, Subject
+from karapace.typing import ResolvedVersion, SchemaId, Subject, TopicName
 from karapace.utils import json_decode, json_encode
 from typing import Any, Callable, Dict, List, MutableMapping, Optional, Set, Tuple
 from urllib.parse import quote
@@ -72,9 +74,9 @@ def topic_record_name_strategy(topic_name: str, record_name: str) -> str:
 
 
 NAME_STRATEGIES = {
-    "topic_name": topic_name_strategy,
-    "record_name": record_name_strategy,
-    "topic_record_name": topic_record_name_strategy,
+    NameStrategy.topic_name: topic_name_strategy,
+    NameStrategy.record_name: record_name_strategy,
+    NameStrategy.topic_record_name: topic_record_name_strategy,
 }
 
 
@@ -103,7 +105,7 @@ async def post_new_schema(
             raise SchemaRetrievalError(result.json())
         return SchemaId(result.json()["id"])
 
-    async def _get_schema_r(
+    async def _get_schema_recursive(
         self,
         subject: Subject,
         explored_schemas: Set[Tuple[Subject, Optional[ResolvedVersion]]],
@@ -131,7 +133,7 @@ async def _get_schema_r(
             references = [Reference.from_dict(data) for data in json_result["references"]]
             dependencies = {}
             for reference in references:
-                _, schema, version = await self._get_schema_r(reference.subject, explored_schemas, reference.version)
+                _, schema, version = await self._get_schema_recursive(reference.subject, explored_schemas, reference.version)
                 dependencies[reference.name] = Dependency(
                     name=reference.name, subject=reference.subject, version=version, target_schema=schema
                 )
@@ -174,7 +176,7 @@ async def get_schema(
                 - ValidatedTypedSchema: The retrieved schema, validated and typed.
                 - ResolvedVersion: The version of the schema that was retrieved.
         """
-        return await self._get_schema_r(subject, set(), version)
+        return await self._get_schema_recursive(subject, set(), version)
 
     async def get_schema_for_id(self, schema_id: SchemaId) -> Tuple[TypedSchema, List[Subject]]:
         result = await self.client.get(f"schemas/ids/{schema_id}", params={"includeSubjects": "True"})
@@ -225,6 +227,25 @@ async def close(self):
         await self.client.close()
 
 
+def get_subject_name(
+    topic_name: str,
+    schema: TypedSchema,
+    subject_type: SubjectType,
+    naming_strategy: NameStrategy,
+) -> Subject:
+    namespace = "dummy"
+    if schema.schema_type is SchemaType.AVRO:
+        if isinstance(schema.schema, avro.schema.NamedSchema):
+            namespace = schema.schema.namespace or ""
+    if schema.schema_type is SchemaType.JSONSCHEMA:
+        namespace = schema.to_dict().get("namespace", "dummy")
+    #  Protobuf does not use namespaces in terms of AVRO
+    if schema.schema_type is SchemaType.PROTOBUF:
+        namespace = ""
+    naming_strategy = NAME_STRATEGIES[naming_strategy]
+    return Subject(f"{naming_strategy(topic_name, namespace)}-{subject_type}")
+
+
 class SchemaRegistrySerializer:
     def __init__(
         self,
@@ -243,8 +264,6 @@ def __init__(
         else:
             registry_url = f"http://{self.config['registry_host']}:{self.config['registry_port']}"
             registry_client = SchemaRegistryClient(registry_url, session_auth=session_auth)
-        name_strategy = config.get("name_strategy", "topic_name")
-        self.subject_name_strategy = NAME_STRATEGIES.get(name_strategy, topic_name_strategy)
         self.registry_client: Optional[SchemaRegistryClient] = registry_client
         self.ids_to_schemas: Dict[int, TypedSchema] = {}
         self.ids_to_subjects: MutableMapping[int, List[Subject]] = TTLCache(maxsize=10000, ttl=600)
@@ -255,25 +274,6 @@ async def close(self) -> None:
             await self.registry_client.close()
             self.registry_client = None
 
-    def get_subject_name(
-        self,
-        topic_name: str,
-        schema: TypedSchema,
-        subject_type: str,
-        schema_type: SchemaType,
-    ) -> Subject:
-        namespace = "dummy"
-        if schema_type is SchemaType.AVRO:
-            if isinstance(schema.schema, avro.schema.NamedSchema):
-                namespace = schema.schema.namespace
-        if schema_type is SchemaType.JSONSCHEMA:
-            namespace = schema.to_dict().get("namespace", "dummy")
-        #  Protobuf does not use namespaces in terms of AVRO
-        if schema_type is SchemaType.PROTOBUF:
-            namespace = ""
-
-        return Subject(f"{self.subject_name_strategy(topic_name, namespace)}-{subject_type}")
-
     async def get_schema_for_subject(self, subject: Subject) -> TypedSchema:
         assert self.registry_client, "must not call this method after the object is closed."
         schema_id, schema, _ = await self.registry_client.get_schema(subject)