From 3dd80cb26e0a8387fe86c90236b11bfd2d91ff25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Z=C4=85bek?= Date: Thu, 21 Nov 2024 13:22:10 +0100 Subject: [PATCH] refactor, remove ApacheIcebergSchema class --- .../internal/SnowflakeConnectionService.java | 9 +- .../SnowflakeConnectionServiceV1.java | 63 ++------------ .../iceberg/ApacheIcebergColumnSchema.java | 6 +- .../iceberg/IcebergColumnJsonValuePair.java | 8 +- .../iceberg/IcebergColumnTree.java | 8 +- .../iceberg/IcebergColumnTypeMapper.java | 4 +- .../iceberg/IcebergDataTypeParser.java | 10 +-- .../iceberg/IcebergFieldNode.java | 33 ++------ .../IcebergSchemaEvolutionService.java | 78 +++++++++++++---- .../iceberg/IcebergTableSchema.java | 23 ----- .../iceberg/IcebergTableSchemaResolver.java | 84 ++++++++----------- .../iceberg/ParseIcebergColumnTreeTest.java | 29 ++++--- .../IcebergIngestionSchemaEvolutionIT.java | 47 +++++++---- 13 files changed, 179 insertions(+), 223 deletions(-) delete mode 100644 src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergTableSchema.java diff --git a/src/main/java/com/snowflake/kafka/connector/internal/SnowflakeConnectionService.java b/src/main/java/com/snowflake/kafka/connector/internal/SnowflakeConnectionService.java index 8e7f864a4..c50830e5b 100644 --- a/src/main/java/com/snowflake/kafka/connector/internal/SnowflakeConnectionService.java +++ b/src/main/java/com/snowflake/kafka/connector/internal/SnowflakeConnectionService.java @@ -2,7 +2,6 @@ import com.snowflake.kafka.connector.internal.streaming.ChannelMigrateOffsetTokenResponseDTO; import com.snowflake.kafka.connector.internal.streaming.schemaevolution.ColumnInfos; -import com.snowflake.kafka.connector.internal.streaming.schemaevolution.iceberg.IcebergColumnTree; import com.snowflake.kafka.connector.internal.telemetry.SnowflakeTelemetryService; import java.sql.Connection; import java.util.List; @@ -117,13 +116,11 @@ public interface SnowflakeConnectionService { * Alter iceberg table to add columns according to a map from columnNames to their types * * @param tableName the name of the table - * @param addedColumns todo - * @param modifiedColumns todo + * @param addColumnsQuery + * @param alterSetDataTypeQuery */ void appendColumnsToIcebergTable( - String tableName, - List addedColumns, - List modifiedColumns); + String tableName, String addColumnsQuery, String alterSetDataTypeQuery); /** * Alter table to drop non-nullability of a list of columns diff --git a/src/main/java/com/snowflake/kafka/connector/internal/SnowflakeConnectionServiceV1.java b/src/main/java/com/snowflake/kafka/connector/internal/SnowflakeConnectionServiceV1.java index 896cee995..dc367ea92 100644 --- a/src/main/java/com/snowflake/kafka/connector/internal/SnowflakeConnectionServiceV1.java +++ b/src/main/java/com/snowflake/kafka/connector/internal/SnowflakeConnectionServiceV1.java @@ -12,7 +12,6 @@ import com.snowflake.kafka.connector.internal.streaming.ChannelMigrationResponseCode; import com.snowflake.kafka.connector.internal.streaming.IngestionMethodConfig; import com.snowflake.kafka.connector.internal.streaming.schemaevolution.ColumnInfos; -import com.snowflake.kafka.connector.internal.streaming.schemaevolution.iceberg.IcebergColumnTree; import com.snowflake.kafka.connector.internal.telemetry.SnowflakeTelemetryService; import com.snowflake.kafka.connector.internal.telemetry.SnowflakeTelemetryServiceFactory; import java.io.ByteArrayInputStream; @@ -515,68 +514,24 @@ public void appendColumnsToTable(String tableName, Map colu * Alter iceberg table to add columns according to a map from columnNames to their types * * @param tableName the name of the table - * @param addedColumns the mapping from the columnNames to their infos - * @param modifiedColumns + * @param addColumnsQuery + * @param alterSetDataTypeQuery */ @Override public void appendColumnsToIcebergTable( - String tableName, - List addedColumns, - List modifiedColumns) { + String tableName, String addColumnsQuery, String alterSetDataTypeQuery) { LOGGER.debug("Appending columns to iceberg table"); InternalUtils.assertNotEmpty("tableName", tableName); checkConnection(); - appendIcebergColumnsQuery(tableName, addedColumns); - modifyIcebergColumnsQuery(tableName, modifiedColumns); - } - - // alter table kafka_connector_test_table_785581352092121753 add RECORD_METADATA column to align - // with iceberg format - private void appendIcebergColumnsQuery(String tableName, List columnsToAdd) { - if (columnsToAdd.isEmpty()) { - return; - } - StringBuilder addColumnQuery = new StringBuilder("alter iceberg "); - addColumnQuery.append("table identifier(?) add column "); - - for (IcebergColumnTree column : columnsToAdd) { - addColumnQuery.append("if not exists "); - - String columnName = column.getColumnName(); - String dataType = column.buildType(); - - addColumnQuery.append(" ").append(columnName).append(" ").append(dataType).append(", "); - } - // remove last comma and whitespace - addColumnQuery.deleteCharAt(addColumnQuery.length() - 1); - addColumnQuery.deleteCharAt(addColumnQuery.length() - 1); - - executeStatement(tableName, addColumnQuery.toString()); - - LOGGER.info("Query SUCCEEDED: " + addColumnQuery); - } - - private void modifyIcebergColumnsQuery( - String tableName, List columnsToModify) { - if (columnsToModify.isEmpty()) { - return; + if (!addColumnsQuery.isEmpty()) { + executeStatement(tableName, addColumnsQuery); + LOGGER.info("Query SUCCEEDED: " + addColumnsQuery); } - StringBuilder setDataTypeQuery = new StringBuilder("alter iceberg "); - setDataTypeQuery.append("table identifier(?) alter column "); - for (IcebergColumnTree column : columnsToModify) { - String columnName = column.getColumnName(); - String dataType = column.buildType(); - - setDataTypeQuery.append(columnName).append(" set data type ").append(dataType).append(", "); + if (!alterSetDataTypeQuery.isEmpty()) { + executeStatement(tableName, alterSetDataTypeQuery); + LOGGER.info("Query SUCCEEDED: " + addColumnsQuery); } - // remove last comma and whitespace - setDataTypeQuery.deleteCharAt(setDataTypeQuery.length() - 1); - setDataTypeQuery.deleteCharAt(setDataTypeQuery.length() - 1); - - executeStatement(tableName, setDataTypeQuery.toString()); - - LOGGER.info("Query SUCCEEDED: " + setDataTypeQuery); } private void executeStatement(String tableName, String query) { diff --git a/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/ApacheIcebergColumnSchema.java b/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/ApacheIcebergColumnSchema.java index 28bdd4494..ad51e20b8 100644 --- a/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/ApacheIcebergColumnSchema.java +++ b/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/ApacheIcebergColumnSchema.java @@ -9,16 +9,16 @@ class ApacheIcebergColumnSchema { private final String columnName; - public ApacheIcebergColumnSchema(Type schema, String columnName) { + ApacheIcebergColumnSchema(Type schema, String columnName) { this.schema = schema; this.columnName = columnName; } - public Type getSchema() { + Type getSchema() { return schema; } - public String getColumnName() { + String getColumnName() { return columnName; } } diff --git a/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergColumnJsonValuePair.java b/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergColumnJsonValuePair.java index 7dbfd26b3..a369fede6 100644 --- a/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergColumnJsonValuePair.java +++ b/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergColumnJsonValuePair.java @@ -9,7 +9,7 @@ class IcebergColumnJsonValuePair { private final String quotedColumnName; private final JsonNode jsonNode; - public static IcebergColumnJsonValuePair from(Map.Entry field) { + static IcebergColumnJsonValuePair from(Map.Entry field) { return new IcebergColumnJsonValuePair(field.getKey(), field.getValue()); } @@ -19,15 +19,15 @@ public static IcebergColumnJsonValuePair from(Map.Entry field) this.jsonNode = jsonNode; } - public String getColumnName() { + String getColumnName() { return columnName; } - public String getQuotedColumnName() { + String getQuotedColumnName() { return quotedColumnName; } - public JsonNode getJsonNode() { + JsonNode getJsonNode() { return jsonNode; } } diff --git a/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergColumnTree.java b/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergColumnTree.java index 7e061e7d5..d6518c263 100644 --- a/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergColumnTree.java +++ b/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergColumnTree.java @@ -3,22 +3,22 @@ import com.google.common.base.Preconditions; /** Class with object types compatible with Snowflake Iceberg table */ -public class IcebergColumnTree { +class IcebergColumnTree { private final IcebergFieldNode rootNode; - public String getColumnName() { + String getColumnName() { return rootNode.name; } IcebergColumnTree(ApacheIcebergColumnSchema columnSchema) { - // rootNodes column name serve as a name of the column, hence it is uppercase + // rootNodes name serve as a name of the column, hence it is uppercase String columnName = columnSchema.getColumnName().toUpperCase(); this.rootNode = new IcebergFieldNode(columnName, columnSchema.getSchema()); } IcebergColumnTree(IcebergColumnJsonValuePair pair) { - // rootNodes column name serve as a name of the column, hence it is uppercase + // rootNodes name serve as a name of the column, hence it is uppercase String columnName = pair.getColumnName().toUpperCase(); this.rootNode = new IcebergFieldNode(columnName, pair.getJsonNode()); } diff --git a/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergColumnTypeMapper.java b/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergColumnTypeMapper.java index 272ce2ad3..93daf72f9 100644 --- a/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergColumnTypeMapper.java +++ b/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergColumnTypeMapper.java @@ -40,10 +40,10 @@ public String mapToSnowflakeDataType(Type apacheIcebergType) { case INTEGER: return "NUMBER(10,0)"; case LONG: - return "NUMBER(19,0)"; + return "LONG"; case FLOAT: case DOUBLE: - return "FLOAT"; + return "DOUBLE"; case DATE: return "DATE"; case TIME: diff --git a/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergDataTypeParser.java b/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergDataTypeParser.java index b9fd696a8..be090a8b3 100644 --- a/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergDataTypeParser.java +++ b/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergDataTypeParser.java @@ -46,7 +46,7 @@ class IcebergDataTypeParser { * @param icebergDataType string representation of Iceberg data type * @return Iceberg data type */ - public static Type deserializeIcebergType(String icebergDataType) { + static Type deserializeIcebergType(String icebergDataType) { try { JsonNode json = MAPPER.readTree(icebergDataType); return getTypeFromJson(json); @@ -62,7 +62,7 @@ public static Type deserializeIcebergType(String icebergDataType) { * @param jsonNode JsonNode parsed from Iceberg type string. * @return Iceberg data type */ - public static Type getTypeFromJson(@Nonnull JsonNode jsonNode) { + static Type getTypeFromJson(@Nonnull JsonNode jsonNode) { if (jsonNode.isTextual()) { return Types.fromPrimitiveString(jsonNode.asText()); } else if (jsonNode.isObject()) { @@ -91,7 +91,7 @@ public static Type getTypeFromJson(@Nonnull JsonNode jsonNode) { * @param json JsonNode parsed from Iceberg type string. * @return struct type */ - public static @Nonnull Types.StructType structFromJson(@Nonnull JsonNode json) { + static @Nonnull Types.StructType structFromJson(@Nonnull JsonNode json) { if (!json.has(FIELDS)) { throw new IllegalArgumentException( String.format("Missing key '%s' in schema: %s", FIELDS, json)); @@ -137,7 +137,7 @@ public static Type getTypeFromJson(@Nonnull JsonNode jsonNode) { * @param json JsonNode parsed from Iceberg type string. * @return list type */ - public static Types.ListType listFromJson(JsonNode json) { + static Types.ListType listFromJson(JsonNode json) { int elementId = JsonUtil.getInt(ELEMENT_ID, json); Type elementType = getTypeFromJson(json.get(ELEMENT)); boolean isRequired = JsonUtil.getBool(ELEMENT_REQUIRED, json); @@ -155,7 +155,7 @@ public static Types.ListType listFromJson(JsonNode json) { * @param json JsonNode parsed from Iceberg type string. * @return map type */ - public static Types.MapType mapFromJson(JsonNode json) { + static Types.MapType mapFromJson(JsonNode json) { int keyId = JsonUtil.getInt(KEY_ID, json); Type keyType = getTypeFromJson(json.get(KEY)); diff --git a/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergFieldNode.java b/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergFieldNode.java index c2660a01e..b28e2a1fe 100644 --- a/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergFieldNode.java +++ b/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergFieldNode.java @@ -3,6 +3,7 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.node.ObjectNode; import java.util.LinkedHashMap; +import java.util.Map; import java.util.stream.Collectors; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; @@ -12,11 +13,11 @@ class IcebergFieldNode { // todo consider refactoring into some more classes private final IcebergColumnTypeMapper mapper = IcebergColumnTypeMapper.INSTANCE; - public final String name; + final String name; - public final String snowflakeIcebergType; + final String snowflakeIcebergType; - public final LinkedHashMap children; + final LinkedHashMap children; IcebergFieldNode(String name, Type apacheIcebergSchema) { this.name = name; @@ -24,7 +25,6 @@ class IcebergFieldNode { this.children = produceChildren(apacheIcebergSchema); } - // todo refactor IcebergFieldNode(String name, JsonNode jsonNode) { this.name = name; this.snowflakeIcebergType = mapper.mapToColumnDataTypeFromJson(jsonNode); @@ -35,7 +35,7 @@ class IcebergFieldNode { * Method does not modify, delete any existing nodes and its types, names. It is meant only to add * new children. */ - IcebergFieldNode merge(IcebergFieldNode modifiedNode) { + void merge(IcebergFieldNode modifiedNode) { modifiedNode.children.forEach( (key, value) -> { IcebergFieldNode thisChild = this.children.get(key); @@ -46,24 +46,12 @@ IcebergFieldNode merge(IcebergFieldNode modifiedNode) { } }); addNewChildren(modifiedNode); - return this; } private void addNewChildren(IcebergFieldNode modifiedNode) { modifiedNode.children.forEach(this.children::putIfAbsent); } - // nodeFromModifiedSchema.children.entrySet().forEach( - // newChildsEntry -> { - // IcebergFieldNode thisChild = this.children.get(newChildsEntry.getKey()); - // if (thisChild == null) { - // this.children.put(newChildsEntry.getKey(), newChildsEntry.getValue()); - // } else { - // thisChild.merge(newChildsEntry.getValue()); - // } - // } - // ); - private LinkedHashMap produceChildren(JsonNode recordNode) { // primitives must not have children if (recordNode.isEmpty() || recordNode.isNull()) { @@ -74,7 +62,7 @@ private LinkedHashMap produceChildren(JsonNode recordN return objectNode.properties().stream() .collect( Collectors.toMap( - stringJsonNodeEntry -> stringJsonNodeEntry.getKey(), + Map.Entry::getKey, stringJsonNodeEntry -> new IcebergFieldNode( stringJsonNodeEntry.getKey(), stringJsonNodeEntry.getValue()), @@ -132,26 +120,23 @@ StringBuilder buildQuery(StringBuilder sb, String parentType) { return sb; } - private StringBuilder appendNameAndType(StringBuilder sb) { + private void appendNameAndType(StringBuilder sb) { sb.append(name); sb.append(" "); sb.append(snowflakeIcebergType); - return sb; } - private StringBuilder appendChildren(StringBuilder sb, String parentType) { + private void appendChildren(StringBuilder sb, String parentType) { children.forEach( (name, node) -> { node.buildQuery(sb, parentType); sb.append(", "); }); removeLastSeparator(sb); - return sb; } - private StringBuilder removeLastSeparator(StringBuilder sb) { + private void removeLastSeparator(StringBuilder sb) { sb.deleteCharAt(sb.length() - 1); sb.deleteCharAt(sb.length() - 1); - return sb; } } diff --git a/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergSchemaEvolutionService.java b/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergSchemaEvolutionService.java index 95af4eb3c..fc4a5e74c 100644 --- a/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergSchemaEvolutionService.java +++ b/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergSchemaEvolutionService.java @@ -52,46 +52,52 @@ public void evolveIcebergSchemaIfNeeded( if (!columnsToEvolve.isEmpty()) { LOGGER.debug("Adding columns to iceberg table: {} columns: {}", tableName, columnsToEvolve); // some of the column might already exist, and we will modify them, not create - IcebergTableSchema alreadyExistingColumns = + List alreadyExistingColumns = icebergTableSchemaResolver.resolveIcebergSchemaFromChannel( schemaAlreadyInUse, columnsToEvolve); // new columns resolved from incoming record - IcebergTableSchema modifiedOrAddedColumns = - icebergTableSchemaResolver.resolveIcebergSchema(record, columnsToEvolve); + List modifiedOrAddedColumns = + icebergTableSchemaResolver.resolveIcebergSchemaFromRecord(record, columnsToEvolve); - // columns that we simply add because they do not exist. They are NOT present in an already - // existing - // schema. + // columns that we simply add because they are NOT present in an already existing schema. List addedColumns = - modifiedOrAddedColumns.getColumns().stream() + modifiedOrAddedColumns.stream() .filter( modifiedOrAddedColumn -> - alreadyExistingColumns.getColumns().stream() + alreadyExistingColumns.stream() .noneMatch( tree -> tree.getColumnName() .equalsIgnoreCase(modifiedOrAddedColumn.getColumnName()))) .collect(Collectors.toList()); - // consider just getting rest of the columns + // column that are present in a schema and needs to have its type modified List modifiedColumns = - modifiedOrAddedColumns.getColumns().stream() + modifiedOrAddedColumns.stream() .filter( modifiedOrAddedColumn -> - alreadyExistingColumns.getColumns().stream() + alreadyExistingColumns.stream() .anyMatch( tree -> tree.getColumnName() .equalsIgnoreCase(modifiedOrAddedColumn.getColumnName()))) .collect(Collectors.toList()); - // merge withAlreadyExistingColumns - - System.out.println("stop debugger"); + String addColumnsQuery = generateAddColumnQuery(addedColumns); + // merge changes into already existing column + alreadyExistingColumns.forEach( + existingColumn -> { + IcebergColumnTree mewVersion = + modifiedColumns.stream() + .filter(c -> c.getColumnName().equals(existingColumn.getColumnName())) + .collect(Collectors.toList()) + .get(0); + existingColumn.merge(mewVersion); + }); + String alterSetDataTypeQuery = alterSetDataTypeQuery(alreadyExistingColumns); try { - // todo columns to add and column to modify - conn.appendColumnsToIcebergTable(tableName, addedColumns, modifiedColumns); + conn.appendColumnsToIcebergTable(tableName, addColumnsQuery, alterSetDataTypeQuery); } catch (SnowflakeKafkaConnectorException e) { LOGGER.warn( String.format( @@ -103,4 +109,44 @@ public void evolveIcebergSchemaIfNeeded( } } } + + private String generateAddColumnQuery(List columnsToAdd) { + if (columnsToAdd.isEmpty()) { + return ""; + } + StringBuilder addColumnQuery = new StringBuilder("alter iceberg "); + addColumnQuery.append("table identifier(?) add column "); + + for (IcebergColumnTree column : columnsToAdd) { + addColumnQuery.append("if not exists "); + + String columnName = column.getColumnName(); + String dataType = column.buildType(); + + addColumnQuery.append(" ").append(columnName).append(" ").append(dataType).append(", "); + } + // remove last comma and whitespace + addColumnQuery.deleteCharAt(addColumnQuery.length() - 1); + addColumnQuery.deleteCharAt(addColumnQuery.length() - 1); + return addColumnQuery.toString(); + } + + private String alterSetDataTypeQuery(List columnsToModify) { + if (columnsToModify.isEmpty()) { + return ""; + } + StringBuilder setDataTypeQuery = new StringBuilder("alter iceberg "); + setDataTypeQuery.append("table identifier(?) alter column "); + for (IcebergColumnTree column : columnsToModify) { + String columnName = column.getColumnName(); + String dataType = column.buildType(); + + setDataTypeQuery.append(columnName).append(" set data type ").append(dataType).append(", "); + } + // remove last comma and whitespace + setDataTypeQuery.deleteCharAt(setDataTypeQuery.length() - 1); + setDataTypeQuery.deleteCharAt(setDataTypeQuery.length() - 1); + + return setDataTypeQuery.toString(); + } } diff --git a/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergTableSchema.java b/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergTableSchema.java deleted file mode 100644 index 31ac24d57..000000000 --- a/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergTableSchema.java +++ /dev/null @@ -1,23 +0,0 @@ -package com.snowflake.kafka.connector.internal.streaming.schemaevolution.iceberg; - -import com.google.common.collect.ImmutableList; -import java.util.Collections; -import java.util.List; - -/** Wrapper for multiple columns, not necessary all columns that are in the table */ -class IcebergTableSchema { - - private final List columns; - - public IcebergTableSchema(List columns) { - this.columns = Collections.unmodifiableList(columns); - } - - public List getColumns() { - return columns; - } - - public static IcebergTableSchema Empty() { - return new IcebergTableSchema(ImmutableList.of()); - } -} diff --git a/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergTableSchemaResolver.java b/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergTableSchemaResolver.java index a97695c3e..e9e7f97f7 100644 --- a/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergTableSchemaResolver.java +++ b/src/main/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/IcebergTableSchemaResolver.java @@ -1,7 +1,7 @@ package com.snowflake.kafka.connector.internal.streaming.schemaevolution.iceberg; import com.fasterxml.jackson.databind.JsonNode; -import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; import com.google.common.collect.Streams; import com.snowflake.kafka.connector.records.RecordService; import java.util.HashSet; @@ -15,52 +15,54 @@ class IcebergTableSchemaResolver { - private final IcebergColumnTypeMapper columnTypeMapper; + public List resolveIcebergSchemaFromRecord( + SinkRecord record, Set columnsToInclude) { + if (columnsToInclude == null || columnsToInclude.isEmpty()) { + return ImmutableList.of(); + } + Set columnNamesSet = new HashSet<>(columnsToInclude); - @VisibleForTesting - IcebergTableSchemaResolver(IcebergColumnTypeMapper columnTypeMapper) { - this.columnTypeMapper = columnTypeMapper; + if (hasSchema(record)) { + return getTableSchemaFromRecordSchemaIceberg(record, columnNamesSet); + } else { + return getTableSchemaFromJsonIceberg(record, columnNamesSet); + } } - public IcebergTableSchemaResolver() { - this.columnTypeMapper = IcebergColumnTypeMapper.INSTANCE; - } + /** + * Retrieve IcebergSchema stored in a channel, then parse it into a tree. Filter out columns that + * do not need to be modified. + */ + public List resolveIcebergSchemaFromChannel( + Map tableSchemaFromChannel, Set columnsToEvolve) { - public IcebergTableSchema resolveIcebergSchemaFromChannel( - Map tableSchemaFromChannel, Set columnsToInclude) { - // tableSchemaFromChannel nie maja ciapek - // columnsToInclude maja ciapki - // todo remember about the case with dots - // todo potential error when cases are different - think easy to overcome List apacheIcebergColumnSchemas = tableSchemaFromChannel.entrySet().stream() .filter( (schemaFromChannelEntry) -> { - String quoteChannelColumnName = schemaFromChannelEntry.getKey(); - // on a second run columns to include is not quoted - return columnsToInclude.contains(quoteChannelColumnName); + String columnNameFromChannel = schemaFromChannelEntry.getKey(); + return columnsToEvolve.contains(columnNameFromChannel); }) .map(this::mapApacheSchemaFromChannel) .collect(Collectors.toList()); - List icebergColumnTrees = - apacheIcebergColumnSchemas.stream() - .map(IcebergColumnTree::new) - .collect(Collectors.toList()); - - return new IcebergTableSchema(icebergColumnTrees); + return apacheIcebergColumnSchemas.stream() + .map(IcebergColumnTree::new) + .collect(Collectors.toList()); } private ApacheIcebergColumnSchema mapApacheSchemaFromChannel( Map.Entry schemaFromChannelEntry) { - String columnName = schemaFromChannelEntry.getKey(); + ColumnProperties columnProperty = schemaFromChannelEntry.getValue(); String plainIcebergSchema = getIcebergSchema(columnProperty); + Type schema = IcebergDataTypeParser.deserializeIcebergType(plainIcebergSchema); + String columnName = schemaFromChannelEntry.getKey(); return new ApacheIcebergColumnSchema(schema, columnName); } - // todo remove it just when we can + // todo remove in 1820155 when getIcebergSchema() method is made public private static String getIcebergSchema(ColumnProperties columnProperties) { try { java.lang.reflect.Field field = @@ -74,43 +76,27 @@ private static String getIcebergSchema(ColumnProperties columnProperties) { } } - public IcebergTableSchema resolveIcebergSchema(SinkRecord record, Set columnsToInclude) { - if (columnsToInclude == null || columnsToInclude.isEmpty()) { - return IcebergTableSchema.Empty(); - } - Set columnNamesSet = new HashSet<>(columnsToInclude); - - if (hasSchema(record)) { - return getTableSchemaFromRecordSCHEMAIceberg(record, columnNamesSet); - } else { - return getTableSchemaFromJsonIceberg(record, columnNamesSet); - } - } - private boolean hasSchema(SinkRecord record) { return record.valueSchema() != null && record.valueSchema().fields() != null && !record.valueSchema().fields().isEmpty(); } - private IcebergTableSchema getTableSchemaFromJsonIceberg( + private List getTableSchemaFromJsonIceberg( SinkRecord record, Set columnsToEvolve) { JsonNode recordNode = RecordService.convertToJson(record.valueSchema(), record.value(), true); - List icebergColumnTrees = - Streams.stream(recordNode.fields()) - .map(IcebergColumnJsonValuePair::from) - .filter(pair -> columnsToEvolve.contains(pair.getColumnName().toUpperCase())) - .map(IcebergColumnTree::new) - .collect(Collectors.toList()); - return new IcebergTableSchema(icebergColumnTrees); + return Streams.stream(recordNode.fields()) + .map(IcebergColumnJsonValuePair::from) + .filter(pair -> columnsToEvolve.contains(pair.getColumnName().toUpperCase())) + .map(IcebergColumnTree::new) + .collect(Collectors.toList()); } - private IcebergTableSchema getTableSchemaFromRecordSCHEMAIceberg( + private List getTableSchemaFromRecordSchemaIceberg( SinkRecord record, Set columnNamesSet) { // todo its second part JsonNode recordNode = RecordService.convertToJson(record.valueSchema(), record.value(), true); - throw new IllegalArgumentException("not yet implemented SCHEMA path"); - // return IcebergTableSchema.Empty(); + throw new RuntimeException("NOT YET IMPLEMENTED! - SCHEMA path"); } } diff --git a/src/test/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/ParseIcebergColumnTreeTest.java b/src/test/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/ParseIcebergColumnTreeTest.java index 8c19da967..c953bd960 100644 --- a/src/test/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/ParseIcebergColumnTreeTest.java +++ b/src/test/java/com/snowflake/kafka/connector/internal/streaming/schemaevolution/iceberg/ParseIcebergColumnTreeTest.java @@ -41,9 +41,9 @@ static Stream icebergSchemas() { // primitives arguments("\"boolean\"", "BOOLEAN"), arguments("\"int\"", "NUMBER(10,0)"), - arguments("\"long\"", "NUMBER(19,0)"), - arguments("\"float\"", "FLOAT"), - arguments("\"double\"", "FLOAT"), + arguments("\"long\"", "LONG"), + arguments("\"float\"", "DOUBLE"), + arguments("\"double\"", "DOUBLE"), arguments("\"date\"", "DATE"), arguments("\"time\"", "TIME(6)"), arguments("\"timestamptz\"", "TIMESTAMP_LTZ"), @@ -59,7 +59,7 @@ static Stream icebergSchemas() { // list arguments( "{\"type\":\"list\",\"element-id\":23,\"element\":\"long\",\"element-required\":false}", - "ARRAY(NUMBER(19,0))"), + "ARRAY(LONG)"), // map arguments( "{\"type\":\"map\",\"key-id\":4,\"key\":\"int\",\"value-id\":5,\"value\":\"string\",\"value-required\":false}", @@ -78,8 +78,8 @@ static Stream icebergSchemas() { "{\"type\":\"struct\",\"fields\":[{\"id\":2,\"name\":\"offset\",\"required\":false,\"type\":\"int\"},{\"id\":3,\"name\":\"topic\",\"required\":false,\"type\":\"string\"},{\"id\":4,\"name\":\"partition\",\"required\":false,\"type\":\"int\"},{\"id\":5,\"name\":\"key\",\"required\":false,\"type\":\"string\"},{\"id\":6,\"name\":\"schema_id\",\"required\":false,\"type\":\"int\"},{\"id\":7,\"name\":\"key_schema_id\",\"required\":false,\"type\":\"int\"},{\"id\":8,\"name\":\"CreateTime\",\"required\":false,\"type\":\"long\"},{\"id\":9,\"name\":\"LogAppendTime\",\"required\":false,\"type\":\"long\"},{\"id\":10,\"name\":\"SnowflakeConnectorPushTime\",\"required\":false,\"type\":\"long\"},{\"id\":11,\"name\":\"headers\",\"required\":false,\"type\":{\"type\":\"map\",\"key-id\":12,\"key\":\"string\",\"value-id\":13,\"value\":\"string\",\"value-required\":false}}]}\n", "OBJECT(offset NUMBER(10,0), topic VARCHAR(16777216), partition" + " NUMBER(10,0), key VARCHAR(16777216), schema_id NUMBER(10,0), key_schema_id" - + " NUMBER(10,0), CreateTime NUMBER(19,0), LogAppendTime NUMBER(19,0)," - + " SnowflakeConnectorPushTime NUMBER(19,0), headers MAP(VARCHAR(16777216)," + + " NUMBER(10,0), CreateTime LONG, LogAppendTime LONG," + + " SnowflakeConnectorPushTime LONG, headers MAP(VARCHAR(16777216)," + " VARCHAR(16777216)))")); } @@ -115,8 +115,10 @@ static Stream parseFromJsonArguments() { + "\"vehicle1\" : { \"car\" : { \"brand\" : \"vw\" } }," + "\"vehicle2\" : { \"car\" : { \"brand\" : \"toyota\" } }" + "}}", - "OBJECT(vehicle1 OBJECT(car OBJECT(brand VARCHAR)), " - + "vehicle2 OBJECT(car OBJECT(brand VARCHAR)))"), + "OBJECT(vehicle2 OBJECT(car OBJECT(brand VARCHAR)), " + + "vehicle1 OBJECT(car OBJECT(brand VARCHAR)))"), + // <- todo lol with k1, k2 the order is natural, however it changes an order when I used + // vehicles - inspect it arguments( "{ \"testColumnName\": {" + "\"k1\" : { \"car\" : { \"brand\" : \"vw\" } }," @@ -126,28 +128,25 @@ static Stream parseFromJsonArguments() { + " OBJECT(car" + " OBJECT(brand" + " VARCHAR)))")); - // <- todo lol with k1, k2 the order is natural, however it changes an order when I used - // vehicle1, vehicle2 // arguments("{\"test_array\": [1,2,3] }", "Array not yet implemented")); } @ParameterizedTest @MethodSource("mergeTestArguments") void mergeTwoTreesTest(String plainIcebergSchema, String recordJson, String expectedResult) { - // given - // tree parsed from already existing schema store in a channel + // given tree parsed from channel Type type = IcebergDataTypeParser.deserializeIcebergType(plainIcebergSchema); ApacheIcebergColumnSchema apacheSchema = new ApacheIcebergColumnSchema(type, "TESTSTRUCT"); + IcebergColumnTree alreadyExistingTree = new IcebergColumnTree(apacheSchema); + // tree parsed from a record SinkRecord record = createKafkaRecord(recordJson, false); JsonNode recordNode = RecordService.convertToJson(record.valueSchema(), record.value(), true); IcebergColumnJsonValuePair columnValuePair = IcebergColumnJsonValuePair.from(recordNode.fields().next()); - // parse trees - IcebergColumnTree alreadyExistingTree = new IcebergColumnTree(apacheSchema); + IcebergColumnTree modifiedTree = new IcebergColumnTree(columnValuePair); // then - // merge modified tree alreadyExistingTree.merge(modifiedTree); String expected = expectedResult.replaceAll("/ +/g", " "); diff --git a/src/test/java/com/snowflake/kafka/connector/streaming/iceberg/IcebergIngestionSchemaEvolutionIT.java b/src/test/java/com/snowflake/kafka/connector/streaming/iceberg/IcebergIngestionSchemaEvolutionIT.java index b3a07cd8f..b4f32b798 100644 --- a/src/test/java/com/snowflake/kafka/connector/streaming/iceberg/IcebergIngestionSchemaEvolutionIT.java +++ b/src/test/java/com/snowflake/kafka/connector/streaming/iceberg/IcebergIngestionSchemaEvolutionIT.java @@ -86,13 +86,11 @@ void shouldEvolveSchemaAndInsertRecords( // @Disabled public void shouldResolveNewlyInsertedStructuredObjects() throws Exception { String testStruct1 = "{ \"testStruct\": { \"k1\" : 1, \"k2\" : 2 } }"; - service.insert(Collections.singletonList(createKafkaRecord(testStruct1, 0, false))); - service.insert(Collections.singletonList(createKafkaRecord(testStruct1, 0, false))); + insertWithRetry(testStruct1, 0, false); waitForOffset(1); String testStruct2 = "{ \"testStruct2\": {\"k1\" : 1, \"k3\" : 2 } }"; - service.insert(Collections.singletonList(createKafkaRecord(testStruct2, 1, false))); - service.insert(Collections.singletonList(createKafkaRecord(testStruct2, 1, false))); + insertWithRetry(testStruct2, 1, false); waitForOffset(2); String testStruct3 = @@ -100,8 +98,7 @@ public void shouldResolveNewlyInsertedStructuredObjects() throws Exception { + "\"k1\" : { \"car\" : { \"brand\" : \"vw\" } }," + "\"k2\" : { \"car\" : { \"brand\" : \"toyota\" } }" + "}}"; - service.insert(Collections.singletonList(createKafkaRecord(testStruct3, 2, false))); - service.insert(Collections.singletonList(createKafkaRecord(testStruct3, 2, false))); + insertWithRetry(testStruct3, 2, false); waitForOffset(3); List rows = describeTable(tableName); assertEquals(rows.size(), 4); @@ -111,23 +108,20 @@ public void shouldResolveNewlyInsertedStructuredObjects() throws Exception { @Test // @Disabled public void alterAlreadyExistingStructure() throws Exception { - // insert a structure k1, k2 + // k1, k2 String testStruct1 = "{ \"testStruct\": { \"k1\" : 1, \"k2\" : 2 } }"; - service.insert(Collections.singletonList(createKafkaRecord(testStruct1, 0, false))); - service.insert(Collections.singletonList(createKafkaRecord(testStruct1, 0, false))); + insertWithRetry(testStruct1, 0, false); waitForOffset(1); - // insert the structure but with additional field k3 + // k1, k2 + k3 String testStruct2 = "{ \"testStruct\": { \"k1\" : 1, \"k2\" : 2, \"k3\" : \"foo\" } }"; - service.insert(Collections.singletonList(createKafkaRecord(testStruct2, 1, false))); - service.insert(Collections.singletonList(createKafkaRecord(testStruct2, 1, false))); + insertWithRetry(testStruct2, 1, false); waitForOffset(2); - // k1, k2, k3, k4 + // k1, k2, k3 + k4 String testStruct3 = "{ \"testStruct\": { \"k1\" : 1, \"k2\" : 2, \"k3\" : \"bar\", \"k4\" : 4.5 } }"; - service.insert(Collections.singletonList(createKafkaRecord(testStruct3, 2, false))); - service.insert(Collections.singletonList(createKafkaRecord(testStruct3, 2, false))); + insertWithRetry(testStruct3, 2, false); waitForOffset(3); List columns = describeTable(tableName); @@ -135,19 +129,35 @@ public void alterAlreadyExistingStructure() throws Exception { columns.get(1).getType(), "OBJECT(k1 NUMBER(19,0), k2 NUMBER(19,0), k3 VARCHAR(16777216), k4 FLOAT)"); - // struck without k1 - verify that schema was not evolved back + // k2, k3, k4 String testStruct4 = "{ \"testStruct\": { \"k2\" : 2, \"k3\" : 3, \"k4\" : 4.34 } }"; - service.insert(Collections.singletonList(createKafkaRecord(testStruct4, 3, false))); - service.insert(Collections.singletonList(createKafkaRecord(testStruct4, 3, false))); + insertWithRetry(testStruct4, 3, false); + waitForOffset(4); columns = describeTable(tableName); assertEquals( columns.get(1).getType(), "OBJECT(k1 NUMBER(19,0), k2 NUMBER(19,0), k3 VARCHAR(16777216), k4 FLOAT)"); + + // k5, k6 + String testStruct5 = "{ \"testStruct\": { \"k5\" : 2, \"k6\" : 3 } }"; + insertWithRetry(testStruct5, 4, false); + waitForOffset(5); + + columns = describeTable(tableName); + assertEquals( + columns.get(1).getType(), + "OBJECT(k1 NUMBER(19,0), k2 NUMBER(19,0), k3 VARCHAR(16777216), k4 FLOAT, k5 NUMBER(19,0)," + + " k6 NUMBER(19,0))"); assertEquals(columns.size(), 2); } + private void insertWithRetry(String record, int offset, boolean withSchema) { + service.insert(Collections.singletonList(createKafkaRecord(record, offset, false))); + service.insert(Collections.singletonList(createKafkaRecord(record, offset, false))); + } + private void assertRecordsInTable() { List> recordsWithMetadata = selectAllSchematizedRecords(); @@ -176,6 +186,7 @@ private void assertRecordsInTable() { private static Stream prepareData() { return Stream.of( + // Reading schema from a record is not yet supported. // Arguments.of( // "Primitive JSON with schema", // primitiveJsonWithSchemaExample,