[cdc] Refactor DebeziumRecordParser (apache#2790)
yuzelin authored Jan 25, 2024
1 parent 43ddab5 commit b88938d
Showing 8 changed files with 611 additions and 109 deletions.
docs/content/cdc-ingestion/kafka-cdc.md (8 changes: 5 additions & 3 deletions)
@@ -66,9 +66,11 @@ If a message in a Kafka topic is a change event captured from another database u
 The JSON sources possibly missing some information. For example, Ogg and Maxwell format standards don't contain field
 types; When you write JSON sources into Flink Kafka sink, it will only reserve data and row type and drop other information.
 The synchronization job will try best to handle the problem as follows:
-1. If missing field types, Paimon will use 'STRING' type as default.
-2. If missing database name or table name, you cannot do database synchronization, but you can still do table synchronization.
-3. If missing primary keys, the job might create non primary key table. You can set primary keys when submit job in table
+1. Usually, debezium-json contains 'schema' field, from which Paimon will retrieve data types. Make sure your debezium
+json has this field, or Paimon will use 'STRING' type.
+2. If missing field types, Paimon will use 'STRING' type as default.
+3. If missing database name or table name, you cannot do database synchronization, but you can still do table synchronization.
+4. If missing primary keys, the job might create non primary key table. You can set primary keys when submit job in table
 synchronization.
 {{< /hint >}}
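
For reference, a trimmed, illustrative debezium-json record carrying the 'schema' field the new parser consumes (the envelope produced by Kafka Connect's JsonConverter with schemas.enable=true; column names and values here are made up, and real messages carry additional entries such as "before", "source" and "ts_ms"):

```json
{
  "schema": {
    "type": "struct",
    "fields": [
      {
        "field": "after",
        "type": "struct",
        "fields": [
          {"field": "id", "type": "int32", "optional": false},
          {
            "field": "amount",
            "type": "bytes",
            "optional": true,
            "name": "org.apache.kafka.connect.data.Decimal",
            "parameters": {"scale": "2", "connect.decimal.precision": "10"}
          }
        ]
      }
    ]
  },
  "payload": {
    "before": null,
    "after": {"id": 1, "amount": "AMs="},
    "op": "c"
  }
}
```

Without the "schema" block, only the payload shape remains and Paimon falls back to 'STRING' for field types, as the hint above explains.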

DataFormat.java
@@ -22,7 +22,6 @@
 import org.apache.paimon.flink.action.cdc.TypeMapping;
 import org.apache.paimon.flink.action.cdc.format.canal.CanalRecordParser;
 import org.apache.paimon.flink.action.cdc.format.debezium.DebeziumRecordParser;
-import org.apache.paimon.flink.action.cdc.format.debezium.DebeziumSchemaIncludeRecordParser;
 import org.apache.paimon.flink.action.cdc.format.maxwell.MaxwellRecordParser;
 import org.apache.paimon.flink.action.cdc.format.ogg.OggRecordParser;

@@ -39,8 +38,7 @@ public enum DataFormat {
     CANAL_JSON(CanalRecordParser::new),
     OGG_JSON(OggRecordParser::new),
     MAXWELL_JSON(MaxwellRecordParser::new),
-    DEBEZIUM_JSON(DebeziumRecordParser::new),
-    DEBEZIUM_JSON_SCHEMA_INCLUDE(DebeziumSchemaIncludeRecordParser::new);
+    DEBEZIUM_JSON(DebeziumRecordParser::new);
     // Add more data formats here if needed

     private final RecordParserFactory parser;
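
The refactored DebeziumRecordParser detects the 'schema' field at runtime, so the separate DEBEZIUM_JSON_SCHEMA_INCLUDE entry and its parser class become redundant. For orientation, here is a minimal sketch of the enum-plus-constructor-reference dispatch this class uses; RecordParserFactory is simplified to one argument here, whereas Paimon's real factory passes the (caseSensitive, typeMapping, computedColumns) triple seen in the DebeziumRecordParser constructor below:

```java
// Sketch of DataFormat-style dispatch: each enum value holds a constructor
// reference and builds its parser on demand. Types are simplified stand-ins,
// not Paimon's actual interfaces.
public class FormatDispatchSketch {

    interface RecordParser {
        void parse(String record);
    }

    interface RecordParserFactory {
        RecordParser create(boolean caseSensitive);
    }

    enum DataFormat {
        // A single entry now serves both schema-full and schema-less input.
        DEBEZIUM_JSON(DebeziumParser::new);

        private final RecordParserFactory factory;

        DataFormat(RecordParserFactory factory) {
            this.factory = factory;
        }

        RecordParser createParser(boolean caseSensitive) {
            return factory.create(caseSensitive);
        }
    }

    static class DebeziumParser implements RecordParser {
        DebeziumParser(boolean caseSensitive) {}

        @Override
        public void parse(String record) {
            System.out.println("parsed: " + record);
        }
    }

    public static void main(String[] args) {
        DataFormat.DEBEZIUM_JSON.createParser(true).parse("{\"op\":\"c\"}");
    }
}
```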
DebeziumRecordParser.java
@@ -22,16 +22,26 @@
import org.apache.paimon.flink.action.cdc.TypeMapping;
import org.apache.paimon.flink.action.cdc.format.RecordParser;
import org.apache.paimon.flink.sink.cdc.RichCdcMultiplexRecord;
import org.apache.paimon.types.DataType;
import org.apache.paimon.types.RowKind;
import org.apache.paimon.utils.JsonSerdeUtil;
import org.apache.paimon.utils.Preconditions;

import org.apache.paimon.shade.jackson2.com.fasterxml.jackson.core.type.TypeReference;
import org.apache.paimon.shade.jackson2.com.fasterxml.jackson.databind.JsonNode;
import org.apache.paimon.shade.jackson2.com.fasterxml.jackson.databind.node.ArrayNode;

import javax.annotation.Nullable;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import static org.apache.paimon.utils.JsonSerdeUtil.getNodeAs;
import static org.apache.paimon.utils.JsonSerdeUtil.isNull;

@@ -66,6 +76,11 @@ public class DebeziumRecordParser extends RecordParser {
    private static final String OP_DELETE = "d";
    private static final String OP_READE = "r";

    private boolean hasSchema;
    private final Map<String, String> debeziumTypes = new HashMap<>();
    private final Map<String, String> classNames = new HashMap<>();
    private final Map<String, Map<String, String>> parameters = new HashMap<>();

    public DebeziumRecordParser(
            boolean caseSensitive, TypeMapping typeMapping, List<ComputedColumn> computedColumns) {
        super(caseSensitive, typeMapping, computedColumns);

@@ -105,13 +120,98 @@ private JsonNode getBefore(String op) {
    @Override
    protected void setRoot(String record) {
        JsonNode node = JsonSerdeUtil.fromJson(record, JsonNode.class);

        hasSchema = false;
        if (node.has(FIELD_SCHEMA)) {
            root = node.get(FIELD_PAYLOAD);
            JsonNode schema = node.get(FIELD_SCHEMA);
            if (!isNull(schema)) {
                parseSchema(schema);
                hasSchema = true;
            }
        } else {
            root = node;
        }
    }
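
A standalone illustration of the branch above, using plain Jackson instead of Paimon's shaded copy, and assuming the constants FIELD_SCHEMA and FIELD_PAYLOAD (defined in a collapsed part of this file) are the literals "schema" and "payload":

```java
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

public class EnvelopeSketch {
    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        String withSchema = "{\"schema\":{\"fields\":[]},\"payload\":{\"op\":\"c\"}}";
        String withoutSchema = "{\"op\":\"c\"}";
        for (String record : new String[] {withSchema, withoutSchema}) {
            JsonNode node = mapper.readTree(record);
            // schemas.enable=true wraps the change event in {schema, payload};
            // otherwise the record itself is the change event.
            JsonNode root = node.has("schema") ? node.get("payload") : node;
            System.out.println(root.get("op")); // "c" in both cases
        }
    }
}
```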

    private void parseSchema(JsonNode schema) {
        debeziumTypes.clear();
        classNames.clear();
        parameters.clear();

        ArrayNode schemaFields = getNodeAs(schema, "fields", ArrayNode.class);
        Preconditions.checkNotNull(schemaFields);

        ArrayNode fields = null;
        for (int i = 0; i < schemaFields.size(); i++) {
            JsonNode node = schemaFields.get(i);
            if (getString(node, "field").equals("after")) {
                fields = getNodeAs(node, "fields", ArrayNode.class);
                break;
            } else if (getString(node, "field").equals("before")) {
                if (fields == null) {
                    fields = getNodeAs(node, "fields", ArrayNode.class);
                }
            }
        }
        Preconditions.checkNotNull(fields);

        for (JsonNode node : fields) {
            String field = getString(node, "field");

            debeziumTypes.put(field, getString(node, "type"));
            classNames.put(field, getString(node, "name"));

            JsonNode parametersNode = node.get("parameters");
            Map<String, String> parametersMap =
                    isNull(parametersNode)
                            ? Collections.emptyMap()
                            : JsonSerdeUtil.convertValue(
                                    parametersNode,
                                    new TypeReference<HashMap<String, String>>() {});

            parameters.put(field, parametersMap);
        }
    }
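
A compact, runnable sketch of the walk parseSchema performs over schema.fields: locate the "after" struct (falling back to "before"), then record each column's Debezium type. Plain Jackson stands in for Paimon's shaded copy, and only the type map is shown; the real method also records the semantic class name and the parameters map:

```java
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import java.util.LinkedHashMap;
import java.util.Map;

public class SchemaWalkSketch {
    public static void main(String[] args) throws Exception {
        String schemaJson =
                "{\"fields\":[{\"field\":\"after\",\"fields\":["
                        + "{\"field\":\"id\",\"type\":\"int32\"},"
                        + "{\"field\":\"amount\",\"type\":\"bytes\","
                        + "\"name\":\"org.apache.kafka.connect.data.Decimal\"}]}]}";
        JsonNode schema = new ObjectMapper().readTree(schemaJson);
        Map<String, String> debeziumTypes = new LinkedHashMap<>();
        for (JsonNode field : schema.get("fields")) {
            if ("after".equals(field.get("field").asText())) {
                for (JsonNode col : field.get("fields")) {
                    debeziumTypes.put(col.get("field").asText(), col.get("type").asText());
                }
            }
        }
        System.out.println(debeziumTypes); // {id=int32, amount=bytes}
    }
}
```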

    @Nullable
    private String getString(JsonNode node, String fieldName) {
        JsonNode fieldValue = node.get(fieldName);
        return isNull(fieldValue) ? null : fieldValue.asText();
    }

    @Override
    protected Map<String, String> extractRowData(
            JsonNode record, LinkedHashMap<String, DataType> paimonFieldTypes) {
        if (!hasSchema) {
            return super.extractRowData(record, paimonFieldTypes);
        }

        Map<String, Object> recordMap =
                JsonSerdeUtil.convertValue(record, new TypeReference<Map<String, Object>>() {});
        LinkedHashMap<String, String> resultMap = new LinkedHashMap<>();
        for (Map.Entry<String, Object> entry : recordMap.entrySet()) {
            String fieldName = entry.getKey();
            String rawValue = Objects.toString(entry.getValue(), null);
            String debeziumType = debeziumTypes.get(fieldName);
            String className = classNames.get(fieldName);

            String transformed =
                    DebeziumSchemaUtils.transformRawValue(
                            rawValue, debeziumType, className, typeMapping, record.get(fieldName));
            resultMap.put(fieldName, transformed);

            paimonFieldTypes.put(
                    fieldName,
                    DebeziumSchemaUtils.toDataType(
                            debeziumType, className, parameters.get(fieldName)));
        }

        evalComputedColumns(resultMap, paimonFieldTypes);

        return resultMap;
    }
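
For a schema entry like the 'amount' column in the earlier example (type "bytes", class name org.apache.kafka.connect.data.Decimal), Kafka Connect encodes the value as a base64 unscaled integer, with the scale carried in the field's parameters; toDataType would accordingly yield a decimal Paimon type from those parameters. The snippet below is a self-contained sketch of the decoding transformRawValue is expected to perform for that class name, based on Connect's documented Decimal encoding rather than on DebeziumSchemaUtils' source:

```java
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.Base64;

public class DecimalDecodeSketch {
    public static void main(String[] args) {
        String rawValue = "AMs="; // base64 of the two-byte unscaled value 203
        int scale = 2;            // taken from the field's "parameters" map
        byte[] unscaled = Base64.getDecoder().decode(rawValue);
        BigDecimal value = new BigDecimal(new BigInteger(unscaled), scale);
        System.out.println(value); // prints 2.03
    }
}
```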

    @Override
    protected String primaryField() {
        return FIELD_PRIMARY;

DebeziumSchemaIncludeRecordParser.java

This file was deleted.
