diff --git a/core/trino-server/src/main/provisio/presto.xml b/core/trino-server/src/main/provisio/presto.xml index 48190e7b21a7..02575a105ae4 100644 --- a/core/trino-server/src/main/provisio/presto.xml +++ b/core/trino-server/src/main/provisio/presto.xml @@ -74,6 +74,12 @@ + + + + + + diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/BackgroundHiveSplitLoader.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/BackgroundHiveSplitLoader.java index 5546808b2bdb..37f33bc6cfaa 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/BackgroundHiveSplitLoader.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/BackgroundHiveSplitLoader.java @@ -28,7 +28,6 @@ import io.trino.plugin.hive.HiveSplit.BucketConversion; import io.trino.plugin.hive.HiveSplit.BucketValidation; import io.trino.plugin.hive.acid.AcidTransaction; -import io.trino.plugin.hive.metastore.Column; import io.trino.plugin.hive.metastore.Partition; import io.trino.plugin.hive.metastore.Table; import io.trino.plugin.hive.util.HiveBucketing.BucketingVersion; @@ -100,8 +99,6 @@ import static io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA; import static io.trino.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR; import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_BUCKET_FILES; -import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA; -import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_PARTITION_VALUE; import static io.trino.plugin.hive.HiveErrorCode.HIVE_UNKNOWN_ERROR; import static io.trino.plugin.hive.HivePartitionManager.partitionMatches; import static io.trino.plugin.hive.HiveSessionProperties.getMaxInitialSplitSize; @@ -114,11 +111,11 @@ import static io.trino.plugin.hive.util.HiveFileIterator.NestedDirectoryPolicy.FAIL; import static io.trino.plugin.hive.util.HiveFileIterator.NestedDirectoryPolicy.IGNORED; import static io.trino.plugin.hive.util.HiveFileIterator.NestedDirectoryPolicy.RECURSE; -import static 
io.trino.plugin.hive.util.HiveUtil.checkCondition; import static io.trino.plugin.hive.util.HiveUtil.getFooterCount; import static io.trino.plugin.hive.util.HiveUtil.getHeaderCount; import static io.trino.plugin.hive.util.HiveUtil.getInputFormat; import static io.trino.plugin.hive.util.HiveUtil.getPartitionKeyColumnHandles; +import static io.trino.plugin.hive.util.HiveUtil.getPartitionKeys; import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; import static java.lang.Integer.parseInt; import static java.lang.Math.max; @@ -942,28 +939,6 @@ private static List getTargetPathsFromSymlink(FileSystem fileSystem, Path } } - private static List getPartitionKeys(Table table, Optional partition) - { - if (partition.isEmpty()) { - return ImmutableList.of(); - } - ImmutableList.Builder partitionKeys = ImmutableList.builder(); - List keys = table.getPartitionColumns(); - List values = partition.get().getValues(); - checkCondition(keys.size() == values.size(), HIVE_INVALID_METADATA, "Expected %s partition key values, but got %s", keys.size(), values.size()); - for (int i = 0; i < keys.size(); i++) { - String name = keys.get(i).getName(); - HiveType hiveType = keys.get(i).getType(); - if (!hiveType.isSupportedType(table.getStorage().getStorageFormat())) { - throw new TrinoException(NOT_SUPPORTED, format("Unsupported Hive type %s found in partition keys of table %s.%s", hiveType, table.getDatabaseName(), table.getTableName())); - } - String value = values.get(i); - checkCondition(value != null, HIVE_INVALID_PARTITION_VALUE, "partition key value cannot be null for field: %s", name); - partitionKeys.add(new HivePartitionKey(name, value)); - } - return partitionKeys.build(); - } - private static Properties getPartitionSchema(Table table, Optional partition) { if (partition.isEmpty()) { diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java 
index 1aeee10e3e85..8ee548f8a89d 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java @@ -365,7 +365,7 @@ public static Optional getColumnType(HiveColumnH return Optional.of(new GroupType(baseType.getRepetition(), baseType.getName(), ImmutableList.of(type))); } - private static Optional getColumnIndexStore( + public static Optional getColumnIndexStore( ParquetDataSource dataSource, BlockMetaData blockMetadata, Map, RichColumnDescriptor> descriptorsByPath, @@ -440,7 +440,7 @@ public static TupleDomain getParquetTupleDomain( return TupleDomain.withColumnDomains(predicate.buildOrThrow()); } - private static org.apache.parquet.schema.Type getParquetType(HiveColumnHandle column, MessageType messageType, boolean useParquetColumnNames) + public static org.apache.parquet.schema.Type getParquetType(HiveColumnHandle column, MessageType messageType, boolean useParquetColumnNames) { if (useParquetColumnNames) { return getParquetTypeByName(column.getBaseColumnName(), messageType); diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java index 9486f51f3d8d..0a8e611088c9 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java @@ -32,6 +32,7 @@ import io.trino.plugin.hive.HiveType; import io.trino.plugin.hive.avro.TrinoAvroSerDe; import io.trino.plugin.hive.metastore.Column; +import io.trino.plugin.hive.metastore.Partition; import io.trino.plugin.hive.metastore.SortingColumn; import io.trino.plugin.hive.metastore.Table; import io.trino.spi.ErrorCodeSupplier; @@ -1128,4 +1129,26 @@ public static boolean isIcebergTable(Table table) { return 
ICEBERG_TABLE_TYPE_VALUE.equalsIgnoreCase(table.getParameters().get(ICEBERG_TABLE_TYPE_NAME)); } + + public static List getPartitionKeys(Table table, Optional partition) + { + if (partition.isEmpty()) { + return ImmutableList.of(); + } + ImmutableList.Builder partitionKeys = ImmutableList.builder(); + List keys = table.getPartitionColumns(); + List values = partition.get().getValues(); + checkCondition(keys.size() == values.size(), HIVE_INVALID_METADATA, "Expected %s partition key values, but got %s", keys.size(), values.size()); + for (int i = 0; i < keys.size(); i++) { + String name = keys.get(i).getName(); + HiveType hiveType = keys.get(i).getType(); + if (!hiveType.isSupportedType(table.getStorage().getStorageFormat())) { + throw new TrinoException(NOT_SUPPORTED, format("Unsupported Hive type %s found in partition keys of table %s.%s", hiveType, table.getDatabaseName(), table.getTableName())); + } + String value = values.get(i); + checkCondition(value != null, HIVE_INVALID_PARTITION_VALUE, "partition key value cannot be null for field: %s", name); + partitionKeys.add(new HivePartitionKey(name, value)); + } + return partitionKeys.build(); + } } diff --git a/plugin/trino-hudi/pom.xml b/plugin/trino-hudi/pom.xml new file mode 100644 index 000000000000..a98763cd29c2 --- /dev/null +++ b/plugin/trino-hudi/pom.xml @@ -0,0 +1,371 @@ + + + 4.0.0 + + + trino-root + io.trino + 370-SNAPSHOT + ../../pom.xml + + + trino-hudi + Trino - Hudi Connector + trino-plugin + + + ${project.parent.basedir} + 0.10.0 + + + + + io.trino + trino-hive + + + io.trino + trino-memory-context + + + io.trino + trino-parquet + + + io.trino + trino-plugin-toolkit + + + io.trino.hadoop + hadoop-apache + + + io.trino.hive + hive-apache + + + io.airlift + bootstrap + + + io.airlift + configuration + + + io.airlift + event + + + io.airlift + json + + + io.airlift + log + + + io.airlift + units + + + com.google.guava + guava + + + com.google.inject + guice + + + javax.inject + javax.inject + + + 
javax.validation + validation-api + + + joda-time + joda-time + + + org.apache.hudi + hudi-common + ${dep.hudi.version} + + + org.apache.hbase + hbase-server + + + org.apache.orc + orc-core + + + com.fasterxml.jackson.core + jackson-annotations + + + com.fasterxml.jackson.core + jackson-databind + + + org.apache.httpcomponents + httpclient + + + org.apache.httpcomponents + fluent-hc + + + org.rocksdb + rocksdbjni + + + com.esotericsoftware + kryo-shaded + + + org.apache.hadoop + hadoop-client + + + org.apache.httpcomponents + httpcore + + + org.apache.hive + hive-exec + + + org.apache.hive + hive-jdbc + + + + + org.apache.hudi + hudi-hadoop-mr + ${dep.hudi.version} + + + org.apache.hbase + hbase-server + + + org.apache.orc + orc-core + + + com.fasterxml.jackson.core + jackson-annotations + + + com.fasterxml.jackson.core + jackson-databind + + + org.apache.httpcomponents + httpclient + + + org.apache.httpcomponents + fluent-hc + + + org.rocksdb + rocksdbjni + + + com.esotericsoftware + kryo-shaded + + + org.apache.hadoop + hadoop-client + + + org.apache.httpcomponents + httpcore + + + org.apache.hive + hive-exec + + + org.apache.hive + hive-jdbc + + + + + org.apache.hudi + hudi-hive-sync + ${dep.hudi.version} + + + org.apache.hadoop + hadoop-common + + + org.apache.hadoop + hadoop-client + + + org.apache.hadoop + hadoop-hdfs + + + org.apache.hadoop + hadoop-auth + + + org.apache.hive + hive-common + + + org.apache.hive + hive-jdbc + + + org.apache.hive + hive-metastore + + + org.apache.hive + hive-service + + + org.apache.hudi + hudi-common + + + org.apache.hudi + hudi-hadoop-mr + + + org.apache.hudi + hudi-sync-common + + + org.apache.parquet + parquet-avro + + + log4j + log4j + + + com.beust + jcommander + + + servletapi + servletapi + + + + + org.weakref + jmxutils + + + + io.trino + trino-spi + provided + + + io.airlift + slice + provided + + + com.fasterxml.jackson.core + jackson-annotations + provided + + + org.openjdk.jol + jol-core + provided + + + + + 
io.trino + trino-hive + test-jar + test + + + io.trino + trino-hive-hadoop2 + test + + + io.trino + trino-main + test + + + io.trino + trino-main + test-jar + test + + + io.trino + trino-parser + test + + + io.trino + trino-spi + test-jar + test + + + io.trino + trino-testing + test + + + io.trino + trino-testing-services + test + + + io.trino + trino-tpch + test + + + io.trino.tpch + tpch + test + + + io.airlift + testing + test + + + org.assertj + assertj-core + test + + + org.jetbrains + annotations + test + + + org.testng + testng + test + + + + diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiConfig.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiConfig.java new file mode 100644 index 000000000000..1b84fa5bc635 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiConfig.java @@ -0,0 +1,198 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.trino.plugin.hudi; + +import io.airlift.configuration.Config; +import io.airlift.configuration.ConfigDescription; +import io.airlift.units.DataSize; +import org.apache.hudi.common.model.HoodieFileFormat; + +import javax.validation.constraints.DecimalMax; +import javax.validation.constraints.DecimalMin; +import javax.validation.constraints.Min; +import javax.validation.constraints.NotNull; + +import static io.airlift.units.DataSize.Unit.MEGABYTE; +import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET; + +public class HudiConfig +{ + private HoodieFileFormat baseFileFormat = PARQUET; + private boolean metadataEnabled; + private boolean shouldSkipMetaStoreForPartition; + private boolean shouldUseParquetColumnNames = true; + private int partitionScannerParallelism = 16; + private int splitGeneratorParallelism = 16; + private int minPartitionBatchSize = 10; + private int maxPartitionBatchSize = 100; + private boolean sizeBasedSplitWeightsEnabled = true; + private DataSize standardSplitWeightSize = DataSize.of(128, MEGABYTE); + private double minimumAssignedSplitWeight = 0.05; + + @NotNull + public HoodieFileFormat getBaseFileFormat() + { + return HoodieFileFormat.valueOf(baseFileFormat.name()); + } + + @Config("hudi.base-file-format") + public HudiConfig setBaseFileFormat(HoodieFileFormat baseFileFormat) + { + this.baseFileFormat = baseFileFormat; + return this; + } + + @Config("hudi.metadata-enabled") + @ConfigDescription("Fetch the list of file names and sizes from metadata rather than storage") + public HudiConfig setMetadataEnabled(boolean metadataEnabled) + { + this.metadataEnabled = metadataEnabled; + return this; + } + + @NotNull + public boolean isMetadataEnabled() + { + return this.metadataEnabled; + } + + @Config("hudi.skip-metastore-for-partition") + @ConfigDescription("By default, partition info is fetched from the metastore. 
" + + "When this config is enabled, then the partition info is fetched using Hudi's partition extractor and relative partition path.") + public HudiConfig setSkipMetaStoreForPartition(boolean shouldSkipMetaStoreForPartition) + { + this.shouldSkipMetaStoreForPartition = shouldSkipMetaStoreForPartition; + return this; + } + + @NotNull + public boolean getSkipMetaStoreForPartition() + { + return this.shouldSkipMetaStoreForPartition; + } + + @Config("hudi.use-parquet-column-names") + @ConfigDescription("Access parquet columns using names from the file. If disabled, then columns are accessed using index." + + "Only applicable to parquet file format.") + public HudiConfig setUseParquetColumnNames(boolean shouldUseParquetColumnNames) + { + this.shouldUseParquetColumnNames = shouldUseParquetColumnNames; + return this; + } + + @NotNull + public boolean getUseParquetColumnNames() + { + return this.shouldUseParquetColumnNames; + } + + @Config("hudi.partition-scanner-parallelism") + @ConfigDescription("Number of threads to use for partition scanners") + public HudiConfig setPartitionScannerParallelism(int partitionScannerParallelism) + { + this.partitionScannerParallelism = partitionScannerParallelism; + return this; + } + + @NotNull + public int getPartitionScannerParallelism() + { + return this.partitionScannerParallelism; + } + + @Config("hudi.split-generator-parallelism") + @ConfigDescription("Number of threads to use for split generators") + public HudiConfig setSplitGeneratorParallelism(int splitGeneratorParallelism) + { + this.splitGeneratorParallelism = splitGeneratorParallelism; + return this; + } + + @NotNull + public int getSplitGeneratorParallelism() + { + return this.splitGeneratorParallelism; + } + + @Config("hudi.min-partition-batch-size") + public HudiConfig setMinPartitionBatchSize(int minPartitionBatchSize) + { + this.minPartitionBatchSize = minPartitionBatchSize; + return this; + } + + @Min(1) + public int getMinPartitionBatchSize() + { + return 
minPartitionBatchSize; + } + + @Config("hudi.max-partition-batch-size") + public HudiConfig setMaxPartitionBatchSize(int maxPartitionBatchSize) + { + this.maxPartitionBatchSize = maxPartitionBatchSize; + return this; + } + + @Min(1) + public int getMaxPartitionBatchSize() + { + return maxPartitionBatchSize; + } + + @Config("hudi.size-based-split-weights-enabled") + @ConfigDescription("Unlike uniform splitting, size-based splitting ensures that each batch of splits has enough data to process. " + + "By default, it is enabled to improve performance") + public HudiConfig setSizeBasedSplitWeightsEnabled(boolean sizeBasedSplitWeightsEnabled) + { + this.sizeBasedSplitWeightsEnabled = sizeBasedSplitWeightsEnabled; + return this; + } + + public boolean isSizeBasedSplitWeightsEnabled() + { + return sizeBasedSplitWeightsEnabled; + } + + @Config("hudi.standard-split-weight-size") + @ConfigDescription("The split size corresponding to the standard weight (1.0) " + + "when size based split weights are enabled") + public HudiConfig setStandardSplitWeightSize(DataSize standardSplitWeightSize) + { + this.standardSplitWeightSize = standardSplitWeightSize; + return this; + } + + @NotNull + public DataSize getStandardSplitWeightSize() + { + return standardSplitWeightSize; + } + + @Config("hudi.minimum-assigned-split-weight") + @ConfigDescription("Minimum weight that a split can be assigned when size based split weights are enabled") + public HudiConfig setMinimumAssignedSplitWeight(double minimumAssignedSplitWeight) + { + this.minimumAssignedSplitWeight = minimumAssignedSplitWeight; + return this; + } + + @DecimalMax("1") + @DecimalMin(value = "0", inclusive = false) + public double getMinimumAssignedSplitWeight() + { + return minimumAssignedSplitWeight; + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiConnector.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiConnector.java new file mode 100644 index 000000000000..ec7e0e39fa96 --- /dev/null 
// plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiConnector.java
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.trino.plugin.hudi;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import io.airlift.bootstrap.LifeCycleManager;
import io.airlift.log.Logger;
import io.trino.plugin.base.classloader.ClassLoaderSafeConnectorMetadata;
import io.trino.plugin.base.session.SessionPropertiesProvider;
import io.trino.plugin.hive.HiveTransactionHandle;
import io.trino.spi.classloader.ThreadContextClassLoader;
import io.trino.spi.connector.Connector;
import io.trino.spi.connector.ConnectorAccessControl;
import io.trino.spi.connector.ConnectorHandleResolver;
import io.trino.spi.connector.ConnectorMetadata;
import io.trino.spi.connector.ConnectorNodePartitioningProvider;
import io.trino.spi.connector.ConnectorPageSourceProvider;
import io.trino.spi.connector.ConnectorSplitManager;
import io.trino.spi.connector.ConnectorTransactionHandle;
import io.trino.spi.connector.SystemTable;
import io.trino.spi.session.PropertyMetadata;
import io.trino.spi.transaction.IsolationLevel;

import java.util.List;
import java.util.Optional;
import java.util.Set;

import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.trino.spi.transaction.IsolationLevel.SERIALIZABLE;
import static io.trino.spi.transaction.IsolationLevel.checkConnectorSupports;
import static java.util.Objects.requireNonNull;

/**
 * {@link Connector} implementation for Hudi tables.
 * <p>
 * The class is a holder for the collaborators handed to its constructor and a small
 * transaction registry: {@link #beginTransaction} registers a fresh metadata instance
 * under a new handle, {@link #commit} and {@link #rollback} unregister it.
 */
public class HudiConnector
        implements Connector
{
    private static final Logger log = Logger.get(HudiConnector.class);

    private final LifeCycleManager lifeCycleManager;
    private final HudiTransactionManager transactionManager;
    private final HudiMetadataFactory metadataFactory;
    private final ConnectorSplitManager splitManager;
    private final ConnectorPageSourceProvider pageSourceProvider;
    private final ConnectorNodePartitioningProvider nodePartitioningProvider;
    private final Set<SystemTable> systemTables;
    private final List<PropertyMetadata<?>> sessionProperties;
    private final List<PropertyMetadata<?>> tableProperties;
    private final Optional<ConnectorAccessControl> accessControl;

    /**
     * Creates the connector. Every argument must be non-null. Collections are copied into
     * immutable ones, and the session properties of all providers are flattened into a
     * single list.
     */
    public HudiConnector(
            LifeCycleManager lifeCycleManager,
            HudiTransactionManager transactionManager,
            HudiMetadataFactory metadataFactory,
            ConnectorSplitManager splitManager,
            ConnectorPageSourceProvider pageSourceProvider,
            ConnectorNodePartitioningProvider nodePartitioningProvider,
            Set<SystemTable> systemTables,
            Set<SessionPropertiesProvider> sessionPropertiesProviders,
            List<PropertyMetadata<?>> tableProperties,
            Optional<ConnectorAccessControl> accessControl)
    {
        this.lifeCycleManager = requireNonNull(lifeCycleManager, "lifeCycleManager is null");
        this.transactionManager = requireNonNull(transactionManager, "transactionManager is null");
        this.metadataFactory = requireNonNull(metadataFactory, "metadataFactory is null");
        this.splitManager = requireNonNull(splitManager, "splitManager is null");
        this.pageSourceProvider = requireNonNull(pageSourceProvider, "pageSourceProvider is null");
        this.nodePartitioningProvider = requireNonNull(nodePartitioningProvider, "nodePartitioningProvider is null");
        this.systemTables = ImmutableSet.copyOf(requireNonNull(systemTables, "systemTables is null"));
        this.sessionProperties = requireNonNull(sessionPropertiesProviders, "sessionPropertiesProviders is null").stream()
                .flatMap(sessionPropertiesProvider -> sessionPropertiesProvider.getSessionProperties().stream())
                .collect(toImmutableList());
        this.tableProperties = ImmutableList.copyOf(requireNonNull(tableProperties, "tableProperties is null"));
        this.accessControl = requireNonNull(accessControl, "accessControl is null");
    }

    @Override
    public Optional<ConnectorHandleResolver> getHandleResolver()
    {
        return Optional.of(new HudiHandleResolver());
    }

    /**
     * Returns the metadata registered for the given transaction, wrapped so that calls
     * into it run with this plugin's class loader as the thread context class loader.
     */
    @Override
    public ConnectorMetadata getMetadata(ConnectorTransactionHandle transactionHandle)
    {
        ConnectorMetadata metadata = transactionManager.get(transactionHandle);
        return new ClassLoaderSafeConnectorMetadata(metadata, getClass().getClassLoader());
    }

    @Override
    public ConnectorSplitManager getSplitManager()
    {
        return splitManager;
    }

    @Override
    public ConnectorPageSourceProvider getPageSourceProvider()
    {
        return pageSourceProvider;
    }

    @Override
    public ConnectorNodePartitioningProvider getNodePartitioningProvider()
    {
        return nodePartitioningProvider;
    }

    @Override
    public Set<SystemTable> getSystemTables()
    {
        return systemTables;
    }

    @Override
    public List<PropertyMetadata<?>> getSessionProperties()
    {
        return sessionProperties;
    }

    @Override
    public List<PropertyMetadata<?>> getTableProperties()
    {
        return tableProperties;
    }

    /**
     * @throws UnsupportedOperationException when no access control was supplied at construction
     */
    @Override
    public ConnectorAccessControl getAccessControl()
    {
        return accessControl.orElseThrow(UnsupportedOperationException::new);
    }

    /**
     * Starts a transaction: verifies the requested isolation level is supported, creates a
     * new handle and registers a metadata instance (created under the plugin class loader)
     * for it.
     */
    @Override
    public ConnectorTransactionHandle beginTransaction(IsolationLevel isolationLevel, boolean readOnly, boolean autoCommit)
    {
        checkConnectorSupports(SERIALIZABLE, isolationLevel);
        ConnectorTransactionHandle transaction = new HiveTransactionHandle();
        try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(getClass().getClassLoader())) {
            transactionManager.put(transaction, metadataFactory.create());
        }
        return transaction;
    }

    @Override
    public void commit(ConnectorTransactionHandle transaction)
    {
        transactionManager.remove(transaction);
    }

    /**
     * Unregisters the transaction and rolls its metadata back.
     *
     * @throws IllegalArgumentException when no metadata is registered for {@code transaction}
     */
    @Override
    public void rollback(ConnectorTransactionHandle transaction)
    {
        HudiMetadata metadata = transactionManager.remove(transaction);
        // Presumably remove() yields null for an unknown handle (HudiTransactionManager is not
        // visible here); fail with a descriptive message instead of a bare NullPointerException.
        if (metadata == null) {
            throw new IllegalArgumentException("no such transaction: " + transaction);
        }
        try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(getClass().getClassLoader())) {
            metadata.rollback();
        }
    }

    /**
     * Stops the life cycle manager. Failures are logged rather than propagated, so shutdown
     * never throws.
     */
    @Override
    public final void shutdown()
    {
        try {
            lifeCycleManager.stop();
        }
        catch (Exception e) {
            log.error(e, "Error shutting down connector");
        }
    }
}
// plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiConnectorFactory.java
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.trino.plugin.hudi;

import com.google.inject.Binder;
import com.google.inject.Module;
import io.trino.spi.connector.Connector;
import io.trino.spi.connector.ConnectorContext;
import io.trino.spi.connector.ConnectorFactory;

import java.lang.reflect.InvocationTargetException;
import java.util.Map;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Strings.isNullOrEmpty;
import static com.google.common.base.Throwables.throwIfUnchecked;
import static java.util.Objects.requireNonNull;

/**
 * {@link ConnectorFactory} that builds the Hudi connector inside a duplicated plugin
 * class loader. All classes involved are re-loaded through that loader and the real
 * factory method is invoked reflectively.
 */
public class HudiConnectorFactory
        implements ConnectorFactory
{
    private final String name;
    private final Class<? extends Module> module;

    public HudiConnectorFactory(String name)
    {
        this(name, EmptyModule.class);
    }

    /**
     * @param name connector name; must be neither null nor empty
     * @param module extra Guice module class; it is instantiated through its public no-arg constructor
     */
    public HudiConnectorFactory(String name, Class<? extends Module> module)
    {
        checkArgument(!isNullOrEmpty(name), "name is null or empty");
        this.name = name;
        this.module = requireNonNull(module, "module is null");
    }

    @Override
    public String getName()
    {
        return name;
    }

    /**
     * Creates the connector by calling {@code InternalHudiConnectorFactory.createConnector}
     * through the duplicated class loader. An unchecked exception thrown by the target is
     * rethrown as-is; anything else is wrapped in a {@link RuntimeException}.
     */
    @Override
    public Connector create(String catalogName, Map<String, String> config, ConnectorContext context)
    {
        ClassLoader classLoader = context.duplicatePluginClassLoader();
        try {
            Object moduleInstance = classLoader.loadClass(module.getName()).getConstructor().newInstance();
            // The parameter type must come from the same loader, otherwise getMethod() finds no match
            Class<?> moduleClass = classLoader.loadClass(Module.class.getName());
            return (Connector) classLoader.loadClass(InternalHudiConnectorFactory.class.getName())
                    .getMethod("createConnector", String.class, Map.class, ConnectorContext.class, moduleClass)
                    .invoke(null, catalogName, config, context, moduleInstance);
        }
        catch (InvocationTargetException e) {
            Throwable targetException = e.getTargetException();
            throwIfUnchecked(targetException);
            throw new RuntimeException(targetException);
        }
        catch (ReflectiveOperationException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Module that binds nothing; used when the caller supplies no extra module.
     */
    public static class EmptyModule
            implements Module
    {
        @Override
        public void configure(Binder binder) {}
    }
}

// plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiErrorCode.java
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.trino.plugin.hudi;

import io.trino.spi.ErrorCode;
import io.trino.spi.ErrorCodeSupplier;
import io.trino.spi.ErrorType;

import static io.trino.spi.ErrorType.EXTERNAL;
import static io.trino.spi.ErrorType.INTERNAL_ERROR;
import static io.trino.spi.ErrorType.USER_ERROR;

/**
 * Error codes raised by the Hudi connector. Each constant is mapped to an
 * {@link ErrorCode} whose numeric value is the constant's own code plus a fixed offset.
 */
public enum HudiErrorCode
        implements ErrorCodeSupplier
{
    HUDI_UNKNOWN_TABLE_TYPE(0, EXTERNAL),
    HUDI_INVALID_METADATA(1, EXTERNAL),
    HUDI_TOO_MANY_OPEN_PARTITIONS(2, USER_ERROR),
    HUDI_INVALID_PARTITION_VALUE(3, EXTERNAL),
    HUDI_BAD_DATA(4, EXTERNAL),
    HUDI_MISSING_DATA(5, EXTERNAL),
    HUDI_CANNOT_OPEN_SPLIT(6, EXTERNAL),
    HUDI_WRITER_OPEN_ERROR(7, EXTERNAL),
    HUDI_FILESYSTEM_ERROR(8, EXTERNAL),
    HUDI_CURSOR_ERROR(9, EXTERNAL),
    HUDI_WRITE_VALIDATION_FAILED(10, INTERNAL_ERROR),
    HUDI_INVALID_SNAPSHOT_ID(11, USER_ERROR);

    private final ErrorCode errorCode;

    HudiErrorCode(int code, ErrorType type)
    {
        // NOTE(review): the 0x0100_0000 offset looks like the range the Hive connector already
        // uses for its own error codes — confirm it is unique before merging; left unchanged here.
        errorCode = new ErrorCode(code + 0x0100_0000, name(), type);
    }

    @Override
    public ErrorCode toErrorCode()
    {
        return errorCode;
    }
}

// plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiHandleResolver.java
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.trino.plugin.hudi;

import io.trino.plugin.hive.HiveColumnHandle;
import io.trino.plugin.hive.HiveInsertTableHandle;
import io.trino.plugin.hive.HiveOutputTableHandle;
import io.trino.plugin.hive.HiveTransactionHandle;
import io.trino.spi.connector.ColumnHandle;
import io.trino.spi.connector.ConnectorHandleResolver;
import io.trino.spi.connector.ConnectorInsertTableHandle;
import io.trino.spi.connector.ConnectorOutputTableHandle;
import io.trino.spi.connector.ConnectorSplit;
import io.trino.spi.connector.ConnectorTableHandle;
import io.trino.spi.connector.ConnectorTransactionHandle;

/**
 * Tells the engine which concrete handle classes this connector uses. Table handles and
 * splits are Hudi-specific; column, output, insert and transaction handles are the Hive
 * connector's classes.
 */
public class HudiHandleResolver
        implements ConnectorHandleResolver
{
    @Override
    public Class<? extends ConnectorTableHandle> getTableHandleClass()
    {
        return HudiTableHandle.class;
    }

    @Override
    public Class<? extends ColumnHandle> getColumnHandleClass()
    {
        return HiveColumnHandle.class;
    }

    @Override
    public Class<? extends ConnectorSplit> getSplitClass()
    {
        return HudiSplit.class;
    }

    @Override
    public Class<? extends ConnectorOutputTableHandle> getOutputTableHandleClass()
    {
        return HiveOutputTableHandle.class;
    }

    @Override
    public Class<? extends ConnectorInsertTableHandle> getInsertTableHandleClass()
    {
        return HiveInsertTableHandle.class;
    }

    @Override
    public Class<? extends ConnectorTransactionHandle> getTransactionHandleClass()
    {
        return HiveTransactionHandle.class;
    }
}

// plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiInputInfo.java
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.trino.plugin.hudi;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;

import java.util.List;

import static java.util.Objects.requireNonNull;

/**
 * JSON-serializable description of the partitions read by a query.
 */
public class HudiInputInfo
{
    // NOTE(review): the element type was lost in the scraped source; String is assumed — confirm.
    private final List<String> partitionIds;
    // Code that serializes HudiInputInfo into a log often needs to limit the length of log entries.
    // This boolean field allows such code to mark the log entry as length-limited.
    private final boolean truncated;

    /**
     * @param partitionIds partition identifiers; must not be null, copied into an unmodifiable list
     * @param truncated whether {@code partitionIds} was shortened by the producer
     */
    @JsonCreator
    public HudiInputInfo(
            @JsonProperty("partitionIds") List<String> partitionIds,
            @JsonProperty("truncated") boolean truncated)
    {
        // Snapshot the list so later mutation by the caller cannot change this value object
        this.partitionIds = List.copyOf(requireNonNull(partitionIds, "partitionIds is null"));
        this.truncated = truncated;
    }

    @JsonProperty
    public List<String> getPartitionIds()
    {
        return partitionIds;
    }

    @JsonProperty
    public boolean isTruncated()
    {
        return truncated;
    }
}
+ */
+
+package io.trino.plugin.hudi;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Sets;
+import io.airlift.log.Logger;
+import io.trino.plugin.hive.HdfsEnvironment;
+import io.trino.plugin.hive.HiveColumnHandle;
+import io.trino.plugin.hive.HivePartition;
+import io.trino.plugin.hive.acid.AcidSchema;
+import io.trino.plugin.hive.authentication.HiveIdentity;
+import io.trino.plugin.hive.metastore.Column;
+import io.trino.plugin.hive.metastore.HiveMetastore;
+import io.trino.plugin.hive.metastore.Table;
+import io.trino.spi.TrinoException;
+import io.trino.spi.connector.ColumnHandle;
+import io.trino.spi.connector.ColumnMetadata;
+import io.trino.spi.connector.ConnectorMetadata;
+import io.trino.spi.connector.ConnectorSession;
+import io.trino.spi.connector.ConnectorTableHandle;
+import io.trino.spi.connector.ConnectorTableMetadata;
+import io.trino.spi.connector.ConnectorTableProperties;
+import io.trino.spi.connector.Constraint;
+import io.trino.spi.connector.ConstraintApplicationResult;
+import io.trino.spi.connector.SchemaTableName;
+import io.trino.spi.connector.SchemaTablePrefix;
+import io.trino.spi.connector.TableNotFoundException;
+import io.trino.spi.predicate.TupleDomain;
+import io.trino.spi.type.TypeManager;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.model.HoodieTableType;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.function.Function;
+
+import static com.google.common.collect.ImmutableList.toImmutableList;
+import static com.google.common.collect.ImmutableMap.toImmutableMap;
+import static com.google.common.collect.Iterables.concat;
+import static io.trino.plugin.hive.HiveColumnHandle.BUCKET_COLUMN_NAME;
+import static io.trino.plugin.hive.HiveColumnHandle.FILE_MODIFIED_TIME_COLUMN_NAME;
+import static io.trino.plugin.hive.HiveColumnHandle.FILE_SIZE_COLUMN_NAME;
+import static io.trino.plugin.hive.HiveColumnHandle.PARTITION_COLUMN_NAME;
+import static io.trino.plugin.hive.HiveColumnHandle.PATH_COLUMN_NAME;
+import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA;
+import static io.trino.plugin.hive.HiveMetadata.TABLE_COMMENT;
+import static io.trino.plugin.hive.HiveTableProperties.EXTERNAL_LOCATION_PROPERTY;
+import static io.trino.plugin.hive.HiveTableProperties.PARTITIONED_BY_PROPERTY;
+import static io.trino.plugin.hive.HiveTimestampPrecision.NANOSECONDS;
+import static io.trino.plugin.hive.util.HiveUtil.columnExtraInfo;
+import static io.trino.plugin.hive.util.HiveUtil.hiveColumnHandles;
+import static io.trino.plugin.hive.util.HiveUtil.isHiveSystemSchema;
+import static io.trino.plugin.hudi.HudiErrorCode.HUDI_UNKNOWN_TABLE_TYPE;
+import static io.trino.plugin.hudi.HudiUtil.splitPredicate;
+import static java.lang.String.format;
+import static java.util.Collections.singletonList;
+import static java.util.Objects.requireNonNull;
+import static java.util.function.Function.identity;
+import static org.apache.hadoop.hive.metastore.TableType.EXTERNAL_TABLE;
+import static org.apache.hadoop.hive.ql.io.AcidUtils.isFullAcidTable;
+import static org.apache.hudi.common.fs.FSUtils.getFs;
+import static org.apache.hudi.common.table.HoodieTableMetaClient.METAFOLDER_NAME;
+import static org.apache.hudi.exception.TableNotFoundException.checkTableValidity;
+
+/**
+ * {@link ConnectorMetadata} implementation for Hudi tables registered in a Hive metastore.
+ * <p>
+ * Table and column metadata are always resolved from the metastore for the table that was
+ * actually asked for. The {@code hiveTable} field only remembers the table most recently
+ * validated by {@link #getTableHandle} and is exposed through {@link #getTable()}.
+ */
+public class HudiMetadata
+        implements ConnectorMetadata
+{
+    private static final Logger log = Logger.get(HudiMetadata.class);
+    private final HiveMetastore metastore;
+    private final HdfsEnvironment hdfsEnvironment;
+    private final TypeManager typeManager;
+    // Table most recently validated by getTableHandle(); null until then.
+    private Table hiveTable;
+
+    public HudiMetadata(HiveMetastore metastore, HdfsEnvironment hdfsEnvironment, TypeManager typeManager)
+    {
+        this.metastore = requireNonNull(metastore, "metastore is null");
+        this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null");
+        this.typeManager = requireNonNull(typeManager, "typeManager is null");
+    }
+
+    @Override
+    public List<String> listSchemaNames(ConnectorSession session)
+    {
+        return metastore.getAllDatabases();
+    }
+
+    /**
+     * Resolves a table handle, or returns {@code null} when the schema is a Hive system schema
+     * or the metastore has no such table.
+     *
+     * @throws TrinoException with {@code HUDI_UNKNOWN_TABLE_TYPE} when the table exists but
+     *         its location does not pass the Hudi validity check
+     */
+    @Override
+    public HudiTableHandle getTableHandle(ConnectorSession session, SchemaTableName tableName)
+    {
+        requireNonNull(tableName, "tableName is null");
+        if (isHiveSystemSchema(tableName.getSchemaName())) {
+            return null;
+        }
+        Optional<Table> table = metastore.getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName());
+        if (table.isEmpty()) {
+            return null;
+        }
+        Table resolvedTable = table.get();
+        if (!isHudiTable(session, resolvedTable)) {
+            throw new TrinoException(HUDI_UNKNOWN_TABLE_TYPE, format("Not a Hudi table: %s", tableName));
+        }
+        // Remember the table only once it is known to be a Hudi table.
+        hiveTable = resolvedTable;
+        return new HudiTableHandle(
+                tableName.getSchemaName(),
+                tableName.getTableName(),
+                resolvedTable.getStorage().getLocation(),
+                HoodieTableType.COPY_ON_WRITE,
+                TupleDomain.all(),
+                TupleDomain.all(),
+                Optional.of(getTableMetaClient(session, resolvedTable)));
+    }
+
+    @Override
+    public ConnectorTableMetadata getTableMetadata(ConnectorSession session, ConnectorTableHandle table)
+    {
+        HudiTableHandle hudiTableHandle = (HudiTableHandle) table;
+        return getTableMetadata(session, hudiTableHandle.getSchemaTableName());
+    }
+
+    /**
+     * Splits the constraint summary with {@code splitPredicate} and stores the result in a new handle.
+     * Returns empty when the new handle carries the same predicates as the current one.
+     */
+    @Override
+    public Optional<ConstraintApplicationResult<ConnectorTableHandle>> applyFilter(ConnectorSession session, ConnectorTableHandle tableHandle, Constraint constraint)
+    {
+        HudiTableHandle handle = (HudiTableHandle) tableHandle;
+        HudiPredicates predicates = splitPredicate(constraint.getSummary());
+        HudiTableHandle newHudiTableHandle = handle.withPredicates(predicates);
+
+        if (handle.getPartitionPredicates().equals(newHudiTableHandle.getPartitionPredicates())
+                && handle.getRegularPredicates().equals(newHudiTableHandle.getRegularPredicates())) {
+            log.info("No new predicates to apply");
+            return Optional.empty();
+        }
+
+        return Optional.of(new ConstraintApplicationResult<>(
+                newHudiTableHandle,
+                newHudiTableHandle.getRegularPredicates().transformKeys(ColumnHandle.class::cast),
+                false));
+    }
+
+    @Override
+    public ConnectorTableProperties getTableProperties(ConnectorSession session, ConnectorTableHandle tableHandle)
+    {
+        return new ConnectorTableProperties();
+    }
+
+    /**
+     * Returns the column handles of the table identified by {@code tableHandle}, keyed by column name.
+     *
+     * @throws TableNotFoundException if the table is no longer present in the metastore
+     */
+    @Override
+    public Map<String, ColumnHandle> getColumnHandles(ConnectorSession session, ConnectorTableHandle tableHandle)
+    {
+        // Resolve the table named by the handle; the cached hiveTable may belong to a different table.
+        Table table = getTableOrThrow(session, ((HudiTableHandle) tableHandle).getSchemaTableName());
+        return hiveColumnHandles(table, typeManager, NANOSECONDS).stream()
+                .collect(toImmutableMap(HiveColumnHandle::getName, identity()));
+    }
+
+    @Override
+    public ColumnMetadata getColumnMetadata(ConnectorSession session, ConnectorTableHandle tableHandle, ColumnHandle columnHandle)
+    {
+        return ((HiveColumnHandle) columnHandle).getColumnMetadata();
+    }
+
+    @Override
+    public Optional<Object> getInfo(ConnectorTableHandle table)
+    {
+        return ((HudiTableHandle) table).getPartitions()
+                .map(partitions -> new HudiInputInfo(
+                        partitions.stream()
+                                .map(HivePartition::getPartitionId)
+                                .collect(toImmutableList()),
+                        false));
+    }
+
+    @Override
+    public List<SchemaTableName> listTables(ConnectorSession session, Optional<String> optionalSchemaName)
+    {
+        ImmutableList.Builder<SchemaTableName> tableNames = ImmutableList.builder();
+        for (String schemaName : listSchemas(session, optionalSchemaName)) {
+            for (String tableName : metastore.getAllTables(schemaName)) {
+                tableNames.add(new SchemaTableName(schemaName, tableName));
+            }
+        }
+
+        tableNames.addAll(listMaterializedViews(session, optionalSchemaName));
+        return tableNames.build();
+    }
+
+    /**
+     * Lists the columns of every table matching {@code prefix}. Tables that cannot be found in
+     * the metastore at lookup time are left out of the result.
+     */
+    @Override
+    public Map<SchemaTableName, List<ColumnMetadata>> listTableColumns(ConnectorSession session, SchemaTablePrefix prefix)
+    {
+        List<SchemaTableName> tables = prefix.getTable()
+                .map(ignored -> singletonList(prefix.toSchemaTableName()))
+                .orElseGet(() -> listTables(session, prefix.getSchema()));
+
+        ImmutableMap.Builder<SchemaTableName, List<ColumnMetadata>> columns = ImmutableMap.builder();
+        for (SchemaTableName table : tables) {
+            try {
+                columns.put(table, getTableMetadata(session, table).getColumns());
+            }
+            catch (TableNotFoundException ignored) {
+                // table disappeared during listing operation
+            }
+        }
+        return columns.build();
+    }
+
+    HiveMetastore getMetastore()
+    {
+        return metastore;
+    }
+
+    /**
+     * Returns the table most recently validated by {@link #getTableHandle}, or {@code null}
+     * if no handle has been resolved yet.
+     */
+    Table getTable()
+    {
+        return hiveTable;
+    }
+
+    void rollback()
+    {
+        // TODO: cleanup open transaction when write will be supported
+    }
+
+    /**
+     * Builds a function that turns a column handle of {@code table} into its {@link ColumnMetadata},
+     * attaching the column comment recorded in the metastore (if any).
+     *
+     * @throws TrinoException with {@code HIVE_INVALID_METADATA} if the table declares duplicate column names
+     */
+    private static Function<HiveColumnHandle, ColumnMetadata> columnMetadataGetter(Table table)
+    {
+        ImmutableList.Builder<String> columnNames = ImmutableList.builder();
+        table.getPartitionColumns().stream().map(Column::getName).forEach(columnNames::add);
+        table.getDataColumns().stream().map(Column::getName).forEach(columnNames::add);
+        List<String> allColumnNames = columnNames.build();
+        if (allColumnNames.size() > Sets.newHashSet(allColumnNames).size()) {
+            throw new TrinoException(HIVE_INVALID_METADATA,
+                    format("Hive metadata for table %s is invalid: Table descriptor contains duplicate columns", table.getTableName()));
+        }
+
+        List<Column> tableColumns = table.getDataColumns();
+        ImmutableMap.Builder<String, Optional<String>> builder = ImmutableMap.builder();
+        for (Column field : concat(tableColumns, table.getPartitionColumns())) {
+            // The literal "from deserializer" comment is treated the same as no comment.
+            if (field.getComment().isPresent() && !field.getComment().get().equals("from deserializer")) {
+                builder.put(field.getName(), field.getComment());
+            }
+            else {
+                builder.put(field.getName(), Optional.empty());
+            }
+        }
+
+        // add hidden columns
+        builder.put(PATH_COLUMN_NAME, Optional.empty());
+        if (table.getStorage().getBucketProperty().isPresent()) {
+            builder.put(BUCKET_COLUMN_NAME, Optional.empty());
+        }
+        builder.put(FILE_SIZE_COLUMN_NAME, Optional.empty());
+        builder.put(FILE_MODIFIED_TIME_COLUMN_NAME, Optional.empty());
+        if (!table.getPartitionColumns().isEmpty()) {
+            builder.put(PARTITION_COLUMN_NAME, Optional.empty());
+        }
+
+        if (isFullAcidTable(table.getParameters())) {
+            for (String name : AcidSchema.ACID_COLUMN_NAMES) {
+                builder.put(name, Optional.empty());
+            }
+        }
+
+        Map<String, Optional<String>> columnComment = builder.build();
+
+        return handle -> ColumnMetadata.builder()
+                .setName(handle.getName())
+                .setType(handle.getType())
+                .setComment(columnComment.get(handle.getName()))
+                .setExtraInfo(Optional.ofNullable(columnExtraInfo(handle.isPartitionKey())))
+                .setHidden(handle.isHidden())
+                .build();
+    }
+
+    /**
+     * Returns whether the table location passes Hudi's {@code checkTableValidity} check.
+     */
+    private boolean isHudiTable(ConnectorSession session, Table table)
+    {
+        String basePath = table.getStorage().getLocation();
+        Configuration conf = hdfsEnvironment.getConfiguration(new HdfsEnvironment.HdfsContext(session), new Path(basePath));
+        try {
+            checkTableValidity(getFs(basePath, conf), new Path(basePath), new Path(basePath, METAFOLDER_NAME));
+        }
+        catch (Exception ignored) {
+            // Any failure of the validity check is reported as "not a Hudi table".
+            return false;
+        }
+        return true;
+    }
+
+    private HoodieTableMetaClient getTableMetaClient(ConnectorSession session, Table table)
+    {
+        String basePath = table.getStorage().getLocation();
+        Configuration conf = hdfsEnvironment.getConfiguration(new HdfsEnvironment.HdfsContext(session), new Path(basePath));
+        return HoodieTableMetaClient.builder().setConf(conf).setBasePath(basePath).build();
+    }
+
+    /**
+     * Looks up {@code tableName} in the metastore.
+     *
+     * @throws TableNotFoundException if the metastore has no such table
+     */
+    private Table getTableOrThrow(ConnectorSession session, SchemaTableName tableName)
+    {
+        return metastore.getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName())
+                .orElseThrow(() -> new TableNotFoundException(tableName));
+    }
+
+    /**
+     * Builds the table metadata (columns, external location, partitioning, comment) for {@code tableName}.
+     *
+     * @throws TableNotFoundException if the metastore has no such table
+     */
+    private ConnectorTableMetadata getTableMetadata(ConnectorSession session, SchemaTableName tableName)
+    {
+        Table table = getTableOrThrow(session, tableName);
+        Function<HiveColumnHandle, ColumnMetadata> metadataGetter = columnMetadataGetter(table);
+        ImmutableList.Builder<ColumnMetadata> columns = ImmutableList.builder();
+        for (HiveColumnHandle columnHandle : hiveColumnHandles(table, typeManager, NANOSECONDS)) {
+            columns.add(metadataGetter.apply(columnHandle));
+        }
+
+        // External location property
+        ImmutableMap.Builder<String, Object> properties = ImmutableMap.builder();
+        if (table.getTableType().equals(EXTERNAL_TABLE.name())) {
+            properties.put(EXTERNAL_LOCATION_PROPERTY, table.getStorage().getLocation());
+        }
+
+        // Partitioning property
+        List<String> partitionedBy = table.getPartitionColumns().stream()
+                .map(Column::getName)
+                .collect(toImmutableList());
+        if (!partitionedBy.isEmpty()) {
+            properties.put(PARTITIONED_BY_PROPERTY, partitionedBy);
+        }
+
+        Optional<String> comment = Optional.ofNullable(table.getParameters().get(TABLE_COMMENT));
+        return new ConnectorTableMetadata(tableName, columns.build(), properties.build(), comment);
+    }
+
+    private List<String> listSchemas(ConnectorSession session, Optional<String> schemaName)
+    {
+        if (schemaName.isPresent()) {
+            if (isHiveSystemSchema(schemaName.get())) {
+                return ImmutableList.of();
+            }
+            return ImmutableList.of(schemaName.get());
+        }
+        return listSchemaNames(session);
+    }
+}
diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiMetadataFactory.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiMetadataFactory.java
new file mode 100644
index 000000000000..c698ca7f26f6
--- /dev/null
+++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiMetadataFactory.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package io.trino.plugin.hudi; + +import io.trino.plugin.hive.HdfsEnvironment; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.spi.type.TypeManager; + +import javax.inject.Inject; + +import static java.util.Objects.requireNonNull; + +public class HudiMetadataFactory +{ + private final HiveMetastore metastore; + private final HdfsEnvironment hdfsEnvironment; + private final TypeManager typeManager; + + @Inject + public HudiMetadataFactory(HiveMetastore metastore, HdfsEnvironment hdfsEnvironment, TypeManager typeManager) + { + this.metastore = requireNonNull(metastore, "metastore is null"); + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + } + + public HudiMetadata create() + { + return new HudiMetadata(metastore, hdfsEnvironment, typeManager); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiModule.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiModule.java new file mode 100644 index 000000000000..3110b73038aa --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiModule.java @@ -0,0 +1,77 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.trino.plugin.hudi; + +import com.google.inject.Binder; +import com.google.inject.Module; +import com.google.inject.Scopes; +import io.trino.plugin.base.security.AllowAllAccessControl; +import io.trino.plugin.base.session.SessionPropertiesProvider; +import io.trino.plugin.hive.CachingDirectoryLister; +import io.trino.plugin.hive.DirectoryLister; +import io.trino.plugin.hive.FileFormatDataSourceStats; +import io.trino.plugin.hive.HdfsConfiguration; +import io.trino.plugin.hive.HdfsEnvironment; +import io.trino.plugin.hive.HiveConfig; +import io.trino.plugin.hive.HiveHdfsConfiguration; +import io.trino.plugin.hive.HiveNodePartitioningProvider; +import io.trino.plugin.hive.HiveTransactionManager; +import io.trino.plugin.hive.metastore.MetastoreConfig; +import io.trino.plugin.hive.parquet.ParquetReaderConfig; +import io.trino.plugin.hive.parquet.ParquetWriterConfig; +import io.trino.spi.connector.ConnectorAccessControl; +import io.trino.spi.connector.ConnectorNodePartitioningProvider; +import io.trino.spi.connector.ConnectorPageSourceProvider; +import io.trino.spi.connector.ConnectorSplitManager; + +import static com.google.inject.multibindings.Multibinder.newSetBinder; +import static com.google.inject.multibindings.OptionalBinder.newOptionalBinder; +import static io.airlift.configuration.ConfigBinder.configBinder; +import static org.weakref.jmx.guice.ExportBinder.newExporter; + +public class HudiModule + implements Module +{ + @Override + public void configure(Binder binder) + { + binder.bind(HdfsConfiguration.class).to(HiveHdfsConfiguration.class).in(Scopes.SINGLETON); + binder.bind(HdfsEnvironment.class).in(Scopes.SINGLETON); + binder.bind(DirectoryLister.class).to(CachingDirectoryLister.class).in(Scopes.SINGLETON); + binder.bind(HudiTransactionManager.class).in(Scopes.SINGLETON); + + configBinder(binder).bindConfig(HiveConfig.class); + configBinder(binder).bindConfig(HudiConfig.class); + configBinder(binder).bindConfig(MetastoreConfig.class); + + 
newSetBinder(binder, SessionPropertiesProvider.class).addBinding().to(HudiSessionProperties.class).in(Scopes.SINGLETON); + binder.bind(HudiTableProperties.class).in(Scopes.SINGLETON); + + binder.bind(ConnectorSplitManager.class).to(HudiSplitManager.class).in(Scopes.SINGLETON); + binder.bind(ConnectorPageSourceProvider.class).to(HudiPageSourceProvider.class).in(Scopes.SINGLETON); + binder.bind(ConnectorNodePartitioningProvider.class).to(HiveNodePartitioningProvider.class).in(Scopes.SINGLETON); + + configBinder(binder).bindConfig(ParquetReaderConfig.class); + configBinder(binder).bindConfig(ParquetWriterConfig.class); + + binder.bind(HudiMetadataFactory.class).in(Scopes.SINGLETON); + binder.bind(HiveTransactionManager.class).in(Scopes.SINGLETON); + + binder.bind(FileFormatDataSourceStats.class).in(Scopes.SINGLETON); + newExporter(binder).export(FileFormatDataSourceStats.class).withGeneratedName(); + + newOptionalBinder(binder, ConnectorAccessControl.class).setDefault().to(AllowAllAccessControl.class).in(Scopes.SINGLETON); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPageSource.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPageSource.java new file mode 100644 index 000000000000..a583c6226c96 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPageSource.java @@ -0,0 +1,127 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.trino.plugin.hudi; + +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.spi.Page; +import io.trino.spi.TrinoException; +import io.trino.spi.block.Block; +import io.trino.spi.block.RunLengthEncodedBlock; +import io.trino.spi.connector.ConnectorPageSource; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import static io.trino.plugin.hudi.HudiErrorCode.HUDI_BAD_DATA; +import static java.util.Objects.requireNonNull; + +public class HudiPageSource + implements ConnectorPageSource +{ + private final List columnHandles; + private final ConnectorPageSource pageSource; + private final Map partitionBlocks; + + public HudiPageSource( + List columnHandles, + Map partitionBlocks, + ConnectorPageSource pageSource) + { + this.columnHandles = requireNonNull(columnHandles, "columnHandles is null"); + this.pageSource = requireNonNull(pageSource, "pageSource is null"); + this.partitionBlocks = requireNonNull(partitionBlocks, "partitionBlocks is null"); + } + + @Override + public long getCompletedBytes() + { + return pageSource.getCompletedBytes(); + } + + @Override + public long getReadTimeNanos() + { + return pageSource.getReadTimeNanos(); + } + + @Override + public boolean isFinished() + { + return pageSource.isFinished(); + } + + @Override + public Page getNextPage() + { + try { + Page page = pageSource.getNextPage(); + if (page == null) { + return null; + } + int positionCount = page.getPositionCount(); + + int dataColumnIndex = 0; + int columnIndex = 0; + Block[] blocksWithPartitionColumns = new Block[columnHandles.size()]; + for (HiveColumnHandle columnHandle : columnHandles) { + if (columnHandle.isPartitionKey()) { + Block partitionValue = partitionBlocks.get(columnHandle.getName()); + blocksWithPartitionColumns[columnIndex++] = new RunLengthEncodedBlock(partitionValue, positionCount); + } + else { + blocksWithPartitionColumns[columnIndex++] = (page.getBlock(dataColumnIndex)); + dataColumnIndex++; + } + } + 
return new Page(positionCount, blocksWithPartitionColumns); + } + catch (TrinoException e) { + closeWithSuppression(e); + throw e; + } + catch (RuntimeException e) { + closeWithSuppression(e); + throw new TrinoException(HUDI_BAD_DATA, e); + } + } + + @Override + public long getMemoryUsage() + { + return pageSource.getMemoryUsage(); + } + + @Override + public void close() + throws IOException + { + pageSource.close(); + } + + private void closeWithSuppression(Throwable throwable) + { + requireNonNull(throwable, "throwable is null"); + try { + close(); + } + catch (Exception e) { + // Self-suppression not permitted + if (e != throwable) { + throwable.addSuppressed(e); + } + } + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPageSourceProvider.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPageSourceProvider.java new file mode 100644 index 000000000000..c610fb2c9bb2 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPageSourceProvider.java @@ -0,0 +1,141 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.trino.plugin.hudi; + +import io.trino.plugin.hive.FileFormatDataSourceStats; +import io.trino.plugin.hive.HdfsEnvironment; +import io.trino.plugin.hive.HdfsEnvironment.HdfsContext; +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.HivePartitionKey; +import io.trino.plugin.hive.parquet.ParquetReaderConfig; +import io.trino.plugin.hudi.page.HudiPageSourceCreator; +import io.trino.plugin.hudi.page.HudiPageSourceFactory; +import io.trino.plugin.hudi.page.HudiParquetPageSourceCreator; +import io.trino.spi.block.Block; +import io.trino.spi.connector.ColumnHandle; +import io.trino.spi.connector.ConnectorPageSource; +import io.trino.spi.connector.ConnectorPageSourceProvider; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.ConnectorSplit; +import io.trino.spi.connector.ConnectorTableHandle; +import io.trino.spi.connector.ConnectorTransactionHandle; +import io.trino.spi.connector.DynamicFilter; +import io.trino.spi.predicate.Utils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.joda.time.DateTimeZone; + +import javax.inject.Inject; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TimeZone; +import java.util.stream.Collectors; + +import static io.trino.plugin.hudi.HudiUtil.convertPartitionValue; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; +import static java.util.stream.Collectors.toMap; + +public class HudiPageSourceProvider + implements ConnectorPageSourceProvider +{ + private final HudiConfig hudiConfig; + private final HdfsEnvironment hdfsEnvironment; + private final FileFormatDataSourceStats fileFormatDataSourceStats; + private final DateTimeZone timeZone; + private final Map pageSourceBuilderMap; + private final Map context; + + @Inject + public HudiPageSourceProvider( + HdfsEnvironment 
hdfsEnvironment, + FileFormatDataSourceStats fileFormatDataSourceStats, + ParquetReaderConfig parquetReaderConfig, + HudiConfig hudiConfig) + { + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.fileFormatDataSourceStats = requireNonNull(fileFormatDataSourceStats, "fileFormatDataSourceStats is null"); + this.hudiConfig = requireNonNull(hudiConfig, "hudiConfig is null"); + this.timeZone = DateTimeZone.forID(TimeZone.getDefault().getID()); + this.pageSourceBuilderMap = new HashMap<>(); + this.context = new HashMap<>(); + this.context.put( + HudiParquetPageSourceCreator.CONTEXT_KEY_PARQUET_READER_OPTIONS, + requireNonNull(parquetReaderConfig, "parquetReaderConfig is null").toParquetReaderOptions()); + } + + @Override + public ConnectorPageSource createPageSource( + ConnectorTransactionHandle transaction, + ConnectorSession session, + ConnectorSplit connectorSplit, + ConnectorTableHandle connectorTable, + List columns, + DynamicFilter dynamicFilter) + { + HudiSplit split = (HudiSplit) connectorSplit; + Path path = new Path(split.getPath()); + HoodieFileFormat hudiFileFormat = HudiUtil.getHudiFileFormat(path.toString()); + List hiveColumns = columns.stream() + .map(HiveColumnHandle.class::cast) + .collect(toList()); + // just send regular columns to create parquet page source + // for partition columns, separate blocks will be created + List regularColumns = hiveColumns.stream() + .filter(columnHandle -> !columnHandle.isPartitionKey()) + .collect(Collectors.toList()); + Configuration configuration = hdfsEnvironment.getConfiguration(new HdfsContext(session), path); + ConnectorPageSource dataPageSource = getHudiPageSourceCreator(hudiFileFormat).createPageSource( + configuration, session.getIdentity(), regularColumns, split); + + return new HudiPageSource( + hiveColumns, + convertPartitionValues(hiveColumns, split.getPartitionKeys()), // create blocks for partition values + dataPageSource); + } + + private Map 
convertPartitionValues( + List allColumns, + List partitionKeys) + { + return allColumns.stream() + .filter(HiveColumnHandle::isPartitionKey) + .collect(toMap( + HiveColumnHandle::getName, + columnHandle -> Utils.nativeValueToBlock( + columnHandle.getType(), + convertPartitionValue( + columnHandle.getName(), + partitionKeys.get(0).getValue(), + columnHandle.getType().getTypeSignature()).orElse(null)))); + } + + private HudiPageSourceCreator getHudiPageSourceCreator(HoodieFileFormat hudiFileFormat) + { + if (!pageSourceBuilderMap.containsKey(hudiFileFormat)) { + // HudiPageSourceProvider::createPageSource may be called concurrently + // So the below guarantees the construction of HudiPageSourceCreator once + synchronized (pageSourceBuilderMap) { + pageSourceBuilderMap.computeIfAbsent(hudiFileFormat, + format -> HudiPageSourceFactory.get( + format, hudiConfig, hdfsEnvironment, fileFormatDataSourceStats, timeZone, context)); + } + } + return pageSourceBuilderMap.get(hudiFileFormat); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPlugin.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPlugin.java new file mode 100644 index 000000000000..565976f63891 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPlugin.java @@ -0,0 +1,29 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableList; +import io.trino.spi.Plugin; +import io.trino.spi.connector.ConnectorFactory; + +public class HudiPlugin + implements Plugin +{ + @Override + public Iterable getConnectorFactories() + { + return ImmutableList.of(new HudiConnectorFactory("hudi")); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPredicates.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPredicates.java new file mode 100644 index 000000000000..a897f18f881d --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPredicates.java @@ -0,0 +1,41 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.trino.plugin.hudi; + +import io.trino.spi.connector.ColumnHandle; +import io.trino.spi.predicate.TupleDomain; + +public class HudiPredicates +{ + private final TupleDomain partitionColumnPredicates; + private final TupleDomain regularColumnPredicates; + + public HudiPredicates(TupleDomain partitionColumnPredicates, + TupleDomain regularColumnPredicates) + { + this.partitionColumnPredicates = partitionColumnPredicates; + this.regularColumnPredicates = regularColumnPredicates; + } + + public TupleDomain getPartitionColumnPredicates() + { + return partitionColumnPredicates; + } + + public TupleDomain getRegularColumnPredicates() + { + return regularColumnPredicates; + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSessionProperties.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSessionProperties.java new file mode 100644 index 000000000000..a42b66eeb8cf --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSessionProperties.java @@ -0,0 +1,183 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableList; +import io.airlift.units.DataSize; +import io.trino.plugin.base.session.SessionPropertiesProvider; +import io.trino.spi.TrinoException; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.session.PropertyMetadata; +import org.apache.hudi.common.model.HoodieFileFormat; + +import javax.inject.Inject; + +import java.util.List; + +import static io.trino.plugin.base.session.PropertyMetadataUtil.dataSizeProperty; +import static io.trino.spi.StandardErrorCode.INVALID_SESSION_PROPERTY; +import static io.trino.spi.session.PropertyMetadata.booleanProperty; +import static io.trino.spi.session.PropertyMetadata.doubleProperty; +import static io.trino.spi.session.PropertyMetadata.enumProperty; +import static io.trino.spi.session.PropertyMetadata.integerProperty; +import static java.lang.String.format; + +public class HudiSessionProperties + implements SessionPropertiesProvider +{ + private static final String BASE_FILE_FORMAT = "file_format"; + private static final String METADATA_ENABLED = "metadata_enabled"; + private static final String SKIP_METASTORE_FOR_PARTITION = "skip_metastore_for_partition"; + private static final String USE_PARQUET_COLUMN_NAMES = "use_parquet_column_names"; + private static final String PARTITION_SCANNER_PARALLELISM = "partition_scanner_parallelism"; + private static final String SPLIT_GENERATOR_PARALLELISM = "split_generator_parallelism"; + private static final String MIN_PARTITION_BATCH_SIZE = "min_partition_batch_size"; + private static final String MAX_PARTITION_BATCH_SIZE = "max_partition_batch_size"; + private static final String SIZE_BASED_SPLIT_WEIGHTS_ENABLED = "size_based_split_weights_enabled"; + private static final String STANDARD_SPLIT_WEIGHT_SIZE = "standard_split_weight_size"; + private static final String MINIMUM_ASSIGNED_SPLIT_WEIGHT = "minimum_assigned_split_weight"; + + private final List> sessionProperties; + + @Inject + 
public HudiSessionProperties(HudiConfig hudiConfig) + { + sessionProperties = ImmutableList.of( + enumProperty( + BASE_FILE_FORMAT, + "Currently, only Parquet is supported", + HoodieFileFormat.class, + hudiConfig.getBaseFileFormat(), + false), + booleanProperty( + METADATA_ENABLED, + "For Hudi tables prefer to fetch the list of files from its metadata", + hudiConfig.isMetadataEnabled(), + false), + booleanProperty( + SKIP_METASTORE_FOR_PARTITION, + "Whether to skip metastore for partition info. " + + "If enabled, then the partition info is fetched using Hudi's partition extractor and relative partition path.", + hudiConfig.getSkipMetaStoreForPartition(), + false), + booleanProperty( + USE_PARQUET_COLUMN_NAMES, + "Access parquet columns using names from the file. If disabled, then columns are accessed using index.", + hudiConfig.getUseParquetColumnNames(), + false), + integerProperty( + PARTITION_SCANNER_PARALLELISM, + "Number of threads to use for partition scanners", + hudiConfig.getPartitionScannerParallelism(), + false), + integerProperty( + SPLIT_GENERATOR_PARALLELISM, + "Number of threads to use for split generators", + hudiConfig.getSplitGeneratorParallelism(), + false), + integerProperty( + MIN_PARTITION_BATCH_SIZE, + "Minimum partition batch size", + hudiConfig.getMinPartitionBatchSize(), + false), + integerProperty( + MAX_PARTITION_BATCH_SIZE, + "Maximum partition batch size", + hudiConfig.getMaxPartitionBatchSize(), + false), + booleanProperty( + SIZE_BASED_SPLIT_WEIGHTS_ENABLED, + "Size-based splitting ensures that each batch of splits has enough data to process. 
Enabled by default.", + hudiConfig.isSizeBasedSplitWeightsEnabled(), + false), + dataSizeProperty( + STANDARD_SPLIT_WEIGHT_SIZE, + "The split size corresponding to the standard weight (1.0) " + + "when size based split weights are enabled", + hudiConfig.getStandardSplitWeightSize(), + false), + doubleProperty( + MINIMUM_ASSIGNED_SPLIT_WEIGHT, + "Minimum assigned split weight when size based split weights are enabled", + hudiConfig.getMinimumAssignedSplitWeight(), + value -> { + if (!Double.isFinite(value) || value <= 0 || value > 1) { + throw new TrinoException(INVALID_SESSION_PROPERTY, format("%s must be > 0 and <= 1.0: %s", MINIMUM_ASSIGNED_SPLIT_WEIGHT, value)); + } + }, + false)); + } + + @Override + public List> getSessionProperties() + { + return sessionProperties; + } + + public static HoodieFileFormat getBaseFileFormat(ConnectorSession session) + { + return session.getProperty(BASE_FILE_FORMAT, HoodieFileFormat.class); + } + + public static boolean isHudiMetadataEnabled(ConnectorSession session) + { + return session.getProperty(METADATA_ENABLED, Boolean.class); + } + + public static boolean shouldSkipMetaStoreForPartition(ConnectorSession session) + { + return session.getProperty(SKIP_METASTORE_FOR_PARTITION, Boolean.class); + } + + public static boolean shouldUseParquetColumnNames(ConnectorSession session) + { + return session.getProperty(USE_PARQUET_COLUMN_NAMES, Boolean.class); + } + + public static int getPartitionScannerParallelism(ConnectorSession session) + { + return session.getProperty(PARTITION_SCANNER_PARALLELISM, Integer.class); + } + + public static int getSplitGeneratorParallelism(ConnectorSession session) + { + return session.getProperty(SPLIT_GENERATOR_PARALLELISM, Integer.class); + } + + public static int getMinPartitionBatchSize(ConnectorSession session) + { + return session.getProperty(MIN_PARTITION_BATCH_SIZE, Integer.class); + } + + public static int getMaxPartitionBatchSize(ConnectorSession session) + { + return 
session.getProperty(MAX_PARTITION_BATCH_SIZE, Integer.class); + } + + public static boolean isSizeBasedSplitWeightsEnabled(ConnectorSession session) + { + return session.getProperty(SIZE_BASED_SPLIT_WEIGHTS_ENABLED, Boolean.class); + } + + public static DataSize getStandardSplitWeightSize(ConnectorSession session) + { + return session.getProperty(STANDARD_SPLIT_WEIGHT_SIZE, DataSize.class); + } + + public static double getMinimumAssignedSplitWeight(ConnectorSession session) + { + return session.getProperty(MINIMUM_ASSIGNED_SPLIT_WEIGHT, Double.class); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSplit.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSplit.java new file mode 100644 index 000000000000..3bff57b780fe --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSplit.java @@ -0,0 +1,148 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.trino.plugin.hudi; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.HivePartitionKey; +import io.trino.spi.HostAddress; +import io.trino.spi.SplitWeight; +import io.trino.spi.connector.ConnectorSplit; +import io.trino.spi.predicate.TupleDomain; + +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; + +public class HudiSplit + implements ConnectorSplit +{ + private final String path; + private final long start; + private final long length; + private final long fileSize; + private final List addresses; + private final TupleDomain predicate; + private final List partitionKeys; + private final SplitWeight splitWeight; + + @JsonCreator + public HudiSplit( + @JsonProperty("path") String path, + @JsonProperty("start") long start, + @JsonProperty("length") long length, + @JsonProperty("fileSize") long fileSize, + @JsonProperty("addresses") List addresses, + @JsonProperty("predicate") TupleDomain predicate, + @JsonProperty("partitionKeys") List partitionKeys, + @JsonProperty("splitWeight") SplitWeight splitWeight) + { + checkArgument(start >= 0, "start must be positive"); + checkArgument(length >= 0, "length must be positive"); + checkArgument(fileSize >= 0, "fileSize must be positive"); + + this.path = requireNonNull(path, "path is null"); + this.start = start; + this.length = length; + this.fileSize = fileSize; + this.addresses = ImmutableList.copyOf(requireNonNull(addresses, "addresses is null")); + this.predicate = requireNonNull(predicate, "predicate is null"); + this.partitionKeys = ImmutableList.copyOf(requireNonNull(partitionKeys, "partitionKeys is null")); + 
this.splitWeight = requireNonNull(splitWeight, "splitWeight is null"); + } + + @Override + public boolean isRemotelyAccessible() + { + return true; + } + + @JsonProperty + @Override + public List getAddresses() + { + return addresses; + } + + @Override + public Object getInfo() + { + return ImmutableMap.builder() + .put("path", path) + .put("start", start) + .put("length", length) + .put("fileSize", fileSize) + .build(); + } + + @JsonProperty + @Override + public SplitWeight getSplitWeight() + { + return splitWeight; + } + + @JsonProperty + public String getPath() + { + return path; + } + + @JsonProperty + public long getStart() + { + return start; + } + + @JsonProperty + public long getLength() + { + return length; + } + + @JsonProperty + public long getFileSize() + { + return fileSize; + } + + @JsonProperty + public TupleDomain getPredicate() + { + return predicate; + } + + @JsonProperty + public List getPartitionKeys() + { + return partitionKeys; + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(path) + .addValue(start) + .addValue(length) + .addValue(fileSize) + .toString(); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSplitManager.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSplitManager.java new file mode 100644 index 000000000000..0e5375865802 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSplitManager.java @@ -0,0 +1,77 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.trino.plugin.hudi; + +import io.trino.plugin.base.classloader.ClassLoaderSafeConnectorSplitSource; +import io.trino.plugin.hive.HdfsEnvironment; +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hive.metastore.Table; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.ConnectorSplitManager; +import io.trino.spi.connector.ConnectorSplitSource; +import io.trino.spi.connector.ConnectorTableHandle; +import io.trino.spi.connector.ConnectorTransactionHandle; +import io.trino.spi.connector.Constraint; +import io.trino.spi.connector.DynamicFilter; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; + +import javax.inject.Inject; + +import java.util.Map; +import java.util.stream.Collectors; + +import static java.util.Objects.requireNonNull; +import static java.util.function.Function.identity; + +public class HudiSplitManager + implements ConnectorSplitManager +{ + private final HudiTransactionManager transactionManager; + private final HdfsEnvironment hdfsEnvironment; + + @Inject + public HudiSplitManager(HudiTransactionManager transactionManager, HdfsEnvironment hdfsEnvironment) + { + this.transactionManager = requireNonNull(transactionManager, "transactionManager is null"); + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + } + + @Override + public ConnectorSplitSource getSplits( + ConnectorTransactionHandle transaction, + ConnectorSession session, + ConnectorTableHandle tableHandle, + SplitSchedulingStrategy splitSchedulingStrategy, + DynamicFilter dynamicFilter, + Constraint constraint) + { + HudiTableHandle hudiTable = (HudiTableHandle) tableHandle; + HudiMetadata hudiMetadata = transactionManager.get(transaction); + HiveMetastore metastore = hudiMetadata.getMetastore(); + Map 
partitionColumnHandles = hudiMetadata.getColumnHandles(session, tableHandle) + .values().stream().map(HiveColumnHandle.class::cast) + .filter(HiveColumnHandle::isPartitionKey) + .collect(Collectors.toMap(HiveColumnHandle::getName, identity())); + Table hiveTable = hudiMetadata.getTable(); + HdfsEnvironment.HdfsContext context = new HdfsEnvironment.HdfsContext(session); + Configuration conf = hdfsEnvironment.getConfiguration( + context, new Path(hiveTable.getStorage().getLocation())); + HudiSplitSource splitSource = new HudiSplitSource( + session, metastore, hiveTable, hudiTable, conf, partitionColumnHandles); + return new ClassLoaderSafeConnectorSplitSource(splitSource, Thread.currentThread().getContextClassLoader()); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSplitSource.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSplitSource.java new file mode 100644 index 000000000000..defb8a5d9f3b --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSplitSource.java @@ -0,0 +1,138 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.trino.plugin.hudi; + +import io.airlift.log.Logger; +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.authentication.HiveIdentity; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hive.metastore.Table; +import io.trino.plugin.hudi.query.HudiFileListing; +import io.trino.plugin.hudi.query.HudiFileListingFactory; +import io.trino.plugin.hudi.query.HudiQueryMode; +import io.trino.plugin.hudi.split.HudiSplitBackgroundLoader; +import io.trino.spi.connector.ConnectorPartitionHandle; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.ConnectorSplit; +import io.trino.spi.connector.ConnectorSplitSource; +import org.apache.hadoop.conf.Configuration; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.HoodieTimer; + +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.TimeUnit; + +import static io.trino.plugin.hudi.HudiSessionProperties.isHudiMetadataEnabled; +import static io.trino.plugin.hudi.HudiSessionProperties.shouldSkipMetaStoreForPartition; +import static io.trino.plugin.hudi.HudiUtil.getMetaClient; +import static java.util.concurrent.CompletableFuture.completedFuture; +import static java.util.stream.Collectors.toList; + +public class HudiSplitSource + implements ConnectorSplitSource +{ + private static final Logger log = Logger.get(HudiSplitSource.class); + private static final long IDLE_WAIT_TIME_MS = 10; + private final HiveIdentity identity; + private final 
HoodieTableMetaClient metaClient; + private final boolean metadataEnabled; + private final boolean shouldSkipMetastoreForPartition; + private final HudiFileListing hudiFileListing; + private final ArrayDeque connectorSplitQueue; + private final HudiSplitBackgroundLoader splitLoader; + private final ScheduledExecutorService splitLoaderExecutorService; + private final ScheduledFuture splitLoaderFuture; + + public HudiSplitSource( + ConnectorSession session, + HiveMetastore metastore, + Table table, + HudiTableHandle tableHandle, + Configuration conf, + Map partitionColumnHandleMap) + { + this.identity = new HiveIdentity(session); + this.metadataEnabled = isHudiMetadataEnabled(session); + this.shouldSkipMetastoreForPartition = shouldSkipMetaStoreForPartition(session); + this.metaClient = tableHandle.getMetaClient().orElseGet(() -> getMetaClient(conf, tableHandle.getBasePath())); + HoodieEngineContext engineContext = new HoodieLocalEngineContext(conf); + HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder() + .enable(metadataEnabled) + .build(); + List partitionColumnHandles = table.getPartitionColumns().stream() + .map(column -> partitionColumnHandleMap.get(column.getName())).collect(toList()); + // TODO: fetch the query mode from config / query context + this.hudiFileListing = HudiFileListingFactory.get(HudiQueryMode.READ_OPTIMIZED, + metadataConfig, engineContext, tableHandle, metaClient, metastore, table, + identity, partitionColumnHandles, shouldSkipMetastoreForPartition); + this.connectorSplitQueue = new ArrayDeque<>(); + this.splitLoader = new HudiSplitBackgroundLoader( + session, tableHandle, metaClient, hudiFileListing, connectorSplitQueue); + this.splitLoaderExecutorService = Executors.newSingleThreadScheduledExecutor(); + this.splitLoaderFuture = this.splitLoaderExecutorService.schedule( + this.splitLoader, 0, TimeUnit.MILLISECONDS); + } + + @Override + public CompletableFuture getNextBatch(ConnectorPartitionHandle partitionHandle, int 
maxSize) + { + if (isFinished()) { + return completedFuture(new ConnectorSplitBatch(new ArrayList<>(), true)); + } + + HoodieTimer timer = new HoodieTimer().startTimer(); + List connectorSplits = new ArrayList<>(); + + while (!splitLoaderFuture.isDone() && connectorSplitQueue.isEmpty()) { + try { + Thread.sleep(IDLE_WAIT_TIME_MS); + } + catch (InterruptedException e) { + throw new RuntimeException(e); + } + } + + synchronized (connectorSplitQueue) { + while (connectorSplits.size() < maxSize && !connectorSplitQueue.isEmpty()) { + connectorSplits.add(connectorSplitQueue.pollFirst()); + } + } + + log.debug(String.format("Get the next batch of %d splits in %d ms", connectorSplits.size(), timer.endTimer())); + return completedFuture(new ConnectorSplitBatch(connectorSplits, isFinished())); + } + + @Override + public void close() + { + hudiFileListing.close(); + } + + @Override + public boolean isFinished() + { + return splitLoaderFuture.isDone() && connectorSplitQueue.isEmpty(); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTableHandle.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTableHandle.java new file mode 100644 index 000000000000..d33e4dbeb7fd --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTableHandle.java @@ -0,0 +1,177 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.trino.plugin.hudi; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.HivePartition; +import io.trino.spi.connector.ConnectorTableHandle; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.predicate.TupleDomain; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.TimelineUtils; + +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + +import static io.trino.plugin.hudi.HudiUtil.mergePredicates; +import static java.util.Objects.requireNonNull; + +public class HudiTableHandle + implements ConnectorTableHandle +{ + private final String schemaName; + private final String tableName; + private final String basePath; + private final HoodieTableType tableType; + private final TupleDomain partitionPredicates; + private final TupleDomain regularPredicates; + private final Optional> partitions; + private final Optional metaClient; + + @JsonCreator + public HudiTableHandle( + @JsonProperty("schemaName") String schemaName, + @JsonProperty("tableName") String tableName, + @JsonProperty("basePath") String basePath, + @JsonProperty("tableType") HoodieTableType tableType, + @JsonProperty("partitionPredicates") TupleDomain partitionPredicates, + @JsonProperty("regularPredicates") TupleDomain regularPredicates) + { + this(schemaName, tableName, basePath, tableType, partitionPredicates, + regularPredicates, Optional.empty(), Optional.empty()); + } + + public HudiTableHandle( + String schemaName, + String tableName, + String basePath, + HoodieTableType tableType, + TupleDomain partitionPredicates, + TupleDomain 
regularPredicates, + Optional metaClient) + { + this(schemaName, tableName, basePath, tableType, partitionPredicates, + regularPredicates, Optional.empty(), metaClient); + } + + public HudiTableHandle( + String schemaName, + String tableName, + String basePath, + HoodieTableType tableType, + TupleDomain partitionPredicates, + TupleDomain regularPredicates, + Optional> partitions, + Optional metaClient) + { + this.schemaName = requireNonNull(schemaName, "schemaName is null"); + this.tableName = requireNonNull(tableName, "tableName is null"); + this.basePath = requireNonNull(basePath, "basePath is null"); + this.tableType = requireNonNull(tableType, "tableType is null"); + this.partitionPredicates = requireNonNull(partitionPredicates, "partitionPredicates is null"); + this.regularPredicates = requireNonNull(regularPredicates, "regularPredicates is null"); + this.partitions = requireNonNull(partitions, "partitions is null").map(ImmutableList::copyOf); + this.metaClient = requireNonNull(metaClient, "metaClient is null"); + } + + @JsonProperty + public String getSchemaName() + { + return schemaName; + } + + @JsonProperty + public String getTableName() + { + return tableName; + } + + @JsonProperty + public String getBasePath() + { + return basePath; + } + + @JsonProperty + public HoodieTableType getTableType() + { + return tableType; + } + + @JsonProperty + public TupleDomain getPartitionPredicates() + { + return partitionPredicates; + } + + @JsonProperty + public TupleDomain getRegularPredicates() + { + return regularPredicates; + } + + @JsonIgnore + public Optional> getPartitions() + { + if (partitions.isEmpty()) { + List partitionIds = TimelineUtils.getPartitionsWritten(metaClient.get().getActiveTimeline()); + List hivePartitions = partitionIds.stream() + .map(p -> new HivePartition(getSchemaTableName(), p, ImmutableMap.of())) + .collect(Collectors.toList()); + return Optional.of(hivePartitions); + } + + return partitions; + } + + @JsonIgnore + public Optional 
getMetaClient() + { + return metaClient; + } + + public SchemaTableName getSchemaTableName() + { + return new SchemaTableName(schemaName, tableName); + } + + HudiTableHandle withPredicates(HudiPredicates predicates) + { + return new HudiTableHandle( + schemaName, + tableName, + basePath, + tableType, + mergePredicates(partitionPredicates, + predicates.getPartitionColumnPredicates().transformKeys(HiveColumnHandle.class::cast)), + mergePredicates(regularPredicates, + predicates.getRegularColumnPredicates().transformKeys(HiveColumnHandle.class::cast)), + partitions, + metaClient); + } + + @Override + public String toString() + { + return getSchemaTableName().toString(); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTableProperties.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTableProperties.java new file mode 100644 index 000000000000..6f25a19cf1a0 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTableProperties.java @@ -0,0 +1,69 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableList; +import io.trino.spi.session.PropertyMetadata; +import org.apache.hudi.common.model.HoodieFileFormat; + +import javax.inject.Inject; + +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static io.trino.spi.session.PropertyMetadata.enumProperty; +import static io.trino.spi.session.PropertyMetadata.stringProperty; + +public class HudiTableProperties +{ + public static final String BASE_FILE_FORMAT_PROPERTY = "format"; + public static final String LOCATION_PROPERTY = "location"; + + private final List> tableProperties; + + @Inject + public HudiTableProperties(HudiConfig hudiConfig) + { + tableProperties = ImmutableList.>builder() + .add(enumProperty( + BASE_FILE_FORMAT_PROPERTY, + "File format for the table", + HoodieFileFormat.class, + hudiConfig.getBaseFileFormat(), + false)) + .add(stringProperty( + LOCATION_PROPERTY, + "File system location URI for the table", + null, + false)) + .build(); + } + + public List> getTableProperties() + { + return tableProperties; + } + + public static HoodieFileFormat getBaseFileFormat(Map tableProperties) + { + return (HoodieFileFormat) tableProperties.get(BASE_FILE_FORMAT_PROPERTY); + } + + public static Optional getTableLocation(Map tableProperties) + { + return Optional.ofNullable((String) tableProperties.get(LOCATION_PROPERTY)); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTransactionManager.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTransactionManager.java new file mode 100644 index 000000000000..7fad31361b2a --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTransactionManager.java @@ -0,0 +1,49 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.trino.plugin.hudi; + +import io.trino.spi.connector.ConnectorMetadata; +import io.trino.spi.connector.ConnectorTransactionHandle; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; + +public class HudiTransactionManager +{ + private final Map transactions = new ConcurrentHashMap<>(); + + public HudiMetadata get(ConnectorTransactionHandle transaction) + { + HudiMetadata metadata = transactions.get(transaction); + checkArgument(metadata != null, "no such transaction: %s", transaction); + return metadata; + } + + public HudiMetadata remove(ConnectorTransactionHandle transaction) + { + HudiMetadata metadata = transactions.remove(transaction); + checkArgument(metadata != null, "no such transaction: %s", transaction); + return metadata; + } + + public void put(ConnectorTransactionHandle transaction, HudiMetadata metadata) + { + ConnectorMetadata existing = transactions.putIfAbsent(transaction, metadata); + checkState(existing == null, "transaction already exists: %s", existing); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiUtil.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiUtil.java new file mode 100644 index 000000000000..505547a2c217 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiUtil.java @@ -0,0 +1,442 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this 
file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.airlift.log.Logger; +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.HivePartition; +import io.trino.plugin.hive.HivePartitionKey; +import io.trino.plugin.hive.HivePartitionManager; +import io.trino.plugin.hive.metastore.Column; +import io.trino.spi.TrinoException; +import io.trino.spi.connector.ColumnHandle; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.predicate.Domain; +import io.trino.spi.predicate.NullableValue; +import io.trino.spi.predicate.TupleDomain; +import io.trino.spi.type.Decimals; +import io.trino.spi.type.Type; +import io.trino.spi.type.TypeSignature; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.BlockLocation; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputFormat; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.hadoop.HoodieParquetInputFormat; +import org.apache.hudi.hadoop.PathWithBootstrapFileStatus; +import org.apache.hudi.hive.HiveStylePartitionValueExtractor; +import 
org.apache.hudi.hive.MultiPartKeysValueExtractor; +import org.apache.hudi.hive.PartitionValueExtractor; +import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; +import org.apache.hudi.hive.SlashEncodedHourPartitionValueExtractor; + +import java.io.IOException; +import java.sql.Timestamp; +import java.time.LocalDate; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeParseException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.airlift.slice.Slices.utf8Slice; +import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA; +import static io.trino.plugin.hive.util.HiveUtil.checkCondition; +import static io.trino.plugin.hive.util.HiveUtil.parsePartitionValue; +import static io.trino.plugin.hudi.HudiErrorCode.HUDI_INVALID_PARTITION_VALUE; +import static io.trino.spi.type.StandardTypes.BIGINT; +import static io.trino.spi.type.StandardTypes.BOOLEAN; +import static io.trino.spi.type.StandardTypes.DATE; +import static io.trino.spi.type.StandardTypes.DECIMAL; +import static io.trino.spi.type.StandardTypes.DOUBLE; +import static io.trino.spi.type.StandardTypes.INTEGER; +import static io.trino.spi.type.StandardTypes.REAL; +import static io.trino.spi.type.StandardTypes.SMALLINT; +import static io.trino.spi.type.StandardTypes.TIMESTAMP; +import static io.trino.spi.type.StandardTypes.TINYINT; +import static io.trino.spi.type.StandardTypes.VARBINARY; +import static io.trino.spi.type.StandardTypes.VARCHAR; +import static java.lang.Double.parseDouble; +import static java.lang.Float.floatToRawIntBits; +import static java.lang.Float.parseFloat; +import static java.lang.Long.parseLong; +import static java.lang.String.format; +import static java.util.Objects.isNull; +import static java.util.stream.Collectors.toList; +import static 
org.apache.hadoop.hive.common.FileUtils.unescapePathName; + +public class HudiUtil +{ + private static final Logger log = Logger.get(HudiUtil.class); + private static final double SPLIT_SLOP = 1.1; // 10% slop + + private HudiUtil() {} + + public static HoodieTableMetaClient getMetaClient(Configuration conf, String basePath) + { + return HoodieTableMetaClient.builder().setConf(conf).setBasePath(basePath).build(); + } + + public static boolean isHudiParquetInputFormat(InputFormat inputFormat) + { + return inputFormat instanceof HoodieParquetInputFormat; + } + + public static HoodieFileFormat getHudiFileFormat(String path) + { + final String extension = FSUtils.getFileExtension(path); + if (extension.equals(HoodieFileFormat.PARQUET.getFileExtension())) { + return HoodieFileFormat.PARQUET; + } + if (extension.equals(HoodieFileFormat.HOODIE_LOG.getFileExtension())) { + return HoodieFileFormat.HOODIE_LOG; + } + if (extension.equals(HoodieFileFormat.ORC.getFileExtension())) { + return HoodieFileFormat.ORC; + } + if (extension.equals(HoodieFileFormat.HFILE.getFileExtension())) { + return HoodieFileFormat.HFILE; + } + throw new HoodieIOException("Hoodie InputFormat not implemented for base file of type " + extension); + } + + public static HudiPredicates splitPredicate( + TupleDomain predicate) + { + Map partitionColumnPredicates = new HashMap<>(); + Map regularColumnPredicates = new HashMap<>(); + + Optional> domains = predicate.getDomains(); + domains.ifPresent(columnHandleDomainMap -> columnHandleDomainMap.forEach((key, value) -> { + HiveColumnHandle columnHandle = (HiveColumnHandle) key; + if (columnHandle.isPartitionKey()) { + partitionColumnPredicates.put(key, value); + } + else { + regularColumnPredicates.put(key, value); + } + })); + + return new HudiPredicates( + TupleDomain.withColumnDomains(partitionColumnPredicates), + TupleDomain.withColumnDomains(regularColumnPredicates)); + } + + public static TupleDomain mergePredicates( + TupleDomain predicates1, 
TupleDomain predicates2) + { + Map newColumnDomains = new HashMap<>(); + predicates1.getDomains().ifPresent(newColumnDomains::putAll); + predicates2.getDomains().ifPresent(domains -> { + for (HiveColumnHandle columnHandle : domains.keySet()) { + if (newColumnDomains.containsKey(columnHandle) + && !newColumnDomains.get(columnHandle).equals(domains.get(columnHandle))) { + throw new HoodieIOException(String.format("Conflicting predicates for %s: [%s] and [%s]", + columnHandle, newColumnDomains.get(columnHandle), domains.get(columnHandle))); + } + else { + newColumnDomains.put(columnHandle, domains.get(columnHandle)); + } + } + }); + return TupleDomain.withColumnDomains(newColumnDomains); + } + + public static boolean doesPartitionMatchPredicates( + SchemaTableName tableName, + String hivePartitionName, + List partitionColumnHandles, + TupleDomain constraintSummary) + { + List partitionColumnTypes = partitionColumnHandles.stream() + .map(HiveColumnHandle::getType) + .collect(toList()); + HivePartition partition = HivePartitionManager.parsePartition( + tableName, hivePartitionName, partitionColumnHandles, partitionColumnTypes); + + return partitionMatches(partitionColumnHandles, constraintSummary, partition); + } + + public static boolean doesPartitionMatchPredicates( + SchemaTableName tableName, + String relativePartitionPath, + List partitionValues, + List partitionColumnHandles, + TupleDomain constraintSummary) + { + List partitionColumnTypes = partitionColumnHandles.stream() + .map(HiveColumnHandle::getType) + .collect(toList()); + HivePartition partition = parsePartition( + tableName, relativePartitionPath, partitionValues, partitionColumnHandles, partitionColumnTypes); + + return partitionMatches(partitionColumnHandles, constraintSummary, partition); + } + + public static HivePartition parsePartition( + SchemaTableName tableName, + String dummyPartitionName, + List partitionValues, + List partitionColumns, + List partitionColumnTypes) + { + ImmutableMap.Builder 
builder = ImmutableMap.builder(); + for (int i = 0; i < partitionColumns.size(); i++) { + HiveColumnHandle column = partitionColumns.get(i); + NullableValue parsedValue = parsePartitionValue( + dummyPartitionName, partitionValues.get(i), partitionColumnTypes.get(i)); + builder.put(column, parsedValue); + } + Map values = builder.build(); + return new HivePartition(tableName, dummyPartitionName, values); + } + + public static boolean partitionMatches(List partitionColumns, TupleDomain constraintSummary, HivePartition partition) + { + if (constraintSummary.isNone()) { + log.warn("constraintSummary is none"); + return false; + } + Map domains = constraintSummary.getDomains().orElseGet(ImmutableMap::of); + for (HiveColumnHandle column : partitionColumns) { + NullableValue value = partition.getKeys().get(column); + Domain allowedDomain = domains.get(column); + if (allowedDomain != null && !allowedDomain.includesNullableValue(value.getValue())) { + return false; + } + } + return true; + } + + public static Optional convertPartitionValue( + String partitionColumnName, + String partitionValue, + TypeSignature partitionDataType) + { + if (isNull(partitionValue)) { + return Optional.empty(); + } + + String baseType = partitionDataType.getBase(); + try { + switch (baseType) { + case TINYINT: + case SMALLINT: + case INTEGER: + case BIGINT: + return Optional.of(parseLong(partitionValue)); + case REAL: + return Optional.of((long) floatToRawIntBits(parseFloat(partitionValue))); + case DOUBLE: + return Optional.of(parseDouble(partitionValue)); + case VARCHAR: + case VARBINARY: + return Optional.of(utf8Slice(partitionValue)); + case DATE: + return Optional.of(LocalDate.parse(partitionValue, DateTimeFormatter.ISO_LOCAL_DATE).toEpochDay()); + case TIMESTAMP: + return Optional.of(Timestamp.valueOf(partitionValue).toLocalDateTime().toEpochSecond(ZoneOffset.UTC) * 1_000); + case BOOLEAN: + checkArgument(partitionValue.equalsIgnoreCase("true") || 
partitionValue.equalsIgnoreCase("false")); + return Optional.of(Boolean.valueOf(partitionValue)); + case DECIMAL: + return Optional.of(Decimals.parse(partitionValue).getObject()); + default: + throw new TrinoException(HUDI_INVALID_PARTITION_VALUE, + format("Unsupported data type '%s' for partition column %s", partitionDataType, partitionColumnName)); + } + } + catch (IllegalArgumentException | DateTimeParseException e) { + throw new TrinoException(HUDI_INVALID_PARTITION_VALUE, + format("Can not parse partition value '%s' of type '%s' for partition column '%s'", + partitionValue, partitionDataType, partitionColumnName)); + } + } + + public static List getSplits(FileSystem fs, FileStatus fileStatus) + throws IOException + { + if (fileStatus.isDirectory()) { + throw new IOException("Not a file: " + fileStatus.getPath()); + } + + Path path = fileStatus.getPath(); + long length = fileStatus.getLen(); + + // generate splits + List splits = new ArrayList<>(); + if (length != 0) { + BlockLocation[] blkLocations; + if (fileStatus instanceof LocatedFileStatus) { + blkLocations = ((LocatedFileStatus) fileStatus).getBlockLocations(); + } + else { + blkLocations = fs.getFileBlockLocations(fileStatus, 0, length); + } + if (isSplitable(path)) { + long splitSize = fileStatus.getBlockSize(); + + long bytesRemaining = length; + while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) { + String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, length - bytesRemaining); + splits.add(makeSplit(path, length - bytesRemaining, splitSize, splitHosts[0], splitHosts[1])); + bytesRemaining -= splitSize; + } + + if (bytesRemaining != 0) { + String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, length - bytesRemaining); + splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, splitHosts[0], splitHosts[1])); + } + } + else { + String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, 0); + splits.add(makeSplit(path, 0, length, splitHosts[0], 
splitHosts[1])); + } + } + else { + //Create empty hosts array for zero length files + splits.add(makeSplit(path, 0, length, new String[0])); + } + return splits; + } + + private static boolean isSplitable(Path filename) + { + return !(filename instanceof PathWithBootstrapFileStatus); + } + + private static FileSplit makeSplit(Path file, long start, long length, String[] hosts) + { + return new FileSplit(file, start, length, hosts); + } + + private static FileSplit makeSplit(Path file, long start, long length, String[] hosts, String[] inMemoryHosts) + { + return new FileSplit(file, start, length, hosts, inMemoryHosts); + } + + private static String[][] getSplitHostsAndCachedHosts(BlockLocation[] blkLocations, long offset) + throws IOException + { + int startIndex = getBlockIndex(blkLocations, offset); + + return new String[][] {blkLocations[startIndex].getHosts(), + blkLocations[startIndex].getCachedHosts()}; + } + + private static int getBlockIndex(BlockLocation[] blkLocations, long offset) + { + for (int i = 0; i < blkLocations.length; i++) { + // is the offset inside this block? + if ((blkLocations[i].getOffset() <= offset) && + (offset < blkLocations[i].getOffset() + blkLocations[i].getLength())) { + return i; + } + } + BlockLocation last = blkLocations[blkLocations.length - 1]; + long fileLength = last.getOffset() + last.getLength() - 1; + throw new IllegalArgumentException("Offset " + offset + + " is outside of file (0.." + + fileLength + ")"); + } + + public static List buildPartitionKeys(List keys, List values) + { + checkCondition(keys.size() == values.size(), HIVE_INVALID_METADATA, + "Expected %s partition key values, but got %s. 
Keys: %s, Values: %s.", + keys.size(), values.size(), keys, values); + ImmutableList.Builder partitionKeys = ImmutableList.builder(); + for (int i = 0; i < keys.size(); i++) { + String name = keys.get(i).getName(); + String value = values.get(i); + partitionKeys.add(new HivePartitionKey(name, value)); + } + return partitionKeys.build(); + } + + public static List buildPartitionValues(String partitionNames) + { + ImmutableList.Builder values = ImmutableList.builder(); + String[] parts = partitionNames.split("="); + if (parts.length == 1) { + values.add(unescapePathName(partitionNames)); + return values.build(); + } + if (parts.length == 2) { + values.add(unescapePathName(parts[1])); + return values.build(); + } + for (int i = 1; i < parts.length; i++) { + String val = parts[i]; + int j = val.lastIndexOf('/'); + if (j == -1) { + values.add(unescapePathName(val)); + } + else { + values.add(unescapePathName(val.substring(0, j))); + } + } + return values.build(); + } + + public static PartitionValueExtractor inferPartitionValueExtractor( + String relativePartitionPath, List expectedPartitionValues) + throws HoodieIOException + { + // The order of extractors to try should not be changed + List partitionValueExtractorList = new ArrayList<>(); + partitionValueExtractorList.add(new HiveStylePartitionValueExtractor()); + partitionValueExtractorList.add(new MultiPartKeysValueExtractor()); + partitionValueExtractorList.add(new SlashEncodedDayPartitionValueExtractor()); + partitionValueExtractorList.add(new SlashEncodedHourPartitionValueExtractor()); + + for (PartitionValueExtractor partitionValueExtractor : partitionValueExtractorList) { + try { + List extractedPartitionValues = + partitionValueExtractor.extractPartitionValuesInPath(relativePartitionPath); + if (extractedPartitionValues.equals(expectedPartitionValues)) { + log.debug(String.format("Inferred %s to be the partition value extractor", + partitionValueExtractor.getClass().getName())); + return 
partitionValueExtractor; + } + else { + log.debug(String.format("Cannot use partition value extractor %s due to value mismatch " + + "(expected: %s, actual: %s), trying the next option ...", + partitionValueExtractor.getClass().getName(), expectedPartitionValues, + extractedPartitionValues)); + } + } + catch (IllegalArgumentException e) { + log.debug(String.format("Cannot use partition value extractor %s, trying the next option ...", + partitionValueExtractor.getClass().getName())); + } + } + + throw new HoodieIOException("Cannot infer the partition value extractor"); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/InternalHudiConnectorFactory.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/InternalHudiConnectorFactory.java new file mode 100644 index 000000000000..051586b27357 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/InternalHudiConnectorFactory.java @@ -0,0 +1,122 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableSet; +import com.google.inject.Injector; +import com.google.inject.Key; +import com.google.inject.Module; +import com.google.inject.TypeLiteral; +import io.airlift.bootstrap.Bootstrap; +import io.airlift.bootstrap.LifeCycleManager; +import io.airlift.event.client.EventModule; +import io.airlift.json.JsonModule; +import io.trino.plugin.base.CatalogName; +import io.trino.plugin.base.classloader.ClassLoaderSafeConnectorPageSourceProvider; +import io.trino.plugin.base.classloader.ClassLoaderSafeConnectorSplitManager; +import io.trino.plugin.base.classloader.ClassLoaderSafeNodePartitioningProvider; +import io.trino.plugin.base.jmx.MBeanServerModule; +import io.trino.plugin.base.security.AllowAllAccessControl; +import io.trino.plugin.base.session.SessionPropertiesProvider; +import io.trino.plugin.hive.HiveHdfsModule; +import io.trino.plugin.hive.NodeVersion; +import io.trino.plugin.hive.authentication.HdfsAuthenticationModule; +import io.trino.plugin.hive.azure.HiveAzureModule; +import io.trino.plugin.hive.gcs.HiveGcsModule; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hive.metastore.HiveMetastoreModule; +import io.trino.plugin.hive.s3.HiveS3Module; +import io.trino.spi.NodeManager; +import io.trino.spi.PageIndexerFactory; +import io.trino.spi.classloader.ThreadContextClassLoader; +import io.trino.spi.connector.Connector; +import io.trino.spi.connector.ConnectorAccessControl; +import io.trino.spi.connector.ConnectorContext; +import io.trino.spi.connector.ConnectorNodePartitioningProvider; +import io.trino.spi.connector.ConnectorPageSourceProvider; +import io.trino.spi.connector.ConnectorSplitManager; +import io.trino.spi.type.TypeManager; +import org.weakref.jmx.guice.MBeanModule; + +import java.util.Map; +import java.util.Optional; +import java.util.Set; + +import static java.util.Objects.requireNonNull; + +public final class InternalHudiConnectorFactory 
+{ + private InternalHudiConnectorFactory() {} + + public static Connector createConnector(String catalogName, Map config, ConnectorContext context, Module module) + { + return createConnector(catalogName, config, context, module, Optional.empty()); + } + + public static Connector createConnector(String catalogName, Map config, ConnectorContext context, Module module, Optional metastore) + { + requireNonNull(config, "config is null"); + ClassLoader classLoader = InternalHudiConnectorFactory.class.getClassLoader(); + try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(classLoader)) { + Bootstrap app = new Bootstrap( + new EventModule(), + new MBeanModule(), + new JsonModule(), + new HudiModule(), + new HiveMetastoreModule(metastore), + new HiveHdfsModule(), + new HiveS3Module(), + new HiveGcsModule(), + new HiveAzureModule(), + new HdfsAuthenticationModule(), + new MBeanServerModule(), + binder -> { + binder.bind(NodeVersion.class).toInstance(new NodeVersion(context.getNodeManager().getCurrentNode().getVersion())); + binder.bind(NodeManager.class).toInstance(context.getNodeManager()); + binder.bind(TypeManager.class).toInstance(context.getTypeManager()); + binder.bind(PageIndexerFactory.class).toInstance(context.getPageIndexerFactory()); + binder.bind(CatalogName.class).toInstance(new CatalogName(catalogName)); + }, + module); + + Injector injector = app + .doNotInitializeLogging() + .setRequiredConfigurationProperties(config) + .initialize(); + + LifeCycleManager lifeCycleManager = injector.getInstance(LifeCycleManager.class); + HudiTransactionManager transactionManager = injector.getInstance(HudiTransactionManager.class); + HudiMetadataFactory metadataFactory = injector.getInstance(HudiMetadataFactory.class); + ConnectorSplitManager splitManager = injector.getInstance(ConnectorSplitManager.class); + ConnectorPageSourceProvider connectorPageSource = injector.getInstance(ConnectorPageSourceProvider.class); + ConnectorNodePartitioningProvider 
connectorDistributionProvider = injector.getInstance(ConnectorNodePartitioningProvider.class); + Set sessionPropertiesProviders = injector.getInstance(Key.get(new TypeLiteral>() {})); + HudiTableProperties hudiTableProperties = injector.getInstance(HudiTableProperties.class); + Optional accessControl = Optional.of(new AllowAllAccessControl()); + + return new HudiConnector( + lifeCycleManager, + transactionManager, + metadataFactory, + new ClassLoaderSafeConnectorSplitManager(splitManager, classLoader), + new ClassLoaderSafeConnectorPageSourceProvider(connectorPageSource, classLoader), + new ClassLoaderSafeNodePartitioningProvider(connectorDistributionProvider, classLoader), + ImmutableSet.of(), + sessionPropertiesProviders, + hudiTableProperties.getTableProperties(), + accessControl); + } + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/page/HudiPageSourceCreator.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/page/HudiPageSourceCreator.java new file mode 100644 index 000000000000..93a0e87ca357 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/page/HudiPageSourceCreator.java @@ -0,0 +1,51 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.trino.plugin.hudi.page; + +import io.trino.plugin.hive.FileFormatDataSourceStats; +import io.trino.plugin.hive.HdfsEnvironment; +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hudi.HudiConfig; +import io.trino.plugin.hudi.HudiSplit; +import io.trino.spi.connector.ConnectorPageSource; +import io.trino.spi.security.ConnectorIdentity; +import org.apache.hadoop.conf.Configuration; +import org.joda.time.DateTimeZone; + +import java.util.List; + +public abstract class HudiPageSourceCreator +{ + protected final HudiConfig hudiConfig; + protected final HdfsEnvironment hdfsEnvironment; + protected final FileFormatDataSourceStats stats; + protected final DateTimeZone timeZone; + + public HudiPageSourceCreator( + HudiConfig hudiConfig, HdfsEnvironment hdfsEnvironment, FileFormatDataSourceStats stats, + DateTimeZone timeZone) + { + this.hudiConfig = hudiConfig; + this.hdfsEnvironment = hdfsEnvironment; + this.stats = stats; + this.timeZone = timeZone; + } + + public abstract ConnectorPageSource createPageSource( + Configuration configuration, + ConnectorIdentity identity, + List regularColumns, + HudiSplit hudiSplit); +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/page/HudiPageSourceFactory.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/page/HudiPageSourceFactory.java new file mode 100644 index 000000000000..e2eaf45de8e4 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/page/HudiPageSourceFactory.java @@ -0,0 +1,43 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.trino.plugin.hudi.page; + +import io.trino.plugin.hive.FileFormatDataSourceStats; +import io.trino.plugin.hive.HdfsEnvironment; +import io.trino.plugin.hudi.HudiConfig; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.exception.HoodieIOException; +import org.joda.time.DateTimeZone; + +import java.util.Map; + +public final class HudiPageSourceFactory +{ + private HudiPageSourceFactory() {} + + public static HudiPageSourceCreator get( + HoodieFileFormat baseFileFormat, HudiConfig hudiConfig, HdfsEnvironment hdfsEnvironment, + FileFormatDataSourceStats stats, DateTimeZone timeZone, Map context) + { + switch (baseFileFormat) { + case PARQUET: + return new HudiParquetPageSourceCreator( + hudiConfig, hdfsEnvironment, stats, timeZone, context); + default: + throw new HoodieIOException( + String.format("Base file format %s is not supported yet", baseFileFormat)); + } + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/page/HudiParquetPageSourceCreator.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/page/HudiParquetPageSourceCreator.java new file mode 100644 index 000000000000..0597632f20c4 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/page/HudiParquetPageSourceCreator.java @@ -0,0 +1,236 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.trino.plugin.hudi.page; + +import com.google.common.collect.ImmutableList; +import io.trino.parquet.Field; +import io.trino.parquet.ParquetCorruptionException; +import io.trino.parquet.ParquetDataSource; +import io.trino.parquet.ParquetDataSourceId; +import io.trino.parquet.ParquetReaderOptions; +import io.trino.parquet.RichColumnDescriptor; +import io.trino.parquet.predicate.Predicate; +import io.trino.parquet.reader.ParquetReader; +import io.trino.plugin.hive.FileFormatDataSourceStats; +import io.trino.plugin.hive.HdfsEnvironment; +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.ReaderColumns; +import io.trino.plugin.hive.parquet.HdfsParquetDataSource; +import io.trino.plugin.hive.parquet.ParquetPageSource; +import io.trino.plugin.hudi.HudiConfig; +import io.trino.plugin.hudi.HudiSplit; +import io.trino.spi.TrinoException; +import io.trino.spi.connector.ConnectorPageSource; +import io.trino.spi.predicate.TupleDomain; +import io.trino.spi.security.ConnectorIdentity; +import io.trino.spi.type.Type; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.BlockMissingException; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.schema.MessageType; +import org.joda.time.DateTimeZone; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext; +import 
static io.trino.parquet.ParquetTypeUtils.getColumnIO; +import static io.trino.parquet.ParquetTypeUtils.getDescriptors; +import static io.trino.parquet.ParquetTypeUtils.lookupColumnByName; +import static io.trino.parquet.predicate.PredicateUtils.buildPredicate; +import static io.trino.parquet.predicate.PredicateUtils.predicateMatches; +import static io.trino.parquet.reader.MetadataReader.readFooter; +import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; +import static io.trino.plugin.hive.HivePageSourceProvider.projectBaseColumns; +import static io.trino.plugin.hive.HivePageSourceProvider.projectSufficientColumns; +import static io.trino.plugin.hive.parquet.HiveParquetColumnIOConverter.constructField; +import static io.trino.plugin.hive.parquet.ParquetPageSourceFactory.PARQUET_ROW_INDEX_COLUMN; +import static io.trino.plugin.hive.parquet.ParquetPageSourceFactory.getColumnIndexStore; +import static io.trino.plugin.hive.parquet.ParquetPageSourceFactory.getColumnType; +import static io.trino.plugin.hive.parquet.ParquetPageSourceFactory.getParquetTupleDomain; +import static io.trino.plugin.hive.parquet.ParquetPageSourceFactory.getParquetType; +import static io.trino.plugin.hudi.HudiErrorCode.HUDI_BAD_DATA; +import static io.trino.plugin.hudi.HudiErrorCode.HUDI_CANNOT_OPEN_SPLIT; +import static io.trino.plugin.hudi.HudiErrorCode.HUDI_MISSING_DATA; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toUnmodifiableList; + +public class HudiParquetPageSourceCreator + extends HudiPageSourceCreator +{ + public static final String CONTEXT_KEY_PARQUET_READER_OPTIONS = "parquet_reader_options"; + private final ParquetReaderOptions options; + + public HudiParquetPageSourceCreator( + HudiConfig hudiConfig, HdfsEnvironment hdfsEnvironment, FileFormatDataSourceStats stats, + DateTimeZone timeZone, Map context) + { + super(hudiConfig, hdfsEnvironment, stats, timeZone); + this.options = 
(ParquetReaderOptions) requireNonNull( + context.get(CONTEXT_KEY_PARQUET_READER_OPTIONS), "Parquet reader options are not present"); + } + + @Override + public ConnectorPageSource createPageSource( + Configuration configuration, + ConnectorIdentity identity, + List regularColumns, + HudiSplit hudiSplit) + { + ParquetDataSource dataSource = null; + boolean useParquetColumnNames = hudiConfig.getUseParquetColumnNames(); + Path path = new Path(hudiSplit.getPath()); + long start = hudiSplit.getStart(); + long length = hudiSplit.getLength(); + long estimatedFileSize = hudiSplit.getFileSize(); + try { + FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration); + FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path)); + dataSource = new HdfsParquetDataSource( + new ParquetDataSourceId(path.toString()), estimatedFileSize, inputStream, stats, options); + ParquetDataSource parquetDataSource = dataSource; + ParquetMetadata parquetMetadata = hdfsEnvironment.doAs(identity, () -> readFooter(parquetDataSource)); + FileMetaData fileMetaData = parquetMetadata.getFileMetaData(); + MessageType fileSchema = fileMetaData.getSchema(); + + Optional message = projectSufficientColumns(regularColumns) + .map(projection -> projection.get().stream() + .map(HiveColumnHandle.class::cast) + .collect(toUnmodifiableList())) + .orElse(regularColumns).stream() + .filter(column -> column.getColumnType() == REGULAR) + .map(column -> getColumnType(column, fileSchema, useParquetColumnNames)) + .filter(Optional::isPresent) + .map(Optional::get) + .map(type -> new MessageType(fileSchema.getName(), type)) + .reduce(MessageType::union); + + MessageType requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of())); + MessageColumnIO messageColumn = getColumnIO(fileSchema, requestedSchema); + + Map, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema); + TupleDomain parquetTupleDomain 
= options.isIgnoreStatistics() + ? TupleDomain.all() + : getParquetTupleDomain( + descriptorsByPath, hudiSplit.getPredicate(), fileSchema, useParquetColumnNames); + + Predicate parquetPredicate = buildPredicate( + requestedSchema, parquetTupleDomain, descriptorsByPath, timeZone); + + long nextStart = 0; + ImmutableList.Builder blocks = ImmutableList.builder(); + ImmutableList.Builder blockStarts = ImmutableList.builder(); + ImmutableList.Builder> columnIndexes = ImmutableList.builder(); + for (BlockMetaData block : parquetMetadata.getBlocks()) { + long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset(); + Optional columnIndex = getColumnIndexStore( + dataSource, block, descriptorsByPath, parquetTupleDomain, options); + if (start <= firstDataPage && firstDataPage < start + length + && predicateMatches(parquetPredicate, block, dataSource, + descriptorsByPath, parquetTupleDomain, columnIndex)) { + blocks.add(block); + blockStarts.add(nextStart); + columnIndexes.add(columnIndex); + } + nextStart += block.getRowCount(); + } + + ParquetReader parquetReader = new ParquetReader( + Optional.ofNullable(fileMetaData.getCreatedBy()), + messageColumn, + blocks.build(), + Optional.of(blockStarts.build()), + dataSource, + timeZone, + newSimpleAggregatedMemoryContext(), + options, + parquetPredicate, + columnIndexes.build()); + Optional readerProjections = projectBaseColumns(regularColumns); + List baseColumns = readerProjections.map(projection -> + projection.get().stream() + .map(HiveColumnHandle.class::cast) + .collect(toUnmodifiableList())) + .orElse(regularColumns); + + for (HiveColumnHandle column : baseColumns) { + checkArgument(column == PARQUET_ROW_INDEX_COLUMN + || column.getColumnType() == REGULAR, "column type must be REGULAR: %s", column); + } + + ImmutableList.Builder trinoTypes = ImmutableList.builder(); + ImmutableList.Builder> internalFields = ImmutableList.builder(); + ImmutableList.Builder rowIndexColumns = ImmutableList.builder(); + for 
(HiveColumnHandle column : baseColumns) { + trinoTypes.add(column.getBaseType()); + rowIndexColumns.add(column == PARQUET_ROW_INDEX_COLUMN); + if (column == PARQUET_ROW_INDEX_COLUMN) { + internalFields.add(Optional.empty()); + } + else { + internalFields.add(Optional.ofNullable( + getParquetType(column, fileSchema, useParquetColumnNames)) + .flatMap(field -> { + String columnName = useParquetColumnNames + ? column.getBaseColumnName() + : fileSchema.getFields().get(column.getBaseHiveColumnIndex()).getName(); + return constructField(column.getBaseType(), lookupColumnByName(messageColumn, columnName)); + })); + } + } + + return new ParquetPageSource( + parquetReader, + trinoTypes.build(), + rowIndexColumns.build(), + internalFields.build()); + } + catch (IOException | RuntimeException e) { + try { + if (dataSource != null) { + dataSource.close(); + } + } + catch (IOException ignored) { + } + if (e instanceof TrinoException) { + throw (TrinoException) e; + } + String message = format("Error opening Hudi split %s (offset=%s, length=%s): %s", + path, start, length, e.getMessage()); + + if (e instanceof ParquetCorruptionException) { + throw new TrinoException(HUDI_BAD_DATA, message, e); + } + + if (e instanceof BlockMissingException) { + throw new TrinoException(HUDI_MISSING_DATA, message, e); + } + throw new TrinoException(HUDI_CANNOT_OPEN_SPLIT, message, e); + } + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionHiveInfo.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionHiveInfo.java new file mode 100644 index 000000000000..9ad1fd7565fa --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionHiveInfo.java @@ -0,0 +1,134 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.trino.plugin.hudi.partition; + +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.HivePartitionKey; +import io.trino.plugin.hive.authentication.HiveIdentity; +import io.trino.plugin.hive.metastore.Column; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hive.metastore.Partition; +import io.trino.plugin.hive.metastore.Table; +import io.trino.plugin.hive.util.HiveUtil; +import io.trino.plugin.hudi.HudiUtil; +import io.trino.spi.predicate.TupleDomain; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.exception.HoodieIOException; + +import java.util.Collections; +import java.util.List; +import java.util.Optional; + +import static java.util.Objects.isNull; + +public class HudiPartitionHiveInfo + extends HudiPartitionInfo +{ + private final List partitionColumns; + private final HiveMetastore hiveMetastore; + private final HiveIdentity hiveIdentity; + + public HudiPartitionHiveInfo( + String hivePartitionName, List partitionColumns, + List partitionColumnHandles, + TupleDomain constraintSummary, + Table table, HiveMetastore hiveMetastore, HiveIdentity hiveIdentity) + { + super(table, partitionColumnHandles, constraintSummary); + this.hivePartitionName = hivePartitionName; + this.partitionColumns = partitionColumns; + if (partitionColumns.isEmpty()) { + this.relativePartitionPath = ""; + this.hivePartitionKeys = Collections.emptyList(); + } + this.hiveMetastore = hiveMetastore; + this.hiveIdentity = hiveIdentity; + } + + @Override + 
public String getRelativePartitionPath() + { + if (isNull(relativePartitionPath)) { + loadPartitionInfoFromHiveMetastore(); + } + return relativePartitionPath; + } + + @Override + public String getHivePartitionName() + { + return hivePartitionName; + } + + @Override + public List getHivePartitionKeys() + { + if (isNull(hivePartitionKeys)) { + loadPartitionInfoFromHiveMetastore(); + } + return hivePartitionKeys; + } + + @Override + public boolean doesMatchPredicates() + { + return HudiUtil.doesPartitionMatchPredicates( + table.getSchemaTableName(), hivePartitionName, + partitionColumnHandles, constraintSummary); + } + + @Override + public String getComparingKey() + { + return hivePartitionName; + } + + @Override + public void loadPartitionInfo(Optional partition) + { + if (partition.isEmpty()) { + throw new HoodieIOException( + String.format("Cannot find partition in Hive Metastore: %s", hivePartitionName)); + } + this.relativePartitionPath = FSUtils.getRelativePartitionPath( + new Path(table.getStorage().getLocation()), + new Path(partition.get().getStorage().getLocation())); + this.hivePartitionKeys = + HudiUtil.buildPartitionKeys(partitionColumns, partition.get().getValues()); + } + + @Override + public String toString() + { + StringBuilder stringBuilder = new StringBuilder(); + stringBuilder.append("HudiPartitionHiveInfo{"); + stringBuilder.append("hivePartitionName="); + stringBuilder.append(hivePartitionName); + if (!isNull(hivePartitionKeys)) { + stringBuilder.append(",hivePartitionKeys="); + stringBuilder.append(hivePartitionKeys); + } + stringBuilder.append("}"); + return stringBuilder.toString(); + } + + private void loadPartitionInfoFromHiveMetastore() + { + Optional partition = hiveMetastore.getPartition( + hiveIdentity, table, HiveUtil.toPartitionValues(hivePartitionName)); + loadPartitionInfo(partition); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionInfo.java 
b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionInfo.java new file mode 100644 index 000000000000..7284eb7afb1a --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionInfo.java @@ -0,0 +1,63 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.trino.plugin.hudi.partition; + +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.HivePartitionKey; +import io.trino.plugin.hive.metastore.Partition; +import io.trino.plugin.hive.metastore.Table; +import io.trino.spi.predicate.TupleDomain; + +import java.util.List; +import java.util.Optional; + +public abstract class HudiPartitionInfo +{ + protected final Table table; + protected final List partitionColumnHandles; + protected final TupleDomain constraintSummary; + + // Relative partition path + protected String relativePartitionPath; + // Hive partition name containing partition column key-value pairs + protected String hivePartitionName; + protected List hivePartitionKeys; + + public HudiPartitionInfo( + Table table, List partitionColumnHandles, + TupleDomain constraintSummary) + { + this.table = table; + this.partitionColumnHandles = partitionColumnHandles; + this.constraintSummary = constraintSummary; + } + + public Table getTable() + { + return table; + } + + public abstract String getRelativePartitionPath(); + + public abstract String getHivePartitionName(); + + public abstract List 
getHivePartitionKeys(); + + public abstract boolean doesMatchPredicates(); + + public abstract String getComparingKey(); + + public abstract void loadPartitionInfo(Optional partition); +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionInfoFactory.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionInfoFactory.java new file mode 100644 index 000000000000..a303ba0609ee --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionInfoFactory.java @@ -0,0 +1,49 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.trino.plugin.hudi.partition; + +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.authentication.HiveIdentity; +import io.trino.plugin.hive.metastore.Column; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hive.metastore.Table; +import io.trino.spi.predicate.TupleDomain; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.hive.PartitionValueExtractor; + +import java.util.List; + +public final class HudiPartitionInfoFactory +{ + private HudiPartitionInfoFactory() {} + + public static HudiPartitionInfo get( + boolean shouldSkipMetastoreForPartition, + Option relativePartitionPath, Option hivePartitionName, + Option partitionValueExtractor, + List partitionColumns, List partitionColumnHandles, + TupleDomain constraintSummary, + Table table, HiveMetastore hiveMetastore, HiveIdentity hiveIdentity) + { + if (shouldSkipMetastoreForPartition) { + return new HudiPartitionInternalInfo( + relativePartitionPath.get(), partitionColumns, partitionColumnHandles, + constraintSummary, partitionValueExtractor.get(), table); + } + return new HudiPartitionHiveInfo( + hivePartitionName.get(), partitionColumns, partitionColumnHandles, + constraintSummary, table, hiveMetastore, hiveIdentity); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionInfoLoader.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionInfoLoader.java new file mode 100644 index 000000000000..48d7fea3fa0c --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionInfoLoader.java @@ -0,0 +1,117 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.trino.plugin.hudi.partition; + +import io.airlift.log.Logger; +import io.trino.plugin.hive.metastore.Partition; +import io.trino.plugin.hudi.query.HudiFileListing; +import io.trino.spi.connector.ConnectorSession; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.exception.HoodieIOException; + +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; + +import static io.trino.plugin.hudi.HudiSessionProperties.getMaxPartitionBatchSize; +import static io.trino.plugin.hudi.HudiSessionProperties.getMinPartitionBatchSize; + +public class HudiPartitionInfoLoader + implements Runnable +{ + private static final Logger log = Logger.get(HudiPartitionInfoLoader.class); + private final HudiFileListing hudiFileListing; + private final int minPartitionBatchSize; + private final int maxPartitionBatchSize; + private final ArrayDeque partitionQueue; + private int currBatchSize; + + public HudiPartitionInfoLoader( + ConnectorSession session, + HudiFileListing hudiFileListing, + ArrayDeque partitionQueue) + { + this.hudiFileListing = hudiFileListing; + this.partitionQueue = partitionQueue; + this.minPartitionBatchSize = getMinPartitionBatchSize(session); + this.maxPartitionBatchSize = getMaxPartitionBatchSize(session); + this.currBatchSize = -1; + } + + @Override + public void run() + { + HoodieTimer timer = new HoodieTimer().startTimer(); + List 
hudiPartitionInfoList = hudiFileListing.getPartitionsToScan().stream() + .sorted(Comparator.comparing(HudiPartitionInfo::getComparingKey)).collect(Collectors.toList()); + boolean shouldUseHiveMetastore = + !hudiPartitionInfoList.isEmpty() && hudiPartitionInfoList.get(0) instanceof HudiPartitionHiveInfo; + Iterator iterator = hudiPartitionInfoList.iterator(); + while (iterator.hasNext()) { + int batchSize = updateBatchSize(); + List partitionInfoBatch = new ArrayList<>(); + while (iterator.hasNext() && batchSize > 0) { + partitionInfoBatch.add(iterator.next()); + batchSize--; + } + + if (!partitionInfoBatch.isEmpty()) { + if (shouldUseHiveMetastore) { + Map> partitions = + hudiFileListing.getPartitions(partitionInfoBatch.stream() + .map(HudiPartitionInfo::getHivePartitionName) + .collect(Collectors.toList())); + partitionInfoBatch + .forEach(partitionInfo -> { + String hivePartitionName = partitionInfo.getHivePartitionName(); + if (!partitions.containsKey(hivePartitionName)) { + throw new HoodieIOException("Partition does not exist: " + hivePartitionName); + } + partitionInfo.loadPartitionInfo(partitions.get(hivePartitionName)); + synchronized (partitionQueue) { + partitionQueue.add(partitionInfo); + } + }); + } + else { + partitionInfoBatch.forEach(partitionInfo -> { + partitionInfo.getHivePartitionKeys(); + synchronized (partitionQueue) { + partitionQueue.add(partitionInfo); + } + }); + } + } + } + log.debug(String.format("HudiPartitionInfoLoader finishes in %d ms", timer.endTimer())); + } + + private int updateBatchSize() + { + if (currBatchSize <= 0) { + currBatchSize = minPartitionBatchSize; + } + else { + currBatchSize *= 2; + currBatchSize = Math.min(currBatchSize, maxPartitionBatchSize); + } + return currBatchSize; + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionInternalInfo.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionInternalInfo.java new file mode 100644 index 
000000000000..78ad741c2f43 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionInternalInfo.java @@ -0,0 +1,119 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.trino.plugin.hudi.partition; + +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.HivePartitionKey; +import io.trino.plugin.hive.metastore.Column; +import io.trino.plugin.hive.metastore.Partition; +import io.trino.plugin.hive.metastore.Table; +import io.trino.plugin.hudi.HudiUtil; +import io.trino.spi.predicate.TupleDomain; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.hive.PartitionValueExtractor; + +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; + +import static java.util.Objects.isNull; + +public class HudiPartitionInternalInfo + extends HudiPartitionInfo +{ + private final List partitionColumns; + private final PartitionValueExtractor partitionValueExtractor; + + public HudiPartitionInternalInfo( + String relativePartitionPath, List partitionColumns, + List partitionColumnHandles, + TupleDomain constraintSummary, + PartitionValueExtractor partitionValueExtractor, + Table table) + { + super(table, partitionColumnHandles, constraintSummary); + this.relativePartitionPath = relativePartitionPath; + this.partitionColumns = partitionColumns; + this.partitionValueExtractor = partitionValueExtractor; + } + + @Override + 
public String getRelativePartitionPath() + { + return relativePartitionPath; + } + + @Override + public String getHivePartitionName() + { + throw new HoodieException( + "HudiPartitionInternalInfo::getHivePartitionName() should not be called"); + } + + @Override + public List getHivePartitionKeys() + { + if (isNull(hivePartitionKeys)) { + List partitionValues = + partitionValueExtractor.extractPartitionValuesInPath(relativePartitionPath); + hivePartitionKeys = HudiUtil.buildPartitionKeys(partitionColumns, partitionValues); + } + + return hivePartitionKeys; + } + + @Override + public boolean doesMatchPredicates() + { + Map partitionKeyValueMap = + getHivePartitionKeys().stream().collect(Collectors.toMap( + HivePartitionKey::getName, HivePartitionKey::getValue)); + List partitionValues = partitionColumns.stream() + .map(column -> partitionKeyValueMap.get(column.getName())) + .collect(Collectors.toList()); + return HudiUtil.doesPartitionMatchPredicates( + table.getSchemaTableName(), relativePartitionPath, partitionValues, + partitionColumnHandles, constraintSummary); + } + + @Override + public String getComparingKey() + { + return relativePartitionPath; + } + + @Override + public void loadPartitionInfo(Optional partition) + { + throw new HoodieException( + "HudiPartitionInternalInfo::loadPartitionInfo() should not be called"); + } + + @Override + public String toString() + { + StringBuilder stringBuilder = new StringBuilder(); + stringBuilder.append("HudiPartitionInternalInfo{"); + stringBuilder.append("relativePartitionPath="); + stringBuilder.append(relativePartitionPath); + if (!isNull(hivePartitionKeys)) { + stringBuilder.append(",hivePartitionKeys="); + stringBuilder.append(hivePartitionKeys); + } + stringBuilder.append("}"); + return stringBuilder.toString(); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionScanner.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionScanner.java new 
file mode 100644 index 000000000000..20db184e2f24 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionScanner.java @@ -0,0 +1,93 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.trino.plugin.hudi.partition; + +import io.airlift.log.Logger; +import io.trino.plugin.hudi.query.HudiFileListing; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; + +import java.util.ArrayDeque; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class HudiPartitionScanner + implements Runnable +{ + private static final Logger log = Logger.get(HudiPartitionScanner.class); + private final HudiFileListing hudiFileListing; + private final ArrayDeque partitionQueue; + private final Map partitionInfoMap; + private final ArrayDeque> hoodieFileStatusQueue; + private boolean isRunning; + + public HudiPartitionScanner( + HudiFileListing hudiFileListing, + ArrayDeque partitionQueue, + Map partitionInfoMap, + ArrayDeque> hoodieFileStatusQueue) + { + this.hudiFileListing = hudiFileListing; + this.partitionQueue = partitionQueue; + this.partitionInfoMap = partitionInfoMap; + this.hoodieFileStatusQueue = hoodieFileStatusQueue; + this.isRunning = true; + } + + @Override + public void run() + { + HoodieTimer timer = new 
HoodieTimer().startTimer(); + + while (isRunning || !partitionQueue.isEmpty()) { + HudiPartitionInfo partitionInfo = null; + synchronized (partitionQueue) { + if (!partitionQueue.isEmpty()) { + partitionInfo = partitionQueue.pollFirst(); + } + } + + if (partitionInfo != null) { + scanPartition(partitionInfo); + } + } + log.debug(String.format("HudiPartitionScanner %s finishes in %d ms", this, timer.endTimer())); + } + + public void stopRunning() + { + this.isRunning = false; + } + + private void scanPartition(HudiPartitionInfo partitionInfo) + { + // Load Hive partition keys + synchronized (partitionInfoMap) { + partitionInfoMap.put(partitionInfo.getRelativePartitionPath(), partitionInfo); + } + final String relativePartitionPath = partitionInfo.getRelativePartitionPath(); + List> fileStatusList = hudiFileListing.listStatus(partitionInfo).stream() + .map(fileStatus -> new ImmutablePair<>(fileStatus, relativePartitionPath)) + .collect(Collectors.toList()); + synchronized (hoodieFileStatusQueue) { + hoodieFileStatusQueue.addAll(fileStatusList); + } + log.debug(String.format("Add %d base files for %s", + fileStatusList.size(), partitionInfo.getRelativePartitionPath())); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionSplitGenerator.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionSplitGenerator.java new file mode 100644 index 000000000000..a0d6c0ee93b7 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionSplitGenerator.java @@ -0,0 +1,129 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.trino.plugin.hudi.partition; + +import com.google.common.collect.ImmutableList; +import io.airlift.log.Logger; +import io.trino.plugin.hive.HivePartitionKey; +import io.trino.plugin.hudi.HudiSplit; +import io.trino.plugin.hudi.HudiTableHandle; +import io.trino.plugin.hudi.HudiUtil; +import io.trino.plugin.hudi.split.HudiSplitWeightProvider; +import io.trino.spi.connector.ConnectorSplit; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieIOException; + +import java.io.IOException; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class HudiPartitionSplitGenerator + implements Runnable +{ + private static final Logger log = Logger.get(HudiPartitionSplitGenerator.class); + private final FileSystem fileSystem; + private final HoodieTableMetaClient metaClient; + private final HudiTableHandle tableHandle; + private final HudiSplitWeightProvider hudiSplitWeightProvider; + private final Map partitionInfoMap; + private final ArrayDeque> hoodieFileStatusQueue; + private final ArrayDeque connectorSplitQueue; + private boolean isRunning; + + public HudiPartitionSplitGenerator( + FileSystem fileSystem, + HoodieTableMetaClient metaClient, + HudiTableHandle tableHandle, + HudiSplitWeightProvider 
hudiSplitWeightProvider, + Map partitionInfoMap, + ArrayDeque> hoodieFileStatusQueue, + ArrayDeque connectorSplitQueue) + { + this.fileSystem = fileSystem; + this.metaClient = metaClient; + this.tableHandle = tableHandle; + this.hudiSplitWeightProvider = hudiSplitWeightProvider; + this.partitionInfoMap = partitionInfoMap; + this.hoodieFileStatusQueue = hoodieFileStatusQueue; + this.connectorSplitQueue = connectorSplitQueue; + this.isRunning = true; + } + + @Override + public void run() + { + HoodieTimer timer = new HoodieTimer().startTimer(); + while (isRunning || !hoodieFileStatusQueue.isEmpty()) { + Pair fileStatusPartitionPair = null; + synchronized (hoodieFileStatusQueue) { + if (!hoodieFileStatusQueue.isEmpty()) { + fileStatusPartitionPair = hoodieFileStatusQueue.pollFirst(); + } + } + if (fileStatusPartitionPair != null) { + try { + String relativePartitionPath = fileStatusPartitionPair.getValue(); + final List hivePartitionKeys; + synchronized (partitionInfoMap) { + hivePartitionKeys = partitionInfoMap.get(relativePartitionPath).getHivePartitionKeys(); + } + List hudiSplits = HudiUtil.getSplits(fileSystem, fileStatusPartitionPair.getKey()) + .stream() + .flatMap(fileSplit -> { + List result = new ArrayList<>(); + try { + result.add(new HudiSplit( + fileSplit.getPath().toString(), + fileSplit.getStart(), + fileSplit.getLength(), + metaClient.getFs().getLength(fileSplit.getPath()), + ImmutableList.of(), + tableHandle.getRegularPredicates(), + hivePartitionKeys, + hudiSplitWeightProvider.weightForSplitSizeInBytes( + fileSplit.getLength()))); + } + catch (IOException e) { + throw new HoodieIOException(String.format( + "Unable to get Hudi split for %s, start=%d len=%d", + fileSplit.getPath(), fileSplit.getStart(), fileSplit.getLength()), e); + } + return result.stream(); + }) + .collect(Collectors.toList()); + synchronized (connectorSplitQueue) { + connectorSplitQueue.addAll(hudiSplits); + } + } + catch (IOException e) { + throw new HoodieIOException("Unable to 
get splits for " + fileStatusPartitionPair.getKey().getPath(), e); + } + } + } + log.debug(String.format("HudiPartitionSplitGenerator finishes in %d ms", timer.endTimer())); + } + + public void stopRunning() + { + this.isRunning = false; + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/query/HudiFileListing.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/query/HudiFileListing.java new file mode 100644 index 000000000000..0ee138c81576 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/query/HudiFileListing.java @@ -0,0 +1,102 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.trino.plugin.hudi.query; + +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.authentication.HiveIdentity; +import io.trino.plugin.hive.metastore.Column; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hive.metastore.MetastoreUtil; +import io.trino.plugin.hive.metastore.Partition; +import io.trino.plugin.hive.metastore.Table; +import io.trino.plugin.hudi.HudiTableHandle; +import io.trino.plugin.hudi.partition.HudiPartitionInfo; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.predicate.TupleDomain; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.view.FileSystemViewManager; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; + +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static java.util.Objects.isNull; + +public abstract class HudiFileListing +{ + protected final HoodieMetadataConfig metadataConfig; + protected final HoodieEngineContext engineContext; + protected final HoodieTableMetaClient metaClient; + protected final HudiTableHandle tableHandle; + protected final HiveMetastore hiveMetastore; + protected final Table hiveTable; + protected final HiveIdentity hiveIdentity; + protected final SchemaTableName tableName; + protected final List partitionColumnHandles; + protected final boolean shouldSkipMetastoreForPartition; + protected HoodieTableFileSystemView fileSystemView; + protected TupleDomain partitionKeysFilter; + protected List partitionColumns; + + public HudiFileListing( + HoodieMetadataConfig metadataConfig, HoodieEngineContext engineContext, + HudiTableHandle tableHandle, HoodieTableMetaClient metaClient, + HiveMetastore hiveMetastore, Table hiveTable, HiveIdentity hiveIdentity, + List 
partitionColumnHandles, boolean shouldSkipMetastoreForPartition) + { + this.metadataConfig = metadataConfig; + this.engineContext = engineContext; + this.metaClient = metaClient; + this.tableHandle = tableHandle; + this.tableName = tableHandle.getSchemaTableName(); + this.hiveMetastore = hiveMetastore; + this.hiveTable = hiveTable; + this.hiveIdentity = hiveIdentity; + this.partitionColumnHandles = partitionColumnHandles; + this.shouldSkipMetastoreForPartition = shouldSkipMetastoreForPartition; + } + + public abstract List getPartitionsToScan(); + + public abstract List listStatus(HudiPartitionInfo partitionInfo); + + public void close() + { + if (!fileSystemView.isClosed()) { + fileSystemView.close(); + } + } + + public Map> getPartitions(List partitionNames) + { + return hiveMetastore.getPartitionsByNames(hiveIdentity, hiveTable, partitionNames); + } + + protected void initFileSystemViewAndPredicates() + { + if (isNull(fileSystemView)) { + // These are time-consuming operations + // Triggering them when getting the partitions + this.fileSystemView = FileSystemViewManager.createInMemoryFileSystemView( + engineContext, metaClient, metadataConfig); + this.partitionKeysFilter = MetastoreUtil.computePartitionKeyFilter( + partitionColumnHandles, tableHandle.getPartitionPredicates()); + } + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/query/HudiFileListingFactory.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/query/HudiFileListingFactory.java new file mode 100644 index 000000000000..aec92a198d61 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/query/HudiFileListingFactory.java @@ -0,0 +1,54 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.trino.plugin.hudi.query; + +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.authentication.HiveIdentity; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hive.metastore.Table; +import io.trino.plugin.hudi.HudiTableHandle; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.exception.HoodieException; + +import java.util.List; + +public final class HudiFileListingFactory +{ + private HudiFileListingFactory() {} + + public static HudiFileListing get( + HudiQueryMode queryMode, HoodieMetadataConfig metadataConfig, + HoodieEngineContext engineContext, HudiTableHandle tableHandle, + HoodieTableMetaClient metaClient, HiveMetastore hiveMetastore, Table hiveTable, + HiveIdentity hiveIdentity, List partitionColumnHandles, + boolean shouldSkipMetastoreForPartition) + { + switch (queryMode) { + case SNAPSHOT: + return new HudiSnapshotFileListing(metadataConfig, engineContext, tableHandle, + metaClient, hiveMetastore, hiveTable, hiveIdentity, partitionColumnHandles, + shouldSkipMetastoreForPartition); + case READ_OPTIMIZED: + return new HudiReadOptimizedFileListing(metadataConfig, engineContext, tableHandle, + metaClient, hiveMetastore, hiveTable, hiveIdentity, partitionColumnHandles, + shouldSkipMetastoreForPartition); + default: + throw new HoodieException( + String.format("Hudi query mode %s is not supported yet", queryMode)); + } + } 
/**
 * Query modes of the Hudi connector.
 * <p>
 * The declaration order is part of the enum's contract ({@code ordinal()}, {@code values()})
 * and must not be changed.
 */
public enum HudiQueryMode
{
    SNAPSHOT,
    INCREMENTAL,
    READ_OPTIMIZED,
}
+ */ + +package io.trino.plugin.hudi.query; + +import com.google.common.annotations.VisibleForTesting; +import io.airlift.log.Logger; +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.HivePartitionKey; +import io.trino.plugin.hive.authentication.HiveIdentity; +import io.trino.plugin.hive.metastore.Column; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hive.metastore.Table; +import io.trino.plugin.hudi.HudiTableHandle; +import io.trino.plugin.hudi.HudiUtil; +import io.trino.plugin.hudi.partition.HudiPartitionHiveInfo; +import io.trino.plugin.hudi.partition.HudiPartitionInfo; +import io.trino.plugin.hudi.partition.HudiPartitionInfoFactory; +import io.trino.spi.connector.TableNotFoundException; +import io.trino.spi.predicate.TupleDomain; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.TimelineUtils; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; +import org.apache.hudi.hive.NonPartitionedExtractor; +import org.apache.hudi.hive.PartitionValueExtractor; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +import static java.util.Objects.isNull; + +public class HudiReadOptimizedFileListing + extends HudiFileListing +{ + private static final Logger log = Logger.get(HudiReadOptimizedFileListing.class); + + private List hivePartitionNames; + + public HudiReadOptimizedFileListing( + HoodieMetadataConfig metadataConfig, HoodieEngineContext engineContext, + HudiTableHandle tableHandle, HoodieTableMetaClient metaClient, + HiveMetastore hiveMetastore, Table hiveTable, HiveIdentity 
hiveIdentity, + List partitionColumnHandles, boolean shouldSkipMetastoreForPartition) + { + super(metadataConfig, engineContext, tableHandle, metaClient, hiveMetastore, hiveTable, + hiveIdentity, partitionColumnHandles, shouldSkipMetastoreForPartition); + } + + @Override + public List getPartitionsToScan() + { + HoodieTimer timer = new HoodieTimer().startTimer(); + + initFileSystemViewAndPredicates(); + + partitionColumns = hiveTable.getPartitionColumns(); + List allPartitionInfoList = null; + + if (shouldSkipMetastoreForPartition) { + try { + // Use relative partition path and other context to construct + // HudiPartitionInternalInfo instances + PartitionValueExtractor partitionValueExtractor = partitionColumns.isEmpty() + ? new NonPartitionedExtractor() + : inferPartitionValueExtractorWithHiveMetastore(); + List relativePartitionPathList = partitionColumns.isEmpty() + ? Collections.singletonList("") + : TimelineUtils.getPartitionsWritten(metaClient.getActiveTimeline()); + allPartitionInfoList = relativePartitionPathList.stream() + .map(relativePartitionPath -> + HudiPartitionInfoFactory.get(shouldSkipMetastoreForPartition, + Option.of(relativePartitionPath), Option.empty(), + Option.of(partitionValueExtractor), partitionColumns, + partitionColumnHandles, tableHandle.getPartitionPredicates(), + hiveTable, hiveMetastore, hiveIdentity)) + .collect(Collectors.toList()); + } + catch (HoodieIOException e) { + log.warn("Cannot skip Hive Metastore for scanning partitions. Falling back to using Hive Metastore."); + } + } + + if (isNull(allPartitionInfoList)) { + // Use Hive partition names and other context to construct + // HudiPartitionHiveInfo instances + if (isNull(hivePartitionNames)) { + hivePartitionNames = partitionColumns.isEmpty() + ? 
Collections.singletonList("") + : getPartitionNamesFromHiveMetastore(partitionKeysFilter); + } + + allPartitionInfoList = hivePartitionNames.stream() + .map(hivePartitionName -> + HudiPartitionInfoFactory.get(shouldSkipMetastoreForPartition, + Option.empty(), Option.of(hivePartitionName), + Option.empty(), partitionColumns, + partitionColumnHandles, tableHandle.getPartitionPredicates(), + hiveTable, hiveMetastore, hiveIdentity)) + .collect(Collectors.toList()); + } + + List filteredPartitionInfoList = allPartitionInfoList.stream() + .filter(HudiPartitionInfo::doesMatchPredicates) + .collect(Collectors.toList()); + + log.debug(String.format( + "Get partitions to scan in %d ms (shouldSkipMetastoreForPartition: %s): %s", + timer.endTimer(), shouldSkipMetastoreForPartition, filteredPartitionInfoList)); + + return filteredPartitionInfoList; + } + + @Override + public List listStatus(HudiPartitionInfo partitionInfo) + { + initFileSystemViewAndPredicates(); + return fileSystemView.getLatestBaseFiles(partitionInfo.getRelativePartitionPath()) + .map(baseFile -> { + try { + return HoodieInputFormatUtils.getFileStatus(baseFile); + } + catch (IOException e) { + throw new HoodieIOException("Error getting file status of " + baseFile.getPath(), e); + } + }) + .collect(Collectors.toList()); + } + + @VisibleForTesting + PartitionValueExtractor inferPartitionValueExtractorWithHiveMetastore() + throws HoodieIOException + { + hivePartitionNames = getPartitionNamesFromHiveMetastore(TupleDomain.all()); + if (hivePartitionNames.isEmpty()) { + throw new HoodieIOException("Cannot infer partition value extractor with Hive Metastore: partition list is empty!"); + } + HudiPartitionHiveInfo partitionHiveInfo = new HudiPartitionHiveInfo( + hivePartitionNames.get(0), partitionColumns, partitionColumnHandles, + tableHandle.getPartitionPredicates(), hiveTable, hiveMetastore, hiveIdentity); + String relativePartitionPath = partitionHiveInfo.getRelativePartitionPath(); + List partitionValues = 
partitionHiveInfo.getHivePartitionKeys().stream() + .map(HivePartitionKey::getValue).collect(Collectors.toList()); + return HudiUtil.inferPartitionValueExtractor(relativePartitionPath, partitionValues); + } + + private List getPartitionNamesFromHiveMetastore(TupleDomain partitionKeysFilter) + { + return hiveMetastore.getPartitionNamesByFilter( + hiveIdentity, tableName.getSchemaName(), tableName.getTableName(), + partitionColumns.stream().map(Column::getName).collect(Collectors.toList()), + partitionKeysFilter) + .orElseThrow(() -> new TableNotFoundException(tableHandle.getSchemaTableName())); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/query/HudiSnapshotFileListing.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/query/HudiSnapshotFileListing.java new file mode 100644 index 000000000000..e0c1ef256221 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/query/HudiSnapshotFileListing.java @@ -0,0 +1,54 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * {@link HudiFileListing} subclass whose listing methods are not implemented: both
 * overrides currently return {@code null}.
 * <p>
 * NOTE(review): presumably a placeholder for Hudi snapshot queries (inferred from the class
 * name only) — confirm the intended behaviour before anything instantiates it.
 */
public class HudiSnapshotFileListing
        extends HudiFileListing
{
    /**
     * Forwards every argument unchanged to the {@link HudiFileListing} constructor.
     */
    public HudiSnapshotFileListing(
            HoodieMetadataConfig metadataConfig, HoodieEngineContext engineContext,
            HudiTableHandle tableHandle, HoodieTableMetaClient metaClient,
            HiveMetastore hiveMetastore, Table hiveTable, HiveIdentity hiveIdentity,
            List partitionColumnHandles, boolean shouldSkipMetastoreForPartition)
    {
        super(metadataConfig, engineContext, tableHandle, metaClient, hiveMetastore, hiveTable,
                hiveIdentity, partitionColumnHandles, shouldSkipMetastoreForPartition);
    }

    @Override
    public List getPartitionsToScan()
    {
        // NOTE(review): returns null rather than a list; any caller that iterates the
        // result will fail with a NullPointerException. Failing explicitly may be clearer — TODO confirm.
        return null;
    }

    @Override
    public List listStatus(HudiPartitionInfo partitionInfo)
    {
        // NOTE(review): same as getPartitionsToScan() — unimplemented, returns null.
        return null;
    }
}
/**
 * Runnable that produces connector splits in three stages, each on its own executor:
 * one task loads partition infos, {@code partitionScannerNumThreads} tasks scan partitions
 * for files, and {@code splitGeneratorNumThreads} tasks turn listed files into splits that
 * are handed to {@code connectorSplitQueue}.
 * <p>
 * NOTE(review): none of the three executors created in the constructor is shut down
 * anywhere in this class — confirm whether something else owns their lifecycle.
 * <p>
 * NOTE(review): the queues and the map shared between the worker tasks are plain
 * {@code ArrayDeque}/{@code HashMap} instances; whether the workers synchronize access
 * is not visible from here — TODO confirm.
 */
public class HudiSplitBackgroundLoader
        implements Runnable
{
    private static final Logger log = Logger.get(HudiSplitBackgroundLoader.class);
    private final ConnectorSession session;
    private final HudiTableHandle tableHandle;
    private final HoodieTableMetaClient metaClient;
    private final HudiFileListing hudiFileListing;
    private final ArrayDeque connectorSplitQueue; // output queue supplied by the caller
    private final ArrayDeque partitionQueue; // handed to the partition loader and the scanners
    private final Map partitionInfoMap; // shared by scanners and split generators
    // NOTE(review): the declaration below looks truncated (stray '>'); the element type
    // argument appears to have been lost — restore it from the original file.
    private final ArrayDeque> hoodieFileStatusQueue;
    private final ExecutorService partitionInfoLoaderExecutorService;
    private final ExecutorService partitionScannerExecutorService;
    private final ExecutorService splitGeneratorExecutorService;
    private final int partitionScannerNumThreads;
    private final int splitGeneratorNumThreads;
    private final boolean sizeBasedSplitWeightsEnabled;
    private final DataSize standardSplitWeightSize;
    private final double minimumAssignedSplitWeight;

    /**
     * Stores the collaborators, creates the intermediate queues/map and the three executors,
     * and reads parallelism and split-weight settings from the session properties.
     */
    public HudiSplitBackgroundLoader(
            ConnectorSession session,
            HudiTableHandle tableHandle,
            HoodieTableMetaClient metaClient,
            HudiFileListing hudiFileListing,
            ArrayDeque connectorSplitQueue)
    {
        this.session = session;
        this.tableHandle = tableHandle;
        this.metaClient = metaClient;
        this.hudiFileListing = hudiFileListing;
        this.connectorSplitQueue = connectorSplitQueue;
        this.partitionQueue = new ArrayDeque<>();
        this.partitionInfoMap = new HashMap<>();
        this.hoodieFileStatusQueue = new ArrayDeque<>();
        this.partitionScannerNumThreads = getPartitionScannerParallelism(session);
        this.splitGeneratorNumThreads = getSplitGeneratorParallelism(session);
        this.partitionInfoLoaderExecutorService = Executors.newSingleThreadExecutor();
        this.partitionScannerExecutorService = Executors.newCachedThreadPool();
        this.splitGeneratorExecutorService = Executors.newCachedThreadPool();
        this.sizeBasedSplitWeightsEnabled = isSizeBasedSplitWeightsEnabled(session);
        this.standardSplitWeightSize = getStandardSplitWeightSize(session);
        this.minimumAssignedSplitWeight = getMinimumAssignedSplitWeight(session);
    }

    /**
     * Submits all three stages, then waits for them in order: the partition loader first,
     * then (after calling {@code stopRunning()} on each) the scanners, then (again after
     * {@code stopRunning()}) the split generators.
     *
     * @throws RuntimeException wrapping any {@link InterruptedException} or
     *         {@link ExecutionException} raised while waiting on a stage
     */
    @Override
    public void run()
    {
        HoodieTimer timer = new HoodieTimer().startTimer();
        FileSystem fileSystem = metaClient.getFs();
        // Step 1: fetch partitions info that need to be read for file listing.
        HudiPartitionInfoLoader partitionInfoLoader =
                new HudiPartitionInfoLoader(session, hudiFileListing, partitionQueue);
        Future partitionInfoLoaderFuture = partitionInfoLoaderExecutorService.submit(partitionInfoLoader);
        // Step 2: scan partitions to list files concurrently.
        List partitionScannerList = new ArrayList<>();
        List partitionScannerFutures = new ArrayList<>();

        for (int i = 0; i < partitionScannerNumThreads; i++) {
            HudiPartitionScanner scanner = new HudiPartitionScanner(hudiFileListing,
                    partitionQueue, partitionInfoMap, hoodieFileStatusQueue);
            partitionScannerList.add(scanner);
            partitionScannerFutures.add(partitionScannerExecutorService.submit(scanner));
        }
        // Step 3: Generate splits from the files listed in the second step.
        List splitGeneratorList = new ArrayList<>();
        List splitGeneratorFutures = new ArrayList<>();

        for (int i = 0; i < splitGeneratorNumThreads; i++) {
            // Each generator gets its own weight provider, chosen by the session setting.
            HudiSplitWeightProvider splitWeightProvider = sizeBasedSplitWeightsEnabled
                    ? new SizeBasedSplitWeightProvider(minimumAssignedSplitWeight, standardSplitWeightSize)
                    : HudiSplitWeightProvider.uniformStandardWeightProvider();
            HudiPartitionSplitGenerator generator = new HudiPartitionSplitGenerator(
                    fileSystem, metaClient, tableHandle, splitWeightProvider,
                    partitionInfoMap, hoodieFileStatusQueue, connectorSplitQueue);
            splitGeneratorList.add(generator);
            splitGeneratorFutures.add(splitGeneratorExecutorService.submit(generator));
        }

        // Wait for partition info loader to finish
        try {
            partitionInfoLoaderFuture.get();
        }
        catch (InterruptedException | ExecutionException e) {
            // NOTE(review): the interrupt flag is not restored for InterruptedException, and
            // the message says "interrupted" even when the cause is an ExecutionException.
            throw new RuntimeException("Partition loader interrupted", e);
        }

        // No more partitions will be queued: tell the scanners to stop once they are done.
        for (HudiPartitionScanner scanner : partitionScannerList) {
            scanner.stopRunning();
        }

        // Wait for all partition scanners to finish
        for (Future future : partitionScannerFutures) {
            try {
                future.get();
            }
            catch (InterruptedException | ExecutionException e) {
                throw new RuntimeException("Partition scanner interrupted", e);
            }
        }

        // All files have been listed: tell the split generators to stop once they are done.
        for (HudiPartitionSplitGenerator generator : splitGeneratorList) {
            generator.stopRunning();
        }

        // Wait for all split generators to finish
        for (Future future : splitGeneratorFutures) {
            try {
                future.get();
            }
            catch (InterruptedException | ExecutionException e) {
                throw new RuntimeException("Split generator interrupted", e);
            }
        }
        log.debug(String.format("Finish getting all splits in %d ms", timer.endTimer()));
    }
}
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.trino.plugin.hudi.split; + +import io.trino.spi.SplitWeight; + +public interface HudiSplitWeightProvider +{ + SplitWeight weightForSplitSizeInBytes(long splitSizeInBytes); + + static HudiSplitWeightProvider uniformStandardWeightProvider() + { + return (splitSizeInBytes) -> SplitWeight.standard(); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/split/SizeBasedSplitWeightProvider.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/split/SizeBasedSplitWeightProvider.java new file mode 100644 index 000000000000..e59f360e5aaf --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/split/SizeBasedSplitWeightProvider.java @@ -0,0 +1,48 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.trino.plugin.hudi.split; + +import io.airlift.units.DataSize; +import io.trino.spi.SplitWeight; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; + +public class SizeBasedSplitWeightProvider + implements HudiSplitWeightProvider +{ + private final double minimumWeight; + private final double standardSplitSizeInBytes; + + public SizeBasedSplitWeightProvider(double minimumWeight, DataSize standardSplitSize) + { + checkArgument(Double.isFinite(minimumWeight) && minimumWeight > 0 && minimumWeight <= 1, + "minimumWeight must be > 0 and <= 1, found: %s", minimumWeight); + this.minimumWeight = minimumWeight; + long standardSplitSizeInBytesLong = requireNonNull( + standardSplitSize, "standardSplitSize is null").toBytes(); + checkArgument(standardSplitSizeInBytesLong > 0, + "standardSplitSize must be > 0, found: %s", standardSplitSize); + this.standardSplitSizeInBytes = (double) standardSplitSizeInBytesLong; + } + + @Override + public SplitWeight weightForSplitSizeInBytes(long splitSizeInBytes) + { + double computedWeight = splitSizeInBytes / standardSplitSizeInBytes; + // Clamp the value between the minimum weight and 1.0 (standard weight) + return SplitWeight.fromProportion(Math.min(Math.max(computedWeight, minimumWeight), 1.0)); + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/AbstractHudiTestQueryFramework.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/AbstractHudiTestQueryFramework.java new file mode 100644 index 000000000000..04a0899e2f50 --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/AbstractHudiTestQueryFramework.java @@ -0,0 +1,180 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/**
 * Base class for Hudi connector query tests.
 * <p>
 * Creates a {@link DistributedQueryRunner} with a {@code tpch} catalog plus a Hudi and a Hive
 * catalog that both use a file metastore rooted at the same catalog directory, and offers
 * helpers that register a Hudi test table through the Hive catalog, run an assertion against
 * it, and drop it again.
 */
public class AbstractHudiTestQueryFramework
        extends AbstractTestQueryFramework
{
    public static final String HUDI_CATALOG = "hudi";
    public static final String HIVE_CATALOG = "hive";
    public static final String HUDI_SCHEMA = "default";

    // Table names double as classpath resource names (see getTableBasePath).
    static final String NON_PARTITIONED_TABLE_NAME = "hudi_non_part_cow";
    static final String PARTITIONED_COW_TABLE_NAME = "stock_ticks_cow";
    static final String PARTITIONED_MOR_TABLE_NAME = "stock_ticks_mor";

    // Format arguments, in order: catalog, schema, table name, external location.
    private static final String CREATE_NON_PARTITIONED_TABLE_STATEMENT = "CREATE TABLE %s.\"%s\".\"%s\" (\n" +
            " _hoodie_commit_time varchar,\n" +
            " _hoodie_commit_seqno varchar,\n" +
            " _hoodie_record_key varchar,\n" +
            " _hoodie_partition_path varchar,\n" +
            " _hoodie_file_name varchar,\n" +
            " rowid varchar,\n" +
            " partitionid varchar,\n" +
            " precomb bigint,\n" +
            " name varchar,\n" +
            " versionid varchar,\n" +
            " tobedeletedstr varchar,\n" +
            " inttolong integer,\n" +
            " longtoint bigint\n" +
            " )\n" +
            " WITH (\n" +
            " external_location = '%s',\n" +
            " format = 'PARQUET'\n" +
            " )";

    // Same format arguments as above; the table is partitioned by the 'dt' column.
    private static final String CREATE_PARTITIONED_TABLE_STATEMENT = "CREATE TABLE %s.\"%s\".\"%s\" (\n" +
            " _hoodie_commit_time varchar,\n" +
            " _hoodie_commit_seqno varchar,\n" +
            " _hoodie_record_key varchar,\n" +
            " _hoodie_partition_path varchar,\n" +
            " _hoodie_file_name varchar,\n" +
            " volume bigint,\n" +
            " ts varchar,\n" +
            " symbol varchar,\n" +
            " year integer,\n" +
            " month varchar,\n" +
            " high double,\n" +
            " low double,\n" +
            " key varchar,\n" +
            " date varchar,\n" +
            " close double,\n" +
            " open double,\n" +
            " day varchar,\n" +
            " dt varchar\n" +
            " )\n" +
            " WITH (\n" +
            " external_location = '%s',\n" +
            " format = 'PARQUET',\n" +
            " partitioned_by = ARRAY['dt']\n" +
            " )";

    // Lookup from test table name to the CREATE TABLE template used to register it.
    private static final Map TABLE_NAME_TO_CREATE_STATEMENT = new ImmutableMap.Builder()
            .put(NON_PARTITIONED_TABLE_NAME, CREATE_NON_PARTITIONED_TABLE_STATEMENT)
            .put(PARTITIONED_COW_TABLE_NAME, CREATE_PARTITIONED_TABLE_STATEMENT)
            .put(PARTITIONED_MOR_TABLE_NAME, CREATE_PARTITIONED_TABLE_STATEMENT)
            .build();

    @Override
    protected QueryRunner createQueryRunner()
            throws Exception
    {
        return createHudiQueryRunner(ImmutableMap.of());
    }

    /**
     * Registers {@code table}, runs the assertion, and always drops the table afterwards.
     *
     * @param table one of the table names known to {@code TABLE_NAME_TO_CREATE_STATEMENT}
     * @param testQuery query to execute
     * @param expResults passed to {@code assertQuery} as its second argument when {@code fail}
     *        is false, otherwise passed to {@code assertQueryFails}
     * @param fail whether the query is expected to fail
     */
    protected void assertHudiQuery(String table, String testQuery, String expResults, boolean fail)
    {
        try {
            syncHudiTableInMetastore(table);
            if (!fail) {
                assertQuery(testQuery, expResults);
            }
            else {
                assertQueryFails(testQuery, expResults);
            }
        }
        finally {
            dropHudiTableFromMetastore(table);
        }
    }

    /**
     * Returns the string form of the classpath resource URL named {@code tableName}.
     */
    protected static String getTableBasePath(String tableName)
    {
        // NOTE(review): getResource returns null for a missing resource, which would surface
        // here as a NullPointerException.
        return AbstractHudiTestQueryFramework.class.getClassLoader().getResource(tableName).toString();
    }

    /**
     * Builds the query runner: default session on the Hudi catalog/schema in UTC, a tpch
     * catalog, and Hudi and Hive catalogs sharing one file-metastore directory. Finally
     * creates the test schema through the Hive catalog.
     */
    private static DistributedQueryRunner createHudiQueryRunner(Map extraProperties)
            throws Exception
    {
        Session session = testSessionBuilder()
                .setCatalog(HUDI_CATALOG)
                .setSchema(HUDI_SCHEMA.toLowerCase(Locale.ROOT))
                .setTimeZoneKey(TimeZoneKey.UTC_KEY)
                .build();

        DistributedQueryRunner queryRunner = DistributedQueryRunner.builder(session)
                .setExtraProperties(extraProperties)
                .build();

        queryRunner.installPlugin(new TpchPlugin());
        queryRunner.createCatalog("tpch", "tpch");

        // The metastore catalog directory is a sibling of "hudi_metadata" under the base data dir.
        Path dataDir = queryRunner.getCoordinator().getBaseDataDir().resolve("hudi_metadata");
        Path catalogDir = dataDir.getParent().resolve("catalog");

        // Install Hudi connector
        queryRunner.installPlugin(new HudiPlugin());
        Map hudiProperties = ImmutableMap.builder()
                .put("hive.metastore", "file")
                .put("hive.metastore.catalog.dir", catalogDir.toFile().toURI().toString())
                .build();
        queryRunner.createCatalog(HUDI_CATALOG, "hudi", hudiProperties);

        // Install Hive connector
        queryRunner.installPlugin(new HivePlugin());
        Map hiveProperties = ImmutableMap.builder()
                .put("hive.metastore", "file")
                .put("hive.metastore.catalog.dir", catalogDir.toFile().toURI().toString())
                .put("hive.allow-drop-table", "true")
                .put("hive.security", "legacy")
                .build();
        queryRunner.createCatalog(HIVE_CATALOG, "hive", hiveProperties);
        queryRunner.execute(format("CREATE SCHEMA %s.%s", HIVE_CATALOG, HUDI_SCHEMA));

        return queryRunner;
    }

    /**
     * Executes the CREATE TABLE template registered for {@code tableName} against the Hive
     * catalog, pointing its external location at {@link #getTableBasePath(String)}.
     */
    protected void syncHudiTableInMetastore(String tableName)
    {
        getQueryRunner().execute(format(
                TABLE_NAME_TO_CREATE_STATEMENT.get(tableName),
                HIVE_CATALOG,
                HUDI_SCHEMA,
                tableName,
                getTableBasePath(tableName)));
    }

    /**
     * Drops {@code tableName} from the Hive catalog if it exists.
     */
    protected void dropHudiTableFromMetastore(String tableName)
    {
        getQueryRunner().execute(
                format("DROP TABLE IF EXISTS %s.\"%s\".\"%s\"", HIVE_CATALOG, HUDI_SCHEMA, tableName));
    }
}
/**
 * Checks the default values and the property-name mapping of {@link HudiConfig}.
 */
public class TestHudiConfig
{
    /**
     * Asserts that a fresh {@link HudiConfig} reports exactly the defaults recorded below.
     */
    @Test
    public void testDefaults()
    {
        assertRecordedDefaults(recordDefaults(HudiConfig.class)
                .setBaseFileFormat(PARQUET)
                .setMetadataEnabled(false)
                .setSkipMetaStoreForPartition(false)
                .setUseParquetColumnNames(true)
                .setPartitionScannerParallelism(16)
                .setSplitGeneratorParallelism(16)
                .setMinPartitionBatchSize(10)
                .setMaxPartitionBatchSize(100)
                .setSizeBasedSplitWeightsEnabled(true)
                .setStandardSplitWeightSize(DataSize.of(128, MEGABYTE))
                .setMinimumAssignedSplitWeight(0.05));
    }

    /**
     * Asserts that each {@code hudi.*} property below maps onto the matching setter; every
     * value differs from the default asserted in {@link #testDefaults()}.
     */
    @Test
    public void testExplicitPropertyMappings()
    {
        Map properties = new ImmutableMap.Builder()
                .put("hudi.base-file-format", "ORC")
                .put("hudi.metadata-enabled", "true")
                .put("hudi.skip-metastore-for-partition", "true")
                .put("hudi.use-parquet-column-names", "false")
                .put("hudi.partition-scanner-parallelism", "8")
                .put("hudi.split-generator-parallelism", "8")
                .put("hudi.min-partition-batch-size", "5")
                .put("hudi.max-partition-batch-size", "50")
                .put("hudi.size-based-split-weights-enabled", "false")
                .put("hudi.standard-split-weight-size", "64MB")
                .put("hudi.minimum-assigned-split-weight", "0.1")
                .build();

        HudiConfig expected = new HudiConfig()
                .setBaseFileFormat(ORC)
                .setMetadataEnabled(true)
                .setSkipMetaStoreForPartition(true)
                .setUseParquetColumnNames(false)
                .setPartitionScannerParallelism(8)
                .setSplitGeneratorParallelism(8)
                .setMinPartitionBatchSize(5)
                .setMaxPartitionBatchSize(50)
                .setSizeBasedSplitWeightsEnabled(false)
                .setStandardSplitWeightSize(DataSize.of(64, MEGABYTE))
                .setMinimumAssignedSplitWeight(0.1);

        assertFullMapping(properties, expected);
    }
}
/**
 * Verifies that {@link HudiConnectorFactory} creates a connector for valid
 * {@code hive.metastore.uri} values and fails with a descriptive message for invalid ones.
 */
public class TestHudiConnectorFactory
{
    @Test
    public void testCreateConnector()
    {
        // Accepted: a single thrift URI and a comma-separated list of thrift URIs
        assertCreateConnector("thrift://localhost:1234");
        assertCreateConnector("thrift://localhost:1234,thrift://192.0.2.3:5678");

        // Rejected: each input is paired with the message fragment the failure must contain
        assertCreateConnectorFails("abc", "metastoreUri scheme is missing: abc");
        assertCreateConnectorFails("thrift://:8090", "metastoreUri host is missing: thrift://:8090");
        assertCreateConnectorFails("thrift://localhost", "metastoreUri port is missing: thrift://localhost");
        assertCreateConnectorFails("abc::", "metastoreUri scheme must be thrift: abc::");
        assertCreateConnectorFails("", "metastoreUris must specify at least one URI");
        assertCreateConnectorFails("thrift://localhost:1234,thrift://test-1", "metastoreUri port is missing: thrift://test-1");
    }

    /**
     * Creates a connector configured with the given metastore URI, opens a transaction,
     * checks the types of the metadata, split manager and page source provider it exposes,
     * and commits the transaction.
     */
    private static void assertCreateConnector(String metastoreUri)
    {
        Map config = ImmutableMap.builder()
                .put("hive.metastore.uri", metastoreUri)
                .build();

        Connector connector = new HudiConnectorFactory(HUDI_CATALOG).create("test", config, new TestingConnectorContext());
        ConnectorTransactionHandle transaction = connector.beginTransaction(READ_UNCOMMITTED, true, true);
        assertInstanceOf(connector.getMetadata(transaction), ClassLoaderSafeConnectorMetadata.class);
        assertInstanceOf(connector.getSplitManager(), ClassLoaderSafeConnectorSplitManager.class);
        assertInstanceOf(connector.getPageSourceProvider(), ConnectorPageSourceProvider.class);
        connector.commit(transaction);
    }

    /**
     * Asserts that connector creation for {@code metastoreUri} throws a
     * {@link RuntimeException} whose message contains {@code exceptionString}.
     */
    private static void assertCreateConnectorFails(String metastoreUri, String exceptionString)
    {
        assertThatThrownBy(() -> assertCreateConnector(metastoreUri))
                .isInstanceOf(RuntimeException.class)
                .hasMessageContaining(exceptionString);
    }
}
+ */ + +package io.trino.plugin.hudi; + +import org.testng.annotations.Test; + +import static java.lang.String.format; + +public class TestHudiSanity + extends AbstractHudiTestQueryFramework +{ + @Test + public void readNonPartitionedTable() + { + String testQuery = format("SELECT rowid, name FROM \"%s\"", NON_PARTITIONED_TABLE_NAME); + String expResults = "SELECT * FROM VALUES('row_1', 'bob'),('row_2', 'john'),('row_3', 'tom')"; + assertHudiQuery(NON_PARTITIONED_TABLE_NAME, testQuery, expResults, false); + } + + @Test + public void readPartitionedCowTable() + { + String testQuery = format("SELECT symbol, max(ts) FROM \"%s\" group by symbol HAVING symbol = 'GOOG'", PARTITIONED_COW_TABLE_NAME); + String expResults = "SELECT * FROM VALUES('GOOG', '2018-08-31 10:59:00')"; + assertHudiQuery(PARTITIONED_COW_TABLE_NAME, testQuery, expResults, false); + } + + @Test + public void readPartitionedMorTable() + { + String testQuery = format("SELECT symbol, max(ts) FROM \"%s\" group by symbol HAVING symbol = 'GOOG'", PARTITIONED_MOR_TABLE_NAME); + String expResults = "SELECT * FROM VALUES('GOOG', '2018-08-31 10:59:00')"; + assertHudiQuery(PARTITIONED_MOR_TABLE_NAME, testQuery, expResults, false); + } + + @Test + public void readPartitionedColumn() + { + String testQuery = format("SELECT dt, count(1) FROM \"%s\" group by dt", PARTITIONED_COW_TABLE_NAME); + String expResults = "SELECT * FROM VALUES('2018/08/31', '99')"; + assertHudiQuery(PARTITIONED_COW_TABLE_NAME, testQuery, expResults, false); + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiSplit.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiSplit.java new file mode 100644 index 000000000000..835c1bb18d88 --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiSplit.java @@ -0,0 +1,53 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.trino.plugin.hudi;
+
+import com.google.common.collect.ImmutableList;
+import io.airlift.json.JsonCodec;
+import io.trino.spi.SplitWeight;
+import io.trino.spi.predicate.TupleDomain;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.assertEquals;
+
+public class TestHudiSplit
+{
+    private final JsonCodec<HudiSplit> codec = JsonCodec.jsonCodec(HudiSplit.class);
+
+    @Test
+    public void testJsonRoundTrip()
+    {
+        HudiSplit expectedSplit = new HudiSplit(
+                "/user/hive/warehouse/stock_ticks_cow/2018/08/31/871677fb-e0e3-46f8-9cc1-fe497e317216-0_0-28-26_20211216071453747.parquet",
+                0L,
+                440747L,
+                440747L,
+                ImmutableList.of(),
+                TupleDomain.all(),
+                ImmutableList.of(),
+                SplitWeight.fromProportion(0.1));
+
+        // Round trip: serialize with the typed codec, then parse the JSON straight back into a HudiSplit.
+        HudiSplit actualSplit = codec.fromJson(codec.toJson(expectedSplit));
+
+        assertEquals(actualSplit.getPath(), expectedSplit.getPath());
+        assertEquals(actualSplit.getAddresses(), expectedSplit.getAddresses());
+        assertEquals(actualSplit.getPartitionKeys(), expectedSplit.getPartitionKeys());
+        assertEquals(actualSplit.getStart(), expectedSplit.getStart());
+        assertEquals(actualSplit.getLength(), expectedSplit.getLength());
+        assertEquals(actualSplit.getFileSize(), expectedSplit.getFileSize());
+        assertEquals(actualSplit.getSplitWeight(), expectedSplit.getSplitWeight());
+    }
+}
diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiTableHandle.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiTableHandle.java
new file mode 100644
index 000000000000..2d82f1181df2
--- /dev/null
+++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiTableHandle.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.trino.plugin.hudi;
+
+import io.airlift.json.JsonCodec;
+import io.trino.spi.predicate.TupleDomain;
+import org.apache.hudi.common.model.HoodieTableType;
+import org.testng.annotations.Test;
+
+import static org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE;
+import static org.testng.Assert.assertEquals;
+
+public class TestHudiTableHandle
+{
+    private final JsonCodec<HudiTableHandle> codec = JsonCodec.jsonCodec(HudiTableHandle.class);
+
+    @Test
+    public void testJsonRoundTrip()
+    {
+        HudiTableHandle expectedHandle = new HudiTableHandle(
+                "schema",
+                "table",
+                "/tmp/hudi_trips",
+                HoodieTableType.valueOf(COPY_ON_WRITE.name()),
+                TupleDomain.none(),
+                TupleDomain.all());
+
+        // Round trip: serialize with the typed codec, then parse the JSON straight back into a handle.
+        HudiTableHandle actualHandle = codec.fromJson(codec.toJson(expectedHandle));
+
+        // Each getter of the handle is compared exactly once.
+        assertEquals(actualHandle.getSchemaName(), expectedHandle.getSchemaName());
+        assertEquals(actualHandle.getTableName(), expectedHandle.getTableName());
+        assertEquals(actualHandle.getPartitionPredicates(), expectedHandle.getPartitionPredicates());
+        assertEquals(actualHandle.getRegularPredicates(), expectedHandle.getRegularPredicates());
+        assertEquals(actualHandle.getTableType(), expectedHandle.getTableType());
+        assertEquals(actualHandle.getBasePath(), expectedHandle.getBasePath());
+    }
+}
diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiUtil.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiUtil.java
new file mode 100644
index 000000000000..d39cfd09ce74
--- /dev/null
+++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiUtil.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.trino.plugin.hudi;
+
+import com.google.common.collect.ImmutableList;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hudi.hadoop.HoodieParquetInputFormat;
+import org.apache.hudi.hive.HiveStylePartitionValueExtractor;
+import org.apache.hudi.hive.MultiPartKeysValueExtractor;
+import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor;
+import org.apache.hudi.hive.SlashEncodedHourPartitionValueExtractor;
+import org.testng.annotations.Test;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Properties;
+
+import static io.trino.plugin.hive.HiveStorageFormat.PARQUET;
+import static io.trino.plugin.hive.util.HiveUtil.getInputFormat;
+import static io.trino.plugin.hudi.HudiUtil.buildPartitionValues;
+import static io.trino.plugin.hudi.HudiUtil.isHudiParquetInputFormat;
+import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT;
+import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB;
+import static
org.assertj.core.api.AssertionsForClassTypes.assertThatThrownBy;
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertTrue;
+
+public class TestHudiUtil
+{
+    @Test
+    public void testIsHudiParquetInputFormat()
+    {
+        // Schema properties naming HoodieParquetInputFormat as the input format and the PARQUET serde.
+        Properties schema = new Properties();
+        schema.setProperty(FILE_INPUT_FORMAT, HoodieParquetInputFormat.class.getName());
+        schema.setProperty(SERIALIZATION_LIB, PARQUET.getSerde());
+        assertTrue(isHudiParquetInputFormat(getInputFormat(new Configuration(false), schema, false)));
+    }
+
+    @Test
+    public void testBuildPartitionValues()
+    {
+        assertToPartitionValues("partitionColumn1=01/01/2020", ImmutableList.of("01/01/2020"));
+        assertToPartitionValues("partitionColumn1=01/01/2020/partitioncolumn2=abc", ImmutableList.of("01/01/2020", "abc"));
+        assertToPartitionValues("ds=2015-12-30/event_type=QueryCompletion", ImmutableList.of("2015-12-30", "QueryCompletion"));
+        assertToPartitionValues("ds=2015-12-30", ImmutableList.of("2015-12-30"));
+        assertToPartitionValues("a=1", ImmutableList.of("1"));
+        assertToPartitionValues("a=1/b=2/c=3", ImmutableList.of("1", "2", "3"));
+        assertToPartitionValues("pk=!@%23$%25%5E&%2A()%2F%3D", ImmutableList.of("!@#$%^&*()/="));
+        assertToPartitionValues("pk=__HIVE_DEFAULT_PARTITION__", ImmutableList.of("__HIVE_DEFAULT_PARTITION__"));
+    }
+
+    @Test
+    public void testInferPartitionValueExtractor()
+    {
+        // Each (partition path, partition values) pair must yield an extractor of the named class.
+        assertEquals(HudiUtil.inferPartitionValueExtractor(
+                "2022/01/05", Collections.singletonList("2022-01-05")).getClass().getName(),
+                SlashEncodedDayPartitionValueExtractor.class.getName());
+        assertEquals(HudiUtil.inferPartitionValueExtractor(
+                "2022/01/05/19", Collections.singletonList("2022-01-05-19")).getClass().getName(),
+                SlashEncodedHourPartitionValueExtractor.class.getName());
+        assertEquals(HudiUtil.inferPartitionValueExtractor(
+                "country=united_states", Collections.singletonList("united_states")).getClass().getName(),
+                HiveStylePartitionValueExtractor.class.getName());
+        assertEquals(HudiUtil.inferPartitionValueExtractor(
+                "country=united_states/city=san_francisco", ImmutableList.of("united_states", "san_francisco")).getClass().getName(),
+                MultiPartKeysValueExtractor.class.getName());
+        // This input is expected to make the call throw; assertThatThrownBy fails if nothing is thrown.
+        assertThatThrownBy(() -> HudiUtil.inferPartitionValueExtractor(
+                "randompartitionpath", Collections.singletonList("")));
+    }
+
+    private static void assertToPartitionValues(String partitionName, List<String> expected)
+    {
+        // Checks that buildPartitionValues(partitionName) equals the expected list of values.
+        assertEquals(buildPartitionValues(partitionName), expected);
+    }
+}
diff --git a/plugin/trino-hudi/src/test/resources/hudi_non_part_cow/.hoodie/20211217110514527.commit b/plugin/trino-hudi/src/test/resources/hudi_non_part_cow/.hoodie/20211217110514527.commit
new file mode 100644
index 000000000000..f77eeb137f02
--- /dev/null
+++ b/plugin/trino-hudi/src/test/resources/hudi_non_part_cow/.hoodie/20211217110514527.commit
@@ -0,0 +1,50 @@
+{
+  "partitionToWriteStats" : {
+    "" : [ {
+      "fileId" : "d0875d00-483d-4e8b-bbbe-c520366c47a0-0",
+      "path" : "d0875d00-483d-4e8b-bbbe-c520366c47a0-0_0-6-11_20211217110514527.parquet",
+      "prevCommit" : "null",
+      "numWrites" : 3,
+      "numDeletes" : 0,
+      "numUpdateWrites" : 0,
+      "numInserts" : 3,
+      "totalWriteBytes" : 436273,
+      "totalWriteErrors" : 0,
+      "tempPath" : null,
+      "partitionPath" : "",
+      "totalLogRecords" : 0,
+      "totalLogFilesCompacted" : 0,
+      "totalLogSizeCompacted" : 0,
+      "totalUpdatedRecordsCompacted" : 0,
+      "totalLogBlocks" : 0,
+      "totalCorruptLogBlock" : 0,
+      "totalRollbackBlocks" : 0,
+      "fileSizeInBytes" : 436273,
+      "minEventTime" : null,
+      "maxEventTime" : null
+    } ]
+  },
+  "compacted" : false,
+  "extraMetadata" : {
+    "schema" :
"{\"type\":\"record\",\"name\":\"hudi_non_part_cow_record\",\"namespace\":\"hoodie.hudi_non_part_cow\",\"fields\":[{\"name\":\"rowId\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"partitionId\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"preComb\",\"type\":[\"null\",\"long\"],\"default\":null},{\"name\":\"name\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"versionId\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"toBeDeletedStr\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"intToLong\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"longToInt\",\"type\":[\"null\",\"long\"],\"default\":null}]}" + }, + "operationType" : "INSERT", + "writePartitionPaths" : [ "" ], + "fileIdAndRelativePaths" : { + "d0875d00-483d-4e8b-bbbe-c520366c47a0-0" : "d0875d00-483d-4e8b-bbbe-c520366c47a0-0_0-6-11_20211217110514527.parquet" + }, + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 1743, + "totalUpsertTime" : 0, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + } +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/hudi_non_part_cow/.hoodie/20211217110514527.commit.requested b/plugin/trino-hudi/src/test/resources/hudi_non_part_cow/.hoodie/20211217110514527.commit.requested new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/plugin/trino-hudi/src/test/resources/hudi_non_part_cow/.hoodie/20211217110514527.inflight b/plugin/trino-hudi/src/test/resources/hudi_non_part_cow/.hoodie/20211217110514527.inflight new file mode 100644 index 000000000000..6605bcaf9b36 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi_non_part_cow/.hoodie/20211217110514527.inflight @@ -0,0 +1,48 @@ +{ + "partitionToWriteStats" : { + "" : [ { + "fileId" : "", + "path" : null, + 
"prevCommit" : "null", + "numWrites" : 0, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 3, + "totalWriteBytes" : 0, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : null, + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 0, + "minEventTime" : null, + "maxEventTime" : null + } ] + }, + "compacted" : false, + "extraMetadata" : { }, + "operationType" : "INSERT", + "writePartitionPaths" : [ "" ], + "fileIdAndRelativePaths" : { + "" : null + }, + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 0, + "totalUpsertTime" : 0, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + } +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/hudi_non_part_cow/.hoodie/hoodie.properties b/plugin/trino-hudi/src/test/resources/hudi_non_part_cow/.hoodie/hoodie.properties new file mode 100644 index 000000000000..3d03fa7915c3 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi_non_part_cow/.hoodie/hoodie.properties @@ -0,0 +1,14 @@ +#Properties saved on Fri Dec 17 11:05:14 UTC 2021 +#Fri Dec 17 11:05:14 UTC 2021 +hoodie.table.precombine.field=preComb +hoodie.table.partition.fields= +hoodie.table.type=COPY_ON_WRITE +hoodie.archivelog.folder=archived +hoodie.populate.meta.fields=true +hoodie.timeline.layout.version=1 +hoodie.table.version=3 +hoodie.table.recordkey.fields=rowId +hoodie.table.base.file.format=PARQUET +hoodie.table.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator +hoodie.table.name=hudi_non_part_cow +hoodie.datasource.write.hive_style_partitioning=false diff --git 
a/plugin/trino-hudi/src/test/resources/hudi_non_part_cow/.hoodie_partition_metadata b/plugin/trino-hudi/src/test/resources/hudi_non_part_cow/.hoodie_partition_metadata new file mode 100644 index 000000000000..f2149eb6cd5a --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi_non_part_cow/.hoodie_partition_metadata @@ -0,0 +1,4 @@ +#partition metadata +#Fri Dec 17 11:05:23 UTC 2021 +commitTime=20211217110514527 +partitionDepth=0 diff --git a/plugin/trino-hudi/src/test/resources/hudi_non_part_cow/d0875d00-483d-4e8b-bbbe-c520366c47a0-0_0-6-11_20211217110514527.parquet b/plugin/trino-hudi/src/test/resources/hudi_non_part_cow/d0875d00-483d-4e8b-bbbe-c520366c47a0-0_0-6-11_20211217110514527.parquet new file mode 100644 index 000000000000..52de8719bf62 Binary files /dev/null and b/plugin/trino-hudi/src/test/resources/hudi_non_part_cow/d0875d00-483d-4e8b-bbbe-c520366c47a0-0_0-6-11_20211217110514527.parquet differ diff --git a/plugin/trino-hudi/src/test/resources/stock_ticks_cow/.hoodie/20211216071453747.commit b/plugin/trino-hudi/src/test/resources/stock_ticks_cow/.hoodie/20211216071453747.commit new file mode 100644 index 000000000000..18cf55cc1bfd --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/stock_ticks_cow/.hoodie/20211216071453747.commit @@ -0,0 +1,51 @@ +{ + "partitionToWriteStats" : { + "2018/08/31" : [ { + "fileId" : "871677fb-e0e3-46f8-9cc1-fe497e317216-0", + "path" : "2018/08/31/871677fb-e0e3-46f8-9cc1-fe497e317216-0_0-28-26_20211216071453747.parquet", + "prevCommit" : "null", + "numWrites" : 99, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 99, + "totalWriteBytes" : 440747, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : "2018/08/31", + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 440747, + "minEventTime" : null, + "maxEventTime" 
: null + } ] + }, + "compacted" : false, + "extraMetadata" : { + "schema" : "{\"type\":\"record\",\"name\":\"stock_ticks\",\"fields\":[{\"name\":\"volume\",\"type\":\"long\"},{\"name\":\"ts\",\"type\":\"string\"},{\"name\":\"symbol\",\"type\":\"string\"},{\"name\":\"year\",\"type\":\"int\"},{\"name\":\"month\",\"type\":\"string\"},{\"name\":\"high\",\"type\":\"double\"},{\"name\":\"low\",\"type\":\"double\"},{\"name\":\"key\",\"type\":\"string\"},{\"name\":\"date\",\"type\":\"string\"},{\"name\":\"close\",\"type\":\"double\"},{\"name\":\"open\",\"type\":\"double\"},{\"name\":\"day\",\"type\":\"string\"}]}", + "deltastreamer.checkpoint.key" : "stock_ticks,0:1668" + }, + "operationType" : "UPSERT", + "fileIdAndRelativePaths" : { + "871677fb-e0e3-46f8-9cc1-fe497e317216-0" : "2018/08/31/871677fb-e0e3-46f8-9cc1-fe497e317216-0_0-28-26_20211216071453747.parquet" + }, + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 750, + "totalUpsertTime" : 0, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + }, + "writePartitionPaths" : [ "2018/08/31" ] +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/stock_ticks_cow/.hoodie/20211216071453747.commit.requested b/plugin/trino-hudi/src/test/resources/stock_ticks_cow/.hoodie/20211216071453747.commit.requested new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/plugin/trino-hudi/src/test/resources/stock_ticks_cow/.hoodie/20211216071453747.inflight b/plugin/trino-hudi/src/test/resources/stock_ticks_cow/.hoodie/20211216071453747.inflight new file mode 100644 index 000000000000..6dc689a285d9 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/stock_ticks_cow/.hoodie/20211216071453747.inflight @@ -0,0 +1,48 @@ +{ + "partitionToWriteStats" : { + "2018/08/31" : [ { + "fileId" : "", + "path" : null, + 
"prevCommit" : "null", + "numWrites" : 0, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 99, + "totalWriteBytes" : 0, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : null, + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 0, + "minEventTime" : null, + "maxEventTime" : null + } ] + }, + "compacted" : false, + "extraMetadata" : { }, + "operationType" : "UPSERT", + "fileIdAndRelativePaths" : { + "" : null + }, + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 0, + "totalUpsertTime" : 0, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + }, + "writePartitionPaths" : [ "2018/08/31" ] +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/stock_ticks_cow/.hoodie/hoodie.properties b/plugin/trino-hudi/src/test/resources/stock_ticks_cow/.hoodie/hoodie.properties new file mode 100644 index 000000000000..4754c1c23eb2 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/stock_ticks_cow/.hoodie/hoodie.properties @@ -0,0 +1,13 @@ +#Properties saved on Thu Dec 16 07:14:51 UTC 2021 +#Thu Dec 16 07:14:51 UTC 2021 +hoodie.table.precombine.field=ts +hoodie.table.partition.fields=date +hoodie.table.type=COPY_ON_WRITE +hoodie.archivelog.folder=archived +hoodie.populate.meta.fields=true +hoodie.timeline.layout.version=1 +hoodie.table.version=3 +hoodie.table.recordkey.fields=key +hoodie.table.base.file.format=PARQUET +hoodie.table.keygenerator.class=org.apache.hudi.keygen.SimpleKeyGenerator +hoodie.table.name=stock_ticks_cow diff --git a/plugin/trino-hudi/src/test/resources/stock_ticks_cow/2018/08/31/.hoodie_partition_metadata 
b/plugin/trino-hudi/src/test/resources/stock_ticks_cow/2018/08/31/.hoodie_partition_metadata new file mode 100644 index 000000000000..1aaf9e64d933 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/stock_ticks_cow/2018/08/31/.hoodie_partition_metadata @@ -0,0 +1,4 @@ +#partition metadata +#Thu Dec 16 07:14:56 UTC 2021 +commitTime=20211216071453747 +partitionDepth=3 diff --git a/plugin/trino-hudi/src/test/resources/stock_ticks_cow/2018/08/31/871677fb-e0e3-46f8-9cc1-fe497e317216-0_0-28-26_20211216071453747.parquet b/plugin/trino-hudi/src/test/resources/stock_ticks_cow/2018/08/31/871677fb-e0e3-46f8-9cc1-fe497e317216-0_0-28-26_20211216071453747.parquet new file mode 100644 index 000000000000..b97391697e62 Binary files /dev/null and b/plugin/trino-hudi/src/test/resources/stock_ticks_cow/2018/08/31/871677fb-e0e3-46f8-9cc1-fe497e317216-0_0-28-26_20211216071453747.parquet differ diff --git a/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/20211221030120532.deltacommit b/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/20211221030120532.deltacommit new file mode 100644 index 000000000000..f9e28873d524 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/20211221030120532.deltacommit @@ -0,0 +1,51 @@ +{ + "partitionToWriteStats" : { + "2018/08/31" : [ { + "fileId" : "167a0e3e-9b94-444f-a178-242230cdb5a2-0", + "path" : "2018/08/31/167a0e3e-9b94-444f-a178-242230cdb5a2-0_0-28-26_20211221030120532.parquet", + "prevCommit" : "null", + "numWrites" : 99, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 99, + "totalWriteBytes" : 440746, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : "2018/08/31", + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 440746, + "minEventTime" : null, + "maxEventTime" : null + } ] + }, + 
"compacted" : false, + "extraMetadata" : { + "schema" : "{\"type\":\"record\",\"name\":\"stock_ticks\",\"fields\":[{\"name\":\"volume\",\"type\":\"long\"},{\"name\":\"ts\",\"type\":\"string\"},{\"name\":\"symbol\",\"type\":\"string\"},{\"name\":\"year\",\"type\":\"int\"},{\"name\":\"month\",\"type\":\"string\"},{\"name\":\"high\",\"type\":\"double\"},{\"name\":\"low\",\"type\":\"double\"},{\"name\":\"key\",\"type\":\"string\"},{\"name\":\"date\",\"type\":\"string\"},{\"name\":\"close\",\"type\":\"double\"},{\"name\":\"open\",\"type\":\"double\"},{\"name\":\"day\",\"type\":\"string\"}]}", + "deltastreamer.checkpoint.key" : "stock_ticks,0:1668" + }, + "operationType" : "UPSERT", + "fileIdAndRelativePaths" : { + "167a0e3e-9b94-444f-a178-242230cdb5a2-0" : "2018/08/31/167a0e3e-9b94-444f-a178-242230cdb5a2-0_0-28-26_20211221030120532.parquet" + }, + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 1402, + "totalUpsertTime" : 0, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + }, + "writePartitionPaths" : [ "2018/08/31" ] +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/20211221030120532.deltacommit.inflight b/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/20211221030120532.deltacommit.inflight new file mode 100644 index 000000000000..6dc689a285d9 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/20211221030120532.deltacommit.inflight @@ -0,0 +1,48 @@ +{ + "partitionToWriteStats" : { + "2018/08/31" : [ { + "fileId" : "", + "path" : null, + "prevCommit" : "null", + "numWrites" : 0, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 99, + "totalWriteBytes" : 0, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : null, + "totalLogRecords" : 0, + 
"totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 0, + "minEventTime" : null, + "maxEventTime" : null + } ] + }, + "compacted" : false, + "extraMetadata" : { }, + "operationType" : "UPSERT", + "fileIdAndRelativePaths" : { + "" : null + }, + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 0, + "totalUpsertTime" : 0, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + }, + "writePartitionPaths" : [ "2018/08/31" ] +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/20211221030120532.deltacommit.requested b/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/20211221030120532.deltacommit.requested new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/20211227092838847.deltacommit b/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/20211227092838847.deltacommit new file mode 100644 index 000000000000..f1cc26fecc7b --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/20211227092838847.deltacommit @@ -0,0 +1,55 @@ +{ + "partitionToWriteStats" : { + "2018/08/31" : [ { + "fileId" : "167a0e3e-9b94-444f-a178-242230cdb5a2-0", + "path" : "2018/08/31/.167a0e3e-9b94-444f-a178-242230cdb5a2-0_20211221030120532.log.1_0-28-29", + "prevCommit" : "20211221030120532", + "numWrites" : 99, + "numDeletes" : 0, + "numUpdateWrites" : 99, + "numInserts" : 0, + "totalWriteBytes" : 22220, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : "2018/08/31", + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + 
"totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 22220, + "minEventTime" : null, + "maxEventTime" : null, + "logVersion" : 1, + "logOffset" : 0, + "baseFile" : "167a0e3e-9b94-444f-a178-242230cdb5a2-0_0-28-26_20211221030120532.parquet", + "logFiles" : [ ".167a0e3e-9b94-444f-a178-242230cdb5a2-0_20211221030120532.log.1_0-28-29" ] + } ] + }, + "compacted" : false, + "extraMetadata" : { + "schema" : "{\"type\":\"record\",\"name\":\"stock_ticks\",\"fields\":[{\"name\":\"volume\",\"type\":\"long\"},{\"name\":\"ts\",\"type\":\"string\"},{\"name\":\"symbol\",\"type\":\"string\"},{\"name\":\"year\",\"type\":\"int\"},{\"name\":\"month\",\"type\":\"string\"},{\"name\":\"high\",\"type\":\"double\"},{\"name\":\"low\",\"type\":\"double\"},{\"name\":\"key\",\"type\":\"string\"},{\"name\":\"date\",\"type\":\"string\"},{\"name\":\"close\",\"type\":\"double\"},{\"name\":\"open\",\"type\":\"double\"},{\"name\":\"day\",\"type\":\"string\"}]}", + "deltastreamer.checkpoint.key" : "stock_ticks,0:3336" + }, + "operationType" : "UPSERT", + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 0, + "totalUpsertTime" : 187, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + }, + "writePartitionPaths" : [ "2018/08/31" ], + "fileIdAndRelativePaths" : { + "167a0e3e-9b94-444f-a178-242230cdb5a2-0" : "2018/08/31/.167a0e3e-9b94-444f-a178-242230cdb5a2-0_20211221030120532.log.1_0-28-29" + } +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/20211227092838847.deltacommit.inflight b/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/20211227092838847.deltacommit.inflight new file mode 100644 index 000000000000..724ce56ff0d6 --- /dev/null +++ 
b/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/20211227092838847.deltacommit.inflight @@ -0,0 +1,71 @@ +{ + "partitionToWriteStats" : { + "2018/08/31" : [ { + "fileId" : "", + "path" : null, + "prevCommit" : "null", + "numWrites" : 0, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 0, + "totalWriteBytes" : 0, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : null, + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 0, + "minEventTime" : null, + "maxEventTime" : null + }, { + "fileId" : "167a0e3e-9b94-444f-a178-242230cdb5a2-0", + "path" : null, + "prevCommit" : "20211221030120532", + "numWrites" : 0, + "numDeletes" : 0, + "numUpdateWrites" : 99, + "numInserts" : 0, + "totalWriteBytes" : 0, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : null, + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 0, + "minEventTime" : null, + "maxEventTime" : null + } ] + }, + "compacted" : false, + "extraMetadata" : { }, + "operationType" : "UPSERT", + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 0, + "totalUpsertTime" : 0, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + }, + "writePartitionPaths" : [ "2018/08/31" ], + "fileIdAndRelativePaths" : { + "" : null, + "167a0e3e-9b94-444f-a178-242230cdb5a2-0" : null + } +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/20211227092838847.deltacommit.requested 
b/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/20211227092838847.deltacommit.requested new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/hoodie.properties b/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/hoodie.properties new file mode 100644 index 000000000000..33392aa182f2 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/stock_ticks_mor/.hoodie/hoodie.properties @@ -0,0 +1,14 @@ +#Properties saved on Tue Dec 21 03:01:13 UTC 2021 +#Tue Dec 21 03:01:13 UTC 2021 +hoodie.table.precombine.field=ts +hoodie.table.partition.fields=date +hoodie.table.type=MERGE_ON_READ +hoodie.archivelog.folder=archived +hoodie.populate.meta.fields=true +hoodie.compaction.payload.class=org.apache.hudi.common.model.OverwriteWithLatestAvroPayload +hoodie.timeline.layout.version=1 +hoodie.table.version=3 +hoodie.table.recordkey.fields=key +hoodie.table.base.file.format=PARQUET +hoodie.table.keygenerator.class=org.apache.hudi.keygen.SimpleKeyGenerator +hoodie.table.name=stock_ticks_mor diff --git a/plugin/trino-hudi/src/test/resources/stock_ticks_mor/2018/08/31/.167a0e3e-9b94-444f-a178-242230cdb5a2-0_20211221030120532.log.1_0-28-29 b/plugin/trino-hudi/src/test/resources/stock_ticks_mor/2018/08/31/.167a0e3e-9b94-444f-a178-242230cdb5a2-0_20211221030120532.log.1_0-28-29 new file mode 100644 index 000000000000..da3c7bc07ee1 Binary files /dev/null and b/plugin/trino-hudi/src/test/resources/stock_ticks_mor/2018/08/31/.167a0e3e-9b94-444f-a178-242230cdb5a2-0_20211221030120532.log.1_0-28-29 differ diff --git a/plugin/trino-hudi/src/test/resources/stock_ticks_mor/2018/08/31/.hoodie_partition_metadata b/plugin/trino-hudi/src/test/resources/stock_ticks_mor/2018/08/31/.hoodie_partition_metadata new file mode 100644 index 000000000000..340533d6e680 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/stock_ticks_mor/2018/08/31/.hoodie_partition_metadata @@ -0,0 +1,4 @@ +#partition 
metadata +#Tue Dec 21 03:01:25 UTC 2021 +commitTime=20211221030120532 +partitionDepth=3 diff --git a/plugin/trino-hudi/src/test/resources/stock_ticks_mor/2018/08/31/167a0e3e-9b94-444f-a178-242230cdb5a2-0_0-28-26_20211221030120532.parquet b/plugin/trino-hudi/src/test/resources/stock_ticks_mor/2018/08/31/167a0e3e-9b94-444f-a178-242230cdb5a2-0_0-28-26_20211221030120532.parquet new file mode 100644 index 000000000000..9fe2112d09bb Binary files /dev/null and b/plugin/trino-hudi/src/test/resources/stock_ticks_mor/2018/08/31/167a0e3e-9b94-444f-a178-242230cdb5a2-0_0-28-26_20211221030120532.parquet differ diff --git a/pom.xml b/pom.xml index 513a6be764e9..ee96d1f44a10 100644 --- a/pom.xml +++ b/pom.xml @@ -120,6 +120,7 @@ plugin/trino-hive plugin/trino-hive-hadoop2 plugin/trino-http-event-listener + plugin/trino-hudi plugin/trino-iceberg plugin/trino-jmx plugin/trino-kafka @@ -271,6 +272,12 @@ ${project.version} + + io.trino + trino-hudi + ${project.version} + + io.trino trino-iceberg diff --git a/testing/trino-server-dev/etc/catalog/hudi.properties b/testing/trino-server-dev/etc/catalog/hudi.properties new file mode 100644 index 000000000000..23bc4e0573fc --- /dev/null +++ b/testing/trino-server-dev/etc/catalog/hudi.properties @@ -0,0 +1,17 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +connector.name=hudi +hive.metastore.uri=thrift://localhost:9083 +#hive.config.resources=/path/to/core-site.xml,/path/to/hdfs-site.xml diff --git a/testing/trino-server-dev/etc/config.properties b/testing/trino-server-dev/etc/config.properties index e1e4530f7421..fd2beb31dbaa 100644 --- a/testing/trino-server-dev/etc/config.properties +++ b/testing/trino-server-dev/etc/config.properties @@ -36,6 +36,7 @@ plugin.bundles=\ ../../plugin/trino-jmx/pom.xml,\ ../../plugin/trino-raptor-legacy/pom.xml,\ ../../plugin/trino-hive-hadoop2/pom.xml,\ + ../../plugin/trino-hudi/pom.xml,\ ../../plugin/trino-example-http/pom.xml,\ ../../plugin/trino-kafka/pom.xml, \ ../../plugin/trino-tpch/pom.xml, \