[spark] Support merge into for append table #3917

Merged · 5 commits · Aug 9, 2024
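For context on the feature itself: after this change, MERGE INTO can target a Paimon append table (a table without primary keys), not only primary-key tables. A minimal, hypothetical sketch of such a statement — the SparkSession `spark` and the table/column names are illustrative, not taken from this PR:

// Hypothetical usage sketch; assumes `target` is a Paimon append table (no primary key)
// registered in the current catalog and `source` is any queryable relation.
spark.sql(
  """
    |MERGE INTO target t
    |USING source s
    |ON t.id = s.id
    |WHEN MATCHED THEN UPDATE SET t.v = s.v
    |WHEN NOT MATCHED THEN INSERT *
    |""".stripMargin)

Before this PR, a statement like this against an append table was rejected, because MergeInto's supportAppendOnlyTable flag was false (see the RowLevelOp change further down).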
@@ -18,12 +18,22 @@

 package org.apache.paimon.spark.sql

-import org.apache.paimon.spark.{PaimonPrimaryKeyBucketedTableTest, PaimonPrimaryKeyNonBucketTableTest}
+import org.apache.paimon.spark.{PaimonAppendBucketedTableTest, PaimonAppendNonBucketTableTest, PaimonPrimaryKeyBucketedTableTest, PaimonPrimaryKeyNonBucketTableTest}

 class MergeIntoPrimaryKeyBucketedTableTest
   extends MergeIntoTableTestBase
+  with MergeIntoPrimaryKeyTableTest
   with PaimonPrimaryKeyBucketedTableTest {}

 class MergeIntoPrimaryKeyNonBucketTableTest
   extends MergeIntoTableTestBase
+  with MergeIntoPrimaryKeyTableTest
   with PaimonPrimaryKeyNonBucketTableTest {}
+
+class MergeIntoAppendBucketedTableTest
+  extends MergeIntoTableTestBase
+  with PaimonAppendBucketedTableTest {}
+
+class MergeIntoAppendNonBucketedTableTest
+  extends MergeIntoTableTestBase
+  with PaimonAppendNonBucketTableTest {}
@@ -18,12 +18,22 @@

 package org.apache.paimon.spark.sql

-import org.apache.paimon.spark.{PaimonPrimaryKeyBucketedTableTest, PaimonPrimaryKeyNonBucketTableTest}
+import org.apache.paimon.spark.{PaimonAppendBucketedTableTest, PaimonAppendNonBucketTableTest, PaimonPrimaryKeyBucketedTableTest, PaimonPrimaryKeyNonBucketTableTest}

 class MergeIntoPrimaryKeyBucketedTableTest
   extends MergeIntoTableTestBase
+  with MergeIntoPrimaryKeyTableTest
   with PaimonPrimaryKeyBucketedTableTest {}

 class MergeIntoPrimaryKeyNonBucketTableTest
   extends MergeIntoTableTestBase
+  with MergeIntoPrimaryKeyTableTest
   with PaimonPrimaryKeyNonBucketTableTest {}
+
+class MergeIntoAppendBucketedTableTest
+  extends MergeIntoTableTestBase
+  with PaimonAppendBucketedTableTest {}
+
+class MergeIntoAppendNonBucketedTableTest
+  extends MergeIntoTableTestBase
+  with PaimonAppendNonBucketTableTest {}
@@ -18,14 +18,26 @@

 package org.apache.paimon.spark.sql

-import org.apache.paimon.spark.{PaimonPrimaryKeyBucketedTableTest, PaimonPrimaryKeyNonBucketTableTest}
+import org.apache.paimon.spark.{PaimonAppendBucketedTableTest, PaimonAppendNonBucketTableTest, PaimonPrimaryKeyBucketedTableTest, PaimonPrimaryKeyNonBucketTableTest}

 class MergeIntoPrimaryKeyBucketedTableTest
   extends MergeIntoTableTestBase
+  with MergeIntoPrimaryKeyTableTest
   with MergeIntoNotMatchedBySourceTest
   with PaimonPrimaryKeyBucketedTableTest {}

 class MergeIntoPrimaryKeyNonBucketTableTest
   extends MergeIntoTableTestBase
+  with MergeIntoPrimaryKeyTableTest
   with MergeIntoNotMatchedBySourceTest
   with PaimonPrimaryKeyNonBucketTableTest {}
+
+class MergeIntoAppendBucketedTableTest
+  extends MergeIntoTableTestBase
+  with MergeIntoNotMatchedBySourceTest
+  with PaimonAppendBucketedTableTest {}
+
+class MergeIntoAppendNonBucketedTableTest
+  extends MergeIntoTableTestBase
+  with MergeIntoNotMatchedBySourceTest
+  with PaimonAppendNonBucketTableTest {}
@@ -18,14 +18,26 @@

 package org.apache.paimon.spark.sql

-import org.apache.paimon.spark.{PaimonPrimaryKeyBucketedTableTest, PaimonPrimaryKeyNonBucketTableTest}
+import org.apache.paimon.spark.{PaimonAppendBucketedTableTest, PaimonAppendNonBucketTableTest, PaimonPrimaryKeyBucketedTableTest, PaimonPrimaryKeyNonBucketTableTest}

 class MergeIntoPrimaryKeyBucketedTableTest
   extends MergeIntoTableTestBase
+  with MergeIntoPrimaryKeyTableTest
   with MergeIntoNotMatchedBySourceTest
   with PaimonPrimaryKeyBucketedTableTest {}

 class MergeIntoPrimaryKeyNonBucketTableTest
   extends MergeIntoTableTestBase
+  with MergeIntoPrimaryKeyTableTest
   with MergeIntoNotMatchedBySourceTest
   with PaimonPrimaryKeyNonBucketTableTest {}
+
+class MergeIntoAppendBucketedTableTest
+  extends MergeIntoTableTestBase
+  with MergeIntoNotMatchedBySourceTest
+  with PaimonAppendBucketedTableTest {}
+
+class MergeIntoAppendNonBucketedTableTest
+  extends MergeIntoTableTestBase
+  with MergeIntoNotMatchedBySourceTest
+  with PaimonAppendNonBucketTableTest {}
@@ -18,7 +18,6 @@

 package org.apache.paimon.spark.catalyst.analysis

-import org.apache.paimon.CoreOptions
 import org.apache.paimon.spark.SparkTable
 import org.apache.paimon.spark.catalyst.analysis.expressions.ExpressionHelper
 import org.apache.paimon.spark.commands.MergeIntoPaimonTable
@@ -28,6 +27,8 @@ import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet}
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.Rule

+import scala.collection.JavaConverters._
+
 trait PaimonMergeIntoBase
   extends Rule[LogicalPlan]
   with RowLevelHelper
@@ -52,13 +53,14 @@ trait PaimonMergeIntoBase
     merge.notMatchedActions.flatMap(_.condition).foreach(checkCondition)

     val updateActions = merge.matchedActions.collect { case a: UpdateAction => a }
-    val primaryKeys = v2Table.properties().get(CoreOptions.PRIMARY_KEY.key).split(",")
-    checkUpdateActionValidity(
-      AttributeSet(targetOutput),
-      merge.mergeCondition,
-      updateActions,
-      primaryKeys)
-
+    val primaryKeys = v2Table.getTable.primaryKeys().asScala
+    if (primaryKeys.nonEmpty) {
+      checkUpdateActionValidity(
+        AttributeSet(targetOutput),
+        merge.mergeCondition,
+        updateActions,
+        primaryKeys)
+    }
     val alignedMatchedActions =
       merge.matchedActions.map(checkAndAlignActionAssignment(_, targetOutput))
     val alignedNotMatchedActions =
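The change above stops parsing the primary-key list out of table properties and instead asks the table directly, and it only runs the update-action validity check when the table actually has primary keys — i.e. append tables skip it. A small, self-contained sketch of that pattern in plain Scala (not Paimon's API; the java.util.List values stand in for what a table's primaryKeys() would return):

import java.util.{Arrays => JArrays, Collections, List => JList}
import scala.collection.JavaConverters._

object PrimaryKeyGuardSketch extends App {
  // Stand-ins for what a table's primaryKeys() might return.
  val appendTablePks: JList[String] = Collections.emptyList[String]()
  val pkTablePks: JList[String] = JArrays.asList("order_id", "line_id")

  for (javaPks <- Seq(appendTablePks, pkTablePks)) {
    val primaryKeys = javaPks.asScala // java.util.List -> Scala collection
    if (primaryKeys.nonEmpty) {
      // Primary-key table: UPDATE actions must not touch the key columns.
      println(s"validate update actions against keys: ${primaryKeys.mkString(", ")}")
    } else {
      // Append table: no keys, so the key-specific check is skipped entirely.
      println("append table: skip primary-key validation")
    }
  }
}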
@@ -70,6 +70,6 @@ case object MergeInto extends RowLevelOp {
   override val supportedMergeEngine: Seq[MergeEngine] =
     Seq(MergeEngine.DEDUPLICATE, MergeEngine.PARTIAL_UPDATE)

-  override val supportAppendOnlyTable: Boolean = false
+  override val supportAppendOnlyTable: Boolean = true

 }
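Flipping supportAppendOnlyTable to true is what actually opens MERGE INTO up to append tables; the other row-level operations keep their own flags. A hedged sketch of how such a capability flag is typically consulted — the names below are illustrative, not Paimon's real classes:

// Illustrative only: a simplified capability check in the spirit of RowLevelOp,
// not the actual Paimon implementation.
sealed trait RowLevelOpSketch {
  def name: String
  def supportAppendOnlyTable: Boolean

  def checkTable(hasPrimaryKeys: Boolean): Unit = {
    if (!hasPrimaryKeys && !supportAppendOnlyTable) {
      throw new UnsupportedOperationException(
        s"$name is not supported on append-only (primary-key-less) tables")
    }
  }
}

case object MergeIntoSketch extends RowLevelOpSketch {
  val name = "MERGE INTO"
  val supportAppendOnlyTable = true // was false before this PR
}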
@@ -18,15 +18,12 @@

 package org.apache.paimon.spark.commands

-import org.apache.paimon.CoreOptions
 import org.apache.paimon.CoreOptions.MergeEngine
-import org.apache.paimon.spark.PaimonSplitScan
-import org.apache.paimon.spark.catalyst.Compatibility
 import org.apache.paimon.spark.catalyst.analysis.expressions.ExpressionHelper
 import org.apache.paimon.spark.leafnode.PaimonLeafRunnableCommand
 import org.apache.paimon.spark.schema.SparkSystemColumns.ROW_KIND_COL
 import org.apache.paimon.spark.util.SQLHelper
-import org.apache.paimon.table.{BucketMode, FileStoreTable}
+import org.apache.paimon.table.FileStoreTable
 import org.apache.paimon.table.sink.{BatchWriteBuilder, CommitMessage}
 import org.apache.paimon.types.RowKind
 import org.apache.paimon.utils.InternalRowPartitionComputer
@@ -144,20 +141,11 @@ case class DeleteFromPaimonTableCommand(
       findTouchedFiles(candidateDataSplits, condition, relation, sparkSession)

     // Step3: the smallest range of data files that need to be rewritten.
-    val touchedFiles = touchedFilePaths.map {
-      file =>
-        dataFilePathToMeta.getOrElse(file, throw new RuntimeException(s"Missing file: $file"))
-    }
+    val (touchedFiles, newRelation) =
+      createNewRelation(touchedFilePaths, dataFilePathToMeta, relation)

     // Step4: build a dataframe that contains the unchanged data, and write out them.
-    val touchedDataSplits =
-      SparkDataFileMeta.convertToDataSplits(touchedFiles, rawConvertible = true, pathFactory)
-    val toRewriteScanRelation = Filter(
-      Not(condition),
-      Compatibility.createDataSourceV2ScanRelation(
-        relation,
-        PaimonSplitScan(table, touchedDataSplits),
-        relation.output))
+    val toRewriteScanRelation = Filter(Not(condition), newRelation)
     val data = createDataset(sparkSession, toRewriteScanRelation)

     // only write new files, should have no compaction
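The delete command now delegates the PaimonSplitScan/Compatibility plumbing it previously inlined to a createNewRelation helper defined elsewhere in this PR (not shown in this view). Below is a hedged reconstruction of what that helper plausibly does, pieced together from the inlined code it replaces; the exact signature, and the reliance on the enclosing command's pathFactory and table members, are assumptions:

// Reconstruction sketch only -- not the PR's actual helper. Assumes the enclosing
// command's members (table, pathFactory) and the imports the old inline code used.
private def createNewRelationSketch(
    touchedFilePaths: Array[String],
    dataFilePathToMeta: Map[String, SparkDataFileMeta],
    relation: DataSourceV2Relation): (Array[SparkDataFileMeta], LogicalPlan) = {
  // Resolve each touched data-file path back to its metadata, failing loudly on a miss.
  val touchedFiles = touchedFilePaths.map {
    file => dataFilePathToMeta.getOrElse(file, throw new RuntimeException(s"Missing file: $file"))
  }
  // Convert the touched files to splits and build a DSv2 scan relation limited to them,
  // so the caller can re-read (and rewrite) only the rows from those files.
  val touchedDataSplits =
    SparkDataFileMeta.convertToDataSplits(touchedFiles, rawConvertible = true, pathFactory)
  val newRelation = Compatibility.createDataSourceV2ScanRelation(
    relation,
    PaimonSplitScan(table, touchedDataSplits),
    relation.output)
  (touchedFiles, newRelation)
}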