Skip to content

Commit

Permalink
sort and limit
Browse files Browse the repository at this point in the history
  • Loading branch information
askwang committed Sep 26, 2024
1 parent 22e6bd3 commit c4bc403
Show file tree
Hide file tree
Showing 6 changed files with 182 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

/** Expire partitions. */
public class PartitionExpire {
Expand All @@ -51,6 +52,7 @@ public class PartitionExpire {
private LocalDateTime lastCheck;
private final PartitionExpireStrategy strategy;
private final boolean endInputCheckPartitionExpire;
private int maxExpires;

public PartitionExpire(
Duration expirationTime,
Expand All @@ -68,6 +70,7 @@ public PartitionExpire(
this.metastoreClient = metastoreClient;
this.lastCheck = LocalDateTime.now();
this.endInputCheckPartitionExpire = endInputCheckPartitionExpire;
this.maxExpires = Integer.MAX_VALUE;
}

public PartitionExpire(
Expand All @@ -85,6 +88,11 @@ public PartitionExpire withLock(Lock lock) {
return this;
}

public PartitionExpire withMaxExpires(int maxExpires) {
this.maxExpires = maxExpires;
return this;
}

public List<Map<String, String>> expire(long commitIdentifier) {
return expire(LocalDateTime.now(), commitIdentifier);
}
Expand Down Expand Up @@ -125,14 +133,16 @@ List<Map<String, String>> expire(LocalDateTime now, long commitIdentifier) {

private List<Map<String, String>> doExpire(
LocalDateTime expireDateTime, long commitIdentifier) {
List<Map<String, String>> expired = new ArrayList<>();
List<List<String>> partValues = new ArrayList<>();
for (PartitionEntry partition : strategy.selectExpiredPartitions(scan, expireDateTime)) {
Object[] array = strategy.convertPartition(partition.partition());
Map<String, String> partString = strategy.toPartitionString(array);
expired.add(partString);
LOG.info("Expire Partition: {}", partString);
partValues.add(strategy.toPartitionValue(array));
}
if (!expired.isEmpty()) {

List<Map<String, String>> expired = new ArrayList<>();
if (!partValues.isEmpty()) {
expired = convertToPartitionString(partValues);
LOG.info("Expire Partition: {}", expired);
if (metastoreClient != null) {
deleteMetastorePartitions(expired);
}
Expand All @@ -153,4 +163,14 @@ private void deleteMetastorePartitions(List<Map<String, String>> partitions) {
});
}
}

private List<Map<String, String>> convertToPartitionString(List<List<String>> partValues) {
return partValues.stream()
.map(values -> String.join(",", values))
.sorted()
.map(s -> s.split(","))
.map(strategy::toPartitionString)
.limit(Math.min(partValues.size(), maxExpires))
.collect(Collectors.toList());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.apache.paimon.utils.RowDataToObjectArrayConverter;

import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
Expand All @@ -49,6 +50,14 @@ public Map<String, String> toPartitionString(Object[] array) {
return map;
}

public List<String> toPartitionValue(Object[] array) {
List<String> list = new ArrayList<>();
for (int i = 0; i < partitionKeys.size(); i++) {
list.add(array[i].toString());
}
return list;
}

public Object[] convertPartition(BinaryRow partition) {
return toObjectArrayConverter.convert(partition);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,19 @@ public String identifier() {
name = "expire_strategy",
type = @DataTypeHint("STRING"),
isOptional = true),
@ArgumentHint(
name = "max_expires",
type = @DataTypeHint("INTEGER"),
isOptional = true)
})
public @DataTypeHint("ROW< expired_partitions STRING>") Row[] call(
ProcedureContext procedureContext,
String tableId,
String expirationTime,
String timestampFormatter,
String timestampPattern,
String expireStrategy)
String expireStrategy,
Integer maxExpires)
throws Catalog.TableNotExistException {
FileStoreTable fileStoreTable = (FileStoreTable) table(tableId);
FileStore fileStore = fileStoreTable.store();
Expand All @@ -93,6 +98,9 @@ public String identifier() {
.metastoreClientFactory())
.map(MetastoreClient.Factory::create)
.orElse(null));
if (maxExpires != null) {
partitionExpire.withMaxExpires(maxExpires);
}
List<Map<String, String>> expired = partitionExpire.expire(Long.MAX_VALUE);
return expired == null || expired.isEmpty()
? new Row[] {Row.of("No expired partitions.")}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,61 @@ public void testPartitionExpireWithTimePartition() throws Exception {
.containsExactlyInAnyOrder("Never-expire:9999-09-09:99:99");
}

@Test
public void testSortAndLimitExpirePartition() throws Exception {
sql(
"CREATE TABLE T ("
+ " k STRING,"
+ " dt STRING,"
+ " hm STRING,"
+ " PRIMARY KEY (k, dt, hm) NOT ENFORCED"
+ ") PARTITIONED BY (dt, hm) WITH ("
+ " 'bucket' = '1'"
+ ")");
FileStoreTable table = paimonTable("T");
// Test there are no expired partitions.
assertThat(
callExpirePartitions(
"CALL sys.expire_partitions("
+ "`table` => 'default.T'"
+ ", expiration_time => '1 d'"
+ ", timestamp_formatter => 'yyyy-MM-dd')"))
.containsExactlyInAnyOrder("No expired partitions.");

sql("INSERT INTO T VALUES ('3', '2024-06-02', '02:00')");
sql("INSERT INTO T VALUES ('2', '2024-06-02', '01:00')");
sql("INSERT INTO T VALUES ('4', '2024-06-03', '01:00')");
sql("INSERT INTO T VALUES ('1', '2024-06-01', '01:00')");
// This partition never expires.
sql("INSERT INTO T VALUES ('Never-expire', '9999-09-09', '99:99')");

Function<InternalRow, String> consumerReadResult =
(InternalRow row) ->
row.getString(0) + ":" + row.getString(1) + ":" + row.getString(2);
assertThat(read(table, consumerReadResult))
.containsExactlyInAnyOrder(
"1:2024-06-01:01:00",
"2:2024-06-02:01:00",
"3:2024-06-02:02:00",
"4:2024-06-03:01:00",
"Never-expire:9999-09-09:99:99");

// Show a list of expired partitions.
assertThat(
callExpirePartitions(
"CALL sys.expire_partitions("
+ "`table` => 'default.T'"
+ ", expiration_time => '1 d'"
+ ", timestamp_formatter => 'yyyy-MM-dd', max_expires => 3)"))
.containsExactly(
"dt=2024-06-01, hm=01:00",
"dt=2024-06-02, hm=01:00",
"dt=2024-06-02, hm=02:00");

assertThat(read(table, consumerReadResult))
.containsExactly("4:2024-06-03:01:00", "Never-expire:9999-09-09:99:99");
}

/** Return a list of expired partitions. */
public List<String> callExpirePartitions(String callSql) {
return sql(callSql).stream()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import java.util.Optional;

import static org.apache.paimon.partition.PartitionExpireStrategy.createPartitionExpireStrategy;
import static org.apache.spark.sql.types.DataTypes.IntegerType;
import static org.apache.spark.sql.types.DataTypes.StringType;

/** A procedure to expire partitions. */
Expand All @@ -51,7 +52,8 @@ public class ExpirePartitionsProcedure extends BaseProcedure {
ProcedureParameter.required("expiration_time", StringType),
ProcedureParameter.optional("timestamp_formatter", StringType),
ProcedureParameter.optional("timestamp_pattern", StringType),
ProcedureParameter.optional("expire_strategy", StringType)
ProcedureParameter.optional("expire_strategy", StringType),
ProcedureParameter.optional("max_expires", IntegerType)
};

private static final StructType OUTPUT_TYPE =
Expand Down Expand Up @@ -81,6 +83,7 @@ public InternalRow[] call(InternalRow args) {
String timestampFormatter = args.isNullAt(2) ? null : args.getString(2);
String timestampPattern = args.isNullAt(3) ? null : args.getString(3);
String expireStrategy = args.isNullAt(4) ? null : args.getString(4);
Integer maxExpires = args.isNullAt(5) ? null : args.getInt(5);
return modifyPaimonTable(
tableIdent,
table -> {
Expand All @@ -105,6 +108,9 @@ public InternalRow[] call(InternalRow args) {
.metastoreClientFactory())
.map(MetastoreClient.Factory::create)
.orElse(null));
if (maxExpires != null) {
partitionExpire.withMaxExpires(maxExpires);
}
List<Map<String, String>> expired = partitionExpire.expire(Long.MAX_VALUE);
return expired == null || expired.isEmpty()
? new InternalRow[] {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -474,4 +474,81 @@ class ExpirePartitionsProcedureTest extends PaimonSparkTestBase with StreamTest
}
}
}

test("Paimon procedure : sorted the expired partitions with max_expires.") {
failAfter(streamingTimeout) {
withTempDir {
checkpointDir =>
spark.sql(s"""
|CREATE TABLE T (k STRING, pt STRING, hm STRING)
|TBLPROPERTIES ('primary-key'='k,pt,hm', 'bucket'='1')
| PARTITIONED BY (pt,hm)
|""".stripMargin)
val location = loadTable("T").location().toString

val inputData = MemoryStream[(String, String, String)]
val stream = inputData
.toDS()
.toDF("k", "pt", "hm")
.writeStream
.option("checkpointLocation", checkpointDir.getCanonicalPath)
.foreachBatch {
(batch: Dataset[Row], _: Long) =>
batch.write.format("paimon").mode("append").save(location)
}
.start()

val query = () => spark.sql("SELECT * FROM T")

try {
// Show results : There are no expired partitions.
checkAnswer(
spark.sql(
"CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d'" +
", timestamp_formatter => 'yyyy-MM-dd')"),
Row("No expired partitions.") :: Nil
)

inputData.addData(("a", "2024-06-02", "02:00"))
stream.processAllAvailable()
inputData.addData(("b", "2024-06-02", "01:00"))
stream.processAllAvailable()
inputData.addData(("d", "2024-06-03", "01:00"))
stream.processAllAvailable()
inputData.addData(("c", "2024-06-01", "01:00"))
stream.processAllAvailable()
// this snapshot never expires.
inputData.addData(("Never-expire", "9999-09-09", "99:99"))
stream.processAllAvailable()

checkAnswer(
query(),
Row("a", "2024-06-02", "02:00") :: Row("b", "2024-06-02", "01:00") :: Row(
"d",
"2024-06-03",
"01:00") :: Row("c", "2024-06-01", "01:00") :: Row(
"Never-expire",
"9999-09-09",
"99:99") :: Nil
)

// sorted result of limited expired partitions.
checkAnswer(
spark.sql(
"CALL paimon.sys.expire_partitions(table => 'test.T'" +
", expiration_time => '1 d'" +
", timestamp_formatter => 'yyyy-MM-dd', max_expires => 3)"),
Row("pt=2024-06-01, hm=01:00") :: Row("pt=2024-06-02, hm=01:00") :: Row(
"pt=2024-06-02, hm=02:00") :: Nil
)

checkAnswer(
query(),
Row("d", "2024-06-03", "01:00") :: Row("Never-expire", "9999-09-09", "99:99") :: Nil)
} finally {
stream.stop()
}
}
}
}
}

0 comments on commit c4bc403

Please sign in to comment.