From a2cbc7984687f1ae6cfef5609b24a5a93564019a Mon Sep 17 00:00:00 2001
From: tsreaper
Date: Mon, 28 Oct 2024 16:51:27 +0800
Subject: [PATCH] [docs] Add example on querying Paimon table with Trino Iceberg connector (#4368)

---
 .../migration/iceberg-compatibility.md       | 94 +++++++++++++++++--
 1 file changed, 84 insertions(+), 10 deletions(-)

diff --git a/docs/content/migration/iceberg-compatibility.md b/docs/content/migration/iceberg-compatibility.md
index 6a7103473acd..17e1c6523398 100644
--- a/docs/content/migration/iceberg-compatibility.md
+++ b/docs/content/migration/iceberg-compatibility.md
@@ -50,22 +50,23 @@ Set the following table options, so that Paimon tables can generate Iceberg comp
 When set, produce Iceberg metadata after a snapshot is committed,
 so that Iceberg readers can read Paimon's raw data files.
 
-For most SQL users, we recommend setting `'metadata.iceberg.storage' = 'hadoop-catalog'`,
+For most SQL users, we recommend setting `'metadata.iceberg.storage' = 'hadoop-catalog'`
+or `'metadata.iceberg.storage' = 'hive-catalog'`,
 so that all tables can be visited as an Iceberg warehouse.
 For Iceberg Java API users, you might consider setting `'metadata.iceberg.storage' = 'table-location'`,
 so you can visit each table with its table path.
 
-## Example: Query Paimon Append Only Tables with Iceberg Connector
+## Example: Query Paimon Append Only Tables on Flink/Spark with Iceberg Connector
 
 Let's walk through a simple example, where we query Paimon tables with Iceberg connectors in Flink and Spark.
 Before trying out this example, make sure that your compute engine already supports Iceberg.
@@ -101,7 +102,7 @@ Start `spark-sql` with the following command line.
 ```bash
 spark-sql --jars <path-to-paimon-jar> \
     --conf spark.sql.catalog.paimon_catalog=org.apache.paimon.spark.SparkCatalog \
-    --conf spark.sql.catalog.paimon_catalog.warehouse=/tmp/sparkware \
+    --conf spark.sql.catalog.paimon_catalog.warehouse=<path-to-warehouse> \
     --packages org.apache.iceberg:iceberg-spark-runtime-<iceberg-version> \
     --conf spark.sql.catalog.iceberg_catalog=org.apache.iceberg.spark.SparkCatalog \
     --conf spark.sql.catalog.iceberg_catalog.type=hadoop \
@@ -199,7 +200,7 @@ germany hamburg
 {{< /tabs >}}
 
-## Example: Query Paimon Primary Key Tables with Iceberg Connector
+## Example: Query Paimon Primary Key Tables on Flink/Spark with Iceberg Connector
 
 {{< tabs "paimon-primary-key-table" >}}
 
@@ -258,8 +259,8 @@ Start `spark-sql` with the following command line.
 ```bash
 spark-sql --jars <path-to-paimon-jar> \
     --conf spark.sql.catalog.paimon_catalog=org.apache.paimon.spark.SparkCatalog \
-    --conf spark.sql.catalog.paimon_catalog.warehouse=/tmp/sparkware \
-    --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1 \
+    --conf spark.sql.catalog.paimon_catalog.warehouse=<path-to-warehouse> \
+    --packages org.apache.iceberg:iceberg-spark-runtime-<iceberg-version> \
     --conf spark.sql.catalog.iceberg_catalog=org.apache.iceberg.spark.SparkCatalog \
     --conf spark.sql.catalog.iceberg_catalog.type=hadoop \
     --conf spark.sql.catalog.iceberg_catalog.warehouse=<path-to-warehouse>/iceberg \
@@ -377,6 +378,79 @@ you also need to set some (or all) of the following table options when creating
 
+## Example: Query Paimon Append Only Tables on Trino with Iceberg Connector
+
+In this example, we use the Trino Iceberg connector to access a Paimon table through the Iceberg Hive catalog.
+Before trying out this example, make sure that you have configured the Trino Iceberg connector.
+See [Trino's documentation](https://trino.io/docs/current/connector/iceberg.html#general-configuration) for more information.
+
+Let's first create a Paimon table with Iceberg compatibility enabled.
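+
+As a reference for the Trino side, a minimal Iceberg catalog definition might look like the
+following sketch. This is not a fixed recipe: the properties file path, the catalog name
+`iceberg`, and the metastore URI are placeholders to adapt to your deployment (see the Trino
+documentation linked above).
+
+```bash
+# Sketch: register a Trino Iceberg catalog backed by the same Hive metastore
+# that the Paimon table below publishes its Iceberg metadata to.
+cat > etc/catalog/iceberg.properties <<'EOF'
+connector.name=iceberg
+iceberg.catalog.type=hive_metastore
+hive.metastore.uri=thrift://<host>:<port>
+EOF
+```
+
+With the Trino catalog in place, create the table as follows.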
+
+{{< tabs "paimon-append-only-table-trino-1" >}}
+
+{{< tab "Flink SQL" >}}
+```sql
+CREATE CATALOG paimon_catalog WITH (
+    'type' = 'paimon',
+    'warehouse' = '<path-to-warehouse>'
+);
+
+CREATE TABLE paimon_catalog.`default`.animals (
+    kind STRING,
+    name STRING
+) WITH (
+    'metadata.iceberg.storage' = 'hive-catalog',
+    'metadata.iceberg.uri' = 'thrift://<host>:<port>'
+);
+
+INSERT INTO paimon_catalog.`default`.animals VALUES ('mammal', 'cat'), ('mammal', 'dog'), ('reptile', 'snake'), ('reptile', 'lizard');
+```
+{{< /tab >}}
+
+{{< tab "Spark SQL" >}}
+Start `spark-sql` with the following command line.
+
+```bash
+spark-sql --jars <path-to-paimon-jar> \
+    --conf spark.sql.catalog.paimon_catalog=org.apache.paimon.spark.SparkCatalog \
+    --conf spark.sql.catalog.paimon_catalog.warehouse=<path-to-warehouse> \
+    --packages org.apache.iceberg:iceberg-spark-runtime-<iceberg-version> \
+    --conf spark.sql.catalog.iceberg_catalog=org.apache.iceberg.spark.SparkCatalog \
+    --conf spark.sql.catalog.iceberg_catalog.type=hadoop \
+    --conf spark.sql.catalog.iceberg_catalog.warehouse=<path-to-warehouse>/iceberg \
+    --conf spark.sql.catalog.iceberg_catalog.cache-enabled=false `# disable iceberg catalog caching to quickly see the result` \
+    --conf spark.sql.extensions=org.apache.paimon.spark.extensions.PaimonSparkSessionExtensions,org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
+```
+
+Run the following Spark SQL to create the Paimon table and insert data.
+
+```sql
+CREATE TABLE paimon_catalog.`default`.animals (
+    kind STRING,
+    name STRING
+) TBLPROPERTIES (
+    'metadata.iceberg.storage' = 'hive-catalog',
+    'metadata.iceberg.uri' = 'thrift://<host>:<port>'
+);
+
+INSERT INTO paimon_catalog.`default`.animals VALUES ('mammal', 'cat'), ('mammal', 'dog'), ('reptile', 'snake'), ('reptile', 'lizard');
+```
+{{< /tab >}}
+
+{{< /tabs >}}
+
+Start Trino with the Iceberg catalog and query the Paimon table.
+
+```sql
+SELECT * FROM animals WHERE kind = 'mammal';
+/*
+  kind  | name
+--------+------
+ mammal | cat
+ mammal | dog
+*/
+```
+
 ## Supported Types
 
 Paimon Iceberg compatibility currently supports the following data types.
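
To reproduce the Trino query step above, a CLI session might look like the following sketch.
The server address is an assumption; `--catalog iceberg` and `--schema default` match the
catalog and database names used in the example.

```bash
# Hypothetical session; point --server at your Trino coordinator.
trino --server http://localhost:8080 --catalog iceberg --schema default
# at the trino> prompt:
#   SELECT * FROM animals WHERE kind = 'mammal';
```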