From 99893c83328190f9ce0ffc6ddfdb0a3ced10a916 Mon Sep 17 00:00:00 2001 From: StreamNative Bot <44651360+streamnativebot@users.noreply.github.com> Date: Sun, 27 Oct 2024 18:28:47 -0700 Subject: [PATCH] Docs sync 20241028 (#457) Co-authored-by: shibd --- .../google-bigquery-sink.md | 40 +- .../google-bigquery-sink.md | 6 +- .../v4.0.0.2/google-bigquery-sink.md | 250 +++++++++++ .../google-bigquery-source.md | 4 +- .../google-bigquery-source.md | 4 +- .../v4.0.0.2/google-bigquery-source.md | 180 ++++++++ .../google-cloud-storage-sink.md | 6 +- .../google-cloud-storage-sink.md | 6 +- .../v4.0.0.2/google-cloud-storage-sink.md | 249 ++++++++++ .../google-pubsub-sink.md | 4 +- .../google-pubsub-sink.md | 4 +- .../v4.0.0.2/google-pubsub-sink.md | 148 ++++++ .../google-pubsub-source.md | 4 +- .../google-pubsub-source.md | 4 +- .../v4.0.0.2/google-pubsub-source.md | 155 +++++++ .../{v3.3.1.1 => v3.0.7.1}/lakehouse-sink.md | 24 +- .../{v3.0.6.1 => v3.3.1.9}/lakehouse-sink.md | 24 +- .../lakehouse-sink/v4.0.0.2/lakehouse-sink.md | 424 ++++++++++++++++++ .../lakehouse-source.md | 16 +- .../lakehouse-source.md | 16 +- .../v4.0.0.2/lakehouse-source.md | 266 +++++++++++ .../{v3.0.6.1 => v3.0.7.1}/pinecone-sink.md | 2 +- .../{v3.3.1.1 => v3.3.1.9}/pinecone-sink.md | 2 +- .../pinecone-sink/v4.0.0.2/pinecone-sink.md | 261 +++++++++++ .../{v3.0.6.1 => v3.0.7.1}/snowflake-sink.md | 0 .../{v3.3.1.1 => v3.3.1.9}/snowflake-sink.md | 0 .../snowflake-sink/v4.0.0.2/snowflake-sink.md | 275 ++++++++++++ .../{v3.0.6.1 => v3.0.7.1}/sqs-sink.md | 2 +- .../{v3.3.1.1 => v3.3.1.9}/sqs-sink.md | 2 +- connectors/sqs-sink/v4.0.0.2/sqs-sink.md | 276 ++++++++++++ .../{v3.0.6.1 => v3.0.7.1}/sqs-source.md | 2 +- .../{v3.3.1.1 => v3.3.1.9}/sqs-source.md | 2 +- connectors/sqs-source/v4.0.0.2/sqs-source.md | 149 ++++++ 33 files changed, 2720 insertions(+), 87 deletions(-) rename connectors/google-bigquery-sink/{v3.0.6.1 => v3.0.7.1}/google-bigquery-sink.md (72%) rename 
connectors/google-bigquery-sink/{v3.3.1.1 => v3.3.1.9}/google-bigquery-sink.md (98%) create mode 100644 connectors/google-bigquery-sink/v4.0.0.2/google-bigquery-sink.md rename connectors/google-bigquery-source/{v3.0.6.1 => v3.0.7.1}/google-bigquery-source.md (99%) rename connectors/google-bigquery-source/{v3.3.1.1 => v3.3.1.9}/google-bigquery-source.md (99%) create mode 100644 connectors/google-bigquery-source/v4.0.0.2/google-bigquery-source.md rename connectors/google-cloud-storage-sink/{v3.0.6.1 => v3.0.7.1}/google-cloud-storage-sink.md (99%) rename connectors/google-cloud-storage-sink/{v3.3.1.1 => v3.3.1.9}/google-cloud-storage-sink.md (99%) create mode 100644 connectors/google-cloud-storage-sink/v4.0.0.2/google-cloud-storage-sink.md rename connectors/google-pubsub-sink/{v3.0.6.1 => v3.0.7.1}/google-pubsub-sink.md (98%) rename connectors/google-pubsub-sink/{v3.3.1.1 => v3.3.1.9}/google-pubsub-sink.md (98%) create mode 100644 connectors/google-pubsub-sink/v4.0.0.2/google-pubsub-sink.md rename connectors/google-pubsub-source/{v3.3.1.1 => v3.0.7.1}/google-pubsub-source.md (98%) rename connectors/google-pubsub-source/{v3.0.6.1 => v3.3.1.9}/google-pubsub-source.md (98%) create mode 100644 connectors/google-pubsub-source/v4.0.0.2/google-pubsub-source.md rename connectors/lakehouse-sink/{v3.3.1.1 => v3.0.7.1}/lakehouse-sink.md (96%) rename connectors/lakehouse-sink/{v3.0.6.1 => v3.3.1.9}/lakehouse-sink.md (96%) create mode 100644 connectors/lakehouse-sink/v4.0.0.2/lakehouse-sink.md rename connectors/lakehouse-source/{v3.0.6.1 => v3.0.7.1}/lakehouse-source.md (95%) rename connectors/lakehouse-source/{v3.3.1.1 => v3.3.1.9}/lakehouse-source.md (95%) create mode 100644 connectors/lakehouse-source/v4.0.0.2/lakehouse-source.md rename connectors/pinecone-sink/{v3.0.6.1 => v3.0.7.1}/pinecone-sink.md (99%) rename connectors/pinecone-sink/{v3.3.1.1 => v3.3.1.9}/pinecone-sink.md (99%) create mode 100644 connectors/pinecone-sink/v4.0.0.2/pinecone-sink.md rename 
connectors/snowflake-sink/{v3.0.6.1 => v3.0.7.1}/snowflake-sink.md (100%) rename connectors/snowflake-sink/{v3.3.1.1 => v3.3.1.9}/snowflake-sink.md (100%) create mode 100644 connectors/snowflake-sink/v4.0.0.2/snowflake-sink.md rename connectors/sqs-sink/{v3.0.6.1 => v3.0.7.1}/sqs-sink.md (99%) rename connectors/sqs-sink/{v3.3.1.1 => v3.3.1.9}/sqs-sink.md (99%) create mode 100644 connectors/sqs-sink/v4.0.0.2/sqs-sink.md rename connectors/sqs-source/{v3.0.6.1 => v3.0.7.1}/sqs-source.md (99%) rename connectors/sqs-source/{v3.3.1.1 => v3.3.1.9}/sqs-source.md (99%) create mode 100644 connectors/sqs-source/v4.0.0.2/sqs-source.md diff --git a/connectors/google-bigquery-sink/v3.0.6.1/google-bigquery-sink.md b/connectors/google-bigquery-sink/v3.0.7.1/google-bigquery-sink.md similarity index 72% rename from connectors/google-bigquery-sink/v3.0.6.1/google-bigquery-sink.md rename to connectors/google-bigquery-sink/v3.0.7.1/google-bigquery-sink.md index 5339ee30..3354265d 100644 --- a/connectors/google-bigquery-sink/v3.0.6.1/google-bigquery-sink.md +++ b/connectors/google-bigquery-sink/v3.0.7.1/google-bigquery-sink.md @@ -1,7 +1,7 @@ --- description: BigQuery Connector integrates Apache Pulsar with Google BigQuery. author: StreamNative -contributors: shibd,danpi,codelipenghui,illegalnumbers +contributors: shibd,danpi,codelipenghui,nlu90 language: Java,Shell,Dockerfile document: source: Private source @@ -25,7 +25,7 @@ id: "google-bigquery-sink" The [Google Cloud BigQuery](https://cloud.google.com/bigquery) sink connector pulls data from Pulsar topics and persists data to Google Cloud BigQuery tables. 
-![](https://raw.githubusercontent.com/streamnative/pulsar-io-bigquery/v3.0.6.1/docs/google-bigquery-sink.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-bigquery/v3.0.7.1/docs/google-bigquery-sink.png) ## Quick start @@ -139,24 +139,24 @@ SELECT * FROM `{{Your project id}}.{{Your dataset name}}.{{Your table name}}` Before using the Google Cloud BigQuery sink connector, you need to configure it. This table outlines the properties and the descriptions. -| Name | Type | Required | Default | Description | -|-------------------------------|---------|----------|-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `projectId` | String | Yes | "" (empty string) | The Google BigQuery project ID. | -| `datasetName` | String | Yes | "" (empty string) | The Google BigQuery dataset name. | -| `tableName` | String | Yes | "" (empty string) | The Google BigQuery table name. | -| `credentialJsonString` | String | Yes | "" (empty string) | The authentication JSON key. Set the environment variable `GOOGLE_APPLICATION_CREDENTIALS` to the path of the JSON file that contains your service account key when the `credentialJsonString` is set to an empty string. For details, see the [Google documentation](https://cloud.google.com/bigquery/docs/quickstarts/quickstart-client-libraries#before-you-begin). | -| `visibleModel` | String | No | "Committed" | The mode that controls when data written to the stream becomes visible in BigQuery for reading. For details, see the [Google documentation](https://cloud.google.com/bigquery/docs/write-api#application-created_streams). Available options are `Committed` and `Pending`. 
| -| `pendingMaxSize` | int | No | 10000 | The maximum number of messages waiting to be committed in `Pending` mode. | -| `batchMaxSize` | int | No | 20 | The maximum number of batch messages. | -| `batchMaxTime` | long | No | 5000 | The maximum batch waiting time (in units of milliseconds). | -| `batchFlushIntervalTime` | long | No | 2000 | The batch flush interval (in units of milliseconds). | -| `failedMaxRetryNum` | int | No | 20 | The maximum retries when appending fails. By default, it sets 2 seconds for each retry. | -| `autoCreateTable` | boolean | No | true | Automatically create a table if no table is available. | -| `autoUpdateTable` | boolean | No | true | Automatically update the table schema if the BigQuery table schema is incompatible with the Pulsar schema. | -| `partitionedTables` | boolean | No | true | Create a partitioned table when the table is automatically created. It will use the `__event_time__` as the partition key. | -| `partitionedTableIntervalDay` | int | No | 7 | The number of days between partitioning of the partitioned table. | -| `clusteredTables` | boolean | No | true | Create a clustered table when the table is automatically created. It will use the `__message_id__` as the cluster key. | -| `defaultSystemField` | String | No | "" (empty string) | Create the system fields when the table is automatically created. You can use commas to separate multiple fields. The supported system fields are: `__schema_version__` , `__partition__` , `__event_time__`, `__publish_time__` , `__message_id__` , `__sequence_id__` , and `__producer_name__`. 
| +| Name | Type | Required | Sensitive | Default | Description | +|-------------------------------|---------|----------|-----------|-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `projectId` | String | Yes | false | "" (empty string) | The Google BigQuery project ID. | +| `datasetName` | String | Yes | false | "" (empty string) | The Google BigQuery dataset name. | +| `tableName` | String | Yes | false | "" (empty string) | The Google BigQuery table name. | +| `credentialJsonString` | String | Yes | true | "" (empty string) | The authentication JSON key. Set the environment variable `GOOGLE_APPLICATION_CREDENTIALS` to the path of the JSON file that contains your service account key when the `credentialJsonString` is set to an empty string. For details, see the [Google documentation](https://cloud.google.com/bigquery/docs/quickstarts/quickstart-client-libraries#before-you-begin). | +| `visibleModel` | String | No | false | "Committed" | The mode that controls when data written to the stream becomes visible in BigQuery for reading. For details, see the [Google documentation](https://cloud.google.com/bigquery/docs/write-api#application-created_streams). Available options are `Committed` and `Pending`. | +| `pendingMaxSize` | int | No | false | 10000 | The maximum number of messages waiting to be committed in `Pending` mode. | +| `batchMaxSize` | int | No | false | 20 | The maximum number of batch messages. The actual batch bytes size cannot exceed 10 MB. If it does, the batch will be flushed first. 
https://cloud.google.com/bigquery/quotas | +| `batchMaxTime` | long | No | false | 5000 | The maximum batch waiting time (in units of milliseconds). | +| `batchFlushIntervalTime` | long | No | false | 2000 | The batch flush interval (in units of milliseconds). | +| `failedMaxRetryNum` | int | No | false | 20 | The maximum retries when appending fails. By default, it sets 2 seconds for each retry. | +| `autoCreateTable` | boolean | No | false | true | Automatically create a table if no table is available. | +| `autoUpdateTable` | boolean | No | false | true | Automatically update the table schema if the BigQuery table schema is incompatible with the Pulsar schema. | +| `partitionedTables` | boolean | No | false | true | Create a partitioned table when the table is automatically created. It will use the `__event_time__` as the partition key. | +| `partitionedTableIntervalDay` | int | No | false | 7 | The number of days between partitioning of the partitioned table. | +| `clusteredTables` | boolean | No | false | true | Create a clustered table when the table is automatically created. It will use the `__message_id__` as the cluster key. | +| `defaultSystemField` | String | No | false | "" (empty string) | Create the system fields when the table is automatically created. You can use commas to separate multiple fields. The supported system fields are: `__schema_version__` , `__partition__` , `__event_time__`, `__publish_time__` , `__message_id__` , `__sequence_id__` , and `__producer_name__`. 
| ## Advanced features diff --git a/connectors/google-bigquery-sink/v3.3.1.1/google-bigquery-sink.md b/connectors/google-bigquery-sink/v3.3.1.9/google-bigquery-sink.md similarity index 98% rename from connectors/google-bigquery-sink/v3.3.1.1/google-bigquery-sink.md rename to connectors/google-bigquery-sink/v3.3.1.9/google-bigquery-sink.md index 19212638..6d06e27f 100644 --- a/connectors/google-bigquery-sink/v3.3.1.1/google-bigquery-sink.md +++ b/connectors/google-bigquery-sink/v3.3.1.9/google-bigquery-sink.md @@ -1,7 +1,7 @@ --- description: BigQuery Connector integrates Apache Pulsar with Google BigQuery. author: StreamNative -contributors: shibd,danpi,codelipenghui,illegalnumbers +contributors: shibd,danpi,codelipenghui,nlu90 language: Java,Shell,Dockerfile document: source: Private source @@ -25,7 +25,7 @@ id: "google-bigquery-sink" The [Google Cloud BigQuery](https://cloud.google.com/bigquery) sink connector pulls data from Pulsar topics and persists data to Google Cloud BigQuery tables. -![](https://raw.githubusercontent.com/streamnative/pulsar-io-bigquery/v3.3.1.1/docs/google-bigquery-sink.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-bigquery/v3.3.1.9/docs/google-bigquery-sink.png) ## Quick start @@ -147,7 +147,7 @@ Before using the Google Cloud BigQuery sink connector, you need to configure it. | `credentialJsonString` | String | Yes | true | "" (empty string) | The authentication JSON key. Set the environment variable `GOOGLE_APPLICATION_CREDENTIALS` to the path of the JSON file that contains your service account key when the `credentialJsonString` is set to an empty string. For details, see the [Google documentation](https://cloud.google.com/bigquery/docs/quickstarts/quickstart-client-libraries#before-you-begin). | | `visibleModel` | String | No | false | "Committed" | The mode that controls when data written to the stream becomes visible in BigQuery for reading. 
For details, see the [Google documentation](https://cloud.google.com/bigquery/docs/write-api#application-created_streams). Available options are `Committed` and `Pending`. | | `pendingMaxSize` | int | No | false | 10000 | The maximum number of messages waiting to be committed in `Pending` mode. | -| `batchMaxSize` | int | No | false | 20 | The maximum number of batch messages. | +| `batchMaxSize` | int | No | false | 20 | The maximum number of batch messages. The actual batch bytes size cannot exceed 10 MB. If it does, the batch will be flushed first. https://cloud.google.com/bigquery/quotas | | `batchMaxTime` | long | No | false | 5000 | The maximum batch waiting time (in units of milliseconds). | | `batchFlushIntervalTime` | long | No | false | 2000 | The batch flush interval (in units of milliseconds). | | `failedMaxRetryNum` | int | No | false | 20 | The maximum retries when appending fails. By default, it sets 2 seconds for each retry. | diff --git a/connectors/google-bigquery-sink/v4.0.0.2/google-bigquery-sink.md b/connectors/google-bigquery-sink/v4.0.0.2/google-bigquery-sink.md new file mode 100644 index 00000000..e8bc99db --- /dev/null +++ b/connectors/google-bigquery-sink/v4.0.0.2/google-bigquery-sink.md @@ -0,0 +1,250 @@ +--- +description: BigQuery Connector integrates Apache Pulsar with Google BigQuery. +author: StreamNative +contributors: shibd,danpi,codelipenghui,nlu90 +language: Java,Shell,Dockerfile +document: +source: Private source +license: StreamNative, Inc.. 
All Rights Reserved +license_link: +tags: +alias: Google Cloud BigQuery Sink Connector +features: ["BigQuery Connector integrates Apache Pulsar with Google BigQuery."] +icon: "/images/connectors/google-bigquery-logo.png" +download: +support: streamnative +support_link: https://streamnative.io +support_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +owner_name: "streamnative" +owner_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +dockerfile: +sn_available: "true" +id: "google-bigquery-sink" +--- + + +The [Google Cloud BigQuery](https://cloud.google.com/bigquery) sink connector pulls data from Pulsar topics and persists data to Google Cloud BigQuery tables. + +![](https://raw.githubusercontent.com/streamnative/pulsar-io-bigquery/v4.0.0.2/docs/google-bigquery-sink.png) + +## Quick start + +### Prerequisites + +The prerequisites for connecting a Google BigQuery sink connector to external systems include: + +1. Create a Google BigQuery dataset in Google Cloud. +2. Create the [Gcloud ServiceAccount](https://cloud.google.com/iam/docs/service-accounts-create) and create a public key certificate. +3. Create the [Gcloud Role](https://cloud.google.com/iam/docs/creating-custom-roles), and ensure that the Google Cloud role has the following permissions for the Google [BigQuery API](https://cloud.google.com/bigquery/docs/access-control): +```text +- bigquery.tables.create +- bigquery.tables.get +- bigquery.tables.getData +- bigquery.tables.list +- bigquery.tables.update +- bigquery.tables.updateData +``` +4. Grant the service account the above role permissions. + + +### 1. Create a connector + +The following command shows how to use [pulsarctl](https://github.com/streamnative/pulsarctl) to create a `builtin` connector. If you want to create a `non-builtin` connector, +you need to replace `--sink-type bigquery` with `--archive /path/to/pulsar-io-bigquery.nar`. You can find the button to download the `nar` package at the beginning of the document.
+ +{% callout title="For StreamNative Cloud User" type="note" %} +If you are a StreamNative Cloud user, you need to [set up your environment](https://docs.streamnative.io/docs/connector-setup) first. +{% /callout %} + +```bash +pulsarctl sinks create \ + --sink-type bigquery \ + --name bigquery-sink \ + --tenant public \ + --namespace default \ + --inputs "Your topic name" \ + --parallelism 1 \ + --sink-config \ + '{ + "projectId": "Your BigQuery project Id", + "datasetName": "Your Bigquery DataSet name", + "tableName": "The name of the table you want to write data to is automatically created by default", + "credentialJsonString": "Public key certificate you created above" + }' +``` + +The `--sink-config` value shown above is the minimum configuration required to start this connector, and it is a JSON string. You need to substitute the relevant parameters with your own. +If you want to configure more parameters, see [Configuration Properties](#configuration-properties) for reference. + +{% callout title="Note" type="note" %} +You can also choose to use a variety of other tools to create a connector: +- [pulsar-admin](https://pulsar.apache.org/docs/3.1.x/io-use/): The command arguments for `pulsar-admin` are similar to those of `pulsarctl`. You can find an example in the [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). +- [RestAPI](https://pulsar.apache.org/sink-rest-api/?version=3.1.1): You can find an example in the [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). +- [Terraform](https://github.com/hashicorp/terraform): You can find an example in the [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). +- [Function Mesh](https://functionmesh.io/docs/connectors/run-connector): The docker image can be found at the beginning of the document. +{% /callout %} + +### 2.
Send messages to the topic + +{% callout title="Note" type="note" %} +If your connector is created on StreamNative Cloud, you need to authenticate your clients. See [Build applications using Pulsar clients](https://docs.streamnative.io/docs/qs-connect#jumpstart-for-beginners) for more information. +{% /callout %} + +``` java +@Data +@ToString +public class TestMessage { + private String testString; + private String testInt; + + public static void main(String[] args) { + PulsarClient client = PulsarClient.builder() + .serviceUrl("{{Your Pulsar URL}}") + .build(); + + Producer producer = client.newProducer(Schema.AVRO(TestMessage.class)) + .topic("{{Your topic name}}") + .create(); + + AvroDataConvertTestIntegration testMessage = new AvroDataConvertTestIntegration(); + testMessage.setTestString("test string"); + testMessage.setTestInt(123); + MessageId msgID = producer.send(testMessage); + System.out.println("Publish " + testMessage + " and message ID " + msgID); + + producer.flush(); + producer.close(); + client.close(); + } +} + +``` + +### 3. Show data on Google BigQuery + +This connector will automatically create the table structure according to the schema. You can use sql to query the data in the console. 
+ +```sql +SELECT * FROM `{{Your project id}}.{{Your dataset name}}.{{Your table name}}` + ++-----------------+-----------------+--------------------------------+----------------------------+-------------+---------+ +| __meessage_id__ | __sequence_id__ | __event_time__ | __producer_name__ | testString | testInt | ++-----------------+-----------------+--------------------------------+----------------------------+-------------+---------+ +| 9:20:-1 | 0 | 2023-09-14 14:05:29.657000 UTC | test-bigquery-produce-name | test string | 123 | ++-----------------+-----------------+--------------------------------+----------------------------+-------------+---------+ + +``` + +## Configuration Properties + +Before using the Google Cloud BigQuery sink connector, you need to configure it. This table outlines the properties and the descriptions. + +| Name | Type | Required | Sensitive | Default | Description | +|-------------------------------|---------|----------|-----------|-------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `projectId` | String | Yes | false | "" (empty string) | The Google BigQuery project ID. | +| `datasetName` | String | Yes | false | "" (empty string) | The Google BigQuery dataset name. | +| `tableName` | String | Yes | false | "" (empty string) | The Google BigQuery table name. | +| `credentialJsonString` | String | Yes | true | "" (empty string) | The authentication JSON key. Set the environment variable `GOOGLE_APPLICATION_CREDENTIALS` to the path of the JSON file that contains your service account key when the `credentialJsonString` is set to an empty string. 
For details, see the [Google documentation](https://cloud.google.com/bigquery/docs/quickstarts/quickstart-client-libraries#before-you-begin). | +| `visibleModel` | String | No | false | "Committed" | The mode that controls when data written to the stream becomes visible in BigQuery for reading. For details, see the [Google documentation](https://cloud.google.com/bigquery/docs/write-api#application-created_streams). Available options are `Committed` and `Pending`. | +| `pendingMaxSize` | int | No | false | 10000 | The maximum number of messages waiting to be committed in `Pending` mode. | +| `batchMaxSize` | int | No | false | 20 | The maximum number of batch messages. The actual batch bytes size cannot exceed 10 MB. If it does, the batch will be flushed first. https://cloud.google.com/bigquery/quotas | +| `batchMaxTime` | long | No | false | 5000 | The maximum batch waiting time (in units of milliseconds). | +| `batchFlushIntervalTime` | long | No | false | 2000 | The batch flush interval (in units of milliseconds). | +| `failedMaxRetryNum` | int | No | false | 20 | The maximum retries when appending fails. By default, it sets 2 seconds for each retry. | +| `autoCreateTable` | boolean | No | false | true | Automatically create a table if no table is available. | +| `autoUpdateTable` | boolean | No | false | true | Automatically update the table schema if the BigQuery table schema is incompatible with the Pulsar schema. | +| `partitionedTables` | boolean | No | false | true | Create a partitioned table when the table is automatically created. It will use the `__event_time__` as the partition key. | +| `partitionedTableIntervalDay` | int | No | false | 7 | The number of days between partitioning of the partitioned table. | +| `clusteredTables` | boolean | No | false | true | Create a clustered table when the table is automatically created. It will use the `__message_id__` as the cluster key. 
| +| `defaultSystemField` | String | No | false | "" (empty string) | Create the system fields when the table is automatically created. You can use commas to separate multiple fields. The supported system fields are: `__schema_version__` , `__partition__` , `__event_time__`, `__publish_time__` , `__message_id__` , `__sequence_id__` , `__producer_name__` and `__properties__`. The `__properties__` field is stored as a repeated struct in BigQuery, with each key and value stored as a string. | + + +## Advanced features + +### Delivery guarantees + + +The Pulsar IO connector framework provides three [delivery guarantees](https://pulsar.apache.org/docs/next/functions-concepts#processing-guarantees-and-subscription-types): `at-most-once`, `at-least-once`, and `effectively-once`. + +Currently, the Google Cloud BigQuery sink connector only provides the `at-least-once` delivery guarantee. + +### Tables schema + +The Google Cloud BigQuery sink connector supports automatically creating and updating a table’s schema based on the Pulsar topic schema. You can configure the following options: +``` +autoCreateTable = true +autoUpdateTable = true +``` + +If the Pulsar topic schema and BigQuery schema are different, the Google Cloud BigQuery sink connector updates schemas by merging them together. +The Google Cloud BigQuery sink connector supports mapping schema structures to the BigQuery [RECORD TYPE](https://cloud.google.com/bigquery/docs/nested-repeated#example_schema). + +In addition, the Google Cloud BigQuery sink connector supports writing some Pulsar-specific fields, as shown below: +``` +# +# optional: __schema_version__ , __partition__ , __event_time__ , __publish_time__ +# __message_id__ , __sequence_id__ , __producer_name__ , __key__ , __properties__ +# +defaultSystemField = __event_time__,__message_id__ +``` + +{% callout title="Note" type="note" %} +The Google Cloud BigQuery sink connector does not delete any fields.
If you change a field name in a Pulsar topic, the Google Cloud BigQuery sink connector will preserve both fields. +{% /callout %} + +This table lists the schema types that the connector can currently convert. + +| Schema | Supported | +|-----------------|-----------| +| AVRO | Yes | +| PRIMITIVE | Yes | +| PROTOBUF_NATIVE | Yes | +| PROTOBUF | No | +| JSON | No | +| KEY_VALUE | No | + +### Partitioned tables + +{% callout title="Note" type="note" %} +This feature is only available when `autoCreateTable` is set to `true`. If you create a table manually, you need to manually specify the partition key. +{% /callout %} + +BigQuery supports [partitioned tables](https://cloud.google.com/bigquery/docs/partitioned-tables). Partitioned tables can improve query performance and control costs by reducing the data read from the table. +The Google Cloud BigQuery sink connector provides an option to create a partitioned table. The partitioned tables use the `__event_time__` as the partition key. +``` +partitionedTables = true +``` + +### Clustered tables + +{% callout title="Note" type="note" %} +This feature is only available when `autoCreateTable` is set to `true`. If you create a table manually, you need to manually specify the cluster key. +{% /callout %} + +[Clustered tables](https://cloud.google.com/bigquery/docs/clustered-tables) can improve the performance of certain queries, such as queries that use filter clauses and queries that aggregate data. The Google Cloud BigQuery sink connector provides an option to create a clustered table. The clustered tables use the `__message_id__` as the cluster key. +``` +clusteredTables = true +``` + +### Multiple tasks +You can leverage the Pulsar Functions scheduling mechanism to configure parallelism of the Google Cloud BigQuery sink connector. You can schedule +multiple sink instances to run on different Function worker nodes. These sink instances consume messages according to the configured subscription mode.
+ +``` +parallelism = 4 +``` + +{% callout title="Note" type="note" %} +Increasing parallelism is an effective way to relieve write bottlenecks. In addition, make sure that the write rate does not exceed the [BigQuery Rate Limits](https://cloud.google.com/bigquery/quotas#streaming_inserts). +{% /callout %} + +### Batch progress + +To increase write throughput, the Google Cloud BigQuery sink connector supports configuring the batch size. You can set the batch size and latency using the following options. +``` +batchMaxSize = 100 +batchMaxTime = 4000 +batchFlushIntervalTime = 2000 +``` + + diff --git a/connectors/google-bigquery-source/v3.0.6.1/google-bigquery-source.md b/connectors/google-bigquery-source/v3.0.7.1/google-bigquery-source.md similarity index 99% rename from connectors/google-bigquery-source/v3.0.6.1/google-bigquery-source.md rename to connectors/google-bigquery-source/v3.0.7.1/google-bigquery-source.md index 056b97b7..a489ce4c 100644 --- a/connectors/google-bigquery-source/v3.0.6.1/google-bigquery-source.md +++ b/connectors/google-bigquery-source/v3.0.7.1/google-bigquery-source.md @@ -1,7 +1,7 @@ --- description: BigQuery Connector integrates Apache Pulsar with Google BigQuery. author: StreamNative -contributors: shibd,danpi,codelipenghui,illegalnumbers +contributors: shibd,danpi,codelipenghui,nlu90 language: Java,Shell,Dockerfile document: source: Private source @@ -25,7 +25,7 @@ id: "google-bigquery-source" The [Google Cloud BigQuery](https://cloud.google.com/bigquery) Source Connector feeds data from Google Cloud BigQuery tables and writes data to Pulsar topics.
-![](https://raw.githubusercontent.com/streamnative/pulsar-io-bigquery/v3.0.6.1/docs/google-bigquery-source.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-bigquery/v3.0.7.1/docs/google-bigquery-source.png) ## Quick start diff --git a/connectors/google-bigquery-source/v3.3.1.1/google-bigquery-source.md b/connectors/google-bigquery-source/v3.3.1.9/google-bigquery-source.md similarity index 99% rename from connectors/google-bigquery-source/v3.3.1.1/google-bigquery-source.md rename to connectors/google-bigquery-source/v3.3.1.9/google-bigquery-source.md index c3e547b4..c0c26c63 100644 --- a/connectors/google-bigquery-source/v3.3.1.1/google-bigquery-source.md +++ b/connectors/google-bigquery-source/v3.3.1.9/google-bigquery-source.md @@ -1,7 +1,7 @@ --- description: BigQuery Connector integrates Apache Pulsar with Google BigQuery. author: StreamNative -contributors: shibd,danpi,codelipenghui,illegalnumbers +contributors: shibd,danpi,codelipenghui,nlu90 language: Java,Shell,Dockerfile document: source: Private source @@ -25,7 +25,7 @@ id: "google-bigquery-source" The [Google Cloud BigQuery](https://cloud.google.com/bigquery) Source Connector feeds data from Google Cloud BigQuery tables and writes data to Pulsar topics. -![](https://raw.githubusercontent.com/streamnative/pulsar-io-bigquery/v3.3.1.1/docs/google-bigquery-source.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-bigquery/v3.3.1.9/docs/google-bigquery-source.png) ## Quick start diff --git a/connectors/google-bigquery-source/v4.0.0.2/google-bigquery-source.md b/connectors/google-bigquery-source/v4.0.0.2/google-bigquery-source.md new file mode 100644 index 00000000..7fa72d00 --- /dev/null +++ b/connectors/google-bigquery-source/v4.0.0.2/google-bigquery-source.md @@ -0,0 +1,180 @@ +--- +description: BigQuery Connector integrates Apache Pulsar with Google BigQuery. 
+author: StreamNative +contributors: shibd,danpi,codelipenghui,nlu90 +language: Java,Shell,Dockerfile +document: +source: Private source +license: StreamNative, Inc.. All Rights Reserved +license_link: +tags: +alias: Google Cloud BigQuery Source Connector +features: ["BigQuery Connector integrates Apache Pulsar with Google BigQuery."] +icon: "/images/connectors/google-bigquery-logo.png" +download: +support: streamnative +support_link: https://streamnative.io +support_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +owner_name: "streamnative" +owner_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +dockerfile: +sn_available: "true" +id: "google-bigquery-source" +--- + + +The [Google Cloud BigQuery](https://cloud.google.com/bigquery) Source Connector feeds data from Google Cloud BigQuery tables and writes data to Pulsar topics. + +![](https://raw.githubusercontent.com/streamnative/pulsar-io-bigquery/v4.0.0.2/docs/google-bigquery-source.png) + +## Quick start + +### Prerequisites + +The prerequisites for connecting a Google BigQuery source connector to external systems include: + +1. Create GoogleBigQuery, DataSet and Table in Google Cloud. You can set the schema of the table, and this connector will convert the Avro schema to Pulsar. +2. Create the [Gcloud ServiceAccount](https://cloud.google.com/iam/docs/service-accounts-create) and create a public key certificate. +3. Create the [Gcloud Role](https://cloud.google.com/iam/docs/creating-custom-roles), and ensure the Google Cloud role has the following permissions to the Google [BigQuery API](https://cloud.google.com/bigquery/docs/access-control): +```text +- bigquery.readsessions.create +- bigquery.readsessions.getData +- bigquery.readsessions.update +- bigquery.jobs.create +- bigquery.tables.get +- bigquery.tables.getData +``` +4. Grant the service account the above role permissions. + + +### 1. Write data to Google BigQuery + +You can use SQL to insert some data into a table.
For example: + +```sql +INSERT INTO `{{Your dataset name}}.{{Your table name}}` (message, info) +VALUES + ("message-1", "This is a message-1."), + ("message-2", "This is a message-2."), + ("message-3", "This is a message-3."), + ("message-4", "This is a message-4."), + ("message-5", "This is a message-5."), + ("message-6", "This is a message-6."), + ("message-7", "This is a message-7."), + ("message-8", "This is a message-8."), + ("message-9", "This is a message-9."), + ("message-10", "This is a message-10."); +``` + +{% callout title="Note" type="note" %} + +This connector will create a snapshot of the BigQuery table to synchronize data when it starts, so you must make sure that there is data in the table before starting the connector. + +In other words, it will only synchronize the data before the start-up, and once the data synchronization is complete, the current implementation will not discover new data to synchronize. + +{% /callout %} + + +### 2. Create a connector + +The following command shows how to use [pulsarctl](https://github.com/streamnative/pulsarctl) to create a `builtin` connector. If you want to create a `non-builtin` connector, +you need to replace `--source-type bigquery` with `--archive /path/to/pulsar-io-bigquery.nar`. You can find the button to download the `nar` package at the beginning of the document. + +{% callout title="For StreamNative Cloud User" type="note" %} +If you are a StreamNative Cloud user, you need to [set up your environment](https://docs.streamnative.io/docs/connector-setup) first.
+{% /callout %} + +```bash +pulsarctl sources create \ + --source-type bigquery \ + --name bigquery-source \ + --tenant public \ + --namespace default \ + --destination-topic-name "Your topic name" \ + --parallelism 1 \ + --batch-source-config '{"discoveryTriggererClassName": "org.apache.pulsar.ecosystem.io.bigquery.source.BigQueryOnceTrigger"}' \ + --source-config \ + '{ + "projectId": "Your BigQuery project Id", + "datasetName": "Your Bigquery DataSet name", + "tableName": "Your Bigquery Table name", + "credentialJsonString": "Public key certificate you created above" + }' +``` + +The `--source-config` is the minimum necessary configuration for starting this connector, and it is a JSON string. You need to substitute the relevant parameters with your own. +If you want to configure more parameters, see [Configuration Properties](#configuration-properties) for reference. + +{% callout title="Note" type="note" %} +You can also choose to use a variety of other tools to create a connector: +- [pulsar-admin](https://pulsar.apache.org/docs/3.1.x/io-use/): The command arguments for `pulsar-admin` are similar to those of `pulsarctl`. You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector ). +- [RestAPI](https://pulsar.apache.org/source-rest-api/?version=3.1.1): You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). +- [Terraform](https://github.com/hashicorp/terraform): You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). +- [Function Mesh](https://functionmesh.io/docs/connectors/run-connector): The docker image can be found at the beginning of the document. +{% /callout %} + +### 3. 
Show data by Pulsar Consumer + +{% callout title="Note" type="note" %} +If your connector is created on StreamNative Cloud, you need to authenticate your clients. See [Build applications using Pulsar clients](https://docs.streamnative.io/docs/qs-connect#jumpstart-for-beginners) for more information. +{% /callout %} + +This connector will register the Google BigQuery table schema to Pulsar. You can use `AUTO_CONSUME` to consume the data. For example: + +```java + public static void main(String[] args) { + PulsarClient client = PulsarClient.builder() + .serviceUrl("{{Your Pulsar URL}}") + .build(); + + Consumer<GenericRecord> consumer = client.newConsumer(Schema.AUTO_CONSUME()) + .topic("{{The topic name that you specified when you created the connector}}") + .subscriptionName(subscription) + .subscriptionInitialPosition(SubscriptionInitialPosition.Earliest) + .subscribe(); + + for (int i = 0; i < 10; i++) { + Message<GenericRecord> message = consumer.receive(10, TimeUnit.SECONDS); + GenericRecord value = message.getValue(); + for (Field field : value.getFields()) { + Object fieldValue = value.getField(field); + System.out.print(field.getName() + ":" + fieldValue + " "); + } + System.out.println(); + consumer.acknowledge(message); + } + client.close(); + } + // output + // message:message-1 info:This is a message-1. + // message:message-2 info:This is a message-2. + // message:message-3 info:This is a message-3. + // message:message-4 info:This is a message-4. + // message:message-5 info:This is a message-5. + // message:message-6 info:This is a message-6. + // message:message-7 info:This is a message-7. + // message:message-8 info:This is a message-8. + // message:message-9 info:This is a message-9. + // message:message-10 info:This is a message-10. +``` + + +## Configuration Properties + +Before using the Google Cloud BigQuery source connector, you need to configure it. This table outlines the properties and the descriptions.
+ +| Name | Type | Required | Sensitive | Default | Description | +|-----------------------------|---------|----------|-----------|-------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `projectId` | String | Yes | false | "" (empty string) | The Google BigQuery project ID. | +| `datasetName` | String | Yes | false | "" (empty string) | The Google BigQuery dataset name. | +| `tableName` | String | Yes | false | "" (empty string) | The Google BigQuery table name. | +| `credentialJsonString` | String | No | true | "" (empty string) | The authentication JSON key. Set the environment variable `GOOGLE_APPLICATION_CREDENTIALS` to the path of the JSON file that contains your service account key when the `credentialJsonString` is set to an empty string. For details, see the [Google documentation](https://cloud.google.com/bigquery/docs/quickstarts/quickstart-client-libraries#before-you-begin). | +| `maxParallelism` | int | No | false | 1 | The maximum parallelism for reading. In fact, the number may be less if the BigQuery source connector deems the data small enough. | +| `forceUpdate` | Boolean | No | false | false | "if forceUpdate=true,a new session will be created. The connector will transmit the data again. | +| `queueSize` | int | No | false | 10000 | The buffer queue size of the source. It is used for storing records before they are sent to Pulsar topics. By default, it is set to `10000`. | +| `sql` | String | No | false | "" (empty string) | The SQL query on BigQuery. The computed result is saved in a temporary table. 
The temporary table has a configurable expiration time, and the BigQuery source connector automatically deletes the temporary table when the data is transferred completely. The `projectId` and `datasetName` get their values from the configuration file, and the `tableName` is generated by UUID. | +| `expirationTimeInMinutes` | int | No | false | 1440 | The expiration time in minutes until the table is expired and auto-deleted. | +| `selectedFields` | String | No | false | "" (empty string) | Names of the fields in the table that should be read. | +| `filters` | String | No | false | "" (empty string) | A list of clauses that can filter the result of the table. | +| `checkpointIntervalSeconds` | int | No | false | 60 | The checkpoint interval (in units of seconds). By default, it is set to 60s. | + diff --git a/connectors/google-cloud-storage-sink/v3.0.6.1/google-cloud-storage-sink.md b/connectors/google-cloud-storage-sink/v3.0.7.1/google-cloud-storage-sink.md similarity index 99% rename from connectors/google-cloud-storage-sink/v3.0.6.1/google-cloud-storage-sink.md rename to connectors/google-cloud-storage-sink/v3.0.7.1/google-cloud-storage-sink.md index 4524b064..fbca7a88 100644 --- a/connectors/google-cloud-storage-sink/v3.0.6.1/google-cloud-storage-sink.md +++ b/connectors/google-cloud-storage-sink/v3.0.7.1/google-cloud-storage-sink.md @@ -10,8 +10,8 @@ license_link: https://github.com/streamnative/pulsar-io-cloud-storage/blob/maste tags: alias: Google Cloud Storage Sink Connector features: ["Cloud Storage Connector integrates Apache Pulsar with cloud storage."] -icon: "/images/gcloud-storage-logo.svg" -download: https://api.github.com/repos/streamnative/pulsar-io-cloud-storage/tarball/refs/tags/v3.0.6.1 +icon: "/images/connectors/gcloud-storage-logo.svg" +download: https://api.github.com/repos/streamnative/pulsar-io-cloud-storage/tarball/refs/tags/v3.0.7.1 support: streamnative support_link: https://github.com/streamnative/pulsar-io-cloud-storage support_img:
"https://avatars.githubusercontent.com/u/44651383?v=4" @@ -25,7 +25,7 @@ id: "google-cloud-storage-sink" The [Google Cloud Storage](https://cloud.google.com/storage/docs) sink connector pulls data from Pulsar topics and persists data to Google Cloud Storage buckets. -![](https://raw.githubusercontent.com/streamnative/pulsar-io-cloud-storage/v3.0.6.1/docs/google-cloud-storage-sink.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-cloud-storage/v3.0.7.1/docs/google-cloud-storage-sink.png) ## Quick start diff --git a/connectors/google-cloud-storage-sink/v3.3.1.1/google-cloud-storage-sink.md b/connectors/google-cloud-storage-sink/v3.3.1.9/google-cloud-storage-sink.md similarity index 99% rename from connectors/google-cloud-storage-sink/v3.3.1.1/google-cloud-storage-sink.md rename to connectors/google-cloud-storage-sink/v3.3.1.9/google-cloud-storage-sink.md index 8eadb2d1..c25d0e87 100644 --- a/connectors/google-cloud-storage-sink/v3.3.1.1/google-cloud-storage-sink.md +++ b/connectors/google-cloud-storage-sink/v3.3.1.9/google-cloud-storage-sink.md @@ -10,8 +10,8 @@ license_link: https://github.com/streamnative/pulsar-io-cloud-storage/blob/maste tags: alias: Google Cloud Storage Sink Connector features: ["Cloud Storage Connector integrates Apache Pulsar with cloud storage."] -icon: "/images/gcloud-storage-logo.svg" -download: https://api.github.com/repos/streamnative/pulsar-io-cloud-storage/tarball/refs/tags/v3.3.1.1 +icon: "/images/connectors/gcloud-storage-logo.svg" +download: https://api.github.com/repos/streamnative/pulsar-io-cloud-storage/tarball/refs/tags/v3.3.1.9 support: streamnative support_link: https://github.com/streamnative/pulsar-io-cloud-storage support_img: "https://avatars.githubusercontent.com/u/44651383?v=4" @@ -25,7 +25,7 @@ id: "google-cloud-storage-sink" The [Google Cloud Storage](https://cloud.google.com/storage/docs) sink connector pulls data from Pulsar topics and persists data to Google Cloud Storage buckets. 
-![](https://raw.githubusercontent.com/streamnative/pulsar-io-cloud-storage/v3.3.1.1/docs/google-cloud-storage-sink.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-cloud-storage/v3.3.1.9/docs/google-cloud-storage-sink.png) ## Quick start diff --git a/connectors/google-cloud-storage-sink/v4.0.0.2/google-cloud-storage-sink.md b/connectors/google-cloud-storage-sink/v4.0.0.2/google-cloud-storage-sink.md new file mode 100644 index 00000000..99b847c3 --- /dev/null +++ b/connectors/google-cloud-storage-sink/v4.0.0.2/google-cloud-storage-sink.md @@ -0,0 +1,249 @@ +--- +description: Cloud Storage Connector integrates Apache Pulsar with cloud storage. +author: StreamNative +contributors: freeznet,jianyun8023,shibd,RobertIndie +language: Java,Shell,Dockerfile +document: +source: https://github.com/streamnative/pulsar-io-cloud-storage +license: Apache License 2.0 +license_link: https://github.com/streamnative/pulsar-io-cloud-storage/blob/master/LICENSE +tags: +alias: Google Cloud Storage Sink Connector +features: ["Cloud Storage Connector integrates Apache Pulsar with cloud storage."] +icon: "/images/connectors/gcloud-storage-logo.svg" +download: https://api.github.com/repos/streamnative/pulsar-io-cloud-storage/tarball/refs/tags/v4.0.0.2 +support: streamnative +support_link: https://github.com/streamnative/pulsar-io-cloud-storage +support_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +owner_name: "streamnative" +owner_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +dockerfile: https://hub.docker.com/r/streamnative/pulsar-io-cloud-storage +sn_available: "true" +id: "google-cloud-storage-sink" +--- + + +The [Google Cloud Storage](https://cloud.google.com/storage/docs) sink connector pulls data from Pulsar topics and persists data to Google Cloud Storage buckets. 
+ +![](https://raw.githubusercontent.com/streamnative/pulsar-io-cloud-storage/v4.0.0.2/docs/google-cloud-storage-sink.png) + +## Quick start + +### Prerequisites + +The prerequisites for connecting a Google Cloud Storage sink connector to external systems include: + +1. Create Cloud Storage buckets in Google Cloud. +2. Create the [Google cloud ServiceAccount](https://cloud.google.com/iam/docs/service-accounts-create) and create a public key certificate. +3. Create the [Google cloud Role](https://cloud.google.com/iam/docs/creating-custom-roles), and ensure the Google Cloud role has the following permissions: +```text +- storage.buckets.get +- storage.buckets.list +- storage.objects.create +``` +4. Grant the `ServiceAccount` the above `Role`. + + +### 1. Create a connector + +The following command shows how to use [pulsarctl](https://github.com/streamnative/pulsarctl) to create a `builtin` connector. If you want to create a `non-builtin` connector, +you need to replace `--sink-type cloud-storage-gcloud` with `--archive /path/to/pulsar-io-cloud-storage.nar`. You can find the button to download the `nar` package at the beginning of the document. + +{% callout title="For StreamNative Cloud User" type="note" %} +If you are a StreamNative Cloud user, you need to [set up your environment](https://docs.streamnative.io/docs/connector-setup) first. +{% /callout %} + +```bash +pulsarctl sinks create \ + --sink-type cloud-storage-gcloud \ + --name gcloud-storage-sink \ + --tenant public \ + --namespace default \ + --inputs "Your topic name" \ + --parallelism 1 \ + --sink-config \ + '{ + "gcsServiceAccountKeyFileContent": "Public key certificate you created above", + "provider": "google-cloud-storage", + "bucket": "Your bucket name", + "formatType": "json", + "partitionerType": "PARTITION" + }' +``` + +The `--sink-config` is the minimum necessary configuration for starting this connector, and it is a JSON string. You need to substitute the relevant parameters with your own.
+If you want to configure more parameters, see [Configuration Properties](#configuration-properties) for reference. + +{% callout title="Note" type="note" %} +You can also choose to use a variety of other tools to create a connector: +- [pulsar-admin](https://pulsar.apache.org/docs/3.1.x/io-use/): The command arguments for `pulsar-admin` are similar to those of `pulsarctl`. You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector ). +- [RestAPI](https://pulsar.apache.org/sink-rest-api/?version=3.1.1): You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). +- [Terraform](https://github.com/hashicorp/terraform): You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). +- [Function Mesh](https://functionmesh.io/docs/connectors/run-connector): The docker image can be found at the beginning of the document. +{% /callout %} + +### 2. Send messages to the topic + +{% callout title="Note" type="note" %} +If your connector is created on StreamNative Cloud, you need to authenticate your clients. See [Build applications using Pulsar clients](https://docs.streamnative.io/docs/qs-connect#jumpstart-for-beginners) for more information. +{% /callout %} + +``` java + public static void main(String[] args) throws Exception { + PulsarClient client = PulsarClient.builder() + .serviceUrl("{{Your Pulsar URL}}") + .build(); + + + Producer producer = client.newProducer(Schema.STRING) + .topic("{{Your topic name}}") + .create(); + + for (int i = 0; i < 10; i++) { + // JSON string containing a single character + String message = "{\"test-message\": \"test-value\"}"; + producer.send(message); + } + + producer.close(); + client.close(); + } +``` + +### 3. 
Display data on Google Cloud Storage console + +You can see the object at public/default/{{Your topic name}}-partition-0/xxxx.json on the Google Cloud Storage console. Download and open it, the content is: + +```text +{"test-message":"test-value"} +{"test-message":"test-value"} +{"test-message":"test-value"} +{"test-message":"test-value"} +{"test-message":"test-value"} +{"test-message":"test-value"} +{"test-message":"test-value"} +{"test-message":"test-value"} +{"test-message":"test-value"} +{"test-message":"test-value"} +``` + +## Configuration Properties + +Before using the Google Cloud Storage sink connector, you need to configure it. This table outlines the properties and the descriptions. + +| Name | Type | Required | Sensitive | Default | Description | +|-----------------------------------|---------|----------|-----------|--------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `provider` | String | True | false | null | The Cloud Storage type, google cloud storage only supports the `google-cloud-storage` provider. | +| `bucket` | String | True | false | null | The Cloud Storage bucket. | +| `formatType` | String | True | false | "json" | The data format type. Available options are `json`, `avro`, `bytes`, or `parquet`. By default, it is set to `json`. | +| `partitioner` | String | False | false | null | The partitioner for partitioning the resulting files. Available options are `topic`, `time` or `legacy`. By default, it's set to `legacy`. Please see [Partitioner](#partitioner) for more details. 
| +| `partitionerType` | String | False | false | null | The legacy partitioning type. It can be configured by topic partitions or by time. By default, the partition type is configured by topic partitions. It only works when the partitioner is set to `legacy`. | +| `gcsServiceAccountKeyFileContent` | String | False | true | "" | The contents of the JSON service key file. If empty, credentials are read from `gcsServiceAccountKeyFilePath` file. | +| `gcsServiceAccountKeyFilePath` | String | False | true | "" | Path to the GCS credentials file. If empty, the credentials file will be read from the `GOOGLE_APPLICATION_CREDENTIALS` environment variable. | +| `timePartitionPattern` | String | False | false | "yyyy-MM-dd" | The format pattern of the time-based partitioning. For details, refer to the Java date and time format. | +| `timePartitionDuration` | String | False | false | "86400000" | The time interval for time-based partitioning. Support formatted interval string, such as `30d`, `24h`, `30m`, `10s`, and also support number in milliseconds precision, such as `86400000` refers to `24h` or `1d`. | +| `partitionerUseIndexAsOffset` | Boolean | False | false | false | Whether to use the Pulsar's message index as offset or the record sequence. It's recommended if the incoming messages may be batched. The brokers may or not expose the index metadata and, if it's not present on the record, the sequence will be used. See [PIP-70](https://github.com/apache/pulsar/wiki/PIP-70%3A-Introduce-lightweight-broker-entry-metadata) for more details. | +| `batchSize` | int | False | false | 10 | The number of records submitted in batch. | +| `batchTimeMs` | long | False | false | 1000 | The interval for batch submission. | +| `maxBatchBytes` | long | False | false | 10000000 | The maximum number of bytes in a batch. | +| `batchModel` | Enum | False | false | BLEND | Determines how records are batched. Options: `BLEND`, `PARTITIONED`. 
The BLEND which combines all topic records into a single batch, optimizing for throughput, and PARTITIONED which batches records separately for each topic, maintaining topic-level separation. Note: When set to PARTITIONED, the connector will cache data up to the size of the number of subscribed topics multiplied by maxBatchBytes. This means you need to anticipate the connector's memory requirements in advance. | +| `pendingQueueSize` | int | False | false | 10 | The number of records buffered in queue. By default, it is equal to `batchSize`. You can set it manually. | +| `withMetadata` | Boolean | False | false | false | Save message attributes to metadata. | +| `includeTopicToMetadata` | Boolean | False | false | false | Include the topic name to the metadata. | +| `sliceTopicPartitionPath` | Boolean | False | false | false | When it is set to `true`, split the partitioned topic name into separate folders in the bucket path. | +| `useHumanReadableMessageId` | Boolean | False | false | false | Use a human-readable format string for messageId in message metadata. The messageId is in a format like `ledgerId:entryId:partitionIndex:batchIndex`. Otherwise, the messageId is a Hex-encoded string. | +| `withTopicPartitionNumber` | Boolean | False | false | true | When it is set to `true`, include the topic partition number to the object path. | +| `bytesFormatTypeSeparator` | String | False | false | "0x10" | It is inserted between records for the `formatType` of bytes. By default, it is set to '0x10'. An input record that contains the line separator looks like multiple records in the output object. | +| `useHumanReadableSchemaVersion` | Boolean | False | false | false | Use a human-readable format string for the schema version in the message metadata. If it is set to `true`, the schema version is in plain string format. Otherwise, the schema version is in hex-encoded string format. 
| +| `skipFailedMessages` | Boolean | False | false | false | Configure whether to skip a message which it fails to be processed. If it is set to `true`, the connector will skip the failed messages by `ack` it. Otherwise, the connector will `fail` the message. | +| `pathPrefix` | String | False | false | false | If it is set, the output files are stored in a folder under the given bucket path. The `pathPrefix` must be in the format of `xx/xxx/`. | +| `avroCodec` | String | False | false | snappy | Compression codec used when formatType=`avro`. Available compression types are: none (no compression), deflate, bzip2, xz, zstandard, snappy. | +| `parquetCodec` | String | False | false | gzip | Compression codec used when formatType=`parquet`. Available compression types are: none (no compression), snappy, gzip, lzo, brotli, lz4, zstd. | +| `jsonAllowNaN` | Boolean | False | false | false | Recognize 'NaN', 'INF', '-INF' as legal floating number values when formatType=`json`. Since JSON specification does not allow such values this is a non-standard feature and disabled by default. | + +## Advanced features + +### Data format types + +Cloud Storage Sink Connector provides multiple output format options, including JSON, Avro, Bytes, or Parquet. The default format is JSON. +With current implementation, there are some limitations for different formats: + +This table lists the Pulsar Schema types supported by the writers. + +| Pulsar Schema | Writer: Avro | Writer: JSON | Writer: Parquet | Writer: Bytes | +|----------------|--------------|--------------|-----------------|---------------| +| Primitive | ✗ | ✔ * | ✗ | ✔ | +| Avro | ✔ | ✔ | ✔ | ✔ | +| Json | ✔ | ✔ | ✔ | ✔ | +| Protobuf ** | ✔ | ✔ | ✔ | ✔ | +| ProtobufNative | ✔ *** | ✗ | ✔ | ✔ | + +> *: The JSON writer will try to convert the data with a `String` or `Bytes` schema to JSON-format data if convertable. +> +> **: The Protobuf schema is based on the Avro schema. 
It uses Avro as an intermediate format, so it may not provide the best effort conversion. +> +> ***: The ProtobufNative record holds the Protobuf descriptor and the message. When writing to Avro format, the connector uses [avro-protobuf](https://github.com/apache/avro/tree/master/lang/java/protobuf) to do the conversion. + +This table lists the support of `withMetadata` configurations for different writer formats: + +| Writer Format | `withMetadata` | +|---------------|----------------| +| Avro | ✔ | +| JSON | ✔ | +| Parquet | ✔ * | +| Bytes | ✗ | + +> *: When using `Parquet` with `PROTOBUF_NATIVE` format, the connector will write the messages with `DynamicMessage` format. When `withMetadata` is set to `true`, the connector will add `__message_metadata__` to the messages with `PulsarIOCSCProtobufMessageMetadata` format. +> +> For example, if a message `User` has the following schema: +> ```protobuf +> syntax = "proto3"; +> message User { +> string name = 1; +> int32 age = 2; +> } +> ``` +> +> When `withMetadata` is set to `true`, the connector will write the message `DynamicMessage` with the following schema: +> ```protobuf +> syntax = "proto3"; +> message PulsarIOCSCProtobufMessageMetadata { +> map<string, string> properties = 1; +> string schema_version = 2; +> string message_id = 3; +> } +> message User { +> string name = 1; +> int32 age = 2; +> PulsarIOCSCProtobufMessageMetadata __message_metadata__ = 3; +> } +> ``` +> + + +### Dead-letter topics + +To use a dead-letter topic, you need to set `skipFailedMessages` to `false`, and set `--max-redeliver-count` and `--dead-letter-topic` when you submit the connector with the `pulsar-admin` CLI tool. For more info about dead-letter topics, see the [Pulsar documentation](https://pulsar.apache.org/docs/en/concepts-messaging/#dead-letter-topic). +If a message fails to be sent to Cloud Storage and there is a dead-letter topic, the connector will send the message to the dead-letter topic.
+ +### Sink flushing only after batchTimeMs elapses + +There is a scenario where the sink is only flushing whenever the `batchTimeMs` has elapsed, even though there are many messages waiting to be processed. +The reason for this is that the sink will only acknowledge messages after they are flushed to cloud storage but the broker stops sending messages when it reaches a certain limit of unacknowledged messages. +If this limit is lower or close to `batchSize`, the sink never receives enough messages to trigger a flush based on the amount of messages. +In this case please ensure the `maxUnackedMessagesPerConsumer` set in the broker configuration is sufficiently larger than the `batchSize` setting of the sink. + +### Partitioner Type + +There are two types of partitioner: + +- **PARTITION**: This is the default partitioning method based on Pulsar partitions. In other words, data is + partitioned according to the pre-existing partitions in Pulsar topics. For instance, a message for the + topic `public/default/my-topic-partition-0` would be directed to the + file `public/default/my-topic-partition-0/xxx.json`, where `xxx` signifies the earliest message offset in this file. + +- **TIME**: Data is partitioned according to the time it was flushed. Using the previous message as an + example, if it was received on 2023-12-20, it would be directed + to `public/default/my-topic-partition-0/2023-12-20/xxx.json`, where `xxx` also denotes the earliest message offset in + this file. 
+ diff --git a/connectors/google-pubsub-sink/v3.0.6.1/google-pubsub-sink.md b/connectors/google-pubsub-sink/v3.0.7.1/google-pubsub-sink.md similarity index 98% rename from connectors/google-pubsub-sink/v3.0.6.1/google-pubsub-sink.md rename to connectors/google-pubsub-sink/v3.0.7.1/google-pubsub-sink.md index 2b00fe42..4a7c01f6 100644 --- a/connectors/google-pubsub-sink/v3.0.6.1/google-pubsub-sink.md +++ b/connectors/google-pubsub-sink/v3.0.7.1/google-pubsub-sink.md @@ -1,7 +1,7 @@ --- description: The Google Pub/Sub sink connector is used to write messages from Apache Pulsar topics to Google Cloud Pub/Sub. author: StreamNative -contributors: shibd,nodece,Huanli-Meng,nicoloboschi +contributors: shibd,nodece,Huanli-Meng,nlu90 language: Java,Shell,Dockerfile document: source: Private source @@ -25,7 +25,7 @@ id: "google-pubsub-sink" The [Google Cloud PubSub](https://cloud.google.com/pubsub) sink connector pulls data from Pulsar topics and persists data to Google Cloud PubSub tables. -![](https://raw.githubusercontent.com/streamnative/pulsar-io-google-pubsub/v3.0.6.1/docs/google-pubsub-sink.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-google-pubsub/v3.0.7.1/docs/google-pubsub-sink.png) ## Quick start diff --git a/connectors/google-pubsub-sink/v3.3.1.1/google-pubsub-sink.md b/connectors/google-pubsub-sink/v3.3.1.9/google-pubsub-sink.md similarity index 98% rename from connectors/google-pubsub-sink/v3.3.1.1/google-pubsub-sink.md rename to connectors/google-pubsub-sink/v3.3.1.9/google-pubsub-sink.md index 3f1d5234..76644fee 100644 --- a/connectors/google-pubsub-sink/v3.3.1.1/google-pubsub-sink.md +++ b/connectors/google-pubsub-sink/v3.3.1.9/google-pubsub-sink.md @@ -1,7 +1,7 @@ --- description: The Google Pub/Sub sink connector is used to write messages from Apache Pulsar topics to Google Cloud Pub/Sub. 
author: StreamNative -contributors: shibd,nodece,Huanli-Meng,nicoloboschi +contributors: shibd,nodece,Huanli-Meng,nlu90 language: Java,Shell,Dockerfile document: source: Private source @@ -25,7 +25,7 @@ id: "google-pubsub-sink" The [Google Cloud PubSub](https://cloud.google.com/pubsub) sink connector pulls data from Pulsar topics and persists data to Google Cloud PubSub tables. -![](https://raw.githubusercontent.com/streamnative/pulsar-io-google-pubsub/v3.3.1.1/docs/google-pubsub-sink.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-google-pubsub/v3.3.1.9/docs/google-pubsub-sink.png) ## Quick start diff --git a/connectors/google-pubsub-sink/v4.0.0.2/google-pubsub-sink.md b/connectors/google-pubsub-sink/v4.0.0.2/google-pubsub-sink.md new file mode 100644 index 00000000..be122e68 --- /dev/null +++ b/connectors/google-pubsub-sink/v4.0.0.2/google-pubsub-sink.md @@ -0,0 +1,148 @@ +--- +description: The Google Pub/Sub sink connector is used to write messages from Apache Pulsar topics to Google Cloud Pub/Sub. +author: StreamNative +contributors: shibd,nodece,Huanli-Meng,nlu90 +language: Java,Shell,Dockerfile +document: +source: Private source +license: StreamNative, Inc.. All Rights Reserved +license_link: +tags: +alias: Google Cloud PubSub Sink Connector +features: ["The Google Pub/Sub sink connector is used to write messages from Apache Pulsar topics to Google Cloud Pub/Sub."] +icon: "/images/connectors/google-pubsub.svg" +download: +support: streamnative +support_link: https://streamnative.io +support_img: "/images/connectors/streamnative.png" +owner_name: "streamnative" +owner_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +dockerfile: +sn_available: "" +id: "google-pubsub-sink" +--- + + +The [Google Cloud PubSub](https://cloud.google.com/pubsub) sink connector pulls data from Pulsar topics and persists data to Google Cloud PubSub topics.
+ +![](https://raw.githubusercontent.com/streamnative/pulsar-io-google-pubsub/v4.0.0.2/docs/google-pubsub-sink.png) + +## Quick start + +### Prerequisites + +The prerequisites for connecting a Google PubSub sink connector to external systems include: + +1. Create Google PubSub Topic in Google Cloud. +2. Create the [Gcloud ServiceAccount](https://cloud.google.com/iam/docs/service-accounts-create) and create a public key certificate. +3. Create the [Gcloud Role](https://cloud.google.com/iam/docs/creating-custom-roles), ensure the Google Cloud role has the following permissions: +```text +- pubsub.topics.create +- pubsub.topics.get +- pubsub.topics.publish +``` +4. Grant the service account the above role permissions. + +### 1. Create a connector + +The following command shows how to use [pulsarctl](https://github.com/streamnative/pulsarctl) to create a `builtin` connector. If you want to create a `non-builtin` connector, +you need to replace `--sink-type google-pubsub` with `--archive /path/to/pulsar-io-google-pubsub.nar`. You can find the button to download the `nar` package at the beginning of the document. + +{% callout title="For StreamNative Cloud User" type="note" %} +If you are a StreamNative Cloud user, you need to [set up your environment](https://docs.streamnative.io/docs/connector-setup) first. +{% /callout %} + +```bash +pulsarctl sinks create \ + --sink-type google-pubsub \ + --name pubsub-sink \ + --tenant public \ + --namespace default \ + --inputs "Your topic name" \ + --parallelism 1 \ + --sink-config \ + '{ + "pubsubProjectId": "Your google pubsub project Id", + "pubsubTopicId": "Your google pubsub Topic name", + "pubsubCredential": "The escaped and compressed public key certificate you created above" + }' +``` +The `--sink-config` is the minimum necessary configuration for starting this connector, and it is a JSON string. You need to substitute the relevant parameters with your own.
+If you want to configure more parameters, see [Configuration Properties](#configuration-properties) for reference. + +{% callout title="Note" type="note" %} +You can also choose to use a variety of other tools to create a connector: +- [pulsar-admin](https://pulsar.apache.org/docs/3.1.x/io-use/): The command arguments for `pulsar-admin` are similar to those of `pulsarctl`. You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector ). +- [RestAPI](https://pulsar.apache.org/sink-rest-api/?version=3.1.1): You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). +- [Terraform](https://github.com/hashicorp/terraform): You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). +- [Function Mesh](https://functionmesh.io/docs/connectors/run-connector): The docker image can be found at the beginning of the document. +{% /callout %} + +### 2. Send messages to the topic + +{% callout title="Note" type="note" %} +If your connector is created on StreamNative Cloud, you need to authenticate your clients. See [Build applications using Pulsar clients](https://docs.streamnative.io/docs/qs-connect#jumpstart-for-beginners) for more information. +{% /callout %} + +``` java +public class TestProduce { + + public static void main(String[] args) { + PulsarClient client = PulsarClient.builder() + .serviceUrl("{{Your Pulsar URL}}") + .build(); + + Producer producer = client.newProducer() + .topic("{{Your topic name}}") + .create(); + + for (int i = 0; i < 10; i++) { + String message = "my-message-" + i; + MessageId msgID = producer.send(message.getBytes()); + System.out.println("Publish " + "my-message-" + i + + " and message ID " + msgID); + } + + producer.close(); + client.close(); + } +} + +``` + +### 3. 
Show data on Google PubSub + +You can create a subscription and pull data from the Google Pub/Sub console. + +```text ++---------------------------+-----------------+------------------| +| Publish time | Attribute keys | Message body | ++---------------------------+-----------------+------------------| +| Feb 19, 2024, 4:17:42 PM | - | my-message-0 | +| Feb 19, 2024, 4:17:42 PM | - | my-message-1 | +| Feb 19, 2024, 4:17:42 PM | - | my-message-2 | +| Feb 19, 2024, 4:17:42 PM | - | my-message-3 | +| Feb 19, 2024, 4:17:43 PM | - | my-message-4 | +| Feb 19, 2024, 4:17:43 PM | - | my-message-5 | +| Feb 19, 2024, 4:17:43 PM | - | my-message-6 | +| Feb 19, 2024, 4:17:43 PM | - | my-message-7 | +| Feb 19, 2024, 4:17:44 PM | - | my-message-8 | +| Feb 19, 2024, 4:17:44 PM | - | my-message-9 | ++---------------------------+-----------------+------------------| +``` + +## Configuration Properties + +Before using the Google Cloud PubSub sink connector, you need to configure it. This table outlines the properties and the descriptions. + +| Name | Type | Required | Sensitive | Default | Description | +|--------------------------|--------|----------|-----------|--------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `pubsubCredential` | String | true | true | "" (empty string) | The credential (JSON string) for accessing the Google Cloud. It needs to be compressed and escaped before use. | +| `pubsubProjectId` | String | true | false | "" (empty string) | The Google Cloud project ID. | +| `pubsubTopicId` | String | true | false | "" (empty string) | The topic ID. It is used to read messages from or write messages to Google Cloud Pub/Sub topics. | +| `pubsubSchemaId` | String | false | false | "" (empty string) | The schema ID. You must set the schema ID when creating a schema for Google Cloud Pub/Sub topics.
| +| `pubsubSchemaType` | String | false | false | "" (empty string) | The schema type. You must set the schema type when creating a schema for Google Cloud Pub/Sub topics. Currently, only the AVRO format is supported. | +| `pubsubSchemaEncoding` | String | false | false | "" (empty string) | The encoding of the schema. You must set the schema encoding when creating a schema for Google Cloud Pub/Sub topics. Currently, only the JSON format is supported. | +| `pubsubSchemaDefinition` | String | false | false | "" (empty string) | The definition of the schema. It is used to create a schema to or parse messages from Google Cloud Pub/Sub topics. | + + diff --git a/connectors/google-pubsub-source/v3.3.1.1/google-pubsub-source.md b/connectors/google-pubsub-source/v3.0.7.1/google-pubsub-source.md similarity index 98% rename from connectors/google-pubsub-source/v3.3.1.1/google-pubsub-source.md rename to connectors/google-pubsub-source/v3.0.7.1/google-pubsub-source.md index f14aa6d0..b7385a72 100644 --- a/connectors/google-pubsub-source/v3.3.1.1/google-pubsub-source.md +++ b/connectors/google-pubsub-source/v3.0.7.1/google-pubsub-source.md @@ -1,7 +1,7 @@ --- description: The Google Pub/Sub sink connector allows you to write messages from Google Pub/Sub to Apache Pulsar. author: StreamNative -contributors: shibd,nodece,Huanli-Meng,nicoloboschi +contributors: shibd,nodece,Huanli-Meng,nlu90 language: Java,Shell,Dockerfile document: source: Private source @@ -25,7 +25,7 @@ id: "google-pubsub-source" The [Google Cloud Pub/Sub](https://cloud.google.com/pubsub) source connector feeds data from Google Cloud Pub/Sub topics and writes data to Pulsar topics. 
-![](https://raw.githubusercontent.com/streamnative/pulsar-io-google-pubsub/v3.3.1.1/docs/google-pubsub-source.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-google-pubsub/v3.0.7.1/docs/google-pubsub-source.png) ## Quick start diff --git a/connectors/google-pubsub-source/v3.0.6.1/google-pubsub-source.md b/connectors/google-pubsub-source/v3.3.1.9/google-pubsub-source.md similarity index 98% rename from connectors/google-pubsub-source/v3.0.6.1/google-pubsub-source.md rename to connectors/google-pubsub-source/v3.3.1.9/google-pubsub-source.md index 38d9c398..43145caa 100644 --- a/connectors/google-pubsub-source/v3.0.6.1/google-pubsub-source.md +++ b/connectors/google-pubsub-source/v3.3.1.9/google-pubsub-source.md @@ -1,7 +1,7 @@ --- description: The Google Pub/Sub sink connector allows you to write messages from Google Pub/Sub to Apache Pulsar. author: StreamNative -contributors: shibd,nodece,Huanli-Meng,nicoloboschi +contributors: shibd,nodece,Huanli-Meng,nlu90 language: Java,Shell,Dockerfile document: source: Private source @@ -25,7 +25,7 @@ id: "google-pubsub-source" The [Google Cloud Pub/Sub](https://cloud.google.com/pubsub) source connector feeds data from Google Cloud Pub/Sub topics and writes data to Pulsar topics. -![](https://raw.githubusercontent.com/streamnative/pulsar-io-google-pubsub/v3.0.6.1/docs/google-pubsub-source.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-google-pubsub/v3.3.1.9/docs/google-pubsub-source.png) ## Quick start diff --git a/connectors/google-pubsub-source/v4.0.0.2/google-pubsub-source.md b/connectors/google-pubsub-source/v4.0.0.2/google-pubsub-source.md new file mode 100644 index 00000000..fb1ccaf6 --- /dev/null +++ b/connectors/google-pubsub-source/v4.0.0.2/google-pubsub-source.md @@ -0,0 +1,155 @@ +--- +description: The Google Pub/Sub source connector allows you to write messages from Google Pub/Sub to Apache Pulsar.
+author: StreamNative +contributors: shibd,nodece,Huanli-Meng,nlu90 +language: Java,Shell,Dockerfile +document: +source: Private source +license: StreamNative, Inc.. All Rights Reserved +license_link: +tags: +alias: Google Cloud Pub/Sub Source Connector +features: ["The Google Pub/Sub source connector allows you to write messages from Google Pub/Sub to Apache Pulsar."] +icon: "/images/connectors/google-pubsub.svg" +download: +support: streamnative +support_link: https://streamnative.io +support_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +owner_name: "streamnative" +owner_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +dockerfile: +sn_available: "" +id: "google-pubsub-source" +--- + + +The [Google Cloud Pub/Sub](https://cloud.google.com/pubsub) source connector feeds data from Google Cloud Pub/Sub topics and writes data to Pulsar topics. + +![](https://raw.githubusercontent.com/streamnative/pulsar-io-google-pubsub/v4.0.0.2/docs/google-pubsub-source.png) + +## Quick start + +### Prerequisites + +The prerequisites for connecting a Google PubSub source connector to external systems include: + +1. Create Google PubSub Topic in Google Cloud. +2. Create the [Gcloud ServiceAccount](https://cloud.google.com/iam/docs/service-accounts-create) and create a public key certificate. +3. Create the [Gcloud Role](https://cloud.google.com/iam/docs/creating-custom-roles), ensure the Google Cloud role has the following permissions: +```text +- pubsub.subscriptions.consume +- pubsub.subscriptions.create +- pubsub.subscriptions.get +- pubsub.subscriptions.update +- pubsub.topics.attachSubscription +``` +4. Grant the service account the above role permissions. + +### 1. Create a connector + +The following command shows how to use [pulsarctl](https://github.com/streamnative/pulsarctl) to create a `builtin` connector.
If you want to create a `non-builtin` connector, +you need to replace `--source-type google-pubsub` with `--archive /path/to/pulsar-io-google-pubsub.nar`. You can find the button to download the `nar` package at the beginning of the document. + +{% callout title="For StreamNative Cloud User" type="note" %} +If you are a StreamNative Cloud user, you need [set up your environment](https://docs.streamnative.io/docs/connector-setup) first. +{% /callout %} + +```bash +pulsarctl sources create \ + --source-type google-pubsub \ + --name pubsub-source \ + --tenant public \ + --namespace default \ + --destination-topic-name "Your topic name" \ + --parallelism 1 \ + --source-config \ + '{ + "pubsubProjectId": "Your google pubsub project Id", + "pubsubTopicId": "Your google pubsub Topic name", + "pubsubCredential": "The escaped and compressed public key certificate you created above" + }' +``` + +The `--source-config` is the minimum necessary configuration for starting this connector, and it is a JSON string. You need to substitute the relevant parameters with your own. +If you want to configure more parameters, see [Configuration Properties](#configuration-properties) for reference. + +{% callout title="Note" type="note" %} +You can also choose to use a variety of other tools to create a connector: +- [pulsar-admin](https://pulsar.apache.org/docs/3.1.x/io-use/): The command arguments for `pulsar-admin` are similar to those of `pulsarctl`. You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector ). +- [RestAPI](https://pulsar.apache.org/source-rest-api/?version=3.1.1): You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). +- [Terraform](https://github.com/hashicorp/terraform): You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). 
+- [Function Mesh](https://functionmesh.io/docs/connectors/run-connector): The docker image can be found at the beginning of the document. +{% /callout %} + +### 2. Write data to Google PubSub topic + +Send some messages to the Google Cloud PubSub using the [gcloud CLI tool](https://cloud.google.com/sdk/docs/install) + +```shell +gcloud pubsub topics publish {{Your PubSub Topic Name}} --message="my-message-0" +gcloud pubsub topics publish {{Your PubSub Topic Name}} --message="my-message-1" +gcloud pubsub topics publish {{Your PubSub Topic Name}} --message="my-message-2" +gcloud pubsub topics publish {{Your PubSub Topic Name}} --message="my-message-3" +gcloud pubsub topics publish {{Your PubSub Topic Name}} --message="my-message-4" +gcloud pubsub topics publish {{Your PubSub Topic Name}} --message="my-message-5" +gcloud pubsub topics publish {{Your PubSub Topic Name}} --message="my-message-6" +gcloud pubsub topics publish {{Your PubSub Topic Name}} --message="my-message-7" +gcloud pubsub topics publish {{Your PubSub Topic Name}} --message="my-message-8" +gcloud pubsub topics publish {{Your PubSub Topic Name}} --message="my-message-9" +``` + +### 3. Show data by Pulsar Consumer + +{% callout title="Note" type="note" %} +If your connector is created on StreamNative Cloud, you need to authenticate your clients. See [Build applications using Pulsar clients](https://docs.streamnative.io/docs/qs-connect#jumpstart-for-beginners) for more information. 
+{% /callout %} + +```java + public static void main(String[] args) { + PulsarClient client = PulsarClient.builder() + .serviceUrl("{{Your Pulsar URL}}") + .build(); + + Consumer consumer = client.newConsumer(Schema.AUTO_CONSUME()) + .topic("{{The topic name that you specified when you created the connector}}") + .subscriptionName(subscription) + .subscriptionInitialPosition(SubscriptionInitialPosition.Earliest) + .subscribe(); + Consumer consumer = client.newConsumer() + .topic("{{The topic name that you specified when you created the connector}}") + .subscriptionName("test-sub") + .subscriptionInitialPosition(SubscriptionInitialPosition.Earliest) + .subscribe(); + + for (int i = 0; i < 10; i++) { + Message msg = consumer.receive(); + consumer.acknowledge(msg); + System.out.println("Receive message " + new String(msg.getData())); + } + client.close(); + } + // output + // Receive message my-message-0 + // Receive message my-message-1 + // Receive message my-message-2 + // Receive message my-message-3 + // Receive message my-message-4 + // Receive message my-message-5 + // Receive message my-message-6 + // Receive message my-message-7 + // Receive message my-message-8 + // Receive message my-message-9 +``` + + +## Configuration Properties + +Before using the Google PubSub source connector, you need to configure it. This table outlines the properties and the descriptions. + +| Name | Type | Required | Sensitive | Default | Description | +|--------------------------|--------|----------|-----------|--------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `pubsubCredential` | String | true | true | "" (empty string) | The credential (JSON string) for accessing the Google Cloud. It needs to be compressed and escaping before use. | +| `pubsubProjectId` | String | true | false | "" (empty string) | The Google Cloud project ID. 
| +| `pubsubTopicId` | String | true | false | " " (empty string) | The topic ID. It is used to read messages from or write messages to Google Cloud Pub/Sub topics. | + + diff --git a/connectors/lakehouse-sink/v3.3.1.1/lakehouse-sink.md b/connectors/lakehouse-sink/v3.0.7.1/lakehouse-sink.md similarity index 96% rename from connectors/lakehouse-sink/v3.3.1.1/lakehouse-sink.md rename to connectors/lakehouse-sink/v3.0.7.1/lakehouse-sink.md index b71ab0c3..cafa69a9 100644 --- a/connectors/lakehouse-sink/v3.3.1.1/lakehouse-sink.md +++ b/connectors/lakehouse-sink/v3.0.7.1/lakehouse-sink.md @@ -10,8 +10,8 @@ license_link: https://github.com/streamnative/pulsar-io-lakehouse/blob/master/LI tags: alias: Lakehouse Sink Connector features: ["pulsar lakehouse connector"] -icon: "/images/pulsar-hub.svg" -download: https://api.github.com/repos/streamnative/pulsar-io-lakehouse/tarball/refs/tags/v3.3.1.1 +icon: "/images/streamnative.png" +download: https://api.github.com/repos/streamnative/pulsar-io-lakehouse/tarball/refs/tags/v3.0.7.1 support: streamnative support_link: https://github.com/streamnative/pulsar-io-lakehouse support_img: "https://avatars.githubusercontent.com/u/44651383?v=4" @@ -25,7 +25,7 @@ id: "lakehouse-sink" The Lakehouse sink connector (including the [Hudi](https://hudi.apache.org), [Iceberg](https://iceberg.apache.org/), and [Delta Lake](https://delta.io/) sink connectors) fetches data from a Pulsar topic and saves data to the Lakehouse tables. -![](https://raw.githubusercontent.com/streamnative/pulsar-io-lakehouse/v3.3.1.1/docs/lakehouse-sink.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-lakehouse/v3.0.7.1/docs/lakehouse-sink.png) # How to get @@ -62,7 +62,7 @@ To build the Lakehouse sink connector from the source code, follow these steps. 
```bash ls target - pulsar-io-lakehouse-3.3.1.1.nar + pulsar-io-lakehouse-3.0.7.1.nar ``` # How to configure @@ -147,7 +147,7 @@ You can create a configuration file (JSON or YAML) to set the properties if you "inputs": [ "test-hudi-pulsar" ], - "archive": "connectors/pulsar-io-hudi-3.3.1.1.nar", + "archive": "connectors/pulsar-io-hudi-3.0.7.1.nar", "processingGuarantees": "EFFECTIVELY_ONCE", "parallelism": 1, "configs": { @@ -171,7 +171,7 @@ You can create a configuration file (JSON or YAML) to set the properties if you "inputs": [ "test-hudi-pulsar" ], - "archive": "connectors/pulsar-io-hudi-3.3.1.1-cloud.nar", + "archive": "connectors/pulsar-io-hudi-3.0.7.1-cloud.nar", "parallelism": 1, "processingGuarantees": "EFFECTIVELY_ONCE", "configs": { @@ -199,7 +199,7 @@ You can create a configuration file (JSON or YAML) to set the properties if you "inputs": [ "test-iceberg-pulsar" ], - "archive": "connectors/pulsar-io-lakehouse-3.3.1.1.nar", + "archive": "connectors/pulsar-io-lakehouse-3.0.7.1.nar", "processingGuarantees":"EFFECTIVELY_ONCE", "configs":{ "type":"iceberg", @@ -227,7 +227,7 @@ You can create a configuration file (JSON or YAML) to set the properties if you "inputs": [ "test-iceberg-pulsar" ], - "archive": "connectors/pulsar-io-lakehouse-3.3.1.1-cloud.nar", + "archive": "connectors/pulsar-io-lakehouse-3.0.7.1-cloud.nar", "processingGuarantees":"EFFECTIVELY_ONCE", "configs":{ "type":"iceberg", @@ -258,7 +258,7 @@ You can create a configuration file (JSON or YAML) to set the properties if you "inputs": [ "test-delta-pulsar" ], - "archive": "connectors/pulsar-io-lakehouse-3.3.1.1.nar", + "archive": "connectors/pulsar-io-lakehouse-3.0.7.1.nar", "processingGuarantees":"EFFECTIVELY_ONCE", "configs":{ "type":"delta", @@ -280,7 +280,7 @@ You can create a configuration file (JSON or YAML) to set the properties if you "inputs": [ "test-delta-pulsar" ], - "archive": "connectors/pulsar-io-lakehouse-3.3.1.1-cloud.nar", + "archive": 
"connectors/pulsar-io-lakehouse-3.0.7.1-cloud.nar", "processingGuarantees":"EFFECTIVELY_ONCE", "configs":{ "type":"delta", @@ -350,7 +350,7 @@ This example describes how to use the Lakehouse sink connector to fetch data fro 1. Copy the NAR package to the Pulsar connectors directory. ``` - cp pulsar-io-lakehouse-3.3.1.1.nar PULSAR_HOME/connectors/pulsar-io-lakehouse-3.3.1.1.nar + cp pulsar-io-lakehouse-3.0.7.1.nar PULSAR_HOME/connectors/pulsar-io-lakehouse-3.0.7.1.nar ``` 2. Start Pulsar in standalone mode. @@ -383,7 +383,7 @@ This example explains how to create a Lakehouse sink connector in an on-premises 1. Copy the NAR package of the Lakehouse sink connector to the Pulsar connectors directory. ```bash - cp pulsar-io-lakehouse-3.3.1.1.nar $PULSAR_HOME/connectors/pulsar-io-lakehouse-3.3.1.1.nar + cp pulsar-io-lakehouse-3.0.7.1.nar $PULSAR_HOME/connectors/pulsar-io-lakehouse-3.0.7.1.nar ``` 2. Reload all [built-in connectors](https://pulsar.apache.org/docs/en/next/io-connectors/). diff --git a/connectors/lakehouse-sink/v3.0.6.1/lakehouse-sink.md b/connectors/lakehouse-sink/v3.3.1.9/lakehouse-sink.md similarity index 96% rename from connectors/lakehouse-sink/v3.0.6.1/lakehouse-sink.md rename to connectors/lakehouse-sink/v3.3.1.9/lakehouse-sink.md index c29e0c1f..3e8d4b47 100644 --- a/connectors/lakehouse-sink/v3.0.6.1/lakehouse-sink.md +++ b/connectors/lakehouse-sink/v3.3.1.9/lakehouse-sink.md @@ -10,8 +10,8 @@ license_link: https://github.com/streamnative/pulsar-io-lakehouse/blob/master/LI tags: alias: Lakehouse Sink Connector features: ["pulsar lakehouse connector"] -icon: "/images/pulsar-hub.svg" -download: https://api.github.com/repos/streamnative/pulsar-io-lakehouse/tarball/refs/tags/v3.0.6.1 +icon: "/images/streamnative.png" +download: https://api.github.com/repos/streamnative/pulsar-io-lakehouse/tarball/refs/tags/v3.3.1.9 support: streamnative support_link: https://github.com/streamnative/pulsar-io-lakehouse support_img: 
"https://avatars.githubusercontent.com/u/44651383?v=4" @@ -25,7 +25,7 @@ id: "lakehouse-sink" The Lakehouse sink connector (including the [Hudi](https://hudi.apache.org), [Iceberg](https://iceberg.apache.org/), and [Delta Lake](https://delta.io/) sink connectors) fetches data from a Pulsar topic and saves data to the Lakehouse tables. -![](https://raw.githubusercontent.com/streamnative/pulsar-io-lakehouse/v3.0.6.1/docs/lakehouse-sink.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-lakehouse/v3.3.1.9/docs/lakehouse-sink.png) # How to get @@ -62,7 +62,7 @@ To build the Lakehouse sink connector from the source code, follow these steps. ```bash ls target - pulsar-io-lakehouse-3.0.6.1.nar + pulsar-io-lakehouse-3.3.1.9.nar ``` # How to configure @@ -147,7 +147,7 @@ You can create a configuration file (JSON or YAML) to set the properties if you "inputs": [ "test-hudi-pulsar" ], - "archive": "connectors/pulsar-io-hudi-3.0.6.1.nar", + "archive": "connectors/pulsar-io-hudi-3.3.1.9.nar", "processingGuarantees": "EFFECTIVELY_ONCE", "parallelism": 1, "configs": { @@ -171,7 +171,7 @@ You can create a configuration file (JSON or YAML) to set the properties if you "inputs": [ "test-hudi-pulsar" ], - "archive": "connectors/pulsar-io-hudi-3.0.6.1-cloud.nar", + "archive": "connectors/pulsar-io-hudi-3.3.1.9-cloud.nar", "parallelism": 1, "processingGuarantees": "EFFECTIVELY_ONCE", "configs": { @@ -199,7 +199,7 @@ You can create a configuration file (JSON or YAML) to set the properties if you "inputs": [ "test-iceberg-pulsar" ], - "archive": "connectors/pulsar-io-lakehouse-3.0.6.1.nar", + "archive": "connectors/pulsar-io-lakehouse-3.3.1.9.nar", "processingGuarantees":"EFFECTIVELY_ONCE", "configs":{ "type":"iceberg", @@ -227,7 +227,7 @@ You can create a configuration file (JSON or YAML) to set the properties if you "inputs": [ "test-iceberg-pulsar" ], - "archive": "connectors/pulsar-io-lakehouse-3.0.6.1-cloud.nar", + "archive": 
"connectors/pulsar-io-lakehouse-3.3.1.9-cloud.nar", "processingGuarantees":"EFFECTIVELY_ONCE", "configs":{ "type":"iceberg", @@ -258,7 +258,7 @@ You can create a configuration file (JSON or YAML) to set the properties if you "inputs": [ "test-delta-pulsar" ], - "archive": "connectors/pulsar-io-lakehouse-3.0.6.1.nar", + "archive": "connectors/pulsar-io-lakehouse-3.3.1.9.nar", "processingGuarantees":"EFFECTIVELY_ONCE", "configs":{ "type":"delta", @@ -280,7 +280,7 @@ You can create a configuration file (JSON or YAML) to set the properties if you "inputs": [ "test-delta-pulsar" ], - "archive": "connectors/pulsar-io-lakehouse-3.0.6.1-cloud.nar", + "archive": "connectors/pulsar-io-lakehouse-3.3.1.9-cloud.nar", "processingGuarantees":"EFFECTIVELY_ONCE", "configs":{ "type":"delta", @@ -350,7 +350,7 @@ This example describes how to use the Lakehouse sink connector to fetch data fro 1. Copy the NAR package to the Pulsar connectors directory. ``` - cp pulsar-io-lakehouse-3.0.6.1.nar PULSAR_HOME/connectors/pulsar-io-lakehouse-3.0.6.1.nar + cp pulsar-io-lakehouse-3.3.1.9.nar PULSAR_HOME/connectors/pulsar-io-lakehouse-3.3.1.9.nar ``` 2. Start Pulsar in standalone mode. @@ -383,7 +383,7 @@ This example explains how to create a Lakehouse sink connector in an on-premises 1. Copy the NAR package of the Lakehouse sink connector to the Pulsar connectors directory. ```bash - cp pulsar-io-lakehouse-3.0.6.1.nar $PULSAR_HOME/connectors/pulsar-io-lakehouse-3.0.6.1.nar + cp pulsar-io-lakehouse-3.3.1.9.nar $PULSAR_HOME/connectors/pulsar-io-lakehouse-3.3.1.9.nar ``` 2. Reload all [built-in connectors](https://pulsar.apache.org/docs/en/next/io-connectors/). 
diff --git a/connectors/lakehouse-sink/v4.0.0.2/lakehouse-sink.md b/connectors/lakehouse-sink/v4.0.0.2/lakehouse-sink.md new file mode 100644 index 00000000..1dae2131 --- /dev/null +++ b/connectors/lakehouse-sink/v4.0.0.2/lakehouse-sink.md @@ -0,0 +1,424 @@ +--- +description: pulsar lakehouse connector +author: StreamNative +contributors: zymap,hangc0276,Huanli-Meng,horizonzy +language: Java,Shell,Dockerfile,Python +document: +source: https://github.com/streamnative/pulsar-io-lakehouse +license: Apache License 2.0 +license_link: https://github.com/streamnative/pulsar-io-lakehouse/blob/master/LICENSE +tags: +alias: Lakehouse Sink Connector +features: ["pulsar lakehouse connector"] +icon: "/images/streamnative.png" +download: https://api.github.com/repos/streamnative/pulsar-io-lakehouse/tarball/refs/tags/v4.0.0.2 +support: streamnative +support_link: https://github.com/streamnative/pulsar-io-lakehouse +support_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +owner_name: "streamnative" +owner_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +dockerfile: +sn_available: "" +id: "lakehouse-sink" +--- + + +The Lakehouse sink connector (including the [Hudi](https://hudi.apache.org), [Iceberg](https://iceberg.apache.org/), and [Delta Lake](https://delta.io/) sink connectors) fetches data from a Pulsar topic and saves data to the Lakehouse tables. + +![](https://raw.githubusercontent.com/streamnative/pulsar-io-lakehouse/v4.0.0.2/docs/lakehouse-sink.png) + +# How to get + +This section describes how to build the Lakehouse sink connector. + +You can get the Lakehouse sink connector using one of the following methods: + +- Download the NAR package from [the download page](https://github.com/streamnative/pulsar-io-lakehouse/releases). +- Build it from the source code. + +To build the Lakehouse sink connector from the source code, follow these steps. + +1. Clone the source code to your machine. 
+ + ```bash + git clone https://github.com/streamnative/pulsar-io-lakehouse.git + ``` + +2. Build the connector in the `pulsar-io-lakehouse` directory. + + - Build the NAR package for your local file system. + + ```bash + mvn clean install -DskipTests + ``` + + - Build the NAR package for your cloud storage (including AWS, GCS, and Azure related package dependencies). + + ```bash + mvn clean install -P cloud -DskipTests + ``` + + After the connector is successfully built, a NAR package is generated under the target directory. + + ```bash + ls target + pulsar-io-lakehouse-4.0.0.2.nar + ``` + +# How to configure + +Before using the Lakehouse sink connector, you need to configure it. This table lists the properties and the descriptions. + +::: tabs + +@@@ Hudi +For a list of Hudi configurations, see [Write Client Configs](https://hudi.apache.org/docs/configurations#WRITE_CLIENT). + +| Name | Type | Required | Default | Description +|--------------------------------------|----------|----------|---|-------------------------------------------------------------| +| `type` | String | true | N/A | The type of the Lakehouse sink connector. Available values: `hudi`, `iceberg`, and `delta`. | +| `maxCommitInterval` | Integer | false | 120 | The maximum flush interval (in units of seconds) for each batch. By default, it is set to 120s. | +| `maxRecordsPerCommit` | Integer | false | 10_000_000 | The maximum number of records for each batch to commit. By default, it is set to `10_000_000`. | +| `maxCommitFailedTimes` | Integer | false | 5 | The maximum commit failure times until failing the process. By default, it is set to `5`. | +| `sinkConnectorQueueSize` | Integer | false | 10_000 | The maximum queue size of the Lakehouse sink connector to buffer records before writing to Lakehouse tables. | +| `partitionColumns` | List | false | Collections.emptyList() | The partition columns for Lakehouse tables.
| +| `processingGuarantees` | String | true | "" (empty string) | The processing guarantees. Currently the Lakehouse connector only supports `EFFECTIVELY_ONCE`. | +| `hoodie.table.name` | String | true | N/A | The name of the Hudi table that Pulsar topic sinks data to. | +| `hoodie.table.type` | String | false | COPY_ON_WRITE | The type of the Hudi table of the underlying data for one write. It cannot be changed between writes. | +| `hoodie.base.path` | String | true | N/A | The base path of the lake storage where all table data is stored. It always has a specific prefix with the storage scheme (for example, hdfs://, s3:// etc). Hudi stores all the main metadata about commits, savepoints, cleaning audit logs etc in the `.hoodie` directory. | +| `hoodie.datasource.write.recordkey.field` | String | false | UUID | The record key field. It is used as the `recordKey` component of `HoodieKey`. You can obtain the value by invoking `.toString()` on the field value. You can use the dot notation for nested fields such as a.b.c. | +| `hoodie.datasource.write.partitionpath.field` | String | true | N/A | The partition path field. It is used as the `partitionPath` component of the `HoodieKey`. You can obtain the value by invoking `.toString()`. | +@@@ + +@@@ Iceberg +| Name | Type | Required | Default | Description +|--------------------------------------|----------|----------|---|-------------------------------------------------------------| +| `type` | String | true | N/A | The type of the Lakehouse sink connector. Available values: `hudi`, `iceberg`, and `delta`. | +| `maxCommitInterval` | Integer | false | 120 | The maximum flush interval (in units of seconds) for each batch. By default, it is set to 120s. | +| `maxRecordsPerCommit` | Integer | false | 10_000_000 | The maximum number of records for each batch to commit. By default, it is set to `10_000_000`. | +| `maxCommitFailedTimes` | Integer | false | 5 | The maximum commit failure times until failing the process.
By default, it is set to `5`. | +| `sinkConnectorQueueSize` | Integer | false | 10_000 | The maximum queue size of the Lakehouse sink connector to buffer records before writing to Lakehouse tables. | +| `partitionColumns` | List | false | Collections.emptyList() | The partition columns for Lakehouse tables. | +| `processingGuarantees` | String | true | "" (empty string) | The processing guarantees. Currently the Lakehouse connector only supports `EFFECTIVELY_ONCE`. | +| `catalogProperties` | Map | true | N/A | The properties of the Iceberg catalog. For details, see [Iceberg catalog properties](https://iceberg.apache.org/docs/latest/configuration/#catalog-properties). `catalog-impl` and `warehouse` configurations are required. Currently, Iceberg catalogs only support `hadoopCatalog` and `hiveCatalog`. | +| `tableProperties` | Map | false | N/A | The properties of the Iceberg table. For details, see [Iceberg table properties](https://iceberg.apache.org/docs/latest/configuration/#table-properties). | +| `catalogName` | String | false | icebergSinkConnector | The name of the Iceberg catalog. | +| `tableNamespace` | String | true | N/A | The namespace of the Iceberg table. | +| `tableName` | String | true | N/A | The name of the Iceberg table. | +@@@ + +@@@ Delta Lake +| Name | Type | Required | Default | Description +|--------------------------------------|----------|----------|---|-------------------------------------------------------------| +| `type` | String | true | N/A | The type of the Lakehouse sink connector. Available values: `hudi`, `iceberg`, and `delta`. | +| `maxCommitInterval` | Integer | false | 120 | The maximum flush interval (in units of seconds) for each batch. By default, it is set to 120s. | +| `maxRecordsPerCommit` | Integer | false | 10_000_000 | The maximum number of records for each batch to commit. By default, it is set to `10_000_000`.
| +| `maxCommitFailedTimes` | Integer | false | 5 | The maximum commit failure times until failing the process. By default, it is set to `5`. | +| `sinkConnectorQueueSize` | Integer | false | 10_000 | The maximum queue size of the Lakehouse sink connector to buffer records before writing to Lakehouse tables. | +| `partitionColumns` | List | false | Collections.emptyList() | The partition columns for Lakehouse tables. | +| `processingGuarantees` | Int | true | " " (empty string) | The processing guarantees. Currently the Lakehouse connector only supports `EFFECTIVELY_ONCE`. | +| `tablePath` | String | true | N/A | The path of the Delta table. | +| `compression` | String | false | SNAPPY | The compression type of the Delta Parquet file. By default, it is set to `SNAPPY`. | +| `deltaFileType` | String | false | parquet | The type of the Delta file. By default, it is set to `parquet`. | +| `appId` | String | false | pulsar-delta-sink-connector | The Delta APP ID. By default, it is set to `pulsar-delta-sink-connector`. | +@@@ + +::: + +> **Note** +> +> The Lakehouse sink connector uses the Hadoop file system to read and write data to and from cloud objects, such as AWS, GCS, and Azure. If you want to configure Hadoop related properties, you should use the prefix `hadoop.`. + +## Examples + +You can create a configuration file (JSON or YAML) to set the properties if you use [Pulsar Function Worker](https://pulsar.apache.org/docs/en/functions-worker/) to run connectors in a cluster. 
+ +::: tabs + +@@@ Hudi + +- The Hudi table that is stored in the file system + + ```json + { + "tenant": "public", + "namespace": "default", + "name": "hudi-sink", + "inputs": [ + "test-hudi-pulsar" + ], + "archive": "connectors/pulsar-io-hudi-4.0.0.2.nar", + "processingGuarantees": "EFFECTIVELY_ONCE", + "parallelism": 1, + "configs": { + "type": "hudi", + "hoodie.table.name": "hudi-connector-test", + "hoodie.table.type": "COPY_ON_WRITE", + "hoodie.base.path": "file:///tmp/data/hudi-sink", + "hoodie.datasource.write.recordkey.field": "id", + "hoodie.datasource.write.partitionpath.field": "id" + } + } + ``` + +- The Hudi table that is stored in AWS S3 + + ```json + { + "tenant": "public", + "namespace": "default", + "name": "hudi-sink", + "inputs": [ + "test-hudi-pulsar" + ], + "archive": "connectors/pulsar-io-hudi-4.0.0.2-cloud.nar", + "parallelism": 1, + "processingGuarantees": "EFFECTIVELY_ONCE", + "configs": { + "type": "hudi", + "hoodie.table.name": "hudi-connector-test", + "hoodie.table.type": "COPY_ON_WRITE", + "hoodie.base.path": "s3a://bucket/path/to/hudi", + "hoodie.datasource.write.recordkey.field": "id", + "hoodie.datasource.write.partitionpath.field": "id", + "hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain" + } + } + ``` +@@@ + +@@@ Iceberg +- The Iceberg table that is stored in the file system + + ```json + { + "tenant":"public", + "namespace":"default", + "name":"iceberg_sink", + "parallelism":2, + "inputs": [ + "test-iceberg-pulsar" + ], + "archive": "connectors/pulsar-io-lakehouse-4.0.0.2.nar", + "processingGuarantees":"EFFECTIVELY_ONCE", + "configs":{ + "type":"iceberg", + "maxCommitInterval":120, + "maxRecordsPerCommit":10000000, + "catalogName":"test_v1", + "tableNamespace":"iceberg_sink_test", + "tableName":"ice_sink_person", + "catalogProperties":{ + "warehouse":"file:///tmp/data/iceberg-sink", + "catalog-impl":"hadoopCatalog" + } + } + } + ``` + +- The Iceberg table that is stored in cloud 
storage (AWS S3, GCS, or Azure) + + ```json + { + "tenant":"public", + "namespace":"default", + "name":"iceberg_sink", + "parallelism":2, + "inputs": [ + "test-iceberg-pulsar" + ], + "archive": "connectors/pulsar-io-lakehouse-4.0.0.2-cloud.nar", + "processingGuarantees":"EFFECTIVELY_ONCE", + "configs":{ + "type":"iceberg", + "maxCommitInterval":120, + "maxRecordsPerCommit":10000000, + "catalogName":"test_v1", + "tableNamespace":"iceberg_sink_test", + "tableName":"ice_sink_person", + "hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain", + "catalogProperties":{ + "warehouse":"s3a://test-dev-us-west-2/lakehouse/iceberg_sink", + "catalog-impl":"hadoopCatalog" + } + } + } + ``` +@@@ + +@@@ Delta Lake +- The Delta table that is stored in the file system + + ```json + { + "tenant":"public", + "namespace":"default", + "name":"delta_sink", + "parallelism":1, + "inputs": [ + "test-delta-pulsar" + ], + "archive": "connectors/pulsar-io-lakehouse-4.0.0.2.nar", + "processingGuarantees":"EFFECTIVELY_ONCE", + "configs":{ + "type":"delta", + "maxCommitInterval":120, + "maxRecordsPerCommit":10000000, + "tablePath": "file:///tmp/data/delta-sink" + } + } + ``` + +- The Delta table that is stored in cloud storage (AWS S3, GCS, or Azure) + + ```json + { + "tenant":"public", + "namespace":"default", + "name":"delta_sink", + "parallelism":1, + "inputs": [ + "test-delta-pulsar" + ], + "archive": "connectors/pulsar-io-lakehouse-4.0.0.2-cloud.nar", + "processingGuarantees":"EFFECTIVELY_ONCE", + "configs":{ + "type":"delta", + "maxCommitInterval":120, + "maxRecordsPerCommit":10000000, + "tablePath": "s3a://test-dev-us-west-2/lakehouse/delta_sink", + "hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain" + } + } + ``` +@@@ + +::: + +## Data format types + +The Lakehouse sink connector provides multiple output format options, including Avro and Parquet. The default format is Parquet. 
+With the current implementation, there are some limitations for different formats: + +This table lists the Pulsar Schema types supported by the writers. + +| Pulsar Schema | Writer: Avro | Writer: Parquet | +|------------------|--------------|-----------------| +| Primitive | ✗ | ✗ | +| Avro | ✔ | ✔ | +| Json | ✔ | ✔ | +| Protobuf * | ✗ | ✗ | +| ProtobufNative * | ✗ | ✗ | + +> *: The Protobuf schema is based on the Avro schema. It uses Avro as an intermediate format, so it may not provide the best effort conversion. +> +> *: The ProtobufNative record holds the Protobuf descriptor and the message. When writing to Avro format, the connector uses [avro-protobuf](https://github.com/apache/avro/tree/master/lang/java/protobuf) to do the conversion. + +# How to use + +You can use the Lakehouse sink connector with Function Worker. You can use the Lakehouse sink connector as a non built-in connector or a built-in connector. + +::: tabs + +@@@ Use it as a non built-in connector + +If you already have a Pulsar cluster, you can use the Lakehouse sink connector as a non built-in connector directly. + +This example shows how to create a Lakehouse sink connector on a Pulsar cluster using the [`pulsar-admin sinks create`](http://pulsar.apache.org/tools/pulsar-admin/2.8.0-SNAPSHOT/#-em-create-em--24) command. + +``` +PULSAR_HOME/bin/pulsar-admin sinks create \ +--sink-config-file +``` + +@@@ + +@@@ Use it as a built-in connector + +You can make the Lakehouse sink connector as a built-in connector and use it on a standalone cluster or an on-premises cluster. + +## Standalone cluster + +This example describes how to use the Lakehouse sink connector to fetch data from Pulsar topics and save data to Lakehouse tables in standalone mode. + +### Prerequisites + +- Install Pulsar locally. For details, see [set up a standalone Pulsar locally](https://pulsar.apache.org/docs/en/standalone/#install-pulsar-using-binary-release). + +### Steps + +1. 
Copy the NAR package to the Pulsar connectors directory. + + ``` + cp pulsar-io-lakehouse-4.0.0.2.nar PULSAR_HOME/connectors/pulsar-io-lakehouse-4.0.0.2.nar + ``` + +2. Start Pulsar in standalone mode. + + ``` + PULSAR_HOME/bin/pulsar standalone + ``` + +3. Run the lakehouse sink connector locally. + + ``` + PULSAR_HOME/bin/pulsar-admin sink localrun \ + --sink-config-file + ``` + +4. Send messages to Pulsar topics. + + This example sends ten “hello” messages to the `test-lakehouse-pulsar` topic in the `default` namespace of the `public` tenant. + + ``` + PULSAR_HOME/bin/pulsar-client produce public/default/test-lakehouse-pulsar --messages hello -n 10 + ``` + +5. Query the data from the Lakehouse table. For details, see [Hudi Quickstart guide](https://hudi.apache.org/docs/quick-start-guide), [Iceberg Quickstart guide](https://iceberg.apache.org/docs/latest/getting-started/), and [Delta Quickstart guide](https://delta.io/learn/getting-started). + +## On-premises cluster + +This example explains how to create a Lakehouse sink connector in an on-premises cluster. + +1. Copy the NAR package of the Lakehouse sink connector to the Pulsar connectors directory. + + ```bash + cp pulsar-io-lakehouse-4.0.0.2.nar $PULSAR_HOME/connectors/pulsar-io-lakehouse-4.0.0.2.nar + ``` + +2. Reload all [built-in connectors](https://pulsar.apache.org/docs/en/next/io-connectors/). + + ```bash + PULSAR_HOME/bin/pulsar-admin sinks reload + ``` + +3. Check whether the Lakehouse sink connector is available on the list or not. + + ```bash + PULSAR_HOME/bin/pulsar-admin sinks available-sinks + ``` + +4. Create a Lakehouse sink connector on a Pulsar cluster using the [`pulsar-admin sinks create`](http://pulsar.apache.org/tools/pulsar-admin/2.8.0-SNAPSHOT/#-em-create-em--24) command. 
+ + ```bash + PULSAR_HOME/bin/pulsar-admin sinks create \ + --sink-config-file + ``` + +@@@ + +::: + +# Demos + +This table lists demos that show how to run the [Delta Lake](https://delta.io/), [Hudi](https://hudi.apache.org), and [Iceberg](https://iceberg.apache.org/) sink connectors with other external systems. + +Currently, only the demo on the Delta Lake sink connector is available. + +| Connector | Link | +|------------|----------------------------------------------------------------------------------------------------------------------------------| +| Delta Lake | For details, see the [Delta Lake demo](https://github.com/streamnative/pulsar-io-lakehouse/blob/master/docs/delta-lake-demo.md). | +| Hudi | | +| Iceberg | | + + diff --git a/connectors/lakehouse-source/v3.0.6.1/lakehouse-source.md b/connectors/lakehouse-source/v3.0.7.1/lakehouse-source.md similarity index 95% rename from connectors/lakehouse-source/v3.0.6.1/lakehouse-source.md rename to connectors/lakehouse-source/v3.0.7.1/lakehouse-source.md index 0e1f3108..6803625e 100644 --- a/connectors/lakehouse-source/v3.0.6.1/lakehouse-source.md +++ b/connectors/lakehouse-source/v3.0.7.1/lakehouse-source.md @@ -10,8 +10,8 @@ license_link: https://github.com/streamnative/pulsar-io-lakehouse/blob/master/LI tags: alias: Lakehouse Source Connector features: ["pulsar lakehouse connector"] -icon: "/images/pulsar-hub.svg" -download: https://api.github.com/repos/streamnative/pulsar-io-lakehouse/tarball/refs/tags/v3.0.6.1 +icon: "/images/streamnative.png" +download: https://api.github.com/repos/streamnative/pulsar-io-lakehouse/tarball/refs/tags/v3.0.7.1 support: streamnative support_link: https://github.com/streamnative/pulsar-io-lakehouse support_img: "https://avatars.githubusercontent.com/u/44651383?v=4" @@ -25,7 +25,7 @@ id: "lakehouse-source" The Lakehouse source connector (currently only including the [Delta Lake](https://delta.io/) source connector) fetches the Lakehouse table's changelog and saves changelogs 
into a Pulsar topic. -![](https://raw.githubusercontent.com/streamnative/pulsar-io-lakehouse/v3.0.6.1/docs/lakehouse-source.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-lakehouse/v3.0.7.1/docs/lakehouse-source.png) # How to get @@ -62,7 +62,7 @@ To build the Lakehouse source connector from the source code, follow these steps ```bash ls target - pulsar-io-lakehouse-3.0.6.1.nar + pulsar-io-lakehouse-3.0.7.1.nar ``` # How to configure @@ -110,7 +110,7 @@ You can create a configuration file (JSON or YAML) to set the properties if you "parallelism":1, "topicName": "delta_source", "processingGuarantees":"ATLEAST_ONCE", - "archive": "connectors/pulsar-io-lakehouse-3.0.6.1.nar", + "archive": "connectors/pulsar-io-lakehouse-3.0.7.1.nar", "configs":{ "type":"delta", "checkpointInterval": 180, @@ -135,7 +135,7 @@ You can create a configuration file (JSON or YAML) to set the properties if you "parallelism":1, "topicName": "delta_source", "processingGuarantees":"ATLEAST_ONCE", - "archive": "connectors/pulsar-io-lakehouse-3.0.6.1-cloud.nar", + "archive": "connectors/pulsar-io-lakehouse-3.0.7.1-cloud.nar", "configs":{ "type":"delta", "checkpointInterval": 180, @@ -195,7 +195,7 @@ This example describes how to use the Lakehouse source connector to fetch data f 1. Copy the NAR package to the Pulsar connectors directory. ``` - cp pulsar-io-lakehouse-3.0.6.1.nar PULSAR_HOME/connectors/pulsar-io-lakehouse-3.0.6.1.nar + cp pulsar-io-lakehouse-3.0.7.1.nar PULSAR_HOME/connectors/pulsar-io-lakehouse-3.0.7.1.nar ``` 2. Start Pulsar in standalone mode. @@ -226,7 +226,7 @@ This example explains how to create a Lakehouse source connector in an on-premis 1. Copy the NAR package of the Lakehouse source connector to the Pulsar connectors directory. ``` - cp pulsar-io-lakehouse-3.0.6.1.nar $PULSAR_HOME/connectors/pulsar-io-lakehouse-3.0.6.1.nar + cp pulsar-io-lakehouse-3.0.7.1.nar $PULSAR_HOME/connectors/pulsar-io-lakehouse-3.0.7.1.nar ``` 2. 
Reload all [built-in connectors](https://pulsar.apache.org/docs/en/next/io-connectors/). diff --git a/connectors/lakehouse-source/v3.3.1.1/lakehouse-source.md b/connectors/lakehouse-source/v3.3.1.9/lakehouse-source.md similarity index 95% rename from connectors/lakehouse-source/v3.3.1.1/lakehouse-source.md rename to connectors/lakehouse-source/v3.3.1.9/lakehouse-source.md index 93c4ec9f..c7009bc4 100644 --- a/connectors/lakehouse-source/v3.3.1.1/lakehouse-source.md +++ b/connectors/lakehouse-source/v3.3.1.9/lakehouse-source.md @@ -10,8 +10,8 @@ license_link: https://github.com/streamnative/pulsar-io-lakehouse/blob/master/LI tags: alias: Lakehouse Source Connector features: ["pulsar lakehouse connector"] -icon: "/images/pulsar-hub.svg" -download: https://api.github.com/repos/streamnative/pulsar-io-lakehouse/tarball/refs/tags/v3.3.1.1 +icon: "/images/streamnative.png" +download: https://api.github.com/repos/streamnative/pulsar-io-lakehouse/tarball/refs/tags/v3.3.1.9 support: streamnative support_link: https://github.com/streamnative/pulsar-io-lakehouse support_img: "https://avatars.githubusercontent.com/u/44651383?v=4" @@ -25,7 +25,7 @@ id: "lakehouse-source" The Lakehouse source connector (currently only including the [Delta Lake](https://delta.io/) source connector) fetches the Lakehouse table's changelog and saves changelogs into a Pulsar topic. 
-![](https://raw.githubusercontent.com/streamnative/pulsar-io-lakehouse/v3.3.1.1/docs/lakehouse-source.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-lakehouse/v3.3.1.9/docs/lakehouse-source.png) # How to get @@ -62,7 +62,7 @@ To build the Lakehouse source connector from the source code, follow these steps ```bash ls target - pulsar-io-lakehouse-3.3.1.1.nar + pulsar-io-lakehouse-3.3.1.9.nar ``` # How to configure @@ -110,7 +110,7 @@ You can create a configuration file (JSON or YAML) to set the properties if you "parallelism":1, "topicName": "delta_source", "processingGuarantees":"ATLEAST_ONCE", - "archive": "connectors/pulsar-io-lakehouse-3.3.1.1.nar", + "archive": "connectors/pulsar-io-lakehouse-3.3.1.9.nar", "configs":{ "type":"delta", "checkpointInterval": 180, @@ -135,7 +135,7 @@ You can create a configuration file (JSON or YAML) to set the properties if you "parallelism":1, "topicName": "delta_source", "processingGuarantees":"ATLEAST_ONCE", - "archive": "connectors/pulsar-io-lakehouse-3.3.1.1-cloud.nar", + "archive": "connectors/pulsar-io-lakehouse-3.3.1.9-cloud.nar", "configs":{ "type":"delta", "checkpointInterval": 180, @@ -195,7 +195,7 @@ This example describes how to use the Lakehouse source connector to fetch data f 1. Copy the NAR package to the Pulsar connectors directory. ``` - cp pulsar-io-lakehouse-3.3.1.1.nar PULSAR_HOME/connectors/pulsar-io-lakehouse-3.3.1.1.nar + cp pulsar-io-lakehouse-3.3.1.9.nar PULSAR_HOME/connectors/pulsar-io-lakehouse-3.3.1.9.nar ``` 2. Start Pulsar in standalone mode. @@ -226,7 +226,7 @@ This example explains how to create a Lakehouse source connector in an on-premis 1. Copy the NAR package of the Lakehouse source connector to the Pulsar connectors directory. ``` - cp pulsar-io-lakehouse-3.3.1.1.nar $PULSAR_HOME/connectors/pulsar-io-lakehouse-3.3.1.1.nar + cp pulsar-io-lakehouse-3.3.1.9.nar $PULSAR_HOME/connectors/pulsar-io-lakehouse-3.3.1.9.nar ``` 2. 
Reload all [built-in connectors](https://pulsar.apache.org/docs/en/next/io-connectors/). diff --git a/connectors/lakehouse-source/v4.0.0.2/lakehouse-source.md b/connectors/lakehouse-source/v4.0.0.2/lakehouse-source.md new file mode 100644 index 00000000..8958a6f6 --- /dev/null +++ b/connectors/lakehouse-source/v4.0.0.2/lakehouse-source.md @@ -0,0 +1,266 @@ +--- +description: pulsar lakehouse connector +author: StreamNative +contributors: zymap,hangc0276,Huanli-Meng,horizonzy +language: Java,Shell,Dockerfile,Python +document: +source: https://github.com/streamnative/pulsar-io-lakehouse +license: Apache License 2.0 +license_link: https://github.com/streamnative/pulsar-io-lakehouse/blob/master/LICENSE +tags: +alias: Lakehouse Source Connector +features: ["pulsar lakehouse connector"] +icon: "/images/streamnative.png" +download: https://api.github.com/repos/streamnative/pulsar-io-lakehouse/tarball/refs/tags/v4.0.0.2 +support: streamnative +support_link: https://github.com/streamnative/pulsar-io-lakehouse +support_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +owner_name: "streamnative" +owner_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +dockerfile: +sn_available: "" +id: "lakehouse-source" +--- + + +The Lakehouse source connector (currently only including the [Delta Lake](https://delta.io/) source connector) fetches the Lakehouse table's changelog and saves changelogs into a Pulsar topic. + +![](https://raw.githubusercontent.com/streamnative/pulsar-io-lakehouse/v4.0.0.2/docs/lakehouse-source.png) + +# How to get + +This section describes how to build the Lakehouse source connector. + +You can get the Lakehouse source connector using one of the following methods: + +- Download the NAR package from [the download page](https://github.com/streamnative/pulsar-io-lakehouse/releases). +- Build it from the source code. + +To build the Lakehouse source connector from the source code, follow these steps. + +1. Clone the source code to your machine. 
+ + ```bash + git clone https://github.com/streamnative/pulsar-io-lakehouse.git + ``` + +2. Build the connector in the `pulsar-io-lakehouse` directory. + + - Build the NAR package for your local file system. + + ```bash + mvn clean install -DskipTests + ``` + + - Build the NAR package for your cloud storage (including AWS, GCS, and Azure-related package dependencies). + + ```bash + mvn clean install -P cloud -DskipTests + ``` + + After the connector is successfully built, a NAR package is generated under the target directory. + + ```bash + ls target + pulsar-io-lakehouse-4.0.0.2.nar + ``` + +# How to configure + +Before using the Lakehouse source connector, you need to configure it. This table lists the properties and the descriptions. + +::: tabs + +@@@ Delta Lake +| Name | Type | Required | Default | Description +|--------------------------------------|----------|----------|---|-------------------------------------------------------------| +| `type` | String | true | N/A | The type of the Lakehouse source connector. Available values: `delta`. | +| `checkpointInterval` | int | false | 30 | The checkpoint interval (in units of seconds). By default, it is set to 30s. | +| `queueSize` | int | false | 10_000 | The buffer queue size of the Lakehouse source connector. The buffer queue is used to store records before they are sent to Pulsar topics. By default, it is set to `10_000`. | +| `fetchHistoryData` | bool | false | false | Configure whether to fetch the history data of the table. By default, it is set to `false`. | +| `startSnapshotVersion` | long | false | -1 | The Delta snapshot version to start capturing data change. Available values: [-1: LATEST, -2: EARLIEST]. The `startSnapshotVersion` and `startTimestamp` are mutually exclusive. | +| `startTimestamp` | long | false | N/A | The Delta snapshot timestamp (in units of seconds) to start capturing data change. The `startSnapshotVersion` and `startTimestamp` are mutually exclusive. 
| +| `tablePath` | String | true | N/A | The path of the Delta table. | +| `parquetParseThreads` | int | false | Runtime.getRuntime().availableProcessors() | The parallelism of parsing Delta Parquet files. By default, it is set to `Runtime.getRuntime().availableProcessors()`. | +| `maxReadBytesSizeOneRound` | long | false | Total memory * 0.2 | The maximum read bytes size from Parquet files in one fetch round. By default, it is set to 20% of the heap memory. | +| `maxReadRowCountOneRound` | int | false | 100_000 | The maximum read number of rows processed in one round. By default, it is set to `100_000`. | +@@@ + +::: + +> **Note** +> +> The Lakehouse source connector uses the Hadoop file system to read and write data to and from cloud objects, such as AWS, GCS, and Azure. If you want to configure Hadoop related properties, you should use the prefix `hadoop.`. + +## Examples + +You can create a configuration file (JSON or YAML) to set the properties if you use [Pulsar Function Worker](https://pulsar.apache.org/docs/en/functions-worker/) to run connectors in a cluster. 
+ +::: tabs + +@@@ Delta Lake + +- The Delta table that is stored in the file system + + ```json + { + "tenant":"public", + "namespace":"default", + "name":"delta_source", + "parallelism":1, + "topicName": "delta_source", + "processingGuarantees":"ATLEAST_ONCE", + "archive": "connectors/pulsar-io-lakehouse-4.0.0.2.nar", + "configs":{ + "type":"delta", + "checkpointInterval": 180, + "queueSize": 10000, + "fetchHistoryData": false, + "startSnapshotVersion": -1, + "tablePath": "file:///tmp/data/delta-source", + "parquetParseThreads": 3, + "maxReadBytesSizeOneRound": 134217728, + "maxReadRowCountOneRound": 100000 + } + } + ``` + +- The Delta table that is stored in cloud storage (AWS S3, GCS, or Azure) + + ```json + { + "tenant":"public", + "namespace":"default", + "name":"delta_source", + "parallelism":1, + "topicName": "delta_source", + "processingGuarantees":"ATLEAST_ONCE", + "archive": "connectors/pulsar-io-lakehouse-4.0.0.2-cloud.nar", + "configs":{ + "type":"delta", + "checkpointInterval": 180, + "queueSize": 10000, + "fetchHistoryData": false, + "startSnapshotVersion": -1, + "tablePath": "s3a://test-dev-us-west-2/lakehouse/delta_source", + "hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain", + "parquetParseThreads": 3, + "maxReadBytesSizeOneRound": 134217728, + "maxReadRowCountOneRound": 100000 + } + } + ``` +@@@ + +::: + +## Data format types + +Currently, the Lakehouse source connector only supports reading Delta table changelogs, which adopt a `parquet` storage format. + + +# How to use + +You can use the Lakehouse source connector with Function Worker. You can use the Lakehouse source connector as a non built-in connector or a built-in connector. + +::: tabs + +@@@ Use it as a non built-in connector + +If you already have a Pulsar cluster, you can use the Lakehouse source connector as a non built-in connector directly. 
+ +This example shows how to create a Lakehouse source connector on a Pulsar cluster using the [`pulsar-admin sources create`](https://pulsar.apache.org/tools/pulsar-admin/2.8.0-SNAPSHOT/#-em-create-em--14) command. + +``` +PULSAR_HOME/bin/pulsar-admin sources create \ +--source-config-file +``` + +@@@ + +@@@ Use it as a built-in connector + +You can make the Lakehouse source connector as a built-in connector and use it on a standalone cluster or an on-premises cluster. + +## Standalone cluster + +This example describes how to use the Lakehouse source connector to fetch data from Lakehouse tables and save data to Pulsar topics in standalone mode. + +### Prerequisites + +- Install Pulsar locally. For details, see [set up a standalone Pulsar locally](https://pulsar.apache.org/docs/en/standalone/#install-pulsar-using-binary-release). + +### Steps + +1. Copy the NAR package to the Pulsar connectors directory. + + ``` + cp pulsar-io-lakehouse-4.0.0.2.nar PULSAR_HOME/connectors/pulsar-io-lakehouse-4.0.0.2.nar + ``` + +2. Start Pulsar in standalone mode. + + ``` + PULSAR_HOME/bin/pulsar standalone + ``` + +3. Run the lakehouse source connector locally. + + ```bash + PULSAR_HOME/bin/pulsar-admin sources localrun \ + --source-config-file + ``` + +4. Write rows into the Lakehouse table. For details, see [Getting Started with Delta Lake](https://delta.io/learn/getting-started). + +5. Consume Pulsar topics to get changelogs. + + ```bash + PULSAR_HOME/bin/pulsar-client consume -s test-sub -n 0 + ``` + +## On-premises cluster + +This example explains how to create a Lakehouse source connector in an on-premises cluster. + +1. Copy the NAR package of the Lakehouse source connector to the Pulsar connectors directory. + + ``` + cp pulsar-io-lakehouse-4.0.0.2.nar $PULSAR_HOME/connectors/pulsar-io-lakehouse-4.0.0.2.nar + ``` + +2. Reload all [built-in connectors](https://pulsar.apache.org/docs/en/next/io-connectors/). + + ``` + PULSAR_HOME/bin/pulsar-admin sources reload + ``` + +3. 
Check whether the Lakehouse source connector is available on the list or not. + + ``` + PULSAR_HOME/bin/pulsar-admin sources available-sources + ``` + +4. Create a Lakehouse source connector on a Pulsar cluster using the [`pulsar-admin sources create`](https://pulsar.apache.org/tools/pulsar-admin/2.8.0-SNAPSHOT/#-em-create-em--14) command. + + ``` + PULSAR_HOME/bin/pulsar-admin sources create \ + --source-config-file + ``` + +@@@ + +::: + +# Demos + +This table lists demos that show how to run the [Delta Lake](https://delta.io/), [Hudi](https://hudi.apache.org), and [Iceberg](https://iceberg.apache.org/) source connectors with other external systems. + +Currently, only the demo on the Delta Lake source connector is available. + +| Connector | Link | +|------------|----------------------------------------------------------------------------------------------------------------------------------| +| Delta Lake | For details, see the [Delta Lake demo](https://github.com/streamnative/pulsar-io-lakehouse/blob/master/docs/delta-lake-demo.md). | +| Hudi | | +| Iceberg | | + diff --git a/connectors/pinecone-sink/v3.0.6.1/pinecone-sink.md b/connectors/pinecone-sink/v3.0.7.1/pinecone-sink.md similarity index 99% rename from connectors/pinecone-sink/v3.0.6.1/pinecone-sink.md rename to connectors/pinecone-sink/v3.0.7.1/pinecone-sink.md index 90fc426f..516f4183 100644 --- a/connectors/pinecone-sink/v3.0.6.1/pinecone-sink.md +++ b/connectors/pinecone-sink/v3.0.7.1/pinecone-sink.md @@ -29,7 +29,7 @@ This connector allows access to pinecone.io with a pulsar topic. The sink connec takes in messages and writes them if they are in a proper format to a Pinecone index. 
-![](https://raw.githubusercontent.com/streamnative/pulsar-io-pinecone/v3.0.6.1/docs/pinecone.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-pinecone/v3.0.7.1/docs/pinecone.png) ## Quick start diff --git a/connectors/pinecone-sink/v3.3.1.1/pinecone-sink.md b/connectors/pinecone-sink/v3.3.1.9/pinecone-sink.md similarity index 99% rename from connectors/pinecone-sink/v3.3.1.1/pinecone-sink.md rename to connectors/pinecone-sink/v3.3.1.9/pinecone-sink.md index d7612136..6f5b944c 100644 --- a/connectors/pinecone-sink/v3.3.1.1/pinecone-sink.md +++ b/connectors/pinecone-sink/v3.3.1.9/pinecone-sink.md @@ -29,7 +29,7 @@ This connector allows access to pinecone.io with a pulsar topic. The sink connec takes in messages and writes them if they are in a proper format to a Pinecone index. -![](https://raw.githubusercontent.com/streamnative/pulsar-io-pinecone/v3.3.1.1/docs/pinecone.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-pinecone/v3.3.1.9/docs/pinecone.png) ## Quick start diff --git a/connectors/pinecone-sink/v4.0.0.2/pinecone-sink.md b/connectors/pinecone-sink/v4.0.0.2/pinecone-sink.md new file mode 100644 index 00000000..2ec0966a --- /dev/null +++ b/connectors/pinecone-sink/v4.0.0.2/pinecone-sink.md @@ -0,0 +1,261 @@ +--- +description: A connector to pinecone.io +author: StreamNative +contributors: illegalnumbers,dependabot[bot],shibd,streamnativebot +language: Java,Shell,Dockerfile +document: +source: Private source +license: StreamNative, Inc.. 
All Rights Reserved +license_link: +tags: +alias: Pinecone Connector +features: ["A connector to pinecone.io"] +icon: "/images/connectors/pinecone-logo.png" +download: +support: streamnative +support_link: https://streamnative.io +support_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +owner_name: "streamnative" +owner_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +dockerfile: +sn_available: "true" +id: "pinecone-sink" +--- + + +# Pinecone Sink Connector + +This connector allows access to pinecone.io with a Pulsar topic. The sink connector +takes in messages and writes them if they are in a proper format to a Pinecone +index. + +![](https://raw.githubusercontent.com/streamnative/pulsar-io-pinecone/v4.0.0.2/docs/pinecone.png) + +## Quick start + +1. Pay for a license. +2. Create an index on pinecone.io + +Do one of the following. + +Either +- Download the image (from streamnative/pulsar-io-pinecone). + +or +- Run the connector directly on StreamNative Cloud. + +And finally +- Provide the configuration below and start the connector. + +### Prerequisites + +The prerequisites for connecting a Pinecone sink connector to external systems include: + +1. A pinecone.io API key +2. An index name +3. A namespace name + +See conf/pulsar-io-template.yaml for more information. + +### 1. Create a connector + +The following command shows how to use [pulsarctl](https://github.com/streamnative/pulsarctl) to create a `builtin` connector. If you want to create a `non-builtin` connector, +you need to replace `--sink-type pinecone` with `--archive /path/to/pulsar-io-pinecone.nar`. You can find the button to download the `nar` package at the beginning of the document. + +{% callout title="For StreamNative Cloud User" type="note" %} +If you are a StreamNative Cloud user, you need to [set up your environment](https://docs.streamnative.io/docs/connector-setup) first. 
+{% /callout %} + +```bash +pulsarctl sink create \ + --sink-type pinecone \ + --name pinecone \ + --tenant public \ + --namespace default \ + --inputs "Your topic name" \ + --parallelism 1 \ + --sink-config \ + '{ "apiKey": "abcd-123","indexName": "test", "namespace": "test" }' +``` + +The `--sink-config` is the minimum necessary configuration for starting this connector, and it is a JSON string. You need to substitute the relevant parameters with your own. +If you want to configure more parameters, see [Configuration Properties](#configuration-properties) for reference. + +{% callout title="Note" type="note" %} +You can also choose to use a variety of other tools to create a connector: +- [pulsar-admin](https://pulsar.apache.org/docs/3.1.x/io-use/): The command arguments for `pulsar-admin` are similar to those of `pulsarctl`. You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector ). +- [RestAPI](https://pulsar.apache.org/sink-rest-api/?version=3.1.1): You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). +- [Terraform](https://github.com/hashicorp/terraform): You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). +- [Function Mesh](https://functionmesh.io/docs/connectors/run-connector): The docker image can be found at the beginning of the document. +{% /callout %} + +### 2. Send messages to the topic + +{% callout title="Note" type="note" %} +If your connector is created on StreamNative Cloud, you need to authenticate your clients. See [Build applications using Pulsar clients](https://docs.streamnative.io/docs/qs-connect#jumpstart-for-beginners) for more information. 
+{% /callout %} + +``` java +@Data +@ToString +public class TestMessage { + public static void main(String[] args) { + PulsarClient client = PulsarClient.builder() + .serviceUrl("{{Your Pulsar URL}}") + .build(); + + Producer producer = client.newProducer(Schema.STRING) + .topic("my-topic") + .create(); + + String testMessage = '{ "id": "v1", "values": [1.0]}'; + + MessageId msgID = producer.send(testMessage); + System.out.println("Publish " + testMessage + " and message ID " + msgID); + + producer.flush(); + producer.close(); + client.close(); + } +} + +``` + +### 3. Querying Data From Index + +You can look in the query UI from Pinecone or you can run a raw Pinecone +query yourself using a client. There are several on the Pinecone website +which are listed including Python, Node, and cURL. + +```python +# Taken from https://www.pinecone.io/ +# Mock vectorized search query (vectorize with LLM of choice) +query = [0.1] # len(query) = 1, same as the indexed vectors + +# Send query with (optional) filter to index and get back 1 result (top_k=1) +index.query( + vector=query, + top_k=1 +) +``` + +## Configuration Properties + +Before using the Pinecone sink connector, you need to configure it. This table outlines the properties and the descriptions. + +| Name | Type | Required | Sensitive | Default | Description | +|-------------------------------|---------|----------|-----------|-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| apiKey | string | True | True | None | The API key for the Pinecone service. Find this in the Pinecone dashboard. 
+| indexName | string | True | False | None | The name of the Pinecone index to which you want to write data. Find this in the Pinecone dashboard. +| namespace | string | True | False | None | The name of the Pinecone namespace to which you want to write data. Find this in the Pinecone dashboard. +| dimensions | integer | False | False | None | The number of dimensions required by the index. If a request is made to upsert data into an index with a different number of dimensions, the request will fail. If not provided, the connector will make its best attempt to upsert the data, and if the request fails due to a mismatch the message will eventually be DLQ'd. +| queryMetadata | JSON | False | False | None | The metadata to be associated with the request to the index. This should be a JSON object in the form {"key": "value", "key2": "value2" }. + +## Advanced features + +### Monitoring + +Currently we provide several metrics for monitoring. + +- `pinecone-upsert-successful` +- `pinecone-upsert-failed` +- `pinecone-connector-active` +- `pinecone-upsert-failed-no-config` +- `pinecone-upsert-failed-no-client` +- `pinecone-upsert-failed-no-index-connection` +- `pinecone-upsert-failed-parsing-error` +- `pinecone-upsert-failed-dimension-error` + +These can all be used to monitor the connector's status. + +### Troubleshooting + +If you get a failed upsert problem the most likely candidate is the formatting +of your messages. These are required to be in a format like the following. + +``` +{ "id": "string", "values": [float, float, ...]} +``` + +or the form +``` +{ "metadata": { "key": "value", "key2": "value2", ... }, "id": "string", "values": [float, float, ...]} +``` + +Other likely candidates are problems with your connection to Pinecone. Check your +configuration values and any exceptions that are occurring from the connector. + +Some example commands for debugging locally are as follows. + +Produce a sample message.
+ +``` +pulsar-client produce persistent://public/default/pinecone-source -m '{"id":"v1", "values": [3.0]}' -s '\n' +``` + +Clear a backlog of messages. +``` +pulsar-admin --admin-url http://localhost:8080 topics clear-backlog --subscription public/default/pinecone persistent://public/default/pinecone-source +``` + +Delete a topic subscription. +``` +pulsar-admin --admin-url http://localhost:8080 topics unsubscribe \ + --subscription public/default/pinecone \ + persistent://public/default/pinecone-source +``` + +Consume a group of messages. +``` +pulsar-client consume -n 1 persistent://public/default/pinecone-source -s public/default/pinecone +``` + +If you need to add a maven shell using jenv you can do this with a +helpful script. +``` +mvn dependency:build-classpath -DincludeTypes=jar -Dmdep.outputFile=.cp.txt +jshell --class-path `cat .cp.txt`:target/classes +``` + +And remember if you have maven problems on install that you need to +use JDK 8 with this project. + +``` +mvn --version # should be java 8 +jenv exec mvn # if using jenv you can exec the local version using + # this +``` + +### Delivery guarantees + +The Pulsar IO connector framework provides three [delivery guarantees](https://pulsar.apache.org/docs/next/functions-concepts#processing-guarantees-and-subscription-types): `at-most-once`, `at-least-once`, and `effectively-once`. + +Currently, the Pinecone sink connector provides the at-least-once delivery guarantee. + +### Examples + +With the source connector you can connect to Pinecone with a valid configuration +and then write messages to it. An example using localrun is shown below. 
+ +``` +pulsar-admin --admin-url http://localhost:8080/ sinks localrun --broker-service-url pulsar://localhost:6650/ --archive "file:///Users/your-user/src/pulsar-io-pinecone/pinecone-connector/target/pulsar-io-pinecone-0.2.0.nar" --classname "org.streamnative.pulsar.io.pinecone.PineconeConnectorSink" --name "pinecone" --sink-config '{ "apiKey": "abcd-123","indexName": "test", "namespace": "test", "dimensions": 1 }' --inputs persistent://public/default/pinecone-source +``` + +This can be used when building the JAR of the project from scratch using +`mvn clean install`. + +Similar configuration can be setup when using an image mounted with a config file +defining environment variables or when using in Kubernetes. + +This table lists the schema types that currently are supported to be converted. + +| Schema | Supported | +|-----------------|-----------| +| AVRO | No | +| PRIMITIVE | Yes | (only bytes and string) +| PROTOBUF_NATIVE | Yes | +| PROTOBUF | No | +| JSON | Yes | +| KEY_VALUE | No | + + diff --git a/connectors/snowflake-sink/v3.0.6.1/snowflake-sink.md b/connectors/snowflake-sink/v3.0.7.1/snowflake-sink.md similarity index 100% rename from connectors/snowflake-sink/v3.0.6.1/snowflake-sink.md rename to connectors/snowflake-sink/v3.0.7.1/snowflake-sink.md diff --git a/connectors/snowflake-sink/v3.3.1.1/snowflake-sink.md b/connectors/snowflake-sink/v3.3.1.9/snowflake-sink.md similarity index 100% rename from connectors/snowflake-sink/v3.3.1.1/snowflake-sink.md rename to connectors/snowflake-sink/v3.3.1.9/snowflake-sink.md diff --git a/connectors/snowflake-sink/v4.0.0.2/snowflake-sink.md b/connectors/snowflake-sink/v4.0.0.2/snowflake-sink.md new file mode 100644 index 00000000..7c196d21 --- /dev/null +++ b/connectors/snowflake-sink/v4.0.0.2/snowflake-sink.md @@ -0,0 +1,275 @@ +--- +description: The Snowflake sink connector is used to write messages from Apache Pulsar topics to Snowflake tables. 
+author: StreamNative +contributors: RobertIndie,danpi,dependabot[bot],streamnativebot +language: Java,Shell,Dockerfile +document: +source: Private source +license: StreamNative, Inc.. All Rights Reserved +license_link: +tags: +alias: Snowflake Sink Connector +features: ["The Snowflake sink connector is used to write messages from Apache Pulsar topics to Snowflake tables."] +icon: "/images/connectors/snowflake-logo.png" +download: +support: streamnative +support_link: https://streamnative.io +support_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +owner_name: "streamnative" +owner_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +dockerfile: +sn_available: "true" +id: "snowflake-sink" +--- + + +The [Snowflake](https://www.snowflake.com/) sink connector pulls data from Pulsar topics and persists data to Snowflake. For more information about connectors, see [Connector Overview](https://docs.streamnative.io/docs/connector-overview). + +This document introduces how to get started with creating a Snowflake sink connector and get it up and running. + +![](images/snowflake-sink.png) + +## Quick start + +### Prerequisites + +The prerequisites for connecting a Snowflake sink connector to external systems include: + +1. Prepare a snowflake account +2. Get the account URL from the `Admin - Accounts` page and click the link. It should be the format like `https://.snowflakecomputing.com`. + +3. Generate the public key and private key for the authentication. For more details, please check [this guide](https://docs.snowflake.com/en/user-guide/key-pair-auth#step-1-generate-the-private-key) + +```sh +openssl genrsa 2048 | openssl pkcs8 -topk8 -inform PEM -out rsa_key.p8 -nocrypt +openssl rsa -in rsa_key.p8 -pubout -out rsa_key.pub +``` + +It will generate `rsa_key.p8` (the private key) and `rsa_key.pub` (the public key) locally. + +4. Log in and configure the public key. 
+ +See [Installing SnowSQL](https://docs.snowflake.com/en/user-guide/snowsql-install-config) to install the SnowSQL. + +```sh +snowsql -a ${account_identifier} -u ${user_name} +``` + +The `-a` is followed by an account identifier, which is a substring of the account URL before. The `-u` is followed by your user name. After logging in, set the public key passphrase: + +```sh +ALTER USER ${user_name} SET RSA_PUBLIC_KEY='MIIBIjA...'; +``` + +You can get the public key passphrase `(MIIBIjA…)` by running the following command: + +```sh +grep -v "\-\-\-" rsa_key.pub | tr -d '\n' +``` + +### 1. Create Snowflake objects and grant permission + +Before creating the connector, you need to grant the permissions in Snowflake. Write the following content into a file, e.g. name it with `grant.sql`. The script creates a user `snservice` that will be used in the sink config later and associate it with a role `snrole` that is only used in Snowflake internally. Then it grants the necessary permissions. + +```sql +CREATE DATABASE st_tuts; +CREATE SCHEMA st_tuts.demo; +CREATE ROLE snrole; +CREATE USER snservice; +GRANT ROLE snrole TO USER snservice; +ALTER USER snservice SET DEFAULT_ROLE = snrole; + +GRANT USAGE ON DATABASE st_tuts TO ROLE snrole; +GRANT USAGE ON SCHEMA st_tuts.demo TO ROLE snrole; +GRANT USAGE ON WAREHOUSE compute_wh TO ROLE snrole; +GRANT CREATE TABLE ON SCHEMA st_tuts.demo TO ROLE snrole; +GRANT CREATE STAGE ON SCHEMA st_tuts.demo TO ROLE snrole; +GRANT CREATE PIPE ON SCHEMA st_tuts.demo TO ROLE snrole; +ALTER USER snservice SET RSA_PUBLIC_KEY='MIIBI...'; +``` + +Then run the following command to execute the SQL script above. + +```sh +snowsql -a ${account_identifier} -u ${user_name} -f grant.sql +``` + +### 2. Create a connector + +The following command shows how to use [pulsarctl](https://github.com/streamnative/pulsarctl) to create a `builtin` connector. 
If you want to create a `non-builtin` connector, +you need to replace `--sink-type snowflake` with `--archive /path/to/pulsar-io-snowflake.nar`. You can find the button to download the `nar` package at the beginning of the document. + +{% callout title="For StreamNative Cloud User" type="note" %} +If you are a StreamNative Cloud user, you need [set up your environment](https://docs.streamnative.io/docs/connector-setup) first. +{% /callout %} + +```bash +pulsarctl sinks create \ + --sink-type snowflake \ + --name snowflake \ + --tenant public \ + --namespace default \ + --inputs "Your topic name" \ + --parallelism 1 \ + --sink-config \ + '{ + "user": "SNSERVICE", + "host": "https://.snowflakecomputing.com", + "schema": "demo", + "database": "st_tuts", + "privateKey": "...", + "warehouse": "compute_wh" + }' +``` + +The `--sink-config` is the minimum necessary configuration for starting this connector, and it is a JSON string. You need to substitute the relevant parameters with your own. + +You can get the private key passphrase `(MIIBIjA…)` by running the following command: + +```sh +grep -v '\-\-\-' rsa_key.p8 | tr -d '\n' +``` + +If you want to configure more parameters, see [Configuration Properties](#configuration-properties) for reference. + +{% callout title="Note" type="note" %} +You can also choose to use a variety of other tools to create a connector: +- [pulsar-admin](https://pulsar.apache.org/docs/3.1.x/io-use/): The command arguments for `pulsar-admin` are similar to those of `pulsarctl`. You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector ). +- [RestAPI](https://pulsar.apache.org/sink-rest-api/?version=3.1.1): You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). 
+- [Terraform](https://github.com/hashicorp/terraform): You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). +- [Function Mesh](https://functionmesh.io/docs/connectors/run-connector): The docker image can be found at the beginning of the document. +{% /callout %} + +### 3. Send messages to the topic + +{% callout title="Note" type="note" %} +If your connector is created on StreamNative Cloud, you need to authenticate your clients. See [Build applications using Pulsar clients](https://docs.streamnative.io/docs/qs-connect#jumpstart-for-beginners) for more information. +{% /callout %} + +``` java + PulsarClient client = PulsarClient.builder() + .serviceUrl("{{Your Pulsar URL}}") + .build(); + + Producer producer = client.newProducer(Schema.STRING) + .topic("{{Your topic name}}") + .create(); + + String message = "hello world"; + MessageId msgID = producer.send(message); + System.out.println("Publish " + message + " and message ID " + msgID); + + producer.flush(); + producer.close(); + client.close(); +``` + +You can also send the message using the command line: +```sh +$ bin/pulsar-client produce pulsar-topic-name --messages "hello world" +``` + +### 4. Check the data in the Snowflake table + +First, you need to execute the following SQL command to grant the role `SNROLE` to the user you are logged in as. + +```sql +GRANT ROLE SNROLE TO USER ${account_name}; +``` + +Then, switch the role to `SNROLE`. Under `Data - Database - ST_TUTS - DEMO - Tables` you will find that the table `PERSISTENT___PUBLIC_DEFAULT_INPUT_SNOWFLAKE_1118738946` has been created and contains the messages produced above. + +![Data Sample](images/data-sample.jpg) + +## Configuration Properties + +This table outlines the properties and the descriptions.
+ +| Name | Type | Required | Sensitive | Default | Description | +|----------------------------|---------|----------|-----------|---------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `user` | String | Yes | false | ""(empty string) | The user account name of the Snowflake service. | +| `privateKey` | String | Yes | true | ""(empty string) | The private key of the user. | +| `host` | String | Yes | false | ""(empty string) | The host URL of the snowflake service. | +| `database` | String | Yes | false | ""(empty string) | The Snowflake database where the connector will sink data. | +| `schema` | String | Yes | false | ""(empty string) | The Snowflake schema belongs to the level below the Snowflake database and consists of a set of tables. | +| `tableName` | String | No | false | ""(empty string) | If the `autoCreateTable` option is set to `false`, the Snowflake connector will persist messages to this table. | +| `warehouse` | String | No | false | ""(empty string) | The warehouse name in the snowflake. By default, no warehouse name is set. | +| `bufferCountRecords` | int | No | false | 10_000 | The number of records that are buffered in the memory before they are ingested to Snowflake. By default, it is set to `10_000`. | +| `bufferSizeBytes` | int | No | false | 5_000_000 | The cumulative size (in units of bytes) of the records that are buffered in the memory before they are ingested in Snowflake as data files. By default, it is set to `5_000_000` (5 MB). 
| +| `bufferFlushTimeInSeconds` | int | No | false | 60 | The number of seconds between buffer flushes, where the flush is from Pulsar’s memory cache to the internal stage. By default, it is set to `60` seconds. | +| `autoCreateTable` | boolean | No | false | false | Automatically create a table when the table does not exist. | +| `processingGuarantees` | String | No | false | "ATLEAST_ONCE" | Specify the processing guarantee semantics. Currently, the Snowflake connector only supports `ATLEAST_ONCE` processing guarantee semantics. | +| `topic2table` | String | No | false | ""(empty string) | Specify the mapping relationship between topics and tables. The topic name should be its complete name. Each topic and the mapped table name should be separated by a colon, such as `persistent://public/default/topic1:table1,persistent://public/default/topic2:table2`. | +| `metadataField` | String | No | false | "__message_id__,__partition__,__topic__,__event_time__" | The metadata fields for each snowflake record. You can separate multiple fields with commas. The supported metadata fields are: `__schema_version__`, `__partition__`, `__event_time__`, `__publish_time__`, `__message_id__`, `__sequence_id__`, `__producer_name__`, `__topic__`. Currently, the Snowflake sink connector does not support custom metadata. | + +## Advanced features +This section describes the advanced features of the Snowflake sink connector. For details about how to configure these features, see [Configuration Properties](#configuration-properties). + +## Delivery guarantees +The Pulsar IO connector framework provides three [delivery guarantees](https://pulsar.apache.org/docs/next/functions-concepts#processing-guarantees-and-subscription-types): `at-most-once`, `at-least-once`, and `effectively-once`. + +Currently, the Snowflake sink connector only supports the `at-least-once` delivery guarantee semantic.
+## Table name mapping +The Snowflake sink connector supports automatically creating a table when the table does not exist. To enable this, set the following option: +``` +autoCreateTable=true +``` + +The Snowflake sink connector allows you to specify the mapping relationship between topics and tables. Each topic and its mapped table name should be separated by a colon. +Note that the topic name must be its complete name. +See the following example of the `topic2table` parameter: +``` +topic2table=persistent://public/default/topic1:table1,persistent://public/default/topic2:table2 +``` + +## Metadata Fields +There are two fields in the table: metadata and content. Metadata is ancillary information about the content, such as `topic`, `messageId`, `publishTime`, and so on. +By default, the following metadata fields of Pulsar will be created as the metadata: +``` +metadataField=__message_id__,__partition__,__topic__,__event_time__ +``` +> **Note** +> +> Currently, the Snowflake sink connector does not support custom metadata. + +## Data format types + +The Snowflake sink connector supports converting some Pulsar schemas, as listed in the following table. + +| Pulsar Schema | Supported | +|-----------------|-----------| +| AVRO | Yes | +| PRIMITIVE | Yes | +| JSON | Yes | +| KEY_VALUE | No | +| PROTOBUF | No | +| PROTOBUF_NATIVE | No | + +All data will be converted and written in JSON format under the "content" column.
Below is a table showing the +conversion for each Schema Type: + +| Schema Type | Converted Content | Example | +|------------------------------------------|------------------------------------------------------------|--------------------------------------| +| BYTES | Base64-encoded String | "SGVsbG8=" (Hello in base64) | +| Boolean | Boolean | true | +| INT8, INT16, INT32, INT64, FLOAT, DOUBLE | Number | 1234 | +| STRING | String | "Hello" | +| JSON | JSON Object | {"name": "John", "age": 30} | +| AVRO | JSON Object | {"name": "John", "age": 30} | +| DATE, TIME, TIMESTAMP | Number (milliseconds since Jan 1, 1970, GMT) | 1654849667447 | +| INSTANT | Number (seconds since 1970-01-01T00:00:00Z) | 1654826254.091 | +| LOCAL_DATE | Array [Year, Month, Day] | [2022, 12, 1] | +| LOCAL_TIME | Array [Hour, Minute, Second, Nanosecond] | [16, 30, 28, 150000000] | +| LOCAL_DATE_TIME | Array [Year, Month, Day, Hour, Minute, Second, Nanosecond] | [2022, 12, 1, 16, 30, 28, 150000000] | + +## Batch progress + +To increase write throughput, you can configure the buffer size and latency for the Snowflake sink connector. +``` +bufferCountRecords = 10_000 +bufferSizeBytes = 5_000_000 +bufferFlushTimeInSeconds = 120 +``` + + diff --git a/connectors/sqs-sink/v3.0.6.1/sqs-sink.md b/connectors/sqs-sink/v3.0.7.1/sqs-sink.md similarity index 99% rename from connectors/sqs-sink/v3.0.6.1/sqs-sink.md rename to connectors/sqs-sink/v3.0.7.1/sqs-sink.md index acb020f3..b3282557 100644 --- a/connectors/sqs-sink/v3.0.6.1/sqs-sink.md +++ b/connectors/sqs-sink/v3.0.7.1/sqs-sink.md @@ -25,7 +25,7 @@ id: "sqs-sink" The [AWS Simple Queue Service (SQS)](https://aws.amazon.com/sqs/?nc1=h_ls) sink connector pulls data from Pulsar topics and persists data to AWS SQS. 
-![](https://raw.githubusercontent.com/streamnative/pulsar-io-sqs/v3.0.6.1/docs/sqs-sink.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-sqs/v3.0.7.1/docs/sqs-sink.png) ## Quick start diff --git a/connectors/sqs-sink/v3.3.1.1/sqs-sink.md b/connectors/sqs-sink/v3.3.1.9/sqs-sink.md similarity index 99% rename from connectors/sqs-sink/v3.3.1.1/sqs-sink.md rename to connectors/sqs-sink/v3.3.1.9/sqs-sink.md index 0c563939..a7b1766c 100644 --- a/connectors/sqs-sink/v3.3.1.1/sqs-sink.md +++ b/connectors/sqs-sink/v3.3.1.9/sqs-sink.md @@ -25,7 +25,7 @@ id: "sqs-sink" The [AWS Simple Queue Service (SQS)](https://aws.amazon.com/sqs/?nc1=h_ls) sink connector pulls data from Pulsar topics and persists data to AWS SQS. -![](https://raw.githubusercontent.com/streamnative/pulsar-io-sqs/v3.3.1.1/docs/sqs-sink.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-sqs/v3.3.1.9/docs/sqs-sink.png) ## Quick start diff --git a/connectors/sqs-sink/v4.0.0.2/sqs-sink.md b/connectors/sqs-sink/v4.0.0.2/sqs-sink.md new file mode 100644 index 00000000..b5ca4e4e --- /dev/null +++ b/connectors/sqs-sink/v4.0.0.2/sqs-sink.md @@ -0,0 +1,276 @@ +--- +description: The SQS source connector is used to consume messages from Pulsar topics and publish them to AWS SQS. +author: StreamNative +contributors: freeznet,shibd,Anonymitaet,nlu90 +language: Java,Shell,Python,Dockerfile +document: +source: Private source +license: StreamNative, Inc.. 
All Rights Reserved +license_link: +tags: +alias: AWS SQS Sink Connector +features: ["The SQS sink connector is used to consume messages from Pulsar topics and publish them to AWS SQS."] +icon: "/images/connectors/sqs-logo.png" +download: +support: streamnative +support_link: https://streamnative.io +support_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +owner_name: "streamnative" +owner_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +dockerfile: +sn_available: "true" +id: "sqs-sink" +--- + + +The [AWS Simple Queue Service (SQS)](https://aws.amazon.com/sqs/?nc1=h_ls) sink connector pulls data from Pulsar topics and persists data to AWS SQS. + +![](https://raw.githubusercontent.com/streamnative/pulsar-io-sqs/v4.0.0.2/docs/sqs-sink.png) + +## Quick start + +### Prerequisites + +The prerequisites for connecting an AWS SQS sink connector to external systems include: + +1. Create an SQS queue in AWS. +2. Create the [AWS User](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html) and create `AccessKey` (please record `AccessKey` and `SecretAccessKey`). +3. Assign the following permissions to the AWS User: +- sqs:CreateQueue +- sqs:SendMessage + + +### 1. Create a connector + +The following command shows how to use [pulsarctl](https://github.com/streamnative/pulsarctl) to create a `builtin` connector. If you want to create a `non-builtin` connector, +you need to replace `--sink-type sqs` with `--archive /path/to/pulsar-io-sqs.nar`. You can find the button to download the `nar` package at the beginning of the document. + +{% callout title="For StreamNative Cloud User" type="note" %} +If you are a StreamNative Cloud user, you need to [set up your environment](https://docs.streamnative.io/docs/connector-setup) first.
+{% /callout %} + +```bash +pulsarctl sinks create \ + --sink-type sqs \ + --name sqs-sink \ + --tenant public \ + --namespace default \ + --inputs "Your topic name" \ + --parallelism 1 \ + --sink-config \ + '{ + "awsRegion": "Your aws sqs region", + "queueName": "Your AWS SQS name", + "awsCredentialPluginParam": "{\"accessKey\":\"Your AWS access key\",\"secretKey\":\"Your AWS secret access key\"}" + }' +``` + +The `--sink-config` is the minimum necessary configuration for starting this connector, and it is a JSON string. You need to substitute the relevant parameters with your own. +If you want to configure more parameters, see [Configuration Properties](#configuration-properties) for reference. + +{% callout title="Note" type="note" %} +You can also choose to use a variety of other tools to create a connector: +- [pulsar-admin](https://pulsar.apache.org/docs/3.1.x/io-use/): The command arguments for `pulsar-admin` are similar to those of `pulsarctl`. You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector ). +- [RestAPI](https://pulsar.apache.org/sink-rest-api/?version=3.1.1): You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). +- [Terraform](https://github.com/hashicorp/terraform): You can find an example for [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). +- [Function Mesh](https://functionmesh.io/docs/connectors/run-connector): The docker image can be found at the beginning of the document. +{% /callout %} + +### 2. Send messages to the topic + +{% callout title="Note" type="note" %} +If your connector is created on StreamNative Cloud, you need to authenticate your clients. See [Build applications using Pulsar clients](https://docs.streamnative.io/docs/qs-connect#jumpstart-for-beginners) for more information. 
+{% /callout %} + +``` java + PulsarClient client = PulsarClient.builder() + .serviceUrl("{{Your Pulsar URL}}") + .build(); + + Producer producer = client.newProducer(Schema.STRING) + .topic("{{Your topic name}}") + .create(); + + String message = "test-message"; + MessageId msgID = producer.send(message); + System.out.println("Publish " + message + " and message ID " + msgID); + + producer.flush(); + producer.close(); + client.close(); +``` + +### 3. Show data on AWS SQS +You can use the following simple code to receive messages from AWS SQS. + +``` java + public static void main(String[] args) { + + AmazonSQS client = AmazonSQSClientBuilder.standard() + .withCredentials(new AWSStaticCredentialsProvider( + new BasicAWSCredentials("Your access key", "Your secret key"))) + .withRegion("Your AWS SQS region").build(); + + String queueUrl = client.getQueueUrl(new GetQueueUrlRequest("Your SQS name")).getQueueUrl(); + ReceiveMessageResult receiveMessageResult = client.receiveMessage(queueUrl); + for (Message message : receiveMessageResult.getMessages()) { + System.out.println("Receive msg: " + message.getBody()); + } + client.shutdown(); + } + + // Output + // Receive msg: {"value" : "test-message"} +``` + +## Configuration Properties + +Before using the AWS SQS sink connector, you need to configure it. This table outlines the properties and the +Descriptions of an AWS SQS sink connector. + +| Name | Type | Required | Sensitive | Default | Description | +|----------------------------|--------|----------|-----------|--------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `awsRegion` | String | true | false | " " (empty string) | Supported AWS region. 
For example, us-west-1, us-west-2. | +| `queueName` | String | true | false | " " (empty string) | The name of the SQS queue that messages should be read from or written to. | +| `awsCredentialPluginName` | String | false | false | " " (empty string) | The fully-qualified class name of the implementation of [AwsCredentialProviderPlugin](https://github.com/apache/pulsar/blob/master/pulsar-io/aws/src/main/java/org/apache/pulsar/io/aws/AwsCredentialProviderPlugin.java). For more information, see [Configure AwsCredentialProviderPlugin](#configure-awscredentialproviderplugin). | +| `awsCredentialPluginParam` | String | false | true | " " (empty string) | The JSON parameter to initialize `awsCredentialsProviderPlugin`. For more information, see [Configure AwsCredentialProviderPlugin](#configure-awscredentialproviderplugin). | +| `awsEndpoint` | String | false | false | " " (empty string) | AWS SQS end-point URL. You can find it at [AWS SQS Service endpoints](https://docs.aws.amazon.com/general/latest/gr/sqs-service.html#sqs_region). | +| `metadataFields` | String | false | false | "pulsar.key" | The metadata fields to be sent to the SQS message attributes. Valid values are 'pulsar.topic, pulsar.key, pulsar.partitionIndex, pulsar.sequence, pulsar.properties.{{Your properties key}}, pulsar.eventTime' | + + +### Configure AwsCredentialProviderPlugin + +The AWS SQS sink connector provides three ways to connect to AWS SQS, selected by configuring `awsCredentialPluginName`. + +- Leave `awsCredentialPluginName` empty to get the connector authenticated by passing `accessKey` and `secretKey` in `awsCredentialPluginParam`. + + ```json + {"accessKey":"Your access key","secretKey":"Your secret key"} + ``` + +- Set `awsCredentialPluginName` to `org.apache.pulsar.io.aws.AwsDefaultProviderChainPlugin` to use the default AWS provider chain. With this option, you don't need to configure `awsCredentialPluginParam`.
For more information, see [AWS documentation](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default). + +- Set `awsCredentialPluginName` to `org.apache.pulsar.io.aws.STSAssumeRoleProviderPlugin` to use the [default AWS provider chain](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default), and you need to configure `roleArn` and `roleSessionName` in `awsCredentialPluginParam`. For more information, see [AWS documentation](https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRole.html). + + ```json + {"roleArn": "arn...", "roleSessionName": "name"} + ``` + +## Advanced features + +### Schema Support + +The AWS SQS sink connector supports the following schema types: `Primitive Schema`, `Avro Schema`, and `JSON Schema`. + +#### Primitive Schema + +For the primitive type, the payload format is as follows: + +```JSON +{ + "value": "test-value" +} + +// or + +{ + "value": true +} + +// or + +{ + "value": 1234 +} + +// or + +{ + "value": "2023-10-17" +} + +// or + +{ + "value": "MjAyMy0xMC0xNw==" // bytes(base64-encoded) +} +``` + +The value types include: Number, Boolean, and String.
Here's a table indicating the conversion type for each Primitive +Schema Type: + +| Primitive Schema Type | JSON Conversion Type | Example | +|------------------------------------------|------------------------------------------------------|-----------------------------------------------------------| +| Boolean | Boolean | true | +| INT8, INT16, INT32, INT64, FLOAT, DOUBLE | Number | 1234 | +| STRING | String | "Hello" | +| BYTES | Base64-encoded String | "SGVsbG8=" (base64-encoded version of the string "Hello") | +| DATE, TIME, TIMESTAMP | ISO 8601 String (yyyy-MM-dd'T'HH:mm:ss.SSSXXX) | '2023-10-30T06:13:48.123+08:00' | +| LocalDate | ISO 8601 String (yyyy-MM-dd) | '2023-10-17' | +| LocalTime | ISO 8601 String (HH:mm:ss.SSSSSSSSS) | '04:30:33.123456789' | +| LocalDateTime | ISO 8601 String (yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS) | '2023-10-17T04:30:33.123456789' | +| Instant | ISO 8601 String (yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSSXXX) | '2023-10-30T06:13:48.123456789+08:00' | + +#### Struct Schema (Avro Schema and JSON Schema) + +For the struct schema types `JSON` and `AVRO`, the value is converted into a JSON object. The conversion rules outlined +in the `Primitive schema section` are applied to all primitive type fields within this value object. Nested objects are +also supported. 
+ +Here is an example: + +```JSON +{ + "stringField": "hello", + "timeField": "2023-10-17T08:22:11.263Z", + "numberField": 100, + "valueField": "test-value" +} +``` + +Here are the rules for handling the logical types of the Avro-based struct schema (`AVRO` and `JSON`): + +| Logical Type | JSON Conversion Type | Example | +|------------------------------|--------------------------------------------------|------------------------------------| +| `time-millis`, `time-micros` | ISO 8601 String (HH:mm:ss.SSS) | '13:48:41.123' | +| `timestamp-millis` | ISO 8601 String (yyyy-MM-dd'T'HH:mm:ss.SSSXXX) | '2023-10-30T06:13:48.123+08:00' | +| `timestamp-micros` | ISO 8601 String (yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX) | '2023-10-30T06:13:48.123456+08:00' | +| `local-timestamp-millis` | ISO 8601 String (yyyy-MM-dd'T'HH:mm:ss.SSS) | '2023-10-29T22:13:48.123' | +| `local-timestamp-micros` | ISO 8601 String (yyyy-MM-dd'T'HH:mm:ss.SSSSSS) | '2023-10-29T22:13:48.123456' | + + +### Metadata Support + +SQS sink connector will put metadata of Pulsar into SQS `message attributes`. SQS message attributes accommodate various data types such as String, Number, Binary, and so forth. + +The supported metadata fields of Pulsar are: + +- `topic`: The `string` type of source topic name. +- `key`: The `string` type of the message key. +- `partitionIndex`: The `number` type of the topic partition index. +- `sequence`: The `number` type of the sequence ID. +- `properties`: This is a map. The connector unfolds this map, placing each key-value pair into the SQS `message attribute`. The type of the key is `string`, and the type of the value is `string`. +- `eventTime`: The event time of the message in the [ISO 8601 format](https://www.w3.org/TR/NOTE-datetime). +- `messageId`: The string representation of a message ID. 
For example, `"1:1:-1:-1"`. + +You can get metadata from `message attributes`, for example: +```yaml +"pulsar.topic": "test-topic" +"pulsar.key": "test-key" +"pulsar.partitionIndex": 1 +"pulsar.sequence": 100 +"pulsar.properties.key1": "test-properties.value1" +"pulsar.properties.key2": "test-properties.value2" +"pulsar.eventTime": "2023-10-17T04:30:33.123456789" +"pulsar.messageId": "1:1:-1:-1" +``` + +Users can choose the metadata fields through the `metaDataField` configuration. It is a `string` in which multiple fields are separated by commas. The connector verifies that the number of metadata fields does not exceed 10. + +For example: +```yaml +config: + metaDataField: 'pulsar.topic, pulsar.key, pulsar.partitionIndex, pulsar.sequence, pulsar.properties.key1' +``` + + diff --git a/connectors/sqs-source/v3.0.6.1/sqs-source.md b/connectors/sqs-source/v3.0.7.1/sqs-source.md similarity index 99% rename from connectors/sqs-source/v3.0.6.1/sqs-source.md rename to connectors/sqs-source/v3.0.7.1/sqs-source.md index 77f2000e..b8c37f51 100644 --- a/connectors/sqs-source/v3.0.6.1/sqs-source.md +++ b/connectors/sqs-source/v3.0.7.1/sqs-source.md @@ -25,7 +25,7 @@ id: "sqs-source" The [AWS Simple Queue Service (SQS)](https://aws.amazon.com/sqs/?nc1=h_ls) source connector feeds data from Amazon AWS SQS and writes data to Pulsar topics. 
-![](https://raw.githubusercontent.com/streamnative/pulsar-io-sqs/v3.0.6.1/docs/sqs-source.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-sqs/v3.0.7.1/docs/sqs-source.png) ## Quick start diff --git a/connectors/sqs-source/v3.3.1.1/sqs-source.md b/connectors/sqs-source/v3.3.1.9/sqs-source.md similarity index 99% rename from connectors/sqs-source/v3.3.1.1/sqs-source.md rename to connectors/sqs-source/v3.3.1.9/sqs-source.md index c6536e89..11518612 100644 --- a/connectors/sqs-source/v3.3.1.1/sqs-source.md +++ b/connectors/sqs-source/v3.3.1.9/sqs-source.md @@ -25,7 +25,7 @@ id: "sqs-source" The [AWS Simple Queue Service (SQS)](https://aws.amazon.com/sqs/?nc1=h_ls) source connector feeds data from Amazon AWS SQS and writes data to Pulsar topics. -![](https://raw.githubusercontent.com/streamnative/pulsar-io-sqs/v3.3.1.1/docs/sqs-source.png) +![](https://raw.githubusercontent.com/streamnative/pulsar-io-sqs/v3.3.1.9/docs/sqs-source.png) ## Quick start diff --git a/connectors/sqs-source/v4.0.0.2/sqs-source.md b/connectors/sqs-source/v4.0.0.2/sqs-source.md new file mode 100644 index 00000000..2f85b423 --- /dev/null +++ b/connectors/sqs-source/v4.0.0.2/sqs-source.md @@ -0,0 +1,149 @@ +--- +description: The SQS source connector is used to consume messages from Amazon SQS and publish them to Pulsar. +author: StreamNative +contributors: freeznet,shibd,Anonymitaet,nlu90 +language: Java,Shell,Python,Dockerfile +document: +source: Private source +license: StreamNative, Inc.. 
All Rights Reserved +license_link: +tags: +alias: AWS SQS Source Connector +features: ["The SQS source connector is used to consume messages from Amazon SQS and publish them to Pulsar."] +icon: "/images/connectors/sqs-logo.png" +download: +support: streamnative +support_link: https://streamnative.io +support_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +owner_name: "streamnative" +owner_img: "https://avatars.githubusercontent.com/u/44651383?v=4" +dockerfile: +sn_available: "true" +id: "sqs-source" +--- + + +The [AWS Simple Queue Service (SQS)](https://aws.amazon.com/sqs/?nc1=h_ls) source connector feeds data from Amazon AWS SQS and writes data to Pulsar topics. + +![](https://raw.githubusercontent.com/streamnative/pulsar-io-sqs/v4.0.0.2/docs/sqs-source.png) + +## Quick start + +### Prerequisites + +The prerequisites for connecting an AWS SQS source connector to external systems include: + +1. Create an SQS queue in AWS. +2. Create the [AWS User](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html) and create an `AccessKey` (please record the `AccessKey` and `SecretAccessKey`). +3. Assign the following permissions to the AWS User: +- sqs:CreateQueue +- sqs:DeleteMessage +- sqs:ChangeMessageVisibility +- sqs:GetQueueUrl +- sqs:GetQueueAttributes +- sqs:ReceiveMessage + + +### 1. Create a connector + +The following command shows how to use [pulsarctl](https://github.com/streamnative/pulsarctl) to create a `builtin` connector. If you want to create a `non-builtin` connector, +you need to replace `--source-type sqs` with `--archive /path/to/pulsar-io-sqs.nar`. You can find the button to download the `nar` package at the beginning of the document. + +{% callout title="For StreamNative Cloud User" type="note" %} +If you are a StreamNative Cloud user, you need to [set up your environment](https://docs.streamnative.io/docs/connector-setup) first. 
+{% /callout %} + +```bash +pulsarctl sources create \ + --source-type sqs \ + --name sqs-source \ + --tenant public \ + --namespace default \ + --destination-topic-name "Your topic name" \ + --parallelism 1 \ + --source-config \ + '{ + "awsRegion": "Your aws sqs region", + "queueName": "Your AWS SQS name", + "awsCredentialPluginParam": "{\"accessKey\":\"Your AWS access key\",\"secretKey\":\"Your AWS secret access key\"}" + }' +``` + +The `--source-config` is the minimum necessary configuration for starting this connector, and it is a JSON string. You need to substitute the relevant parameters with your own. +If you want to configure more parameters, see [Configuration Properties](#configuration-properties) for reference. + +{% callout title="Note" type="note" %} +You can also choose to use a variety of other tools to create a connector: +- [pulsar-admin](https://pulsar.apache.org/docs/3.1.x/io-use/): The command arguments for `pulsar-admin` are similar to those of `pulsarctl`. You can find an example in the [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). +- [RestAPI](https://pulsar.apache.org/source-rest-api/?version=3.1.1): You can find an example in the [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). +- [Terraform](https://github.com/hashicorp/terraform): You can find an example in the [StreamNative Cloud Doc](https://docs.streamnative.io/docs/connector-create#create-a-built-in-connector). +- [Function Mesh](https://functionmesh.io/docs/connectors/run-connector): The docker image can be found at the beginning of the document. +{% /callout %} + +### 2. Send messages to AWS SQS +You can use the following simple code to send messages to AWS SQS. 
+``` java + public static void main(String[] args) { + AmazonSQS client = AmazonSQSClientBuilder.standard() + .withCredentials(new AWSStaticCredentialsProvider( + new BasicAWSCredentials("Your access key", "Your secret key"))) + .withRegion("Your AWS SQS region").build(); + String queueUrl = client.getQueueUrl(new GetQueueUrlRequest("Your SQS name")).getQueueUrl(); + client.sendMessage(queueUrl, "Hello World!"); + client.shutdown(); + } +``` + +### 3. Show data using Pulsar client + +{% callout title="Note" type="note" %} +If your connector is created on StreamNative Cloud, you need to authenticate your clients. See [Build applications using Pulsar clients](https://docs.streamnative.io/docs/qs-connect#jumpstart-for-beginners) for more information. +{% /callout %} + +``` shell +bin/pulsar-client \ +--url "Your Pulsar serviceUrl" \ +consume "The topic that you specified when you created the connector" -s "test-sub" -n 10 -p Earliest + +----- got message ----- +key:[null], properties:[], content:Hello World! +``` + +## Configuration Properties + +Before using the AWS SQS source connector, you need to configure it. This table outlines the properties and the +Descriptions of an AWS SQS source connector. + +| Name | Type | Required | Sensitive | Default | Description | +|----------------------------|--------|----------|-----------|--------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `awsRegion` | String | true | false | " " (empty string) | Supported AWS region. For example, us-west-1, us-west-2. | +| `queueName` | String | true | false | " " (empty string) | The name of the SQS queue that messages should be read from or written to. 
| +| `awsCredentialPluginName` | String | false | false | " " (empty string) | The fully-qualified class name of implementation of [AwsCredentialProviderPlugin](https://github.com/apache/pulsar/blob/master/pulsar-io/aws/src/main/java/org/apache/pulsar/io/aws/AwsCredentialProviderPlugin.java). For more information, see [Configure AwsCredentialProviderPlugin](#configure-awscredentialproviderplugin). | +| `awsCredentialPluginParam` | String | false | true | " " (empty string) | The JSON parameter to initialize `awsCredentialsProviderPlugin`. For more information, see [Configure AwsCredentialProviderPlugin](#configure-awscredentialproviderplugin). | +| `awsEndpoint` | String | false | false | " " (empty string) | AWS SQS end-point URL. You can find it at [AWS SQS Service endpoints](https://docs.aws.amazon.com/general/latest/gr/sqs-service.html#sqs_region). | +| `batchSizeOfOnceReceive` | int | false | false | 1 | The maximum number of messages that are pulled from SQS at one time. By default, it is set to 1. The value ranges from 1 to 10. | +| `numberOfConsumers` | int | false | false | 1 | The expected number of consumers. You can scale consumers horizontally to achieve high throughput. By default, it is set to 1. The value ranges from 1 to 50. | +| `sourceQueueSize` | int | false | false | 10000 | The size of the queue that holds the messages received from SQS. By default, it is set to 10000. The value cannot be smaller than 1. | + +{% callout title="Note" type="note" %} +The `batchSizeOfOnceReceive` and `numberOfConsumers` options are available for SQS source 2.8.4.3+, 2.9.4.1+, and 2.10.1.13+. For details about how to test AWS SQS source performance, see [Performance Test on AWS SQS Source Connector](https://github.com/streamnative/pulsar-io-sqs/blob/master/docs/source_performance_test.md). 
+{% /callout %} + +### Configure AwsCredentialProviderPlugin + +The AWS SQS source connector supports three ways to authenticate with AWS SQS, selected by configuring `awsCredentialPluginName`. + +- Leave `awsCredentialPluginName` empty to get the connector authenticated by passing `accessKey` and `secretKey` in `awsCredentialPluginParam`. + + ```json + {"accessKey":"Your access key","secretKey":"Your secret key"} + ``` + +- Set `awsCredentialPluginName` to `org.apache.pulsar.io.aws.AwsDefaultProviderChainPlugin` to use the default AWS provider chain. With this option, you don't need to configure `awsCredentialPluginParam`. For more information, see [AWS documentation](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default). + +- Set `awsCredentialPluginName` to `org.apache.pulsar.io.aws.STSAssumeRoleProviderPlugin` to use the [default AWS provider chain](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default), and you need to configure `roleArn` and `roleSessionName` in `awsCredentialPluginParam`. For more information, see [AWS documentation](https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRole.html). + + ```json + {"roleArn": "arn...", "roleSessionName": "name"} + ``` +