# ---- SAMPLE YAML OF JOB CONFIG FILE ---- #
# !MANDATORY! Metrics and metric directories to be executed
metrics:
  - /path/to/metric-1
  - /path/to/metric-2
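# For reference, each metric file listed above is itself a YAML file of steps and outputs.
# The sketch below is illustrative only (step names, SQL, and paths are assumptions, not part of this job config):
# steps:
#   - dataFrameName: enriched_data
#     sql: SELECT id, name FROM input_1
# output:
#   - dataFrameName: enriched_data
#     outputType: Parquet
#     outputOptions:
#       saveMode: Overwrite
#       path: enriched_data.parquet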
# Input configuration
inputs:
  input_1:
    file:
      path: parquet/input_1.parquet
  input_2:
    file:
      # The path of the file; you can specify multiple files separated by commas
      path: json/input_2.csv
      # Optional, if omitted the format is guessed from the extension (falling back to parquet)
      format: csv
      # Optional, define a custom schema via a JSON schema file (https://json-schema.org/)
      schemaPath: schema/schema.json
      # Optional, pass any Spark-supported option to the reader
      options:
        quoteAll: false
      # Optional, define a stream reader that can be used to read streaming data
      isStream: true
  input_3:
    file_date_range:
      template: parquet/%s/input_1.parquet
      date_range:
        format: yyyy/MM/dd
        startDate: 2017/09/01
        endDate: 2017/09/03
      # Below are optional (check out the file input example above)
      format: parquet
      schemaPath: schema/schema.json
      options:
        opt: val
  input_4:
    jdbc:
      connectionUrl: jdbc:mysql://localhost/db?zeroDateTimeBehavior=convertToNull
      user: user
      password: pass
      dbTable: some_table
      preActions: |
        DROP TABLE IF EXISTS some_table;
        CREATE TABLE some_table AS SELECT * FROM other_table;
      # You can optionally add here any supported option from https://spark.apache.org/docs/latest/sql-programming-guide.html#jdbc-to-other-databases
      options:
        numPartitions: 100
        driver: com.mysql.jdbc.Driver
  input_5:
    kafka:
      servers:
        - localhost:9092
      topic: some_topic
      schemaRegistryUrl: https://schema-registry-url # optional
      schemaSubject: subject # optional
      # Add any other options supported by the DataStreamReader/Kafka consumer
      options:
        kafka.max.request.size: "30000000"
        opt: val
  input_6:
    cassandra:
      host: 127.0.0.1
      user: user
      password: password
      table: table
      keySpace: keySpace
      options:
  input_7:
    elasticsearch:
      nodes: localhost:9200
      user: user
      password: password
      index: index
  input_8:
    catalog:
      tableName: |
        (
          SELECT * FROM some_table
        )
      preActions: |
        CREATE TEMPORARY VIEW some_table AS SELECT 'some_id' AS id, 'some_name' AS name
  input_9:
    mongo:
      uri: mongodb://localhost:27017
      database: test
      collection: users
      options:
        partitioner: com.mongodb.spark.sql.connector.read.partitioner.PaginateIntoPartitionsPartitioner
        aggregation.pipeline: |
          {
            $match: {
              $and: [
                {"datetime": {"$gte": {"$date": "2023-01-01T00:00:00.000Z" } } },
                {"datetime": {"$lt": {"$date": "2024-01-01T00:00:00.000Z" } } }
              ]
            }
          }
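# Each input above is registered under its key name (input_1, input_2, ...) and can be queried
# from a metric's SQL, e.g. (illustrative only): SELECT COUNT(*) FROM input_1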
# Set custom variables that will be accessible from the SQL
variables:
  StartDate: 2017/09/01
  EndDate: 2017/09/20
  TrimmedDateFormat: yyyy/MM/dd
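# Variables are substituted into metric SQL; a hypothetical usage (not part of this sample):
# sql: SELECT * FROM input_1 WHERE date BETWEEN '${StartDate}' AND '${EndDate}'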
output:
  # Elasticsearch database arguments: specifying the host (host:port) under the nodes option is mandatory
  elasticsearch:
    nodes: localhost:9200
    user: user
    password: password
  # Cassandra database arguments: host is mandatory; username and password are supported
  cassandra:
    host: example.cassandra.db
    username: user
    password: password
  # Redshift database arguments: jdbcURL and tempS3Dir are mandatory
  redshift:
    jdbcURL: jdbc:redshift://<IP>:<PORT>/file?user=username&password=pass
    tempS3Dir: s3://path/to/redshift/temp/dir/
    awsIAMRole: <your-aws-role-arn>
  # Redis database arguments: host is mandatory; port, auth and db are supported
  redis:
    host: hostname
    port: port-number
    auth: authentication
    db: database
  # Segment API key
  segment:
    apiKey: apikey
  # Output file directory
  file:
    dir: /path/to/parquet/output
  # JDBC database
  jdbc:
    connectionUrl: "jdbc:postgresql://localhost:5432/databasename"
    user: username
    password: password
    driver: "org.postgresql.Driver"
  # Apache Hudi
  hudi:
    dir: /path/to/parquet/output
    # Optional: controls the parallelism of Hudi writes (should be similar to the number of shuffle partitions; default is 1500)
    parallelism: 1
    # Optional: upsert/insert/bulkinsert (default is upsert)
    operation: upsert
    # Optional: COPY_ON_WRITE/MERGE_ON_READ (default is COPY_ON_WRITE)
    storageType: COPY_ON_WRITE
    # Optional: maximum number of versions to retain
    maxVersions: 1
    # Optional: Hive database to use when writing (default is default)
    hiveDB: default
    # Hive server URL
    hiveJDBCURL: jdbc:hive2://hive:10000
    # Optional: credentials for Hive
    hiveUserName: root
    hivePassword: pass
    # Optional: toggle Hudi Hive sync
    hiveSync: false
    # Optional: let metorikku take control of the Hive sync process (used to support Hive 1)
    manualHiveSync: true
    # Optional: when manualHiveSync is enabled, you need to define your partitions manually here
    manualHiveSyncPartitions:
      part: 0
    # Optional: extra options (http://hudi.incubator.apache.org/configurations.html)
    options:
      property_a: value_a
      property_b: value_b
  mongodb:
    uri: mongodb://localhost:27017
    options:
      ssl: true
      ssl.domain_match: false
  # Delta. WARNING: you must define a catalog with the Delta type
  delta:
    dir: /path/to/parquet/output
    # Optional: max records per parquet file
    maxRecordsPerFile: 1000
    # Optional: controls whether manifest files are created for Trino / Athena compatibility
    generateManifest: true
    # Optional: extra options (https://docs.delta.io/latest/delta-batch.html)
    options:
      property_a: value_a
      property_b: value_b
  # Iceberg. WARNING: you must define a catalog with the Iceberg type
  iceberg:
    dir: /path/to/parquet/output # Use when saveMode = Create and the Iceberg type != hadoop
    # Optional: file format to use for this write operation; parquet, avro, or orc
    writeFormat: orc
    # Optional: desired isolation level for DataFrame overwrite operations
    isolationLevel: serializable
    # Optional: overrides this table's write.target-file-size-bytes
    targetFileSizeBytes: 10000000
    # Optional: extra options (https://iceberg.apache.org/docs/latest/spark-configuration/#write-options)
    options:
      property_a: value_a
      property_b: value_b
# You can also use named outputs (all outputs above are supported)
outputs:
  fileDir1:
    file:
      dir: /path/to/parquet/output
  fileDir2:
    file:
      dir: /path/to/parquet/output2
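# A metric's output entry can then target one of these named outputs; a rough sketch only
# (the field names below follow the metric file format and are assumptions, not defined in this job config):
# output:
#   - dataFrameName: enriched_data
#     outputType: File
#     name: fileDir1
#     outputOptions:
#       format: parquet
#       path: enriched_data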
# If set to true, triggers Explain before saving
explain: true
# Shows a preview of the output (number of lines to show)
showPreviewLines: 42
# Prints the query after running it
showQuery: true
# Caches the step before each preview
cacheOnPreview: true
# Set log level: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
logLevel: WARN
# Set the application name to add an app name prefix to Spark instrumentation counters
appName: appName
# Set the instrumentation writer (default is Spark metrics)
instrumentation:
  influxdb:
    url: http://localhost:8086
    username: username
    password: password
    dbName: test
# Optionally set catalog parameters (for Hive support)
catalog:
  type: some_type
  enableHive: true
  # Extra Hadoop configuration (without the "spark.hadoop." prefix)
  hadoopConfig:
    opt: val
  options:
    opt: val
  database: some_database
  # If planning to use the Delta format + AWS Glue:
  # type: Delta
  # enableHive: true
  # hadoopConfig:
  #   hive.metastore.client.factory.class: "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
  #   # If planning to use another Glue Catalog
  #   hive.metastore.glue.catalogid: "123456"
  # If planning to use the Iceberg format + AWS Glue (see https://iceberg.apache.org/docs/latest/configuration/#catalog-properties):
  # type: Iceberg
  # enableHive: true
  # hadoopConfig:
  #   hive.metastore.client.factory.class: "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
  #   # If planning to use another Glue Catalog
  #   hive.metastore.glue.catalogid: "123456"
  # options:
  #   catalog-impl: org.apache.iceberg.aws.glue.GlueCatalog
  #   io-impl: org.apache.iceberg.aws.s3.S3FileIO
  #   lock-impl: org.apache.iceberg.aws.dynamodb.DynamoDbLockManager
  #   lock.table: "some-dynamodb-table-name"
  # If planning to use the Iceberg format locally:
  # type: Iceberg
  # options:
  #   type: hadoop
  #   warehouse: /tmp/hadoop_warehouse
# Set options for streaming writes
streaming:
  # Set the trigger mode (ProcessingTime, Once, Continuous)
  triggerMode: ProcessingTime
  # If the trigger is ProcessingTime/Continuous, set the trigger duration
  triggerDuration: 10 seconds
  # Possible values are append/update/complete
  outputMode: append
  # Where to save Spark's checkpoint
  checkpointLocation: /tmp/checkpoint
  # Optionally set streaming to use foreachBatch when writing streams. This enables using all available writers and writing to multiple outputs.
  batchMode: true
  # Add any other options supported by the DataStreamWriter
  extraOptions:
    opt: val
# Optional: controls caching and counting on each output (default is true)
cacheCountOnOutput: false
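# A job config like this one is typically passed to metorikku via spark-submit; a minimal sketch
# (jar path and config filename are placeholders):
# spark-submit --class com.yotpo.metorikku.Metorikku metorikku.jar -c job_config_sample.yaml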