# ---- SAMPLE YAML OF JOB CONFIG FILE ---- #
# !MANDATORY! Metrics and metric directories to be executed
metrics:
  - /path/to/metric-1
  - /path/to/metric-2
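# For reference, each metric file listed above is itself a YAML file of steps and outputs.
# The sketch below is illustrative only (step names, SQL, and paths are assumptions, not part of this job config):
# steps:
#   - dataFrameName: enriched_data
#     sql: SELECT id, name FROM input_1
# output:
#   - dataFrameName: enriched_data
#     outputType: Parquet
#     outputOptions:
#       saveMode: Overwrite
#       path: enriched_data.parquet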
# Input configuration
inputs:
  input_1:
    file:
      path: parquet/input_1.parquet
  input_2:
    file:
      # The path of the file; you can specify multiple files separated by commas
      path: json/input_2.csv
      # Optional, if omitted the format is guessed from the extension (falling back to parquet)
      format: csv
      # Optional, define a custom schema via a JSON schema file (https://json-schema.org/)
      schemaPath: schema/schema.json
      # Optional, pass any Spark-supported option to the reader
      options:
        quoteAll: false
      # Optional, define a stream reader that can be used to read streaming data
      isStream: true
  input_3:
    file_date_range:
      template: parquet/%s/input_1.parquet
      date_range:
        format: yyyy/MM/dd
        startDate: 2017/09/01
        endDate: 2017/09/03
      # Below are optional (check out the file input example above)
      format: parquet
      schemaPath: schema/schema.json
      options:
        opt: val
  input_4:
    jdbc:
      connectionUrl: jdbc:mysql://localhost/db?zeroDateTimeBehavior=convertToNull
      user: user
      password: pass
      dbTable: some_table
      preActions: |
        DROP TABLE IF EXISTS some_table;
        CREATE TABLE some_table AS SELECT * FROM other_table;
      # You can optionally add here any supported option from https://spark.apache.org/docs/latest/sql-programming-guide.html#jdbc-to-other-databases
      options:
        numPartitions: 100
        driver: com.mysql.jdbc.Driver
  input_5:
    kafka:
      servers:
        - localhost:9092
      topic: some_topic
      schemaRegistryUrl: https://schema-registry-url # optional
      schemaSubject: subject # optional
      # Add any other options supported by the DataStreamReader/Kafka consumer
      options:
        kafka.max.request.size: "30000000"
        opt: val
  input_6:
    cassandra:
      host: 127.0.0.1
      user: user
      password: password
      table: table
      keySpace: keySpace
      options:
  input_7:
    elasticsearch:
      nodes: localhost:9200
      user: user
      password: password
      index: index
  input_8:
    catalog:
      tableName: |
        (
          SELECT * FROM some_table
        )
      preActions: |
        CREATE TEMPORARY VIEW some_table AS SELECT 'some_id' AS id, 'some_name' AS name
  input_9:
    mongo:
      uri: mongodb://localhost:27017
      database: test
      collection: users
      options:
        partitioner: com.mongodb.spark.sql.connector.read.partitioner.PaginateIntoPartitionsPartitioner
        aggregation.pipeline: |
          {
            $match: {
              $and: [
                {"datetime": {"$gte": {"$date": "2023-01-01T00:00:00.000Z" } } },
                {"datetime": {"$lt": {"$date": "2024-01-01T00:00:00.000Z" } } }
              ]
            }
          }
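# Each input above is registered under its key name (input_1, input_2, ...) and can be queried
# from a metric's SQL, e.g. (illustrative only): SELECT COUNT(*) FROM input_1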
# Set custom variables that will be accessible from the SQL
variables:
  StartDate: 2017/09/01
  EndDate: 2017/09/20
  TrimmedDateFormat: yyyy/MM/dd
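# Variables are substituted into metric SQL; a hypothetical usage (not part of this sample):
# sql: SELECT * FROM input_1 WHERE date BETWEEN '${StartDate}' AND '${EndDate}'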
output:
  # Elasticsearch database arguments: specifying the host (host:port) under the nodes option is mandatory
  elasticsearch:
    nodes: localhost:9200
    user: user
    password: password
  # Cassandra database arguments: host is mandatory; username and password are supported
  cassandra:
    host: example.cassandra.db
    username: user
    password: password
  # Redshift database arguments: jdbcURL and tempS3Dir are mandatory
  redshift:
    jdbcURL: jdbc:redshift://<IP>:<PORT>/file?user=username&password=pass
    tempS3Dir: s3://path/to/redshift/temp/dir/
    awsIAMRole: <your-aws-role-arn>
  # Redis database arguments: host is mandatory; port, auth and db are supported
  redis:
    host: hostname
    port: port-number
    auth: authentication
    db: database
  # Segment API key
  segment:
    apiKey: apikey
  # Output file directory
  file:
    dir: /path/to/parquet/output
  # JDBC database
  jdbc:
    connectionUrl: "jdbc:postgresql://localhost:5432/databasename"
    user: username
    password: password
    driver: "org.postgresql.Driver"
  # Apache Hudi
  hudi:
    dir: /path/to/parquet/output
    # Optional: controls the parallelism of Hudi writes (should be similar to the number of shuffle partitions; default is 1500)
    parallelism: 1
    # Optional: upsert/insert/bulkinsert (default is upsert)
    operation: upsert
    # Optional: COPY_ON_WRITE/MERGE_ON_READ (default is COPY_ON_WRITE)
    storageType: COPY_ON_WRITE
    # Optional: maximum number of versions to retain
    maxVersions: 1
    # Optional: Hive database to use when writing (default is default)
    hiveDB: default
    # Hive server URL
    hiveJDBCURL: jdbc:hive2://hive:10000
    # Optional: credentials for Hive
    hiveUserName: root
    hivePassword: pass
    # Optional: toggle Hudi Hive sync
    hiveSync: false
    # Optional: let metorikku take control of the Hive sync process (used to support Hive 1)
    manualHiveSync: true
    # Optional: when manualHiveSync is enabled, you need to define your partitions manually here
    manualHiveSyncPartitions:
      part: 0
    # Optional: extra options (http://hudi.incubator.apache.org/configurations.html)
    options:
      property_a: value_a
      property_b: value_b
  mongodb:
    uri: mongodb://localhost:27017
    options:
      ssl: true
      ssl.domain_match: false
  # Delta. WARNING: you must define a catalog with the Delta type
  delta:
    dir: /path/to/parquet/output
    # Optional: max records per parquet file
    maxRecordsPerFile: 1000
    # Optional: controls whether manifest files are created for Trino / Athena compatibility
    generateManifest: true
    # Optional: extra options (https://docs.delta.io/latest/delta-batch.html)
    options:
      property_a: value_a
      property_b: value_b
  # Iceberg. WARNING: you must define a catalog with the Iceberg type
  iceberg:
    dir: /path/to/parquet/output # Use when saveMode = Create and the Iceberg type != hadoop
    # Optional: file format to use for this write operation; parquet, avro, or orc
    writeFormat: orc
    # Optional: desired isolation level for DataFrame overwrite operations
    isolationLevel: serializable
    # Optional: overrides this table's write.target-file-size-bytes
    targetFileSizeBytes: 10000000
    # Optional: extra options (https://iceberg.apache.org/docs/latest/spark-configuration/#write-options)
    options:
      property_a: value_a
      property_b: value_b
# You can also use named outputs (all outputs above are supported)
outputs:
  fileDir1:
    file:
      dir: /path/to/parquet/output
  fileDir2:
    file:
      dir: /path/to/parquet/output2
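# A metric's output entry can then target one of these named outputs; a rough sketch only
# (the field names below follow the metric file format and are assumptions, not defined in this job config):
# output:
#   - dataFrameName: enriched_data
#     outputType: File
#     name: fileDir1
#     outputOptions:
#       format: parquet
#       path: enriched_data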
# If set to true, triggers Explain before saving
explain: true
# Shows a preview of the output (number of lines to show)
showPreviewLines: 42
# Prints the query after running it
showQuery: true
# Caches the step before each preview
cacheOnPreview: true
# Set log level: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
logLevel: WARN
# Set the application name to add an app name prefix to Spark instrumentation counters
appName: appName
# Set the instrumentation writer (default is Spark metrics)
instrumentation:
  influxdb:
    url: http://localhost:8086
    username: username
    password: password
    dbName: test
# Optionally set catalog parameters (for Hive support)
catalog:
  type: some_type
  enableHive: true
  # Extra Hadoop configuration (without the "spark.hadoop." prefix)
  hadoopConfig:
    opt: val
  options:
    opt: val
  database: some_database
  # If planning to use the Delta format + AWS Glue:
  # type: Delta
  # enableHive: true
  # hadoopConfig:
  #   hive.metastore.client.factory.class: "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
  #   # If planning to use another Glue Catalog
  #   hive.metastore.glue.catalogid: "123456"
  # If planning to use the Iceberg format + AWS Glue (see https://iceberg.apache.org/docs/latest/configuration/#catalog-properties):
  # type: Iceberg
  # enableHive: true
  # hadoopConfig:
  #   hive.metastore.client.factory.class: "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
  #   # If planning to use another Glue Catalog
  #   hive.metastore.glue.catalogid: "123456"
  # options:
  #   catalog-impl: org.apache.iceberg.aws.glue.GlueCatalog
  #   io-impl: org.apache.iceberg.aws.s3.S3FileIO
  #   lock-impl: org.apache.iceberg.aws.dynamodb.DynamoDbLockManager
  #   lock.table: "some-dynamodb-table-name"
  # If planning to use the Iceberg format locally:
  # type: Iceberg
  # options:
  #   type: hadoop
  #   warehouse: /tmp/hadoop_warehouse
# Set options for streaming writes
streaming:
  # Set the trigger mode (ProcessingTime, Once, Continuous)
  triggerMode: ProcessingTime
  # If the trigger is ProcessingTime/Continuous, set the trigger duration
  triggerDuration: 10 seconds
  # Possible values are append/update/complete
  outputMode: append
  # Where to save Spark's checkpoint
  checkpointLocation: /tmp/checkpoint
  # Optionally set streaming to use foreachBatch when writing streams. This enables using all available writers and writing to multiple outputs.
  batchMode: true
  # Add any other options supported by the DataStreamWriter
  extraOptions:
    opt: val
# Optional: controls caching and counting on each output (default is true)
cacheCountOnOutput: false
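# A job config like this one is typically passed to metorikku via spark-submit; a minimal sketch
# (jar path and config filename are placeholders):
# spark-submit --class com.yotpo.metorikku.Metorikku metorikku.jar -c job_config_sample.yaml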