Skip to content

Commit

Permalink
Merge pull request #359 from datayoga-io/358-jobs-should-be-scaffolded-as-dyyamldyyml
Browse files Browse the repository at this point in the history

Changed jobs to *.dy.yaml
  • Loading branch information
spicy-sauce authored Mar 13, 2024
2 parents 453750a + 9755eb7 commit c702877
Show file tree
Hide file tree
Showing 36 changed files with 677 additions and 669 deletions.
1 change: 1 addition & 0 deletions .github/workflows/generate-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ jobs:
run: |
git config user.name github-actions
git config user.email [email protected]
git pull
git add .
git diff --cached --exit-code || git commit -m "update autogenerated docs"
git push
6 changes: 4 additions & 2 deletions .github/workflows/generate-jsonschema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,5 +48,7 @@ jobs:
git config user.name github-actions
git config user.email [email protected]
git add .
git diff --cached --exit-code || git commit -m "update json schemas"
git push --force
if ! git diff --cached --exit-code; then
git commit -m "update json schemas"
git push
fi
2 changes: 1 addition & 1 deletion cli/src/datayoga/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def run(
sys.exit(1)

# validate the job
job_file = path.join(directory, "jobs", job_name.replace(".", os.sep) + ".yaml")
job_file = path.join(directory, "jobs", job_name.replace(".", os.sep) + ".dy.yaml")
try:
job_settings = utils.read_yaml(job_file)
logger.debug(f"job_settings: {job_settings}")
Expand Down
4 changes: 2 additions & 2 deletions core/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ This demonstrates how to transform data using a DataYoga job.

### Create a Job

Use this `example.yaml`:
Use this `example.dy.yaml`:

```yaml
steps:
Expand Down Expand Up @@ -65,7 +65,7 @@ from datayoga_core.job import Job
from datayoga_core.result import Result, Status
from datayoga_core.utils import read_yaml
job_settings = read_yaml("example.yaml")
job_settings = read_yaml("example.dy.yaml")
job = dy.compile(job_settings)
assert job.transform([{"fname": "jane", "lname": "smith", "country_code": 1, "country_name": "usa", "credit_card": "1234-5678-0000-9999", "gender": "F"}]).processed == [
Expand Down
37 changes: 17 additions & 20 deletions core/src/datayoga_core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,33 +11,29 @@
logger = logging.getLogger("dy")


def compile(
job_settings: Dict[str, Any],
whitelisted_blocks: Optional[List[str]] = None) -> Job:
"""
Compiles a job in YAML
def compile(job_settings: Dict[str, Any], whitelisted_blocks: Optional[List[str]] = None) -> Job:
"""Compiles a job in YAML.
Args:
job_settings (Dict[str, Any]): Job settings
job_settings (Dict[str, Any]): Job settings.
whitelisted_blocks: (Optional[List[str]], optional): Whitelisted blocks. Defaults to None.
Returns:
Job: Compiled job
Job: Compiled job.
"""
logger.debug("Compiling job")
return Job.compile(job_settings, whitelisted_blocks)


def validate(job_settings: Dict[str, Any], whitelisted_blocks: Optional[List[str]] = None):
"""
Validates a job in YAML
"""Validates a job in YAML.
Args:
job_settings (Dict[str, Any]): Job settings
job_settings (Dict[str, Any]): Job settings.
whitelisted_blocks: (Optional[List[str]], optional): Whitelisted blocks. Defaults to None.
Raises:
ValueError: When the job is invalid
ValueError: When the job is invalid.
"""
logger.debug("Validating job")
try:
Expand All @@ -49,21 +45,22 @@ def validate(job_settings: Dict[str, Any], whitelisted_blocks: Optional[List[str
raise ValueError(e)


def transform(job_settings: Dict[str, Any],
data: List[Dict[str, Any]],
context: Optional[Context] = None,
whitelisted_blocks: Optional[List[str]] = None) -> JobResult:
"""
Transforms data against a certain job
def transform(
job_settings: Dict[str, Any],
data: List[Dict[str, Any]],
context: Optional[Context] = None,
whitelisted_blocks: Optional[List[str]] = None
) -> JobResult:
"""Transforms data against a certain job.
Args:
job_settings (Dict[str, Any]): Job settings
data (List[Dict[str, Any]]): Data to transform
job_settings (Dict[str, Any]): Job settings.
data (List[Dict[str, Any]]): Data to transform.
context (Optional[Context]): Context. Defaults to None.
whitelisted_blocks: (Optional[List[str]]): Whitelisted blocks. Defaults to None.
Returns:
JobResult: Job result
JobResult: Job result.
"""
job = compile(job_settings, whitelisted_blocks)
job.init(context)
Expand Down
4 changes: 2 additions & 2 deletions core/src/datayoga_core/resources/scaffold/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@
├── connections.yaml
└── jobs
└── sample
└── hello.yaml
└── hello.dy.yaml
```

- `.gitignore`: For convenience, this is used to ignore the data folder.
- `data`: Folder to store data input files or output. This folder can be located anywhere as long as the runner has access to it.
- `connections.yaml`: Contains definitions of source and target connectors and other general settings.
- `jobs`: Source job YAMLs. These can be nested and referenced as modules using a dot notation. e.g. `jobs/sample/hello.yaml` is referenced as `sample.hello` when running the job.
- `jobs`: Source job YAMLs. These can be nested and referenced as modules using a dot notation. e.g. `jobs/sample/hello.dy.yaml` is referenced as `sample.hello` when running the job.

## Run a Job

Expand Down
12 changes: 6 additions & 6 deletions docs/creating-jobs.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ nav_order: 5

# Creating Jobs

Jobs are created by creating yaml files in the `jobs` folder. Each job is composed of several `steps` that activate `blocks`. A `block` defines the business logic of an action. For example, a `block` can write to a Kafka stream, can read from a cloud API, can transform structure, or enrich a message with external data. A `steps` activates the `block` with a set of parameters.
Jobs are created by creating `dy.yaml` files in the `jobs` folder. Each job is composed of several `steps` that activate `blocks`. A `block` defines the business logic of an action. For example, a `block` can write to a Kafka stream, can read from a cloud API, can transform structure, or enrich a message with external data. A `step` activates the `block` with a set of parameters.

## Overview of the Job Yaml Structure
## Overview of the Job YAML Structure

Each Job must start with a block that either produces data or accepts data from external sources.
The subsequent blocks each receive the output of the previous step as an input. The data will be streamed through these blocks as data flows through the chain.
Expand All @@ -33,7 +33,7 @@ It supports both async processing, multi-threading, and multi-processing to enab
To deploy a job to the DataYoga Runner, use the DataYoga CLI.

```bash
datayoga run jobname.yaml
datayoga run jobname
```

## Tutorial - a Job that Reads from Redis and Writes to Postgres
Expand All @@ -58,7 +58,7 @@ docker run -p 5432:5432 -e POSTGRES_PASSWORD=mysecretpassword -d postgres

DataYoga manages connections in a special file named `connections.yaml`. Each connection is defined with a logical name and can define properties needed for the connection. Reference to environment variables, interpolation, and secrets is available.

Add the connections to Redis and Postgres above to the connections.yaml:
Add the connections to Redis and Postgres above to the `connections.yaml`:

```bash
cat << EOF > connections.yaml
Expand All @@ -79,7 +79,7 @@ EOF
### Create the Job

```bash
cat << EOF > redis_to_pg.yaml
cat << EOF > redis_to_pg.dy.yaml
steps:
- uses: redis.read_stream
with:
Expand Down Expand Up @@ -114,5 +114,5 @@ EOF
### Run the Job in the DataYoga Runner

```bash
datayoga run redis_to_pg.yaml
datayoga run redis_to_pg
```
4 changes: 2 additions & 2 deletions docs/directory-structure.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ The `datayoga init` command produces the following directory structure:
├── connections.yaml
└── jobs
└── sample
└── hello.yaml
└── hello.dy.yaml
```

- `.gitignore`: For convenience, this is used to ignore the data folder.
- `data`: Folder to store data input files or output. This folder can be located anywhere as long as the runner has access to it.
- `connections.yaml`: Contains definitions of source and target connectors and other general settings.
- `jobs`: Source job YAMLs. These can be nested and referenced as modules using a dot notation. e.g. `jobs/sample/hello.yaml` is referenced as `sample.hello` when running the job.
- `jobs`: Source job YAMLs. These can be nested and referenced as modules using a dot notation. e.g. `jobs/sample/hello.dy.yaml` is referenced as `sample.hello` when running the job.
2 changes: 1 addition & 1 deletion docs/library.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ import datayoga_core as dy
from datayoga_core.job import Job
from datayoga_core.utils import read_yaml
job_settings = read_yaml("example.yaml")
job_settings = read_yaml("example.dy.yaml")
job = dy.compile(job_settings)
assert job.transform({"fname": "jane", "lname": "smith", "country_code": 1, "country_name": "usa", "credit_card": "1234-5678-0000-9999", "gender": "F"})[0] == {"first_name": "jane", "last_name": "smith", "country": "1 - USA", "full_name": "jane smith", "greeting": "Hello Ms. jane smith"}
Expand Down
File renamed without changes.
27 changes: 27 additions & 0 deletions examples/test_csv_producer.dy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
input:
uses: files.read_csv
with:
file: examples/test.csv
steps:
- uses: add_field
with:
fields:
- field: full_name
language: jmespath
expression: concat([fname, ' ' , lname])
- uses: map
with:
expression:
{
first_name: fname,
last_name: lname,
country: country_code || ' - ' || UPPER(country_name),
full_name: full_name,
greeting: "'Hello ' || CASE WHEN gender = 'F' THEN 'Ms.' WHEN gender = 'M' THEN 'Mr.' ELSE 'N/A' END || ' ' || full_name"
}
language: sql
- uses: relational.write
with:
connection: hr
schema: hr
table: emp
25 changes: 0 additions & 25 deletions examples/test_csv_producer.yaml

This file was deleted.

28 changes: 28 additions & 0 deletions examples/test_redis_producer.dy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
input:
uses: redis.read_stream
with:
connection: cache
stream_name: emp
steps:
- uses: add_field
with:
fields:
- field: full_name
language: jmespath
expression: concat([fname, ' ' , lname])
- uses: map
with:
expression:
{
first_name: fname,
last_name: lname,
country: country_code || ' - ' || UPPER(country_name),
full_name: full_name,
greeting: "'Hello ' || CASE WHEN gender = 'F' THEN 'Ms.' WHEN gender = 'M' THEN 'Mr.' ELSE 'N/A' END || ' ' || full_name"
}
language: sql
- uses: relational.write
with:
connection: hr
schema: hr
table: emp
26 changes: 0 additions & 26 deletions examples/test_redis_producer.yaml

This file was deleted.

10 changes: 7 additions & 3 deletions integration-tests/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def execute_program(command: str, background: bool = False) -> Optional[Popen]:

def wait_program(process: Popen, sig: Optional[int] = signal.SIGTERM, ignore_errors: bool = False):
"""Waits for a child program to finish and logs its output.
Sends a signal to the process if it set
Sends a signal to the process if it is set.
Args:
process (Popen): Process to kill.
Expand All @@ -57,8 +57,12 @@ def wait_program(process: Popen, sig: Optional[int] = signal.SIGTERM, ignore_err
raise ValueError("command failed")


def run_job(job_name: str, piped_from: Optional[str] = None, piped_to: Optional[str] = None,
background: bool = False) -> Optional[Popen]:
def run_job(
job_name: str,
piped_from: Optional[str] = None,
piped_to: Optional[str] = None,
background: bool = False
) -> Optional[Popen]:
"""Runs a job using the `datayoga` command-line tool.
Args:
Expand Down
Loading

0 comments on commit c702877

Please sign in to comment.