Kunaljubce/add decorators for functions #253

Closed
Changes from all commits
45 commits
e642b86
Adding decoratory factory to validate_schema to make it work both as …
kunaljubce Mar 29, 2024
3188b54
Fix to execute the validation when func is called and replaced the ol…
kunaljubce Mar 29, 2024
a353a69
Changes to tests to conform to new validate_schema definition
kunaljubce Mar 29, 2024
85cc47a
Updating README description for validate_schema
kunaljubce Mar 29, 2024
e38ba8e
README fix
kunaljubce Mar 29, 2024
e58ccdd
Improved documentation in README
kunaljubce Mar 29, 2024
fe843b5
Added success msg to be printed in case df schema matches the require…
kunaljubce Mar 29, 2024
a964e16
Added a uncommitted directory for developers to store their scripts o…
kunaljubce Mar 29, 2024
c856f79
Minor README documentation update
kunaljubce Mar 29, 2024
151dcc2
Moved uncommitted folder
kunaljubce Mar 29, 2024
b9a6d08
Removing uncommitted dir
kunaljubce Mar 29, 2024
7c8ae16
update column extension function names and desc in readme
Jul 12, 2024
e9f8948
Merge pull request #240 from fatemetardasti96/main
SemyonSinchenko Jul 12, 2024
21d87e5
Static type error fixes
kunaljubce Jul 16, 2024
7ab9a42
Resolved merge conflicts
kunaljubce Jul 16, 2024
1d33b91
Changed _df param name to df_to_be_validated and associated tests cha…
kunaljubce Jul 16, 2024
2fca007
README changes for _df change
kunaljubce Jul 16, 2024
c4cc8af
Remove the print_athena_create_table function
nijanthanvijayakumar Jul 10, 2024
5faadae
Remove deprecated functions exists and forall
nijanthanvijayakumar Jul 8, 2024
0823158
Remove imported and unused Callable module to avoid ruff lint failure
nijanthanvijayakumar Jul 8, 2024
a9b040f
Drop Spark-2 support and update dependencies
SemyonSinchenko Jul 14, 2024
60c7fb7
Update linting CI
SemyonSinchenko Jul 14, 2024
5b545ef
Fix typo in CI
SemyonSinchenko Jul 14, 2024
e505a21
Fix failed tests
SemyonSinchenko Jul 14, 2024
7802545
Updates from review
SemyonSinchenko Jul 14, 2024
e6ee244
Create the first-version of files for Spark-Connect tests
nijanthanvijayakumar Jul 14, 2024
dbd3f66
Address the fixtures issue in the test file
nijanthanvijayakumar Jul 15, 2024
3ef4219
Update the CI workflow to initiate the sparkconnect test on the 1.0
nijanthanvijayakumar Jul 15, 2024
b1573b4
Update the poetry & pyproject with the dependencies for Spark-Connect
nijanthanvijayakumar Jul 15, 2024
fc85013
Update the CI workflow to run Spark-Connect tests only for v3.4+
nijanthanvijayakumar Jul 15, 2024
3e8776a
Update the script to check if Spark-Connect server is running or not
nijanthanvijayakumar Jul 15, 2024
8f76b0c
Remove the spark-connect server run check
nijanthanvijayakumar Jul 15, 2024
0fb197e
Update workflows & pytest to choose the Sparksession instance based o…
nijanthanvijayakumar Jul 15, 2024
b413920
Add a TODO statement so that the spark-connect server check can be ad…
nijanthanvijayakumar Jul 15, 2024
f3cf717
Remove the 1.0 planning branch for the CI file
nijanthanvijayakumar Jul 15, 2024
0ab7493
Attribute the original script that inspired this
nijanthanvijayakumar Jul 15, 2024
3c669fc
Mark recently added deps as optional for Spark-Classic
nijanthanvijayakumar Jul 15, 2024
f62185f
Rename the spark-classic to connect & update makefile to install thes…
nijanthanvijayakumar Jul 15, 2024
93f39d1
update column extension function names and desc in readme
Jul 12, 2024
b9926fd
add acknowledgement
fpgmaas Jul 15, 2024
74545c3
Fix the linting issues in the linting CI workflow
nijanthanvijayakumar Jul 15, 2024
943918a
remove .python-version
fpgmaas Jul 15, 2024
0a71190
apply hotfix
fpgmaas Jul 15, 2024
ad45d31
run lint also on pr
fpgmaas Jul 15, 2024
f04ab78
update column extension function names and desc in readme
Jul 12, 2024
54 changes: 23 additions & 31 deletions .github/workflows/ci.yml
@@ -7,6 +7,7 @@ on:
pull_request:
branches:
- main
- planning-1.0-release
workflow_dispatch:

jobs:
@@ -17,18 +18,12 @@ jobs:
fail-fast: false
matrix:
include:
- pyspark-version: 2.4.8 # latest published 2.x version
pip-packages: "pypandoc==1.7 pyspark==2.4.8" # downgrade of pypandoc necessary
- pyspark-version: 3.0.3
pip-packages: "pyspark==3.0.3"
- pyspark-version: 3.1.3
pip-packages: "pyspark==3.1.3"
- pyspark-version: 3.2.4
pip-packages: "pyspark==3.2.4"
- pyspark-version: 3.3.2
pip-packages: "pyspark==3.3.2"
- pyspark-version: 3.4.0
pip-packages: "pyspark==3.4.0"
- pyspark-version: 3.3.4
pip-packages: "pyspark==3.3.4"
- pyspark-version: 3.4.3
pip-packages: "pyspark==3.4.3"
- pyspark-version: 3.5.1
pip-packages: "pyspark==3.5.1"

steps:
- uses: actions/checkout@v1
@@ -39,33 +34,17 @@ jobs:
uses: actions/setup-java@v3
with:
distribution: 'zulu'
java-version: '8' # Supported by Spark 2.x & 3.x

- name: Get supported Python Version depending on PySpark
uses: haya14busa/action-cond@v1
id: python_version
with:
cond: ${{ startsWith(matrix.pyspark-version, '2.') }}
if_true: '3.7' # latest supported version for PySpark 2.x
if_false: '3.9' # PySpark 3+
java-version: '17' # Spark 4.0 will drop Java 11;

- name: Set up Python ${{ steps.python_version.outputs.value }}
uses: actions/setup-python@v2
with:
python-version: ${{ steps.python_version.outputs.value }}

- name: Get supported Poetry version
uses: haya14busa/action-cond@v1
id: poetry_version
with:
cond: ${{ startsWith(matrix.pyspark-version, '2.') }}
if_true: '1.5.1' # latest supported version for PySpark 2.x
if_false: '1.6.1' # PySpark 3+
python-version: '3.10'

- name: Install Poetry
uses: snok/install-poetry@v1
with:
version: ${{ steps.poetry_version.outputs.value }}
version: '1.6.1'

- name: Cache Poetry virtualenv
uses: actions/cache@v1
@@ -88,6 +67,19 @@ jobs:
- name: Run tests with pytest against PySpark ${{ matrix.pyspark-version }}
run: make test

- name: Run tests using Spark-Connect against PySpark ${{ matrix.pyspark-version }}
env:
HADOOP_VERSION: 3
SPARK_VERSION: ${{ matrix.pyspark-version }}
SPARK_CONNECT_MODE_ENABLE: 1
run: |
if [[ "${SPARK_VERSION}" > "3.4" ]]; then
sh scripts/run_spark_connect_server.sh
make test_spark_connect
else
echo "Skipping Spark-Connect tests for Spark version <= 3.4"
fi

check-license-headers:
runs-on: ubuntu-latest
steps:
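The Connect-specific test step above exports `SPARK_CONNECT_MODE_ENABLE=1` and relies on `scripts/run_spark_connect_server.sh` (not shown in this excerpt) to start a local Spark Connect server before `make test_spark_connect` runs. As a hypothetical sketch only — not the fixture actually added in this PR — a pytest fixture that picks the SparkSession type based on that environment variable could look like this:

```python
# Hypothetical sketch -- not the conftest shipped in this PR.
import os

import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark():
    if os.environ.get("SPARK_CONNECT_MODE_ENABLE") == "1":
        # Spark Connect client session; assumes a Connect server is listening on the default port.
        return SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
    # Classic in-process session used for the regular test run.
    return SparkSession.builder.master("local[*]").appName("quinn-tests").getOrCreate()
```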
14 changes: 11 additions & 3 deletions .github/workflows/lint.yaml
@@ -1,6 +1,14 @@
name: Lint

on: push
on:
push:
branches:
- main
pull_request:
branches:
- main
- planning-1.0-release
workflow_dispatch:

jobs:
ruff:
@@ -9,8 +17,8 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: 3.9
python-version: '3.10'
- name: Run Ruff
uses: chartboost/ruff-action@v1
with:
version: 0.0.291
version: 0.5.2
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -1,7 +1,7 @@
repos:
- repo: https://github.com/charliermarsh/ruff-pre-commit
# Ruff version.
rev: 'v0.0.291'
rev: 'v0.5.2'
hooks:
- id: ruff
- repo: local
1 change: 0 additions & 1 deletion .python-version

This file was deleted.

26 changes: 24 additions & 2 deletions Makefile
@@ -1,8 +1,10 @@
# COMMON CLI COMMANDS FOR DEVELOPMENT

all: help

.PHONY: install_test
install_test:
@poetry install --with=development,testing
@poetry install --with=development,testing --extras connect

.PHONY: install_deps
install_deps:
@@ -14,8 +16,28 @@ update_deps:

.PHONY: test
test:
@poetry run pytest tests
@poetry run pytest tests -k "not test_spark_connect.py"

.PHONY: test_spark_connect
test_spark_connect:
@poetry run pytest tests/test_spark_connect.py

.PHONY: lint
lint:
@poetry run ruff check --fix quinn

.PHONY: format
format:
@poetry run ruff format quinn

# Inspired by https://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
.PHONY: help
help:
@echo '................... Quinn ..........................'
@echo 'help - print that message'
@echo 'lint - run linter'
@echo 'format - reformat the code'
@echo 'test - run tests'
@echo 'install_test - install test deps'
@echo 'install_deps - install dev deps'
@echo 'update_deps - update and install deps'
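Taken together with the CI changes above, the intended local flow appears to be: `make install_test` once to install the development and testing groups plus the optional Connect extras, then `make test` for the classic (non-Connect) suite and `make test_spark_connect` for the Connect-only tests, which additionally expect a running Spark Connect server and `SPARK_CONNECT_MODE_ENABLE=1` in the environment.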
40 changes: 28 additions & 12 deletions README.md
@@ -41,12 +41,28 @@ quinn.validate_presence_of_columns(source_df, ["name", "age", "fun"])

**validate_schema()**

Raises an exception unless `source_df` contains all the `StructFields` defined in the `required_schema`.
Raises an exception unless `source_df` contains all the `StructFields` defined in the `required_schema`. By default, `ignore_nullable` is set to False, so an exception will be raised even if column names and data types match but nullability does not.

```python
quinn.validate_schema(source_df, required_schema)
quinn.validate_schema(required_schema, df_to_be_validated=source_df)
```

You can also set `ignore_nullable` to True, so the validation will happen only on column names and data types, not on nullability.

```python
quinn.validate_schema(required_schema, ignore_nullable=True, df_to_be_validated=source_df)
```

> [!TIP]
> This function can also be used as a decorator to other functions that return a dataframe. This can help validate the schema of the returned df. When used as a decorator, you don't need to pass the `df_to_be_validated` argument as this validation is performed on the df returned by the base function on which the decorator is applied.
>
> ```python
> @quinn.validate_schema(required_schema, ignore_nullable=True)
> def get_df():
> return df
> ```
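
For illustration, a minimal end-to-end sketch of the decorator form (the schema, data, and the `spark` session below are assumptions made for this example, not part of quinn's API):

```python
from pyspark.sql import SparkSession
from pyspark.sql import types as T
import quinn

spark = SparkSession.builder.master("local[*]").getOrCreate()

required_schema = T.StructType([
    T.StructField("name", T.StringType(), True),
    T.StructField("age", T.LongType(), True),
])

@quinn.validate_schema(required_schema, ignore_nullable=True)
def get_df():
    # The schema of the returned DataFrame is validated automatically.
    return spark.createDataFrame([("jose", 1), ("li", 2)], ["name", "age"])

df = get_df()  # raises if the returned DataFrame does not match required_schema
```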


**validate_absence_of_columns()**

Raises an exception if `source_df` contains `age` or `cool` columns.
@@ -476,41 +492,41 @@ from quinn.extensions import *

### Column Extensions

**isFalsy()**
**is_falsy()**

Returns `True` if `has_stuff` is `None` or `False`.
Returns a Column indicating whether all values in the Column are False or NULL: `True` if `has_stuff` is `None` or `False`.

```python
source_df.withColumn("is_stuff_falsy", F.col("has_stuff").isFalsy())
```

**isTruthy()**
**is_truthy()**

Returns `True` unless `has_stuff` is `None` or `False`.
Calculates a boolean expression that is the opposite of is_falsy for the given Column: `True` unless `has_stuff` is `None` or `False`.

```python
source_df.withColumn("is_stuff_truthy", F.col("has_stuff").isTruthy())
```

**isNullOrBlank()**
**is_null_or_blank()**

Returns `True` if `blah` is `null` or blank (the empty string or a string that only contains whitespace).
Returns a Boolean value which expresses whether a given column is NULL or contains only blank characters: `True` if `blah` is `null` or blank (the empty string or a string that only contains whitespace).

```python
source_df.withColumn("is_blah_null_or_blank", F.col("blah").isNullOrBlank())
```

**isNotIn()**
**is_not_in()**

Returns `True` if `fun_thing` is not included in the `bobs_hobbies` list.
To see if a value is not in a list of values: `True` if `fun_thing` is not included in the `bobs_hobbies` list.

```python
source_df.withColumn("is_not_bobs_hobby", F.col("fun_thing").isNotIn(bobs_hobbies))
```

**nullBetween()**
**null_between()**

Returns `True` if `age` is between `lower_age` and `upper_age`. If `lower_age` is populated and `upper_age` is `null`, it will return `True` if `age` is greater than or equal to `lower_age`. If `lower_age` is `null` and `upper_age` is populate, it will return `True` if `age` is lower than or equal to `upper_age`.
To see if a value is between two values in a null-friendly way: `True` if `age` is between `lower_age` and `upper_age`. If `lower_age` is populated and `upper_age` is `null`, it will return `True` if `age` is greater than or equal to `lower_age`. If `lower_age` is `null` and `upper_age` is populated, it will return `True` if `age` is less than or equal to `upper_age`.

```python
source_df.withColumn("is_between", F.col("age").nullBetween(F.col("lower_age"), F.col("upper_age")))
4 changes: 2 additions & 2 deletions benchmarks/create_benchmark_df.py
@@ -14,7 +14,7 @@
from __future__ import annotations

import random
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING

from pyspark.sql import SparkSession
from pyspark.sql import functions as F # noqa: N812
@@ -38,7 +38,7 @@ def save_benchmark_df(
spark: SparkSession,
n: int,
data_label: str,
repartition_n: Optional[int] = None,
repartition_n: int | None = None,
) -> None:
"""Save a benchmark dataframe to disk."""
print(f"Generating benchmark df for n={n}")
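
A note on the typing change above: the `int | None` (PEP 604) union syntax is only safe in annotations on Python versions before 3.10 when postponed evaluation is enabled, which the `from __future__ import annotations` already at the top of this file provides. A self-contained sketch with a hypothetical helper name:

```python
from __future__ import annotations  # lets `int | None` annotations work on Python 3.7-3.9 too


def describe_repartition(repartition_n: int | None = None) -> str:
    """Hypothetical helper mirroring the annotation style used in create_benchmark_df.py."""
    if repartition_n is None:
        return "no repartition"
    return f"repartition into {repartition_n} partitions"


print(describe_repartition())   # -> no repartition
print(describe_repartition(8))  # -> repartition into 8 partitions
```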