diff --git a/README.md b/README.md index 5bc3e910..1fbaa060 100644 --- a/README.md +++ b/README.md @@ -476,41 +476,41 @@ from quinn.extensions import * ### Column Extensions -**isFalsy()** +**is_falsy()** -Returns `True` if `has_stuff` is `None` or `False`. +Returns a Column indicating whether all values in the Column are False or NULL: `True` if `has_stuff` is `None` or `False`. ```python source_df.withColumn("is_stuff_falsy", F.col("has_stuff").isFalsy()) ``` -**isTruthy()** +**is_truthy()** -Returns `True` unless `has_stuff` is `None` or `False`. +Calculates a boolean expression that is the opposite of is_falsy for the given Column: `True` unless `has_stuff` is `None` or `False`. ```python source_df.withColumn("is_stuff_truthy", F.col("has_stuff").isTruthy()) ``` -**isNullOrBlank()** +**is_null_or_blank()** -Returns `True` if `blah` is `null` or blank (the empty string or a string that only contains whitespace). +Returns a Boolean value which expresses whether a given column is NULL or contains only blank characters: `True` if `blah` is `null` or blank (the empty string or a string that only contains whitespace). ```python source_df.withColumn("is_blah_null_or_blank", F.col("blah").isNullOrBlank()) ``` -**isNotIn()** +**is_not_in()** -Returns `True` if `fun_thing` is not included in the `bobs_hobbies` list. +To see if a value is not in a list of values: `True` if `fun_thing` is not included in the `bobs_hobbies` list. ```python source_df.withColumn("is_not_bobs_hobby", F.col("fun_thing").isNotIn(bobs_hobbies)) ``` -**nullBetween()** +**null_between()** -Returns `True` if `age` is between `lower_age` and `upper_age`. If `lower_age` is populated and `upper_age` is `null`, it will return `True` if `age` is greater than or equal to `lower_age`. If `lower_age` is `null` and `upper_age` is populate, it will return `True` if `age` is lower than or equal to `upper_age`. 
+To see if a value is between two values in a null friendly way: `True` if `age` is between `lower_age` and `upper_age`. If `lower_age` is populated and `upper_age` is `null`, it will return `True` if `age` is greater than or equal to `lower_age`. If `lower_age` is `null` and `upper_age` is populated, it will return `True` if `age` is lower than or equal to `upper_age`. ```python source_df.withColumn("is_between", F.col("age").nullBetween(F.col("lower_age"), F.col("upper_age"))) diff --git a/benchmarks/create_benchmark_df.py b/benchmarks/create_benchmark_df.py index 61a86136..301b2b45 100644 --- a/benchmarks/create_benchmark_df.py +++ b/benchmarks/create_benchmark_df.py @@ -14,7 +14,7 @@ from __future__ import annotations import random -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING from pyspark.sql import SparkSession from pyspark.sql import functions as F # noqa: N812 @@ -38,7 +38,7 @@ def save_benchmark_df( spark: SparkSession, n: int, data_label: str, - repartition_n: Optional[int] = None, + repartition_n: int | None = None, ) -> None: """Save a benchmark dataframe to disk.""" print(f"Generating benchmark df for n={n}") diff --git a/pyproject.toml b/pyproject.toml index 55f79750..eab84a84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,8 +93,6 @@ ignore = [ "D205", # It is broken "TCH003", # I have no idea what is it about "PLC1901", # Strange thing - "UP007", # Not supported in py3.6 - "UP038", # Not supported in all py versions "SIM108", # Don't create long ternary operators "PTH123", # Don't force use of Pathlib "PTH207", # Don't force use of Pathlib @@ -109,3 +107,6 @@ ignore = [ "quinn/__init__.py" = ["F401", "F403"] "quinn/functions.py" = ["FBT003"] "quinn/keyword_finder.py" = ["A002"] + +[tool.ruff.isort] +required-imports = ["from __future__ import annotations"] \ No newline at end of file diff --git a/quinn/__init__.py b/quinn/__init__.py index 469cac82..61f491eb 100644 --- a/quinn/__init__.py +++ 
b/quinn/__init__.py @@ -12,6 +12,7 @@ # limitations under the License. """quinn API.""" +from __future__ import annotations from quinn.append_if_schema_identical import append_if_schema_identical from quinn.dataframe_helpers import ( diff --git a/quinn/append_if_schema_identical.py b/quinn/append_if_schema_identical.py index ac05f1ee..7ea8b0bf 100644 --- a/quinn/append_if_schema_identical.py +++ b/quinn/append_if_schema_identical.py @@ -10,8 +10,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations -from pyspark.sql import DataFrame +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from pyspark.sql import DataFrame class SchemaMismatchError(ValueError): diff --git a/quinn/functions.py b/quinn/functions.py index 12464c81..f8b37c7a 100644 --- a/quinn/functions.py +++ b/quinn/functions.py @@ -278,9 +278,9 @@ def is_falsy(col: Column) -> Column: def is_truthy(col: Column) -> Column: - """Calculates a boolean expression that is the opposite of isFalsy for the given ``Column`` col. + """Calculates a boolean expression that is the opposite of is_falsy for the given ``Column`` col. - :param Column col: The ``Column`` to calculate the opposite of isFalsy for. + :param Column col: The ``Column`` to calculate the opposite of is_falsy for. :returns: A ``Column`` with the results of the calculation. 
:rtype: Column """ diff --git a/quinn/math.py b/quinn/math.py index e385b1ff..c7a4196b 100644 --- a/quinn/math.py +++ b/quinn/math.py @@ -15,16 +15,14 @@ from __future__ import annotations -from typing import Optional, Union - from pyspark.sql import Column from pyspark.sql import functions as F # noqa: N812 def rand_laplace( - mu: Union[float, Column], - beta: Union[float, Column], - seed: Optional[int] = None, + mu: float | Column, + beta: float | Column, + seed: int | None = None, ) -> Column: """Generate random numbers from Laplace(mu, beta). @@ -47,7 +45,7 @@ def rand_laplace( def div_or_else( cola: Column, colb: Column, - default: Union[float, Column] = 0.0, + default: float | Column = 0.0, ) -> Column: """Return result of division of cola by colb or default if colb is zero. diff --git a/quinn/schema_helpers.py b/quinn/schema_helpers.py index d314ff1b..80797192 100644 --- a/quinn/schema_helpers.py +++ b/quinn/schema_helpers.py @@ -14,7 +14,6 @@ from __future__ import annotations import json -from typing import Optional from pyspark.sql import SparkSession from pyspark.sql import types as T # noqa: N812 @@ -100,7 +99,7 @@ def schema_from_csv(spark: SparkSession, file_path: str) -> T.StructType: # noq :rtype: pyspark.sql.types.StructType """ - def _validate_json(metadata: Optional[str]) -> dict: + def _validate_json(metadata: str | None) -> dict: if metadata is None: return {} diff --git a/quinn/split_columns.py b/quinn/split_columns.py index e2bdef87..d667815a 100644 --- a/quinn/split_columns.py +++ b/quinn/split_columns.py @@ -13,7 +13,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING from pyspark.sql.functions import length, split, trim, udf, when from pyspark.sql.types import IntegerType @@ -28,7 +28,7 @@ def split_col( # noqa: PLR0913 delimiter: str, new_col_names: list[str], mode: str = "permissive", - default: Optional[str] = None, + default: str | None = None, ) -> DataFrame: """Splits 
the given column based on the delimiter and creates new columns with the split values.