diff --git a/metaphor/s3/README.md b/metaphor/s3/README.md index 7af55eb8..d6a826f2 100644 --- a/metaphor/s3/README.md +++ b/metaphor/s3/README.md @@ -146,6 +146,8 @@ All other file types are automatically ignored. If not specified, all of the abo You can optionally specify the URI patterns to exclude using the `excludes` config. It supports wildcards but not `{table}` & `{partition}` labels. +To exclude an entire directory, use `s3://bucket/directory` instead of `s3://bucket/directory/*`. + ## Optional Configurations ### TLS Verification diff --git a/metaphor/s3/path_spec.py b/metaphor/s3/path_spec.py index 0e114c2b..a69bbf51 100644 --- a/metaphor/s3/path_spec.py +++ b/metaphor/s3/path_spec.py @@ -293,6 +293,8 @@ def allow_path(self, path: str) -> bool: path.rsplit("/", slash_to_remove)[0] if exclude[-1] == "/": exclude_pat = exclude[:-1] + elif exclude[-2:] == "/*": + exclude_pat = exclude[:-2] else: exclude_pat = exclude if fnmatch(path.rsplit("/", slash_to_remove)[0], exclude_pat): diff --git a/pyproject.toml b/pyproject.toml index 6240549b..8222af89 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "metaphor-connectors" -version = "0.14.132" +version = "0.14.133" license = "Apache-2.0" description = "A collection of Python-based 'connectors' that extract metadata from various sources to ingest into the Metaphor app." 
authors = ["Metaphor "]
diff --git a/tests/s3/test_path_spec.py b/tests/s3/test_path_spec.py
new file mode 100644
index 00000000..8991a9d2
--- /dev/null
+++ b/tests/s3/test_path_spec.py
@@ -0,0 +1,26 @@
+import pytest
+
+from metaphor.s3.path_spec import PathSpec
+
+
+def test_exclude_all_files_in_directory() -> None:  # An exclude ending in "/*" must reject a path beneath that directory.
+    path_spec = PathSpec(
+        uri="s3://bucket/v1/app/*/{table}/v4/*/{partition[0]}/{partition[1]}/{partition[2]}/*.*",
+        file_types={
+            "parquet",
+        },
+        excludes=["s3://bucket/v1/app/global/*"],  # directory-style exclude written with a trailing "/*"
+    )
+    assert not path_spec.allow_path("s3://bucket/v1/app/global/foo/v4")  # path lies under the excluded "global" directory
+
+
+@pytest.mark.skip  # NOTE(review): skipped with no reason given; confirm why and record it via reason=...
+def test_exclude_table_wildcard() -> None:  # An exclude whose last segment holds a wildcard ("skipped_*") is expected to reject a matching path.
+    path_spec = PathSpec(
+        uri="s3://bucket/v1/services/data/{table}/v1/*/{partition[0]}/{partition[1]}/{partition[2]}/*.*",
+        file_types={
+            "parquet",
+        },
+        excludes=["s3://bucket/v1/services/data/skipped_*"],  # wildcard inside the final path segment, not a bare "/*"
+    )
+    assert not path_spec.allow_path("s3://bucket/v1/services/data/skipped_0/")  # "skipped_0" matches the "skipped_*" pattern; note the trailing slash on the path