Skip to content

Commit

Permalink
Tighten up validation of output paths
Browse files Browse the repository at this point in the history
1) require that they have a file extension, and are not an openended
   * glob.

2) require `moderately_sensitive` output paths to be of the correct type.

Note: requiring file extensions on path globs has 3 benefits:

1) It allows us catch file type errors statically (the main goal of this
   PR)

2) It stops an open ended * from capturing unintented outputs

3) It helps ensure path globs are unique, which is a useful property,
   especially when we move to specifiying files inputs rather than
   `needs`.
  • Loading branch information
bloodearnest committed Nov 3, 2023
1 parent 55fda77 commit 761fc08
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 23 deletions.
21 changes: 21 additions & 0 deletions pipeline/constants.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,23 @@
# The magic action name which means "run every action"
RUN_ALL_COMMAND = "run_all"

LEVEL4_FILE_TYPES = set(
[
# tables
".csv",
".tsv",
# images
".jpg",
".jpeg",
".png",
".svg",
".svgz",
# reports
".html",
".pdf",
".txt",
".log",
".json",
".md",
]
)
4 changes: 2 additions & 2 deletions pipeline/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ def validate_output_filenames_are_valid(cls, outputs: RawOutputs) -> RawOutputs:
for privacy_level, output in outputs.items():
for output_id, filename in output.items():
try:
assert_valid_glob_pattern(filename)
assert_valid_glob_pattern(filename, privacy_level)
except InvalidPatternError as e:
raise ValueError(f"Output path {filename} is not permitted: {e}")
raise ValueError(f"Output path {filename} is invalid: {e}")

return outputs

Expand Down
17 changes: 13 additions & 4 deletions pipeline/validation.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from __future__ import annotations

import posixpath
from pathlib import PurePosixPath, PureWindowsPath
from pathlib import Path, PurePosixPath, PureWindowsPath
from typing import TYPE_CHECKING

from .constants import LEVEL4_FILE_TYPES
from .exceptions import InvalidPatternError
from .outputs import get_first_output_file, get_output_dirs

Expand All @@ -12,7 +13,7 @@
from .models import Action


def assert_valid_glob_pattern(pattern: str) -> None:
def assert_valid_glob_pattern(pattern: str, privacy_level: str) -> None:
"""
These patterns get converted into regular expressions and matched
with a `find` command so there shouldn't be any possibility of a path
Expand All @@ -31,11 +32,19 @@ def assert_valid_glob_pattern(pattern: str) -> None:
f"contains '{expr}' (only the * wildcard character is supported)"
)

if pattern.endswith("/"):
path = Path(pattern)

if path.suffix == "":
raise InvalidPatternError(
"looks like a directory (only files should be specified)"
"output paths must have a file type extension at the end"
)

if privacy_level == "moderately_sensitive":
if path.suffix not in LEVEL4_FILE_TYPES:
raise InvalidPatternError(
f"{path.suffix} is not an allowed file type for moderately_sensitive outputs"
)

# Check that the path is in normal form
if posixpath.normpath(pattern) != pattern:
raise InvalidPatternError(
Expand Down
4 changes: 2 additions & 2 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,12 +463,12 @@ def test_outputs_with_invalid_pattern():
"actions": {
"generate_cohort": {
"run": "cohortextractor:latest generate_cohort",
"outputs": {"highly_sensitive": {"test": "test?foo"}},
"outputs": {"highly_sensitive": {"test": "test/foo"}},
},
},
}

msg = "Output path test\\?foo is not permitted:"
msg = "Output path test/foo is invalid:"
with pytest.raises(ValidationError, match=msg):
Pipeline(**data)

Expand Down
31 changes: 16 additions & 15 deletions tests/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,22 @@


def test_assert_valid_glob_pattern():
assert_valid_glob_pattern("foo/bar/*.txt")
assert_valid_glob_pattern("foo")
assert_valid_glob_pattern("foo/bar/*.txt", "highly_sensitive")
assert_valid_glob_pattern("foo/bar/*.txt", "moderately_sensitive")
bad_patterns = [
"/abs/path",
"ends/in/slash/",
"not//canonical",
"path/../traversal",
"c:/windows/absolute",
"recursive/**/glob.pattern",
"questionmark?",
"/[square]brackets",
"\\ftest",
"metadata",
"metadata/test",
("/abs/path.txt", "highly_sensitive"),
("not//canonical.txt", "highly_sensitive"),
("path/../traversal.txt", "highly_sensitive"),
("c:/windows/absolute.txt", "highly_sensitive"),
("recursive/**/glob.pattern", "highly_sensitive"),
("questionmark?.txt", "highly_sensitive"),
("/[square]brackets.txt", "highly_sensitive"),
("\\ftest.txt", "highly_sensitive"),
("metadata", "highly_sensitive"),
("metadata/test.txt", "highly_sensitive"),
("outputs/*", "highly_sensitive"),
("outputs/output.rds", "moderately_sensitive"),
]
for pattern in bad_patterns:
for pattern, sensitivity in bad_patterns:
with pytest.raises(InvalidPatternError):
assert_valid_glob_pattern(pattern)
assert_valid_glob_pattern(pattern, sensitivity)

0 comments on commit 761fc08

Please sign in to comment.