Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pass include/exclude args to semgrep run #80

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 12 additions & 9 deletions integration_tests/base_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,19 +67,23 @@ def setup_method(self):
"You must register the codemod to a CodemodCollection."
) from exc

@property
def relative_code_path(self):
return os.path.relpath(self.code_path, SAMPLES_DIR)

def _assert_run_fields(self, run, output_path):
assert run["vendor"] == "pixee"
assert run["tool"] == "codemodder-python"
assert run["version"] == __VERSION__
assert run["elapsed"] != ""
assert (
run["commandLine"]
== f"codemodder {SAMPLES_DIR} --output {output_path} --codemod-include={self.codemod_wrapper.name} --path-include={self.code_path}"
== f"codemodder {SAMPLES_DIR} --output {output_path} --codemod-include={self.codemod_wrapper.name} --path-include={self.relative_code_path}"
)
assert run["directory"] == os.path.abspath(SAMPLES_DIR)
assert run["sarifs"] == []

def _assert_results_fields(self, results, output_path):
def _assert_results_fields(self, results):
assert len(results) == 1
result = results[0]
assert result["codemod"] == self.codemod_wrapper.id
Expand All @@ -94,9 +98,11 @@ def _assert_results_fields(self, results, output_path):
# A codemod may change multiple files. For now we will
# assert the resulting data for one file only.
change = [
result for result in result["changeset"] if result["path"] == output_path
result
for result in result["changeset"]
if result["path"] == self.relative_code_path
][0]
assert change["path"] == output_path
assert change["path"] == self.relative_code_path
assert change["diff"] == self.expected_diff

assert len(change["changes"]) == self.num_changes
Expand All @@ -114,10 +120,7 @@ def _assert_codetf_output(self):
run = codetf["run"]
self._assert_run_fields(run, self.output_path)
results = codetf["results"]
# CodeTf2 spec requires relative paths
self._assert_results_fields(
results, os.path.relpath(self.code_path, SAMPLES_DIR)
)
self._assert_results_fields(results)

def check_code_before(self):
with open(self.code_path, "r", encoding="utf-8") as f:
Expand Down Expand Up @@ -145,7 +148,7 @@ def test_file_rewritten(self):
"--output",
self.output_path,
f"--codemod-include={self.codemod_wrapper.name}",
f"--path-include={self.code_path}",
f"--path-include={self.relative_code_path}",
]

self.check_code_before()
Expand Down
18 changes: 17 additions & 1 deletion src/codemodder/code_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,26 @@ def file_line_patterns(file_path: str | Path, patterns: Sequence[str]):
]


def filter_files(names: Sequence[str], patterns: Sequence[str], exclude: bool = False):
def filter_files(
names: Sequence[str],
parent_path: str,
patterns: Sequence[str],
exclude: bool = False,
):
patterns = (
[x.split(":")[0] for x in (patterns or [])]
if not exclude
# An excluded line should not cause the entire file to be excluded
else [x for x in (patterns or []) if ":" not in x]
)

# TODO: handle case when parent path is "."
patterns = [
str(Path(parent_path) / Path(pat))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably faster with os.path.join

if not pat.startswith("*")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This condition doesn't make sense to me. Do tests fail without it? This means that the behavior is going to be entirely dependent on the presence or absence of a trailing slash in parent_path, which I don't think is what we want. If my pattern is *.py I want it to mean /path/to/foo/*.py and not /path/to/foo*.py.

I'm wondering whether removing this also correctly handles the . case, although that would also be fixed by normalizing all of the given paths using os.path.abspath.

else parent_path + pat
for pat in patterns
]
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to combine and avoid doing multiple list comp operations but it was honestly easier to iterate and easier to understand separating it. We can refactor later on with a generator or something.

Also, I don't feel super confident with this if not pat.startswith("*") logic, but tests show it works right now. happy to revisit

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion:
I think you can use a combination of glob and pathlib to solve issues with globs and relative paths

full_path = (Path(parent_path) / Path(pat)).resolve()
files = glob.glob(str(full_path))

There are two caveats: (1) resolve will also make it absolute, we may need to handle the current directory carefully, (2) glob will return actual files in the filesystem which is slower than fnmatch

return itertools.chain(*[fnmatch.filter(names, pattern) for pattern in patterns])


Expand All @@ -65,15 +78,18 @@ def match_files(
that match the criteria of both exclude and include patterns.
"""
all_files = [str(path) for path in Path(parent_path).rglob("*")]

included_files = set(
filter_files(
all_files,
str(parent_path),
include_paths if include_paths is not None else DEFAULT_INCLUDED_PATHS,
)
)
excluded_files = set(
filter_files(
all_files,
str(parent_path),
exclude_paths if exclude_paths is not None else DEFAULT_EXCLUDED_PATHS,
exclude=True,
)
Expand Down
9 changes: 1 addition & 8 deletions src/codemodder/codemodder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import logging
import os
import sys
from pathlib import Path

import libcst as cst
from libcst.codemod import CodemodContext
Expand Down Expand Up @@ -129,13 +128,7 @@ def run(original_args) -> int:

log_section("startup")
logger.info("codemodder: python/%s", __VERSION__)

context = CodemodExecutionContext(
Path(argv.directory),
argv.dry_run,
argv.verbose,
codemod_registry,
)
context = CodemodExecutionContext(argv, codemod_registry)

# TODO: this should be a method of CodemodExecutionContext
codemods_to_run = codemod_registry.match_codemods(
Expand Down
13 changes: 7 additions & 6 deletions src/codemodder/context.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import argparse
import logging
from pathlib import Path
import itertools
Expand All @@ -21,14 +22,14 @@ class CodemodExecutionContext: # pylint: disable=too-many-instance-attributes

def __init__(
self,
directory: Path,
dry_run: bool,
verbose: bool,
cli_args: argparse.Namespace,
registry: CodemodRegistry,
):
self.directory = directory
self.dry_run = dry_run
self.verbose = verbose
self.directory = Path(cli_args.directory)
self.dry_run = cli_args.dry_run
self.verbose = cli_args.verbose
self.path_include: list = cli_args.path_include
self.path_exclude: list = cli_args.path_exclude
self._results_by_codemod = {}
self._failures_by_codemod = {}
self.dependencies = {}
Expand Down
18 changes: 18 additions & 0 deletions src/codemodder/semgrep.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,24 @@ def run(execution_context: CodemodExecutionContext, yaml_files: List[Path]) -> d
"-o",
temp_sarif_file.name,
]

if execution_context.path_exclude:
command.extend(
itertools.chain.from_iterable(
map(
lambda f: ["--exclude", f"{execution_context.directory}{f}"],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It feels like there should be a path separator here.

execution_context.path_exclude,
)
)
)
if execution_context.path_include:
# Note: parent path is not passed with --include
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand why this should be the case.

command.extend(
itertools.chain.from_iterable(
map(lambda f: ["--include", str(f)], execution_context.path_include)
)
)

command.extend(
itertools.chain.from_iterable(
map(lambda f: ["--config", str(f)], yaml_files)
Expand Down
27 changes: 14 additions & 13 deletions tests/codemods/base_codemod_test.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
# pylint: disable=no-member,not-callable,attribute-defined-outside-init
import argparse
from collections import defaultdict
import os
from pathlib import Path
from textwrap import dedent
from typing import ClassVar

import libcst as cst
from libcst.codemod import CodemodContext
import mock

from codemodder.context import CodemodExecutionContext
from codemodder.code_directory import DEFAULT_INCLUDED_PATHS, DEFAULT_EXCLUDED_PATHS
from codemodder.file_context import FileContext
from codemodder.registry import CodemodRegistry, CodemodCollection
from codemodder.semgrep import run as semgrep_run
Expand All @@ -21,18 +22,23 @@ class BaseCodemodTest:
def setup_method(self):
self.file_context = None

def _make_context(self, root):
cli_args = argparse.Namespace(
directory=root,
dry_run=True,
verbose=False,
path_include=DEFAULT_INCLUDED_PATHS,
path_exclude=DEFAULT_EXCLUDED_PATHS,
)
return CodemodExecutionContext(cli_args, registry=mock.MagicMock())

def run_and_assert(self, tmpdir, input_code, expected):
tmp_file_path = tmpdir / "code.py"
self.run_and_assert_filepath(tmpdir, tmp_file_path, input_code, expected)

def run_and_assert_filepath(self, root, file_path, input_code, expected):
input_tree = cst.parse_module(dedent(input_code))
self.execution_context = CodemodExecutionContext(
directory=root,
dry_run=True,
verbose=False,
registry=mock.MagicMock(),
)
self.execution_context = self._make_context(root)
self.file_context = FileContext(
file_path,
[],
Expand Down Expand Up @@ -74,12 +80,7 @@ def results_by_id_filepath(self, input_code, file_path):
return semgrep_run(self.execution_context, results[0].yaml_files)

def run_and_assert_filepath(self, root, file_path, input_code, expected):
self.execution_context = CodemodExecutionContext(
directory=root,
dry_run=True,
verbose=False,
registry=mock.MagicMock(),
)
self.execution_context = self._make_context(root)
input_tree = cst.parse_module(input_code)
all_results = self.results_by_id_filepath(input_code, file_path)
results = all_results[str(file_path)]
Expand Down
63 changes: 35 additions & 28 deletions tests/test_code_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@ def dir_structure(tmp_path_factory):
(tests_dir / "test_make_request.py").touch()
(tests_dir / "test_insecure_random.py").touch()

assert len(list(base_dir.rglob("*"))) == 9
sub_tests_dir = tests_dir / "tests"
sub_tests_dir.mkdir()
(sub_tests_dir / "something.py").touch()

assert len(list(base_dir.rglob("*"))) == 11

return base_dir

Expand All @@ -35,11 +39,18 @@ def _assert_expected(self, result_files, expected_files):
file_names.sort()
assert file_names == expected_files

def test_all_py_files_match(self, dir_structure):
def test_all_py_files_match_except_tests_dir(self, dir_structure):
expected = ["empty_for_testing.py", "insecure_random.py", "make_request.py"]
files = match_files(dir_structure)
self._assert_expected(files, expected)

def test_tests_not_excluded(self, dir_structure):
expected = ["test_insecure_random.py", "test_make_request.py"]
# anything in foo/tests will be analyzed but anything in
# foo/tests/tests will not be analyzed by default
files = match_files(dir_structure / "tests")
self._assert_expected(files, expected)

def test_match_excluded(self, dir_structure):
expected = ["empty_for_testing.py", "insecure_random.py"]
files = match_files(dir_structure, ["**/tests/**", "*request.py"])
Expand Down Expand Up @@ -102,40 +113,36 @@ def test_match_excluded_precedence_over_included(self, dir_structure):
self._assert_expected(files, expected)

def test_test_directory_not_excluded(self, dir_structure):
expected = ["test_insecure_random.py", "test_make_request.py"]
expected = ["something.py", "test_insecure_random.py", "test_make_request.py"]
files = match_files(
dir_structure, exclude_paths=["**/samples/**", "**/more_samples/**"]
)
self._assert_expected(files, expected)

def test_include_test_overridden_by_default_excludes(self, mocker):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These tests are using parent_dir as "." which does not work as expected. We will revisit

mocker.patch(
"codemodder.code_directory.Path.rglob",
return_value=[
"foo/tests/test_insecure_random.py",
"foo/tests/test_make_request.py",
],
)
mocker.patch(
"codemodder.code_directory.Path.is_file",
return_value=True,
)
files = match_files(Path("."), include_paths=["**/tests/**"])
def test_include_test_overridden_by_default_excludes(self, dir_structure):
files = match_files(dir_structure, include_paths=["**/tests/**"])
self._assert_expected(files, [])

def test_include_test_without_default_includes(self, mocker):
files = ["foo/tests/test_insecure_random.py", "foo/tests/test_make_request.py"]
mocker.patch(
"codemodder.code_directory.Path.rglob",
return_value=files,
)
mocker.patch(
"codemodder.code_directory.Path.is_file",
return_value=True,
)
result = match_files(Path("."), exclude_paths=[])
assert result == [Path(x) for x in files]
def test_include_test_without_default_excludes(self, dir_structure):
expected = [
"empty_for_testing.py",
"insecure_random.py",
"make_request.py",
"something.py",
"test_insecure_random.py",
"test_make_request.py",
]
files = match_files(dir_structure, exclude_paths=[])
self._assert_expected(files, expected)

def test_extract_line_from_pattern(self):
lines = file_line_patterns(Path("insecure_random.py"), ["insecure_*.py:3"])
assert lines == [3]

def test_include_specific_file(self, dir_structure):
expected = ["empty_for_testing.py"]
files = match_files(
dir_structure / "samples" / "more_samples",
include_paths=["empty_for_testing.py"],
)
self._assert_expected(files, expected)