Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filter semgrep results based on initial findings #115

Merged
merged 2 commits into from
Nov 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 44 additions & 36 deletions src/codemodder/codemodder.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from codemodder.executor import CodemodExecutorWrapper
from codemodder.project_analysis.python_repo_manager import PythonRepoManager
from codemodder.report.codetf_reporter import report_default
from codemodder.result import ResultSet
from codemodder.semgrep import run as run_semgrep


Expand All @@ -34,18 +35,17 @@ def update_code(file_path, new_code):
def find_semgrep_results(
context: CodemodExecutionContext,
codemods: list[CodemodExecutorWrapper],
) -> set[str]:
) -> ResultSet:
"""Run semgrep once with all configuration files from all codemods and return a set of applicable rule IDs"""
yaml_files = list(
itertools.chain.from_iterable(
[codemod.yaml_files for codemod in codemods if codemod.yaml_files]
)
)
if not yaml_files:
return set()
return ResultSet()

results = run_semgrep(context, yaml_files)
return {rule_id for file_changes in results.values() for rule_id in file_changes}
return run_semgrep(context, yaml_files)


def create_diff(original_tree: cst.Module, new_tree: cst.Module) -> str:
Expand Down Expand Up @@ -103,7 +103,7 @@ def process_file(
file_path: Path,
base_directory: Path,
codemod,
sarif,
results: ResultSet,
cli_args,
): # pylint: disable=too-many-arguments
logger.debug("scanning file %s", file_path)
Expand All @@ -112,14 +112,17 @@ def process_file(

line_exclude = file_line_patterns(file_path, cli_args.path_exclude)
line_include = file_line_patterns(file_path, cli_args.path_include)
sarif_for_file = sarif.get(str(file_path)) or {}
findings_for_rule = results.results_for_rule_and_file(
drdavella marked this conversation as resolved.
Show resolved Hide resolved
codemod.name, # TODO: should be full ID
file_path,
)

file_context = FileContext(
base_directory,
file_path,
line_exclude,
line_include,
sarif_for_file,
findings_for_rule,
)

try:
Expand All @@ -146,27 +149,50 @@ def analyze_files(
execution_context: CodemodExecutionContext,
files_to_analyze,
codemod,
sarif,
results: ResultSet,
cli_args,
):
with ThreadPoolExecutor(max_workers=cli_args.max_workers) as executor:
logger.debug(
"using executor with %s threads",
cli_args.max_workers,
)
results = executor.map(
analysis_results = executor.map(
lambda args: process_file(
args[0],
args[1],
execution_context.directory,
codemod,
sarif,
results,
cli_args,
),
enumerate(files_to_analyze),
)
executor.shutdown(wait=True)
execution_context.process_results(codemod.id, results)
execution_context.process_results(codemod.id, analysis_results)


def log_report(context, argv, elapsed_ms, files_to_analyze):
log_section("report")
logger.info("scanned: %s files", len(files_to_analyze))
all_failures = context.get_failed_files()
logger.info(
"failed: %s files (%s unique)",
len(all_failures),
len(set(all_failures)),
)
all_changes = context.get_changed_files()
logger.info(
"changed: %s files (%s unique)",
len(all_changes),
len(set(all_changes)),
)
logger.info("report file: %s", argv.output)
logger.info("total elapsed: %s ms", elapsed_ms)
logger.info(" semgrep: %s ms", context.timer.get_time_ms("semgrep"))
logger.info(" parse: %s ms", context.timer.get_time_ms("parse"))
logger.info(" transform: %s ms", context.timer.get_time_ms("transform"))
logger.info(" write: %s ms", context.timer.get_time_ms("write"))


def run(original_args) -> int:
Expand Down Expand Up @@ -221,25 +247,27 @@ def run(original_args) -> int:
return 0

full_names = [str(path) for path in files_to_analyze]
logger.debug("matched files:")
log_list(logging.DEBUG, "matched files", full_names)

semgrep_results: set[str] = find_semgrep_results(context, codemods_to_run)
semgrep_results: ResultSet = find_semgrep_results(context, codemods_to_run)
semgrep_finding_ids = semgrep_results.all_rule_ids()

log_section("scanning")
# run codemods one at a time making sure to respect the given sequence
for codemod in codemods_to_run:
# Unfortunately the IDs from semgrep are not fully specified
# TODO: eventually we need to be able to use fully specified IDs here
if codemod.is_semgrep and codemod.name not in semgrep_results:
if codemod.is_semgrep and codemod.name not in semgrep_finding_ids:
logger.debug(
"no results from semgrep for %s, skipping analysis",
codemod.id,
)
continue

logger.info("running codemod %s", codemod.id)
results = codemod.apply(context)
semgrep_files = semgrep_results.files_for_rule(codemod.name)
# Non-semgrep codemods ignore the semgrep results
results = codemod.apply(context, semgrep_files)
analyze_files(
context,
files_to_analyze,
Expand All @@ -256,27 +284,7 @@ def run(original_args) -> int:
elapsed_ms = int(elapsed.total_seconds() * 1000)
report_default(elapsed_ms, argv, original_args, results)

log_section("report")
logger.info("scanned: %s files", len(files_to_analyze))
all_failures = context.get_failed_files()
logger.info(
"failed: %s files (%s unique)",
len(all_failures),
len(set(all_failures)),
)
all_changes = context.get_changed_files()
logger.info(
"changed: %s files (%s unique)",
len(all_changes),
len(set(all_changes)),
)
logger.info("report file: %s", argv.output)
logger.info("total elapsed: %s ms", elapsed_ms)
logger.info("semgrep: %s ms", context.timer.get_time_ms("semgrep"))
logger.info("parse: %s ms", context.timer.get_time_ms("parse"))
logger.info("transform: %s ms", context.timer.get_time_ms("transform"))
logger.info("write: %s ms", context.timer.get_time_ms("write"))

log_report(context, argv, elapsed_ms, files_to_analyze)
return 0


Expand Down
2 changes: 1 addition & 1 deletion src/codemodder/codemods/api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def __init_subclass__(cls):
def __init__(self, codemod_context: CodemodContext, file_context: FileContext):
BaseCodemod.__init__(self, codemod_context, file_context)
_SemgrepCodemod.__init__(self, file_context)
BaseTransformer.__init__(self, codemod_context, self._results)
BaseTransformer.__init__(self, codemod_context, file_context.findings)

def _new_or_updated_node(self, original_node, updated_node):
if self.node_is_selected(original_node):
Expand Down
28 changes: 9 additions & 19 deletions src/codemodder/codemods/base_codemod.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from codemodder.change import Change
from codemodder.dependency import Dependency
from codemodder.file_context import FileContext
from codemodder.result import ResultSet
from codemodder.semgrep import run as semgrep_run


Expand Down Expand Up @@ -51,14 +52,14 @@ def __init__(self, file_context: FileContext):
self.file_context = file_context

@classmethod
def apply_rule(cls, context, *args, **kwargs):
def apply_rule(cls, context, *args, **kwargs) -> ResultSet:
"""
Apply rule associated with this codemod and gather results

Does nothing by default. Subclasses may override for custom rule logic.
"""
del context, args, kwargs
return {}
return ResultSet()

@classmethod
def name(cls):
Expand Down Expand Up @@ -105,28 +106,17 @@ class SemgrepCodemod(BaseCodemod):
YAML_FILES: ClassVar[List[str]] = NotImplemented
is_semgrep = True

def __init__(self, *args):
super().__init__(*args)
self._results = (
self.file_context.results_by_id.get(
self.METADATA.NAME # pylint: disable=no-member
)
or []
)

@classmethod
def apply_rule(
cls, context, yaml_files, *args, **kwargs
): # pylint: disable=arguments-differ
def apply_rule(cls, context, *args, **kwargs) -> ResultSet:
"""
Apply semgrep to gather rule results
"""
del args, kwargs
yaml_files = kwargs.get("yaml_files") or args[0]
files_to_analyze = kwargs.get("files_to_analyze") or args[1]
with context.timer.measure("semgrep"):
return semgrep_run(context, yaml_files)
return semgrep_run(context, yaml_files, files_to_analyze)

@property
def should_transform(self):
"""Semgrep codemods should attempt transform only if there are
semgrep results"""
return bool(self._results)
"""Semgrep codemods should attempt transform only if there are semgrep results"""
return bool(self.file_context.findings)
52 changes: 22 additions & 30 deletions src/codemodder/codemods/base_visitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,21 @@
from libcst.codemod import ContextAwareVisitor, VisitorBasedCodemodCommand
from libcst.metadata import PositionProvider

from codemodder.result import Result


class UtilsMixin:
METADATA_DEPENDENCIES: Tuple[Any, ...] = (PositionProvider,)
results: list[Result]

def __init__(self, context, results):
super().__init__(context)
def __init__(self, results: list[Result]):
self.results = results

def filter_by_result(self, pos_to_match):
all_pos = [extract_pos_from_result(result) for result in self.results]
return any(match_pos(pos_to_match, position) for position in all_pos)
return any(
location.match(pos_to_match)
for result in self.results
for location in result.locations
)

def filter_by_path_includes_or_excludes(self, pos_to_match):
"""
Expand All @@ -28,6 +32,7 @@ def filter_by_path_includes_or_excludes(self, pos_to_match):
def node_is_selected(self, node) -> bool:
if not self.results:
return False

pos_to_match = self.node_position(node)
return self.filter_by_result(
pos_to_match
Expand All @@ -41,34 +46,21 @@ def lineno_for_node(self, node):
return self.node_position(node).start.line


class BaseTransformer(UtilsMixin, VisitorBasedCodemodCommand):
...
class BaseTransformer(VisitorBasedCodemodCommand, UtilsMixin):
METADATA_DEPENDENCIES: Tuple[Any, ...] = (PositionProvider,)

def __init__(self, context, results: list[Result]):
super().__init__(context)
UtilsMixin.__init__(self, results)


class BaseVisitor(ContextAwareVisitor, UtilsMixin):
METADATA_DEPENDENCIES: Tuple[Any, ...] = (PositionProvider,)

class BaseVisitor(UtilsMixin, ContextAwareVisitor):
...
def __init__(self, context, results: list[Result]):
super().__init__(context)
UtilsMixin.__init__(self, results)


def match_line(pos, line):
return pos.start.line == line and pos.end.line == line


def extract_pos_from_result(result):
region = result["locations"][0]["physicalLocation"]["region"]
# TODO it may be the case some of these attributes do not exist
return (
region.get("startLine"),
region["startColumn"],
region.get("endLine") or region.get("startLine"),
region["endColumn"],
)


def match_pos(pos, x):
# needs some leeway because the semgrep and libcst won't exactly match
return (
pos.start.line == x[0]
and (pos.start.column in (x[1] - 1, x[1]))
and pos.end.line == x[2]
and (pos.end.column in (x[3] - 1, x[3]))
)
9 changes: 7 additions & 2 deletions src/codemodder/executor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from importlib.abc import Traversable
from pathlib import Path

from wrapt import CallableObjectProxy

Expand All @@ -22,13 +23,17 @@ def __init__(
self.docs_module = docs_module
self.semgrep_config_module = semgrep_config_module

def apply(self, context):
def apply(self, context, files: list[Path]):
"""
Wraps the codemod's apply method to inject additional arguments.

Not all codemods will need these arguments.
"""
return self.apply_rule(context, yaml_files=self.yaml_files)
return self.apply_rule(
context,
yaml_files=self.yaml_files,
files_to_analyze=files,
)

@property
def name(self):
Expand Down
5 changes: 3 additions & 2 deletions src/codemodder/file_context.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List
from typing import List

from codemodder.change import Change, ChangeSet
from codemodder.dependency import Dependency
from codemodder.result import Result
from codemodder.utils.timer import Timer


Expand All @@ -17,7 +18,7 @@ class FileContext: # pylint: disable=too-many-instance-attributes
file_path: Path
line_exclude: List[int] = field(default_factory=list)
line_include: List[int] = field(default_factory=list)
results_by_id: Dict = field(default_factory=dict)
findings: List[Result] = field(default_factory=list)
dependencies: set[Dependency] = field(default_factory=set)
codemod_changes: List[Change] = field(default_factory=list)
results: List[ChangeSet] = field(default_factory=list)
Expand Down
Loading