Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Only run semgrep for codemods with initial results #107

Merged
merged 1 commit into from
Oct 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 22 additions & 4 deletions src/codemodder/codemodder.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from concurrent.futures import ThreadPoolExecutor
import datetime
import difflib
import itertools
import logging
import os
import sys
Expand All @@ -19,6 +20,7 @@
from codemodder.executor import CodemodExecutorWrapper
from codemodder.project_analysis.python_repo_manager import PythonRepoManager
from codemodder.report.codetf_reporter import report_default
from codemodder.semgrep import run as run_semgrep


def update_code(file_path, new_code):
Expand All @@ -29,6 +31,18 @@ def update_code(file_path, new_code):
f.write(new_code)


def find_semgrep_results(
    context: CodemodExecutionContext,
    codemods: list[CodemodExecutorWrapper],
) -> set[str]:
    """Collect the rule IDs reported by a single combined semgrep run.

    The YAML configuration files of every codemod that declares any are
    chained together and handed to semgrep in one invocation. The returned
    set contains every rule ID that appears in the results for any file.
    """
    # Codemods with no (or empty) yaml_files contribute nothing to the run.
    configs_per_codemod = [
        codemod.yaml_files for codemod in codemods if codemod.yaml_files
    ]
    all_configs = itertools.chain.from_iterable(configs_per_codemod)
    findings = run_semgrep(context, all_configs)

    # Each value of `findings` is iterated for its rule IDs; gather them all.
    rule_ids: set[str] = set()
    for file_changes in findings.values():
        rule_ids.update(file_changes)
    return rule_ids


def apply_codemod_to_file(
base_directory: Path,
file_context,
Expand Down Expand Up @@ -184,7 +198,7 @@ def run(original_args) -> int:
log_list(logging.INFO, "including paths", argv.path_include)
log_list(logging.INFO, "excluding paths", argv.path_exclude)

files_to_analyze = match_files(
files_to_analyze: list[Path] = match_files(
context.directory, argv.path_exclude, argv.path_include
)
if not files_to_analyze:
Expand All @@ -195,18 +209,22 @@ def run(original_args) -> int:
logger.debug("matched files:")
log_list(logging.DEBUG, "matched files", full_names)

semgrep_results: set[str] = find_semgrep_results(context, codemods_to_run)

log_section("scanning")
# run codemods one at a time making sure to respect the given sequence
for codemod in codemods_to_run:
logger.info("running codemod %s", codemod.id)
results = codemod.apply(context)
if codemod.is_semgrep and not results:
# Unfortunately the IDs from semgrep are not fully specified
# TODO: eventually we need to be able to use fully specified IDs here
if codemod.is_semgrep and codemod.name not in semgrep_results:
logger.debug(
"no results from semgrep for %s, skipping analysis",
codemod.id,
)
continue

logger.info("running codemod %s", codemod.id)
results = codemod.apply(context)
analyze_files(
context,
files_to_analyze,
Expand Down
15 changes: 1 addition & 14 deletions src/codemodder/sarifs.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from collections import defaultdict
import json
from pathlib import Path
from typing import List, Union
from typing import Union


def extract_rule_id(result, sarif_run) -> Union[str, None]:
Expand Down Expand Up @@ -43,15 +42,3 @@ def results_by_path_and_rule_id(sarif_file):
rule_id_dict.setdefault(rule_id, []).append(r)
path_and_ruleid_dict[path].update(rule_id_dict)
return path_and_ruleid_dict


def parse_sarif_files(sarifs: List[Path]) -> defaultdict[str, defaultdict[str, List]]:
    """
    Parse SARIF files and organize their results into a dict of dicts keyed
    by path and then by rule ID.

    Results from all given files are merged: when several SARIF files report
    on the same path (or the same path and rule ID), their result lists are
    concatenated in the order the files were given instead of the later file
    replacing the earlier one.

    :param sarifs: paths of the SARIF files to parse
    :return: mapping of path -> rule ID -> list of results
    """
    path_id_dict: defaultdict[str, defaultdict[str, List]] = defaultdict(
        lambda: defaultdict(list)
    )
    for sarif_path in sarifs:
        parsed = results_by_path_and_rule_id(Path(sarif_path))
        for file_path, results_by_rule in parsed.items():
            # Index first so a path with no rule entries is still recorded.
            merged_for_path = path_id_dict[file_path]
            # Extend rather than dict.update(): update() would discard
            # results already collected for this path from earlier files.
            for rule_id, results in results_by_rule.items():
                merged_for_path[rule_id].extend(results)
    return path_id_dict
4 changes: 2 additions & 2 deletions src/codemodder/semgrep.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import subprocess
import itertools
from tempfile import NamedTemporaryFile
from typing import List
from typing import Iterable
from pathlib import Path
from codemodder.context import CodemodExecutionContext
from codemodder.sarifs import results_by_path_and_rule_id
from codemodder.logging import logger


def run(execution_context: CodemodExecutionContext, yaml_files: List[Path]) -> dict:
def run(execution_context: CodemodExecutionContext, yaml_files: Iterable[Path]) -> dict:
"""
Runs Semgrep and outputs a dict with the results organized by rule_id.
"""
Expand Down