From deb18c6f1ef54e4bb89942d6986c4cd6f639b0db Mon Sep 17 00:00:00 2001 From: Daniel D'Avella Date: Fri, 27 Oct 2023 11:46:02 -0400 Subject: [PATCH] Only apply semgrep to codemods with results --- src/codemodder/codemodder.py | 26 ++++++++++++++++++++++---- src/codemodder/sarifs.py | 15 +-------------- src/codemodder/semgrep.py | 4 ++-- 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/src/codemodder/codemodder.py b/src/codemodder/codemodder.py index 5e52c2a2..af918eaf 100644 --- a/src/codemodder/codemodder.py +++ b/src/codemodder/codemodder.py @@ -1,6 +1,7 @@ from concurrent.futures import ThreadPoolExecutor import datetime import difflib +import itertools import logging import os import sys @@ -19,6 +20,7 @@ from codemodder.executor import CodemodExecutorWrapper from codemodder.project_analysis.python_repo_manager import PythonRepoManager from codemodder.report.codetf_reporter import report_default +from codemodder.semgrep import run as run_semgrep def update_code(file_path, new_code): @@ -29,6 +31,18 @@ def update_code(file_path, new_code): f.write(new_code) +def find_semgrep_results( + context: CodemodExecutionContext, + codemods: list[CodemodExecutorWrapper], +) -> set[str]: + """Run semgrep once with all configuration files from all codemods and return a set of applicable rule IDs""" + yaml_files = itertools.chain.from_iterable( + [codemod.yaml_files for codemod in codemods if codemod.yaml_files] + ) + results = run_semgrep(context, yaml_files) + return {rule_id for file_changes in results.values() for rule_id in file_changes} + + def apply_codemod_to_file( base_directory: Path, file_context, @@ -184,7 +198,7 @@ def run(original_args) -> int: log_list(logging.INFO, "including paths", argv.path_include) log_list(logging.INFO, "excluding paths", argv.path_exclude) - files_to_analyze = match_files( + files_to_analyze: list[Path] = match_files( context.directory, argv.path_exclude, argv.path_include ) if not files_to_analyze: @@ -195,18 +209,22 @@ def run(original_args) -> int: logger.debug("matched files:") log_list(logging.DEBUG, "matched files", full_names) + semgrep_results: set[str] = find_semgrep_results(context, codemods_to_run) + log_section("scanning") # run codemods one at a time making sure to respect the given sequence for codemod in codemods_to_run: - logger.info("running codemod %s", codemod.id) - results = codemod.apply(context) - if codemod.is_semgrep and not results: + # Unfortunately the IDs from semgrep are not fully specified + # TODO: eventually we need to be able to use fully specified IDs here + if codemod.is_semgrep and codemod.name not in semgrep_results: logger.debug( "no results from semgrep for %s, skipping analysis", codemod.id, ) continue + logger.info("running codemod %s", codemod.id) + results = codemod.apply(context) analyze_files( context, files_to_analyze, diff --git a/src/codemodder/sarifs.py b/src/codemodder/sarifs.py index 371337d7..86da018a 100644 --- a/src/codemodder/sarifs.py +++ b/src/codemodder/sarifs.py @@ -1,7 +1,6 @@ from collections import defaultdict import json -from pathlib import Path -from typing import List, Union +from typing import Union def extract_rule_id(result, sarif_run) -> Union[str, None]: @@ -43,15 +42,3 @@ def results_by_path_and_rule_id(sarif_file): rule_id_dict.setdefault(rule_id, []).append(r) path_and_ruleid_dict[path].update(rule_id_dict) return path_and_ruleid_dict - - -def parse_sarif_files(sarifs: List[Path]) -> defaultdict[str, defaultdict[str, List]]: - """ - Parse sarif files organize their results into a dict of dicts organized by path and id. - """ - path_id_dict: defaultdict[str, defaultdict[str, List]] = defaultdict( - lambda: defaultdict(list) - ) - for path in sarifs: - path_id_dict.update(results_by_path_and_rule_id(Path(path))) - return path_id_dict diff --git a/src/codemodder/semgrep.py b/src/codemodder/semgrep.py index 59493d04..1c08ea60 100644 --- a/src/codemodder/semgrep.py +++ b/src/codemodder/semgrep.py @@ -1,14 +1,14 @@ import subprocess import itertools from tempfile import NamedTemporaryFile -from typing import List +from typing import Iterable from pathlib import Path from codemodder.context import CodemodExecutionContext from codemodder.sarifs import results_by_path_and_rule_id from codemodder.logging import logger -def run(execution_context: CodemodExecutionContext, yaml_files: List[Path]) -> dict: +def run(execution_context: CodemodExecutionContext, yaml_files: Iterable[Path]) -> dict: """ Runs Semgrep and outputs a dict with the results organized by rule_id. """