From deb18c6f1ef54e4bb89942d6986c4cd6f639b0db Mon Sep 17 00:00:00 2001
From: Daniel D'Avella <dan.davella@pixee.ai>
Date: Fri, 27 Oct 2023 11:46:02 -0400
Subject: [PATCH] Only apply semgrep to codemods with results

---
 src/codemodder/codemodder.py | 26 ++++++++++++++++++++++----
 src/codemodder/sarifs.py     | 15 +--------------
 src/codemodder/semgrep.py    |  4 ++--
 3 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/src/codemodder/codemodder.py b/src/codemodder/codemodder.py
index 5e52c2a2..af918eaf 100644
--- a/src/codemodder/codemodder.py
+++ b/src/codemodder/codemodder.py
@@ -1,6 +1,7 @@
 from concurrent.futures import ThreadPoolExecutor
 import datetime
 import difflib
+import itertools
 import logging
 import os
 import sys
@@ -19,6 +20,7 @@
 from codemodder.executor import CodemodExecutorWrapper
 from codemodder.project_analysis.python_repo_manager import PythonRepoManager
 from codemodder.report.codetf_reporter import report_default
+from codemodder.semgrep import run as run_semgrep
 
 
 def update_code(file_path, new_code):
@@ -29,6 +31,18 @@ def update_code(file_path, new_code):
         f.write(new_code)
 
 
+def find_semgrep_results(
+    context: CodemodExecutionContext,
+    codemods: list[CodemodExecutorWrapper],
+) -> set[str]:
+    """Run semgrep once with the YAML config files of all given codemods and return the set of rule IDs that produced at least one result."""
+    yaml_files = itertools.chain.from_iterable(
+        [codemod.yaml_files for codemod in codemods if codemod.yaml_files]
+    )
+    results = run_semgrep(context, yaml_files)
+    return {rule_id for file_changes in results.values() for rule_id in file_changes}
+
+
 def apply_codemod_to_file(
     base_directory: Path,
     file_context,
@@ -184,7 +198,7 @@ def run(original_args) -> int:
     log_list(logging.INFO, "including paths", argv.path_include)
     log_list(logging.INFO, "excluding paths", argv.path_exclude)
 
-    files_to_analyze = match_files(
+    files_to_analyze: list[Path] = match_files(
         context.directory, argv.path_exclude, argv.path_include
     )
     if not files_to_analyze:
@@ -195,18 +209,22 @@ def run(original_args) -> int:
     logger.debug("matched files:")
     log_list(logging.DEBUG, "matched files", full_names)
 
+    semgrep_results: set[str] = find_semgrep_results(context, codemods_to_run)
+
     log_section("scanning")
     # run codemods one at a time making sure to respect the given sequence
     for codemod in codemods_to_run:
-        logger.info("running codemod %s", codemod.id)
-        results = codemod.apply(context)
-        if codemod.is_semgrep and not results:
+        # Unfortunately the IDs from semgrep are not fully specified, so we match on codemod.name
+        # TODO: eventually we need to be able to use fully specified IDs (codemod.id) here
+        if codemod.is_semgrep and codemod.name not in semgrep_results:
             logger.debug(
                 "no results from semgrep for %s, skipping analysis",
                 codemod.id,
             )
             continue
 
+        logger.info("running codemod %s", codemod.id)
+        results = codemod.apply(context)
         analyze_files(
             context,
             files_to_analyze,
diff --git a/src/codemodder/sarifs.py b/src/codemodder/sarifs.py
index 371337d7..86da018a 100644
--- a/src/codemodder/sarifs.py
+++ b/src/codemodder/sarifs.py
@@ -1,7 +1,6 @@
 from collections import defaultdict
 import json
-from pathlib import Path
-from typing import List, Union
+from typing import Union
 
 
 def extract_rule_id(result, sarif_run) -> Union[str, None]:
@@ -43,15 +42,3 @@ def results_by_path_and_rule_id(sarif_file):
                     rule_id_dict.setdefault(rule_id, []).append(r)
             path_and_ruleid_dict[path].update(rule_id_dict)
     return path_and_ruleid_dict
-
-
-def parse_sarif_files(sarifs: List[Path]) -> defaultdict[str, defaultdict[str, List]]:
-    """
-    Parse sarif files organize their results into a dict of dicts organized by path and id.
-    """
-    path_id_dict: defaultdict[str, defaultdict[str, List]] = defaultdict(
-        lambda: defaultdict(list)
-    )
-    for path in sarifs:
-        path_id_dict.update(results_by_path_and_rule_id(Path(path)))
-    return path_id_dict
diff --git a/src/codemodder/semgrep.py b/src/codemodder/semgrep.py
index 59493d04..1c08ea60 100644
--- a/src/codemodder/semgrep.py
+++ b/src/codemodder/semgrep.py
@@ -1,14 +1,14 @@
 import subprocess
 import itertools
 from tempfile import NamedTemporaryFile
-from typing import List
+from typing import Iterable
 from pathlib import Path
 from codemodder.context import CodemodExecutionContext
 from codemodder.sarifs import results_by_path_and_rule_id
 from codemodder.logging import logger
 
 
-def run(execution_context: CodemodExecutionContext, yaml_files: List[Path]) -> dict:
+def run(execution_context: CodemodExecutionContext, yaml_files: Iterable[Path]) -> dict:
     """
     Runs Semgrep and outputs a dict with the results organized by rule_id.
     """