pixee · drdavella · Nov 6, 2023 · Oct 31, 2023 · Nov 2, 2023
@@ -20,6 +20,7 @@
 from codemodder.executor import CodemodExecutorWrapper
 from codemodder.project_analysis.python_repo_manager import PythonRepoManager
 from codemodder.report.codetf_reporter import report_default
+from codemodder.result import ResultSet
 from codemodder.semgrep import run as run_semgrep
 
 
@@ -34,18 +35,17 @@ def update_code(file_path, new_code):
 def find_semgrep_results(
     context: CodemodExecutionContext,
     codemods: list[CodemodExecutorWrapper],
-) -> set[str]:
+) -> ResultSet:
     """Run semgrep once with all configuration files from all codemods and return a set of applicable rule IDs"""
     yaml_files = list(
         itertools.chain.from_iterable(
             [codemod.yaml_files for codemod in codemods if codemod.yaml_files]
         )
     )
     if not yaml_files:
-        return set()
+        return ResultSet()
 
-    results = run_semgrep(context, yaml_files)
-    return {rule_id for file_changes in results.values() for rule_id in file_changes}
+    return run_semgrep(context, yaml_files)
 
 
 def create_diff(original_tree: cst.Module, new_tree: cst.Module) -> str:
@@ -103,7 +103,7 @@ def process_file(
     file_path: Path,
     base_directory: Path,
     codemod,
-    sarif,
+    results: ResultSet,
     cli_args,
 ):  # pylint: disable=too-many-arguments
     logger.debug("scanning file %s", file_path)
@@ -112,14 +112,17 @@ def process_file(
 
     line_exclude = file_line_patterns(file_path, cli_args.path_exclude)
     line_include = file_line_patterns(file_path, cli_args.path_include)
-    sarif_for_file = sarif.get(str(file_path)) or {}
+    findings_for_rule = results.results_for_rule_and_file(
+        codemod.name,  # TODO: should be full ID
+        file_path,
+    )
 
     file_context = FileContext(
         base_directory,
         file_path,
         line_exclude,
         line_include,
-        sarif_for_file,
+        findings_for_rule,
     )
 
     try:
@@ -146,27 +149,50 @@ def analyze_files(
     execution_context: CodemodExecutionContext,
     files_to_analyze,
     codemod,
-    sarif,
+    results: ResultSet,
     cli_args,
 ):
     with ThreadPoolExecutor(max_workers=cli_args.max_workers) as executor:
         logger.debug(
             "using executor with %s threads",
             cli_args.max_workers,
         )
-        results = executor.map(
+        analysis_results = executor.map(
             lambda args: process_file(
                 args[0],
                 args[1],
                 execution_context.directory,
                 codemod,
-                sarif,
+                results,
                 cli_args,
             ),
             enumerate(files_to_analyze),
         )
         executor.shutdown(wait=True)
-        execution_context.process_results(codemod.id, results)
+        execution_context.process_results(codemod.id, analysis_results)
+
+
+def log_report(context, argv, elapsed_ms, files_to_analyze):
+    log_section("report")
+    logger.info("scanned: %s files", len(files_to_analyze))
+    all_failures = context.get_failed_files()
+    logger.info(
+        "failed: %s files (%s unique)",
+        len(all_failures),
+        len(set(all_failures)),
+    )
+    all_changes = context.get_changed_files()
+    logger.info(
+        "changed: %s files (%s unique)",
+        len(all_changes),
+        len(set(all_changes)),
+    )
+    logger.info("report file: %s", argv.output)
+    logger.info("total elapsed: %s ms", elapsed_ms)
+    logger.info("  semgrep:     %s ms", context.timer.get_time_ms("semgrep"))
+    logger.info("  parse:       %s ms", context.timer.get_time_ms("parse"))
+    logger.info("  transform:   %s ms", context.timer.get_time_ms("transform"))
+    logger.info("  write:       %s ms", context.timer.get_time_ms("write"))
 
 
 def run(original_args) -> int:
@@ -221,25 +247,27 @@ def run(original_args) -> int:
         return 0
 
     full_names = [str(path) for path in files_to_analyze]
-    logger.debug("matched files:")
     log_list(logging.DEBUG, "matched files", full_names)
 
-    semgrep_results: set[str] = find_semgrep_results(context, codemods_to_run)
+    semgrep_results: ResultSet = find_semgrep_results(context, codemods_to_run)
+    semgrep_finding_ids = semgrep_results.all_rule_ids()
 
     log_section("scanning")
     # run codemods one at a time making sure to respect the given sequence
     for codemod in codemods_to_run:
         # Unfortunately the IDs from semgrep are not fully specified
         # TODO: eventually we need to be able to use fully specified IDs here
-        if codemod.is_semgrep and codemod.name not in semgrep_results:
+        if codemod.is_semgrep and codemod.name not in semgrep_finding_ids:
             logger.debug(
                 "no results from semgrep for %s, skipping analysis",
                 codemod.id,
             )
             continue
 
         logger.info("running codemod %s", codemod.id)
-        results = codemod.apply(context)
+        semgrep_files = semgrep_results.files_for_rule(codemod.name)
+        # Non-semgrep codemods ignore the semgrep results
+        results = codemod.apply(context, semgrep_files)
         analyze_files(
             context,
             files_to_analyze,
@@ -256,27 +284,7 @@ def run(original_args) -> int:
     elapsed_ms = int(elapsed.total_seconds() * 1000)
     report_default(elapsed_ms, argv, original_args, results)
 
-    log_section("report")
-    logger.info("scanned: %s files", len(files_to_analyze))
-    all_failures = context.get_failed_files()
-    logger.info(
-        "failed: %s files (%s unique)",
-        len(all_failures),
-        len(set(all_failures)),
-    )
-    all_changes = context.get_changed_files()
-    logger.info(
-        "changed: %s files (%s unique)",
-        len(all_changes),
-        len(set(all_changes)),
-    )
-    logger.info("report file: %s", argv.output)
-    logger.info("total elapsed: %s ms", elapsed_ms)
-    logger.info("semgrep:       %s ms", context.timer.get_time_ms("semgrep"))
-    logger.info("parse:         %s ms", context.timer.get_time_ms("parse"))
-    logger.info("transform:     %s ms", context.timer.get_time_ms("transform"))
-    logger.info("write:         %s ms", context.timer.get_time_ms("write"))
-
+    log_report(context, argv, elapsed_ms, files_to_analyze)
     return 0
 
 

@@ -109,7 +109,7 @@ def __init_subclass__(cls):
     def __init__(self, codemod_context: CodemodContext, file_context: FileContext):
         BaseCodemod.__init__(self, codemod_context, file_context)
         _SemgrepCodemod.__init__(self, file_context)
-        BaseTransformer.__init__(self, codemod_context, self._results)
+        BaseTransformer.__init__(self, codemod_context, file_context.findings)
 
     def _new_or_updated_node(self, original_node, updated_node):
         if self.node_is_selected(original_node):

@@ -7,6 +7,7 @@
 from codemodder.change import Change
 from codemodder.dependency import Dependency
 from codemodder.file_context import FileContext
+from codemodder.result import ResultSet
 from codemodder.semgrep import run as semgrep_run
 
 
@@ -51,14 +52,14 @@ def __init__(self, file_context: FileContext):
         self.file_context = file_context
 
     @classmethod
-    def apply_rule(cls, context, *args, **kwargs):
+    def apply_rule(cls, context, *args, **kwargs) -> ResultSet:
         """
         Apply rule associated with this codemod and gather results
 
         Does nothing by default. Subclasses may override for custom rule logic.
         """
         del context, args, kwargs
-        return {}
+        return ResultSet()
 
     @classmethod
     def name(cls):
@@ -105,28 +106,17 @@ class SemgrepCodemod(BaseCodemod):
     YAML_FILES: ClassVar[List[str]] = NotImplemented
     is_semgrep = True
 
-    def __init__(self, *args):
-        super().__init__(*args)
-        self._results = (
-            self.file_context.results_by_id.get(
-                self.METADATA.NAME  # pylint: disable=no-member
-            )
-            or []
-        )
-
     @classmethod
-    def apply_rule(
-        cls, context, yaml_files, *args, **kwargs
-    ):  # pylint: disable=arguments-differ
+    def apply_rule(cls, context, *args, **kwargs) -> ResultSet:
         """
         Apply semgrep to gather rule results
         """
-        del args, kwargs
+        yaml_files = kwargs.get("yaml_files") or args[0]
+        files_to_analyze = kwargs.get("files_to_analyze") or args[1]
         with context.timer.measure("semgrep"):
-            return semgrep_run(context, yaml_files)
+            return semgrep_run(context, yaml_files, files_to_analyze)
 
     @property
     def should_transform(self):
-        """Semgrep codemods should attempt transform only if there are
-        semgrep results"""
-        return bool(self._results)
+        """Semgrep codemods should attempt transform only if there are semgrep results"""
+        return bool(self.file_context.findings)
@@ -2,17 +2,21 @@
 from libcst.codemod import ContextAwareVisitor, VisitorBasedCodemodCommand
 from libcst.metadata import PositionProvider
 
+from codemodder.result import Result
+
 
 class UtilsMixin:
-    METADATA_DEPENDENCIES: Tuple[Any, ...] = (PositionProvider,)
+    results: list[Result]
 
-    def __init__(self, context, results):
-        super().__init__(context)
+    def __init__(self, results: list[Result]):
         self.results = results
 
     def filter_by_result(self, pos_to_match):
-        all_pos = [extract_pos_from_result(result) for result in self.results]
-        return any(match_pos(pos_to_match, position) for position in all_pos)
+        return any(
+            location.match(pos_to_match)
+            for result in self.results
+            for location in result.locations
+        )
 
     def filter_by_path_includes_or_excludes(self, pos_to_match):
         """
@@ -28,6 +32,7 @@ def filter_by_path_includes_or_excludes(self, pos_to_match):
     def node_is_selected(self, node) -> bool:
         if not self.results:
             return False
+
         pos_to_match = self.node_position(node)
         return self.filter_by_result(
             pos_to_match
@@ -41,34 +46,21 @@ def lineno_for_node(self, node):
         return self.node_position(node).start.line
 
 
-class BaseTransformer(UtilsMixin, VisitorBasedCodemodCommand):
-    ...
+class BaseTransformer(VisitorBasedCodemodCommand, UtilsMixin):
+    METADATA_DEPENDENCIES: Tuple[Any, ...] = (PositionProvider,)
+
+    def __init__(self, context, results: list[Result]):
+        super().__init__(context)
+        UtilsMixin.__init__(self, results)
+
 
+class BaseVisitor(ContextAwareVisitor, UtilsMixin):
+    METADATA_DEPENDENCIES: Tuple[Any, ...] = (PositionProvider,)
 
-class BaseVisitor(UtilsMixin, ContextAwareVisitor):
-    ...
+    def __init__(self, context, results: list[Result]):
+        super().__init__(context)
+        UtilsMixin.__init__(self, results)
 
 
 def match_line(pos, line):
     return pos.start.line == line and pos.end.line == line
-
-
-def extract_pos_from_result(result):
-    region = result["locations"][0]["physicalLocation"]["region"]
-    # TODO it may be the case some of these attributes do not exist
-    return (
-        region.get("startLine"),
-        region["startColumn"],
-        region.get("endLine") or region.get("startLine"),
-        region["endColumn"],
-    )
-
-
-def match_pos(pos, x):
-    # needs some leeway because the semgrep and libcst won't exactly match
-    return (
-        pos.start.line == x[0]
-        and (pos.start.column in (x[1] - 1, x[1]))
-        and pos.end.line == x[2]
-        and (pos.end.column in (x[3] - 1, x[3]))
-    )
@@ -1,4 +1,5 @@
 from importlib.abc import Traversable
+from pathlib import Path
 
 from wrapt import CallableObjectProxy
 
@@ -22,13 +23,17 @@ def __init__(
         self.docs_module = docs_module
         self.semgrep_config_module = semgrep_config_module
 
-    def apply(self, context):
+    def apply(self, context, files: list[Path]):
         """
         Wraps the codemod's apply method to inject additional arguments.
 
         Not all codemods will need these arguments.
         """
-        return self.apply_rule(context, yaml_files=self.yaml_files)
+        return self.apply_rule(
+            context,
+            yaml_files=self.yaml_files,
+            files_to_analyze=files,
+        )
 
     @property
     def name(self):

@@ -1,9 +1,10 @@
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Dict, List
+from typing import List
 
 from codemodder.change import Change, ChangeSet
 from codemodder.dependency import Dependency
+from codemodder.result import Result
 from codemodder.utils.timer import Timer
 
 
@@ -17,7 +18,7 @@ class FileContext:  # pylint: disable=too-many-instance-attributes
     file_path: Path
     line_exclude: List[int] = field(default_factory=list)
     line_include: List[int] = field(default_factory=list)
-    results_by_id: Dict = field(default_factory=dict)
+    findings: List[Result] = field(default_factory=list)
     dependencies: set[Dependency] = field(default_factory=set)
     codemod_changes: List[Change] = field(default_factory=list)
     results: List[ChangeSet] = field(default_factory=list)