From d1518d32f52f8e4da417590da73844d62669d317 Mon Sep 17 00:00:00 2001
From: Stephan Lipp <sphlpp91@gmail.com>
Date: Fri, 27 Oct 2023 14:42:03 +0200
Subject: [PATCH] refactor(sast): add sanity checks in main SAST routine

---
 sast-fuzz/static_analysis/sast/config.yml     |   6 +-
 .../static_analysis/sast/src/sfa/__init__.py  |  29 ++--
 .../sast/src/sfa/analysis/tool_runner.py      | 125 +++++++++++-------
 3 files changed, 92 insertions(+), 68 deletions(-)

diff --git a/sast-fuzz/static_analysis/sast/config.yml b/sast-fuzz/static_analysis/sast/config.yml
index 9cde425..e191b13 100644
--- a/sast-fuzz/static_analysis/sast/config.yml
+++ b/sast-fuzz/static_analysis/sast/config.yml
@@ -5,12 +5,14 @@ scoring:
     tools: 1.0
 tools:
   flawfinder:
+    sanity_checks: 'always' # Options: always, cmake, none
     path: 'python2 /opt/flawfinder-2.0.19/flawfinder.py'
     checks:
       - '--falsepositive'
       - '--minlevel=3'
       - '--neverignore'
   semgrep:
+    sanity_checks: 'always' # Options: always, cmake, none
     path: '/usr/local/bin/semgrep'
     checks:
       - 'r/c.lang.security.double-free.double-free'
@@ -27,6 +29,7 @@ tools:
       - 'r/c.lang.security.use-after-free.use-after-free'
     num_threads: 8
   infer:
+    sanity_checks: 'always' # Options: always, cmake, none
     path: '/opt/infer-1.1.0/bin/infer'
     checks:
       - '--no-default-checkers'
@@ -41,7 +44,7 @@ tools:
       - '--uninit'
     num_threads: 8
   codeql:
-    sanity_checks: 'cmake' # Options: always, cmake, none
+    sanity_checks: 'always' # Options: always, cmake, none
     lib_path: '/opt/codeql-2.12.0/lib'
     path: '/opt/codeql-2.12.0/cli/codeql'
     checks:
@@ -264,6 +267,7 @@ tools:
       - '%LIBRARY_PATH%/cpp/ql/src/Summary/LinesOfUserCode.ql'
     num_threads: 8
   clang_scan:
+    sanity_checks: 'always' # Options: always, cmake, none
     path: '/opt/llvm-12.0.0/build/bin/scan-build'
     checks:
       - '-disable-checker core.CallAndMessage'
diff --git a/sast-fuzz/static_analysis/sast/src/sfa/__init__.py b/sast-fuzz/static_analysis/sast/src/sfa/__init__.py
index 5ecfb7f..fae18cc 100644
--- a/sast-fuzz/static_analysis/sast/src/sfa/__init__.py
+++ b/sast-fuzz/static_analysis/sast/src/sfa/__init__.py
@@ -19,22 +19,11 @@
 
 import yaml
 
-
-# fmt: off
-class SanityChecks(Enum):
-    """
-    Options of when to run sanity checks.
-    """
-
-    ALWAYS = auto(); CMAKE = auto(); NONE = auto()
-# fmt: on
-
-
 ScoreWeights = namedtuple("ScoreWeights", ["flags", "tools"], defaults=[0.5, 0.5])
 
 # SAST tool configuration
 SASTToolConfig = namedtuple(
-    "SASTToolConfig", ["sanity_checks", "path", "checks", "num_threads"], defaults=[SanityChecks.NONE, "", "", -1]
+    "SASTToolConfig", ["sanity_checks", "path", "checks", "num_threads"], defaults=["", "", "", -1]
 )
 
 
@@ -70,27 +59,33 @@ def from_yaml(cls, file: Path) -> "AppConfig":
         return cls(
             ScoreWeights(config["scoring"]["weights"]["flags"], config["scoring"]["weights"]["tools"]),
             flawfinder=SASTToolConfig(
-                SanityChecks.NONE, config["tools"]["flawfinder"]["path"], config["tools"]["flawfinder"]["checks"], -1
+                config["tools"]["flawfinder"]["sanity_checks"],
+                config["tools"]["flawfinder"]["path"],
+                config["tools"]["flawfinder"]["checks"],
+                -1,
             ),
             semgrep=SASTToolConfig(
-                SanityChecks.NONE,
+                config["tools"]["semgrep"]["sanity_checks"],
                 config["tools"]["semgrep"]["path"],
                 config["tools"]["semgrep"]["checks"],
                 config["tools"]["semgrep"]["num_threads"],
             ),
             infer=SASTToolConfig(
-                SanityChecks.NONE,
+                config["tools"]["infer"]["sanity_checks"],
                 config["tools"]["infer"]["path"],
                 config["tools"]["infer"]["checks"],
                 config["tools"]["infer"]["num_threads"],
             ),
             codeql=SASTToolConfig(
-                SanityChecks[config["tools"]["codeql"]["sanity_checks"].upper()],
+                config["tools"]["codeql"]["sanity_checks"],
                 config["tools"]["codeql"]["path"],
                 codeql_checks,
                 config["tools"]["codeql"]["num_threads"],
             ),
             clang_scan=SASTToolConfig(
-                SanityChecks.NONE, config["tools"]["clang_scan"]["path"], config["tools"]["clang_scan"]["checks"], -1
+                config["tools"]["clang_scan"]["sanity_checks"],
+                config["tools"]["clang_scan"]["path"],
+                config["tools"]["clang_scan"]["checks"],
+                -1,
             ),
         )
diff --git a/sast-fuzz/static_analysis/sast/src/sfa/analysis/tool_runner.py b/sast-fuzz/static_analysis/sast/src/sfa/analysis/tool_runner.py
index a052ba6..bd0d471 100644
--- a/sast-fuzz/static_analysis/sast/src/sfa/analysis/tool_runner.py
+++ b/sast-fuzz/static_analysis/sast/src/sfa/analysis/tool_runner.py
@@ -23,7 +23,7 @@
 from tempfile import TemporaryDirectory
 from typing import Callable, ClassVar, Dict, Optional
 
-from sfa import SanityChecks, SASTToolConfig
+from sfa import SASTToolConfig
 from sfa.analysis import SASTFlag, SASTFlags
 from sfa.utils.fs import copy_dir, find_files
 from sfa.utils.proc import run_shell_command
@@ -57,12 +57,11 @@ def is_cmake_project(subject_dir: Path) -> bool:
     return (subject_dir / "CMakeLists.txt").exists()
 
 
-def convert_sarif(string: str, sanity_check: Optional[Callable[[Dict], None]] = None) -> SASTFlags:
+def default_sarif_checks(string: str) -> Dict:
     """
-    Convert SARIF data into our SAST flag format.
+    Run default checks on SARIF string.
 
     :param string:
-    :param sanity_check:
     :return:
     """
     if len(string.strip()) == 0:
@@ -73,8 +72,17 @@ def convert_sarif(string: str, sanity_check: Optional[Callable[[Dict], None]] =
     if sarif_data["version"] != SARIF_VERSION:
         raise ValueError(f"SARIF version {sarif_data['version']} is not supported.")
 
-    if sanity_check is not None:
-        sanity_check(sarif_data)
+    return sarif_data
+
+
+def convert_sarif(string: str) -> SASTFlags:
+    """
+    Convert SARIF data into our SAST flag format.
+
+    :param string:
+    :return:
+    """
+    sarif_data = json.loads(string)
 
     flags = SASTFlags()
 
@@ -129,6 +137,16 @@ def _analyze(self, working_dir: Path) -> str:
         """
         pass
 
+    @abstractmethod
+    def _sanity_checks(self, string: str) -> None:
+        """
+        Run sanity checks on SAST tool output.
+
+        :param string:
+        :return:
+        """
+        pass
+
     @abstractmethod
     def _format(self, string: str) -> SASTFlags:
         """
@@ -141,7 +159,7 @@ def _format(self, string: str) -> SASTFlags:
 
     def run(self) -> SASTFlags:
         """
-        Setup target program, run SAST tool, and format output.
+        Setup target program, run SAST tool (+ sanity checks), and format output.
 
         :return:
         """
@@ -150,6 +168,11 @@ def run(self) -> SASTFlags:
                 working_dir = self._setup(Path(temp_dir))
                 flags = self._analyze(working_dir)
 
+            if self._config.sanity_checks == "always" or (
+                self._config.sanity_checks == "cmake" and self._is_cmake_project
+            ):
+                self._sanity_checks(flags)
+
             return self._format(flags)
 
         except Exception as ex:
@@ -172,6 +195,9 @@ def _analyze(self, working_dir: Path) -> str:
             f"{self._config.path} --dataonly --sarif {' '.join(self._config.checks)} {working_dir}"
         )
 
+    def _sanity_checks(self, string: str) -> None:
+        default_sarif_checks(string)
+
     def _format(self, string: str) -> SASTFlags:
         return convert_sarif(string)
 
@@ -189,6 +215,9 @@ def _analyze(self, working_dir: Path) -> str:
             f"{self._config.path} scan --quiet --sarif --jobs {self._config.num_threads} {' '.join([f'--config {check}' for check in self._config.checks])} {working_dir}"
         )
 
+    def _sanity_checks(self, string: str) -> None:
+        default_sarif_checks(string)
+
     def _format(self, string: str) -> SASTFlags:
         return convert_sarif(string)
 
@@ -220,6 +249,9 @@ def _analyze(self, working_dir: Path) -> str:
         # By default, Infer writes the results into the 'report.json' file once the analysis is complete.
         return (working_dir / "report.json").read_text()
 
+    def _sanity_checks(self, string: str) -> None:
+        pass
+
     def _format(self, string: str) -> SASTFlags:
         flags = SASTFlags()
 
@@ -241,39 +273,6 @@ class CodeQLRunner(SASTToolRunner):
     CodeQL runner.
     """
 
-    def _sanity_check(self, sarif_data: Dict) -> None:
-        """
-        Run sanity checks on CodeQL output.
-
-        :param sarif_data:
-        :return:
-        """
-        n_runs = len(sarif_data["runs"])
-
-        if n_runs == 0:
-            raise ValueError("No CodeQL execution runs found.")
-
-        # Let's take the last executed SAST run for the sanity check
-        run = sarif_data["runs"][n_runs - 1]
-        metrics = run["properties"].get("metricResults")
-
-        if metrics is None:
-            raise ValueError("No CodeQL metrics data found in SARIF file.")
-
-        loc = 0
-        user_loc = 0
-
-        for m in metrics:
-            if m["ruleId"] == "cpp/summary/lines-of-code":
-                loc = int(m["value"])
-            if m["ruleId"] == "cpp/summary/lines-of-user-code":
-                user_loc = int(m["value"])
-
-        logging.debug(f"Sanity-Check [CodeQL]: LoC = {loc}, User-LoC = {user_loc}")
-
-        if user_loc == 0:
-            raise ValueError("No user C/C++ source code found in the CodeQL database.")
-
     def _setup(self, temp_dir: Path) -> Path:
         result_dir = temp_dir / "codeql_res"
 
@@ -303,18 +302,37 @@ def _analyze(self, working_dir: Path) -> str:
 
         return result_file.read_text()
 
-    def _format(self, string: str) -> SASTFlags:
-        run_sc = False
+    def _sanity_checks(self, string: str) -> None:
+        sarif_data = default_sarif_checks(string)
 
-        if self._config.sanity_checks == SanityChecks.ALWAYS or (
-            self._config.sanity_checks == SanityChecks.CMAKE and self._is_cmake_project
-        ):
-            run_sc = True
+        n_runs = len(sarif_data["runs"])
 
-        if not run_sc:
-            return convert_sarif(string)
-        else:
-            return convert_sarif(string, self._sanity_check)
+        if n_runs == 0:
+            raise ValueError("No CodeQL execution runs found.")
+
+        # Let's take the last executed SAST run for the sanity check
+        run = sarif_data["runs"][n_runs - 1]
+        metrics = run["properties"].get("metricResults")
+
+        if metrics is None:
+            raise ValueError("No CodeQL metrics data found in SARIF file.")
+
+        loc = 0
+        user_loc = 0
+
+        for m in metrics:
+            if m["ruleId"] == "cpp/summary/lines-of-code":
+                loc = int(m["value"])
+            if m["ruleId"] == "cpp/summary/lines-of-user-code":
+                user_loc = int(m["value"])
+
+        logging.debug(f"Sanity-Check [CodeQL]: LoC = {loc}, User-LoC = {user_loc}")
+
+        if user_loc == 0:
+            raise ValueError("No user C/C++ source code found in the CodeQL database.")
+
+    def _format(self, string: str) -> SASTFlags:
+        return convert_sarif(string)
 
 
 class ClangScanRunner(SASTToolRunner):
@@ -340,6 +358,10 @@ def _analyze(self, working_dir: Path) -> str:
         # (JSON string) of each file as one line to the return string.
         return os.linesep.join(map(lambda file: json.dumps(json.loads(file.read_text()), indent=None), result_files))
 
+    def _sanity_checks(self, string: str) -> None:
+        for sarif_str in string.split(os.linesep):
+            default_sarif_checks(sarif_str)
+
     def _format(self, string: str) -> SASTFlags:
         nested_flags = map(convert_sarif, string.split(os.linesep))
 
@@ -369,6 +391,9 @@ def _setup(self, temp_dir: Path) -> Path:
     def _analyze(self, working_dir: Path) -> str:
         return (working_dir / self._report_name).read_text()
 
+    def _sanity_checks(self, string: str) -> None:
+        pass
+
     def _format(self, string: str) -> SASTFlags:
         flags = SASTFlags()