From d06d08a63a93ff70d6d9952b741cbf146c8554c2 Mon Sep 17 00:00:00 2001 From: Stephan Lipp Date: Wed, 25 Oct 2023 10:56:23 +0200 Subject: [PATCH 1/4] refactor(crash-dedup): minor change --- scripts/crash-dedup/src/cdd/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/crash-dedup/src/cdd/main.py b/scripts/crash-dedup/src/cdd/main.py index 21fbaf7..fca8330 100644 --- a/scripts/crash-dedup/src/cdd/main.py +++ b/scripts/crash-dedup/src/cdd/main.py @@ -150,7 +150,7 @@ def main( logging.error(ex) for n_frames in set(n_frames_list or [None]): # type: ignore - summary_file = output_dir / f"summary{'' if n_frames is None else '_nframes' + str(n_frames)}.csv" + summary_file = output_dir / f"summary{'' if n_frames is None else '_nf' + str(n_frames)}.csv" summary = group_by(sanitizer_infos, n_frames, consider_lines) summary.to_csv(summary_file) From cd1f18036c94d0bf529f050024c1f4bf3712b615 Mon Sep 17 00:00:00 2001 From: Stephan Lipp Date: Wed, 25 Oct 2023 17:56:23 +0200 Subject: [PATCH 2/4] feat(crash-dedup): add option to incl. filepath in dedup --- scripts/crash-dedup/src/cdd/container/san.py | 15 ++++-- .../crash-dedup/src/cdd/container/summary.py | 11 +++- scripts/crash-dedup/src/cdd/grouping.py | 9 +++- scripts/crash-dedup/src/cdd/main.py | 10 +++- scripts/crash-dedup/tests/test_grouping.py | 54 +++++++++++++++---- 5 files changed, 81 insertions(+), 18 deletions(-) diff --git a/scripts/crash-dedup/src/cdd/container/san.py b/scripts/crash-dedup/src/cdd/container/san.py index c69aa30..33fd3ac 100644 --- a/scripts/crash-dedup/src/cdd/container/san.py +++ b/scripts/crash-dedup/src/cdd/container/san.py @@ -92,18 +92,27 @@ def __init__(self, input_id: str, san: str, vtype: str, stack_trace: StackTrace) self.vtype = vtype self.stack_trace = stack_trace - def sorting_key(self, n_frames: Optional[int] = None, consider_lines: bool = False) -> Tuple: + def sorting_key( + self, n_frames: Optional[int] = None, consider_filepaths: bool = False, consider_lines: bool = False + ) -> Tuple: """ Get sorting key for grouping sanitizer outputs. :param n_frames: + :param consider_filepaths: :param consider_lines: :return: """ stack_trace = self.stack_trace if n_frames is None else self.stack_trace[:n_frames] - if not consider_lines: - stack_trace = [(t.id, t.file, t.function) for t in stack_trace] # type: ignore + if consider_filepaths: + if not consider_lines: + stack_trace = [(t.id, t.file, t.function) for t in stack_trace] # type: ignore + else: + if not consider_lines: + stack_trace = [(t.id, t.function) for t in stack_trace] # type: ignore + else: + stack_trace = [(t.id, t.function, t.line) for t in stack_trace] # type: ignore return self.san, self.vtype, stack_trace diff --git a/scripts/crash-dedup/src/cdd/container/summary.py b/scripts/crash-dedup/src/cdd/container/summary.py index 803946e..2731206 100644 --- a/scripts/crash-dedup/src/cdd/container/summary.py +++ b/scripts/crash-dedup/src/cdd/container/summary.py @@ -32,9 +32,14 @@ class DedupSummary: """ def __init__( - self, n_frames: Optional[int], consider_lines: bool, summary: Optional[List[DedupEntry]] = None + self, + n_frames: Optional[int], + consider_filepaths: bool, + consider_lines: bool, + summary: Optional[List[DedupEntry]] = None, ) -> None: self.n_frames = n_frames + self.consider_filepaths = consider_filepaths self.consider_lines = consider_lines self.summary = summary or [] @@ -44,7 +49,8 @@ def add(self, id: int, key: Tuple, elems: List[SanitizerOutput]) -> None: def to_csv(self, file: Path) -> None: with file.open("w+") as csv_file: csv_file.write( - "bug_id,n_dedup_frames,consider_lines,input_name,sanitizer,vuln_type,stack_trace,n_frames" + os.linesep + "bug_id,n_dedup_frames,consider_filepaths,consider_lines,input_name,sanitizer,vuln_type,stack_trace,n_frames" + + os.linesep ) for entry in self.summary: @@ -57,6 +63,7 @@ def to_csv(self, file: Path) -> None: ( str(entry.bug_id), frame_str, + str(self.consider_filepaths), str(self.consider_lines), str(san_output.input_id).replace(CSV_SEP, "-"), san_output.san, diff --git a/scripts/crash-dedup/src/cdd/grouping.py b/scripts/crash-dedup/src/cdd/grouping.py index 4951c97..0308c70 100644 --- a/scripts/crash-dedup/src/cdd/grouping.py +++ b/scripts/crash-dedup/src/cdd/grouping.py @@ -20,20 +20,25 @@ def group_by( - sanitizer_infos: List[SanitizerOutput], n_frames: Optional[int] = None, consider_lines: bool = False + sanitizer_infos: List[SanitizerOutput], + n_frames: Optional[int] = None, + consider_filepaths: bool = False, + consider_lines: bool = False, ) -> DedupSummary: """ Group/deduplicate sanitizer outputs. :param sanitizer_infos: :param n_frames: + :param consider_filepaths: :param consider_lines: :return: """ - keyfunc = lambda s: s.sorting_key(n_frames, consider_lines) + keyfunc = lambda s: s.sorting_key(n_frames, consider_filepaths, consider_lines) return DedupSummary( n_frames, + consider_filepaths, consider_lines, [ DedupEntry(i, k, list(g)) diff --git a/scripts/crash-dedup/src/cdd/main.py b/scripts/crash-dedup/src/cdd/main.py index fca8330..39b3ba4 100644 --- a/scripts/crash-dedup/src/cdd/main.py +++ b/scripts/crash-dedup/src/cdd/main.py @@ -87,6 +87,14 @@ def main( help="Number(s) of stack frames to be included in deduplication. Note: If not specified, all frames are considered.", ), ] = None, + consider_filepaths: Annotated[ + bool, + typer.Option( + "--consider-filepaths", + is_flag=True, + help="Consider the filepaths within the stack frames for deduplication.", + ), + ] = False, consider_lines: Annotated[ bool, typer.Option( @@ -152,5 +160,5 @@ def main( for n_frames in set(n_frames_list or [None]): # type: ignore summary_file = output_dir / f"summary{'' if n_frames is None else '_nf' + str(n_frames)}.csv" - summary = group_by(sanitizer_infos, n_frames, consider_lines) + summary = group_by(sanitizer_infos, n_frames, consider_filepaths, consider_lines) summary.to_csv(summary_file) diff --git a/scripts/crash-dedup/tests/test_grouping.py b/scripts/crash-dedup/tests/test_grouping.py index 85e02d2..e7744f2 100644 --- a/scripts/crash-dedup/tests/test_grouping.py +++ b/scripts/crash-dedup/tests/test_grouping.py @@ -63,7 +63,7 @@ def test_group_by_all_frames(self) -> None: ] # Act - actual = group_by(sanitizer_infos, None, True) + actual = group_by(sanitizer_infos, None, True, True) # Assert n = len(actual.summary) @@ -84,7 +84,7 @@ def test_group_by_all_frames_same_vtype(self) -> None: ] # Act - actual = group_by(sanitizer_infos, None, True) + actual = group_by(sanitizer_infos, None, True, True) # Assert n = len(actual.summary) @@ -105,7 +105,7 @@ def test_group_by_all_frames_same_trace(self) -> None: ] # Act - actual = group_by(sanitizer_infos, None, True) + actual = group_by(sanitizer_infos, None, True, True) # Assert n = len(actual.summary) @@ -126,7 +126,7 @@ def test_group_by_all_frames_different_sanitizer(self) -> None: ] # Act - actual = group_by(sanitizer_infos, None, True) + actual = group_by(sanitizer_infos, None, True, True) # Assert n = len(actual.summary) @@ -147,7 +147,24 @@ def test_group_by_all_frames_same_group(self) -> None: ] # Act - actual = group_by(sanitizer_infos, None, True) + actual = group_by(sanitizer_infos, None, True, True) + + # Assert + self.assertEqual(len(actual.summary), 1) + + for info in sanitizer_infos: + self.assertIn(info, actual.summary[0].elems) + + def test_group_by_all_frames_same_group_ignore_filepaths(self) -> None: + # Arrange + sanitizer_infos = [ + SanitizerOutput("input1", "san1", "type1", [StackFrame(1, "file1", "func1", 10)]), + SanitizerOutput("input2", "san1", "type1", [StackFrame(1, "file2", "func1", 10)]), + SanitizerOutput("input3", "san1", "type1", [StackFrame(1, "file3", "func1", 10)]), + ] + + # Act + actual = group_by(sanitizer_infos, None, False, True) # Assert self.assertEqual(len(actual.summary), 1) @@ -164,7 +181,24 @@ def test_group_by_all_frames_same_group_ignore_lines(self) -> None: ] # Act - actual = group_by(sanitizer_infos, None, False) + actual = group_by(sanitizer_infos, None, True, False) + + # Assert + self.assertEqual(len(actual.summary), 1) + + for info in sanitizer_infos: + self.assertIn(info, actual.summary[0].elems) + + def test_group_by_all_frames_same_group_ignore_filepaths_and_lines(self) -> None: + # Arrange + sanitizer_infos = [ + SanitizerOutput("input1", "san1", "type1", [StackFrame(1, "file1", "func1", 10)]), + SanitizerOutput("input2", "san1", "type1", [StackFrame(1, "file2", "func1", 20)]), + SanitizerOutput("input3", "san1", "type1", [StackFrame(1, "file3", "func1", 30)]), + ] + + # Act + actual = group_by(sanitizer_infos, None, False, False) # Assert self.assertEqual(len(actual.summary), 1) @@ -188,7 +222,7 @@ def test_group_by_all_frames_real_vulns(self) -> None: ] # Act - actual = group_by(sanitizer_infos, None, True) + actual = group_by(sanitizer_infos, None, True, True) # Assert for group in expected: @@ -208,7 +242,7 @@ def test_group_by_n_frames_1(self) -> None: ] # Act - actual = group_by(sanitizer_infos, 1, True) + actual = group_by(sanitizer_infos, 1, True, True) # Assert for group in expected: @@ -229,7 +263,7 @@ def test_group_by_n_frames_5(self) -> None: ] # Act - actual = group_by(sanitizer_infos, 5, True) + actual = group_by(sanitizer_infos, 5, True, True) # Assert for group in expected: @@ -251,7 +285,7 @@ def test_group_by_n_frames_7(self) -> None: ] # Act - actual = group_by(sanitizer_infos, 7, True) + actual = group_by(sanitizer_infos, 7, True, True) # Assert for group in expected: From 3e710fae8d1a0be7ac634a81873b16bc348add0d Mon Sep 17 00:00:00 2001 From: Stephan Lipp Date: Fri, 27 Oct 2023 13:02:43 +0200 Subject: [PATCH 3/4] feat(sast): add sanity check options to config --- sast-fuzz/static_analysis/sast/config.yml | 3 ++- .../static_analysis/sast/src/sfa/__init__.py | 27 ++++++++++++++++--- .../sast/src/sfa/analysis/tool_runner.py | 8 ++++-- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/sast-fuzz/static_analysis/sast/config.yml b/sast-fuzz/static_analysis/sast/config.yml index 019b64b..9cde425 100644 --- a/sast-fuzz/static_analysis/sast/config.yml +++ b/sast-fuzz/static_analysis/sast/config.yml @@ -41,6 +41,7 @@ tools: - '--uninit' num_threads: 8 codeql: + sanity_checks: 'cmake' # Options: always, cmake, none lib_path: '/opt/codeql-2.12.0/lib' path: '/opt/codeql-2.12.0/cli/codeql' checks: @@ -258,7 +259,7 @@ tools: - '%LIBRARY_PATH%/cpp/ql/src/Security/CWE/CWE-764/UnreleasedLock.ql' - '%LIBRARY_PATH%/cpp/ql/src/Security/CWE/CWE-807/TaintedCondition.ql' - '%LIBRARY_PATH%/cpp/ql/src/Security/CWE/CWE-835/InfiniteLoopWithUnsatisfiableExitCondition.ql' - # not actually security queries, but metrics for sanity checking: + # No security checks, but important for CodeQL debugging and sanity checks - '%LIBRARY_PATH%/cpp/ql/src/Summary/LinesOfCode.ql' - '%LIBRARY_PATH%/cpp/ql/src/Summary/LinesOfUserCode.ql' num_threads: 8 diff --git a/sast-fuzz/static_analysis/sast/src/sfa/__init__.py b/sast-fuzz/static_analysis/sast/src/sfa/__init__.py index 738f991..264d5f7 100644 --- a/sast-fuzz/static_analysis/sast/src/sfa/__init__.py +++ b/sast-fuzz/static_analysis/sast/src/sfa/__init__.py @@ -14,14 +14,28 @@ from collections import namedtuple from dataclasses import dataclass +from enum import Enum, auto from pathlib import Path import yaml + +# fmt: off +class SanityChecks(Enum): + """ + Options of when to run sanity checks. + """ + + ALWAYS = auto(); CMAKE = auto(); NONE = auto() +# fmt: on + + ScoreWeights = namedtuple("ScoreWeights", ["flags", "tools"], defaults=[0.5, 0.5]) # SAST tool configuration -SASTToolConfig = namedtuple("SASTToolConfig", ["path", "checks", "num_threads"], defaults=["", "", -1]) +SASTToolConfig = namedtuple( + "SASTToolConfig", ["sanity_checks", "path", "checks", "num_threads"], defaults=[SanityChecks.NONE, "", "", -1] +) @dataclass @@ -56,22 +70,27 @@ def from_yaml(cls, file: Path) -> "AppConfig": return cls( ScoreWeights(config["scoring"]["weights"]["flags"], config["scoring"]["weights"]["tools"]), flawfinder=SASTToolConfig( - config["tools"]["flawfinder"]["path"], config["tools"]["flawfinder"]["checks"], -1 + SanityChecks.NONE, config["tools"]["flawfinder"]["path"], config["tools"]["flawfinder"]["checks"], -1 ), semgrep=SASTToolConfig( + SanityChecks.NONE, config["tools"]["semgrep"]["path"], config["tools"]["semgrep"]["checks"], config["tools"]["semgrep"]["num_threads"], ), infer=SASTToolConfig( + SanityChecks.NONE, config["tools"]["infer"]["path"], config["tools"]["infer"]["checks"], config["tools"]["infer"]["num_threads"], ), codeql=SASTToolConfig( - config["tools"]["codeql"]["path"], codeql_checks, config["tools"]["codeql"]["num_threads"] + SanityChecks[config["tools"]["codeql"]["sanity_checks"]], + config["tools"]["codeql"]["path"], + codeql_checks, + config["tools"]["codeql"]["num_threads"], ), clang_scan=SASTToolConfig( - config["tools"]["clang_scan"]["path"], config["tools"]["clang_scan"]["checks"], -1 + SanityChecks.NONE, config["tools"]["clang_scan"]["path"], config["tools"]["clang_scan"]["checks"], -1 ), ) diff --git a/sast-fuzz/static_analysis/sast/src/sfa/analysis/tool_runner.py b/sast-fuzz/static_analysis/sast/src/sfa/analysis/tool_runner.py index 879e6fb..4c51f6c 100644 --- a/sast-fuzz/static_analysis/sast/src/sfa/analysis/tool_runner.py +++ b/sast-fuzz/static_analysis/sast/src/sfa/analysis/tool_runner.py @@ -21,7 +21,7 @@ from itertools import chain from pathlib import Path from tempfile import TemporaryDirectory -from typing import ClassVar, Dict +from typing import Callable, ClassVar, Dict, Optional from sfa import SASTToolConfig from sfa.analysis import SASTFlag, SASTFlags @@ -92,11 +92,12 @@ def codeql_sanity_check(sarif_string: str) -> None: logging.info(f"CodeQL picked up {loc} LoC, {uloc} of which are considered user-written code.") -def convert_sarif(string: str) -> SASTFlags: +def convert_sarif(string: str, sanity_check: Optional[Callable[[Dict], None]] = None) -> SASTFlags: """ Convert SARIF data into our SAST flag format. :param string: + :param sanity_check: :return: """ if len(string.strip()) == 0: @@ -107,6 +108,9 @@ def convert_sarif(string: str) -> SASTFlags: if sarif_data["version"] != SARIF_VERSION: raise ValueError(f"SARIF version {sarif_data['version']} is not supported.") + if sanity_check is not None: + sanity_check(sarif_data) + flags = SASTFlags() for run in sarif_data["runs"]: From b25651af5e4501c6c09756da0475e0eb963f71f1 Mon Sep 17 00:00:00 2001 From: Stephan Lipp Date: Fri, 27 Oct 2023 13:41:29 +0200 Subject: [PATCH 4/4] refactor(sast): minor changes of the sanity checks --- .../static_analysis/sast/src/sfa/__init__.py | 2 +- .../sast/src/sfa/analysis/tool_runner.py | 103 ++++++++++-------- 2 files changed, 59 insertions(+), 46 deletions(-) diff --git a/sast-fuzz/static_analysis/sast/src/sfa/__init__.py b/sast-fuzz/static_analysis/sast/src/sfa/__init__.py index 264d5f7..5ecfb7f 100644 --- a/sast-fuzz/static_analysis/sast/src/sfa/__init__.py +++ b/sast-fuzz/static_analysis/sast/src/sfa/__init__.py @@ -85,7 +85,7 @@ def from_yaml(cls, file: Path) -> "AppConfig": config["tools"]["infer"]["num_threads"], ), codeql=SASTToolConfig( - SanityChecks[config["tools"]["codeql"]["sanity_checks"]], + SanityChecks[config["tools"]["codeql"]["sanity_checks"].upper()], config["tools"]["codeql"]["path"], codeql_checks, config["tools"]["codeql"]["num_threads"], diff --git a/sast-fuzz/static_analysis/sast/src/sfa/analysis/tool_runner.py b/sast-fuzz/static_analysis/sast/src/sfa/analysis/tool_runner.py index 4c51f6c..a052ba6 100644 --- a/sast-fuzz/static_analysis/sast/src/sfa/analysis/tool_runner.py +++ b/sast-fuzz/static_analysis/sast/src/sfa/analysis/tool_runner.py @@ -23,7 +23,7 @@ from tempfile import TemporaryDirectory from typing import Callable, ClassVar, Dict, Optional -from sfa import SASTToolConfig +from sfa import SanityChecks, SASTToolConfig from sfa.analysis import SASTFlag, SASTFlags from sfa.utils.fs import copy_dir, find_files from sfa.utils.proc import run_shell_command @@ -57,41 +57,6 @@ def is_cmake_project(subject_dir: Path) -> bool: return (subject_dir / "CMakeLists.txt").exists() -def codeql_sanity_check(sarif_string: str) -> None: - """ - Check if the CodeQL database actually contains user code. - """ - - if len(sarif_string.strip()) == 0: - raise ValueError("Empty input / no JSON string.") - - sarif_data = json.loads(sarif_string) - - if sarif_data["version"] != SARIF_VERSION: - raise ValueError(f"SARIF version {sarif_data['version']} is not supported.") - - for run in sarif_data["runs"]: - metrics = run["properties"].get("metricResults") - if metrics is None: - logging.debug( - "CodeQL sanity check failed because required metadata was not found in the SARIF file. Make sure to include the 'Summary' queries if you want this check." - ) - return - - for metric in metrics: - if metric["ruleId"] == "cpp/summary/lines-of-code": - loc = int(metric["value"]) - elif metric["ruleId"] == "cpp/summary/lines-of-user-code": - uloc = int(metric["value"]) - - if uloc == 0: - logging.warn( - "The CodeQL database contains no user code. That usually means CodeQL did not process the build script as intended and will lead to EMPTY RESULTS." - ) - else: - logging.info(f"CodeQL picked up {loc} LoC, {uloc} of which are considered user-written code.") - - def convert_sarif(string: str, sanity_check: Optional[Callable[[Dict], None]] = None) -> SASTFlags: """ Convert SARIF data into our SAST flag format. @@ -276,14 +241,54 @@ class CodeQLRunner(SASTToolRunner): CodeQL runner. """ + def _sanity_check(self, sarif_data: Dict) -> None: + """ + Run sanity checks on CodeQL output. + + :param sarif_data: + :return: + """ + n_runs = len(sarif_data["runs"]) + + if n_runs == 0: + raise ValueError("No CodeQL execution runs found.") + + # Let's take the last executed SAST run for the sanity check + run = sarif_data["runs"][n_runs - 1] + metrics = run["properties"].get("metricResults") + + if metrics is None: + raise ValueError("No CodeQL metrics data found in SARIF file.") + + loc = 0 + user_loc = 0 + + for m in metrics: + if m["ruleId"] == "cpp/summary/lines-of-code": + loc = int(m["value"]) + if m["ruleId"] == "cpp/summary/lines-of-user-code": + user_loc = int(m["value"]) + + logging.debug(f"Sanity-Check [CodeQL]: LoC = {loc}, User-LoC = {user_loc}") + + if user_loc == 0: + raise ValueError("No user C/C++ source code found in the CodeQL database.") + def _setup(self, temp_dir: Path) -> Path: result_dir = temp_dir / "codeql_res" - run_shell_command( - f"{self._config.path} database create --language=cpp --command=./{BUILD_SCRIPT_NAME} --threads={self._config.num_threads} {result_dir}", - cwd=copy_dir(self._subject_dir, temp_dir), - env=SAST_SETUP_ENV, - ) + if self._is_cmake_project: + run_shell_command( + f"{self._config.path} database create --language=cpp --command=./{BUILD_SCRIPT_NAME} --threads={self._config.num_threads} {result_dir}", + cwd=copy_dir(self._subject_dir, temp_dir), + env=SAST_SETUP_ENV, + ) + else: + run_shell_command( + f'./{BUILD_SCRIPT_NAME} "{self._config.path} database create --language=cpp --command=make --threads={self._config.num_threads} {result_dir}"', + cwd=copy_dir(self._subject_dir, temp_dir), + env=SAST_SETUP_ENV, + ) return result_dir @@ -296,12 +301,20 @@ def _analyze(self, working_dir: Path) -> str: time.sleep(5) - sarif_string = result_file.read_text() - codeql_sanity_check(sarif_string) - return sarif_string + return result_file.read_text() def _format(self, string: str) -> SASTFlags: - return convert_sarif(string) + run_sc = False + + if self._config.sanity_checks == SanityChecks.ALWAYS or ( + self._config.sanity_checks == SanityChecks.CMAKE and self._is_cmake_project + ): + run_sc = True + + if not run_sc: + return convert_sarif(string) + else: + return convert_sarif(string, self._sanity_check) class ClangScanRunner(SASTToolRunner):