[centipede] add support for gathering stats for centipede runs

This PR adds support for Centipede stats to ClusterFuzz (CF). This will help us better understand how Centipede fuzzers are performing on ClusterFuzz.
google · Jan 7, 2025 · 2a9c6fb · 2a9c6fb
1 parent 13fb6f2
commit 2a9c6fb
Show file tree

Hide file tree

Showing 4 changed files with 575 additions and 7 deletions.
diff --git a/src/clusterfuzz/_internal/bot/fuzzers/centipede/engine.py b/src/clusterfuzz/_internal/bot/fuzzers/centipede/engine.py
@@ -14,10 +14,15 @@
 """Centipede engine interface."""
 
 from collections import namedtuple
+import csv
 import os
 import pathlib
 import re
 import shutil
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Union
 
 from clusterfuzz._internal.bot.fuzzers import dictionary_manager
 from clusterfuzz._internal.bot.fuzzers import engine_common
@@ -28,6 +33,7 @@
 from clusterfuzz._internal.system import environment
 from clusterfuzz._internal.system import new_process
 from clusterfuzz.fuzz import engine
+from clusterfuzz.stacktraces import constants as stacktraces_constants
 
 _CLEAN_EXIT_SECS = 10
 
@@ -72,9 +78,68 @@ def _set_sanitizer_options(fuzzer_path):
   environment.set_memory_tool_options(sanitizer_options_var, sanitizer_options)
 
 
+def _parse_centipede_stats(
+    stats_file: str) -> Optional[Dict[str, Union[int, float]]]:
+  """Parses the Centipede stats file and returns a dictionary with labels
+  and their respective values.
+
+  Only the last data row is reported. Cells containing a '.' are parsed as
+  floats, all others as ints. Cells that cannot be parsed, and columns that
+  are missing from a truncated last row, are left out instead of raising.
+
+  Args:
+      stats_file: the path to Centipede stats file.
+
+  Returns:
+      a dictionary containing the stats, or None if the file does not exist
+      or holds no data row.
+  """
+  if not os.path.exists(stats_file):
+    return None
+  # newline='' is what the csv module asks for when it is given a file object.
+  with open(stats_file, 'r', newline='') as statsfile:
+    # csv.reader yields an empty list for a blank line; those carry no data.
+    rows = [row for row in csv.reader(statsfile) if row]
+  # If the binary could not run at all, the file will be empty or with only
+  # the column description line.
+  if len(rows) <= 1:
+    return None
+  header, latest = rows[0], rows[-1]
+  stats = {}
+  # The last header column is deliberately skipped (presumably each row ends
+  # with a trailing comma, which yields an empty final column -- TODO confirm
+  # against the Centipede stats writer). zip() stops early when the last row
+  # is shorter than the header, so a truncated row cannot raise IndexError.
+  for label, raw_value in zip(header[:-1], latest):
+    try:
+      stats[label] = float(raw_value) if '.' in raw_value else int(raw_value)
+    except ValueError:
+      # Empty or non-numeric cell (e.g. a partially written row): skip it so
+      # that one bad value does not discard the whole stats report.
+      continue
+  return stats
+
+
+def _parse_centipede_logs(log_lines: List[str]) -> Dict[str, int]:
+  """Derives crash-related stats from Centipede's log output.
+
+  Each log line is tested against the timeout, out-of-memory and crash
+  patterns, in that order, and only the first pattern that matches a line is
+  recorded. A matching stat is set to 1 (it is a flag, not a running total);
+  'leak_count' is reported but never changed from 0 here.
+
+  Args:
+      log_lines: the log lines.
+
+  Returns:
+      the stats, keyed by 'crash_count', 'timeout_count', 'oom_count' and
+      'leak_count'.
+  """
+  # Order matters: an earlier entry wins when several patterns match a line.
+  markers = (
+      (stacktraces_constants.CENTIPEDE_TIMEOUT_REGEX, 'timeout_count'),
+      (stacktraces_constants.OUT_OF_MEMORY_REGEX, 'oom_count'),
+      (CRASH_REGEX, 'crash_count'),
+  )
+  stats = dict.fromkeys(
+      ('crash_count', 'timeout_count', 'oom_count', 'leak_count'), 0)
+  for line in log_lines:
+    for pattern, stat_name in markers:
+      if re.search(pattern, line):
+        stats[stat_name] = 1
+        break
+  return stats
+
+
 class Engine(engine.Engine):
   """Centipede engine implementation."""
 
+  def __init__(self):
+    """Initializes the engine and sets up its Centipede work directory."""
+    super().__init__()
+    # One work directory per Engine instance: prepare() passes it to Centipede
+    # as the workdir flag, fuzz() reads the stats CSV from it, and
+    # minimize_testcase() both passes it as --workdir and searches it for the
+    # smallest crasher.
+    # NOTE(review): since the directory is now shared across calls, files left
+    # by an earlier run may still be present on a later one -- confirm this is
+    # intended.
+    self.workdir = self._create_temp_dir('workdir')
+
   @property
   def name(self):
     return 'centipede'
@@ -126,8 +191,7 @@ def prepare(self, corpus_dir, target_path, build_dir):
     # 1. Centipede-readable corpus file;
     # 2. Centipede-readable feature file;
     # 3. Crash reproducing inputs.
-    workdir = self._create_temp_dir('workdir')
-    arguments[constants.WORKDIR_FLAGNAME] = str(workdir)
+    arguments[constants.WORKDIR_FLAGNAME] = str(self.workdir)
 
     # Directory corpus_dir saves the corpus files required by ClusterFuzz.
     arguments[constants.CORPUS_DIR_FLAGNAME] = corpus_dir
@@ -214,6 +278,7 @@ def fuzz(self, target_path, options, reproducers_dir, max_time):  # pylint: disa
     timeout = max_time + _CLEAN_EXIT_SECS
     fuzz_result = runner.run_and_wait(
         additional_args=options.arguments, timeout=timeout)
+    log_lines = fuzz_result.output.splitlines()
     fuzz_result.output = Engine.trim_logs(fuzz_result.output)
 
     reproducer_path = _get_reproducer_path(fuzz_result.output, reproducers_dir)
@@ -224,8 +289,20 @@ def fuzz(self, target_path, options, reproducers_dir, max_time):  # pylint: disa
               str(reproducer_path), fuzz_result.output, [],
               int(fuzz_result.time_executed)))
 
-    # Stats report is not available in Centipede yet.
-    stats = None
+    stats_filename = f'fuzzing-stats-{os.path.basename(target_path)}.000000.csv'
+    stats_file = os.path.join(self.workdir, stats_filename)
+    stats = _parse_centipede_stats(stats_file)
+    if not stats:
+      stats = {}
+    actual_duration = int(
+        stats.get('FuzzTimeSec_Avg', fuzz_result.time_executed or 0.0))
+    fuzzing_time_percent = 100 * actual_duration / float(max_time)
+    stats.update({
+        'expected_duration': int(max_time),
+        'actual_duration': actual_duration,
+        'fuzzing_time_percent': fuzzing_time_percent,
+    })
+    stats.update(_parse_centipede_logs(log_lines))
     return engine.FuzzResult(fuzz_result.output, fuzz_result.command, crashes,
                              stats, fuzz_result.time_executed)
 
@@ -412,10 +489,9 @@ def minimize_testcase(self, target_path, arguments, input_path, output_path,
       TimeoutError: If the testcase minimization exceeds max_time.
     """
     runner = _get_runner(target_path)
-    workdir = self._create_temp_dir('workdir')
     args = [
         f'--binary={target_path}',
-        f'--workdir={workdir}',
+        f'--workdir={self.workdir}',
         f'--minimize_crash={input_path}',
         f'--num_runs={constants.NUM_RUNS_PER_MINIMIZATION}',
         '--seed=1',
@@ -425,7 +501,7 @@ def minimize_testcase(self, target_path, arguments, input_path, output_path,
       logs.warning(
           'Testcase minimization timed out.', fuzzer_output=result.output)
       raise TimeoutError('Minimization timed out.')
-    minimum_testcase = self._get_smallest_crasher(workdir)
+    minimum_testcase = self._get_smallest_crasher(self.workdir)
     if minimum_testcase:
       shutil.copyfile(minimum_testcase, output_path)
     else: