diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index e3944cd..54045f7 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -113,9 +113,14 @@ jobs:
         invoke test
     -
       name: Benchmark
+      id: benchmark
       run: |
-        echo 'benchmark='"$(invoke benchmark -f base64)" >> "$GITHUB_OUTPUT"
-
+        invoke benchmark -f json | tee benchmark.txt
+    -
+      uses: actions/upload-artifact@v4
+      with:
+        name: ${{ matrix.python_version }}_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.experimental }}
+        path: benchmark.txt

   check-versions:
     runs-on: 'ubuntu-latest'
@@ -151,3 +156,80 @@ jobs:
         if [ "x$(invoke local-tag-exists --format json 'v${{ steps.version.outputs.CURRENT_VERSION }}')" = 'xtrue' ]; then
           echo '::warning file=CURRENT_VERSION.txt,line=2,title=Version already exists in tags::Tag v${{ steps.version.outputs.CURRENT_VERSION }} already exists.'
         fi
+
+  collect_benchmark:
+    runs-on: 'ubuntu-latest'
+    needs: test_matrix
+    steps:
+    -
+      name: Set up Python 3
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.x'
+        architecture: 'x64'
+    -
+      uses: actions/download-artifact@v4
+      with:
+        path: benchmarks
+    -
+      run: |
+        ls -R benchmarks
+        (jq -c -r --slurp <<< $(for name in $(echo benchmarks/*); do cat "$name/benchmark.txt" | jq -c -r '.[] | ["'$(basename $name)'", .] | flatten'; done)) | tee benchmark.json
+        python -m pip install pytablewriter terminaltables
+        python -c '
+        import json
+        import pytablewriter as ptw
+        from terminaltables import GithubFlavoredMarkdownTable
+
+        with open("benchmark.json", "rb") as fh:
+            items = json.load(fh)
+        mapping = {}
+        platforms = []
+        keys = [""]
+        rows = [None] * len(frozenset(p[0] for p in items))
+        for platform, group, test_name, result in items:
+            col_name = f"{group}\n{test_name}"
+            try:
+                keys.index(col_name)
+            except ValueError:
+                keys.append(col_name)
+            try:
+                platforms.index(platform)
+            except ValueError:
+                platforms.append(platform)
+
+        def _sort(s: str):
+            version, platform, arch, experimental = s.split("_")
+            experimental = experimental.lower() == "true"
+            is_pypy = False
+            if version.startswith("pypy"):
+                version = version[len("pypy"):]
+                is_pypy = True
+            major, minor = map(int, version.split("."))
+            return (-1 if is_pypy else 0, (major, minor), platform, arch, experimental)
+
+        platforms = tuple(sorted(platforms, key=_sort, reverse=True))
+        print(platforms)
+        for platform, group, test_name, result in items:
+            col_name = f"{group}\n{test_name}"
+            key_index = keys.index(col_name)
+            row_index = platforms.index(platform)
+            if rows[row_index] is None:
+                rows[row_index] = [None] * len(keys)
+                rows[row_index][0] = platform
+            rows[row_index][key_index] = result
+        if True:
+            table = GithubFlavoredMarkdownTable([keys, *rows])
+            with open("BENCHMARK.md", "w") as fh:
+                fh.write("# Benchmark of ${{ github.sha }}\n\n")
+                fh.write(table.table)
+        else:
+            writer = ptw.RstGridTableWriter(
+                table_name="Benchmark of ${{ github.sha }}",
+                headers=keys[1:],
+                value_matrix=rows,
+            )
+            with open("BENCHMARK.rst", "w") as fh:
+                writer.dump(fh)
+        '
+        echo "$(cat BENCHMARK.md)" >> "$GITHUB_STEP_SUMMARY"
diff --git a/instruct/__main__.py b/instruct/__main__.py
index cf4fe63..2ef2700 100644
--- a/instruct/__main__.py
+++ b/instruct/__main__.py
@@ -51,49 +51,50 @@ class Next(ComplexTest):
     next: int


-def main():
+def main(count=1_000_000):
     ttl = timeit.timeit(
-        't = Test(name_or_id="name")', setup="from __main__ import Test", number=1000000
+        't = Test(name_or_id="name")', setup="from __main__ import Test", number=count
     )
-    per_round_ms = (ttl / 1000000) * 1000000
-    print("Overhead of allocation, one field, safeties on: {:.2f}us".format(per_round_ms))
+    print("Overhead of allocation")
+    per_round_ms = (ttl / count) * 1_000_000
+    print("one field, safeties on: {:.2f} us".format(per_round_ms))
     ttl = timeit.timeit(
         't = Test(name_or_id="name")',
         setup="from __main__ import TestOptimized as Test",
-        number=1000000,
+        number=count,
     )
-    per_round_ms = (ttl / 1000000) * 1000000
-    print("Overhead of allocation, one field, safeties off: {:.2f}us".format(per_round_ms))
+    per_round_ms = (ttl / count) * 1_000_000
+    print("one field, safeties off: {:.2f} us".format(per_round_ms))

-    print("Overhead of setting a field:")
-    ttl = timeit.timeit(test_statement, setup="from __main__ import Test;t = Test()")
-    per_round_ms = (ttl / 1000000) * 1000000
+    print("Overhead of setting a field")
+    ttl = timeit.timeit(test_statement, setup="from __main__ import Test;t = Test()", number=count)
+    per_round_ms = (ttl / count) * 1_000_000
     print("Test with safeties: {:.2f} us".format(per_round_ms))
     ttl = timeit.timeit(
         test_statement,
         setup="from __main__ import TestOptimized as Test;t = Test()",
-        number=1000000,
+        number=count,
     )
-    per_round_ms = (ttl / 1000000) * 1000000
+    per_round_ms = (ttl / count) * 1_000_000
     print("Test without safeties: {:.2f} us".format(per_round_ms))

     print("Overhead of clearing/setting")
     ttl = timeit.timeit(
         "clear(t);t.name_or_id = 1",
         setup='from __main__ import Test, clear;t = Test(name_or_id="name")',
-        number=1000000,
+        number=count,
     )
-    per_round_ms = (ttl / 1000000) * 1000000
+    per_round_ms = (ttl / count) * 1_000_000
     print("Test with safeties: {:.2f} us".format(per_round_ms))
     ttl = timeit.timeit(
         "clear(t);t.name_or_id = 1",
         setup='from __main__ import TestOptimized as Test,clear;t = Test(name_or_id="name")',
-        number=1000000,
+        number=count,
     )
-    per_round_ms = (ttl / 1000000) * 1000000
+    per_round_ms = (ttl / count) * 1_000_000
     print("Test without safeties: {:.2f} us".format(per_round_ms))


@@ -112,6 +113,7 @@ def main():
 subparsers = parser.add_subparsers()
 benchmark = subparsers.add_parser("benchmark")
 benchmark.set_defaults(mode="benchmark")
+benchmark.add_argument("count", nargs="?", default=1_000_000, type=int)
 if PyCallGraph is not None:
     callgraph = subparsers.add_parser("callgraph")
     callgraph.set_defaults(mode="callgraph")
@@ -121,7 +123,7 @@
 if not args.mode:
     raise SystemExit("Use benchmark or callgraph")
 if args.mode == "benchmark":
-    main()
+    main(args.count)
 if PyCallGraph and args.mode == "callgraph":
     names = [random.choice((("test",) * 10) + (-1, None)) for _ in range(1000)]
     ids = [random.randint(1, 232) for _ in range(1000)]
diff --git a/tasks.py b/tasks.py
index db31c7a..cfb6478 100644
--- a/tasks.py
+++ b/tasks.py
@@ -1005,15 +1005,40 @@ def parse_with_unit(s: str) -> Tuple[Union[int, float], str]:


 @task
-def benchmark(context: Context) -> UnitValue:
+def benchmark(
+    context: Context,
+    type_: Union[Type[UnitValue], Type[str], Literal["UnitValue", "str"]] = "str",
+    *,
+    count: Optional[int] = None,
+) -> Union[UnitValue, Tuple[str, ...]]:
+    if type_ == "UnitValue":
+        type_ = UnitValue
+    elif type_ == "str":
+        type_ = str
+    assert type_ in (str, UnitValue)
     python_bin = _.python_path(str, silent=True)
-    fh = context.run(f"{python_bin} -m instruct benchmark", hide="stdout")
+    fh = context.run(f"{python_bin} -m instruct benchmark {count or ''}", hide="stdout")
     assert fh is not None
     tests = []
+    section = None
     for line in fh.stdout.strip().splitlines():
         with suppress(ValueError):
             name, val = (x.strip() for x in line.strip().split(":", 1))
             if val:
-                tests.append(UnitValue(name, _.parse_with_unit(val)))
+                if type_ is UnitValue:
+                    v = UnitValue(name, _.parse_with_unit(val, silent=True))
+                else:
+                    v = (
+                        f"{name}",
+                        f"{val}",
+                    )
+                if section:
+                    tests.append((section, *v))
+                else:
+                    tests.append(v)
+                continue
+        if line.strip().endswith(":"):
+            line = line.strip()[:-1]
+            section = line
     return tuple(tests)
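
For reviewers, a minimal sketch (not part of the diff) of the data shape the collect_benchmark job assumes: each matrix job uploads an artifact named <python_version>_<os>_<arch>_<experimental> containing benchmark.txt, which the collect job expects to hold a JSON list of [section, test, value] triples emitted by "invoke benchmark -f json"; the jq step prepends the artifact directory name, and the inline Python then unpacks four columns per row. The platform key and timings below are invented for illustration.

# sketch, illustrative only -- the artifact name and timing values are made up
import json

artifact = "3.12_ubuntu-latest_x64_false"  # hypothetical <version>_<os>_<arch>_<experimental> directory name
benchmark_txt = [                          # assumed contents of one benchmark.txt
    ["Overhead of allocation", "one field, safeties on", "0.52 us"],
    ["Overhead of allocation", "one field, safeties off", "0.31 us"],
]
rows = [[artifact, *entry] for entry in benchmark_txt]  # equivalent of the jq prepend-and-flatten step
for platform, group, test_name, result in rows:         # same four-column unpacking as the collect script
    print(platform, group, test_name, result)
print(json.dumps(rows))                                 # shape of the aggregated benchmark.json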