From 6f765ef7fa608ad05be9df625d8005aec4d7646a Mon Sep 17 00:00:00 2001
From: Behnam Robatmili <brobatmili@chanzuckerberg.com>
Date: Fri, 19 Jan 2024 11:02:03 -0800
Subject: [PATCH] Early version of profiler harness

Include a basic benchmark as the starting point and needed scripts
---
 .github/workflows/profiler.yml            | 30 +++++++++++++++
 tools/perf_checker/perf_checker.py        | 44 ++++++++++++++++++++++
 tools/perf_checker/perf_checker.sh        | 46 +++++++++++++++++++++++
 tools/perf_checker/test_anndata_export.py | 20 ++++++++++
 4 files changed, 140 insertions(+)
 create mode 100644 .github/workflows/profiler.yml
 create mode 100644 tools/perf_checker/perf_checker.py
 create mode 100755 tools/perf_checker/perf_checker.sh
 create mode 100644 tools/perf_checker/test_anndata_export.py

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
new file mode 100644
index 000000000..a86123eb7
--- /dev/null
+++ b/.github/workflows/profiler.yml
@@ -0,0 +1,30 @@
+name: Profiler
+
+on:
+  pull_request:
+
+jobs:
+  run_profiler:
+    name: Run Profiler
+    runs-on: ubuntu-latest
+    permissions: # these permissions must be set for AWS auth to work!
+      id-token: write # lets the job request the OIDC token used to assume the AWS role
+      contents: read # needed by actions/checkout
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 1 # shallow clone
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4 # v1 is a superseded major version
+        with:
+          aws-region: us-west-2
+          role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}
+          role-session-name: PushDockerImage # NOTE(review): looks copied from an image-push workflow; consider renaming
+
+      - name: Run all tests
+        run: |
+          python -m venv profiler_env
+          source profiler_env/bin/activate
+          ./tools/perf_checker/perf_checker.sh
\ No newline at end of file
diff --git a/tools/perf_checker/perf_checker.py b/tools/perf_checker/perf_checker.py
new file mode 100644
index 000000000..72221f07e
--- /dev/null
+++ b/tools/perf_checker/perf_checker.py
@@ -0,0 +1,44 @@
+import profiler
+import argparse
+
+# The script takes a benchmark command and a profile database path and looks
+# for performance anomalies between the first and the last profiled runs
+# that the database returns for that command.
+
+parser = argparse.ArgumentParser()
+parser.add_argument("benchmark", type=str)
+parser.add_argument("db_path", type=str)
+
+args = parser.parse_args()
+
+# Allowed slowdown, as the ratio of the latest user time to the first one:
+# with 1.10, a latest run more than 10% slower than the first fails the check.
+threshold = 1.10
+
+db = profiler.data.FileBasedProfileDB(args.db_path)
+dt = db.find(args.benchmark)
+
+
+if len(dt) >= 2:
+    first_profile = dt[0]
+    curr_profile = dt[len(dt) - 1]
+    first_time = first_profile.user_time_sec
+    curr_time = curr_profile.user_time_sec
+
+    formatted_first_profile = str(first_profile).replace('\\n', '\n').replace('\\t', '\t')
+    formatted_curr_profile = str(curr_profile).replace('\\n', '\n').replace('\\t', '\t')
+
+    if float(curr_time) > threshold * float(first_time):
+        print(f"*** First profile:\n {formatted_first_profile}")
+        print(f"*** Current profile:\n {formatted_curr_profile}")
+        # SystemExit with a message prints it to stderr and exits non-zero, failing the caller.
+        raise SystemExit(f"Potential performance degradation detected on {args.benchmark}: curr: {curr_time} vs first: {first_time}")
+
+    if threshold * float(curr_time) < float(first_time):
+        print(f"Major performance increase detected on {args.benchmark}: curr: {curr_time} vs first: {first_time}")
+
+    print(f"*** First profile:\n {formatted_first_profile}")
+    print(f"*** Current profile:\n {formatted_curr_profile}")
+    print(
+        f"TileDB version ver = first: {first_profile.tiledbsoma_version} curr: {curr_profile.tiledbsoma_version}"
+    )
diff --git a/tools/perf_checker/perf_checker.sh b/tools/perf_checker/perf_checker.sh
new file mode 100755
index 000000000..4687e548c
--- /dev/null
+++ b/tools/perf_checker/perf_checker.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# Profiles each benchmark and checks its history; needs bash (arrays, pipefail, source).
+set -euxo pipefail
+
+# Installing the requirements
+python -m venv perf
+source perf/bin/activate
+pip install psutil
+pip install gitpython
+pip install somacore
+pip install tiledbsoma
+pip install cellxgene_census
+
+# Installing mount-s3
+sudo wget https://s3.amazonaws.com/mountpoint-s3-release/latest/x86_64/mount-s3.deb
+sudo apt install -y ./mount-s3.deb
+
+# Setting up mount-s3. We use S3 file system as it is necessary to persist the
+# profiling run data that are performed below
+mkdir -p ./census-profiler-tests
+mkdir -p ./s3_cache
+mount-s3 census-profiler-tests ./census-profiler-tests --cache ./s3_cache  --metadata-ttl 300
+dbpath="$(pwd)/census-profiler-tests"
+
+# New benchmarks must be added to this list
+declare -a benchmarks=("./tools/perf_checker/test_anndata_export.py")
+
+# Download the repo including the profiler
+git clone https://github.com/single-cell-data/TileDB-SOMA.git
+cd TileDB-SOMA
+# Checking out the profiler branch (remove the next line once the branch is merged)
+git checkout census_profiler
+pip install profiler/
+pip list | grep profiler
+cd ../
+
+# Download gnu time tool
+sudo apt-get update -y
+sudo apt-get install -y time
+
+# Running all benchmarks and checking performance changes
+for benchmark in "${benchmarks[@]}"
+do
+  python ./TileDB-SOMA/profiler/profiler.py "python ${benchmark}" "$dbpath"
+  python ./tools/perf_checker/perf_checker.py "python ${benchmark}" "$dbpath"
+done
\ No newline at end of file
diff --git a/tools/perf_checker/test_anndata_export.py b/tools/perf_checker/test_anndata_export.py
new file mode 100644
index 000000000..545440e41
--- /dev/null
+++ b/tools/perf_checker/test_anndata_export.py
@@ -0,0 +1,21 @@
+from sys import stderr
+from time import perf_counter
+
+import cellxgene_census
+import tiledbsoma as soma
+
+census_S3_latest = dict(census_version="2024-01-01")
+
+
+def main():
+    """Open the census pinned by census_S3_latest and export the 'hand' RNA query to AnnData."""
+    print("Starting bm 1", file=stderr)
+    with cellxgene_census.open_soma(**census_S3_latest) as census:
+        human = census["census_data"]["homo_sapiens"]
+        hand_obs = soma.AxisQuery(value_filter="""tissue_general == 'hand'""")
+        with human.axis_query(measurement_name="RNA", obs_query=hand_obs) as query:
+            query.to_anndata(X_name="raw")
+
+
+if __name__ == "__main__":
+    main()