Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

benchmarks #7

Merged
merged 1 commit into from
Sep 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import matplotlib.pyplot as plt
import argparse
from github_client import GithubClient
from results_parser import parse
from single_file_cache import SingleFileCache
from utilities import get_dates_between, convert_to_plot
from datetime import datetime
from collections import defaultdict

COMMIT_CACHE_FILE = 'cache/benchmark-date-commit-cache.json'
REPORT_CACHE_FILE = 'cache/benchmark-report-cache.json'


class BenchmarkApp:
    """Fetches benchmark summary reports from a GitHub repository.

    Commit lookups and report contents are cached on disk so repeated runs
    avoid redundant API calls.
    """

    def __init__(self, file_path: str):
        self.client = GithubClient()
        self.commit_cache = SingleFileCache(location=COMMIT_CACHE_FILE)
        self.report_cache = SingleFileCache(location=REPORT_CACHE_FILE)
        # Path (within the repo) of the benchmark summary file to fetch.
        self.file_path = file_path

    def get_commit_by_date(self, repository, date):
        """Return the most recent gh-pages commit sha at/before *date*.

        Checks the local cache first; on a miss, queries the GitHub API and
        caches the result only when a commit was found.
        """
        find_commit = self.commit_cache.retrieve_value(date)
        if not find_commit:
            find_commit = self.client.get_most_recent_commit(repository, date, "gh-pages")
            if find_commit:
                self.commit_cache.add_to_cache(date, find_commit)

        return find_commit

    def get_report_by_commit(self, repository, commit):
        """Return the report file content at *commit*, or None on failure.

        Checks the local cache first; on a miss, fetches from the GitHub API.
        """
        repo_data = self.report_cache.retrieve_value(commit)

        if not repo_data:
            repo_data = self.client.get_file_at_commit(repository=repository, filepath=self.file_path, commit_sha=commit)
            # Only cache successful fetches (mirrors get_commit_by_date) so a
            # transient failure is retried on the next run instead of being
            # persisted as a null entry.
            if repo_data:
                self.report_cache.add_to_cache(commit, repo_data)

        return repo_data


def main(args):
    """Chart heap-usage benchmark metrics over the requested timeframe."""
    summary_path = "benchmark-overhead/results/release/summary.txt"

    tracked_metrics = [
        "Min heap used (MB)",
        "Max heap used (MB)"
    ]

    snapshots = get_dates_between(args.start, datetime.now().date(), args.interval)
    collected = defaultdict(dict)

    app = BenchmarkApp(file_path=summary_path)

    # One data point per snapshot date: resolve the commit, pull its report,
    # and keep only snapshots that parsed successfully.
    for day in snapshots:
        commit = app.get_commit_by_date(date=day, repository=args.repo)
        report = app.get_report_by_commit(repository=args.repo, commit=commit)

        parsed = parse(report, tracked_metrics)
        if not parsed:
            continue
        collected[day]["date"] = day
        for name in tracked_metrics:
            collected[day][name] = parsed.metrics[name]

    dates, metric_values = convert_to_plot(collected, tracked_metrics)

    for name, series in metric_values.items():
        plt.plot(dates, series, label=name)

    plt.xlabel('Date')
    plt.ylabel('MB')
    plt.title('Benchmark Metrics')
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    # CLI entry point: all three arguments are required.
    arg_parser = argparse.ArgumentParser(description='Benchmark Tracker')
    arg_parser.add_argument("-r", "--repo", required=True,
                            help="Repository name. "
                                 "ex: open-telemetry/opentelemetry-java-instrumentation")
    arg_parser.add_argument("-s", "--start", required=True,
                            help="Starting Date (will calculate from this date until now)")
    arg_parser.add_argument("-i", "--interval", required=True,
                            help="Interval (in days) between data points")
    main(arg_parser.parse_args())
23 changes: 20 additions & 3 deletions github_client.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import base64

import requests
import os

Expand All @@ -17,13 +19,14 @@ def _get(self, url, params=None):
except Exception as e:
print(e)

def get_most_recent_commit(self, repo, timestamp) -> requests.models.Response:
def get_most_recent_commit(self, repo: str, timestamp: str, branch: str) -> requests.models.Response:
api_url = f"{self.base_url}/repos/{repo}/commits"

params = {
"per_page": 1,
"until": timestamp,
"order": "desc"
"order": "desc",
"sha": branch
}

response = self._get(api_url, params=params)
Expand All @@ -40,7 +43,7 @@ def get_most_recent_commit(self, repo, timestamp) -> requests.models.Response:
print(f"Error: {response.status_code}")
return None

def get_repository_at_commit(self, repository, commit_sha):
def get_repository_at_commit(self, repository: str, commit_sha: str):
api_url = f"{self.base_url}/repos/{repository}/git/trees/{commit_sha}?recursive=1"

response = self._get(api_url)
Expand All @@ -50,3 +53,17 @@ def get_repository_at_commit(self, repository, commit_sha):
else:
print(f"Error: {response.status_code}")
return None

def get_file_at_commit(self, repository: str, filepath: str, commit_sha: str):
    """Return the decoded text of *filepath* in *repository* at *commit_sha*.

    Uses the GitHub contents API with ``ref`` pinned to the commit.
    Returns None when the request fails or errors.
    """
    api_url = f"{self.base_url}/repos/{repository}/contents/{filepath}"

    response = self._get(api_url, params={"ref": commit_sha})

    # _get swallows request exceptions and returns None; guard before
    # touching response.status_code to avoid an AttributeError.
    if response is None:
        return None

    if response.status_code == 200:
        # File content is base64 encoded, decode it
        content = response.json().get("content", "")
        return base64.b64decode(content).decode("utf-8")
    else:
        print(f"Error: {response.status_code}")
        return None
8 changes: 3 additions & 5 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from data_filter import DataFilter
from multi_file_cache import MultiFileCache
from utilities import count_by_file_extension, get_dates_between
from utilities import count_by_file_extension, get_dates_between, convert_to_plot

from single_file_cache import SingleFileCache
from github_client import GithubClient
Expand All @@ -27,7 +27,7 @@ def __init__(self, languages: List[str], path_prefix: str, keyword: str):
def get_commit_by_date(self, repository, date):
find_commit = self.commit_cache.retrieve_value(date)
if not find_commit:
find_commit = self.client.get_most_recent_commit(repository, date)
find_commit = self.client.get_most_recent_commit(repository, date, "main")
if find_commit:
self.commit_cache.add_to_cache(date, find_commit)

Expand Down Expand Up @@ -74,9 +74,7 @@ def main(args):
except Exception as e:
print(f"Error for {snapshot}, {e}")

dates = []

language_counts = {}
dates, language_counts = convert_to_plot(result, languages)

for item in result.values():
dates.append(item["date"][:10])
Expand Down
Binary file added media/benchmark_output.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
58 changes: 42 additions & 16 deletions readme.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# Code Migration Tracker
# Repo Metrics

Goal: Given a repository, a timeframe, and any filtering rules, track a goal over time.
Random collection of tools to pull and visualize various data about a repository as timeseries metrics.

Functionality:
* Timeseries file count tracking: for tracking migration projects from one language to another.
* Pull benchmark data and visualize as timeseries

## Setup

Expand All @@ -11,7 +15,23 @@ export GITHUB_TOKEN="insert-your-token"
make install
```

## Arguments
## Test / linting

```
# Make sure you install pytest and ruff
make install

# Tests
make test

# Linting
make lint
```


## File Count Tracking

### Arguments

| Argument | Command | Description | Example |
|------------|----------------|----------------------------------------------------------------------------|------------------------------------------------------------|
Expand All @@ -20,7 +40,7 @@ make install
| Interval | -i, --interval | Interval (in days) between data points | --interval 14 |


## Example Usage:
### Example Usage:

In the `open-telemetry/opentelemetry-java-instrumentation` repository, track the conversion of tests from groovy to java
in the `instrumentation` directory starting from 2022-11-15 with a data point every 2 weeks.
Expand All @@ -32,19 +52,28 @@ Output:
![Example](./media/example_output.png)


## Test / linting
## Benchmark Visualization

```
# Make sure you install pytest and ruff
make install
This is very specific to the open-telemetry/opentelemetry-java-instrumentation repo

# Tests
make test
### Arguments

# Linting
make lint
```
| Argument | Command | Description | Example |
|------------|----------------|----------------------------------------------------------------------------|------------------------------------------------------------|
| Repository | -r, --repo | Repository name. | --repo "open-telemetry/opentelemetry-java-instrumentation" |
| Start Date | -s, --start | Starting Date in format %Y-%m-%d (will calculate from this date until now) | --start "2022-11-15" |
| Interval | -i, --interval | Interval (in days) between data points | --interval 14 |


### Example Usage:

Chart Min and max heap starting from 2022-02-14 with a data point every 30 days.

`python benchmark.py -r "open-telemetry/opentelemetry-java-instrumentation" -s "2022-02-14" -i 30`

Output:

![Example](./media/benchmark_output.png)

## Approach

Expand All @@ -54,6 +83,3 @@ make lint
- Cache this data locally to avoid repeated api calls
- Generate Graph to show results over time frame


## Data Filters

46 changes: 46 additions & 0 deletions results_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from datetime import datetime
from typing import List


class ReportMetrics:
    """Container for one parsed benchmark report: run date plus metric values."""

    def __init__(self, date: str):
        # ISO-formatted (%Y-%m-%d) date the benchmark was run.
        self.date = date
        # Metric name -> float value (filled in by the caller).
        self.metrics = {}

    def __repr__(self):
        # Debug-friendly representation showing both fields.
        return f"{type(self).__name__}(date={self.date!r}, metrics={self.metrics!r})"


def parse(report: str, metrics: List[str]) -> ReportMetrics:
    """Parse a benchmark summary into a ReportMetrics.

    *report* is the raw summary text; *metrics* lists the metric-line
    prefixes to extract. Returns None when report is None or malformed.
    """
    if report is None:
        return None

    try:
        # Reports are divided by dashed separator lines: header / run info /
        # metric table.
        split = report.split("----------------------------------------------------------\n")

        date = convert_to_desired_format(split[1].split("Run at ")[1].split("\n")[0])
        metrics_split = split[2].split("\n")

        report_metrics = ReportMetrics(date=date)

        for line in metrics_split:
            for metric in metrics:

                if line.startswith(metric):
                    # "Name : a b c" -> take the second value (latest stable).
                    values = line.split(":")
                    report_metrics.metrics[metric] = float(values[1].split()[1])
    except IndexError:
        # Malformed report: missing sections, missing "Run at" line, or an
        # unexpected metric-line layout. (Previously only the metric loop was
        # guarded, so a truncated report crashed instead of returning None.)
        return None

    return report_metrics


def convert_to_desired_format(date_str):
    """Convert a date like 'Sat Sep 23 05:22:19 UTC 2023' to 'YYYY-MM-DD'.

    Prints a warning and returns None when the input doesn't match.
    """
    try:
        parsed = datetime.strptime(date_str, "%a %b %d %H:%M:%S UTC %Y")
    except ValueError:
        print("Invalid date format")
        return None
    return parsed.strftime("%Y-%m-%d")
25 changes: 25 additions & 0 deletions tests/results_parser_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import unittest

from results_parser import parse


class ResultsParserTestCase(unittest.TestCase):
    """Tests for results_parser.parse against a captured summary report."""

    def setUp(self):
        # setUp (not __init__) is unittest's documented hook for per-test
        # fixtures; overriding __init__ on a TestCase is fragile.
        self.example = """----------------------------------------------------------\n Run at Sat Sep 23 05:22:19 UTC 2023\n release : compares no agent, latest stable, and latest snapshot agents\n 5 users, 5000 iterations\n----------------------------------------------------------\nAgent : none latest snapshot\nRun duration : 00:02:27 00:02:57 00:03:06\nAvg. CPU (user) % : 0.46024063 0.48809186 0.49900937\nMax. CPU (user) % : 0.5527638 0.5891089 0.6\nAvg. mch tot cpu % : 0.9943353 0.99306744 0.9932704\nStartup time (ms) : 19598 16351 17050\nTotal allocated MB : 27799.50 34195.20 58039.97\nMin heap used (MB) : 88.10 115.85 112.63\nMax heap used (MB) : 365.90 557.00 478.78\nThread switch rate : 28534.94 29848.291 32354.986\nGC time (ms) : 1800 3014 2928\nGC pause time (ms) : 1814 3052 2959\nReq. mean (ms) : 10.74 12.82 13.51\nReq. p95 (ms) : 32.04 38.45 40.28\nIter. mean (ms) : 144.60 173.90 182.65\nIter. p95 (ms) : 233.74 275.94 291.89\nNet read avg (bps) : 5441971.00 4728712.00 4507975.00\nNet write avg (bps) : 7256048.00 25533599.00 24434992.00\nPeak threads : 43 55 56\n"""
        self.metrics = [
            "Min heap used (MB)",
            "Max heap used (MB)"
        ]

    def test_parse_metrics_from_summary(self):
        result = parse(report=self.example, metrics=self.metrics)
        self.assertEqual(557.00, result.metrics["Max heap used (MB)"])
        self.assertEqual(115.85, result.metrics["Min heap used (MB)"])

    def test_parse_date_from_summary(self):
        result = parse(report=self.example, metrics=self.metrics)
        self.assertEqual("2023-09-23", result.date)


Loading