benchmarks working
jaydeluca committed Sep 27, 2023
1 parent 1033d01 commit b148b46
Showing 9 changed files with 338 additions and 25 deletions.
91 changes: 91 additions & 0 deletions benchmark.py
@@ -0,0 +1,91 @@
import matplotlib.pyplot as plt
import argparse
from github_client import GithubClient
from results_parser import parse
from single_file_cache import SingleFileCache
from utilities import get_dates_between, convert_to_plot
from datetime import datetime
from collections import defaultdict

COMMIT_CACHE_FILE = 'cache/benchmark-date-commit-cache.json'
REPORT_CACHE_FILE = 'cache/benchmark-report-cache.json'


class BenchmarkApp:
def __init__(self, file_path: str):
self.client = GithubClient()
self.commit_cache = SingleFileCache(location=COMMIT_CACHE_FILE)
self.report_cache = SingleFileCache(location=REPORT_CACHE_FILE)
self.file_path = file_path

def get_commit_by_date(self, repository, date):
find_commit = self.commit_cache.retrieve_value(date)
if not find_commit:
find_commit = self.client.get_most_recent_commit(repository, date, "gh-pages")
if find_commit:
self.commit_cache.add_to_cache(date, find_commit)

return find_commit

def get_report_by_commit(self, repository, commit):
repo_data = self.report_cache.retrieve_value(commit)

if not repo_data:
repo_data = self.client.get_file_at_commit(repository=repository, filepath=self.file_path, commit_sha=commit)
self.report_cache.add_to_cache(commit, repo_data)

return repo_data


def main(args):
file_path = "benchmark-overhead/results/release/summary.txt"

metrics = [
"Min heap used (MB)",
"Max heap used (MB)"
]

timeframe = get_dates_between(args.start, datetime.now().date(), args.interval)
result = defaultdict(dict)

app = BenchmarkApp(file_path=file_path)

for snapshot in timeframe:
commit = app.get_commit_by_date(date=snapshot, repository=args.repo)

report = app.get_report_by_commit(repository=args.repo, commit=commit)
parsed = parse(report, metrics)
if parsed:
result[snapshot]["date"] = snapshot
for metric in metrics:
result[snapshot][metric] = parsed.metrics[metric]

dates, metric_values = convert_to_plot(result, metrics)

for metric, values in metric_values.items():
plt.plot(dates, values, label=metric)

plt.xlabel('Date')
plt.ylabel('MB')
plt.title('Benchmark Metrics')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()


if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Benchmark Tracker')
parser.add_argument("-r", "--repo",
help="Repository name. "
"ex: open-telemetry/opentelemetry-java-instrumentation",
required=True)
parser.add_argument("-s", "--start",
help="Starting Date (will calculate from this date until now)",
required=True)
parser.add_argument("-i", "--interval",
help="Interval (in days) between data points", required=True)
arguments = parser.parse_args()
main(arguments)
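
Note: benchmark.py leans on two small on-disk caches so repeated runs do not re-query the GitHub API for the same date or commit. single_file_cache.py is not part of this diff, so the sketch below only illustrates the minimal interface benchmark.py assumes (a `location=` constructor argument plus `retrieve_value` and `add_to_cache`); the real implementation may differ.

```
import json
import os


# Illustrative sketch of the cache interface benchmark.py relies on; the actual
# SingleFileCache in single_file_cache.py is not shown in this commit and may differ.
# Assumes string keys (ISO dates or commit SHAs) and JSON-serializable values.
class SingleFileCache:
    def __init__(self, location: str):
        self.location = location
        if os.path.exists(location):
            with open(location) as f:
                self.data = json.load(f)
        else:
            self.data = {}

    def retrieve_value(self, key):
        # Returns None on a cache miss, which is exactly what the callers test for.
        return self.data.get(key)

    def add_to_cache(self, key, value):
        self.data[key] = value
        os.makedirs(os.path.dirname(self.location) or ".", exist_ok=True)
        with open(self.location, "w") as f:
            json.dump(self.data, f)
```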
23 changes: 20 additions & 3 deletions github_client.py
@@ -1,3 +1,5 @@
import base64

import requests
import os

@@ -17,13 +19,14 @@ def _get(self, url, params=None):
except Exception as e:
print(e)

def get_most_recent_commit(self, repo, timestamp) -> requests.models.Response:
def get_most_recent_commit(self, repo: str, timestamp: str, branch: str) -> requests.models.Response:
api_url = f"{self.base_url}/repos/{repo}/commits"

params = {
"per_page": 1,
"until": timestamp,
"order": "desc"
"order": "desc",
"sha": branch
}

response = self._get(api_url, params=params)
@@ -40,7 +43,7 @@ def get_most_recent_commit(self, repo, timestamp) -> requests.models.Response:
print(f"Error: {response.status_code}")
return None

def get_repository_at_commit(self, repository, commit_sha):
def get_repository_at_commit(self, repository: str, commit_sha: str):
api_url = f"{self.base_url}/repos/{repository}/git/trees/{commit_sha}?recursive=1"

response = self._get(api_url)
@@ -50,3 +53,17 @@ def get_repository_at_commit(self, repository, commit_sha):
else:
print(f"Error: {response.status_code}")
return None

def get_file_at_commit(self, repository: str, filepath: str, commit_sha: str):
api_url = f"{self.base_url}/repos/{repository}/contents/{filepath}"

response = self._get(api_url, params={"ref": commit_sha})

if response.status_code == 200:
# File content is base64 encoded, decode it
content = response.json().get("content", "")
content = base64.b64decode(content)
return str(content, encoding='utf-8')
else:
print(f"Error: {response.status_code}")
return None
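
Note: for context, this is roughly how benchmark.py uses the new `get_file_at_commit` method to fetch `summary.txt` as it existed at a particular commit on the `gh-pages` branch. The commit SHA below is a placeholder, and a valid `GITHUB_TOKEN` must be exported as described in the readme.

```
from github_client import GithubClient

# Placeholder SHA for illustration; any commit on the gh-pages branch of the
# target repository would work.
client = GithubClient()
report = client.get_file_at_commit(
    repository="open-telemetry/opentelemetry-java-instrumentation",
    filepath="benchmark-overhead/results/release/summary.txt",
    commit_sha="<gh-pages-commit-sha>",
)
if report is not None:
    print(report.splitlines()[0])  # first line of the decoded summary
```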
8 changes: 3 additions & 5 deletions main.py
@@ -7,7 +7,7 @@

from data_filter import DataFilter
from multi_file_cache import MultiFileCache
from utilities import count_by_file_extension, get_dates_between
from utilities import count_by_file_extension, get_dates_between, convert_to_plot

from single_file_cache import SingleFileCache
from github_client import GithubClient
@@ -27,7 +27,7 @@ def __init__(self, languages: List[str], path_prefix: str, keyword: str):
def get_commit_by_date(self, repository, date):
find_commit = self.commit_cache.retrieve_value(date)
if not find_commit:
find_commit = self.client.get_most_recent_commit(repository, date)
find_commit = self.client.get_most_recent_commit(repository, date, "main")
if find_commit:
self.commit_cache.add_to_cache(date, find_commit)

@@ -74,9 +74,7 @@ def main(args):
except Exception as e:
print(f"Error for {snapshot}, {e}")

dates = []

language_counts = {}
dates, language_counts = convert_to_plot(result, languages)

for item in result.values():
dates.append(item["date"][:10])
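
Note: `convert_to_plot` now backs both main.py and benchmark.py, but utilities.py itself is not part of this diff. The sketch below is inferred purely from how the two callers use it (a dict of per-snapshot values in, a list of dates and a per-metric series dict out); the real helper may differ.

```
from collections import defaultdict
from typing import Dict, List, Tuple


# Inferred sketch of utilities.convert_to_plot, based only on its call sites in
# main.py and benchmark.py; the actual implementation may differ.
def convert_to_plot(result: Dict[str, dict],
                    keys: List[str]) -> Tuple[List[str], Dict[str, List[float]]]:
    dates: List[str] = []
    series: Dict[str, List[float]] = defaultdict(list)
    for item in result.values():
        # main.py previously truncated each date to YYYY-MM-DD before plotting
        dates.append(str(item["date"])[:10])
        for key in keys:
            series[key].append(item.get(key, 0))
    return dates, series
```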
Binary file added media/benchmark_output.png
58 changes: 42 additions & 16 deletions readme.md
@@ -1,6 +1,10 @@
# Code Migration Tracker
# Repo Metrics

Goal: Given a repository, a timeframe, and any filtering rules, track a goal over time.
A collection of tools to pull and visualize various repository data as time-series metrics.

Functionality:
* Time-series file count tracking: track migration projects from one language to another.
* Pull benchmark data and visualize it as a time series.

## Setup

@@ -11,7 +15,23 @@ export GITHUB_TOKEN="insert-your-token"
make install
```

## Arguments
## Test / linting

```
# Make sure you install pytest and ruff
make install
# Tests
make test
# Linting
make lint
```


## File Count Tracking

### Arguments

| Argument | Command | Description | Example |
|------------|----------------|----------------------------------------------------------------------------|------------------------------------------------------------|
@@ -20,7 +40,7 @@ make install
| Interval | -i, --interval | Interval (in days) between data points | --interval 14 |


## Example Usage:
### Example Usage:

In the `open-telemetry/opentelemetry-java-instrumentation` repository, track the conversion of tests from Groovy to Java
in the `instrumentation` directory starting from 2022-11-15 with a data point every 2 weeks.
@@ -32,19 +52,28 @@ Output:
![Example](./media/example_output.png)


## Test / linting

```
# Make sure you install pytest and ruff
make install

# Tests
make test

# Linting
make lint
```

## Benchmark Visualization

This tool is specific to the `open-telemetry/opentelemetry-java-instrumentation` repo.

### Arguments
| Argument | Command | Description | Example |
|------------|----------------|----------------------------------------------------------------------------|------------------------------------------------------------|
| Repository | -r, --repo | Repository name. | --repo "open-telemetry/opentelemetry-java-instrumentation" |
| Start Date | -s, --start | Starting Date in format %Y-%m-%d (will calculate from this date until now) | --start "2022-11-15" |
| Interval | -i, --interval | Interval (in days) between data points | --interval 14 |


### Example Usage:

Chart the min and max heap used starting from 2022-02-14, with a data point every 30 days.

`python benchmark.py -r "open-telemetry/opentelemetry-java-instrumentation" -s "2022-02-14" -i 30`

Output:

![Example](./media/benchmark_output.png)

## Approach

@@ -54,6 +83,3 @@ make lint
- Cache this data locally to avoid repeated api calls
- Generate Graph to show results over time frame


## Data Filters

46 changes: 46 additions & 0 deletions results_parser.py
@@ -0,0 +1,46 @@
from datetime import datetime
from typing import List, Optional


class ReportMetrics:
def __init__(self, date: str):
self.date = date
self.metrics = {}


def parse(report: str, metrics: List[str]) -> Optional[ReportMetrics]:
if report is None:
return None

split = report.split("----------------------------------------------------------\n")

metrics_split = split[2].split("\n")
date = convert_to_desired_format(split[1].split("Run at ")[1].split("\n")[0])

report_metrics = ReportMetrics(date=date)

try:
for line in metrics_split:
for metric in metrics:

if line.startswith(metric):
values = line.split(":")
report_metrics.metrics[metric] = float(values[1].split()[1])
except IndexError:
return None

return report_metrics


def convert_to_desired_format(date_str):
# Define the input and output date formats
input_format = "%a %b %d %H:%M:%S UTC %Y"
output_format = "%Y-%m-%d"

try:
parsed_date = datetime.strptime(date_str, input_format)
formatted_date = parsed_date.strftime(output_format)
return formatted_date
except ValueError:
print("Invalid date format")
return None
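
Note: to make the layout `parse` expects concrete, here is a trimmed, hand-made report: dashed separators delimit a header block containing `Run at <date>`, followed by a block of `name : values` rows, and the second value column (labelled `latest` in the real summary) is what lands in the metrics dict. A full real summary appears in the unit test below.

```
from results_parser import parse

SEPARATOR = "----------------------------------------------------------\n"

# Minimal hand-made report, illustrative only; real summary.txt files contain
# many more rows (see the unit test below for a complete example).
example = (
    SEPARATOR
    + " Run at Sat Sep 23 05:22:19 UTC 2023\n"
    + SEPARATOR
    + "Min heap used (MB)  :      88.10    115.85    112.63\n"
    + "Max heap used (MB)  :     365.90    557.00    478.78\n"
)

result = parse(example, ["Min heap used (MB)", "Max heap used (MB)"])
print(result.date)                           # 2023-09-23
print(result.metrics["Max heap used (MB)"])  # 557.0 (second value column)
```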
25 changes: 25 additions & 0 deletions tests/results_parser_test.py
@@ -0,0 +1,25 @@
import unittest

from results_parser import parse


class ResultsParserTestCase(unittest.TestCase):

def __init__(self, *args, **kwargs):
self.example = """----------------------------------------------------------\n Run at Sat Sep 23 05:22:19 UTC 2023\n release : compares no agent, latest stable, and latest snapshot agents\n 5 users, 5000 iterations\n----------------------------------------------------------\nAgent : none latest snapshot\nRun duration : 00:02:27 00:02:57 00:03:06\nAvg. CPU (user) % : 0.46024063 0.48809186 0.49900937\nMax. CPU (user) % : 0.5527638 0.5891089 0.6\nAvg. mch tot cpu % : 0.9943353 0.99306744 0.9932704\nStartup time (ms) : 19598 16351 17050\nTotal allocated MB : 27799.50 34195.20 58039.97\nMin heap used (MB) : 88.10 115.85 112.63\nMax heap used (MB) : 365.90 557.00 478.78\nThread switch rate : 28534.94 29848.291 32354.986\nGC time (ms) : 1800 3014 2928\nGC pause time (ms) : 1814 3052 2959\nReq. mean (ms) : 10.74 12.82 13.51\nReq. p95 (ms) : 32.04 38.45 40.28\nIter. mean (ms) : 144.60 173.90 182.65\nIter. p95 (ms) : 233.74 275.94 291.89\nNet read avg (bps) : 5441971.00 4728712.00 4507975.00\nNet write avg (bps) : 7256048.00 25533599.00 24434992.00\nPeak threads : 43 55 56\n"""
self.metrics = [
"Min heap used (MB)",
"Max heap used (MB)"
]
super(ResultsParserTestCase, self).__init__(*args, **kwargs)

def test_parse_metrics_from_summary(self):
result = parse(report=self.example, metrics=self.metrics)
self.assertEqual(557.00, result.metrics["Max heap used (MB)"])
self.assertEqual(115.85, result.metrics["Min heap used (MB)"])

def test_parse_date_from_summary(self):
result = parse(report=self.example, metrics=self.metrics)
self.assertEqual("2023-09-23", result.date)

