Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

benchmarks #7

Merged
merged 1 commit into from
Sep 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import matplotlib.pyplot as plt
import argparse
from github_client import GithubClient
from results_parser import parse
from single_file_cache import SingleFileCache
from utilities import get_dates_between, convert_to_plot
from datetime import datetime
from collections import defaultdict

COMMIT_CACHE_FILE = 'cache/benchmark-date-commit-cache.json'
REPORT_CACHE_FILE = 'cache/benchmark-report-cache.json'


class BenchmarkApp:
    """Fetches benchmark summary reports from a GitHub repository.

    Commit lookups and report contents are cached on disk so repeated runs
    avoid redundant API calls.
    """

    def __init__(self, file_path: str):
        self.client = GithubClient()
        self.commit_cache = SingleFileCache(location=COMMIT_CACHE_FILE)
        self.report_cache = SingleFileCache(location=REPORT_CACHE_FILE)
        # Path (within the repo) of the benchmark summary file to fetch.
        self.file_path = file_path

    def get_commit_by_date(self, repository, date):
        """Return the most recent gh-pages commit sha at/before *date*.

        Checks the local cache first; on a miss, queries the GitHub API and
        caches the result only when a commit was found.
        """
        find_commit = self.commit_cache.retrieve_value(date)
        if not find_commit:
            find_commit = self.client.get_most_recent_commit(repository, date, "gh-pages")
            if find_commit:
                self.commit_cache.add_to_cache(date, find_commit)

        return find_commit

    def get_report_by_commit(self, repository, commit):
        """Return the report file content at *commit*, or None on failure.

        Checks the local cache first; on a miss, fetches from the GitHub API.
        """
        repo_data = self.report_cache.retrieve_value(commit)

        if not repo_data:
            repo_data = self.client.get_file_at_commit(repository=repository, filepath=self.file_path, commit_sha=commit)
            # Only cache successful fetches (mirrors get_commit_by_date) so a
            # transient failure is retried on the next run instead of being
            # persisted as a null entry.
            if repo_data:
                self.report_cache.add_to_cache(commit, repo_data)

        return repo_data


def main(args):
    """Chart heap-usage benchmark metrics over the requested timeframe."""
    summary_path = "benchmark-overhead/results/release/summary.txt"

    tracked_metrics = [
        "Min heap used (MB)",
        "Max heap used (MB)"
    ]

    snapshots = get_dates_between(args.start, datetime.now().date(), args.interval)
    collected = defaultdict(dict)

    app = BenchmarkApp(file_path=summary_path)

    # One data point per snapshot date: resolve the commit, pull its report,
    # and keep only snapshots that parsed successfully.
    for day in snapshots:
        commit = app.get_commit_by_date(date=day, repository=args.repo)
        report = app.get_report_by_commit(repository=args.repo, commit=commit)

        parsed = parse(report, tracked_metrics)
        if not parsed:
            continue
        collected[day]["date"] = day
        for name in tracked_metrics:
            collected[day][name] = parsed.metrics[name]

    dates, metric_values = convert_to_plot(collected, tracked_metrics)

    for name, series in metric_values.items():
        plt.plot(dates, series, label=name)

    plt.xlabel('Date')
    plt.ylabel('MB')
    plt.title('Benchmark Metrics')
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    # CLI entry point: all three arguments are required.
    arg_parser = argparse.ArgumentParser(description='Benchmark Tracker')
    arg_parser.add_argument("-r", "--repo", required=True,
                            help="Repository name. "
                                 "ex: open-telemetry/opentelemetry-java-instrumentation")
    arg_parser.add_argument("-s", "--start", required=True,
                            help="Starting Date (will calculate from this date until now)")
    arg_parser.add_argument("-i", "--interval", required=True,
                            help="Interval (in days) between data points")
    main(arg_parser.parse_args())
23 changes: 20 additions & 3 deletions github_client.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import base64

import requests
import os

Expand All @@ -17,13 +19,14 @@ def _get(self, url, params=None):
except Exception as e:
print(e)

def get_most_recent_commit(self, repo, timestamp) -> requests.models.Response:
def get_most_recent_commit(self, repo: str, timestamp: str, branch: str) -> requests.models.Response:
api_url = f"{self.base_url}/repos/{repo}/commits"

params = {
"per_page": 1,
"until": timestamp,
"order": "desc"
"order": "desc",
"sha": branch
}

response = self._get(api_url, params=params)
Expand All @@ -40,7 +43,7 @@ def get_most_recent_commit(self, repo, timestamp) -> requests.models.Response:
print(f"Error: {response.status_code}")
return None

def get_repository_at_commit(self, repository, commit_sha):
def get_repository_at_commit(self, repository: str, commit_sha: str):
api_url = f"{self.base_url}/repos/{repository}/git/trees/{commit_sha}?recursive=1"

response = self._get(api_url)
Expand All @@ -50,3 +53,17 @@ def get_repository_at_commit(self, repository, commit_sha):
else:
print(f"Error: {response.status_code}")
return None

def get_file_at_commit(self, repository: str, filepath: str, commit_sha: str):
    """Return the decoded text of *filepath* in *repository* at *commit_sha*.

    Uses the GitHub contents API with ``ref`` pinned to the commit.
    Returns None when the request fails or errors.
    """
    api_url = f"{self.base_url}/repos/{repository}/contents/{filepath}"

    response = self._get(api_url, params={"ref": commit_sha})

    # _get swallows request exceptions and returns None; guard before
    # touching response.status_code to avoid an AttributeError.
    if response is None:
        return None

    if response.status_code == 200:
        # File content is base64 encoded, decode it
        content = response.json().get("content", "")
        return base64.b64decode(content).decode("utf-8")
    else:
        print(f"Error: {response.status_code}")
        return None
8 changes: 3 additions & 5 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from data_filter import DataFilter
from multi_file_cache import MultiFileCache
from utilities import count_by_file_extension, get_dates_between
from utilities import count_by_file_extension, get_dates_between, convert_to_plot

from single_file_cache import SingleFileCache
from github_client import GithubClient
Expand All @@ -27,7 +27,7 @@ def __init__(self, languages: List[str], path_prefix: str, keyword: str):
def get_commit_by_date(self, repository, date):
find_commit = self.commit_cache.retrieve_value(date)
if not find_commit:
find_commit = self.client.get_most_recent_commit(repository, date)
find_commit = self.client.get_most_recent_commit(repository, date, "main")
if find_commit:
self.commit_cache.add_to_cache(date, find_commit)

Expand Down Expand Up @@ -74,9 +74,7 @@ def main(args):
except Exception as e:
print(f"Error for {snapshot}, {e}")

dates = []

language_counts = {}
dates, language_counts = convert_to_plot(result, languages)

for item in result.values():
dates.append(item["date"][:10])
Expand Down
Binary file added media/benchmark_output.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
58 changes: 42 additions & 16 deletions readme.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# Code Migration Tracker
# Repo Metrics

Goal: Given a repository, a timeframe, and any filtering rules, track a goal over time.
Random collection of tools to pull and visualize various data about a repository as timeseries metrics.

Functionality:
* Timeseries file count tracking: for tracking migration projects from one language to another.
* Pull benchmark data and visualize as timeseries

## Setup

Expand All @@ -11,7 +15,23 @@ export GITHUB_TOKEN="insert-your-token"
make install
```

## Arguments
## Test / linting

```
# Make sure you install pytest and ruff
make install

# Tests
make test

# Linting
make lint
```


## File Count Tracking

### Arguments

| Argument | Command | Description | Example |
|------------|----------------|----------------------------------------------------------------------------|------------------------------------------------------------|
Expand All @@ -20,7 +40,7 @@ make install
| Interval | -i, --interval | Interval (in days) between data points | --interval 14 |


## Example Usage:
### Example Usage:

In the `open-telemetry/opentelemetry-java-instrumentation` repository, track the conversion of tests from groovy to java
in the `instrumentation` directory starting from 2022-11-15 with a data point every 2 weeks.
Expand All @@ -32,19 +52,28 @@ Output:
![Example](./media/example_output.png)


## Test / linting
## Benchmark Visualization

```
# Make sure you install pytest and ruff
make install
This is very specific to the open-telemetry/opentelemetry-java-instrumentation repo

# Tests
make test
### Arguments

# Linting
make lint
```
| Argument | Command | Description | Example |
|------------|----------------|----------------------------------------------------------------------------|------------------------------------------------------------|
| Repository | -r, --repo | Repository name. | --repo "open-telemetry/opentelemetry-java-instrumentation" |
| Start Date | -s, --start | Starting Date in format %Y-%m-%d (will calculate from this date until now) | --start "2022-11-15" |
| Interval | -i, --interval | Interval (in days) between data points | --interval 14 |


### Example Usage:

Chart Min and max heap starting from 2022-02-14 with a data point every 30 days.

`python benchmark.py -r "open-telemetry/opentelemetry-java-instrumentation" -s "2022-02-14" -i 30`

Output:

![Example](./media/benchmark_output.png)

## Approach

Expand All @@ -54,6 +83,3 @@ make lint
- Cache this data locally to avoid repeated api calls
- Generate Graph to show results over time frame


## Data Filters

46 changes: 46 additions & 0 deletions results_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from datetime import datetime
from typing import List


class ReportMetrics:
    """Container for one parsed benchmark report: run date plus metric values."""

    def __init__(self, date: str):
        # ISO-formatted (%Y-%m-%d) date the benchmark was run.
        self.date = date
        # Metric name -> float value (filled in by the caller).
        self.metrics = {}

    def __repr__(self):
        # Debug-friendly representation showing both fields.
        return f"{type(self).__name__}(date={self.date!r}, metrics={self.metrics!r})"


def parse(report: str, metrics: List[str]) -> ReportMetrics:
    """Parse a benchmark summary into a ReportMetrics.

    *report* is the raw summary text; *metrics* lists the metric-line
    prefixes to extract. Returns None when report is None or malformed.
    """
    if report is None:
        return None

    try:
        # Reports are divided by dashed separator lines: header / run info /
        # metric table.
        split = report.split("----------------------------------------------------------\n")

        date = convert_to_desired_format(split[1].split("Run at ")[1].split("\n")[0])
        metrics_split = split[2].split("\n")

        report_metrics = ReportMetrics(date=date)

        for line in metrics_split:
            for metric in metrics:

                if line.startswith(metric):
                    # "Name : a b c" -> take the second value (latest stable).
                    values = line.split(":")
                    report_metrics.metrics[metric] = float(values[1].split()[1])
    except IndexError:
        # Malformed report: missing sections, missing "Run at" line, or an
        # unexpected metric-line layout. (Previously only the metric loop was
        # guarded, so a truncated report crashed instead of returning None.)
        return None

    return report_metrics


def convert_to_desired_format(date_str):
    """Convert a date like 'Sat Sep 23 05:22:19 UTC 2023' to 'YYYY-MM-DD'.

    Prints a warning and returns None when the input doesn't match.
    """
    try:
        parsed = datetime.strptime(date_str, "%a %b %d %H:%M:%S UTC %Y")
    except ValueError:
        print("Invalid date format")
        return None
    return parsed.strftime("%Y-%m-%d")
25 changes: 25 additions & 0 deletions tests/results_parser_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import unittest

from results_parser import parse


class ResultsParserTestCase(unittest.TestCase):
    """Tests for results_parser.parse against a captured summary report."""

    def setUp(self):
        # setUp (not __init__) is unittest's documented hook for per-test
        # fixtures; overriding __init__ on a TestCase is fragile.
        self.example = """----------------------------------------------------------\n Run at Sat Sep 23 05:22:19 UTC 2023\n release : compares no agent, latest stable, and latest snapshot agents\n 5 users, 5000 iterations\n----------------------------------------------------------\nAgent : none latest snapshot\nRun duration : 00:02:27 00:02:57 00:03:06\nAvg. CPU (user) % : 0.46024063 0.48809186 0.49900937\nMax. CPU (user) % : 0.5527638 0.5891089 0.6\nAvg. mch tot cpu % : 0.9943353 0.99306744 0.9932704\nStartup time (ms) : 19598 16351 17050\nTotal allocated MB : 27799.50 34195.20 58039.97\nMin heap used (MB) : 88.10 115.85 112.63\nMax heap used (MB) : 365.90 557.00 478.78\nThread switch rate : 28534.94 29848.291 32354.986\nGC time (ms) : 1800 3014 2928\nGC pause time (ms) : 1814 3052 2959\nReq. mean (ms) : 10.74 12.82 13.51\nReq. p95 (ms) : 32.04 38.45 40.28\nIter. mean (ms) : 144.60 173.90 182.65\nIter. p95 (ms) : 233.74 275.94 291.89\nNet read avg (bps) : 5441971.00 4728712.00 4507975.00\nNet write avg (bps) : 7256048.00 25533599.00 24434992.00\nPeak threads : 43 55 56\n"""
        self.metrics = [
            "Min heap used (MB)",
            "Max heap used (MB)"
        ]

    def test_parse_metrics_from_summary(self):
        result = parse(report=self.example, metrics=self.metrics)
        self.assertEqual(557.00, result.metrics["Max heap used (MB)"])
        self.assertEqual(115.85, result.metrics["Min heap used (MB)"])

    def test_parse_date_from_summary(self):
        result = parse(report=self.example, metrics=self.metrics)
        self.assertEqual("2023-09-23", result.date)


Loading