diff --git a/benchmark.py b/benchmark.py
new file mode 100644
index 0000000..378d448
--- /dev/null
+++ b/benchmark.py
@@ -0,0 +1,91 @@
+from typing import List
+
+import matplotlib.pyplot as plt
+import argparse
+from github_client import GithubClient
+from results_parser import parse
+from single_file_cache import SingleFileCache
+from utilities import get_dates_between, convert_to_plot
+from datetime import datetime
+from collections import defaultdict
+
+COMMIT_CACHE_FILE = 'cache/benchmark-date-commit-cache.json'
+REPORT_CACHE_FILE = 'cache/benchmark-report-cache.json'
+
+
+class BenchmarkApp:
+    def __init__(self, file_path: str):
+        self.client = GithubClient()
+        self.commit_cache = SingleFileCache(location=COMMIT_CACHE_FILE)
+        self.report_cache = SingleFileCache(location=REPORT_CACHE_FILE)
+        self.file_path = file_path
+
+    def get_commit_by_date(self, repository, date):
+        find_commit = self.commit_cache.retrieve_value(date)
+        if not find_commit:
+            find_commit = self.client.get_most_recent_commit(repository, date, "gh-pages")
+            if find_commit:
+                self.commit_cache.add_to_cache(date, find_commit)
+
+        return find_commit
+
+    def get_report_by_commit(self, repository, commit):
+        repo_data = self.report_cache.retrieve_value(commit)
+
+        if not repo_data:
+            repo_data = self.client.get_file_at_commit(repository=repository, filepath=self.file_path, commit_sha=commit)
+            self.report_cache.add_to_cache(commit, repo_data)
+
+        return repo_data
+
+
+def main(args):
+    file_path = "benchmark-overhead/results/release/summary.txt"
+
+    metrics = [
+        "Min heap used (MB)",
+        "Max heap used (MB)"
+    ]
+
+    timeframe = get_dates_between(args.start, datetime.now().date(), args.interval)
+    result = defaultdict(dict)
+
+    app = BenchmarkApp(file_path=file_path)
+
+    for snapshot in timeframe:
+        commit = app.get_commit_by_date(date=snapshot, repository=args.repo)
+
+        report = app.get_report_by_commit(repository=args.repo, commit=commit)
+        parsed = parse(report, metrics)
+        if parsed:
+            result[snapshot]["date"] = snapshot
+            for metric in metrics:
+                result[snapshot][metric] = parsed.metrics[metric]
+
+    dates, metric_values = convert_to_plot(result, metrics)
+
+    for metric, values in metric_values.items():
+        plt.plot(dates, values, label=metric)
+
+    plt.xlabel('Date')
+    plt.ylabel('MB')
+    plt.title('Benchmark Metrics')
+    plt.xticks(rotation=45)
+    plt.legend()
+    plt.tight_layout()
+    plt.show()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Benchmark Tracker')
+    parser.add_argument("-r", "--repo",
+                        help="Repository name. "
" + "ex: open-telemetry/opentelemetry-java-instrumentation", + required=True) + parser.add_argument("-s", "--start", + help="Starting Date (will calculate from this date until now)", + required=True) + parser.add_argument("-i", "--interval", + help="Interval (in days) between data points", required=True) + arguments = parser.parse_args() + main(arguments) diff --git a/github_client.py b/github_client.py index 131ecd0..58c6342 100644 --- a/github_client.py +++ b/github_client.py @@ -1,3 +1,5 @@ +import base64 + import requests import os @@ -17,13 +19,14 @@ def _get(self, url, params=None): except Exception as e: print(e) - def get_most_recent_commit(self, repo, timestamp) -> requests.models.Response: + def get_most_recent_commit(self, repo: str, timestamp: str, branch: str) -> requests.models.Response: api_url = f"{self.base_url}/repos/{repo}/commits" params = { "per_page": 1, "until": timestamp, - "order": "desc" + "order": "desc", + "sha": branch } response = self._get(api_url, params=params) @@ -40,7 +43,7 @@ def get_most_recent_commit(self, repo, timestamp) -> requests.models.Response: print(f"Error: {response.status_code}") return None - def get_repository_at_commit(self, repository, commit_sha): + def get_repository_at_commit(self, repository: str, commit_sha: str): api_url = f"{self.base_url}/repos/{repository}/git/trees/{commit_sha}?recursive=1" response = self._get(api_url) @@ -50,3 +53,17 @@ def get_repository_at_commit(self, repository, commit_sha): else: print(f"Error: {response.status_code}") return None + + def get_file_at_commit(self, repository: str, filepath: str, commit_sha: str): + api_url = f"{self.base_url}/repos/{repository}/contents/{filepath}" + + response = self._get(api_url, params={"ref": commit_sha}) + + if response.status_code == 200: + # File content is base64 encoded, decode it + content = response.json().get("content", "") + content = base64.b64decode(content) + return str(content, encoding='utf-8') + else: + print(f"Error: {response.status_code}") + return None diff --git a/main.py b/main.py index 83f8312..cff5324 100644 --- a/main.py +++ b/main.py @@ -7,7 +7,7 @@ from data_filter import DataFilter from multi_file_cache import MultiFileCache -from utilities import count_by_file_extension, get_dates_between +from utilities import count_by_file_extension, get_dates_between, convert_to_plot from single_file_cache import SingleFileCache from github_client import GithubClient @@ -27,7 +27,7 @@ def __init__(self, languages: List[str], path_prefix: str, keyword: str): def get_commit_by_date(self, repository, date): find_commit = self.commit_cache.retrieve_value(date) if not find_commit: - find_commit = self.client.get_most_recent_commit(repository, date) + find_commit = self.client.get_most_recent_commit(repository, date, "main") if find_commit: self.commit_cache.add_to_cache(date, find_commit) @@ -74,9 +74,7 @@ def main(args): except Exception as e: print(f"Error for {snapshot}, {e}") - dates = [] - - language_counts = {} + dates, language_counts = convert_to_plot(result, languages) for item in result.values(): dates.append(item["date"][:10]) diff --git a/media/benchmark_output.png b/media/benchmark_output.png new file mode 100644 index 0000000..fdd0a52 Binary files /dev/null and b/media/benchmark_output.png differ diff --git a/readme.md b/readme.md index 93fb746..5d4582d 100644 --- a/readme.md +++ b/readme.md @@ -1,6 +1,10 @@ -# Code Migration Tracker +# Repo Metrics -Goal: Given a repository, a timeframe, and any filtering rules, track a goal over time. 
+A collection of tools that pull data about a repository and visualize it as timeseries metrics.
+
+Functionality:
+* File count tracking: follow the migration of code from one language to another over time.
+* Benchmark tracking: pull benchmark results and visualize them as a timeseries.
 
 ## Setup
 
@@ -11,7 +15,23 @@ export GITHUB_TOKEN="insert-your-token"
 make install
 ```
 
-## Arguments
+## Test / linting
+
+```
+# Make sure you install pytest and ruff
+make install
+
+# Tests
+make test
+
+# Linting
+make lint
+```
+
+
+## File Count Tracking
+
+### Arguments
 
 | Argument | Command | Description | Example |
 |------------|----------------|----------------------------------------------------------------------------|------------------------------------------------------------|
@@ -20,7 +40,7 @@
 | Interval | -i, --interval | Interval (in days) between data points | --interval 14 |
 
 
-## Example Usage:
+### Example Usage:
 
 In the `open-telemetry/opentelemetry-java-instrumentation` repository, track the conversion of tests from groovy to java in the `instrumentation` directory starting from 2022-11-15 with a data point every 2 weeks.
 
@@ -32,19 +52,28 @@
 Output:
 
 ![Example](./media/example_output.png)
 
-## Test / linting
+## Benchmark Visualization
 
-```
-# Make sure you install pytest and ruff
-make install
+This tool is specific to the open-telemetry/opentelemetry-java-instrumentation repo.
 
-# Tests
-make test
+### Arguments
 
-# Linting
-make lint
-```
+| Argument | Command | Description | Example |
+|------------|----------------|----------------------------------------------------------------------------|------------------------------------------------------------|
+| Repository | -r, --repo | Repository name. | --repo "open-telemetry/opentelemetry-java-instrumentation" |
+| Start Date | -s, --start | Starting Date in format %Y-%m-%d (will calculate from this date until now) | --start "2022-11-15" |
+| Interval | -i, --interval | Interval (in days) between data points | --interval 14 |
+
+
+### Example Usage:
+
+Chart the min and max heap used starting from 2022-02-14, with a data point every 30 days.
+`python benchmark.py -r "open-telemetry/opentelemetry-java-instrumentation" -s "2022-02-14" -i 30`
+
+Output:
+
+![Example](./media/benchmark_output.png)
 
 ## Approach
 
@@ -54,6 +83,3 @@
 - Cache this data locally to avoid repeated api calls
 - Generate Graph to show results over time frame
 
-
-## Data Filters
-
diff --git a/results_parser.py b/results_parser.py
new file mode 100644
index 0000000..e413cf5
--- /dev/null
+++ b/results_parser.py
@@ -0,0 +1,46 @@
+from datetime import datetime
+from typing import List, Optional
+
+
+class ReportMetrics:
+    def __init__(self, date: str):
+        self.date = date
+        self.metrics = {}
+
+
+def parse(report: str, metrics: List[str]) -> Optional[ReportMetrics]:
+    if report is None:
+        return None
+
+    split = report.split("----------------------------------------------------------\n")
+
+    metrics_split = split[2].split("\n")
+    date = convert_to_desired_format(split[1].split("Run at ")[1].split("\n")[0])
+
+    report_metrics = ReportMetrics(date=date)
+
+    try:
+        for line in metrics_split:
+            for metric in metrics:
+                if line.startswith(metric):
+                    values = line.split(":")
+                    # index 1 picks the middle ("latest stable") column of the report
+                    report_metrics.metrics[metric] = float(values[1].split()[1])
+    except IndexError:
+        return None
+
+    return report_metrics
+
+
+def convert_to_desired_format(date_str):
+    # Define the input and output date formats
+    input_format = "%a %b %d %H:%M:%S UTC %Y"
+    output_format = "%Y-%m-%d"
+
+    try:
+        parsed_date = datetime.strptime(date_str, input_format)
+        formatted_date = parsed_date.strftime(output_format)
+        return formatted_date
+    except ValueError:
+        print("Invalid date format")
+        return None
diff --git a/tests/results_parser_test.py b/tests/results_parser_test.py
new file mode 100644
index 0000000..dfa3cb0
--- /dev/null
+++ b/tests/results_parser_test.py
@@ -0,0 +1,24 @@
+import unittest
+
+from results_parser import parse
+
+
+class ResultsParserTestCase(unittest.TestCase):
+
+    def setUp(self):
+        self.example = """----------------------------------------------------------\n Run at Sat Sep 23 05:22:19 UTC 2023\n release : compares no agent, latest stable, and latest snapshot agents\n 5 users, 5000 iterations\n----------------------------------------------------------\nAgent : none latest snapshot\nRun duration : 00:02:27 00:02:57 00:03:06\nAvg. CPU (user) % : 0.46024063 0.48809186 0.49900937\nMax. CPU (user) % : 0.5527638 0.5891089 0.6\nAvg. mch tot cpu % : 0.9943353 0.99306744 0.9932704\nStartup time (ms) : 19598 16351 17050\nTotal allocated MB : 27799.50 34195.20 58039.97\nMin heap used (MB) : 88.10 115.85 112.63\nMax heap used (MB) : 365.90 557.00 478.78\nThread switch rate : 28534.94 29848.291 32354.986\nGC time (ms) : 1800 3014 2928\nGC pause time (ms) : 1814 3052 2959\nReq. mean (ms) : 10.74 12.82 13.51\nReq. p95 (ms) : 32.04 38.45 40.28\nIter. mean (ms) : 144.60 173.90 182.65\nIter. p95 (ms) : 233.74 275.94 291.89\nNet read avg (bps) : 5441971.00 4728712.00 4507975.00\nNet write avg (bps) : 7256048.00 25533599.00 24434992.00\nPeak threads : 43 55 56\n"""
+        self.metrics = [
+            "Min heap used (MB)",
+            "Max heap used (MB)"
+        ]
+
+    def test_parse_metrics_from_summary(self):
+        result = parse(report=self.example, metrics=self.metrics)
+        self.assertEqual(557.00, result.metrics["Max heap used (MB)"])
+        self.assertEqual(115.85, result.metrics["Min heap used (MB)"])
+
+    def test_parse_date_from_summary(self):
+        result = parse(report=self.example, metrics=self.metrics)
+        self.assertEqual("2023-09-23", result.date)
+
diff --git a/tests/test_utilities.py b/tests/test_utilities.py
index b25200a..aafcfdf 100644
--- a/tests/test_utilities.py
+++ b/tests/test_utilities.py
@@ -1,6 +1,6 @@
 from unittest import TestCase
 
-from utilities import get_dates_between, count_by_file_extension
+from utilities import get_dates_between, count_by_file_extension, convert_to_plot
 from datetime import datetime
 
 
@@ -56,3 +56,99 @@ def test_count_by_file_extension(self):
         self.assertEqual(2, result['groovy'])
         self.assertEqual(1, result['txt'])
 
+    def test_report_generator(self):
+        metrics = [
+            "Min heap used (MB)",
+            "Max heap used (MB)"
+        ]
+
+        expected_dates = ['2022-11-16', '2022-11-30', '2022-12-14', '2022-12-28',
+                          '2023-01-11', '2023-01-25', '2023-02-08', '2023-02-22',
+                          '2023-03-08', '2023-03-22', '2023-04-05', '2023-04-19',
+                          '2023-05-03', '2023-05-17', '2023-05-31', '2023-06-14',
+                          '2023-06-28', '2023-07-12', '2023-07-26', '2023-08-09',
+                          '2023-08-23', '2023-09-06', '2023-09-20']
+
+        test = {'2022-11-16T00:00:00Z': {'date': '2022-11-16T00:00:00Z',
+                                         'Min heap used (MB)': 88.37,
+                                         'Max heap used (MB)': 360.79},
+                '2022-11-30T00:00:00Z': {'date': '2022-11-30T00:00:00Z',
+                                         'Min heap used (MB)': 93.24,
+                                         'Max heap used (MB)': 357.57},
+                '2022-12-14T00:00:00Z': {'date': '2022-12-14T00:00:00Z',
+                                         'Min heap used (MB)': 93.12,
+                                         'Max heap used (MB)': 489.0},
+                '2022-12-28T00:00:00Z': {'date': '2022-12-28T00:00:00Z',
+                                         'Min heap used (MB)': 93.36,
+                                         'Max heap used (MB)': 339.98},
+                '2023-01-11T00:00:00Z': {'date': '2023-01-11T00:00:00Z',
+                                         'Min heap used (MB)': 90.34,
+                                         'Max heap used (MB)': 448.27},
+                '2023-01-25T00:00:00Z': {'date': '2023-01-25T00:00:00Z',
+                                         'Min heap used (MB)': 90.57,
+                                         'Max heap used (MB)': 343.99},
+                '2023-02-08T00:00:00Z': {'date': '2023-02-08T00:00:00Z',
+                                         'Min heap used (MB)': 88.31,
+                                         'Max heap used (MB)': 389.67},
+                '2023-02-22T00:00:00Z': {'date': '2023-02-22T00:00:00Z',
+                                         'Min heap used (MB)': 89.99,
+                                         'Max heap used (MB)': 334.34},
+                '2023-03-08T00:00:00Z': {'date': '2023-03-08T00:00:00Z',
+                                         'Min heap used (MB)': 85.4,
+                                         'Max heap used (MB)': 340.25},
+                '2023-03-22T00:00:00Z': {'date': '2023-03-22T00:00:00Z',
+                                         'Min heap used (MB)': 94.67,
+                                         'Max heap used (MB)': 362.89},
+                '2023-04-05T00:00:00Z': {'date': '2023-04-05T00:00:00Z',
+                                         'Min heap used (MB)': 83.31,
+                                         'Max heap used (MB)': 406.11},
+                '2023-04-19T00:00:00Z': {'date': '2023-04-19T00:00:00Z',
+                                         'Min heap used (MB)': 108.68,
+                                         'Max heap used (MB)': 474.59},
+                '2023-05-03T00:00:00Z': {'date': '2023-05-03T00:00:00Z',
+                                         'Min heap used (MB)': 90.29,
+                                         'Max heap used (MB)': 396.23},
+                '2023-05-17T00:00:00Z': {'date': '2023-05-17T00:00:00Z',
+                                         'Min heap used (MB)': 96.5,
+                                         'Max heap used (MB)': 448.06},
+                '2023-05-31T00:00:00Z': {'date': '2023-05-31T00:00:00Z',
+                                         'Min heap used (MB)': 94.72,
+                                         'Max heap used (MB)': 362.23},
+                '2023-06-14T00:00:00Z': {'date': '2023-06-14T00:00:00Z',
+                                         'Min heap used (MB)': 112.4,
+                                         'Max heap used (MB)': 483.55},
+                '2023-06-28T00:00:00Z': {'date': '2023-06-28T00:00:00Z',
+                                         'Min heap used (MB)': 109.7,
+                                         'Max heap used (MB)': 478.83},
+                '2023-07-12T00:00:00Z': {'date': '2023-07-12T00:00:00Z',
+                                         'Min heap used (MB)': 115.64,
+                                         'Max heap used (MB)': 511.06},
+                '2023-07-26T00:00:00Z': {'date': '2023-07-26T00:00:00Z',
+                                         'Min heap used (MB)': 117.18,
+                                         'Max heap used (MB)': 545.65},
+                '2023-08-09T00:00:00Z': {'date': '2023-08-09T00:00:00Z',
+                                         'Min heap used (MB)': 111.84,
+                                         'Max heap used (MB)': 483.33},
+                '2023-08-23T00:00:00Z': {'date': '2023-08-23T00:00:00Z',
+                                         'Min heap used (MB)': 114.26,
+                                         'Max heap used (MB)': 503.5},
+                '2023-09-06T00:00:00Z': {'date': '2023-09-06T00:00:00Z',
+                                         'Min heap used (MB)': 116.66,
+                                         'Max heap used (MB)': 554.82},
+                '2023-09-20T00:00:00Z': {'date': '2023-09-20T00:00:00Z',
+                                         'Min heap used (MB)': 103.79,
+                                         'Max heap used (MB)': 521.74}}
+
+        expected_plots = {
+            'Min heap used (MB)': [88.37, 93.24, 93.12, 93.36, 90.34, 90.57, 88.31,
+                                   89.99, 85.4, 94.67, 83.31, 108.68, 90.29, 96.5,
+                                   94.72, 112.4, 109.7, 115.64, 117.18, 111.84, 114.26,
+                                   116.66, 103.79],
+            'Max heap used (MB)': [360.79, 357.57, 489.0, 339.98, 448.27, 343.99,
+                                   389.67, 334.34, 340.25, 362.89, 406.11, 474.59,
+                                   396.23, 448.06, 362.23, 483.55, 478.83, 511.06,
+                                   545.65, 483.33, 503.5, 554.82, 521.74]}
+
+        dates, plots = convert_to_plot(test, metrics)
+        self.assertEqual(expected_plots, plots)
+        self.assertEqual(expected_dates, dates)
diff --git a/utilities.py b/utilities.py
index a8ccf1f..651fef9 100644
--- a/utilities.py
+++ b/utilities.py
@@ -35,3 +35,17 @@ def count_by_file_extension(files: List[str], languages: List[str]) -> dict:
             if file.endswith(extension):
                 file_counts[ext] += 1
     return file_counts
+
+
+def convert_to_plot(input_dict: dict, items: List[str]):
+    # Flatten snapshot entries into an x-axis of dates plus one y-series per item.
+    result = {}
+    dates = []
+    for entry in input_dict.values():
+        dates.append(entry["date"][:10])
+        for item in items:
+            try:
+                result[item].append(entry[item])
+            except KeyError:
+                result[item] = [entry[item]]
+    return dates, result
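
For reference, a minimal sketch of the contract the test above exercises: `convert_to_plot` takes the per-snapshot dict that `benchmark.py` and `main.py` accumulate (keyed by ISO timestamp, each entry carrying a `date` field plus one key per metric) and returns a date axis plus one value series per requested item. The numbers below are lifted from the test fixture; the snippet is an illustration, not part of the patch itself.

```
from utilities import convert_to_plot

# Two snapshots in the shape benchmark.py builds (values taken from the
# test fixture above); "date" is truncated to YYYY-MM-DD for the x-axis.
result = {
    "2023-09-06T00:00:00Z": {"date": "2023-09-06T00:00:00Z",
                             "Min heap used (MB)": 116.66,
                             "Max heap used (MB)": 554.82},
    "2023-09-20T00:00:00Z": {"date": "2023-09-20T00:00:00Z",
                             "Min heap used (MB)": 103.79,
                             "Max heap used (MB)": 521.74},
}

dates, plots = convert_to_plot(result, ["Min heap used (MB)", "Max heap used (MB)"])

assert dates == ["2023-09-06", "2023-09-20"]            # timestamps truncated to dates
assert plots["Min heap used (MB)"] == [116.66, 103.79]  # one ordered series per metric
assert plots["Max heap used (MB)"] == [554.82, 521.74]
```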