class CodeFile:
    """A file in a repository listing, described by its path and its size.

    Attributes:
        path: Path of the file as given by the caller.
        size: Size value as given by the caller
            (presumably bytes from the repository tree payload; verify against caller).
    """

    def __init__(self, path: str, size: int):
        self.path = path
        self.size = size

    def __repr__(self) -> str:
        # A readable repr makes lists of CodeFile objects debuggable when printed.
        return f"{type(self).__name__}(path={self.path!r}, size={self.size!r})"
def get_file_counts_and_lengths(self, payload):
    """Build CodeFile objects for the entries of ``payload["tree"]`` that pass the filters.

    An entry is kept when ``matches_meta``, ``matches_file_extensions`` and
    ``matches_directory`` all accept it (checked in that order, stopping at
    the first rejection).

    Returns:
        A dict with a single key ``"files"`` holding the list of CodeFile
        objects, each carrying the entry's ``"path"`` and ``"size"``.
    """

    def _passes(entry):
        # Same short-circuit order as a chained `and`: the path is only
        # read once the meta check has accepted the entry.
        if not self.matches_meta(entry):
            return False
        if not self.matches_file_extensions(entry["path"]):
            return False
        return self.matches_directory(entry["path"])

    selected = [
        CodeFile(path=entry["path"], size=entry["size"])
        for entry in payload["tree"]
        if _passes(entry)
    ]
    return {"files": selected}
b/media/example_pie_output.png differ diff --git a/readme.md b/readme.md index 6655708..674b2e0 100644 --- a/readme.md +++ b/readme.md @@ -103,37 +103,30 @@ In the `open-telemetry/opentelemetry-java-instrumentation` repository, analyze t Output: -| Key | Value | -|:------------------|------:| -| spring | 52 | -| jaxrs | 37 | -| servlet | 23 | -| restlet | 22 | -| couchbase | 18 | -| aws-sdk | 17 | -| ratpack | 16 | -| elasticsearch | 15 | -| play | 15 | -| jaxws | 15 | -| vertx | 14 | -| mongo | 10 | -| jdbc | 8 | -| apache-dubbo-2.7 | 7 | -| jaxrs-client | 5 | -| netty | 5 | -| apache-httpclient | 3 | -| opentelemetry-api | 3 | -| grizzly-2.3 | 3 | -| grails-3.0 | 3 | -| undertow-1.4 | 3 | -| kafka | 3 | -| internal | 2 | -| dropwizard | 2 | -| hibernate | 1 | -| rediscala-1.8 | 1 | -| spymemcached-2.12 | 1 | -| twilio-6.6 | 1 | -| Total | 305 | +| Key | File Count | Total File Size | +|:------------------|-----------:|----------------:| +| servlet | 23 | 143182 | +| aws-sdk | 17 | 127827 | +| jdbc | 8 | 90890 | +| elasticsearch | 15 | 90341 | +| jaxrs | 37 | 72352 | +| vertx | 14 | 56845 | +| ratpack | 16 | 51932 | +| mongo | 10 | 51661 | +| play | 15 | 48986 | +| restlet | 22 | 38226 | +| jaxws | 17 | 21595 | +| kafka | 3 | 21505 | +| twilio-6.6 | 1 | 18936 | +| jaxrs-client | 5 | 16067 | +| spymemcached-2.12 | 1 | 15630 | +| undertow-1.4 | 3 | 12754 | +| hibernate | 1 | 12167 | +| dropwizard | 2 | 10789 | +| rediscala-1.8 | 1 | 3898 | +| grails-3.0 | 3 | 3201 | +| internal | 2 | 2603 | +| Total | 216 | 911387 | ![Example](./media/example_pie_output.png) diff --git a/utilities.py b/utilities.py index 3c52e7f..6cdf4e3 100644 --- a/utilities.py +++ b/utilities.py @@ -2,6 +2,8 @@ from collections import defaultdict from typing import List, Dict +from CodeFile import CodeFile + def get_dates_between(start_date_str, end_date, interval): date_format = "%Y-%m-%d" @@ -37,17 +39,20 @@ def count_by_file_extension(files: List[str], languages: List[str]) -> dict: return 
def count_by_language_and_file_extension(files: List["CodeFile"], languages: List[str]) -> tuple:
    """Tally matching files per instrumentation, by count and by total size.

    The instrumentation name is the second segment of each file's
    ``/``-separated path; files whose path has fewer than three segments are
    skipped. A file matches when the text after the last ``.`` of its final
    path segment is one of ``languages``.

    Args:
        files: Objects exposing ``path`` (str) and ``size`` (int) attributes.
        languages: File extensions (without the dot) to count.

    Returns:
        A 2-tuple ``(file_counts, file_sizes)``. Both are ``defaultdict(int)``
        keyed by instrumentation name: the first maps to the number of
        matching files, the second to the sum of their ``size`` values.
    """
    file_counts = defaultdict(int)
    file_sizes = defaultdict(int)
    for file in files:
        file_parts = file.path.split('/')
        # Need at least "<root>/<instrumentation>/<file>" to attribute the file.
        if len(file_parts) < 3:
            continue
        instrumentation = file_parts[1]
        extension = file_parts[-1].split('.')[-1]
        if extension in languages:
            file_counts[instrumentation] += 1
            file_sizes[instrumentation] += file.size
    return file_counts, file_sizes