From 799fb5e9584de13850cb704a04cd3397fde35e1e Mon Sep 17 00:00:00 2001 From: Jay DeLuca Date: Tue, 24 Dec 2024 14:25:54 -0500 Subject: [PATCH] list files in count by script --- .gitignore | 6 +++++- count_by_instrumentation.py | 19 +++++++++++++------ data_filter.py | 1 + main.py | 4 ++++ utilities.py | 19 ++++++++++++++++--- 5 files changed, 39 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 5d289db..48f8b11 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,8 @@ __pycache__/* tests/cache tests/cache/* -.coverage \ No newline at end of file +.coverage + +latest-groovy.png +latest-with-java.png +latest-groovy-detailed.png \ No newline at end of file diff --git a/count_by_instrumentation.py b/count_by_instrumentation.py index 991f5fa..58cb3dc 100644 --- a/count_by_instrumentation.py +++ b/count_by_instrumentation.py @@ -13,8 +13,9 @@ class App: - def __init__(self, languages: List[str], path_prefix: str, keyword: str): - self.client = GithubClient() + def __init__(self, languages: List[str], path_prefix: str, keyword: str, + client: GithubClient = GithubClient()): + self.client = client self.data_filter = DataFilter(languages=languages, path_prefix=path_prefix, keyword=keyword) @@ -35,19 +36,21 @@ def main(args): keyword="test" ) - today = datetime.now().date().strftime("%Y-%m-%dT%H:%M:%SZ") + today = (datetime.now().date() + pd.Timedelta(days=1)).strftime( + "%Y-%m-%dT%H:%M:%SZ") commit = app.get_commit_by_date(date=today, repository=args.repo) repo_files = app.get_repository_by_commit( repository=args.repo, commit=commit ) - file_counts, file_sizes = count_by_language_and_file_extension( + file_info = count_by_language_and_file_extension( files=repo_files["files"], languages=[args.language]) # Print the table showing file counts and sizes - data = [(key, file_counts[key], file_sizes[key]) for key in file_counts.keys()] + data = [(key, file_info.file_counts[key], file_info.file_sizes[key]) for key in + file_info.file_counts.keys()] df2 = pd.DataFrame(data, columns=['Key', 'File Count', 'Total File Size']) df2 = df2.sort_values(by='Total File Size', key=lambda col: col.astype(int), ascending=False) @@ -56,7 +59,7 @@ def main(args): print(f"| Total | {df2['File Count'].sum()} | {df2['Total File Size'].sum()} |") # Create a pie chart for file counts only - df = pd.DataFrame(list(file_counts.items()), columns=['Key', 'Value']) + df = pd.DataFrame(list(file_info.file_counts.items()), columns=['Key', 'Value']) df = df.sort_values(by='Value', key=lambda col: col.astype(int), ascending=False) sns.set_theme() @@ -69,6 +72,10 @@ def main(args): plt.title(f'Remaining {args.language} files by Instrumentation') plt.ylabel('') + print("\n") + for item in file_info.matched_files: + print(item) + if args.output is not None: plt.savefig(args.output) else: diff --git a/data_filter.py b/data_filter.py index 20637ba..77079d4 100644 --- a/data_filter.py +++ b/data_filter.py @@ -42,6 +42,7 @@ def get_file_counts_and_lengths(self, payload): for i in tree: if self.matches_meta(i) \ and self.matches_file_extensions(i["path"]) \ + and "grails" not in i["path"] \ and self.matches_directory(i["path"]): data_result.append(CodeFile(path=i["path"], size=i["size"])) diff --git a/main.py b/main.py index 65204e2..bfeea10 100644 --- a/main.py +++ b/main.py @@ -89,6 +89,10 @@ def main(args): for lang, counts in language_counts.items(): df = pd.DataFrame({'Date': dates, 'Count': counts}) + + # remove any junk data + df = df[df['Count'] != 0] + df['Date'] = pd.to_datetime(df['Date']) sns.lineplot(x='Date', y='Count', label=lang.capitalize(), data=df, marker='o') diff --git a/utilities.py b/utilities.py index c98e37e..ee4f0ce 100644 --- a/utilities.py +++ b/utilities.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from datetime import datetime, timedelta from collections import defaultdict from typing import List, Dict @@ -5,6 +6,13 @@ from CodeFile import CodeFile +@dataclass +class FileCountInfo: + file_counts: Dict[str, int] + file_sizes: Dict[str, int] + matched_files: List[str] + + def get_dates_between(start_date_str, end_date, interval): date_format = "%Y-%m-%d" output_format = "%Y-%m-%dT%H:%M:%SZ" @@ -38,12 +46,14 @@ def count_by_file_extension(files: List[str], languages: List[str]) -> dict: for file in files: for ext in languages: extension = f".{ext.lower()}" - if file.endswith(extension): + if file.endswith(extension) and "grails" not in file: file_counts[ext] += 1 return file_counts -def count_by_language_and_file_extension(files: List[CodeFile], languages: List[str]) -> Dict[str, Dict[str, int]]: +def count_by_language_and_file_extension(files: List[CodeFile], + languages: List[str]) -> FileCountInfo: + matched_files = [] file_counts = defaultdict(int) file_sizes = defaultdict(int) for file in files: @@ -55,7 +65,10 @@ def count_by_language_and_file_extension(files: List[CodeFile], languages: List[ if extension in languages: file_counts[instrumentation] += 1 file_sizes[instrumentation] += file.size - return file_counts, file_sizes + matched_files.append(file.path) + + return FileCountInfo(file_counts=file_counts, file_sizes=file_sizes, + matched_files=matched_files) def convert_to_plot(input_dict: dict, items):