From 7a283bd30b8a4a9970f71ddd46195ba1c568bdf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20R=C3=BCdiger=20Steuer?= Date: Thu, 28 Sep 2023 17:16:52 +0200 Subject: [PATCH] plots and tables used for RQ1improved --- varats/varats/plots/feature_blame_plots.py | 278 +++++++------------ varats/varats/tables/feature_blame_tables.py | 96 ++++++- 2 files changed, 179 insertions(+), 195 deletions(-) diff --git a/varats/varats/plots/feature_blame_plots.py b/varats/varats/plots/feature_blame_plots.py index 0baf705d3..0ebd62ce8 100644 --- a/varats/varats/plots/feature_blame_plots.py +++ b/varats/varats/plots/feature_blame_plots.py @@ -5,6 +5,7 @@ import pandas as pd import seaborn as sns +from varats.data.metrics import apply_tukeys_fence from varats.data.reports.feature_blame_report import ( StructuralFeatureBlameReport as SFBR, ) @@ -91,110 +92,74 @@ def get_structural_commit_data_for_case_study(case_study: CaseStudy) -> pd.DataF ######## FEATURES ######### -class FeatureSizeCorrSFBRPlot(Plot, plot_name="feature_size_corr_sfbr_plot"): +class FeatureSFBRPlot(Plot, plot_name="feature_sfbr_plot"): def plot(self, view_mode: bool) -> None: case_studies: tp.List[CaseStudy] = self.plot_kwargs["case_studies"] - df = pd.concat( - [ - get_structural_feature_data_for_case_study(case_study) - for case_study in case_studies - ] - ) - - plt = sns.regplot(data=df, x="feature_size", y="num_interacting_commits") - - plt.set(xlabel="Feature Size", ylabel="Number Interacting Commits") - - -class FeatureSizeCorrSFBRPlotGenerator( - PlotGenerator, - generator_name="feature-size-corr-sfbr-plot", - options=[REQUIRE_MULTI_CASE_STUDY], -): - def generate(self) -> tp.List[Plot]: - case_studies: tp.List[CaseStudy] = self.plot_kwargs.pop("case_study") - return [ - FeatureSizeCorrSFBRPlot( - self.plot_config, case_studies=case_studies, **self.plot_kwargs + fig, naxs = pyplot.subplots(len(case_studies), 3, figsize=(18, 18)) + fig.suptitle("Structural Interactions of Features") + first: bool = True + for axs, case_study in zip(naxs, case_studies): + data = get_structural_feature_data_for_case_study(case_study) + + data = data.sort_values(by=["num_interacting_commits_nd1"]) + index = ["" for _ in range(len(data))] + + stacked_feature_data = pd.DataFrame( + { + "Interacting with ND1": data["num_interacting_commits_nd1"].values, + "Interacting with ND>1": data[ + "num_interacting_commits_nd>1" + ].values, + }, + index=index, ) - ] + stacked_feature_data.plot.bar(stacked=True, width=0.95, ax=axs[0]) + axs[0].set_xlabel("Features" if first else "", size="13") + axs[0].set_ylabel("Num Interacting Commits" if first else "", size="13") + axs[0].set_title(case_study.project_name, size="16") -class FeatureDisSFBRPlot(Plot, plot_name="feature_dis_sfbr_plot"): - def plot(self, view_mode: bool) -> None: - case_studies: tp.List[CaseStudy] = self.plot_kwargs["case_studies"] - dfs = [ - get_structural_feature_data_for_case_study(case_study) - for case_study in case_studies - ] - data = pd.concat( - [ - df.assign(project=[case_study.project_name for _ in range(0, len(df))]) - for case_study, df in zip(case_studies, dfs) - ] - ) + data = data.sort_values(by=["def_feature_size"]) - plt = sns.boxplot( - data=data, - x="num_interacting_commits", - y="project", - ) - ticks = 5 - start = min(data["num_interacting_commits"]) - stop = max(data["num_interacting_commits"]) - step = round((stop - start) / ticks) - plt.set_xticks(range(start, stop + step, step), range(start, stop + step, step)) - plt.set( - xlabel="Number Interacting Commits", - ylabel="Project", - title="Structural Interactions of Features", - ) - - -class FeatureDisSFBRPlotGenerator( - PlotGenerator, - generator_name="feature-dis-sfbr-plot", - options=[REQUIRE_MULTI_CASE_STUDY], -): - def generate(self) -> tp.List[Plot]: - case_studies: tp.List[CaseStudy] = self.plot_kwargs.pop("case_study") - return [ - FeatureDisSFBRPlot( - self.plot_config, case_studies=case_studies, **self.plot_kwargs + stacked_feature_size_data = pd.DataFrame( + { + "Definite Feature Size": data["def_feature_size"].values, + "Potential Feature Size": data["pot_feature_size"].values + - data["def_feature_size"].values, + }, + index=index, ) - ] - - -class FeatureSizeDisSFBRPlot(Plot, plot_name="feature_size_dis_sfbr_plot"): - def plot(self, view_mode: bool) -> None: - case_studies: tp.List[CaseStudy] = self.plot_kwargs["case_studies"] - df = pd.concat( - [ - get_structural_feature_data_for_case_study(case_study) - for case_study in case_studies - ] - ) + stacked_feature_size_data.plot.bar(stacked=True, width=0.95, ax=axs[1]) + axs[1].set_xlabel("") + axs[1].set_ylabel("Feature Size" if first else "", size="13") - df = df.sort_values(by=["feature_size"]) + sns.regplot( + data=data, + x="def_feature_size", + y="num_interacting_commits_nd1", + ax=axs[2], + ) + sns.regplot( + data=data, x="pot_feature_size", y="num_interacting_commits", ax=axs[2] + ) - plt = sns.barplot(data=df, x="feature", y="feature_size", color="steelblue") - plt.set(xlabel="Feature", ylabel="Size", title="") + axs[2].set_xlabel("Feature Size" if first else "", size="13") + axs[2].set_ylabel("Num Interacting Commits" if first else "", size="13") - xticklabels = [str(i) for i in range(0, len(df))] - plt.set(xticklabels=xticklabels) + first = False -class FeatureSizeDisSFBRPlotGenerator( +class FeatureSFBRPlotGenerator( PlotGenerator, - generator_name="feature-size-dis-sfbr-plot", + generator_name="feature-sfbr-plot", options=[REQUIRE_MULTI_CASE_STUDY], ): def generate(self) -> tp.List[Plot]: case_studies: tp.List[CaseStudy] = self.plot_kwargs.pop("case_study") return [ - FeatureSizeDisSFBRPlot( + FeatureSFBRPlot( self.plot_config, case_studies=case_studies, **self.plot_kwargs ) ] @@ -203,84 +168,6 @@ def generate(self) -> tp.List[Plot]: ######## COMMITS ######### -def get_pie_data_for_commit_data(commit_data) -> (tp.List[int], tp.List[int]): - min_num_interacting_features = min(commit_data) - max_num_interacting_features = max(commit_data) - - data = [ - 0 for _ in range(min_num_interacting_features, max_num_interacting_features + 1) - ] - add_s = lambda x: "" if x == 1 else "s" - labels = [ - "Impl. " + str(i) + " feature" + add_s(i) - for i in range(min_num_interacting_features, max_num_interacting_features + 1) - ] - - for num_interacting_features in commit_data: - data[num_interacting_features - min_num_interacting_features] = ( - data[num_interacting_features - min_num_interacting_features] + 1 - ) - - adj_labels, adj_data = ([], []) - for i in range(0, max_num_interacting_features - min_num_interacting_features + 1): - if data[i] == 0: - continue - frac = data[i] / len(commit_data) - if frac < 0.05: - num_interacting_features = i + min_num_interacting_features - adj_labels.append( - "Impl. >=" - + str(num_interacting_features) - + " feature" - + add_s(num_interacting_features) - ) - adj_data.append(np.sum(data[i:])) - break - adj_labels.append(labels[i]) - adj_data.append(data[i]) - - return (adj_data, adj_labels) - - -class CommitSFBRPieChart(Plot, plot_name="commit_sfbr_pie_chart"): - def plot(self, view_mode: bool) -> None: - case_studies: tp.List[CaseStudy] = self.plot_kwargs["case_studies"] - - def func(pct): - absolute = int(np.round(pct / 100.0 * len(commit_data))) - return f"{absolute:d}" - - fig, naxs = pyplot.subplots(2, 2, figsize=(15, 15)) - case_study_counter = 0 - for axs in naxs: - for ax in axs: - case_study = case_studies[case_study_counter] - commit_data = get_structural_commit_data_for_case_study(case_study).loc[ - :, "num_interacting_features" - ] - data, labels = get_pie_data_for_commit_data(commit_data) - explode = [0.1] + [0 for _ in range(1, len(data))] - ax.pie( - data, labels=labels, explode=explode, autopct=lambda pct: func(pct) - ) - ax.set_title(case_study.project_name) - case_study_counter += 1 - - -class CommitSFBRPieChartGenerator( - PlotGenerator, - generator_name="commit-sfbr-pie-chart", - options=[REQUIRE_MULTI_CASE_STUDY], -): - def generate(self) -> tp.List[Plot]: - case_studies: tp.List[CaseStudy] = self.plot_kwargs.pop("case_study") - return [ - CommitSFBRPieChart( - self.plot_config, case_studies=case_studies, **self.plot_kwargs - ) - ] - - class CommitSpecificSFBRPlot(Plot, plot_name="commit_specific_sfbr_plot"): def plot(self, view_mode: bool) -> None: case_studies: tp.List[CaseStudy] = self.plot_kwargs["case_studies"] @@ -288,40 +175,65 @@ def plot(self, view_mode: bool) -> None: fig, naxs = pyplot.subplots(2, 2, figsize=(15, 15)) case_study_counter = 0 for axs in naxs: - for nth_ax in axs: + for ax in axs: if case_study_counter == len(case_studies): continue case_study = case_studies[case_study_counter] commit_data = get_structural_commit_data_for_case_study(case_study) - commit_data = commit_data.sort_values(by=["num_interacting_features"])[ - "num_interacting_features" - ] + commit_data = commit_data.sort_values(by=["num_interacting_features"]) + + filter_lrg_commits = apply_tukeys_fence( + commit_data, column="commit_size", k=1.5 + ) + + commit_data = commit_data["num_interacting_features"] interacting_with_nd1 = [ - commit_data[index][0] for index in commit_data.index + commit_data[index][0] if index in filter_lrg_commits.index else 0 + for index in commit_data.index ] - interacting_with_at_leat_nd2 = [ - sum(commit_data[index][1:]) for index in commit_data.index + interacting_with_at_least_nd2 = [ + sum(commit_data[index][1:]) + if index in filter_lrg_commits.index + else 0 + for index in commit_data.index + ] + interacting_with_nd1_lrg_commit = [ + 0 if index in filter_lrg_commits.index else commit_data[index][0] + for index in commit_data.index + ] + interacting_with_at_least_nd2_lrg_commit = [ + 0 + if index in filter_lrg_commits.index + else sum(commit_data[index][1:]) + for index in commit_data.index ] - stacked_commit_data = pd.DataFrame( - { - "Min Nesting Degree 1": interacting_with_nd1, - "Min Nesting Degree >=2": interacting_with_at_leat_nd2, - }, - index=commit_data.index, - ) - - stacked_commit_data.plot.bar(stacked=True, width=0.95, ax=nth_ax) - nth_ax.set_xlabel("Commits") - nth_ax.set_ylabel("Num Interacting Features") + rng = range(len(commit_data)) + ax.bar(rng, interacting_with_nd1) + ax.bar( + rng, + interacting_with_at_least_nd2, + bottom=interacting_with_nd1, + ) + ax.bar(rng, interacting_with_nd1_lrg_commit, alpha=0.65, color="tab:blue") + ax.bar( + rng, + interacting_with_at_least_nd2_lrg_commit, + bottom=interacting_with_nd1_lrg_commit, + alpha=0.65, + color="tab:orange", + ) + ax.set_xlabel("Commits") + ax.set_ylabel("Num Interacting Features") step = round(len(commit_data) / 6) - nth_ax.set_xticks( + ax.set_xticks( ticks=[i * step for i in range(6)], labels=[str(i * step) for i in range(6)], ) - nth_ax.set_title(case_study.project_name) + ax.set_title(case_study.project_name) + ax.legend(["Interacting with ND1", "Interacting with ND>1", "ND1, Large Commit", "ND>1, Large Commit"]) case_study_counter += 1 @@ -822,7 +734,7 @@ def plot(self, view_mode: bool) -> None: else: ax.set_xlabel("") ax.set_ylabel("") - x_rng = range(0, len(author_data), 2) + x_rng = range(1, len(author_data) + 1, 2) ax.set_xticks(ticks=x_rng, labels=[str(i) for i in x_rng]) max_impl_authors = max(author_data["num_implementing_authors"]) y_rng = range(1, max_impl_authors + 1) diff --git a/varats/varats/tables/feature_blame_tables.py b/varats/varats/tables/feature_blame_tables.py index 6d719ee5d..90e1e9f8c 100644 --- a/varats/varats/tables/feature_blame_tables.py +++ b/varats/varats/tables/feature_blame_tables.py @@ -118,7 +118,7 @@ def generate(self) -> tp.List[Table]: ] -class SFBRCommitEvalTable(Table, table_name="sfbr_commit_eval_table"): +class SFBRCommitAvgEvalTable(Table, table_name="sfbr_commit_avg_eval_table"): def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str: case_studies: tp.List[CaseStudy] = self.table_kwargs["case_studies"] @@ -185,6 +185,82 @@ def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str: commit_average_number_of_features_changed_outliers_filtered_nd1 ) + # calc overall mean and variance for each column + add_mean_and_variance(rows, len(case_studies)) + + df = pd.DataFrame( + round_rows(rows, 2), + columns=[ + "Projects", + "Avg Num Ftrs Chngd", + "Only ND1", + "Lrg Cmmts Fltrd", + "Only ND1 + Lrg Cmmts Fltrd", + ], + ) + + kwargs: tp.Dict[str, tp.Any] = {} + projects_separated_by_comma = ",".join([ + case_study.project_name for case_study in case_studies + ]) + if table_format.is_latex(): + kwargs[ + "caption" + ] = f"Evaluation of structural CFIs for projects {projects_separated_by_comma}. " + kwargs["position"] = "t" + + return dataframe_to_table( + df, + table_format, + wrap_table=wrap_table, + wrap_landscape=True, + **kwargs + ) + + +class SFBRCommitAvgEvalTableGenerator( + TableGenerator, + generator_name="sfbr-commit-avg-eval-table", + options=[REQUIRE_MULTI_CASE_STUDY], +): + + def generate(self) -> tp.List[Table]: + case_studies: tp.List[CaseStudy] = self.table_kwargs.pop("case_study") + return [ + SFBRCommitAvgEvalTable( + self.table_config, + case_studies=case_studies, + **self.table_kwargs + ) + ] + + +class SFBRCommitFracEvalTable(Table, table_name="sfbr_commit_frac_eval_table"): + + def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str: + case_studies: tp.List[CaseStudy] = self.table_kwargs["case_studies"] + + projects_data_commits = [ + get_structural_commit_data_for_case_study(case_study) + for case_study in case_studies + ] + print(projects_data_commits[0]) + rows = [[case_study.project_name] for case_study in case_studies] + [ + ["Mean"], + ["Variance"], + ] + + for data_commits, current_row in zip( + projects_data_commits, + range(0, len(case_studies)), + ): + data_commits_num_interacting_features = data_commits["num_interacting_features"] + # filter large commits + data_commits_num_interacting_features_outliers_filtered = ( + apply_tukeys_fence(data_commits, "commit_size", + 1.5)["num_interacting_features"] + ) + fraction_commits_changing_more_than_one_feature = sum([ sum(data_commits_num_interacting_features[index]) > 1 for index in data_commits_num_interacting_features.index @@ -234,14 +310,10 @@ def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str: round_rows(rows, 2), columns=[ "Projects", - "Avg Num Ftrs Chngd", - "Avg Num Ftrs Chngd ND1", - "Avg Num Ftrs Chngd (Lrg Cmmts Fltrd)", - "Avg Num Ftrs Chngd (Lrg Cmmts Fltrd) ND1", - "Frctn Cmmts Chngng >1 Ftr", - "Frctn Cmmts Chngng >1 Ftr ND1", - "Frctn Cmmts Chngng >1 Ftr (Lrg Cmmts Fltrd)", - "Frctn Cmmts Chngng >1 Ftr (Lrg Cmmts Fltrd) ND1", + "Frac Cmmts Interacting with >1 Feature", + "Only ND1", + "Lrg Cmmts Fltrd", + "Only ND1 + Lrg Cmmts Fltrd", ], ) @@ -264,16 +336,16 @@ def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str: ) -class SFBRCommitEvalTableGenerator( +class SFBRCommitFracEvalTableGenerator( TableGenerator, - generator_name="sfbr-commit-eval-table", + generator_name="sfbr-commit-frac-eval-table", options=[REQUIRE_MULTI_CASE_STUDY], ): def generate(self) -> tp.List[Table]: case_studies: tp.List[CaseStudy] = self.table_kwargs.pop("case_study") return [ - SFBRCommitEvalTable( + SFBRCommitFracEvalTable( self.table_config, case_studies=case_studies, **self.table_kwargs