Skip to content

Commit

Permalink
improve structural feature data generation
Browse files Browse the repository at this point in the history
specify nesting degree of an interaction
  • Loading branch information
Simon Rüdiger Steuer committed Sep 28, 2023
1 parent d85ed12 commit 5382403
Showing 1 changed file with 111 additions and 83 deletions.
194 changes: 111 additions & 83 deletions varats/varats/data/reports/feature_blame_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,7 @@ class StructuralCommitFeatureInteraction:
occurs in."""

def __init__(
self, num_instructions: int, features: tp.List[str],
commit: CommitRepoPair
self, num_instructions: int, features: tp.List[str], commit: CommitRepoPair
) -> None:
self.__num_instructions = num_instructions
self.__features = features
Expand All @@ -47,9 +46,7 @@ def create_commit_feature_interaction(
(raw_inst_entry["commit-repo-pair"])["commit"],
(raw_inst_entry["commit-repo-pair"])["repository"],
)
return StructuralCommitFeatureInteraction(
num_instructions, features, commit
)
return StructuralCommitFeatureInteraction(num_instructions, features, commit)

@property
def num_instructions(self) -> int:
Expand All @@ -71,9 +68,7 @@ class FeatureBlameReportMetaData(FeatureAnalysisReportMetaData):
pass


class StructuralFeatureBlameReport(
BaseReport, shorthand="SFBR", file_type="yaml"
):
class StructuralFeatureBlameReport(BaseReport, shorthand="SFBR", file_type="yaml"):
"""Data class that gives access to a loaded structural feature blame
report."""

Expand All @@ -87,17 +82,20 @@ def __init__(self, path: Path) -> None:
version_header.raise_if_version_is_less_than(1)

self.__meta_data = (
FeatureBlameReportMetaData.
create_feature_analysis_report_meta_data(next(documents))
FeatureBlameReportMetaData.create_feature_analysis_report_meta_data(
next(documents)
)
)

raw_feature_blame_report = next(documents)

self.__commit_feature_interactions = [
StructuralCommitFeatureInteraction.
create_commit_feature_interaction(cfi)
StructuralCommitFeatureInteraction.create_commit_feature_interaction(
cfi
)
for cfi in raw_feature_blame_report[
"structural-commit-feature-interactions"]
"structural-commit-feature-interactions"
]
]

@property
Expand All @@ -114,34 +112,55 @@ def commit_feature_interactions(
return self.__commit_feature_interactions


def generate_feature_scfi_data(SFBR: StructuralFeatureBlameReport) -> pd.DataFrame:
    """Aggregate the structural commit-feature interactions per feature.

    Every interaction of the report contributes to each feature it lists.
    The "nesting degree" of an interaction is the number of features it
    lists; a degree of 1 means the interaction belongs to that feature only.

    Args:
        SFBR: the loaded structural feature blame report

    Returns:
        A data frame with one row per feature and the columns

        - ``feature``: the feature name
        - ``num_interacting_commits``: number of distinct commits that
          interact with the feature at any nesting degree
        - ``num_interacting_commits_nd1``: number of distinct commits with at
          least one interaction of nesting degree 1
        - ``num_interacting_commits_nd>1``: number of distinct commits that
          only interact at a nesting degree other than 1
        - ``def_feature_size``: summed ``num_instructions`` of the
          interactions with nesting degree 1
        - ``pot_feature_size``: summed ``num_instructions`` of all
          interactions of the feature
        - ``feature_size``: same value as ``pot_feature_size``; this is the
          quantity the column of that name held before the size was split
          into ``def``/``pot``, kept so that existing consumers selecting
          ``"feature_size"`` keep working
    """
    # Insertion-ordered, so the rows come out in first-seen feature order.
    stats_by_feature: tp.Dict[str, tp.Dict[str, tp.Any]] = {}

    for SCFI in SFBR.commit_feature_interactions:
        commit_hash = ShortCommitHash(SCFI.commit.commit_hash).hash
        num_instructions = SCFI.num_instructions
        nesting_degree = len(SCFI.features)

        for feature in SCFI.features:
            stats = stats_by_feature.get(feature)
            if stats is None:
                stats = {
                    "commits": set(),
                    "commits_nd1": set(),
                    "commits_nd_gt1": set(),
                    "def_size": 0,
                    "pot_size": 0,
                }
                stats_by_feature[feature] = stats

            stats["commits"].add(commit_hash)
            stats["pot_size"] += num_instructions

            if nesting_degree == 1:
                stats["commits_nd1"].add(commit_hash)
                # A commit seen earlier only in nested form is now known to
                # touch the feature directly; keep the two sets disjoint.
                stats["commits_nd_gt1"].discard(commit_hash)
                stats["def_size"] += num_instructions
            elif commit_hash not in stats["commits_nd1"]:
                stats["commits_nd_gt1"].add(commit_hash)

    rows = [
        [
            feature,
            len(stats["commits"]),
            len(stats["commits_nd1"]),
            len(stats["commits_nd_gt1"]),
            stats["def_size"],
            stats["pot_size"],
            # Legacy alias, see docstring.
            stats["pot_size"],
        ]
        for feature, stats in stats_by_feature.items()
    ]
    return pd.DataFrame(
        rows,
        columns=[
            "feature",
            "num_interacting_commits",
            "num_interacting_commits_nd1",
            "num_interacting_commits_nd>1",
            "def_feature_size",
            "pot_feature_size",
            "feature_size",
        ],
    )


def generate_feature_author_scfi_data(
SFBR: StructuralFeatureBlameReport, project_gits: tp.Dict[str,
pygit2.Repository]
SFBR: StructuralFeatureBlameReport, project_gits: tp.Dict[str, pygit2.Repository]
) -> pd.DataFrame:
# {feature: (authors, size)}
features_cfi_author_data: tp.Dict[str, tp.Tuple(tp.Set[str], int)] = {}
Expand All @@ -154,17 +173,18 @@ def generate_feature_author_scfi_data(
for feature in SCFI.features:
entry = features_cfi_author_data.get(feature)
if not entry:
features_cfi_author_data.update({
feature: (set([author]), SCFI.num_instructions)
})
features_cfi_author_data.update(
{feature: (set([author]), SCFI.num_instructions)}
)
else:
entry[0].add(author)
features_cfi_author_data.update({
feature: (entry[0], entry[1] + SCFI.num_instructions)
})
rows = [[feature_data[0],
len(feature_data[1][0]), feature_data[1][1]]
for feature_data in features_cfi_author_data.items()]
features_cfi_author_data.update(
{feature: (entry[0], entry[1] + SCFI.num_instructions)}
)
rows = [
[feature_data[0], len(feature_data[1][0]), feature_data[1][1]]
for feature_data in features_cfi_author_data.items()
]
return pd.DataFrame(
rows, columns=["feature", "num_implementing_authors", "feature_size"]
)
Expand Down Expand Up @@ -258,9 +278,7 @@ def commits(self) -> tp.List[CommitRepoPair]:
return self.__commits


class DataflowFeatureBlameReport(
BaseReport, shorthand="DFBR", file_type="yaml"
):
class DataflowFeatureBlameReport(BaseReport, shorthand="DFBR", file_type="yaml"):
"""Data class that gives access to a loaded dataflow feature blame
report."""

Expand All @@ -274,16 +292,18 @@ def __init__(self, path: Path) -> None:
version_header.raise_if_version_is_less_than(1)

self.__meta_data = (
FeatureBlameReportMetaData.
create_feature_analysis_report_meta_data(next(documents))
FeatureBlameReportMetaData.create_feature_analysis_report_meta_data(
next(documents)
)
)

raw_feature_blame_report = next(documents)

self.__commit_feature_interactions = [
DataflowCommitFeatureInteraction.
create_commit_feature_interaction(cfi) for cfi in
raw_feature_blame_report["dataflow-commit-feature-interactions"]
DataflowCommitFeatureInteraction.create_commit_feature_interaction(cfi)
for cfi in raw_feature_blame_report[
"dataflow-commit-feature-interactions"
]
]

@property
Expand All @@ -293,9 +313,7 @@ def meta_data(self) -> FeatureAnalysisReportMetaData:
return self.__meta_data

@property
def commit_feature_interactions(
self
) -> tp.List[DataflowCommitFeatureInteraction]:
def commit_feature_interactions(self) -> tp.List[DataflowCommitFeatureInteraction]:
"""Return all dataflow-based cfis."""
return self.__commit_feature_interactions

Expand All @@ -320,10 +338,10 @@ def get_commits_dataflow_interacting_features(
DFBR: DataflowFeatureBlameReport,
) -> tp.Dict[str, tp.Tuple[tp.Set[str], tp.Set[str], tp.Set[str]]]:
# [hash, ([all_interacting_features], [inside_df], [outside_df])]
dfi_commit: tp.Dict[str, tp.Tuple[tp.Set[str], tp.Set[str],
tp.Set[str]]] = {}
dfi_commit: tp.Dict[str, tp.Tuple[tp.Set[str], tp.Set[str], tp.Set[str]]] = {}
commits_structurally_interacting_features: tp.Dict[
str, tp.Set[str]] = get_commits_structurally_interacting_features(SFBR)
str, tp.Set[str]
] = get_commits_structurally_interacting_features(SFBR)

for DCFI in DFBR.commit_feature_interactions:
feature = DCFI.feature
Expand Down Expand Up @@ -351,11 +369,13 @@ def get_features_dataflow_affecting_commits(
SFBR: StructuralFeatureBlameReport, DFBR: DataflowFeatureBlameReport
) -> tp.Dict[str, tp.Tuple[tp.Set[CommitRepoPair], tp.Set[CommitRepoPair]]]:
# {feature, ([interacting_commits_outside], [interacting_commits_inside])}
dci_feature: tp.Dict[str, tp.Tuple[tp.Set[CommitRepoPair],
tp.Set[CommitRepoPair]]] = {}
dci_feature: tp.Dict[
str, tp.Tuple[tp.Set[CommitRepoPair], tp.Set[CommitRepoPair]]
] = {}

commits_structurally_interacting_with_features: tp.Dict[
str, tp.Set[str]] = get_commits_structurally_interacting_features(SFBR)
str, tp.Set[str]
] = get_commits_structurally_interacting_features(SFBR)

for DCFI in DFBR.commit_feature_interactions:
feature = DCFI.feature
Expand Down Expand Up @@ -386,12 +406,15 @@ def generate_commit_specific_dcfi_data(
# [hash, ([all_interacting_features], [inside_df], [outside_df])]
dfi_commit = get_commits_dataflow_interacting_features(SFBR, DFBR)

rows_commit_dfi = [[
commit_data[0],
len(commit_data[1][0]),
len(commit_data[1][1]),
len(commit_data[1][2]),
] for commit_data in dfi_commit.items()]
rows_commit_dfi = [
[
commit_data[0],
len(commit_data[1][0]),
len(commit_data[1][1]),
len(commit_data[1][2]),
]
for commit_data in dfi_commit.items()
]
counter = 0
for _ in range(0, num_commits - len(dfi_commit)):
rows_commit_dfi.append([f"fake_hash{counter}", 0, 0, 0])
Expand All @@ -413,7 +436,8 @@ def generate_general_commit_dcfi_data(
) -> pd.DataFrame:
row = []
commits_structurally_interacting_features: tp.Dict[
str, tp.Set[str]] = get_commits_structurally_interacting_features(SFBR)
str, tp.Set[str]
] = get_commits_structurally_interacting_features(SFBR)
num_structurally_interacting_commits = len(
commits_structurally_interacting_features.values()
)
Expand All @@ -431,8 +455,8 @@ def generate_general_commit_dcfi_data(
interacting_structurally_and_through_dataflow += 1

row.append(
interacting_structurally_and_through_dataflow /
len(SFBR.commit_feature_interactions)
interacting_structurally_and_through_dataflow
/ len(SFBR.commit_feature_interactions)
)

columns = [
Expand All @@ -450,13 +474,17 @@ def generate_feature_dcfi_data(

feature_scfi_data = generate_feature_scfi_data(SFBR)

rows_feature_dci = [[
feature_data[0],
feature_scfi_data.loc[feature_scfi_data["feature"] == feature_data[0]]
["feature_size"].to_numpy()[0],
len(feature_data[1][0]),
len(feature_data[1][1]),
] for feature_data in dci_feature.items()]
rows_feature_dci = [
[
feature_data[0],
feature_scfi_data.loc[feature_scfi_data["feature"] == feature_data[0]][
"feature_size"
].to_numpy()[0],
len(feature_data[1][0]),
len(feature_data[1][1]),
]
for feature_data in dci_feature.items()
]

columns = [
"feature",
Expand All @@ -473,12 +501,10 @@ def generate_feature_author_dcfi_data(
project_gits: tp.Dict[str, pygit2.Repository],
) -> pd.DataFrame:
dci_feature = get_features_dataflow_affecting_commits(SFBR, DFBR)

# {feature, ([interacting_authors_outside], [interacting_authors_inside])}
rows_feature_author_dci = []

feature_scfi_data = generate_feature_scfi_data(SFBR)

for feature_data in dci_feature.items():
feature = feature_data[0]
interacting_commits_outside = feature_data[1][0]
Expand All @@ -501,14 +527,16 @@ def generate_feature_author_dcfi_data(
continue
interacting_authors_inside.add(author)

rows_feature_author_dci.append([
feature,
feature_scfi_data.loc[feature_scfi_data["feature"] ==
feature_data[0]]["feature_size"].to_numpy()
[0],
len(interacting_authors_outside),
len(interacting_authors_inside),
])
rows_feature_author_dci.append(
[
feature,
feature_scfi_data.loc[feature_scfi_data["feature"] == feature][
"feature_size"
].to_numpy()[0],
len(interacting_authors_outside),
len(interacting_authors_inside),
]
)

columns = [
"feature",
Expand Down

0 comments on commit 5382403

Please sign in to comment.