From 64a94863c9e70ac8c75e443bc15cd7facbf2111d Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Thu, 29 Nov 2018 17:51:07 +0100 Subject: [PATCH 01/31] Remove get.commits.raw function from util-data.R The get.commits.raw function was removed. Instead, the function get.commits should be used from now on. Signed-off-by: Jakob Kronawitter --- util-data.R | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/util-data.R b/util-data.R index 80cd16eb..951f9a81 100644 --- a/util-data.R +++ b/util-data.R @@ -446,16 +446,6 @@ ProjectData = R6::R6Class("ProjectData", return(private$commits) }, - #' Get the complete list of commits. - #' If it does not already exist, call the read method first. - #' - #' Note: This is just a delegate for \code{ProjectData$get.commits()}. - #' - #' @return the list of commits - get.commits.raw = function() { - return(self$get.commits()) - }, - #' Set the commit list of the project to a new one. #' Add PaStA data if configured in the field \code{project.conf}. #' From 894c9a5c181fef14dcb71fa23699bebbcbcd2b4f Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Sun, 2 Dec 2018 17:28:30 +0100 Subject: [PATCH 02/31] Move artifact kind filtering functionality into the get.commits method The artifact kind filtering which filters the commits.list file and only keeps the commits which have the correct artifact.type (configured in the ProjectConf class) has been moved to the get.commits method of the ProjectData class. Previously this functionality was in the get.commits.filtered method. 
Signed-off-by: Jakob Kronawitter --- util-data.R | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/util-data.R b/util-data.R index 951f9a81..bb3078c3 100644 --- a/util-data.R +++ b/util-data.R @@ -146,10 +146,6 @@ ProjectData = R6::R6Class("ProjectData", return(private$commits.filtered) } - ## only process commits with the artifact listed in the configuration or missing - commit.data = subset(commit.data, artifact.type %in% - c(private$project.conf$get.value("artifact.codeface"), "")) - ## filter out the base artifacts (i.e., Base_Feature, File_Level) if (private$project.conf$get.value("artifact.filter.base")) { commit.data = subset(commit.data, !(artifact %in% BASE.ARTIFACTS)) @@ -404,14 +400,14 @@ ProjectData = R6::R6Class("ProjectData", return(private$commits.filtered.empty) }, - #' Get the list of commits filtered by the artifact kind configured in the field \code{project.conf}. + #' Get the list of commits returned by the get.commits method and apply additional filters on them. #' If configured in \code{project.conf}, get the list of commits without the base artifact. #' In addition, if configured in \code{project.conf}, append the synchronicity data and PaStA data #' to the filtered commit data. #' If the list of filtered commits does not already exist, call the filter method. #' - #' @return the commit list containing only commit data related to the configured artifact and, - #' if configured, without the base artifact + #' @return the commit list returned by get.commits with configured filters applied and optionally added PaSta or + #' synchronicity data get.commits.filtered = function() { logging::loginfo("Getting commit data filtered by artifact.base.") @@ -423,23 +419,28 @@ ProjectData = R6::R6Class("ProjectData", return(private$commits.filtered) }, - #' Get the complete list of commits. + #' Get the complete list of commits filtered by the artifact kind which was configured in the + #' \code{project.conf}. 
#' If configured in the field \code{project.conf}, append the PaStA data to the commit data #' by calling the setter function. #' If the list of commits does not already exist, call the read method first. #' #' @return the list of commits get.commits = function() { - logging::loginfo("Getting raw commit data.") + logging::loginfo("Getting commit data.") ## if commits are not read already, do this if (is.null(private$commits)) { - commits.read = read.commits( + commit.data = read.commits( self$get.data.path(), private$project.conf$get.value("artifact") ) - self$set.commits(data = commits.read) + ## only process commits with the artifact listed in the configuration or missing + commit.data = subset(commit.data, artifact.type %in% + c(private$project.conf$get.value("artifact.codeface"), "")) + + self$set.commits(data = commit.data) } private$extract.timestamps(source = "commits") From e74e15d64f152fd29eab0a3bd157b9e243891435 Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Tue, 4 Dec 2018 12:02:15 +0100 Subject: [PATCH 03/31] Adjust read.commits to return a valid data.frame instead of an empty one In the case of a valid commits.list file with at least one commit line the read.commits function returns a data.frame with 16 columns containing all the commits read from the file. If the commits.list file is empty, however, it previously returned an empty data.frame with no columns. This has now been adjusted to return an empty data.frame with all the columns (16 columns), which should save a lot of additional if-else case distinctions later on because now the shape of the returned data.frame by the read.commits function is always the same. 
Signed-off-by: Jakob Kronawitter --- util-read.R | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/util-read.R b/util-read.R index f9ba84dc..d326c3f1 100644 --- a/util-read.R +++ b/util-read.R @@ -47,13 +47,6 @@ read.commits = function(data.path, artifact) { commit.data = try(read.table(file, header = FALSE, sep = ";", strip.white = TRUE, encoding = "UTF-8"), silent = TRUE) - ## handle the case that the list of commits is empty - if (inherits(commit.data, "try-error")) { - logging::logwarn("There are no commits available for the current environment.") - logging::logwarn("Datapath: %s", data.path) - return(data.frame()) - } - ## set proper column names based on Codeface extraction: ## ## SELECT c.id, c.authorDate, a.name, a.email1, c.commitDate, @@ -61,12 +54,23 @@ read.commits = function(data.path, artifact) { ## c.ChangedFiles, c.AddedLines, c.DeletedLines, c.DiffSize, ## cd.file, cd.entityId, cd.entityType, cd.size commit.data.columns = c( - "commit.id", # id - "date", "author.name", "author.email", # author information - "committer.date", "committer.name", "committer.email", # committer information - "hash", "changed.files", "added.lines", "deleted.lines", "diff.size", # commit information - "file", "artifact", "artifact.type", "artifact.diff.size" ## commit-dependency information + "commit.id", # id + "date", "author.name", "author.email", # author information + "committer.date", "committer.name", "committer.email", # committer information + "hash", "changed.files", "added.lines", "deleted.lines", "diff.size", # commit information + "file", "artifact", "artifact.type", "artifact.diff.size" ## commit-dependency information ) + + ## handle the case that the list of commits is empty + if (inherits(commit.data, "try-error")) { + logging::logwarn("There are no commits available for the current environment.") + logging::logwarn("Datapath: %s", data.path) + + # return a dataframe with the correct columns but zero rows + 
commit.data = data.frame(matrix(nrow = 0, ncol = length(commit.data.columns))) + colnames(commit.data) = commit.data.columns + return(commit.data) + } colnames(commit.data) = commit.data.columns ## remove duplicated lines (even if they contain different commit ids but the same commit hash) From 11428d9847fd44f982cd094a3248bd13fb6b7b58 Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Thu, 6 Dec 2018 18:58:06 +0100 Subject: [PATCH 04/31] Restructure get.commits and get.commits.filtered(.empty) methods This major commit merges the two old methods get.commits.filtered and get.commits.filtered.empty of the ProjectData class into one new method again called get.commits.filtered. Similarly, the filter.commits.empty and filter.commits methods were merged into one new filter.commits method which now takes filter.untracked.files and artifact.filter.base as parameters which then control how the filtering is performed. The filter.untracked.files parameter was added to the ProjectConf which now controls - just like the artifact.filter.base parameter - which commits should be filtered out when calling the get.commits.filtered method. If you want to call the get.commits.filtered with other parameters (not the ones that are configured in the ProjectConf) then one can call the get.commits.filtered.uncached version of this method. As the name implies, this method is not taking advantage of caching and should thus not be used too often. In the course of revamping these methods it only took a minor effort to rename the empty artifact to a more descriptive identifier, namely, "untracked files". Thus, this renaming was also performed in this commit. 
Signed-off-by: Jakob Kronawitter --- showcase.R | 2 - util-conf.R | 6 ++ util-data.R | 199 +++++++++++++------------------------ util-networks-covariates.R | 4 +- util-read.R | 4 + 5 files changed, 80 insertions(+), 135 deletions(-) diff --git a/showcase.R b/showcase.R index 2222913a..fa17fd16 100644 --- a/showcase.R +++ b/showcase.R @@ -85,7 +85,6 @@ x = NetworkBuilder$new(project.data = x.data, network.conf = net.conf) # x.data$get.synchronicity() # x.data$group.artifacts.by.data.column("commits", "author.name") # x.data$get.commits.filtered() -# x.data$get.commits.filtered.empty() # x.data$get.mails() # x.data$get.authors() # x.data$get.data.path() @@ -126,7 +125,6 @@ y = NetworkBuilder$new(project.data = y.data, network.conf = net.conf) # y.data$get.synchronicity() # y.data$group.artifacts.by.data.column("commits", "author.name") # y.data$get.commits.filtered() -# y.data$get.commits.filtered.empty() # y.data$get.mails() # y.data$get.authors() # y.data$get.data.path() diff --git a/util-conf.R b/util-conf.R index 38a7b66b..39dbe445 100644 --- a/util-conf.R +++ b/util-conf.R @@ -338,6 +338,12 @@ ProjectConf = R6::R6Class("ProjectConf", inherit = Conf, allowed = c(TRUE, FALSE), allowed.number = 1 ), + filter.untracked.files = list( + default = TRUE, + type = "logical", + allowed = c(TRUE, FALSE), + allowed.number = 1 + ), synchronicity = list( default = FALSE, type = "logical", diff --git a/util-data.R b/util-data.R index bb3078c3..c53a262a 100644 --- a/util-data.R +++ b/util-data.R @@ -40,7 +40,7 @@ BASE.ARTIFACTS = c( ## mapping of data source to artifact column ## (for commits: filter also empty, non-configured, and (potentially) base artifacts) DATASOURCE.TO.ARTIFACT.FUNCTION = list( - "commits" = "get.commits.filtered.empty", + "commits" = "get.commits.filtered", "mails" = "get.mails", "issues" = "get.issues" ) @@ -70,7 +70,6 @@ ProjectData = R6::R6Class("ProjectData", ## commits and commit data commits.filtered = NULL, # data.frame - commits.filtered.empty = 
NULL, #data.frame commits = NULL, # data.frame synchronicity = NULL, # data.frame pasta = NULL, # data.frame @@ -85,88 +84,31 @@ ProjectData = R6::R6Class("ProjectData", ## * * filtering commits ------------------------------------------- - #' Filter commits with empty artifacts from the already filtered commit list and - #' save the new list to 'commits.filtered.empty'. + #' Filter commits retrieved by the \code{get.commits} method by removing untracked files and removing the base + #' artifact (see parameters). #' - #' @seealso \code{get.commits.filtered} - filter.commits.empty = function() { - - logging::logdebug("filter.commits.empty: starting.") - - ## do not compute anything more than once - if (!is.null(private$commits.filtered.empty)) { - logging::logdebug("filter.commits.empty: finished. (already existing)") - return(private$commits.filtered.empty) - } - - ## get raw commit data - commit.data = self$get.commits.filtered() - - ## break if the list of commits is empty - if (nrow(commit.data) == 0) { - logging::logwarn("There are no commits available for the current environment.") - logging::logwarn("Class: %s", self$get.class.name()) - # logging::logwarn("Configuration: %s", private$project.conf$get.conf.as.string()) - private$commits.filtered.empty = data.frame() - return(private$commits.filtered.empty) - } - - ## only process commits with non-empty artifact - commit.data = subset(commit.data, artifact != "") - - ## store the commit data - private$commits.filtered.empty = commit.data - logging::logdebug("filter.commits.empty: finished.") - }, - - #' Filter the data from the commit list which does not belong to the artifact listed in the field - #' \code{project.conf}. - #' If configured in \code{project.conf}, filter the commits from the commit list that touch the base artifact. - #' Add synchronicity and PaStA data if configured in \code{project.conf}. - #' Finally, save the new list to the field \code{commits.filtered}. 
- filter.commits = function() { - + #' @param remove.untracked.files configures if untracked files should be kept or removed + #' @param remove.base.artifact configures if the base artifact should be kept or removed + #' + #' @return the commits retrieved by the \code{get.commits} method after all filters have been applied + filter.commits = function(remove.untracked.files, remove.base.artifact) { logging::logdebug("filter.commits: starting.") - ## do not compute anything more than once - if (!is.null(private$commits.filtered)) { - logging::logdebug("filter.commits: finished. (already existing)") - return(private$commits.filtered) - } - - ## get raw commit data + ## get commit data commit.data = self$get.commits() - ## break if the list of commits is empty - if (nrow(commit.data) == 0) { - logging::logwarn("There are no commits available for the current environment.") - logging::logwarn("Class: %s", self$get.class.name()) - # logging::logwarn("Configuration: %s", private$project.conf$get.conf.as.string()) - private$commits.filtered = data.frame() - return(private$commits.filtered) + ## filter out the untracked files + if (remove.untracked.files) { + commit.data = subset(commit.data, artifact != "") } ## filter out the base artifacts (i.e., Base_Feature, File_Level) - if (private$project.conf$get.value("artifact.filter.base")) { + if (remove.base.artifact) { commit.data = subset(commit.data, !(artifact %in% BASE.ARTIFACTS)) } - ## append synchronicity data if wanted - if (private$project.conf$get.value("synchronicity")) { - synchronicity.data = self$get.synchronicity() - commit.data = merge(commit.data, synchronicity.data, - by = "hash", all.x = TRUE, sort = FALSE) - } - - ## add PaStA data if wanted - if (private$project.conf$get.value("pasta")) { - self$get.pasta() - commit.data = private$add.pasta.data(commit.data) - } - - ## store the commit data - private$commits.filtered = commit.data logging::logdebug("filter.commits: finished.") + return(commit.data) }, ## * 
* PaStA data -------------------------------------------------- @@ -283,7 +225,6 @@ ProjectData = R6::R6Class("ProjectData", #' changed. reset.environment = function() { private$commits.filtered = NULL - private$commits.filtered.empty = NULL private$commits = NULL private$synchronicity = NULL private$mails = NULL @@ -380,50 +321,43 @@ ProjectData = R6::R6Class("ProjectData", ## * * raw data ---------------------------------------------------- - #' Get the list of commits without empty artifacts and filtered by the artifact kind configured - #' in the field \code{project.conf}. - #' If configured in \code{project.conf}, get the list of commits without the base artifact. - #' In addition, if configured in \code{project.conf}, append the synchronicity data and PaStA data - #' to the filtered commit data. - #' If the list of filtered commits does not already exist, call the filter method. + #' Return the commits retrieved by the \code{get.commits} method by removing untracked files and removing the + #' base artifact (if configured in the \code{project.conf}, see parameters \code{filter.untracked.files} and + #' \code{artifact.filter.base}). This method uses caching. #' - #' @return the commit list without empty artifacts and containing only commit data related to the - #' configured artifact and, if configured, without the base artifact - get.commits.filtered.empty = function() { - logging::loginfo("Getting commit data filtered by artifact.base and artifact.empty.") - - ## if commits are not read already, do this - if (is.null(private$commits.filtered.empty)) { - private$filter.commits.empty() - } - - return(private$commits.filtered.empty) - }, - - #' Get the list of commits returned by the get.commits method and apply additional filters on them. - #' If configured in \code{project.conf}, get the list of commits without the base artifact. 
- #' In addition, if configured in \code{project.conf}, append the synchronicity data and PaStA data - #' to the filtered commit data. - #' If the list of filtered commits does not already exist, call the filter method. + #' @param remove.untracked.files configures if untracked files should be kept or removed + #' @param remove.base.artifact configures if the base artifact should be kept or removed + #' + #' @return the commits retrieved by the \code{get.commits} method after all filters have been applied #' - #' @return the commit list returned by get.commits with configured filters applied and optionally added PaSta or - #' synchronicity data + #' @seealso get.commits.filtered.uncached get.commits.filtered = function() { - logging::loginfo("Getting commit data filtered by artifact.base.") - - ## if commits are not read already, do this if (is.null(private$commits.filtered)) { - private$filter.commits() + private$commits.filtered = private$filter.commits( + private$project.conf$get.value("filter.untracked.files"), + private$project.conf$get.value("artifact.filter.base") + ) } - return(private$commits.filtered) }, - #' Get the complete list of commits filtered by the artifact kind which was configured in the - #' \code{project.conf}. - #' If configured in the field \code{project.conf}, append the PaStA data to the commit data - #' by calling the setter function. - #' If the list of commits does not already exist, call the read method first. + #' Return the commits retrieved by the \code{get.commits} method by removing untracked files and removing the + #' base artifact (see parameters). This method doesn't use caching. If you want to use caching, please use the + #' \code{get.commits.filtered} method instead. 
+ #' + #' @param remove.untracked.files configures if untracked files should be kept or removed + #' @param remove.base.artifact configures if the base artifact should be kept or removed + #' + #' @return the commits retrieved by the \code{get.commits} method after all filters have been applied + #' + #' @seealso get.commits.filtered + get.commits.filtered.uncached = function(remove.untracked.files, remove.base.artifact) { + return (private$filter.commits(remove.untracked.files, remove.base.artifact)) + }, + + #' Get the list of commits which have the artifact kind configured in the \code{project.conf}. + #' If the list of commits is not cached, call the read method first. #' + #' If configured in the field \code{project.conf}, add PaStA and synchronicity data. #' #' @return the list of commits get.commits = function() { @@ -431,16 +365,14 @@ ProjectData = R6::R6Class("ProjectData", ## if commits are not read already, do this if (is.null(private$commits)) { - commit.data = read.commits( - self$get.data.path(), - private$project.conf$get.value("artifact") - ) + commit.data = read.commits(self$get.data.path(), private$project.conf$get.value("artifact")) ## only process commits with the artifact listed in the configuration or missing commit.data = subset(commit.data, artifact.type %in% - c(private$project.conf$get.value("artifact.codeface"), "")) + c(private$project.conf$get.value("artifact.codeface"), "")) - self$set.commits(data = commit.data) + ## saves the commit.data to the commits cache field after PaStA and synchronicity data is added + self$set.commits(commit.data) } private$extract.timestamps(source = "commits") @@ -448,26 +380,32 @@ ProjectData = R6::R6Class("ProjectData", }, #' Set the commit list of the project to a new one. - #' Add PaStA data if configured in the field \code{project.conf}. + #' Add PaStA and sychronicity data if configured in the field \code{project.conf}. 
#' - #' @param data the new list of commits - set.commits = function(data) { - logging::loginfo("Setting raw commit data.") - if (is.null(data)) { - data = data.frame() - } - ## add PaStA data if wanted - if (private$project.conf$get.value("pasta")) { - logging::loginfo("Adding PaStA data.") - data = private$add.pasta.data(data = data) + #' @param commit.data the new list of commits + set.commits = function(commit.data) { + logging::loginfo("Setting commit data.") + + if (!is.null(commit.data)) { + + ## append synchronicity data if wanted + if (private$project.conf$get.value("synchronicity")) { + synchronicity.data = self$get.synchronicity() + commit.data = merge(commit.data, synchronicity.data, + by = "hash", all.x = TRUE, sort = FALSE) + } + + ## add PaStA data if wanted + if (private$project.conf$get.value("pasta")) { + self$get.pasta() + commit.data = private$add.pasta.data(commit.data) + } } - private$commits = data + private$commits = commit.data - ## remove cached data for filtered commits as these need to be re-computed - ## after changing the data + ## remove cached data for filtered commits as these need to be re-computed after changing the data private$commits.filtered = NULL - private$commits.filtered.empty = NULL }, #' Set the commit list of the project to a new one. @@ -998,7 +936,6 @@ ProjectData = R6::R6Class("ProjectData", ## check given data source data.source = match.arg.or.default(data.source, several.ok = FALSE) - ## TODO use filtered commit data here (and not the filtered.empty version)? → try filtered! data.source.func = DATASOURCE.TO.ARTIFACT.FUNCTION[[data.source]] ## get the key-value mapping/list for the given parameters diff --git a/util-networks-covariates.R b/util-networks-covariates.R index edd196b1..c13776c3 100644 --- a/util-networks-covariates.R +++ b/util-networks-covariates.R @@ -672,7 +672,7 @@ add.vertex.attribute.artifact.change.count = function(list.of.networks, project. 
nets.with.attr = split.and.add.vertex.attribute( list.of.networks, project.data, name, aggregation.level, default.value, function(range, range.data, net) { - artifact.to.commit = get.key.to.value.from.df(range.data$get.commits.filtered.empty(), "artifact", "hash") + artifact.to.commit = get.key.to.value.from.df(range.data$get.commits.filtered(), "artifact", "hash") artifact.change.count = lapply(artifact.to.commit, function(x) { length(unique(x[["hash"]])) }) @@ -709,7 +709,7 @@ add.vertex.attribute.artifact.first.occurrence = function(list.of.networks, proj nets.with.attr = split.and.add.vertex.attribute( list.of.networks, project.data, name, aggregation.level, default.value, function(range, range.data, net) { - artifact.to.dates = get.key.to.value.from.df(range.data$get.commits.filtered.empty(), "artifact", "date") + artifact.to.dates = get.key.to.value.from.df(range.data$get.commits.filtered(), "artifact", "date") artifact.to.first = lapply(artifact.to.dates, function(a) { min(a[["date"]]) }) diff --git a/util-read.R b/util-read.R index d326c3f1..8daaf4a1 100644 --- a/util-read.R +++ b/util-read.R @@ -125,6 +125,10 @@ read.commits = function(data.path, artifact) { commit.data["artifact"] = artifacts.new } + ## Commits to files that are not tracked by Codeface have the empty string in the file column + ## To better indicate this, the column value is changed to 'untracked.file' + commit.data["file"] = ifelse(commit.data[["file"]] == "", "untracked.file", commit.data[["file"]]) + ## convert dates and sort by them commit.data[["date"]] = get.date.from.string(commit.data[["date"]]) commit.data[["committer.date"]] = get.date.from.string(commit.data[["committer.date"]]) From c26e582e4ad6bf1eaeb08202fc3e00394332a013 Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Thu, 6 Dec 2018 19:12:50 +0100 Subject: [PATCH 05/31] Delete set.commits.raw and read.commits.raw methods. 
Signed-off-by: Jakob Kronawitter --- tests/test-read.R | 2 +- util-data.R | 9 --------- util-read.R | 12 ------------ 3 files changed, 1 insertion(+), 22 deletions(-) diff --git a/tests/test-read.R b/tests/test-read.R index ee1a8d21..10b54d76 100644 --- a/tests/test-read.R +++ b/tests/test-read.R @@ -88,7 +88,7 @@ test_that("Read the raw commit data with the file artifact.", { proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, "file") ## read the actual data - commit.data.read = read.commits.raw(proj.conf$get.value("datapath"), proj.conf$get.value("artifact")) + commit.data.read = read.commits(proj.conf$get.value("datapath"), proj.conf$get.value("artifact")) ## build the expected data.frame commit.data.expected = data.frame(commit.id = sprintf("", c(32716, 32717, 32718, 32719, 32715)), diff --git a/util-data.R b/util-data.R index c53a262a..55161069 100644 --- a/util-data.R +++ b/util-data.R @@ -408,15 +408,6 @@ ProjectData = R6::R6Class("ProjectData", private$commits.filtered = NULL }, - #' Set the commit list of the project to a new one. - #' - #' Note: This is just a delegate for \code{ProjectData$set.commits(data)}. - #' - #' @param data the new list of commits - set.commits.raw = function(data) { - self$set.commits(data) - }, - #' Get the synchronicity data. #' If it does not already exist call the read method. #' diff --git a/util-read.R b/util-read.R index 8daaf4a1..93251e83 100644 --- a/util-read.R +++ b/util-read.R @@ -143,18 +143,6 @@ read.commits = function(data.path, artifact) { return(commit.data) } -#' Read the commits from the 'commits.list' file. -#' -#' @param data.path the path to the commit list -#' @param artifact the artifact whose commits are read -#' -#' Note: This is just a delegate for \code{read.commits(data.path, artifact)}. 
-#' -#' @return the read commits -read.commits.raw = function(data.path, artifact) { - return(read.commits(data.path = data.path, artifact = artifact)) -} - ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Synchronicity data ------------------------------------------------------ From 51617bb807d56437a320a4c7fc245fcb1bc06a54 Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Thu, 6 Dec 2018 20:09:52 +0100 Subject: [PATCH 06/31] Adjust two testcases to work with the new get.commits method The new get.commits method includes filtering by artifact kind. Two testcases depended on this and thus have now been adjusted accordingly. 10 test cases of the test-split.R are still not working. Signed-off-by: Jakob Kronawitter --- tests/test-data-cut.R | 36 ++++++++++++++++-------------------- tests/test-networks-cut.R | 36 ++++++++++++++++-------------------- 2 files changed, 32 insertions(+), 40 deletions(-) diff --git a/tests/test-data-cut.R b/tests/test-data-cut.R index 36939dad..1daab2bb 100644 --- a/tests/test-data-cut.R +++ b/tests/test-data-cut.R @@ -44,26 +44,22 @@ test_that("Cut commit and mail data to same date range.", { x.data = ProjectData$new(proj.conf) - commit.data.expected = data.frame(commit.id = sprintf("", c(32712, 32712, 32713, 32713)), - date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 15:58:59", "2016-07-12 16:00:45", - "2016-07-12 16:00:45")), - author.name = c("Björn", "Björn", "Olaf", "Olaf"), - author.email = c("bjoern@example.org", "bjoern@example.org", "olaf@example.org", - "olaf@example.org"), - committer.date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 15:58:59", "2016-07-20 10:00:44", - "2016-07-20 10:00:44")), - committer.name = c("Björn", "Björn", "Björn", "Björn"), - committer.email = c("bjoern@example.org", "bjoern@example.org", "bjoern@example.org", "bjoern@example.org"), - hash = c("72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", - 
"5a5ec9675e98187e1e92561e1888aa6f04faa338", "5a5ec9675e98187e1e92561e1888aa6f04faa338"), - changed.files = as.integer(c(1, 1, 1, 1)), - added.lines = as.integer(c(1, 1, 1, 1)), - deleted.lines = as.integer(c(1, 1, 0, 0)), - diff.size = as.integer(c(2, 2, 1, 1)), - file = c("test.c", "test.c", "test.c", "test.c"), - artifact = c("A", "defined(A)", "A", "defined(A)"), - artifact.type = c("Feature", "FeatureExpression", "Feature", "FeatureExpression"), - artifact.diff.size = as.integer(c(1, 1, 1, 1))) + commit.data.expected = data.frame(commit.id = sprintf("", c(32712, 32713)), + date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:45")), + author.name = c("Björn", "Olaf"), + author.email = c("bjoern@example.org", "olaf@example.org"), + committer.date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-20 10:00:44")), + committer.name = c("Björn", "Björn"), + committer.email = c("bjoern@example.org", "bjoern@example.org"), + hash = c("72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "5a5ec9675e98187e1e92561e1888aa6f04faa338"), + changed.files = as.integer(c(1, 1)), + added.lines = as.integer(c(1, 1)), + deleted.lines = as.integer(c(1, 0)), + diff.size = as.integer(c(2, 1)), + file = c("test.c", "test.c"), + artifact = c("A", "A"), + artifact.type = c("Feature", "Feature"), + artifact.diff.size = as.integer(c(1, 1))) mail.data.expected = data.frame(author.name = c("Thomas"), author.email = c("thomas@example.org"), diff --git a/tests/test-networks-cut.R b/tests/test-networks-cut.R index ac1eab9e..8244dc2f 100644 --- a/tests/test-networks-cut.R +++ b/tests/test-networks-cut.R @@ -44,26 +44,22 @@ test_that("Cut commit and mail data to same date range.", { x.data = ProjectData$new(proj.conf) x = NetworkBuilder$new(x.data, net.conf) - commit.data.expected = data.frame(commit.id = sprintf("", c(32712, 32712, 32713, 32713)), - date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 15:58:59", "2016-07-12 16:00:45", - "2016-07-12 16:00:45")), - 
author.name = c("Björn", "Björn", "Olaf", "Olaf"), - author.email = c("bjoern@example.org", "bjoern@example.org", "olaf@example.org", - "olaf@example.org"), - committer.date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 15:58:59", "2016-07-20 10:00:44", - "2016-07-20 10:00:44")), - committer.name = c("Björn", "Björn", "Björn", "Björn"), - committer.email = c("bjoern@example.org", "bjoern@example.org", "bjoern@example.org", "bjoern@example.org"), - hash = c("72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", - "5a5ec9675e98187e1e92561e1888aa6f04faa338", "5a5ec9675e98187e1e92561e1888aa6f04faa338"), - changed.files = as.integer(c(1, 1, 1, 1)), - added.lines = as.integer(c(1, 1, 1, 1)), - deleted.lines = as.integer(c(1, 1, 0, 0)), - diff.size = as.integer(c(2, 2, 1, 1)), - file = c("test.c", "test.c", "test.c", "test.c"), - artifact = c("A", "defined(A)", "A", "defined(A)"), - artifact.type = c("Feature", "FeatureExpression", "Feature", "FeatureExpression"), - artifact.diff.size = as.integer(c(1, 1, 1, 1))) + commit.data.expected = data.frame(commit.id = sprintf("", c(32712, 32713)), + date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:45")), + author.name = c("Björn", "Olaf"), + author.email = c("bjoern@example.org", "olaf@example.org"), + committer.date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-20 10:00:44")), + committer.name = c("Björn", "Björn"), + committer.email = c("bjoern@example.org", "bjoern@example.org"), + hash = c("72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "5a5ec9675e98187e1e92561e1888aa6f04faa338"), + changed.files = as.integer(c(1, 1)), + added.lines = as.integer(c(1, 1)), + deleted.lines = as.integer(c(1, 0)), + diff.size = as.integer(c(2, 1)), + file = c("test.c", "test.c"), + artifact = c("A", "A"), + artifact.type = c("Feature", "Feature"), + artifact.diff.size = as.integer(c(1, 1))) mail.data.expected = data.frame(author.name = c("Thomas"), author.email = 
c("thomas@example.org"), From 67a4fbe4f244b4b6047c2c2be7682d7f9085e9eb Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Fri, 7 Dec 2018 15:27:32 +0100 Subject: [PATCH 07/31] Adapt test cases to new changes and improve empty dataframe creation The test cases were adapted to two of the new changes in the network library. The first one is the fact that the get.commits method now removes either 'Feature' or 'FeatureExpression' commits. The second one was the change that there are no dummy data.frames anymore (with zero columns and rows). Instead there are empty data.frames when no data exists (with columns but zero rows). One mistake was made during creation of these. The empty data.frames that are created did not contain any data type information (all columns defaulted to the 'logical' data.type). If this is not wanted there now exists a new helper method which also takes care of data types. Signed-off-by: Jakob Kronawitter --- tests/test-split.R | 56 +++++++++++++++++++++++----------------------- util-misc.R | 52 ++++++++++++++++++++++++++++++++++++++++++ util-read.R | 6 +++-- 3 files changed, 84 insertions(+), 30 deletions(-) diff --git a/tests/test-split.R b/tests/test-split.R index 7f624c98..23967c2a 100644 --- a/tests/test-split.R +++ b/tests/test-split.R @@ -93,9 +93,9 @@ test_that("Split a data object time-based (split.basis == 'commits').", { ## check data for all ranges expected.data = list( commits = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$commits[1:4, ], - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data.frame(), - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commits[5:9, ] + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$commits[1:2, ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$commits[0, ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commits[3:6, ] ), mails = list( "2016-07-12 15:58:59-2016-07-12 16:01:59" = data.frame(), @@ -168,10 +168,10 @@ test_that("Split a data object time-based (split.basis == 
'mails').", { ## check data for all ranges expected.data = list( commits = list( - "2004-10-09 18:38:13-2007-10-10 12:38:13" = data.frame(), - "2007-10-10 12:38:13-2010-10-10 06:38:13" = data.frame(), - "2010-10-10 06:38:13-2013-10-10 00:38:13" = data.frame(), - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$commits[1:4, ] + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$commits[0, ], + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$commits[0, ], + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$commits[0, ], + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$commits[1:2, ] ), mails = list( "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$mails[rownames(data$mails) %in% 1:2, ], @@ -247,9 +247,9 @@ test_that("Split a data object time-based (split.basis == 'issues').", { ## check data for all ranges expected.data = list( commits = list( - "2013-04-21 23:52:09-2015-04-22 11:52:09" = data.frame(), + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$commits[0, ], "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$commits, - "2017-04-21 23:52:09-2017-05-23 12:32:40" = data.frame() + "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$commits[0, ] ), mails = list( "2013-04-21 23:52:09-2015-04-22 11:52:09" = data.frame(), @@ -496,9 +496,9 @@ test_that("Split a data object activity-based (activity.type = 'commits').", { ## check data for all ranges expected.data = list( commits = list( - "2016-07-12 15:58:59-2016-07-12 16:05:41" = data$commits[1:4, ], - "2016-07-12 16:05:41-2016-07-12 16:06:32" = data$commits[5:7, ], - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commits[8:9, ] + "2016-07-12 15:58:59-2016-07-12 16:05:41" = data$commits[1:2, ], + "2016-07-12 16:05:41-2016-07-12 16:06:32" = data$commits[3:4, ], + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commits[5:6, ] ), mails = list( "2016-07-12 15:58:59-2016-07-12 16:05:41" = data$mails[rownames(data$mails) %in% 16:17, ], @@ -591,8 +591,8 @@ test_that("Split a data object activity-based (activity.type = 
'commits').", { ## check data for all ranges expected.data = list( commits = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commits[1:6, ], - "2016-07-12 16:06:10-2016-07-12 16:06:33" = data$commits[7:9, ] + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commits[1:3, ], + "2016-07-12 16:06:10-2016-07-12 16:06:33" = data$commits[4:6, ] ), mails = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[rownames(data$mails) %in% 16:17, ], @@ -675,12 +675,12 @@ test_that("Split a data object activity-based (activity.type = 'mails').", { ## check data for all ranges expected.data = list( commits = list( - "2004-10-09 18:38:13-2010-07-12 11:05:35" = data.frame(), - "2010-07-12 11:05:35-2010-07-12 12:05:41" = data.frame(), - "2010-07-12 12:05:41-2010-07-12 12:05:44" = data.frame(), - "2010-07-12 12:05:44-2016-07-12 15:58:40" = data.frame(), - "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$commits[1:4, ], - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data.frame() + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$commits[0, ], + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$commits[0, ], + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$commits[0, ], + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$commits[0, ], + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$commits[1:2, ], + "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$commits[0, ] ), mails = list( "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$mails[rownames(data$mails) %in% 1:3, ], @@ -742,7 +742,7 @@ test_that("Split a data object activity-based (activity.type = 'mails').", { ## check data for all ranges expected.data = list( commits = list( - "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$commits[1:4, ] + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$commits[1:2, ] ), mails = list( "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$mails @@ -785,8 +785,8 @@ test_that("Split a data object activity-based (activity.type = 'mails').", { ## check data for all ranges expected.data = 
list( commits = list( - "2004-10-09 18:38:13-2010-07-12 12:05:43" = data.frame(), - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$commits[1:4, ] + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$commits[0, ], + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$commits[1:2, ] ), mails = list( "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$mails[rownames(data$mails) %in% 1:8, ], @@ -866,10 +866,10 @@ test_that("Split a data object activity-based (activity.type = 'issues').", { ## check data for all ranges expected.data = list( commits = list( - "2013-04-21 23:52:09-2016-07-12 16:05:47" = data$commits[1:6, ], - "2016-07-12 16:05:47-2016-08-31 18:21:48" = data$commits[7:9, ], - "2016-08-31 18:21:48-2017-02-20 22:25:41" = data.frame(), - "2017-02-20 22:25:41-2017-05-23 12:32:40" = data.frame() + "2013-04-21 23:52:09-2016-07-12 16:05:47" = data$commits[1:3, ], + "2016-07-12 16:05:47-2016-08-31 18:21:48" = data$commits[4:6, ], + "2016-08-31 18:21:48-2017-02-20 22:25:41" = data$commits[0, ], + "2017-02-20 22:25:41-2017-05-23 12:32:40" = data$commits[0, ] ), mails = list( "2013-04-21 23:52:09-2016-07-12 16:05:47" = data$mails[rownames(data$mails) %in% 14:17, ], @@ -967,7 +967,7 @@ test_that("Split a data object activity-based (activity.type = 'issues').", { expected.data = list( commits = list( "2013-04-21 23:52:09-2016-07-27 22:25:25" = data$commits, - "2016-07-27 22:25:25-2017-05-23 12:32:40" = data.frame() + "2016-07-27 22:25:25-2017-05-23 12:32:40" = data$commits[0, ] ), mails = list( "2013-04-21 23:52:09-2016-07-27 22:25:25" = data$mails[rownames(data$mails) %in% 14:17, ], diff --git a/util-misc.R b/util-misc.R index b459ead6..be4c9c6f 100644 --- a/util-misc.R +++ b/util-misc.R @@ -132,6 +132,58 @@ match.arg.or.default = function(arg, choices, default = NULL, several.ok = FALSE } } +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Empty dataframe creation------------------------------------------------- + +#' Create an empty 
data.frame with the specified columns. Unless all columns should have to default datatype +#' \code{logical}, the second parameter \code{data.types} should specify the datatypes. +#' +#' @param columns a character vector containing all the column names +#' @param data.types a character vector of the same length as \code{columns}, the datatypes can be \code{integer}, +#' \code{numeric},\code{POSIXct},\code{character}, \code{factor} or \code{logical} +#' +#' @return the newly created empty data.frame +get.empty.dataframe = function(columns, data.types = NULL) { + if (!is.null(data.types) && length(data.types) != length(columns)) { + stop("If specified, the length of the two given vectors columns and data.types must be the same.") + } + + data.frame = data.frame(matrix(nrow = 0, ncol = length(columns))) + colnames(data.frame) = columns + + if (length(data.types) > 0) { + + for (i in 1:length(data.types)) { + column = data.frame[[i]] + switch(tolower(data.types[i]), + "posixct" = { + column = as.POSIXct(column) + }, + "integer" = { + column = as.integer(column) + }, + "numeric" = { + column = as.numeric(column) + }, + "logical" = { + column = as.logical(column) + }, + "character" = { + column = as.character(column) + }, + "factor" = { + column = as.factor(column) + }, + { + stop(paste("Unknown datatype specified:", data.types[[i]])) + } + ) + data.frame[[i]] = column + } + } + + return(data.frame) +} ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Stacktrace -------------------------------------------------------------- diff --git a/util-read.R b/util-read.R index 93251e83..9400bf90 100644 --- a/util-read.R +++ b/util-read.R @@ -67,8 +67,10 @@ read.commits = function(data.path, artifact) { logging::logwarn("Datapath: %s", data.path) # return a dataframe with the correct columns but zero rows - commit.data = data.frame(matrix(nrow = 0, ncol = length(commit.data.columns))) - colnames(commit.data) = commit.data.columns + commit.data = 
get.empty.dataframe(commit.data.columns, data.types = + c("character", "POSIXct", "character", "character", "POSIXct", + "character", "character", "character", "numeric", "numeric", "numeric", + "numeric", "character", "character", "character", "numeric")) return(commit.data) } colnames(commit.data) = commit.data.columns From c60c2f6e44b6f34cccb2714eccc7674158c83dde Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Sat, 8 Dec 2018 21:51:41 +0100 Subject: [PATCH 08/31] Change edge generation behaviour for base and untracked files artifact Previously, when an author network was created and the untracked files artifact and the base artifact were included, edges have been created among the untracked files artifact and among the base artifact. This was now changed so that there are no edges created among untracked files at any time. For the base artifact it can be configured via the new base.artifact.edges parameter in the NetworkConf. Signed-off-by: Jakob Kronawitter --- util-conf.R | 6 ++++++ util-data.R | 2 +- util-networks.R | 32 +++++++++++++++++++++++++++++++- util-read.R | 5 +++-- 4 files changed, 41 insertions(+), 4 deletions(-) diff --git a/util-conf.R b/util-conf.R index 39dbe445..ae2382d8 100644 --- a/util-conf.R +++ b/util-conf.R @@ -697,6 +697,12 @@ NetworkConf = R6::R6Class("NetworkConf", inherit = Conf, allowed = c(TRUE, FALSE), allowed.number = 1 ), + base.artifact.edges = list( + default = TRUE, + type = "logical", + allowed = c(TRUE, FALSE), + allowed.number = 1 + ), edge.attributes = list( default = c( "date", "artifact.type", # general diff --git a/util-data.R b/util-data.R index 55161069..dfb26359 100644 --- a/util-data.R +++ b/util-data.R @@ -99,7 +99,7 @@ ProjectData = R6::R6Class("ProjectData", ## filter out the untracked files if (remove.untracked.files) { - commit.data = subset(commit.data, artifact != "") + commit.data = subset(commit.data, artifact != "untracked.file") } ## filter out the base artifacts (i.e., Base_Feature, File_Level) 
diff --git a/util-networks.R b/util-networks.R index 4148d466..68a131bf 100644 --- a/util-networks.R +++ b/util-networks.R @@ -153,9 +153,39 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", return(private$authors.network.cochange) } + list = private$proj.data$group.authors.by.data.column("commits", "artifact") + + # split untracked.files subgroup into multiple subgroups which only contain one author each to prohibit edge + # edge construction between authors of this subgroup + if (!is.null(list[["untracked.files"]])) { + for (i in 1:nrow(list[["untracked.files"]])) { + row = list[["untracked.files"]][i, ] + list[[paste0("untracked.files_", i)]] = row + } + list[["untracked.files"]] = NULL + } + + # split base feature subgroup into multiple subgroups which only contain one author each to prohibit edge + # edge construction between authors of this subgroup + if (!is.null(list[["Base_Feature"]]) && !private$network.conf$get.value("base.artifact.edges")) { + for (i in 1:nrow(list[["Base_Feature"]])) { + row = list[["Base_Feature"]][i, ] + list[[paste0("Base_Feature_", i)]] = row + } + list[["Base_Feature"]] = NULL + } + + if (!is.null(list[["File_Level"]]) && !private$network.conf$get.value("base.artifact.edges")) { + for (i in 1:nrow(list[["File_Level"]])) { + row = list[["File_Level"]][i, ] + list[[paste0("File_Level_", i)]] = row + } + list[["File_Level"]] = NULL + } + ## construct edge list based on artifact2author data author.net.data = construct.edge.list.from.key.value.list( - private$proj.data$group.authors.by.data.column("commits", "artifact"), + list, network.conf = private$network.conf, directed = private$network.conf$get.value("author.directed"), respect.temporal.order = private$network.conf$get.value("author.respect.temporal.order") diff --git a/util-read.R b/util-read.R index 9400bf90..922050e0 100644 --- a/util-read.R +++ b/util-read.R @@ -127,9 +127,10 @@ read.commits = function(data.path, artifact) { commit.data["artifact"] = artifacts.new } - ## 
Commits to files that are not tracked by Codeface have the empty string in the file column - ## To better indicate this, the column value is changed to 'untracked.file' + ## Commits to files that are not tracked by Codeface have the empty string in the file and artifact column + ## To better indicate this, the 'artifact' and 'file' column value is changed to 'untracked.file' commit.data["file"] = ifelse(commit.data[["file"]] == "", "untracked.file", commit.data[["file"]]) + commit.data["artifact"] = ifelse(commit.data[["artifact"]] == "", "untracked.file", commit.data[["artifact"]]) ## convert dates and sort by them commit.data[["date"]] = get.date.from.string(commit.data[["date"]]) From fada26d8a54b2a1ff78edd49724b53375b5fb3af Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Mon, 10 Dec 2018 19:59:20 +0100 Subject: [PATCH 09/31] Adjust copyright headers of modified files Signed-off-by: Jakob Kronawitter --- showcase.R | 1 + tests/test-data-cut.R | 1 + tests/test-networks-cut.R | 1 + tests/test-read.R | 1 + tests/test-split.R | 1 + util-conf.R | 1 + util-data.R | 1 + util-misc.R | 1 + util-networks-covariates.R | 1 + util-networks.R | 1 + util-read.R | 1 + 11 files changed, 11 insertions(+) diff --git a/showcase.R b/showcase.R index fa17fd16..39799923 100644 --- a/showcase.R +++ b/showcase.R @@ -16,6 +16,7 @@ ## Copyright 2017 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock +## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved. diff --git a/tests/test-data-cut.R b/tests/test-data-cut.R index 1daab2bb..e235c616 100644 --- a/tests/test-data-cut.R +++ b/tests/test-data-cut.R @@ -16,6 +16,7 @@ ## Copyright 2018 by Claus Hunsen ## Copyright 2018 by Barbara Eckl ## Copyright 2018 by Thomas Bock +## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved. 
diff --git a/tests/test-networks-cut.R b/tests/test-networks-cut.R index 8244dc2f..c09e391b 100644 --- a/tests/test-networks-cut.R +++ b/tests/test-networks-cut.R @@ -14,6 +14,7 @@ ## Copyright 2017 by Christian Hechtl ## Copyright 2018 by Claus Hunsen ## Copyright 2018 by Thomas Bock +## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved. diff --git a/tests/test-read.R b/tests/test-read.R index 10b54d76..88a91bc9 100644 --- a/tests/test-read.R +++ b/tests/test-read.R @@ -15,6 +15,7 @@ ## Copyright 2017 by Felix Prasse ## Copyright 2018 by Claus Hunsen ## Copyright 2018 by Thomas Bock +## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved. diff --git a/tests/test-split.R b/tests/test-split.R index 23967c2a..f11df6f0 100644 --- a/tests/test-split.R +++ b/tests/test-split.R @@ -15,6 +15,7 @@ ## Copyright 2017 by Felix Prasse ## Copyright 2018 by Thomas Bock ## Copyright 2018 by Christian Hechtl +## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved. diff --git a/util-conf.R b/util-conf.R index ae2382d8..488c54f1 100644 --- a/util-conf.R +++ b/util-conf.R @@ -17,6 +17,7 @@ ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock ## Copyright 2018 by Barbara Eckl +## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved. diff --git a/util-data.R b/util-data.R index dfb26359..996b855a 100644 --- a/util-data.R +++ b/util-data.R @@ -17,6 +17,7 @@ ## Copyright 2017-2018 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017 by Ferdinand Frank +## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved. diff --git a/util-misc.R b/util-misc.R index be4c9c6f..f8305062 100644 --- a/util-misc.R +++ b/util-misc.R @@ -16,6 +16,7 @@ ## Copyright 2017 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock +## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved. 
diff --git a/util-networks-covariates.R b/util-networks-covariates.R index c13776c3..92839f2e 100644 --- a/util-networks-covariates.R +++ b/util-networks-covariates.R @@ -15,6 +15,7 @@ ## Copyright 2018 by Claus Hunsen ## Copyright 2018 by Thomas Bock ## Copyright 2018 by Klara Schlüter +## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved. ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / diff --git a/util-networks.R b/util-networks.R index 68a131bf..f25a6dc8 100644 --- a/util-networks.R +++ b/util-networks.R @@ -16,6 +16,7 @@ ## Copyright 2017-2018 by Christian Hechtl ## Copyright 2017-2018 by Thomas Bock ## Copyright 2018 by Barbara Eckl +## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved. diff --git a/util-read.R b/util-read.R index 922050e0..e454e141 100644 --- a/util-read.R +++ b/util-read.R @@ -16,6 +16,7 @@ ## Copyright 2017-2018 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock +## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved. From 43f185dbfe698208cfd2e6f1e69c9329b0657415 Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Mon, 10 Dec 2018 20:18:23 +0100 Subject: [PATCH 10/31] Update changelog Signed-off-by: Jakob Kronawitter --- NEWS.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/NEWS.md b/NEWS.md index 87dbfece..06718d77 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,8 +2,17 @@ ## Unversioned +### Added +- In addition to the ProjectConf parameter `artifact.filter.base`, which configured wether the base artifact should be +included in the `get.commits.filtered` method, there is now the similiar parameter `filter.untracked.files` which does +the same thing for untracked files (11428d9847fd44f982cd094a3248bd13fb6b7b58) +- Edges are not being constructed in the author network between authors that only modify untracked files. 
For authors +it can be configured if the edges should be created or not using the new NetworkConf parameter `base.artifact.edges` +(c60c2f6e44b6f34cccb2714eccc7674158c83dde) + ### Changed/Improved - Change shape of `Vertices` in the legend of plots to avoid confusion (f4fb4807cfd87d9d552a9ede92ea65ae4a386a04) +- Rename `empty artifact` to `untracked files` ### Fixed - Fix error when resetting an `ProjectData` environment (c64cab84e928a2a4c89a6df12440ba7ca06e6263) From 5ea65b9ac5a22967de87d7fd4ac66b0bc8e07238 Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Sat, 15 Dec 2018 20:23:47 +0100 Subject: [PATCH 11/31] Add global constant 'UNTRACKED.FILE' and adjust documentation The global constant 'UNTRACKED.FILE' is added to avoid reusage of the same string 'untracked.file' all the time. In addition minor adjustments are made to the documentation. Signed-off-by: Jakob Kronawitter --- NEWS.md | 16 ++++++++++-- showcase.R | 1 + util-data.R | 25 ++++++++++++------- util-misc.R | 70 +++++++++++++++++++++++++++++------------------------ util-read.R | 30 +++++++++++++---------- 5 files changed, 87 insertions(+), 55 deletions(-) diff --git a/NEWS.md b/NEWS.md index 06718d77..141443e4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,16 +3,28 @@ ## Unversioned ### Added -- In addition to the ProjectConf parameter `artifact.filter.base`, which configured wether the base artifact should be +- In addition to the ProjectConf parameter `artifact.filter.base`, which configured whether the base artifact should be included in the `get.commits.filtered` method, there is now the similiar parameter `filter.untracked.files` which does the same thing for untracked files (11428d9847fd44f982cd094a3248bd13fb6b7b58) - Edges are not being constructed in the author network between authors that only modify untracked files. 
For authors it can be configured if the edges should be created or not using the new NetworkConf parameter `base.artifact.edges` (c60c2f6e44b6f34cccb2714eccc7674158c83dde) +- The public `get.commits.filtered.uncached` method was added which allows for external filtering of the commits by +specifying if untracked files and/or the base artifact should be filtered (this method does not take advantage of +caching, whereas the `get.commits.filtered` method does) (11428d9847fd44f982cd094a3248bd13fb6b7b58) ### Changed/Improved - Change shape of `Vertices` in the legend of plots to avoid confusion (f4fb4807cfd87d9d552a9ede92ea65ae4a386a04) -- Rename `empty artifact` to `untracked files` +- Commits that do not change any artifact are considered to be carried out on a metafile called `untracked.file` +(11428d9847fd44f982cd094a3248bd13fb6b7b58) +- Remove `get.commits.raw`, `set.commits.raw` and `read.commits.raw` functions (64a94863c9e70ac8c75e443bc15cd7facbf2111d, +c26e582e4ad6bf1eaeb08202fc3e00394332a013) +- Removed `get.commits.filtered.empty` and corresponding `filter.commits.empty` method, the functionality has been moved +to the altered `get.commits.filtered` and `filter.commits` method respectively (11428d9847fd44f982cd094a3248bd13fb6b7b58) +- The `filter.commits` method now takes parameters which configure if untracked files and/or the base artifact should be +filtered out (11428d9847fd44f982cd094a3248bd13fb6b7b58) +- Filtering by artifact kind (e.g. 
filtering out either Feature or FeatureExpression) is now being done in the +`get.commits` method instead of the `get.commits.filtered` method (894c9a5c181fef14dcb71fa23699bebbcbcd2b4f) ### Fixed - Fix error when resetting an `ProjectData` environment (c64cab84e928a2a4c89a6df12440ba7ca06e6263) diff --git a/showcase.R b/showcase.R index 39799923..e773e92a 100644 --- a/showcase.R +++ b/showcase.R @@ -86,6 +86,7 @@ x = NetworkBuilder$new(project.data = x.data, network.conf = net.conf) # x.data$get.synchronicity() # x.data$group.artifacts.by.data.column("commits", "author.name") # x.data$get.commits.filtered() +# x.data$get.commits.filtered.uncached(remove.untracked.files = TRUE, remove.base.artifact = FALSE) # x.data$get.mails() # x.data$get.authors() # x.data$get.data.path() diff --git a/util-data.R b/util-data.R index 996b855a..d1f8b4c7 100644 --- a/util-data.R +++ b/util-data.R @@ -32,10 +32,14 @@ requireNamespace("parallel") # for parallel computation ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Constants --------------------------------------------------------------- +## untracked file +UNTRACKED.FILE = "<untracked.file>" + ## base artifacts BASE.ARTIFACTS = c( "Base_Feature", - "File_Level" + "File_Level", + UNTRACKED.FILE ) ## mapping of data source to artifact column @@ -100,7 +104,7 @@ ProjectData = R6::R6Class("ProjectData", ## filter out the untracked files if (remove.untracked.files) { - commit.data = subset(commit.data, artifact != "untracked.file") + commit.data = subset(commit.data, file != UNTRACKED.FILE) } ## filter out the base artifacts (i.e., Base_Feature, File_Level) @@ -266,6 +270,7 @@ ProjectData = R6::R6Class("ProjectData", #' Set a value of the project configuration and reset the environment set.project.conf.entry = function(entry, value) { private$project.conf$update.value(entry, value) + self$reset.environment() }, #' Update the project configuration based on the given list @@ -324,10 +329,9 @@ ProjectData =
R6::R6Class("ProjectData", #' Return the commits retrieved by the \code{get.commits} method by removing untracked files and removing the #' base artifact (if configured in the \code{project.conf}, see parameters \code{filter.untracked.files} and - #' \code{artifact.filter.base}). This method uses caching. + #' \code{artifact.filter.base}). #' - #' @param remove.untracked.files configures if untracked files should be kept or removed - #' @param remove.base.artifact configures if the base artifact should be kept or removed + #' This method caches the filtered commits to the field \code{commits.filtered}. #' #' @return the commits retrieved by the \code{get.commits} method after all filters have been applied #' @@ -343,8 +347,10 @@ ProjectData = R6::R6Class("ProjectData", }, #' Return the commits retrieved by the \code{get.commits} method by removing untracked files and removing the - #' base artifact (see parameters). This method doesn't use caching. If you want to use caching, please use the - #' \code{get.commits.filtered} method instead. + #' base artifact (see parameters). + #' + #' This method doesn't use caching. If you want to use caching, please use the \code{get.commits.filtered} + #' method instead. #' #' @param remove.untracked.files configures if untracked files should be kept or removed #' @param remove.base.artifact configures if the base artifact should be kept or removed @@ -357,7 +363,7 @@ ProjectData = R6::R6Class("ProjectData", }, #' Get the list of commits which have the artifact kind configured in the \code{project.conf}. - #' If the list of commits is not cached, call the read method first. #' + #' If the list of commits is not cached in the field \code{commits}, call the read method first. #' If configured in the field \code{project.conf}, add PaStA and synchronicity data. 
#' #' @return the list of commits @@ -372,7 +378,8 @@ ProjectData = R6::R6Class("ProjectData", commit.data = subset(commit.data, artifact.type %in% c(private$project.conf$get.value("artifact.codeface"), "")) - ## saves the commit.data to the commits cache field after PaStA and synchronicity data is added + ## Add PaStA and synchronicity data (if configured in the 'project.conf') and save the commit data to + ## the field 'commits' afterwards self$set.commits(commit.data) } private$extract.timestamps(source = "commits") diff --git a/util-misc.R b/util-misc.R index f8305062..58d88b11 100644 --- a/util-misc.R +++ b/util-misc.R @@ -136,51 +136,59 @@ match.arg.or.default = function(arg, choices, default = NULL, several.ok = FALSE ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Empty dataframe creation------------------------------------------------- -#' Create an empty data.frame with the specified columns. Unless all columns should have to default datatype +#' Create an empty data.frame with the specified columns. Unless all columns should have the default datatype #' \code{logical}, the second parameter \code{data.types} should specify the datatypes. 
#' #' @param columns a character vector containing all the column names #' @param data.types a character vector of the same length as \code{columns}, the datatypes can be \code{integer}, -#' \code{numeric},\code{POSIXct},\code{character}, \code{factor} or \code{logical} +#' \code{numeric}, \code{POSIXct}, \code{character}, \code{factor} or \code{logical} #' #' @return the newly created empty data.frame get.empty.dataframe = function(columns, data.types = NULL) { + + ## if the vector data.types is specified, its length must match the length of the corresponding column names if (!is.null(data.types) && length(data.types) != length(columns)) { stop("If specified, the length of the two given vectors columns and data.types must be the same.") } + ## create the empty data frame (with zero rows), but the given number of columns data.frame = data.frame(matrix(nrow = 0, ncol = length(columns))) colnames(data.frame) = columns - if (length(data.types) > 0) { - - for (i in 1:length(data.types)) { - column = data.frame[[i]] - switch(tolower(data.types[i]), - "posixct" = { - column = as.POSIXct(column) - }, - "integer" = { - column = as.integer(column) - }, - "numeric" = { - column = as.numeric(column) - }, - "logical" = { - column = as.logical(column) - }, - "character" = { - column = as.character(column) - }, - "factor" = { - column = as.factor(column) - }, - { - stop(paste("Unknown datatype specified:", data.types[[i]])) - } - ) - data.frame[[i]] = column - } + ## assign the datatypes to the data frame columns by indivdually swapping the columns with new columns that possess + ## the correct data type + for (i in seq_along(data.types)) { + + ## get the column + column = data.frame[[i]] + + ## replace column with column of correct type + switch(tolower(data.types[i]), + "posixct" = { + column = as.POSIXct(column) + }, + "integer" = { + column = as.integer(column) + }, + "numeric" = { + column = as.numeric(column) + }, + "logical" = { + column = as.logical(column) + }, + 
"character" = { + column = as.character(column) + }, + "factor" = { + column = as.factor(column) + }, + { + stop(paste("Unknown datatype specified:", data.types[[i]])) + } + ) + + ## set the column back into the dataframe + data.frame[[i]] = column } return(data.frame) diff --git a/util-read.R b/util-read.R index e454e141..2ca71a0e 100644 --- a/util-read.R +++ b/util-read.R @@ -48,18 +48,18 @@ read.commits = function(data.path, artifact) { commit.data = try(read.table(file, header = FALSE, sep = ";", strip.white = TRUE, encoding = "UTF-8"), silent = TRUE) - ## set proper column names based on Codeface extraction: + ## prepare proper column names based on Codeface extraction: ## ## SELECT c.id, c.authorDate, a.name, a.email1, c.commitDate, ## acom.name, acom.email1, c.commitHash, ## c.ChangedFiles, c.AddedLines, c.DeletedLines, c.DiffSize, ## cd.file, cd.entityId, cd.entityType, cd.size commit.data.columns = c( - "commit.id", # id - "date", "author.name", "author.email", # author information - "committer.date", "committer.name", "committer.email", # committer information - "hash", "changed.files", "added.lines", "deleted.lines", "diff.size", # commit information - "file", "artifact", "artifact.type", "artifact.diff.size" ## commit-dependency information + "commit.id", # id + "date", "author.name", "author.email", # author information + "committer.date", "committer.name", "committer.email", # committer information + "hash", "changed.files", "added.lines", "deleted.lines", "diff.size", # commit information + "file", "artifact", "artifact.type", "artifact.diff.size" ## commit-dependency information ) ## handle the case that the list of commits is empty @@ -69,11 +69,15 @@ read.commits = function(data.path, artifact) { # return a dataframe with the correct columns but zero rows commit.data = get.empty.dataframe(commit.data.columns, data.types = - c("character", "POSIXct", "character", "character", "POSIXct", - "character", "character", "character", "numeric", "numeric", 
"numeric", - "numeric", "character", "character", "character", "numeric")) + c("character", + "POSIXct", "character", "character", + "POSIXct", "character", "character", + "character", "numeric", "numeric", "numeric", "numeric", + "character", "character", "character", "numeric")) return(commit.data) } + + ## assign prepared column names to the dataframe colnames(commit.data) = commit.data.columns ## remove duplicated lines (even if they contain different commit ids but the same commit hash) @@ -128,10 +132,10 @@ read.commits = function(data.path, artifact) { commit.data["artifact"] = artifacts.new } - ## Commits to files that are not tracked by Codeface have the empty string in the file and artifact column - ## To better indicate this, the 'artifact' and 'file' column value is changed to 'untracked.file' - commit.data["file"] = ifelse(commit.data[["file"]] == "", "untracked.file", commit.data[["file"]]) - commit.data["artifact"] = ifelse(commit.data[["artifact"]] == "", "untracked.file", commit.data[["artifact"]]) + ## Commits to files that are not tracked by Codeface have the empty string in the file and artifact column. + ## To better indicate this, the 'artifact' and 'file' column value is changed to 'untracked.file'. + commit.data["file"] = ifelse(commit.data[["file"]] == "", UNTRACKED.FILE, commit.data[["file"]]) + commit.data["artifact"] = ifelse(commit.data[["artifact"]] == "", UNTRACKED.FILE, commit.data[["artifact"]]) ## convert dates and sort by them commit.data[["date"]] = get.date.from.string(commit.data[["date"]]) From ec8c6dd72746a0506b3e03dccc4fcaf7a03325ea Mon Sep 17 00:00:00 2001 From: Claus Hunsen Date: Fri, 14 Dec 2018 17:01:24 +0100 Subject: [PATCH 12/31] Update default behavior of 'Conf' objects In recent scenarios and in perspective of up-coming changes, the default behavior of 'Conf' objects upon initialization and update is changed: 1) The default values are *not* automatically checked against the allowed values anymore.
This is mainly disabled to avoid confusion of users. The constructor of the class 'Conf' is adapted accordingly. 2) When updating a configuration value, the program execution is now stopped (using 'stop') on failure. Previously, the respective update has been ignored while issuing a warning. This change helps preventing confusion and analysis errors early in an analysis script. Accordingly, the parameter 'stop.on.error' to all update methods is removed. Furthermore, the code is streamlined, such that the super-constructor is called from both subclasses 'NetworkConf' and 'ProjectConf'. Some log statements are added/adjusted, too. Signed-off-by: Claus Hunsen --- util-conf.R | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/util-conf.R b/util-conf.R index 488c54f1..2db96541 100644 --- a/util-conf.R +++ b/util-conf.R @@ -76,7 +76,7 @@ Conf = R6::R6Class("Conf", return(self$get.value(att)) }) names(current.values) = names(private$attributes) - self$update.values(current.values, stop.on.error = TRUE) + self$update.values(current.values) }, #' Check whether the given 'value' is the correct datatype @@ -125,8 +125,8 @@ Conf = R6::R6Class("Conf", #' The constructor, automatically checking the default values. initialize = function() { - ## FIXME do we need this? - private$check.values() + # ## check the default values for validity + # private$check.values() }, ## * * printing ---------------------------------------------------- @@ -166,8 +166,7 @@ Conf = R6::R6Class("Conf", #' #' @param entry the entry name for the value #' @param value the new value - #' @param error call stop() on an error? [default: FALSE] - update.value = function(entry, value, stop.on.error = FALSE) { + update.value = function(entry, value) { ## construct list for updating updating = list(value) names(updating) = entry @@ -179,10 +178,9 @@ Conf = R6::R6Class("Conf", #' 'updated.values' list. 
#' #' @param updated.values the new values for the attributes to be updated - #' @param error call stop() on an error? [default: FALSE] - update.values = function(updated.values = list(), stop.on.error = FALSE) { + update.values = function(updated.values = list()) { ## determine the function executed on an error - error.function = ifelse(stop.on.error, stop, logging::logwarn) + error.function = stop ## check values to update names.to.update = c() @@ -219,9 +217,7 @@ Conf = R6::R6Class("Conf", } else { message = paste0( - "Updating network-configuration attribute '%s' failed.", - if (!stop.on.error) " The failure is ignored!\n", - # "Current value: %s\n", + "Updating network-configuration attribute '%s' failed.\n", "Allowed values (%s of type '%s'): %s\n", "Given value (of type '%s'): %s" ) @@ -476,6 +472,10 @@ ProjectConf = R6::R6Class("ProjectConf", inherit = Conf, #' and \code{featureexpression}) [default: "feature"] initialize = function(data, selection.process, casestudy, artifact = c("feature", "file", "function", "featureexpression")) { + + logging::loginfo("Construct project configuration: starting.") + + ## call super constructor super$initialize() ## verify arguments using match.arg @@ -486,8 +486,6 @@ ProjectConf = R6::R6Class("ProjectConf", inherit = Conf, private$casestudy = verify.argument.for.parameter(casestudy, "character", class(self)[1]) private$artifact = verify.argument.for.parameter(artifact, "character", class(self)[1]) - logging::loginfo("Construct project configuration: starting.") - ## convert artifact to tagging tagging = ARTIFACT.TO.TAGGING[[ artifact ]] if (is.null(tagging)) { @@ -762,16 +760,20 @@ NetworkConf = R6::R6Class("NetworkConf", inherit = Conf, #' The constructor, automatically checking the default values. 
initialize = function() { - # private$check.values() + logging::loginfo("Construct network configuration: starting.") + + ## call super constructor + super$initialize() + + logging::loginfo("Construct network configuration: finished.") }, #' Update the attributes of the class with the new values given in the #' 'updated.values' list. #' #' @param updated.values the new values for the attributes to be updated - #' @param error call stop() on an error? [default: FALSE] - update.values = function(updated.values = list(), stop.on.error = FALSE) { - super$update.values(updated.values = updated.values, stop.on.error = stop.on.error) + update.values = function(updated.values = list()) { + super$update.values(updated.values = updated.values) ## 1) "date" and "artifact.type" always as edge attribute name = "edge.attributes" From 0d7c2226da67f3537f3ff9d013607fe19df8a4c0 Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Sun, 16 Dec 2018 20:16:08 +0100 Subject: [PATCH 13/31] Fix nodes for networks without edges When a network contains no edges but more than one node, all the nodes get combined. To fix this, the respecting data frame, which contains the nodes, has to be transposed. This fixes #150. 
Reported-by: Jakob Kronawitter Signed-off-by: Thomas Bock --- util-networks.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util-networks.R b/util-networks.R index f25a6dc8..6d38e4f7 100644 --- a/util-networks.R +++ b/util-networks.R @@ -1069,7 +1069,7 @@ construct.network.from.edge.list = function(vertices, edge.list, network.conf, d ## if we have nodes to create, but no edges if (is.null(edge.list) || nrow(edge.list) == 0) { ## create network with only the vertices - net = igraph::graph.empty(n = 0, directed = directed) + igraph::vertices(nodes.processed) + net = igraph::graph.empty(n = 0, directed = directed) + igraph::vertices(t(nodes.processed)) } ## if we have nodes and edges else { From 65804276dd2ada9b2f00b2cab7b6ad0cecbe733e Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Sun, 16 Dec 2018 21:12:15 +0100 Subject: [PATCH 14/31] Improve edge creation concerning untracked files and the base artifact The edge creation process which does not draw any edges among authors of untracked files and - if configured in the 'ProjectConf' - does also not draw any edges among the base artifact authors is being reworked since the old way of achieving this was rather unintuitive and complicated. Signed-off-by: Jakob Kronawitter --- util-data.R | 24 ++++++++++++++++++++++-- util-networks.R | 35 ++++++++++------------------------- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/util-data.R b/util-data.R index d1f8b4c7..b2fd8a9a 100644 --- a/util-data.R +++ b/util-data.R @@ -38,8 +38,7 @@ UNTRACKED.FILE = "" ## base artifacts BASE.ARTIFACTS = c( "Base_Feature", - "File_Level", - UNTRACKED.FILE + "File_Level" ) ## mapping of data source to artifact column @@ -941,6 +940,27 @@ ProjectData = R6::R6Class("ProjectData", mylist = get.key.to.value.from.df(self[[data.source.func]](), group.column, data.column) return(mylist) + }, + + #' Get the list of authors by only looking at the specified data source.
The constant + #' \code{DATASOURCE.TO.ARTIFACT.FUNCTION} describes the mapping between data source and the method which is + #' retrieving the data for each data source. + #' + #' @param data.source the data source which can be either \code{"commits"}, \code{"mails"} or \code{"issues"} + #' + #' @return the list of authors extracted from the specified data source + get.authors.by.data.source = function(data.source = c("commits", "mails", "issues")) { + if (is.null(data.source)) { + stop ("Data source can't be null.") + } + + data.source = match.arg(data.source) + data.source.func = DATASOURCE.TO.ARTIFACT.FUNCTION[[data.source]] + + data = self[[data.source.func]]()[c("author.name", "author.email")] + names(data) = c("name", "email") + + return (data) } ) ) diff --git a/util-networks.R b/util-networks.R index 6d38e4f7..ae63825c 100644 --- a/util-networks.R +++ b/util-networks.R @@ -156,33 +156,13 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", list = private$proj.data$group.authors.by.data.column("commits", "artifact") - # split untracked.files subgroup into multiple subgroups which only contain one author each to prohibit edge - # edge construction between authors of this subgroup - if (!is.null(list[["untracked.files"]])) { - for (i in 1:nrow(list[["untracked.files"]])) { - row = list[["untracked.files"]][i, ] - list[[paste0("untracked.files_", i)]] = row - } - list[["untracked.files"]] = NULL + ## if configured in the network conf, remove base artifacts, so that no edges are created in the next step + if (!private$network.conf$get.value("base.artifact.edges")) { + list = list[!(names(list) %in% BASE.ARTIFACTS)] } - # split base feature subgroup into multiple subgroups which only contain one author each to prohibit edge - # edge construction between authors of this subgroup - if (!is.null(list[["Base_Feature"]]) && !private$network.conf$get.value("base.artifact.edges")) { - for (i in 1:nrow(list[["Base_Feature"]])) { - row = list[["Base_Feature"]][i, ] - 
list[[paste0("Base_Feature_", i)]] = row - } - list[["Base_Feature"]] = NULL - } - - if (!is.null(list[["File_Level"]]) && !private$network.conf$get.value("base.artifact.edges")) { - for (i in 1:nrow(list[["File_Level"]])) { - row = list[["File_Level"]][i, ] - list[[paste0("File_Level_", i)]] = row - } - list[["File_Level"]] = NULL - } + ## remove untracked files, so that no edges are created in the next step + list = list[names(list) != UNTRACKED.FILE] ## construct edge list based on artifact2author data author.net.data = construct.edge.list.from.key.value.list( @@ -192,6 +172,11 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", respect.temporal.order = private$network.conf$get.value("author.respect.temporal.order") ) + ## Add author vertices back into the graph. Previously the untracked file commiters and - if configured - + ## the base artifact commiters have been removed to avoid edge creation among them. + authors = proj.data$get.authors.by.data.source(data.source = "commits") + author.net.data[["vertices"]] = authors["name"] + ## construct network from obtained data author.net = construct.network.from.edge.list( author.net.data[["vertices"]], From dde0dd7c6b36b49aa2b6c91395be8ea6e0cd7969 Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Mon, 17 Dec 2018 10:49:19 +0100 Subject: [PATCH 15/31] Leave artifact column empty if artifact == feature or artifact == function For commits to untracked files the artifact column has previously been the copied file column (for example when looking at the commit data returned by 'get.commits'). However this should only be the case when considering file level analysis (e.g. 'artifact == file' in the 'ProjectConf'). This commit changes this to the correct behaviour. So for 'artifact == function' and 'artifact == feature' the artifact column now only contains the empty string for untracked files. To avoid hardcoding this empty string in every affected place a global constant called 'UNTRACKED.FILE.EMPTY.ARTIFACT' was added.
Signed-off-by: Jakob Kronawitter --- util-data.R | 9 +++++++-- util-networks.R | 2 +- util-read.R | 6 +++++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/util-data.R b/util-data.R index b2fd8a9a..9ea80eb1 100644 --- a/util-data.R +++ b/util-data.R @@ -35,6 +35,9 @@ requireNamespace("parallel") # for parallel computation ## untracked file UNTRACKED.FILE = "" +# the empty string which resides in the artifact column when artifact == feature or artifact == file +UNTRACKED.FILE.EMPTY.ARTIFACT = "" + ## base artifacts BASE.ARTIFACTS = c( "Base_Feature", @@ -373,9 +376,11 @@ ProjectData = R6::R6Class("ProjectData", if (is.null(private$commits)) { commit.data = read.commits(self$get.data.path(), private$project.conf$get.value("artifact")) - ## only process commits with the artifact listed in the configuration or missing + ## only consider commits that have the artifact type configured in the 'project.conf' or commits to + ## untracked files commit.data = subset(commit.data, artifact.type %in% - c(private$project.conf$get.value("artifact.codeface"), "")) + c(private$project.conf$get.value("artifact.codeface"), + UNTRACKED.FILE.EMPTY.ARTIFACT)) ## Add PaStA and synchronicity data (if configured in the 'project.conf') and save the commit data to ## the field 'commits' afterwards diff --git a/util-networks.R b/util-networks.R index ae63825c..b8e1ec8d 100644 --- a/util-networks.R +++ b/util-networks.R @@ -162,7 +162,7 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", } ## remove untracked files, so that no edges are created in the next step - list = list[names(list) != UNTRACKED.FILE] + list = list[names(list) != UNTRACKED.FILE.EMPTY.ARTIFACT] ## construct edge list based on artifact2author data author.net.data = construct.edge.list.from.key.value.list( diff --git a/util-read.R b/util-read.R index 2ca71a0e..9c1d6ce9 100644 --- a/util-read.R +++ b/util-read.R @@ -135,7 +135,11 @@ read.commits = function(data.path, artifact) { ## Commits to files that are not 
tracked by Codeface have the empty string in the file and artifact column. ## To better indicate this, the 'artifact' and 'file' column value is changed to 'untracked.file'. commit.data["file"] = ifelse(commit.data[["file"]] == "", UNTRACKED.FILE, commit.data[["file"]]) - commit.data["artifact"] = ifelse(commit.data[["artifact"]] == "", UNTRACKED.FILE, commit.data[["artifact"]]) + + ## copy the file column if file level analysis is performed + if (artifact == "file") { + commit.data["artifact"] = commit.data[["file"]] + } ## convert dates and sort by them commit.data[["date"]] = get.date.from.string(commit.data[["date"]]) From d11d0fb585397fdb3a2641484248f74752db9331 Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Mon, 17 Dec 2018 11:03:22 +0100 Subject: [PATCH 16/31] Add 'UNTRACKED.FILE constant' back into the constant 'BASE.ARTIFACTS' In a previous commit the constant 'UNTRACKED.FILE' was removed from the 'BASE.ARTIFACTS' constant due to temporary difficulties with this assignment. This change is now reverted. Signed-off-by: Jakob Kronawitter --- util-data.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/util-data.R b/util-data.R index 9ea80eb1..539ddcb0 100644 --- a/util-data.R +++ b/util-data.R @@ -41,7 +41,8 @@ UNTRACKED.FILE.EMPTY.ARTIFACT = "" ## base artifacts BASE.ARTIFACTS = c( "Base_Feature", - "File_Level" + "File_Level", + UNTRACKED.FILE ) ## mapping of data source to artifact column From 32a716249bf0b8d1e3f3a97ca629bdb6a8ea1cd8 Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Mon, 17 Dec 2018 11:07:46 +0100 Subject: [PATCH 17/31] Alter inline comments with wrong information This commit changes an inline comment which was misleadingly talking about committers but actually meant commit authors. 
Signed-off-by: Jakob Kronawitter --- util-networks.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/util-networks.R b/util-networks.R index b8e1ec8d..b8fd5ed5 100644 --- a/util-networks.R +++ b/util-networks.R @@ -172,8 +172,10 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", respect.temporal.order = private$network.conf$get.value("author.respect.temporal.order") ) - ## Add author vertices back into the graph. Previously the untracked file commiters and - if configured - - ## the base artifact commiters have been removed to avoid edge creation among them. + ## Add author vertices back into the graph. Previously, commit information on untracked files + ## ('UNTRACKED.FILE') and, if configured, the base artifact ('BASE.ARTIFACTS') has been removed and, hence, + ## also corresponding author information. Re-add author vertices back to the network now by accessing the + ## complete author list. authors = proj.data$get.authors.by.data.source(data.source = "commits") author.net.data[["vertices"]] = authors["name"] From 466d8eb8e7f39e43985d825636af85ddfe54b13a Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Tue, 18 Dec 2018 09:41:06 +0100 Subject: [PATCH 18/31] Change names of network and project configuration options This commit renames the following three configuration options: - 'artifact.filter.base' to 'commits.filter.base.artifact', - 'filter.untracked.files' to 'commits.filter.untracked.files' - 'base.artifact.edges' to 'edges.for.base.artifacts'. Also the documentation gets slightly adjusted in one place because the old documentation contained outdated information. 
Signed-off-by: Jakob Kronawitter --- showcase.R | 2 +- tests/test-networks-artifact.R | 2 +- tests/test-networks-author.R | 16 ++++++++-------- tests/test-networks-bipartite.R | 18 +++++++++--------- tests/test-networks-covariates.R | 2 +- tests/test-networks-multi-relation.R | 6 +++--- tests/test-networks-multi.R | 2 +- tests/test-split.R | 16 ++++++++-------- util-conf.R | 6 +++--- util-data.R | 12 ++++++------ util-networks.R | 4 ++-- 11 files changed, 43 insertions(+), 43 deletions(-) diff --git a/showcase.R b/showcase.R index e773e92a..8936508b 100644 --- a/showcase.R +++ b/showcase.R @@ -61,7 +61,7 @@ ARTIFACT.RELATION = "cochange" # cochange, callgraph, mail, issue ## initialize project configuration proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) -proj.conf$update.value("artifact.filter.base", TRUE) +proj.conf$update.value("commits.filter.base.artifact", TRUE) # proj.conf$print() ## initialize network configuration diff --git a/tests/test-networks-artifact.R b/tests/test-networks-artifact.R index d2f1fa5a..50678ddc 100644 --- a/tests/test-networks-artifact.R +++ b/tests/test-networks-artifact.R @@ -36,7 +36,7 @@ test_that("Network construction of the undirected artifact-cochange network", { ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(artifact.relation = "cochange")) diff --git a/tests/test-networks-author.R b/tests/test-networks-author.R index 488fe146..31c5b6e5 100644 --- a/tests/test-networks-author.R +++ b/tests/test-networks-author.R @@ -139,7 +139,7 @@ test_that("Amount of authors (author.all.authors, author.only.committers).", { ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", 
FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() ## update network configuration @@ -198,7 +198,7 @@ test_that("Network construction of the undirected author-cochange network", { ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = "cochange")) @@ -243,7 +243,7 @@ test_that("Network construction of the undirected but temorally ordered author-c ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = "cochange", author.directed = FALSE, author.respect.temporal.order = TRUE)) @@ -285,7 +285,7 @@ test_that("Network construction of the directed author-cochange network", { ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = "cochange", author.directed = TRUE)) @@ -326,7 +326,7 @@ test_that("Network construction of the directed author-cochange network without ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = "cochange", author.directed = TRUE, author.respect.temporal.order = FALSE)) @@ -372,7 +372,7 @@ 
test_that("Network construction of the undirected simplified author-cochange net ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = "cochange", simplify = TRUE)) @@ -420,7 +420,7 @@ test_that("Network construction of the undirected author-issue network with all ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) proj.conf$update.value("issues.only.comments", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = "issue")) @@ -511,7 +511,7 @@ test_that("Network construction of the undirected author-issue network with just ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = "issue")) diff --git a/tests/test-networks-bipartite.R b/tests/test-networks-bipartite.R index 27305437..e7c3eae5 100644 --- a/tests/test-networks-bipartite.R +++ b/tests/test-networks-bipartite.R @@ -37,7 +37,7 @@ test_that("Construction of the bipartite network for the feature artifact with a ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = "cochange", artifact.relation = "cochange")) @@ -90,7 +90,7 @@ test_that("Construction 
of the bipartite network for the file artifact with auth ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, "file") - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = "cochange", artifact.relation = "cochange")) @@ -143,7 +143,7 @@ test_that("Construction of the bipartite network for the function artifact with ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, "function") - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = "cochange", artifact.relation = "cochange")) @@ -194,7 +194,7 @@ test_that("Construction of the bipartite network for the featureexpression artif ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, "featureexpression") - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = "cochange", artifact.relation = "cochange")) @@ -245,7 +245,7 @@ test_that("Construction of the bipartite network for the feature artifact with a ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = "cochange", artifact.relation = "issue")) @@ -303,7 +303,7 @@ test_that("Construction of the directed bipartite network for the feature artifa ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - 
proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = "cochange", artifact.relation = "cochange", author.directed = TRUE)) @@ -356,7 +356,7 @@ test_that("Construction of the directed bipartite network for the file artifact ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, "file") - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = "cochange", artifact.relation = "cochange", author.directed = TRUE)) @@ -410,7 +410,7 @@ test_that("Construction of the directed bipartite network for the function artif ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, "function") - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = "cochange", artifact.relation = "cochange", author.directed = TRUE)) @@ -463,7 +463,7 @@ test_that("Construction of the directed bipartite network for the featureexpress ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, "featureexpression") - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = "cochange", artifact.relation = "cochange", author.directed = TRUE)) diff --git a/tests/test-networks-covariates.R b/tests/test-networks-covariates.R index 0f208019..09197189 100644 --- a/tests/test-networks-covariates.R +++ b/tests/test-networks-covariates.R @@ -52,7 +52,7 @@ get.network.covariates.test.networks = function(network.type 
= c("author", "arti ## configuration and data objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) proj.conf$update.value("issues.only.comments", FALSE) net.conf = NetworkConf$new() net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) diff --git a/tests/test-networks-multi-relation.R b/tests/test-networks-multi-relation.R index 47b3a285..73fd468a 100644 --- a/tests/test-networks-multi-relation.R +++ b/tests/test-networks-multi-relation.R @@ -35,7 +35,7 @@ test_that("Network construction of the undirected author network with relation = ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = c("cochange", "mail"))) @@ -97,7 +97,7 @@ test_that("Construction of the bipartite network for the feature artifact with a ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = c("cochange", "issue"), artifact.relation = c("issue", "mail"))) @@ -192,7 +192,7 @@ test_that("Construction of the multi network for the feature artifact with autho ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = c("cochange", "mail"), artifact.relation = c("cochange", 
"issue"))) diff --git a/tests/test-networks-multi.R b/tests/test-networks-multi.R index fcdcd9df..8d3db249 100644 --- a/tests/test-networks-multi.R +++ b/tests/test-networks-multi.R @@ -36,7 +36,7 @@ test_that("Construction of the multi network for the feature artifact with autho ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(updated.values = list(author.relation = "cochange", artifact.relation = "cochange")) diff --git a/tests/test-split.R b/tests/test-split.R index f11df6f0..94baa3cd 100644 --- a/tests/test-split.R +++ b/tests/test-split.R @@ -356,7 +356,7 @@ test_that("Split a data object time-based (bins == ... ).", { test_that("Test splitting data by networks", { ## configuration and data objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) @@ -422,7 +422,7 @@ test_that("Test splitting data by networks", { test_that("Test splitting data by ranges", { ## configuration and data objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) @@ -1028,7 +1028,7 @@ test_that("Split a network time-based (time.period = ...).", { ## configuration and data objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + 
proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) project.data = ProjectData$new(proj.conf) @@ -1084,7 +1084,7 @@ test_that("Split a list of networks time-based.", { ## configuration and data objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(list(simplify = FALSE, author.directed = TRUE)) project.data = ProjectData$new(proj.conf) @@ -1131,7 +1131,7 @@ test_that("Split a network time-based (bins = ...).", { ## configuration and data objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) project.data = ProjectData$new(proj.conf) @@ -1193,7 +1193,7 @@ test_that("Test splitting network by ranges", { ## configuration and data objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) project.data = ProjectData$new(proj.conf) @@ -1224,7 +1224,7 @@ test_that("Split a network activity-based (number.edges, number.windows).", { ## configuration and data objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(list(author.relation = "cochange", 
simplify = FALSE)) project.data = ProjectData$new(proj.conf) @@ -1518,7 +1518,7 @@ test_that("Check consistency of data and network time-based splitting.", { ## configuration and data objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) + proj.conf$update.value("commits.filter.base.artifact", FALSE) net.conf = NetworkConf$new() net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) diff --git a/util-conf.R b/util-conf.R index 2db96541..e58fdaf1 100644 --- a/util-conf.R +++ b/util-conf.R @@ -329,13 +329,13 @@ ProjectConf = R6::R6Class("ProjectConf", inherit = Conf, ## * * attributes --------------------------------------------------- attributes = list( - artifact.filter.base = list( + commits.filter.base.artifact = list( default = TRUE, type = "logical", allowed = c(TRUE, FALSE), allowed.number = 1 ), - filter.untracked.files = list( + commits.filter.untracked.files = list( default = TRUE, type = "logical", allowed = c(TRUE, FALSE), @@ -696,7 +696,7 @@ NetworkConf = R6::R6Class("NetworkConf", inherit = Conf, allowed = c(TRUE, FALSE), allowed.number = 1 ), - base.artifact.edges = list( + edges.for.base.artifacts = list( default = TRUE, type = "logical", allowed = c(TRUE, FALSE), diff --git a/util-data.R b/util-data.R index 539ddcb0..91f3999e 100644 --- a/util-data.R +++ b/util-data.R @@ -45,8 +45,8 @@ BASE.ARTIFACTS = c( UNTRACKED.FILE ) -## mapping of data source to artifact column -## (for commits: filter also empty, non-configured, and (potentially) base artifacts) +## mapping of data source to artifact column (for commits: filter artifacts based on the configuration options +## 'commits.filter.base.artifact' and 'commits.filter.untracked.files' of the corresponding 'ProjectConf' object) DATASOURCE.TO.ARTIFACT.FUNCTION = list( "commits" = "get.commits.filtered", "mails" = "get.mails", @@ -331,8 +331,8 @@ ProjectData = R6::R6Class("ProjectData", ## * * 
raw data ---------------------------------------------------- #' Return the commits retrieved by the \code{get.commits} method by removing untracked files and removing the - #' base artifact (if configured in the \code{project.conf}, see parameters \code{filter.untracked.files} and - #' \code{artifact.filter.base}). + #' base artifact (if configured in the \code{project.conf}, see parameters \code{commits.filter.untracked.files} + #' and \code{commits.filter.base.artifact}). #' #' This method caches the filtered commits to the field \code{commits.filtered}. #' @@ -342,8 +342,8 @@ ProjectData = R6::R6Class("ProjectData", get.commits.filtered = function() { if (is.null(private$commits.filtered)) { private$commits.filtered = private$filter.commits( - private$project.conf$get.value("filter.untracked.files"), - private$project.conf$get.value("artifact.filter.base") + private$project.conf$get.value("commits.filter.untracked.files"), + private$project.conf$get.value("commits.filter.base.artifact") ) } return(private$commits.filtered) diff --git a/util-networks.R b/util-networks.R index b8fd5ed5..1584aa3e 100644 --- a/util-networks.R +++ b/util-networks.R @@ -157,7 +157,7 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", list = private$proj.data$group.authors.by.data.column("commits", "artifact") ## if configured in the network conf, remove base artifacts, so that no edges are created in the next step - if (!private$network.conf$get.value("base.artifact.edges")) { + if (!private$network.conf$get.value("edges.for.base.artifacts")) { list = list[!(names(list) %in% BASE.ARTIFACTS)] } @@ -1402,7 +1402,7 @@ get.sample.network = function() { ## project configuration proj.conf = ProjectConf$new(SAMPLE.DATA, "testing", "sample", "feature") - proj.conf$update.values(list(artifact.filter.base = FALSE)) + proj.conf$update.values(list(commits.filter.base.artifact = FALSE)) ## RangeData object range = proj.conf$get.value("ranges")[1] From 7e27a182de282f054f08e3a2fb04d852c2c55102 Mon 
Sep 17 00:00:00 2001 From: Claus Hunsen Date: Mon, 17 Dec 2018 10:39:06 +0100 Subject: [PATCH 19/31] Further improve construction of edgeless networks When constructing a network in 'construct.network.from.edge.list', several corner cases need to be handled. When there are no edges available, an empty edge list can be created using 'create.empty.edge.list'. This way, reliably, the function 'igraph::graph.data.frame' can be used to construct a network. This further improves the patch 0d7c2226da67f3537f3ff9d013607fe19df8a4c0, which tackles #150. Tests for creating edgeless networks are added to the file 'tests/test-networks.R'. This likely prevents regressions in the future. Additionally, use the function 'create.empty.edge.list' in one further place where possible. Signed-off-by: Claus Hunsen --- tests/test-networks.R | 54 +++++++++++++++++++++++++++++++++++++++++++ util-networks.R | 16 ++++++------- 2 files changed, 61 insertions(+), 9 deletions(-) diff --git a/tests/test-networks.R b/tests/test-networks.R index 5a9a8489..56942880 100644 --- a/tests/test-networks.R +++ b/tests/test-networks.R @@ -93,3 +93,57 @@ test_that("Merge networks", { }) + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Construction of edgeless networks --------------------------------------- + +test_that("Construction of edgeless networks", { + + ## create data structures and network configuration as a basis + edge.list = data.frame(from = c("D1", "D2"), to = c("D2", "D1")) + edge.list.as.sequence = as.vector(as.matrix(edge.list)) + vertices = data.frame(name = c("D1", "D2")) + vertices.as.sequence = vertices[["name"]] + directed = FALSE # directedness does not matter for this test, but should be consistent + net.conf = NetworkConf$new() + + ## construct edgeless network + net.edgeless = igraph::graph.empty(n = 0, directed = directed) + + igraph::vertices(vertices.as.sequence) + + igraph::edges(NULL, weight = 1) + + ## + ## normal network + ## + + 
net.constructed = construct.network.from.edge.list(vertices, edge.list, net.conf) + net.expected = igraph::graph.empty(n = 0, directed = directed) + + igraph::vertices(vertices.as.sequence) + + igraph::edges(edge.list.as.sequence, weight = 1) + + ## check equality + expect_true(igraph::identical_graphs(net.constructed, net.expected), label = "normal network construction") + + ## + ## edgeless network: NULL + ## + + net.constructed = construct.network.from.edge.list(vertices, NULL, net.conf) + expect_true(igraph::identical_graphs(net.constructed, net.edgeless), label = "edgeless network: NULL") + + ## + ## edgeless network: create.empty.edge.list() + ## + + net.constructed = construct.network.from.edge.list(vertices, create.empty.edge.list(), net.conf) + expect_true(igraph::identical_graphs(net.constructed, net.edgeless), label = "edgeless network: create.empty.edge.list()") + + ## + ## edgeless network: empty data.frame + ## + + net.constructed = construct.network.from.edge.list(vertices, data.frame(), net.conf) + expect_true(igraph::identical_graphs(net.constructed, net.edgeless), label = "edgeless network: empty data.frame") + +}) + diff --git a/util-networks.R b/util-networks.R index 1584aa3e..8d0eb5d3 100644 --- a/util-networks.R +++ b/util-networks.R @@ -1053,17 +1053,15 @@ construct.network.from.edge.list = function(vertices, edge.list, network.conf, d return(create.empty.network(directed = directed)) } - ## if we have nodes to create, but no edges + ## if we have nodes to create, but no edges, create an empty edge list if (is.null(edge.list) || nrow(edge.list) == 0) { - ## create network with only the vertices - net = igraph::graph.empty(n = 0, directed = directed) + igraph::vertices(t(nodes.processed)) - } - ## if we have nodes and edges - else { - ## construct network from edge list - net = igraph::graph.data.frame(edge.list, directed = directed, vertices = nodes.processed) + edge.list = create.empty.edge.list() } + ## construct network from edge list + 
net = igraph::graph.data.frame(edge.list, directed = directed, vertices = nodes.processed) + + ## initialize edge weights net = igraph::set.edge.attribute(net, "weight", value = 1) ## transform multiple edges to edge weights @@ -1100,7 +1098,7 @@ merge.network.data = function(vertex.data, edge.data) { edges = plyr::rbind.fill(edge.data.filtered) ## 3) correct empty results if (is.null(edges)) { - edges = data.frame(from = character(0), to = character(0)) + edges = create.empty.edge.list() } logging::logdebug("merge.network.data: finished.") From dc8873e5e8d68718047e3538bd5c3300577602aa Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Tue, 18 Dec 2018 10:52:43 +0100 Subject: [PATCH 20/31] Update changelog. Signed-off-by: Jakob Kronawitter --- NEWS.md | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/NEWS.md b/NEWS.md index 141443e4..2d6e446a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,32 +3,42 @@ ## Unversioned ### Added -- In addition to the ProjectConf parameter `artifact.filter.base`, which configured whether the base artifact should be -included in the `get.commits.filtered` method, there is now the similiar parameter `filter.untracked.files` which does -the same thing for untracked files (11428d9847fd44f982cd094a3248bd13fb6b7b58) +- In addition to the ProjectConf parameter `commits.filter.base.artifact` (previously called `artifact.filter.base`), +which configured whether the base artifact should be included in the `get.commits.filtered` method, there is now a +similiar parameter called `commits.filter.untracked.files` which does the same thing for untracked files +(11428d9847fd44f982cd094a3248bd13fb6b7b58) - Edges are not being constructed in the author network between authors that only modify untracked files. 
For authors -it can be configured if the edges should be created or not using the new NetworkConf parameter `base.artifact.edges` -(c60c2f6e44b6f34cccb2714eccc7674158c83dde) -- The public `get.commits.filtered.uncached` method was added which allows for external filtering of the commits by +it can be configured if the edges should be created or not using the new NetworkConf parameter +`edges.for.base.artifacts` (c60c2f6e44b6f34cccb2714eccc7674158c83dde) +- The public `get.commits.filtered.uncached` method is added which allows for external filtering of the commits by specifying if untracked files and/or the base artifact should be filtered (this method does not take advantage of caching, whereas the `get.commits.filtered` method does) (11428d9847fd44f982cd094a3248bd13fb6b7b58) +- The helper function `get.empty.dataframe` is introduced which returns empty dataframes (0 rows) with correct columnns +and, if specified, all the correct datatypes. In the future, functions, that return data in dataframes, should always +return dataframes of the same shape (regarding columns and datatypes) - especially when they are empty - because this +makes later case distinctions easier or unncessary (67a4fbe4f244b4b6047c2c2be7682d7f9085e9eb) ### Changed/Improved - Change shape of `Vertices` in the legend of plots to avoid confusion (f4fb4807cfd87d9d552a9ede92ea65ae4a386a04) -- Commits that do not change any artifact are considered to be carried out on a metafile called `untracked.file` +- The ProjectConf's configuration parameter `artifact.filter.base` is renamed to `commits.filter.base.artifact` +(466d8eb8e7f39e43985d825636af85ddfe54b13a) +- Commits that do not change any artifact are considered to be carried out on a metafile called `` (11428d9847fd44f982cd094a3248bd13fb6b7b58) - Remove `get.commits.raw`, `set.commits.raw` and `read.commits.raw` functions (64a94863c9e70ac8c75e443bc15cd7facbf2111d, c26e582e4ad6bf1eaeb08202fc3e00394332a013) -- Removed `get.commits.filtered.empty` and 
corresponding `filter.commits.empty` method, the functionality has been moved -to the altered `get.commits.filtered` and `filter.commits` method respectively (11428d9847fd44f982cd094a3248bd13fb6b7b58) +- Remove `get.commits.filtered.empty` and corresponding `filter.commits.empty` method, the functionality is moved to the +altered `get.commits.filtered` and `filter.commits` method respectively (11428d9847fd44f982cd094a3248bd13fb6b7b58) - The `filter.commits` method now takes parameters which configure if untracked files and/or the base artifact should be filtered out (11428d9847fd44f982cd094a3248bd13fb6b7b58) - Filtering by artifact kind (e.g. filtering out either Feature or FeatureExpression) is now being done in the `get.commits` method instead of the `get.commits.filtered` method (894c9a5c181fef14dcb71fa23699bebbcbcd2b4f) +- The `NetworkConf` and the `ProjectConf` now print out an error message and stop whenever it is attempted to set +non-existing configuration parameters (ec8c6dd72746a0506b3e03dccc4fcaf7a03325ea) ### Fixed - Fix error when resetting an `ProjectData` environment (c64cab84e928a2a4c89a6df12440ba7ca06e6263) - +- Fix bug which lead to wrong network construction in the case of networks with more than one node but no edges +(#150, 0d7c2226da67f3537f3ff9d013607fe19df8a4c0) ## 3.4 From 137d8337bc35f5a83aa16a48ef8e47fc0d36b36c Mon Sep 17 00:00:00 2001 From: Claus Hunsen Date: Tue, 18 Dec 2018 18:25:56 +0100 Subject: [PATCH 21/31] Fix setting authors in co-change-based author networks This patch consists of three related fixes and adaptations: First, the method 'ProjectData$get.authors.by.data.source' does not correct the column names of the returned data.frame anymore. This establishes compatibility with the method 'ProjectData$get.authors'. Additionally, the returned data.frame only contains unique entries. The documentation is tidied.
Second, the method 'NetworkBuilder$get.author.network.cochange' is fixed by adding the missing 'private$' prefix when accessing the project data. Third, the assignment of author vertices is corrected to use only author names with the correct vertex attribute (i.e., "name"). This adapts the code with respect to the first change mentioned above. This change fixes all failing tests in PR #149. Signed-off-by: Claus Hunsen --- util-data.R | 18 +++++++++++------- util-networks.R | 12 +++++++++--- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/util-data.R b/util-data.R index 91f3999e..a0fdfa95 100644 --- a/util-data.R +++ b/util-data.R @@ -948,13 +948,16 @@ ProjectData = R6::R6Class("ProjectData", return(mylist) }, - #' Get the list of authors by only looking at the specified data source. The constant - #' \code{DATASOURCE.TO.ARTIFACT.FUNCTION} describes the mapping between data source and the method which is - #' retrieving the data for each data source. + #' Get the list of authors by only looking only at the specified data source. #' - #' @param data.source the data source which can be either \code{"commits"}, \code{"mails"} or \code{"issues"} + #' *Note*: The constant \code{DATASOURCE.TO.ARTIFACT.FUNCTION} denotes the mapping between + #' data source and the method which is retrieving the data for each data source. 
#' - #' @return the list of authors extracted from the specified data source + #' @param data.source the data source which can be either \code{"commits"}, \code{"mails"}, + #' or \code{"issues"} + #' + #' @return a data.frame of unique author names (columns \code{name} and \code{author.email}), + #' extracted from the specified data source get.authors.by.data.source = function(data.source = c("commits", "mails", "issues")) { if (is.null(data.source)) { stop ("Data source can't be null.") @@ -962,9 +965,10 @@ ProjectData = R6::R6Class("ProjectData", data.source = match.arg(data.source) data.source.func = DATASOURCE.TO.ARTIFACT.FUNCTION[[data.source]] - data = self[[data.source.func]]()[c("author.name", "author.email")] - names(data) = c("name", "email") + + ## remove duplicates + data = unique(data) return (data) } diff --git a/util-networks.R b/util-networks.R index 8d0eb5d3..b60365b0 100644 --- a/util-networks.R +++ b/util-networks.R @@ -175,9 +175,15 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", ## Add author vertices back into the graph. Previously, commit information on untracked files ## ('UNTRACKED.FILE') and, if configured, the base artifact ('BASE.ARTIFACTS') has been removed and, hence, ## also corresponding author information. Re-add author vertices back to the network now by accessing the - ## complete author list. 
- authors = proj.data$get.authors.by.data.source(data.source = "commits") - author.net.data[["vertices"]] = authors["name"] + ## complete author list: + ## 1) get all authors on commits + authors = private$proj.data$get.authors.by.data.source(data.source = "commits") + ## 2) only select author names + authors = authors["author.name"] + ## 3) rename single column to "name" to correct mapping to vertex attribute "name" + colnames(authors) = "name" + ## 4) set author list as vertices + author.net.data[["vertices"]] = authors ## construct network from obtained data author.net = construct.network.from.edge.list( From e709786fbff2ba080410b12b8724952058d750c0 Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Wed, 19 Dec 2018 20:50:16 +0100 Subject: [PATCH 22/31] Update README Signed-off-by: Jakob Kronawitter --- README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b90a663c..b0b0112f 100644 --- a/README.md +++ b/README.md @@ -482,9 +482,11 @@ There is no way to update the entries, except for the revision-based parameters. **Note**: These parameters can be configured using the method `ProjectConf$update.values()`. -- `artifact.filter.base` - * Remove all artifact information regarding the base artifact - (`"Base_Feature"` or `"File_Level"` for features and functions, respectively, as artifacts) +- `commits.filter.base.artifact` + * Remove all information concerning the base artifact from the commit data. This effect becomes clear when retrieving commits using `get.commits.filtered`, because then the result of which does not contain any commit information about changes to the base artifact. Networks built on top of this `ProjectData` do also not contain any base artifact information anymore. + * [*`TRUE`*, `FALSE`] +- `commits.filter.untracked.files` + * Remove all information concerning untracked files from the commit data. 
This effect becomes clear when retrieving commits using `get.commits.filtered`, because then the result of which does not contain any commits that solely changed untracked files. Networks built on top of this `ProjectData` do also not contain any information about untracked files. * [*`TRUE`*, `FALSE`] - `issues.only.comments` * Only use comments from the issue data on disk and no further events such as references and label changes @@ -552,6 +554,9 @@ Updates to the parameters can be done by calling `NetworkConf$update.variables(. * **Note**: `"date"` and `"artifact.type"` are always included as this information is needed for several parts of the library, e.g., time-based splitting. * **Note**: For each type of network that can be built, only the applicable part of the given vector of names is respected. * **Note**: For the edge attributes `"pasta"` and `"synchronicity"`, the project configuration's parameters `pasta` and `synchronicity` need to be set to `TRUE`, respectively (see below). 
+- `edges.for.base.artifacts` + * Controls whether edges should be drawn between authors for being involved in committing to the base artifact + * [*`TRUE`*, `FALSE`] - `simplify` * Perform edge contraction to retrieve a simplified network * [`TRUE`, *`FALSE`*] From a5802b07ee8b87a0bba35eed49919caa2cf4af25 Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Thu, 20 Dec 2018 14:08:37 +0100 Subject: [PATCH 23/31] Update documentation and showcase.R Signed-off-by: Jakob Kronawitter --- showcase.R | 1 + util-data.R | 43 ++++++++++++++++++++++--------------------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/showcase.R b/showcase.R index 8936508b..455839c8 100644 --- a/showcase.R +++ b/showcase.R @@ -127,6 +127,7 @@ y = NetworkBuilder$new(project.data = y.data, network.conf = net.conf) # y.data$get.synchronicity() # y.data$group.artifacts.by.data.column("commits", "author.name") # y.data$get.commits.filtered() +# y.data$get.commits.filtered.uncached(remove.untracked.files = TRUE, remove.base.artifact = FALSE) # y.data$get.mails() # y.data$get.authors() # y.data$get.data.path() diff --git a/util-data.R b/util-data.R index a0fdfa95..6723e8ed 100644 --- a/util-data.R +++ b/util-data.R @@ -35,14 +35,15 @@ requireNamespace("parallel") # for parallel computation ## untracked file UNTRACKED.FILE = "" -# the empty string which resides in the artifact column when artifact == feature or artifact == file +## the empty string which resides in the artifact column when artifact == feature or artifact == function +## in the 'ProjectConf' UNTRACKED.FILE.EMPTY.ARTIFACT = "" -## base artifacts +## base artifacts (which one actually applies, depends on the artifact parameter in the 'ProjectConf') BASE.ARTIFACTS = c( - "Base_Feature", - "File_Level", - UNTRACKED.FILE + "Base_Feature", ## when artifact == feature + "File_Level", ## when artifact == function + UNTRACKED.FILE ## when artifact == file ) ## mapping of data source to artifact column (for commits: filter 
artifacts based on the configuration options @@ -92,13 +93,13 @@ ProjectData = R6::R6Class("ProjectData", ## * * filtering commits ------------------------------------------- - #' Filter commits retrieved by the \code{get.commits} method by removing untracked files and removing the base - #' artifact (see parameters). + #' Filter commits retrieved by the method \code{get.commits} after potentially removing untracked files and the + #' base artifact (see parameters). #' - #' @param remove.untracked.files configures if untracked files should be kept or removed - #' @param remove.base.artifact configures if the base artifact should be kept or removed + #' @param remove.untracked.files flag whether untracked files are kept or removed + #' @param remove.base.artifact flag whether the base artifact is kept or removed #' - #' @return the commits retrieved by the \code{get.commits} method after all filters have been applied + #' @return the commits retrieved by the method \code{get.commits} after all filters have been applied filter.commits = function(remove.untracked.files, remove.base.artifact) { logging::logdebug("filter.commits: starting.") @@ -330,13 +331,13 @@ ProjectData = R6::R6Class("ProjectData", ## * * raw data ---------------------------------------------------- - #' Return the commits retrieved by the \code{get.commits} method by removing untracked files and removing the + #' Return the commits retrieved by the method \code{get.commits} by removing untracked files and removing the #' base artifact (if configured in the \code{project.conf}, see parameters \code{commits.filter.untracked.files} #' and \code{commits.filter.base.artifact}). #' #' This method caches the filtered commits to the field \code{commits.filtered}. 
#' - #' @return the commits retrieved by the \code{get.commits} method after all filters have been applied + #' @return the commits retrieved by the method \code{get.commits} after all filters have been applied #' #' @seealso get.commits.filtered.uncached get.commits.filtered = function() { @@ -349,16 +350,16 @@ ProjectData = R6::R6Class("ProjectData", return(private$commits.filtered) }, - #' Return the commits retrieved by the \code{get.commits} method by removing untracked files and removing the + #' Return the commits retrieved by the method \code{get.commits} by removing untracked files and removing the #' base artifact (see parameters). #' - #' This method doesn't use caching. If you want to use caching, please use the \code{get.commits.filtered} - #' method instead. + #' This method does not use caching. If you want to use caching, please use the method + #' \code{get.commits.filtered} instead. #' - #' @param remove.untracked.files configures if untracked files should be kept or removed - #' @param remove.base.artifact configures if the base artifact should be kept or removed + #' @param remove.untracked.files flag whether untracked files are kept or removed + #' @param remove.base.artifact flag whether the base artifact is kept or removed #' - #' @return the commits retrieved by the \code{get.commits} method after all filters have been applied + #' @return the commits retrieved by the method \code{get.commits} after all filters have been applied #' #' @seealso get.commits.filtered get.commits.filtered.uncached = function(remove.untracked.files, remove.base.artifact) { @@ -367,7 +368,7 @@ ProjectData = R6::R6Class("ProjectData", #' Get the list of commits which have the artifact kind configured in the \code{project.conf}. #' If the list of commits is not cached in the field \code{commits}, call the read method first. - #' If configured in the field \code{project.conf}, add PaStA and synchronicity data. 
+ #' If configured in the \code{project.conf}, add PaStA and synchronicity data. #' #' @return the list of commits get.commits = function() { @@ -393,7 +394,7 @@ ProjectData = R6::R6Class("ProjectData", }, #' Set the commit list of the project to a new one. - #' Add PaStA and sychronicity data if configured in the field \code{project.conf}. + #' Add PaStA and sychronicity data if configured in the \code{project.conf}. #' #' @param commit.data the new list of commits set.commits = function(commit.data) { @@ -960,7 +961,7 @@ ProjectData = R6::R6Class("ProjectData", #' extracted from the specified data source get.authors.by.data.source = function(data.source = c("commits", "mails", "issues")) { if (is.null(data.source)) { - stop ("Data source can't be null.") + stop ("Data source can not be null.") } data.source = match.arg(data.source) From 67dcf31e3f54a1c2bbacdd0511558fbcd27dca67 Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Thu, 20 Dec 2018 14:37:11 +0100 Subject: [PATCH 24/31] Rename variable 'list' to 'author.groups' and adjust documentation Signed-off-by: Jakob Kronawitter --- util-networks.R | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/util-networks.R b/util-networks.R index b60365b0..ecd98aa7 100644 --- a/util-networks.R +++ b/util-networks.R @@ -154,19 +154,22 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", return(private$authors.network.cochange) } - list = private$proj.data$group.authors.by.data.column("commits", "artifact") - - ## if configured in the network conf, remove base artifacts, so that no edges are created in the next step + ## Get a list of all artifacts extracted from the commit data. Each artifact in this group is again a list + ## of all authors that were involved in making changes to this artifact. In the following two steps some of + ## the artifacts are filtered from this list, which removes all information (including author information) + ## about these artifacts. 
Since we only want to lose the edge information and not the information about + ## authors, they will explicitly be added in a later step. + author.groups = private$proj.data$group.authors.by.data.column("commits", "artifact") + ## 1) if configured in the 'NetworkConf, remove the base artifact if (!private$network.conf$get.value("edges.for.base.artifacts")) { - list = list[!(names(list) %in% BASE.ARTIFACTS)] + author.groups = author.groups[!(names(author.groups) %in% BASE.ARTIFACTS)] } - - ## remove untracked files, so that no edges are created in the next step - list = list[names(list) != UNTRACKED.FILE.EMPTY.ARTIFACT] + ## 2) in any case, remove the untracked files + author.groups = author.groups[names(author.groups) != UNTRACKED.FILE.EMPTY.ARTIFACT] ## construct edge list based on artifact2author data author.net.data = construct.edge.list.from.key.value.list( - list, + author.groups, network.conf = private$network.conf, directed = private$network.conf$get.value("author.directed"), respect.temporal.order = private$network.conf$get.value("author.respect.temporal.order") From 5f0f52936b4433f64fd9b1c9b2571eb26f66395f Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Thu, 20 Dec 2018 15:53:38 +0100 Subject: [PATCH 25/31] Add additional utility functions for easier empty dataframe creation The following four functions are introduced to make empty dataframe creation even easier: - create.empty.authors.list - create.empty.commits.list - create.empty.issues.list - create.empty.mails.list As the name implies, each function is creating and returning a dataframe with zero rows but the correct column names and datatypes (like a dataframe of the corresponding type would have if it was filled with authors, commits, issues or mails, respectively). In addition, all the column names and column datatypes are made publicly available by defining them as constants.
Signed-off-by: Jakob Kronawitter --- NEWS.md | 8 ++-- util-misc.R | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++-- util-read.R | 57 ++++++--------------------- 3 files changed, 122 insertions(+), 53 deletions(-) diff --git a/NEWS.md b/NEWS.md index 2d6e446a..9895c127 100644 --- a/NEWS.md +++ b/NEWS.md @@ -13,10 +13,10 @@ it can be configured if the edges should be created or not using the new Network - The public `get.commits.filtered.uncached` method is added which allows for external filtering of the commits by specifying if untracked files and/or the base artifact should be filtered (this method does not take advantage of caching, whereas the `get.commits.filtered` method does) (11428d9847fd44f982cd094a3248bd13fb6b7b58) -- The helper function `get.empty.dataframe` is introduced which returns empty dataframes (0 rows) with correct columnns -and, if specified, all the correct datatypes. In the future, functions, that return data in dataframes, should always -return dataframes of the same shape (regarding columns and datatypes) - especially when they are empty - because this -makes later case distinctions easier or unncessary (67a4fbe4f244b4b6047c2c2be7682d7f9085e9eb) +- The helper function `create.empty.data.frame` is introduced which returns empty dataframes (0 rows) with correct +columnns and, if specified, all the correct datatypes. 
In the future, functions, that return data in dataframes, should +always return dataframes of the same shape (regarding columns and datatypes) - especially when they are empty - because +this makes later case distinctions easier or unncessary (67a4fbe4f244b4b6047c2c2be7682d7f9085e9eb) ### Changed/Improved - Change shape of `Vertices` in the legend of plots to avoid confusion (f4fb4807cfd87d9d552a9ede92ea65ae4a386a04) diff --git a/util-misc.R b/util-misc.R index 58d88b11..9e68368e 100644 --- a/util-misc.R +++ b/util-misc.R @@ -136,15 +136,118 @@ match.arg.or.default = function(arg, choices, default = NULL, several.ok = FALSE ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Empty dataframe creation------------------------------------------------- -#' Create an empty data.frame with the specified columns. Unless all columns should have the default datatype +## * Constants ------------------------------------------------------------- + +## column names of a dataframe containing authors (based on the Codeface extraction, see the following SQL statement) +## +## SELECT a.name AS authorName, a.email1, m.creationDate, m.subject, m.threadId +AUTHORS.LIST.COLUMNS = c( + "author.id", "author.name", "author.email" +) + +## declare the datatype for each column in the constant 'AUTHORS.LIST.COLUMNS' +AUTHORS.LIST.DATA.TYPES = c( + "character", "character", "character" +) + +## column names of a dataframe containing commits (based on the Codeface extraction, see the following SQL statement) +## +## SELECT c.id, c.authorDate, a.name, a.email1, +## c.commitDate, acom.name, acom.email1, +## c.commitHash, c.ChangedFiles, c.AddedLines, c.DeletedLines, c.DiffSize, +## cd.file, cd.entityId, cd.entityType, cd.size +COMMITS.LIST.COLUMNS = c( + "commit.id", # id + "date", "author.name", "author.email", # author information + "committer.date", "committer.name", "committer.email", # committer information + "hash", "changed.files", "added.lines", 
"deleted.lines", "diff.size", # commit information + "file", "artifact", "artifact.type", "artifact.diff.size" ## commit-dependency information +) + +## declare the datatype for each column in the constant 'COMMITS.LIST.COLUMNS' +COMMITS.LIST.DATA.TYPES = c( + "character", + "POSIXct", "character", "character", + "POSIXct", "character", "character", + "character", "numeric", "numeric", "numeric", "numeric", + "character", "character", "character", "numeric" +) + +## column names of a dataframe containing issues +ISSUES.LIST.COLUMNS = c( + "issue.id", "issue.state", "creation.date", "closing.date", "is.pull.request", # issue information + "author.name", "author.email", # author information + "date", # the date + "ref.name", "event.name" # the event describing the row's entry +) + +## declare the datatype for each column in the constant 'ISSUES.LIST.COLUMNS' +ISSUES.LIST.DATA.TYPES = c( + "character", "character", "POSIXct", "POSIXct", "logical", + "character", "character", + "POSIXct", + "character", "character" +) + +## column names of a dataframe containing mails (based on the Codeface extraction, see the following SQL statement) +## +## SELECT a.name AS authorName, a.messageId, a.email1, m.creationDate, m.subject, m.threadId +MAILS.LIST.COLUMNS = c( + "author.name", "author.email", # author information + "message.id", "date", "date.offset", "subject", # meta information + "thread" # thread ID +) + +## declare the datatype for each column in the constant 'MAILS.LIST.COLUMNS' +MAILS.LIST.DATA.TYPES = c( + "character", "character", + "character", "POSIXct", "numeric", "character", + "numeric" +) + +## * Functions ------------------------------------------------------------- + +#' Create an empty dataframe which has the same shape as a dataframe containing authors. The dataframe has the column +#' names and column datatypes defined in \code{AUTHORS.LIST.COLUMNS} and \code{AUTHORS.LIST.DATA.TYPEs}, respectively. 
+#' +#' @return the empty dataframe +create.empty.authors.list = function() { + return (create.empty.data.frame(AUTHORS.LIST.COLUMNS, AUTHORS.LIST.DATA.TYPES)) +} + +#' Create an empty dataframe which has the same shape as a dataframe containing commits. The dataframe has the column +#' names and column datatypes defined in \code{COMMITS.LIST.COLUMNS} and \code{COMMITS.LIST.DATA.TYPEs}, respectively. +#' +#' @return the empty dataframe +create.empty.commits.list = function() { + return (create.empty.data.frame(COMMITS.LIST.COLUMNS, COMMITS.LIST.DATA.TYPES)) +} + +#' Create an empty dataframe which has the same shape as a dataframe containing issues. The dataframe has the column +#' names and column datatypes defined in \code{ISSUES.LIST.COLUMNS} and \code{ISSUES.LIST.DATA.TYPEs}, respectively. +#' +#' @return the empty dataframe +create.empty.issues.list = function() { + return (create.empty.data.frame(ISSUES.LIST.COLUMNS, ISSUES.LIST.DATA.TYPES)) +} + +#' Create an empty dataframe which has the same shape as a dataframe containing mails. The dataframe has the column +#' names and column datatypes defined in \code{MAILS.LIST.COLUMNS} and \code{MAILS.LIST.DATA.TYPEs}, respectively. +#' +#' @return the empty dataframe +create.empty.mails.list = function() { + return (create.empty.data.frame(MAILS.LIST.COLUMNS, MAILS.LIST.DATA.TYPES)) +} + +#' Create an empty dataframe with the specified columns. Unless all columns should have the default datatype #' \code{logical}, the second parameter \code{data.types} should specify the datatypes. 
#' #' @param columns a character vector containing all the column names #' @param data.types a character vector of the same length as \code{columns}, the datatypes can be \code{integer}, #' \code{numeric}, \code{POSIXct}, \code{character}, \code{factor} or \code{logical} #' -#' @return the newly created empty data.frame -get.empty.dataframe = function(columns, data.types = NULL) { +#' @return the newly created empty dataframe +create.empty.data.frame = function(columns, data.types = NULL) { ## if the vector data.types is specified, its length must match the length of the corresponding column names if (!is.null(data.types) && length(data.types) != length(columns)) { @@ -194,6 +297,7 @@ get.empty.dataframe = function(columns, data.types = NULL) { return(data.frame) } + ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Stacktrace -------------------------------------------------------------- diff --git a/util-read.R b/util-read.R index 9c1d6ce9..235ab9f8 100644 --- a/util-read.R +++ b/util-read.R @@ -48,44 +48,24 @@ read.commits = function(data.path, artifact) { commit.data = try(read.table(file, header = FALSE, sep = ";", strip.white = TRUE, encoding = "UTF-8"), silent = TRUE) - ## prepare proper column names based on Codeface extraction: - ## - ## SELECT c.id, c.authorDate, a.name, a.email1, c.commitDate, - ## acom.name, acom.email1, c.commitHash, - ## c.ChangedFiles, c.AddedLines, c.DeletedLines, c.DiffSize, - ## cd.file, cd.entityId, cd.entityType, cd.size - commit.data.columns = c( - "commit.id", # id - "date", "author.name", "author.email", # author information - "committer.date", "committer.name", "committer.email", # committer information - "hash", "changed.files", "added.lines", "deleted.lines", "diff.size", # commit information - "file", "artifact", "artifact.type", "artifact.diff.size" ## commit-dependency information - ) - ## handle the case that the list of commits is empty if (inherits(commit.data, "try-error")) { 
logging::logwarn("There are no commits available for the current environment.") logging::logwarn("Datapath: %s", data.path) # return a dataframe with the correct columns but zero rows - commit.data = get.empty.dataframe(commit.data.columns, data.types = - c("character", - "POSIXct", "character", "character", - "POSIXct", "character", "character", - "character", "numeric", "numeric", "numeric", "numeric", - "character", "character", "character", "numeric")) - return(commit.data) + return(create.empty.commits.list()) } ## assign prepared column names to the dataframe - colnames(commit.data) = commit.data.columns + colnames(commit.data) = COMMITS.LIST.COLUMNS ## remove duplicated lines (even if they contain different commit ids but the same commit hash) commit.data = commit.data[rownames(unique(commit.data[, -1])), ] ## aggregate lines which are identical except for the "artifact.diff.size" column (ignoring the commit id) ## 1) select columns which have to be identical - primary.columns = commit.data.columns[!(commit.data.columns %in% c("commit.id", "artifact.diff.size"))] + primary.columns = COMMITS.LIST.COLUMNS[!(COMMITS.LIST.COLUMNS %in% c("commit.id", "artifact.diff.size"))] ## 2) aggregate "artifact.diff.size" for identical rows of the selected columns commit.data.without.id = aggregate(commit.data["artifact.diff.size"], commit.data[primary.columns], @@ -97,7 +77,7 @@ read.commits = function(data.path, artifact) { ## 4) merge the data again to have both "commit.id" and "artifact.diff.size" in one data.frame again commit.data = merge(commit.data.without.id, commit.data.without.artifact.diff.size) ## 5) reorder the columns of the data.frame as their order might be changed during aggregating and merging - commit.data = commit.data[, commit.data.columns] + commit.data = commit.data[, COMMITS.LIST.COLUMNS] ## rewrite data.frame when we want file-based data ## (we have proximity-based data as foundation) @@ -222,17 +202,11 @@ read.mails = function(data.path) { if 
(inherits(mail.data, "try-error")) { logging::logwarn("There are no mails available for the current environment.") logging::logwarn("Datapath: %s", data.path) - return(data.frame()) + return(create.empty.mails.list()) } - ## set proper column names based on Codeface extraction: - ## - ## SELECT a.name AS authorName, a.messageId, a.email1, m.creationDate, m.subject, m.threadId - colnames(mail.data) = c( - "author.name", "author.email", # author information - "message.id", "date", "date.offset", "subject", # meta information - "thread" # thread ID - ) + + colnames(mail.data) = MAILS.LIST.COLUMNS ## set pattern for thread ID for better recognition mail.data[["thread"]] = sprintf("", mail.data[["thread"]]) @@ -292,15 +266,11 @@ read.authors = function(data.path) { stop("Stopped due to missing authors.") } - ## set proper column names based on Codeface extraction: - ## - ## SELECT a.name AS authorName, a.email1, m.creationDate, m.subject, m.threadId - authors.df.columns = c("author.id", "author.name", "author.email") ## if there is no third column, we need to add e-mail-address dummy data (NAs) - if (ncol(authors.df) != length(authors.df.columns)) { + if (ncol(authors.df) != length(AUTHORS.LIST.COLUMNS)) { authors.df[3] = NA } - colnames(authors.df) = authors.df.columns + colnames(authors.df) = AUTHORS.LIST.COLUMNS ## store the ID--author mapping logging::logdebug("read.authors: finished.") @@ -397,16 +367,11 @@ read.issues = function(data.path) { if (inherits(issue.data, "try-error")) { logging::logwarn("There are no Github issue data available for the current environment.") logging::logwarn("Datapath: %s", data.path) - return(data.frame()) + return(create.empty.issues.list()) } ## set proper column names - colnames(issue.data) = c( - "issue.id", "issue.state", "creation.date", "closing.date", "is.pull.request", # issue information - "author.name", "author.email", # author information - "date", # the date - "ref.name", "event.name" # the event describing the row's 
entry - ) + colnames(issue.data) = ISSUES.LIST.COLUMNS ## set pattern for issue ID for better recognition issue.data[["issue.id"]] = sprintf("", issue.data[["issue.id"]]) From 6043e5c59107f851159e726b4ecc9de1a0673e0f Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Thu, 20 Dec 2018 16:21:24 +0100 Subject: [PATCH 26/31] Change null checking behaviour of two methods Change the behaviour of method 'set.commits' and 'get.authors.by.data.source' whenever a 'NULL' is passed as parameter. Signed-off-by: Jakob Kronawitter --- util-data.R | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/util-data.R b/util-data.R index 6723e8ed..14e47597 100644 --- a/util-data.R +++ b/util-data.R @@ -400,20 +400,23 @@ ProjectData = R6::R6Class("ProjectData", set.commits = function(commit.data) { logging::loginfo("Setting commit data.") - if (!is.null(commit.data)) { + # TODO: Also check for correct shape (column names and data types) of the passed data - ## append synchronicity data if wanted - if (private$project.conf$get.value("synchronicity")) { - synchronicity.data = self$get.synchronicity() - commit.data = merge(commit.data, synchronicity.data, - by = "hash", all.x = TRUE, sort = FALSE) - } + if (is.null(commit.data)) { + commit.data = create.empty.commits.list(); + } - ## add PaStA data if wanted - if (private$project.conf$get.value("pasta")) { - self$get.pasta() - commit.data = private$add.pasta.data(commit.data) - } + ## append synchronicity data if wanted + if (private$project.conf$get.value("synchronicity")) { + synchronicity.data = self$get.synchronicity() + commit.data = merge(commit.data, synchronicity.data, + by = "hash", all.x = TRUE, sort = FALSE) + } + + ## add PaStA data if wanted + if (private$project.conf$get.value("pasta")) { + self$get.pasta() + commit.data = private$add.pasta.data(commit.data) } private$commits = commit.data @@ -960,11 +963,10 @@ ProjectData = R6::R6Class("ProjectData", #' @return a data.frame 
of unique author names (columns \code{name} and \code{author.email}), #' extracted from the specified data source get.authors.by.data.source = function(data.source = c("commits", "mails", "issues")) { - if (is.null(data.source)) { - stop ("Data source can not be null.") - } data.source = match.arg(data.source) + + ## retrieve author names from chosen data source data.source.func = DATASOURCE.TO.ARTIFACT.FUNCTION[[data.source]] data = self[[data.source.func]]()[c("author.name", "author.email")] From 418d1dc4929ad1df251d2aeb833dd45757b04a6f Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Mon, 7 Jan 2019 18:09:50 +0100 Subject: [PATCH 27/31] Update README Signed-off-by: Jakob Kronawitter --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b0b0112f..be9e7f2f 100644 --- a/README.md +++ b/README.md @@ -555,7 +555,7 @@ Updates to the parameters can be done by calling `NetworkConf$update.variables(. * **Note**: For each type of network that can be built, only the applicable part of the given vector of names is respected. * **Note**: For the edge attributes `"pasta"` and `"synchronicity"`, the project configuration's parameters `pasta` and `synchronicity` need to be set to `TRUE`, respectively (see below). - `edges.for.base.artifacts` - * Controls whether edges should be drawn between authors for being involved in committing to the base artifact + * Controls whether edges should be drawn between authors for being involved in authoring commits to the base artifact. This parameter does not have any effect if the base artifact was filtered beforehand (e.g., when `commits.filter.base.artifact == TRUE`, or, when `commits.filter.untracked.files == TRUE` and `artifact == FILE`; all of these options can be configured in the `ProjectConf`; warning: `commits.filter.base.artifact` and `commits.filter.untracked.files` are `TRUE` by default). 
* [*`TRUE`*, `FALSE`] - `simplify` * Perform edge contraction to retrieve a simplified network From 523daef8cf4642a2360396b11f0d74bce565b0f0 Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Mon, 7 Jan 2019 18:10:26 +0100 Subject: [PATCH 28/31] Move empty dataframe creation utility functions into util-read.R Signed-off-by: Jakob Kronawitter --- util-misc.R | 104 +---------------------------------------------- util-networks.R | 2 +- util-read.R | 106 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 104 deletions(-) diff --git a/util-misc.R b/util-misc.R index 9e68368e..c230e080 100644 --- a/util-misc.R +++ b/util-misc.R @@ -133,112 +133,10 @@ match.arg.or.default = function(arg, choices, default = NULL, several.ok = FALSE } } + ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Empty dataframe creation------------------------------------------------- -## * Constants ------------------------------------------------------------- - -## column names of a dataframe containing authors (based on the Codeface extraction, see the following SQL statement) -## -## SELECT a.name AS authorName, a.email1, m.creationDate, m.subject, m.threadId -AUTHORS.LIST.COLUMNS = c( - "author.id", "author.name", "author.email" -) - -## declare the datatype for each column in the constant 'AUTHORS.LIST.COLUMNS' -AUTHORS.LIST.DATA.TYPES = c( - "character", "character", "character" -) - -## column names of a dataframe containing commits (based on the Codeface extraction, see the following SQL statement) -## -## SELECT c.id, c.authorDate, a.name, a.email1, -## c.commitDate, acom.name, acom.email1, -## c.commitHash, c.ChangedFiles, c.AddedLines, c.DeletedLines, c.DiffSize, -## cd.file, cd.entityId, cd.entityType, cd.size -COMMITS.LIST.COLUMNS = c( - "commit.id", # id - "date", "author.name", "author.email", # author information - "committer.date", "committer.name", "committer.email", # committer information - "hash", 
"changed.files", "added.lines", "deleted.lines", "diff.size", # commit information - "file", "artifact", "artifact.type", "artifact.diff.size" ## commit-dependency information -) - -## declare the datatype for each column in the constant 'COMMITS.LIST.COLUMNS' -COMMITS.LIST.DATA.TYPES = c( - "character", - "POSIXct", "character", "character", - "POSIXct", "character", "character", - "character", "numeric", "numeric", "numeric", "numeric", - "character", "character", "character", "numeric" -) - -## column names of a dataframe containing issues -ISSUES.LIST.COLUMNS = c( - "issue.id", "issue.state", "creation.date", "closing.date", "is.pull.request", # issue information - "author.name", "author.email", # author information - "date", # the date - "ref.name", "event.name" # the event describing the row's entry -) - -## declare the datatype for each column in the constant 'ISSUES.LIST.COLUMNS' -ISSUES.LIST.DATA.TYPES = c( - "character", "character", "POSIXct", "POSIXct", "logical", - "character", "character", - "POSIXct", - "character", "character" -) - -## column names of a dataframe containing mails (based on the Codeface extraction, see the following SQL statement) -## -## SELECT a.name AS authorName, a.messageId, a.email1, m.creationDate, m.subject, m.threadId -MAILS.LIST.COLUMNS = c( - "author.name", "author.email", # author information - "message.id", "date", "date.offset", "subject", # meta information - "thread" # thread ID -) - -## declare the datatype for each column in the constant 'MAILS.LIST.COLUMNS' -MAILS.LIST.DATA.TYPES = c( - "character", "character", - "character", "POSIXct", "numeric", "character", - "numeric" -) - -## * Functions ------------------------------------------------------------- - -#' Create an empty dataframe which has the same shape as a dataframe containing authors. The dataframe has the column -#' names and column datatypes defined in \code{AUTHORS.LIST.COLUMNS} and \code{AUTHORS.LIST.DATA.TYPEs}, respectively. 
-#' -#' @return the empty dataframe -create.empty.authors.list = function() { - return (create.empty.data.frame(AUTHORS.LIST.COLUMNS, AUTHORS.LIST.DATA.TYPES)) -} - -#' Create an empty dataframe which has the same shape as a dataframe containing commits. The dataframe has the column -#' names and column datatypes defined in \code{COMMITS.LIST.COLUMNS} and \code{COMMITS.LIST.DATA.TYPEs}, respectively. -#' -#' @return the empty dataframe -create.empty.commits.list = function() { - return (create.empty.data.frame(COMMITS.LIST.COLUMNS, COMMITS.LIST.DATA.TYPES)) -} - -#' Create an empty dataframe which has the same shape as a dataframe containing issues. The dataframe has the column -#' names and column datatypes defined in \code{ISSUES.LIST.COLUMNS} and \code{ISSUES.LIST.DATA.TYPEs}, respectively. -#' -#' @return the empty dataframe -create.empty.issues.list = function() { - return (create.empty.data.frame(ISSUES.LIST.COLUMNS, ISSUES.LIST.DATA.TYPES)) -} - -#' Create an empty dataframe which has the same shape as a dataframe containing mails. The dataframe has the column -#' names and column datatypes defined in \code{MAILS.LIST.COLUMNS} and \code{MAILS.LIST.DATA.TYPEs}, respectively. -#' -#' @return the empty dataframe -create.empty.mails.list = function() { - return (create.empty.data.frame(MAILS.LIST.COLUMNS, MAILS.LIST.DATA.TYPES)) -} - #' Create an empty dataframe with the specified columns. Unless all columns should have the default datatype #' \code{logical}, the second parameter \code{data.types} should specify the datatypes. #' diff --git a/util-networks.R b/util-networks.R index ecd98aa7..aefa8e02 100644 --- a/util-networks.R +++ b/util-networks.R @@ -155,7 +155,7 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", } ## Get a list of all artifacts extracted from the commit data. Each artifact in this group is again a list - ## of all authors that were involved in making changes to this artifact. 
In the following two steps some of + ## of all authors that were involved in making changes to this artifact. In the following two steps, some of ## the artifacts are filtered from this list, which removes all information (including author information) ## about these artifacts. Since we only want to lose the edge information and not the information about ## authors, they will explicitly be added in a later step. diff --git a/util-read.R b/util-read.R index 235ab9f8..0b0737c7 100644 --- a/util-read.R +++ b/util-read.R @@ -30,6 +30,112 @@ requireNamespace("digest") # for sha1 hashing of IDs requireNamespace("sqldf") # for SQL-selections on data.frames +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Constants --------------------------------------------------------------- + +## column names of a dataframe containing authors (based on the Codeface extraction, see the following SQL statement) +## +## SELECT a.name AS authorName, a.email1, m.creationDate, m.subject, m.threadId +AUTHORS.LIST.COLUMNS = c( + "author.id", "author.name", "author.email" +) + +## declare the datatype for each column in the constant 'AUTHORS.LIST.COLUMNS' +AUTHORS.LIST.DATA.TYPES = c( + "character", "character", "character" +) + +## column names of a dataframe containing commits (based on the Codeface extraction, see the following SQL statement) +## +## SELECT c.id, c.authorDate, a.name, a.email1, +## c.commitDate, acom.name, acom.email1, +## c.commitHash, c.ChangedFiles, c.AddedLines, c.DeletedLines, c.DiffSize, +## cd.file, cd.entityId, cd.entityType, cd.size +COMMITS.LIST.COLUMNS = c( + "commit.id", # id + "date", "author.name", "author.email", # author information + "committer.date", "committer.name", "committer.email", # committer information + "hash", "changed.files", "added.lines", "deleted.lines", "diff.size", # commit information + "file", "artifact", "artifact.type", "artifact.diff.size" ## commit-dependency information +) + +## declare the datatype 
for each column in the constant 'COMMITS.LIST.COLUMNS' +COMMITS.LIST.DATA.TYPES = c( + "character", + "POSIXct", "character", "character", + "POSIXct", "character", "character", + "character", "numeric", "numeric", "numeric", "numeric", + "character", "character", "character", "numeric" +) + +## column names of a dataframe containing issues +ISSUES.LIST.COLUMNS = c( + "issue.id", "issue.state", "creation.date", "closing.date", "is.pull.request", # issue information + "author.name", "author.email", # author information + "date", # the date + "ref.name", "event.name" # the event describing the row's entry +) + +## declare the datatype for each column in the constant 'ISSUES.LIST.COLUMNS' +ISSUES.LIST.DATA.TYPES = c( + "character", "character", "POSIXct", "POSIXct", "logical", + "character", "character", + "POSIXct", + "character", "character" +) + +## column names of a dataframe containing mails (based on the Codeface extraction, see the following SQL statement) +## +## SELECT a.name AS authorName, a.messageId, a.email1, m.creationDate, m.subject, m.threadId +MAILS.LIST.COLUMNS = c( + "author.name", "author.email", # author information + "message.id", "date", "date.offset", "subject", # meta information + "thread" # thread ID +) + +## declare the datatype for each column in the constant 'MAILS.LIST.COLUMNS' +MAILS.LIST.DATA.TYPES = c( + "character", "character", + "character", "POSIXct", "numeric", "character", + "numeric" +) + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Empty dataframe creation------------------------------------------------- + +#' Create an empty dataframe which has the same shape as a dataframe containing authors. The dataframe has the column +#' names and column datatypes defined in \code{AUTHORS.LIST.COLUMNS} and \code{AUTHORS.LIST.DATA.TYPES}, respectively. 
+#' +#' @return the empty dataframe +create.empty.authors.list = function() { + return (create.empty.data.frame(AUTHORS.LIST.COLUMNS, AUTHORS.LIST.DATA.TYPES)) +} + +#' Create an empty dataframe which has the same shape as a dataframe containing commits. The dataframe has the column +#' names and column datatypes defined in \code{COMMITS.LIST.COLUMNS} and \code{COMMITS.LIST.DATA.TYPES}, respectively. +#' +#' @return the empty dataframe +create.empty.commits.list = function() { + return (create.empty.data.frame(COMMITS.LIST.COLUMNS, COMMITS.LIST.DATA.TYPES)) +} + +#' Create an empty dataframe which has the same shape as a dataframe containing issues. The dataframe has the column +#' names and column datatypes defined in \code{ISSUES.LIST.COLUMNS} and \code{ISSUES.LIST.DATA.TYPES}, respectively. +#' +#' @return the empty dataframe +create.empty.issues.list = function() { + return (create.empty.data.frame(ISSUES.LIST.COLUMNS, ISSUES.LIST.DATA.TYPES)) +} + +#' Create an empty dataframe which has the same shape as a dataframe containing mails. The dataframe has the column +#' names and column datatypes defined in \code{MAILS.LIST.COLUMNS} and \code{MAILS.LIST.DATA.TYPES}, respectively. 
+#' +#' @return the empty dataframe +create.empty.mails.list = function() { + return (create.empty.data.frame(MAILS.LIST.COLUMNS, MAILS.LIST.DATA.TYPES)) +} + + ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Commit data ------------------------------------------------------------- From f8281c7cf253bc4abf2880caaadeafd65800f4eb Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Wed, 9 Jan 2019 14:25:31 +0100 Subject: [PATCH 29/31] Adjust comments for the column names of commonly used dataframes Previously, next to the declaration of column names for commonly used dataframes (dataframes containing commits, mails, etc.), there was an SQL statement describing how the data was retrieved before it is read by the network library (in the tool 'codeface-extraction'). These SQL statements are now removed and instead, it is referred to the tool 'codeface-extraction' itself on GitHub. Signed-off-by: Jakob Kronawitter --- util-read.R | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/util-read.R b/util-read.R index 0b0737c7..ea8520a9 100644 --- a/util-read.R +++ b/util-read.R @@ -33,9 +33,11 @@ requireNamespace("sqldf") # for SQL-selections on data.frames ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Constants --------------------------------------------------------------- -## column names of a dataframe containing authors (based on the Codeface extraction, see the following SQL statement) -## -## SELECT a.name AS authorName, a.email1, m.creationDate, m.subject, m.threadId +## The following definition of column names for each individual data source corresponds to the individual extraction +## process of the tool 'codeface-extraction' (https://github.com/se-passau/codeface-extraction; use commit 0700f94 or +## compatible later commit). 
+ +## column names of a dataframe containing authors (see file 'authors.list' and function \code{read.authors}) AUTHORS.LIST.COLUMNS = c( "author.id", "author.name", "author.email" ) @@ -45,12 +47,7 @@ AUTHORS.LIST.DATA.TYPES = c( "character", "character", "character" ) -## column names of a dataframe containing commits (based on the Codeface extraction, see the following SQL statement) -## -## SELECT c.id, c.authorDate, a.name, a.email1, -## c.commitDate, acom.name, acom.email1, -## c.commitHash, c.ChangedFiles, c.AddedLines, c.DeletedLines, c.DiffSize, -## cd.file, cd.entityId, cd.entityType, cd.size +## column names of a dataframe containing commits (see file 'commits.list' and function \code{read.commits}) COMMITS.LIST.COLUMNS = c( "commit.id", # id "date", "author.name", "author.email", # author information @@ -68,7 +65,7 @@ COMMITS.LIST.DATA.TYPES = c( "character", "character", "character", "numeric" ) -## column names of a dataframe containing issues +## column names of a dataframe containing issues (see file 'issues.list' and function \code{read.issues}) ISSUES.LIST.COLUMNS = c( "issue.id", "issue.state", "creation.date", "closing.date", "is.pull.request", # issue information "author.name", "author.email", # author information @@ -84,9 +81,7 @@ ISSUES.LIST.DATA.TYPES = c( "character", "character" ) -## column names of a dataframe containing mails (based on the Codeface extraction, see the following SQL statement) -## -## SELECT a.name AS authorName, a.messageId, a.email1, m.creationDate, m.subject, m.threadId +## column names of a dataframe containing mails (see file 'mails.list' and function \code{read.mails}) MAILS.LIST.COLUMNS = c( "author.name", "author.email", # author information "message.id", "date", "date.offset", "subject", # meta information From 01217a8399fb87633cb589753bbcd238cc8b601c Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Wed, 9 Jan 2019 15:38:33 +0100 Subject: [PATCH 30/31] Update changelog Signed-off-by: Jakob Kronawitter --- 
NEWS.md | 48 +++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/NEWS.md b/NEWS.md index 9895c127..afbba39d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,40 +5,54 @@ ### Added - In addition to the ProjectConf parameter `commits.filter.base.artifact` (previously called `artifact.filter.base`), which configured whether the base artifact should be included in the `get.commits.filtered` method, there is now a -similiar parameter called `commits.filter.untracked.files` which does the same thing for untracked files -(11428d9847fd44f982cd094a3248bd13fb6b7b58) -- Edges are not being constructed in the author network between authors that only modify untracked files. For authors -it can be configured if the edges should be created or not using the new NetworkConf parameter -`edges.for.base.artifacts` (c60c2f6e44b6f34cccb2714eccc7674158c83dde) +similar parameter called `commits.filter.untracked.files` doing the same thing for untracked files +(11428d9847fd44f982cd094a3248bd13fb6b7b58, 466d8eb8e7f39e43985d825636af85ddfe54b13a) - The public `get.commits.filtered.uncached` method is added which allows for external filtering of the commits by specifying if untracked files and/or the base artifact should be filtered (this method does not take advantage of caching, whereas the `get.commits.filtered` method does) (11428d9847fd44f982cd094a3248bd13fb6b7b58) +- Commits that do not change any artifact are considered to be carried out on a metafile called ``. +The constant `UNTRACKED.FILE` was added to the file `util-data.R` and holds the string constant ``. +(11428d9847fd44f982cd094a3248bd13fb6b7b58, 5ea65b9ac5a22967de87d7fd4ac66b0bc8e07238) +- In an author network, edges do not get constructed anymore between authors for solely modifying untracked files.
For +authors involved in changing the base artifact, it can be configured whether edges should be created or not using the +new NetworkConf parameter `edges.for.base.artifacts` +(c60c2f6e44b6f34cccb2714eccc7674158c83dde, 466d8eb8e7f39e43985d825636af85ddfe54b13a) +- A new constant named `UNTRACKED.FILE.EMPTY.ARTIFACT` has been introduced in the `util-data.R` which simply holds an +empty string. If used in the intended context, this constant (and thus this empty string) denominates the empty artifact, +which is now called `` (see the constant `UNTRACKED.FILE`). The empty string was chosen, as this is the +way that untracked files were named in the file `commits.list` coming from the tool `codeface-extraction` +(dde0dd7c6b36b49aa2b6c91395be8ea6e0cd7969) - The helper function `create.empty.data.frame` is introduced which returns empty dataframes (0 rows) with correct columnns and, if specified, all the correct datatypes. In the future, functions, that return data in dataframes, should always return dataframes of the same shape (regarding columns and datatypes) - especially when they are empty - because this makes later case distinctions easier or unncessary (67a4fbe4f244b4b6047c2c2be7682d7f9085e9eb) +- For the most common types of dataframes (dataframes of commits, mails, issues and authors) four more utility methods +were added, namely `create.empty.authors.list`, `create.empty.commits.list`, `create.empty.issues.list`, +`create.empty.mails.list` as well as corresponding constants holding columns and associated datatypes for all these +empty dataframes (5f0f52936b4433f64fd9b1c9b2571eb26f66395f, 523daef8cf4642a2360396b11f0d74bce565b0f0) +- Add method `ProjectData$get.authors.by.data.source` to retrieve authors by given data-source name (#149, 65804276dd2ada9b2f00b2cab7b6ad0cecbe733e, 137d8337bc35f5a83aa16a48ef8e47fc0d36b36c) ### Changed/Improved +- Rename `ProjectConf` parameter `artifact.filter.base` to `commits.filter.base.artifact` (PR #149, 
466d8eb8e7f39e43985d825636af85ddfe54b13a) - Change shape of `Vertices` in the legend of plots to avoid confusion (f4fb4807cfd87d9d552a9ede92ea65ae4a386a04) -- The ProjectConf's configuration parameter `artifact.filter.base` is renamed to `commits.filter.base.artifact` -(466d8eb8e7f39e43985d825636af85ddfe54b13a) -- Commits that do not change any artifact are considered to be carried out on a metafile called `` -(11428d9847fd44f982cd094a3248bd13fb6b7b58) - Remove `get.commits.raw`, `set.commits.raw` and `read.commits.raw` functions (64a94863c9e70ac8c75e443bc15cd7facbf2111d, c26e582e4ad6bf1eaeb08202fc3e00394332a013) -- Remove `get.commits.filtered.empty` and corresponding `filter.commits.empty` method, the functionality is moved to the -altered `get.commits.filtered` and `filter.commits` method respectively (11428d9847fd44f982cd094a3248bd13fb6b7b58) -- The `filter.commits` method now takes parameters which configure if untracked files and/or the base artifact should be -filtered out (11428d9847fd44f982cd094a3248bd13fb6b7b58) - Filtering by artifact kind (e.g. filtering out either Feature or FeatureExpression) is now being done in the `get.commits` method instead of the `get.commits.filtered` method (894c9a5c181fef14dcb71fa23699bebbcbcd2b4f) -- The `NetworkConf` and the `ProjectConf` now print out an error message and stop whenever it is attempted to set -non-existing configuration parameters (ec8c6dd72746a0506b3e03dccc4fcaf7a03325ea) +- Remove `get.commits.filtered.empty` and corresponding `filter.commits.empty` method, the functionality is now included +into the methods `get.commits.filtered` and `filter.commits` respectively (11428d9847fd44f982cd094a3248bd13fb6b7b58) +- The constant `BASE.ARTIFACTS` in the file `util-data.R` was extended by adding untracked files (i.e. the new metafile +`UNTRACKED.FILE`), which is now considered to be a new base artifact in the case of file level analyses. 
This implies, +that in case of file level analyses the base artifact and the untracked files fall together, while in feature and +function level analyses they are treated differently (d11d0fb585397fdb3a2641484248f74752db9331) +- The `filter.commits` method now takes parameters which configure if untracked files and/or the base artifact should be +filtered out (11428d9847fd44f982cd094a3248bd13fb6b7b58) +- In the class `Conf` (and its sub-classes `NetworkConf` and `ProjectConf`), default parameters are not validated anymore to avoid confusion by logging output (ec8c6dd72746a0506b3e03dccc4fcaf7a03325ea) +- In the class `Conf` (and its sub-classes `NetworkConf` and `ProjectConf`), `stop` is called on errors during parameter updates now (ec8c6dd72746a0506b3e03dccc4fcaf7a03325ea) ### Fixed - Fix error when resetting an `ProjectData` environment (c64cab84e928a2a4c89a6df12440ba7ca06e6263) -- Fix bug which lead to wrong network construction in the case of networks with more than one node but no edges -(#150, 0d7c2226da67f3537f3ff9d013607fe19df8a4c0) +- Fix vertices for networks without edges (#150, PR #149, 0d7c2226da67f3537f3ff9d013607fe19df8a4c0, 7e27a182de282f054f08e3a2fb04d852c2c55102) ## 3.4 From ae58902d0eced539003ed1867a430a9d51f65ba5 Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Mon, 14 Jan 2019 23:07:09 +0100 Subject: [PATCH 31/31] Adjust copyright headers Signed-off-by: Jakob Kronawitter --- tests/test-networks-artifact.R | 1 + tests/test-networks-author.R | 1 + tests/test-networks-bipartite.R | 1 + tests/test-networks-covariates.R | 1 + 4 files changed, 4 insertions(+) diff --git a/tests/test-networks-artifact.R b/tests/test-networks-artifact.R index 50678ddc..bdf926f7 100644 --- a/tests/test-networks-artifact.R +++ b/tests/test-networks-artifact.R @@ -14,6 +14,7 @@ ## Copyright 2017-2018 by Christian Hechtl ## Copyright 2017 by Claus Hunsen ## Copyright 2018 by Barbara Eckl +## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved.
diff --git a/tests/test-networks-author.R b/tests/test-networks-author.R index 31c5b6e5..fa5ad3fb 100644 --- a/tests/test-networks-author.R +++ b/tests/test-networks-author.R @@ -16,6 +16,7 @@ ## Copyright 2017 by Felix Prasse ## Copyright 2018 by Barbara Eckl ## Copyright 2018 by Thomas Bock +## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved. diff --git a/tests/test-networks-bipartite.R b/tests/test-networks-bipartite.R index e7c3eae5..1c99f0d7 100644 --- a/tests/test-networks-bipartite.R +++ b/tests/test-networks-bipartite.R @@ -15,6 +15,7 @@ ## Copyright 2017-2018 by Claus Hunsen ## Copyright 2018 by Barbara Eckl ## Copyright 2018 by Thomas Bock +## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved. diff --git a/tests/test-networks-covariates.R b/tests/test-networks-covariates.R index 09197189..b6c504db 100644 --- a/tests/test-networks-covariates.R +++ b/tests/test-networks-covariates.R @@ -16,6 +16,7 @@ ## Copyright 2017-2018 by Claus Hunsen ## Copyright 2018 by Thomas Bock ## Copyright 2018 by Klara Schlüter +## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved.