From c136b1f6127d73c25f08ae2f317246747aa9ea2b Mon Sep 17 00:00:00 2001
From: Anselm Fehnker
Date: Sun, 20 Oct 2019 17:44:53 +0200
Subject: [PATCH 01/37] Add preliminary methods to calculate EDCPTD centrality

Methods that enable calculating the EDCPTD centrality for a project are
added to a provisional file 'util-tensor.R'. These include methods that
retrieve the needed single-layer developer networks, a method that
builds a fourth-order tensor from these single-layer networks, and a
method that calculates the EDCPTD centrality using this tensor.

Signed-off-by: Anselm Fehnker
---
 install.R     |   4 +-
 util-init.R   |   1 +
 util-tensor.R | 187 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 191 insertions(+), 1 deletion(-)
 create mode 100644 util-tensor.R

diff --git a/install.R b/install.R
index 2a9d47d6..3b4c4758 100644
--- a/install.R
+++ b/install.R
@@ -37,7 +37,9 @@ packages = c(
     "markovchain",
     "lubridate",
     "viridis",
-    "jsonlite"
+    "jsonlite",
+    "rTensor",
+    "Matrix"
 )
 
 
diff --git a/util-init.R b/util-init.R
index df6db710..aeb1224b 100644
--- a/util-init.R
+++ b/util-init.R
@@ -62,3 +62,4 @@ source("util-core-peripheral.R")
 source("util-networks-metrics.R")
 source("util-networks-covariates.R")
 source("util-plot-evaluation.R")
+source("util-tensor.R")
diff --git a/util-tensor.R b/util-tensor.R
new file mode 100644
index 00000000..7edcc1fd
--- /dev/null
+++ b/util-tensor.R
@@ -0,0 +1,187 @@
+# function that manages the process to calculate EDCPTD centrality for a project
+EDCPTD.centrality = function(network.builder){
+
+    # get the single-layer developer networks
+    networks = get.author.networks(network.builder)
+
+    # get and order all active developers from the single-layer nteworks
+    active.developers = get.developers.from.networks(networks)
+    active.developers = active.developers[order(active.developers)]
+
+    # build the tensor representing the single-layer networks
+    tensor = build.tensor.from.adjacency(networks, active.developers)
+
+    # calculate EDCPTD centrality
+    edcptd = calculate.EDCPTD.centrality(active.developers, tensor)
+
+    # return a data frame containing the names of the active developers and their EDCPTD score
+    return(edcptd)
+}
+
+
+
+# function that gets the single-layer developer networks for the mail, cochange and issue data for the project.
+# A list containing all available single-layer networks is returned.
+get.author.networks = function(network.builder){ + + networks = list() + + # get the single-layer mail developer network if available + network.builder$update.network.conf(updated.values = list(author.relation = "mail")) + + mail.network = network.builder$get.author.network() + + + # only if the mail data is available for the project, the network is added to the list + if(vcount(mail.network) > 0){ + networks = union(networks, list(mail.network)) + } + + # get the single-layer cochange developer network if available + network.builder$update.network.conf(updated.values = list(author.relation = "cochange")) + + cochange.network = network.builder$get.author.network() + + # only if the cochange data is available for the project, the network is added to the list + if(vcount(cochange.network) > 0){ + networks = union(networks, list(cochange.network)) + } + + # get the single-layer issue developer network if available + network.builder$update.network.conf(updated.values = list(author.relation = "issue")) + + issue.network = network.builder$get.author.network() + + # only if the issue data is available for the project, the network is added to the list + if(vcount(issue.network) > 0){ + networks = union(networks, list(issue.network)) + } + + return(networks) +} + +# Get an ordered vector all developers (i.e. their names) who are in at least one of the networks. If globally == FALSE, +# get a seperate list for each network which contains all the authors in the specific network. +get.developers.from.networks = function(networks, globally = TRUE) { + + # for each network, get a list of authors that are in this network + active.authors.list = lapply(networks, function(network) { + active.authors = V(network)$name + return (active.authors) + }) + + if (globally) { + # flatten the list of lists to one list of authors + active.authors = unlist(active.authors.list, recursive = FALSE) + + # remove distracting named list members + names(active.authors) = NULL + + # remove duplicates and order alphabetically ascending + active.authors = active.authors[!duplicated(active.authors)] + active.authors = active.authors[order(active.authors)] + return (active.authors) + } else { + return (active.authors.list) + } +} + +# function that builds a forth-order tensor containing the same information as the single-layer developer networks +build.tensor.from.adjacency = function(networks, active.developers){ + + # calculate dimensions for the forth-order tensor + number.layers = length(networks) + number.nodes = length(active.developers) + + # get the adjacency matrices of the single-layer developer networks + adjacency.matrices = parallel::mclapply(networks, get.expanded.adjacency, active.developers) + + # create an array with the size of the forth-order tensor that only contains zeros + array <-array(0, dim = c(number.nodes, number.layers, number.nodes, number.layers)) + + # the entries from every adjacency matrix are transfered to the array + for (l in 1:length(adjacency.matrices)) { + + mat = as(adjacency.matrices[[l]], "dgTMatrix") + + for (entry in 1:length(mat@x)) { + array[mat@i[entry]+1, l, mat@j[entry]+1, l] =mat@x[entry] + } + } + + # the array is converted into a tensor + tensor <- rTensor::as.tensor(array) + + return(tensor) + +} + + +# The expanded adjacency is a 3-dimensional matrix where each slice represents +# one time window (i.e. one network in the given list of networks). The rows +# and vertices are the developers who are active in at +# least one network. 
It is possible to choose whether to consider edge weights.
+#
+# If a fields value is 0, the two developers were not linked to each other, if
+# it is 1 (any positive integer for the weighted version) both developers were
+# active and a link between them existed (with the indicated edge weight).
+get.expanded.adjacency = function(network, authors, weighted = FALSE){
+
+    # create the matrix for this time step in the appropriate format, adding developer names
+    matrix <- sparseMatrix(i = c(), j = c(), dims = c(length(authors), length(authors)), giveCsparse = FALSE)
+    matrix <- as(matrix, "dgTMatrix")
+    rownames(matrix) <- authors
+    colnames(matrix) <- authors
+
+    if(weighted && vcount(network)>0){
+        # get the weighted adjacency matrix for the current network
+        A <- get.adjacency(network, attr = "weight")
+    }else{
+        # get the unweighted adjacency matrix for the current network
+        A <- get.adjacency(network)
+    }
+
+    # order the adjacency matrix
+    if(nrow(A)>1){ # for a 1x1 matrix ordering doesn't work
+        A <- A[order(rownames(A)),order(colnames(A))]
+    }
+
+    # save the activity data per developer
+    if(nrow(A)>0){
+        matrix[rownames(A), colnames(A)] <- A
+    }
+
+    if(!weighted){
+        matrix[matrix > 0] <- 1
+    }
+
+    return(matrix)
+}
+
+# method that calculates the EDCPTD centrality from the tensor
+calculate.EDCPTD.centrality = function(authors, tensor){
+
+    # create data frame for results
+    data = data.frame(
+        names = authors
+    )
+
+    # decompose tensor
+    decomposition <-rTensor::cp(tensor, num_components = 1, max_iter = 50, tol = 1e-05)
+
+
+    # calculate EDCPTD centrality
+
+    data[["EDCPTD.score"]] = 0
+
+    for (y in 1:length(decomposition[["U"]][[2]][,1])) {
+        data[["EDCPTD.score"]] = data[["EDCPTD.score"]] + abs(decomposition[["U"]][[1]][,1]*decomposition[["U"]][[2]][,1][y]) + abs(decomposition[["U"]][[3]][,1]*decomposition[["U"]][[4]][,1][y])
+    }
+
+    data[["EDCPTD.score"]] = data[["EDCPTD.score"]]/2
+
+    return(data)
+}
+
+
+

From e4ee0dc926b22ff75d5fd801c1f131bcff4c22eb Mon Sep 17 00:00:00 2001
From: fehnkera
Date: Sat, 8 Aug 2020 17:38:19 +0200
Subject: [PATCH 02/37] Change process of calculating EDCPTD centrality

The process of computing EDCPTD centrality is changed. In the first
step, the author network for each relation is built. In the second
step, the author networks are converted to a fourth-order tensor, and
in the third step, EDCPTD centrality is computed. Changes are based on
the reviews from @clhunsen, @bockthom and @ecklbarb.
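For illustration, the resulting three-step usage looks roughly as
follows (mirroring the showcase.R hunk below; 'x' is assumed to be an
already configured NetworkBuilder instance):

    ## (1) build the author network for each relation
    author.networks = get.author.networks(x, c("cochange", "mail", "issue"))
    ## (2) convert the author networks to a fourth-order tensor
    forth.order.tensor = ForthOrderTensor$new(author.networks)
    ## (3) compute EDCPTD centrality from the tensor
    edcptd.scores = calculate.EDCPTD.centrality(forth.order.tensor)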
Signed-off-by: Anselm Fehnker
---
 showcase.R    |  11 ++
 util-tensor.R | 367 +++++++++++++++++++++++++++++--------------------
 2 files changed, 228 insertions(+), 150 deletions(-)

diff --git a/showcase.R b/showcase.R
index 8a2828a5..01668a7d 100644
--- a/showcase.R
+++ b/showcase.R
@@ -122,6 +122,17 @@ x = NetworkBuilder$new(project.data = x.data, network.conf = net.conf)
 # net = x$get.author.network()
 # save(net, file = sprintf("busybox_%s.network", x$get.network.conf.variable(var.name = "author.relation")))
 
+## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
+## Calculate EDCPTD centrality ---------------------------------------------
+
+## get author networks for each relation
+author.networks = get.author.networks(x, c("cochange", "mail", "issue"))
+
+## create forth-order tensor
+forth.order.tensor = ForthOrderTensor$new(author.networks)
+
+## calculate EDCPTD scores
+edcptd.scores = calculate.EDCPTD.centrality(forth.order.tensor)
 
 ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
 ## Range-level data --------------------------------------------------------

diff --git a/util-tensor.R b/util-tensor.R
index 7edcc1fd..d7d0439f 100644
--- a/util-tensor.R
+++ b/util-tensor.R
@@ -1,187 +1,254 @@
-# function that manages the process to calculate EDCPTD centrality for a project
-EDCPTD.centrality = function(network.builder){
+## This file is part of coronet, which is free software: you
+## can redistribute it and/or modify it under the terms of the GNU General
+## Public License as published by the Free Software Foundation, version 2.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+##
+## Copyright 2020 by Anselm Fehnker
+## All Rights Reserved.
+
+## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
+## ForthOrderTensor class ------------------------------------------------
+
+#' The class \code{ForthOrderTensor} creates an (author x relation x author x relation)
+#' tensor from a list of networks. The tensor as well as the dimensions and lists
+#' of the authors and relations are stored.
+#'
+ForthOrderTensor = R6::R6Class("ForthOrderTensor",
+
+    ## * private ----------------------------------------------------------
+
+    private = list(
+        ## * * data ---------------------------------------------------------
+
+        dim = NULL,
+        relations = NULL,
+        authors = NULL,
+        tensor = NULL,
+
+        ## * * tensor creation ----------------------------------------------
+
+        #' Creates a forth-order tensor from a list of networks using their
+        #' adjacency matrices.
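+        #' Each network with index l contributes its adjacency matrix to the
+        #' tensor entries (i, l, j, l), i.e., entries that would couple two
+        #' different relations remain zero.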
+ #' + #' @param networks the list of networks + #' + #' @return the created tensor + build.tensor.from.networks = function(networks, weighted = FALSE) { + + ## get adjacency matrices from networks + adjacency.matrices = parallel::mclapply(networks, get.expanded.adjacency, private$authors, weighted) + + ## create an array with the size of the forth-order tensor that only contains zeros + array <-array(0, dim = private$dim) + + ## transfer entries from adjacency matrices to array + for (l in 1:length(adjacency.matrices)) { + + matrix = as(adjacency.matrices[[l]], "dgTMatrix") + + for (entry in 1:length(matrix@x)) { + array[matrix@i[entry]+1, l, matrix@j[entry]+1, l] = matrix@x[entry] + } + } + + ## convert array to tensor + tensor <- rTensor::as.tensor(array) + + return(tensor) + } + ), + + ## * * public ---------------------------------------------------------- + + public = list( + + #' Constructor of the class. Constructs a new forth-order tensor instance + #' based on the given list of networks. + #' + #' @param networks the given list of networks + #' @param weighted bool if the tensor shall be weighted + initialize = function(networks, weighted = FALSE) { + + private$relations = names(networks) + private$authors = get.author.names.from.networks(networks) + private$dim = c(length(private$authors), length(private$relations), length(private$authors), length(private$relations)) + private$tensor = private$build.tensor.from.networks(networks, weighted) + + }, + + #' Get the list of authors of the tensor. + #' + #' @return the list of authors + get.authors = function() { + return(private$authors) + }, + + #' Get the list of relations of the tensor. + #' + #' @return the list of relations + get.relations = function() { + return(private$relations) + }, + + #' Get the tensor data saved in the object. + #' + #' @return the tensor data + get.tensor = function() { + return(private$tensor) + } + + ) +) + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Get author networks ----------------------------------------------------- + +#' Get a list of author networks for each relation. +#' If a relation is not available for the current project, it is not added to the list. +#' +#' @param network.builder the network builder for the project +#' @param relations the relations of the wanted networks +#' +#' @return the list of networks +get.author.networks = function(network.builder, relations) { + + networks = list() - # get the single-layer developer networks - networks = get.author.networks(network.builder) - - # get and order all active developers from the single-layer nteworks - active.developers = get.developers.from.networks(networks) - active.developers = active.developers[order(active.developers)] - - # build the tensor representing the single-layer networks - tensor = build.tensor.from.adjacency(networks, active.developers) - - # calculate EDCPTD centrality - edcptd = calculate.EDCPTD.centrality(active.developers, tensor) - - # return a data frame containing the names of the active developers and their EDCPTD score - return(edcptd) -} - - - -# function that gets the single-layer developer networks for the mail, cochange and issue data for the project. -# A list containing all available single-layer networks is returned. 
-get.author.networks = function(network.builder){ - - networks = list() - - # get the single-layer mail developer network if available - network.builder$update.network.conf(updated.values = list(author.relation = "mail")) - - mail.network = network.builder$get.author.network() - - - # only if the mail data is available for the project, the network is added to the list - if(vcount(mail.network) > 0){ - networks = union(networks, list(mail.network)) - } - - # get the single-layer cochange developer network if available - network.builder$update.network.conf(updated.values = list(author.relation = "cochange")) - - cochange.network = network.builder$get.author.network() - - # only if the cochange data is available for the project, the network is added to the list - if(vcount(cochange.network) > 0){ - networks = union(networks, list(cochange.network)) + networks = lapply(relations, function(rel) { + + ## retrieve network for relation + network.builder$update.network.conf(updated.values = list(author.relation = rel)) + retrieved.network = network.builder$get.author.network() + + ## check if network is not empty + if(igraph::vcount(retrieved.network) > 0){ + logging::loginfo("Added %s data to list", rel) + return(retrieved.network) + } else { + logging::logwarn("There is no %s data available for the current project", rel) + return(NA) } + }) - # get the single-layer issue developer network if available - network.builder$update.network.conf(updated.values = list(author.relation = "issue")) - - issue.network = network.builder$get.author.network() + ## add names of the relations + names(networks) = relations - # only if the issue data is available for the project, the network is added to the list - if(vcount(issue.network) > 0){ - networks = union(networks, list(issue.network)) - } + ## removes empty networks + networks = networks[!is.na(networks)] - return(networks) + return(networks) } -# Get an ordered vector all developers (i.e. their names) who are in at least one of the networks. If globally == FALSE, -# get a seperate list for each network which contains all the authors in the specific network. -get.developers.from.networks = function(networks, globally = TRUE) { - - # for each network, get a list of authors that are in this network - active.authors.list = lapply(networks, function(network) { - active.authors = V(network)$name - return (active.authors) - }) - - if (globally) { - # flatten the list of lists to one list of authors - active.authors = unlist(active.authors.list, recursive = FALSE) +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Get active authors ----------------------------------------------------- - # remove distracting named list members - names(active.authors) = NULL - - # remove duplicates and order alphabetically ascending - active.authors = active.authors[!duplicated(active.authors)] - active.authors = active.authors[order(active.authors)] - return (active.authors) - } else { - return (active.authors.list) - } -} +#' Get all author names that are active in at least one of the networks. 
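+#' ("Active" means that the author appears as a vertex in the respective network.)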
+#' +#' @param networks the list of networks +#' +#' @return the list of author names +get.author.names.from.networks = function(networks) { -# function that builds a forth-order tensor containing the same information as the single-layer developer networks -build.tensor.from.adjacency = function(networks, active.developers){ + ## for each network, get a list of authors that are in this network + active.authors.list = lapply(networks, function(network) { + active.authors = igraph::V(network)$name + return (active.authors) + }) - # calculate dimensions for the forth-order tensor - number.layers = length(networks) - number.nodes = length(active.developers) + ## flatten the list of lists to one list of authors + active.authors = unlist(active.authors.list, recursive = FALSE) - # get the adjacency matrices of the single-layer developer networks - adjacency.matrices = parallel::mclapply(networks, get.expanded.adjacency, active.developers) + ## remove distracting named list members + names(active.authors) = NULL - # create an array with the size of the forth-order tensor that only contains zeros - array <-array(0, dim = c(number.nodes, number.layers, number.nodes, number.layers)) + ## remove duplicates + active.authors = active.authors[!duplicated(active.authors)] - # the entries from every adjacency matrix are transfered to the array - for (l in 1:length(adjacency.matrices)) { + ## order alphabetically ascending + active.authors = active.authors[order(active.authors)] - mat = as(adjacency.matrices[[l]], "dgTMatrix") + return (active.authors) +} - for (entry in 1:length(mat@x)) { - array[mat@i[entry]+1, l, mat@j[entry]+1, l] =mat@x[entry] - } - } +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Get adjacency matrix ---------------------------------------------------- - # the array is converted into a tensor - tensor <- rTensor::as.tensor(array) +#' Get a sparse adjacency matrix for a network. +#' +#' @param network the given network +#' @param authors all authors that are wanted in the adjacency matrix +#' @param weighted bool if the adjacency matrix shall be weighted +#' +#' @return the list of author names +get.expanded.adjacency = function(network, authors, weighted = FALSE) { - return(tensor) + ## create an empty sparse matrix with the right size + matrix = Matrix::sparseMatrix(i = c(), j = c(), dims = c(length(authors), length(authors)), giveCsparse = FALSE) + matrix = as(matrix, "dgTMatrix") -} + ## add row and column names + rownames(matrix) = authors + colnames(matrix) = authors + if(igraph::vcount(network) > 0) { -# The expanded adjacency is a 3-dimensional matrix where each slice represents -# one time window (i.e. one network in the given list of networks). The rows -# and vertices are the developers who are active in at -# least one network. It is possible to choose whether to consider edge weights. -# -# If a fields value is 0, the two developers were not linked to each other, if -# it is 1 (any positive integer for the weighted version) both developers were -# active and a link between them existed (with the indicated edge weight). 
-get.expanded.adjacency = function(network, authors, weighted = FALSE){ - - # create the matrix for this time step in the appropriate format, adding developer names - matrix <- sparseMatrix(i = c(), j = c(), dims = c(length(authors), length(authors)), giveCsparse = FALSE) - matrix <- as(matrix, "dgTMatrix") - rownames(matrix) <- authors - colnames(matrix) <- authors - - if(weighted && vcount(network)>0){ - # get the weighted adjacency matrix for the current network - A <- get.adjacency(network, attr = "weight") - }else{ - # get the unweighted adjacency matrix for the current network - A <- get.adjacency(network) + if(weighted) { + ## get the weighted adjacency matrix for the current network + matrix.data = igraph::get.adjacency(network, attr = "weight") + } else { + ## get the unweighted adjacency matrix for the current network + matrix.data = igraph::get.adjacency(network) } - # order the adjacency matrix - if(nrow(A)>1){ # for a 1x1 matrix ordering doesn't work - A <- A[order(rownames(A)),order(colnames(A))] + ## order the adjacency matrix + if(nrow(matrix.data)>1) { # for a 1x1 matrix ordering doesn't work + matrix.data = matrix.data[order(rownames(matrix.data)), order(colnames(matrix.data))] } - # save the activity data per developer - if(nrow(A)>0){ - matrix[rownames(A), colnames(A)] <- A + ## save the activity data per developer + if(nrow(matrix.data)>0) { + matrix[rownames(matrix.data), colnames(matrix.data)] = matrix.data } - if(!weighted){ + if(!weighted) { matrix[matrix > 0] <- 1 } - return(matrix) -} - -# method that calculates the EDCPTD centrality from the tensor -calculate.EDCPTD.centrality = function(authors, tensor){ + } - # create data frame for results - data = data.frame( - names = authors - ) + return(matrix) +} - # decompose tensor - decomposition <-rTensor::cp(tensor, num_components = 1, max_iter = 50, tol = 1e-05) +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Calculate centrality ---------------------------------------------------- +#' Calculate EDCPTD centrality for a given forth-order tensor. +#' +#' @param forth.order.tensor the given tensor +#' +#' @return data frame with EDCPTD score for every author +calculate.EDCPTD.centrality = function(forth.order.tensor) { - # calculate EDCPTD centrality + ## create data frame for results + results = data.frame(names = forth.order.tensor$get.authors(), EDCPTD.score = 0) - data[["EDCPTD.score"]] = 0 + ## decompose tensor + decomposition <-rTensor::cp(forth.order.tensor$get.tensor(), num_components = 1, max_iter = 50, tol = 1e-05) - for (y in 1:length(decomposition[["U"]][[2]][,1])) { - data[["EDCPTD.score"]] = data[["EDCPTD.score"]] + abs(decomposition[["U"]][[1]][,1]*decomposition[["U"]][[2]][,1][y]) +abs(decomposition[["U"]][[3]][,1]*decomposition[["U"]][[4]][,1][y]) - } + ## calculate EDCPTD centrality + for (y in 1:length(forth.order.tensor$get.relations())) { + results[["EDCPTD.score"]] = (results[["EDCPTD.score"]] + + abs(decomposition[["U"]][[1]][,1] * decomposition[["U"]][[2]][,1][y]) + + abs(decomposition[["U"]][[3]][,1] * decomposition[["U"]][[4]][,1][y]))/2 + } - data[["EDCPTD.score"]] = data[["EDCPTD.score"]]/2 - - return(data) + return(results) } - - - From 7acf76e80ab52e2836fdfb2f89d20576e1c08809 Mon Sep 17 00:00:00 2001 From: fehnkera Date: Sat, 8 Aug 2020 17:39:46 +0200 Subject: [PATCH 03/37] Add rTensor to 'README.md' The use of the library rTensor is added to the 'README.md' file. 
Signed-off-by: Anselm Fehnker
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index a325d7f4..255fe207 100644
--- a/README.md
+++ b/README.md
@@ -128,6 +128,7 @@ Alternatively, you can run `Rscript install.R` to install the packages.
 - `lubridate`: For convenient date conversion and parsing
 - `viridis`: For plotting of networks with nice colors
 - `jsonlite`: For parsing the issue data
+- `rTensor`: For calculating EDCPTD centrality
 
 ### Submodule
 

From 051a5f0287022f97e2367ed0e9591b9df9dbdb3d Mon Sep 17 00:00:00 2001
From: fehnkera
Date: Sat, 22 Aug 2020 17:27:27 +0200
Subject: [PATCH 04/37] Apply review from @bockthom and @clhunsen

Apply the review from @bockthom and @clhunsen to the previous changes.
This includes compliance with coding conventions, updates of the
copyright headers, and improvements to the documentation.

Move the functions 'get.author.names.from.networks' and
'get.expanded.adjacency' to the new file 'util-networks-misc.R'. Also
add the two functions 'get.author.names.from.data' and
'convert.adjacency.matrix.list.to.array' from the 'dev-network-growth'
project to the new file.

Signed-off-by: Anselm Fehnker
---
 README.md            |   5 +
 showcase.R           |   9 +-
 util-data.R          |  22 ++-
 util-init.R          |   2 +
 util-networks-misc.R | 238 +++++++++++++++++++++++++++++
 util-networks.R      |   3 +-
 util-tensor.R        | 349 ++++++++++++++++++------------------------
 7 files changed, 413 insertions(+), 215 deletions(-)
 create mode 100644 util-networks-misc.R

diff --git a/README.md b/README.md
index 255fe207..13d7cf36 100644
--- a/README.md
+++ b/README.md
@@ -129,6 +129,7 @@ Alternatively, you can run `Rscript install.R` to install the packages.
 - `viridis`: For plotting of networks with nice colors
 - `jsonlite`: For parsing the issue data
 - `rTensor`: For calculating EDCPTD centrality
+- `Matrix`: For sparse matrix representation of large adjacency matrices
 
 ### Submodule
 
@@ -410,6 +411,10 @@ Additionally, for more examples, the file `showcase.R` is worth a look.
     * Functionality to add vertex attributes to existing networks
 - `util-networks-metrics.R`
     * A set of network-metric functions
+- `util-networks-misc.R`
+    * Helper functions for network creation (e.g., create adjacency matrices)
+- `util-tensor.R`
+    * Functionality to build fourth-order tensors
 - `util-core-peripheral.R`
     * Author classification (core and peripheral) and related functions
 - `util-motifs.R`
diff --git a/showcase.R b/showcase.R
index 01668a7d..38bba0c5 100644
--- a/showcase.R
+++ b/showcase.R
@@ -18,6 +18,7 @@
 ## Copyright 2017-2018 by Thomas Bock
 ## Copyright 2018 by Jakob Kronawitter
 ## Copyright 2019 by Klara Schlueter
+## Copyright 2020 by Anselm Fehnker
 ## All Rights Reserved.
@@ -126,13 +127,13 @@ x = NetworkBuilder$new(project.data = x.data, network.conf = net.conf) ## Calculate EDCPTD centrality --------------------------------------------- ## get author networks for each relation -author.networks = get.author.networks(x, c("cochange", "mail", "issue")) +author.networks = get.author.networks.for.multiple.relations(x, c("cochange", "mail", "issue")) -## create forth-order tensor -forth.order.tensor = ForthOrderTensor$new(author.networks) +## create fourth-order tensor +fourth.order.tensor = FourthOrderTensor$new(author.networks) ## calculate EDCPTD scores -edcptd.scores = calculate.EDCPTD.centrality(forth.order.tensor) +edcptd.scores = calculate.EDCPTD.centrality(fourth.order.tensor) ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Range-level data -------------------------------------------------------- diff --git a/util-data.R b/util-data.R index e1e424df..8694ab35 100644 --- a/util-data.R +++ b/util-data.R @@ -18,7 +18,7 @@ ## Copyright 2017 by Felix Prasse ## Copyright 2017 by Ferdinand Frank ## Copyright 2018-2019 by Jakob Kronawitter -## Copyright 2019 by Anselm Fehnker +## Copyright 2019-2020 by Anselm Fehnker ## All Rights Reserved. @@ -1313,23 +1313,29 @@ ProjectData = R6::R6Class("ProjectData", return(mylist) }, - #' Get the list of authors by only looking only at the specified data source. + #' Get the list of authors for the specified data sources. #' #' *Note*: The constant \code{DATASOURCE.TO.ARTIFACT.FUNCTION} denotes the mapping between #' data source and the method which is retrieving the data for each data source. #' - #' @param data.source the data source which can be either \code{"commits"}, \code{"mails"}, - #' or \code{"issues"} [default: "commits"] + #' @param data.sources the data sources from which the authors should be retrieved, + #' can be either \code{"commits"}, \code{"mails"}, or \code{"issues"}, + #' or any combination of them [default: c("commits", "mails", "issues")] #' #' @return a data.frame of unique author names (columns \code{name} and \code{author.email}), #' extracted from the specified data source - get.authors.by.data.source = function(data.source = c("commits", "mails", "issues")) { + get.authors.by.data.source = function(data.sources = c("commits", "mails", "issues")) { - data.source = match.arg(data.source) + data.sources = match.arg.or.default(data.sources, several.ok = TRUE) ## retrieve author names from chosen data source - data.source.func = DATASOURCE.TO.ARTIFACT.FUNCTION[[data.source]] - data = self[[data.source.func]]()[c("author.name", "author.email")] + data = lapply(data.sources, function(data.source){ + data.source.func = DATASOURCE.TO.ARTIFACT.FUNCTION[[data.source]] + data.source.authors = self[[data.source.func]]()[c("author.name", "author.email")] + return (data.source.authors) + }) + + data = plyr::rbind.fill(data) ## remove duplicates data = unique(data) diff --git a/util-init.R b/util-init.R index aeb1224b..30c7cc2e 100644 --- a/util-init.R +++ b/util-init.R @@ -17,6 +17,7 @@ ## Copyright 2017 by Sofie Kemper ## Copyright 2017 by Felix Prasse ## Copyright 2019 by Klara Schlüter +## Copyright 2019-2020 by Anselm Fehnker ## All Rights Reserved. 
@@ -62,4 +63,5 @@ source("util-core-peripheral.R") source("util-networks-metrics.R") source("util-networks-covariates.R") source("util-plot-evaluation.R") +source("util-networks-misc.R") source("util-tensor.R") diff --git a/util-networks-misc.R b/util-networks-misc.R new file mode 100644 index 00000000..a98e999f --- /dev/null +++ b/util-networks-misc.R @@ -0,0 +1,238 @@ +## This file is part of coronet, which is free software: you +## can redistribute it and/or modify it under the terms of the GNU General +## Public License as published by the Free Software Foundation, version 2. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +## +## Copyright 2016 by Sofie Kemper +## Copyright 2016 by Claus Hunsen +## Copyright 2016-2018 by Thomas Bock +## Copyright 2017 by Angelika Schmid +## Copyright 2019 by Jakob Kronawitter +## Copyright 2019-2020 by Anselm Fehnker +## All Rights Reserved. + + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Libraries --------------------------------------------------------------- + +requireNamespace("parallel") # for parallel computation +requireNamespace("igraph") # networks +requireNamespace("Matrix") # for sparse matrices + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Get active authors ----------------------------------------------------- + +#' Get all author names that are active in at least one of the networks. +#' +#' @param networks the list of networks from which the author names are wanted +#' @param globally decides if all author names are in one list or in separate lists for each network [default: TRUE] +#' +#' @return the list of author names +get.author.names.from.networks = function(networks, globally = TRUE) { + + ## for each network, get a list of authors that are in this network + active.authors.list = lapply(networks, function(network) { + active.authors = igraph::V(network)$name + return(active.authors) + }) + + if (globally) { + ## flatten the list of lists to one list of authors + active.authors = unlist(active.authors.list, recursive = FALSE) + + ## remove distracting named list members + names(active.authors) = NULL + + ## remove duplicates and order alphabetically ascending + active.authors = active.authors[!duplicated(active.authors)] + active.authors = sort(active.authors) + return(active.authors) + } else { + return(active.authors.list) + } +} + +#' Get all author names that are active in at least one of the data sources during the data ranges. 
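+#' (An author is considered active in a range if they appear in the data of at
+#' least one of the given data sources within that range.)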
+#' +#' @param data.ranges the list of the data ranges +#' @param data.sources the data sources from which the author names should be retrieved, +#' can be either \code{"commits"}, \code{"mails"}, or \code{"issues"}, +#' or any combination of them [default: c("commits", "mails", "issues")] +#' @param globally decides if all author names are in one list or in separate for each network [default: TRUE] +#' +#' @return the list of author names +get.author.names.from.data = function(data.ranges, data.sources = c("commits", "mails", "issues"), globally = TRUE) { + + data.sources = match.arg.or.default(data.sources, several.ok = TRUE) + + ## for each range, get the authors who have been active on at least one data source in this range + active.authors.list = lapply(data.ranges, function(range.data) { + + active.authors = range.data$get.authors.by.data.source(data.sources) + + active.authors.names = active.authors$author.name + + return(active.authors.names) + + }) + + if (globally) { + ## flatten the list of lists to one list of authors + active.authors = unlist(active.authors.list, recursive = FALSE) + + ## remove distracting named list members + names(active.authors) = NULL + + ## remove duplicates and order alphabetically ascending + active.authors = active.authors[!duplicated(active.authors)] + active.authors = sort(active.authors) + return(active.authors) + } else { + return(active.authors.list) + } +} + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Adjacency matrices ---------------------------------------------------- + +#' Get a sparse expanded adjacency matrix for network. +#' +#' The adjacency matrix is expanded as it may contain rows and columns for authors which are not part of the network +#' but given in the \code{authors} parameter. However, this also means that authors present in the network +#' but not given in the \code{authors} parameter are not contained in the expanded adjacency matrix. +#' +#' @param network the given network +#' @param authors all authors that are wanted in the adjacency matrix +#' @param weighted decides if the adjacency matrix shall be weighted [default: FALSE] +#' +#' @return the sparse adjacency matrix of the network +get.expanded.adjacency = function(network, authors, weighted = FALSE) { + + ## create an empty sparse matrix with the right size + matrix = Matrix::sparseMatrix(i = c(), j = c(), dims = c(length(authors), length(authors)), giveCsparse = FALSE) + matrix = as(matrix, "dgTMatrix") + + ## add row and column names + rownames(matrix) = authors + colnames(matrix) = authors + + if (igraph::vcount(network) > 0) { + + if (weighted) { + ## get the weighted adjacency matrix for the current network + matrix.data = igraph::get.adjacency(network, attr = "weight") + } else { + ## get the unweighted adjacency matrix for the current network + matrix.data = igraph::get.adjacency(network) + } + + ## order the adjacency matrix + if (nrow(matrix.data) > 1) { # for a 1x1 matrix ordering does not work + matrix.data = matrix.data[order(rownames(matrix.data)), order(colnames(matrix.data))] + } + + ## save the activity data per author + if (nrow(matrix.data) > 0) { + matrix[rownames(matrix.data), colnames(matrix.data)] = matrix.data + } + + if (!weighted) { + matrix[matrix > 0] = 1 + } + + } + + return(matrix) +} + +#' Calculates a sparse adjacency matrix for each network in the list. +#' All adjacency matrices are expanded in such a way that the use the same set +#' of authors derived from all networks in the list. 
+#' +#' @param networks list of networks +#' @param weighted decides if the adjacency matrix shall be weighted [default: FALSE] +#' +#' @return the list of adjacency matrices +get.expanded.adjacency.matrices = function(networks, weighted = FALSE){ + + authors = get.author.names.from.networks(networks) + + adjacency.matrices = parallel::mclapply(networks, get.expanded.adjacency, authors, weighted) + + return(adjacency.matrices) +} + +#' Gets a list of networks, converts them to sparse adjacency matrices, and sums up the adjacency matrices cumulatively. +#' This means that the first entry of the returned list is just the adjacency matrix from the first network, +#' the second entry is the sum of the first and the second entry, and so on. +#' +#' @param networks list of networks +#' @param weighted decides if the adjacency matrix shall be weighted [default: FALSE] +#' +#' @return the list of cumulated adjacency matrices +get.expanded.adjacency.cumulated = function(networks, weighted = FALSE) { + ## get expanded adjacency matrices first + matrices = get.expanded.adjacency.matrices(networks, weighted) + + ## pair-wise sum of matrices: m.cumul(n) = m.cumul(m - 1) + m + ## (intermediate results consecutively stored in matrices.cumulated) + matrices.cumulated = list(matrices[[1]]) # first one is complete already + + if (length(matrices) > 1) { + for (m in 2:(length(matrices))){ + + matrices.cumulated[[m]] = matrices.cumulated[[m - 1]] + matrices[[m]] + rownames(matrices.cumulated[[m]]) = rownames(matrices.cumulated[[m - 1]]) + colnames(matrices.cumulated[[m]]) = colnames(matrices.cumulated[[m - 1]]) + + if (!weighted) { + + ## search for a non-zero entry and set them to an arbitray number (e.g., 42) + ## to force that all non-zero entries are correctly set to 1 afterwards + not.zero.idxs = which(matrices.cumulated[[m]] >= 1, arr.ind = TRUE) + if (nrow(not.zero.idxs) > 0) { + first.not.zero.idx = not.zero.idxs[1, ] + names(first.not.zero.idx) = c("row", "col") + matrices.cumulated[[m]][first.not.zero.idx[["row"]], first.not.zero.idx[["col"]]] = 42 + matrices.cumulated[[m]]@x = rep(1, length(matrices.cumulated[[m]]@i)) + } + } + } + } + + return(matrices.cumulated) +} + +#' Converts a list of adjacency matrices to an array. +#' +#' @param adjacency.list the list of adjacency matrices +#' +#' @return the converted array +convert.adjacency.matrix.list.to.array = function(adjacency.list){ + + ## create a 3-dimensional array representing the adjacency matrices (SIENA data format) as result + array = array(data = 0, dim = c(nrow(adjacency.list[[1]]), nrow(adjacency.list[[1]]), length(adjacency.list))) + rownames(array) = rownames(adjacency.list[[1]]) + colnames(array) = colnames(adjacency.list[[1]]) + + ## copy the activity values from the adjacency matrices in the list to the corresponding array slices + for (i in seq_along(adjacency.ist)){ + adjacency = adjacency.list[[i]] + activity.indices = which(adjacency != 0, arr.ind = TRUE) + + for (j in 1:nrow(activity.indices)){ + array[as.vector(activity.indices[j, 1]), as.vector(activity.indices[j, 2]), i] = + adjacency[as.vector(activity.indices[j, 1]), as.vector(activity.indices[j, 2])] + } + } + + return(array) +} diff --git a/util-networks.R b/util-networks.R index 7f76c433..3e0962c3 100644 --- a/util-networks.R +++ b/util-networks.R @@ -17,6 +17,7 @@ ## Copyright 2017-2019 by Thomas Bock ## Copyright 2018 by Barbara Eckl ## Copyright 2018-2019 by Jakob Kronawitter +## Copyright 2020 by Anselm Fehnker ## All Rights Reserved. 
@@ -195,7 +196,7 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", ## also corresponding author information. Re-add author vertices back to the network now by accessing the ## complete author list: ## 1) get all authors on commits - authors = private$proj.data$get.authors.by.data.source(data.source = "commits") + authors = private$proj.data$get.authors.by.data.source(data.sources = "commits") ## 2) only select author names authors = authors["author.name"] ## 3) rename single column to "name" to correct mapping to vertex attribute "name" diff --git a/util-tensor.R b/util-tensor.R index d7d0439f..c0d9777f 100644 --- a/util-tensor.R +++ b/util-tensor.R @@ -11,101 +11,122 @@ ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## -## Copyright 2020 by Anselm Fehnker +## Copyright 2019-2020 by Anselm Fehnker ## All Rights Reserved. -## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / -## ForthOrderTensor class ------------------------------------------------ - -#' The class \code{ForthOrderTensor} creates an (author x relation x author x relation) -#' tensor from a list of networks. The tensor as well as the dimensions and lists -#' of the authors and relations are stored. -#' -ForthOrderTensor = R6::R6Class("ForthOrderTensor", - - ## * private ---------------------------------------------------------- - - private = list( - ## * * data --------------------------------------------------------- - - dim = NULL, - relations = NULL, - authors = NULL, - tensor = NULL, - - ## * * tensor creation ---------------------------------------------- - - #' Creates a forth-order tensor from a list of networks using their - #' adjacency matrices. - #' - #' @param networks the list of networks - #' - #' @return the created tensor - build.tensor.from.networks = function(networks, weighted = FALSE) { - - ## get adjacency matrices from networks - adjacency.matrices = parallel::mclapply(networks, get.expanded.adjacency, private$authors, weighted) - - ## create an array with the size of the forth-order tensor that only contains zeros - array <-array(0, dim = private$dim) - - ## transfer entries from adjacency matrices to array - for (l in 1:length(adjacency.matrices)) { - - matrix = as(adjacency.matrices[[l]], "dgTMatrix") - - for (entry in 1:length(matrix@x)) { - array[matrix@i[entry]+1, l, matrix@j[entry]+1, l] = matrix@x[entry] - } - } - ## convert array to tensor - tensor <- rTensor::as.tensor(array) - - return(tensor) - } - ), - - ## * * public ---------------------------------------------------------- - - public = list( - - #' Constructor of the class. Constructs a new forth-order tensor instance - #' based on the given list of networks. - #' - #' @param networks the given list of networks - #' @param weighted bool if the tensor shall be weighted - initialize = function(networks, weighted = FALSE) { - - private$relations = names(networks) - private$authors = get.author.names.from.networks(networks) - private$dim = c(length(private$authors), length(private$relations), length(private$authors), length(private$relations)) - private$tensor = private$build.tensor.from.networks(networks, weighted) - - }, +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Libraries --------------------------------------------------------------- - #' Get the list of authors of the tensor. 
- #' - #' @return the list of authors - get.authors = function() { - return(private$authors) - }, +requireNamespace("R6") # for R6 classes +requireNamespace("logging") # for logging +requireNamespace("parallel") # for parallel computation +requireNamespace("igraph") # networks +requireNamespace("rTensor") # tensors - #' Get the list of relations of the tensor. - #' - #' @return the list of relations - get.relations = function() { - return(private$relations) - }, - #' Get the tensor data saved in the object. - #' - #' @return the tensor data - get.tensor = function() { - return(private$tensor) - } +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## FourthOrderTensor class ------------------------------------------------ - ) +#' The class \code{FourthOrderTensor} creates an (author x relation x author x relation) +#' tensor from a list of networks. The tensor as well as the dimensions and lists +#' of the authors and relations are stored. +#' +FourthOrderTensor = R6::R6Class("FourthOrderTensor", + + ## * private ---------------------------------------------------------- + + private = list( + ## * * data ------------------------------------------------------- + + dim = NULL, + relations = NULL, + authors = NULL, + tensor = NULL, + + ## * * tensor creation -------------------------------------------- + + #' Creates a fourth-order tensor from a list of networks using their + #' adjacency matrices. + #' + #' @param networks the list of networks + #' @param weighted decides if the tensor shall be weighted [default: FALSE] + #' + #' @return the created tensor + build.tensor.from.networks = function(networks, weighted = FALSE) { + + ## get adjacency matrices from networks + adjacency.matrices = parallel::mclapply(networks, get.expanded.adjacency, private$authors, weighted) + + ## create an array with the size of the fourth-order tensor that only contains zeros + array = array(0, dim = private$dim) + + ## transfer entries from adjacency matrices to array + for (l in seq_along(adjacency.matrices)) { + + matrix = as(adjacency.matrices[[l]], "dgTMatrix") + + for (entry in seq_along(matrix@x)) { + ## Transfer the entries from the adjacency matrix to the tensor. + ## Due to the property that the indexes saved in a sparse matrix start with 0, + ## while the indexes of an array start with 1, the indexes need to be shifted. + array[matrix@i[entry] + 1, l, matrix@j[entry] + 1, l] = matrix@x[entry] + } + } + + ## convert array to tensor + tensor = rTensor::as.tensor(array) + + return(tensor) + } + ), + + ## * * public ---------------------------------------------------------- + + public = list( + + #' Constructor of the class. Constructs a new fourth-order tensor instance + #' based on the given list of networks. + #' + #' @param networks the given list of networks + #' @param weighted decides if the tensor shall be weighted [default: FALSE] + initialize = function(networks, weighted = FALSE) { + + private$relations = names(networks) + private$authors = get.author.names.from.networks(networks) + private$dim = c(length(private$authors), length(private$relations), length(private$authors), length(private$relations)) + private$tensor = private$build.tensor.from.networks(networks, weighted) + + }, + + #' Get the dimension of the tensor. + #' + #' @return the dimension + get.dim = function() { + return(private$dim) + }, + + #' Get the list of relations of the tensor. 
+ #' + #' @return the list of relations + get.relations = function() { + return(private$relations) + }, + + #' Get the list of authors of the tensor. + #' + #' @return the list of authors + get.authors = function() { + return(private$authors) + }, + + #' Get the tensor data saved in the object. + #' + #' @return the tensor data + get.tensor = function() { + return(private$tensor) + } + ) ) ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / @@ -118,137 +139,61 @@ ForthOrderTensor = R6::R6Class("ForthOrderTensor", #' @param relations the relations of the wanted networks #' #' @return the list of networks -get.author.networks = function(network.builder, relations) { +get.author.networks.for.multiple.relations = function(network.builder, relations) { - networks = list() + networks = list() - networks = lapply(relations, function(rel) { + networks = lapply(relations, function(rel) { - ## retrieve network for relation - network.builder$update.network.conf(updated.values = list(author.relation = rel)) - retrieved.network = network.builder$get.author.network() + ## retrieve network for relation + network.builder$update.network.conf(updated.values = list(author.relation = rel)) + retrieved.network = network.builder$get.author.network() - ## check if network is not empty - if(igraph::vcount(retrieved.network) > 0){ - logging::loginfo("Added %s data to list", rel) - return(retrieved.network) - } else { - logging::logwarn("There is no %s data available for the current project", rel) - return(NA) - } - }) + ## check if network is not empty + if (igraph::vcount(retrieved.network) > 0){ + logging::loginfo("Added %s data to list", rel) + return(retrieved.network) + } else { + logging::logwarn("There is no %s data available for the current project", rel) + return(NA) + } + }) - ## add names of the relations - names(networks) = relations + ## add names of the relations + names(networks) = relations - ## removes empty networks - networks = networks[!is.na(networks)] + ## removes empty networks + networks = networks[!is.na(networks)] - return(networks) -} - -## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / -## Get active authors ----------------------------------------------------- - -#' Get all author names that are active in at least one of the networks. -#' -#' @param networks the list of networks -#' -#' @return the list of author names -get.author.names.from.networks = function(networks) { - - ## for each network, get a list of authors that are in this network - active.authors.list = lapply(networks, function(network) { - active.authors = igraph::V(network)$name - return (active.authors) - }) - - ## flatten the list of lists to one list of authors - active.authors = unlist(active.authors.list, recursive = FALSE) - - ## remove distracting named list members - names(active.authors) = NULL - - ## remove duplicates - active.authors = active.authors[!duplicated(active.authors)] - - ## order alphabetically ascending - active.authors = active.authors[order(active.authors)] - - return (active.authors) -} - -## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / -## Get adjacency matrix ---------------------------------------------------- - -#' Get a sparse adjacency matrix for a network. 
-#' -#' @param network the given network -#' @param authors all authors that are wanted in the adjacency matrix -#' @param weighted bool if the adjacency matrix shall be weighted -#' -#' @return the list of author names -get.expanded.adjacency = function(network, authors, weighted = FALSE) { - - ## create an empty sparse matrix with the right size - matrix = Matrix::sparseMatrix(i = c(), j = c(), dims = c(length(authors), length(authors)), giveCsparse = FALSE) - matrix = as(matrix, "dgTMatrix") - - ## add row and column names - rownames(matrix) = authors - colnames(matrix) = authors - - if(igraph::vcount(network) > 0) { - - if(weighted) { - ## get the weighted adjacency matrix for the current network - matrix.data = igraph::get.adjacency(network, attr = "weight") - } else { - ## get the unweighted adjacency matrix for the current network - matrix.data = igraph::get.adjacency(network) - } - - ## order the adjacency matrix - if(nrow(matrix.data)>1) { # for a 1x1 matrix ordering doesn't work - matrix.data = matrix.data[order(rownames(matrix.data)), order(colnames(matrix.data))] - } - - ## save the activity data per developer - if(nrow(matrix.data)>0) { - matrix[rownames(matrix.data), colnames(matrix.data)] = matrix.data - } - - if(!weighted) { - matrix[matrix > 0] <- 1 - } - - } - - return(matrix) + return(networks) } ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Calculate centrality ---------------------------------------------------- -#' Calculate EDCPTD centrality for a given forth-order tensor. +#' Calculate EDCPTD centrality for a given fourth-order tensor. +#' EDCPTD centrality is based on the work "Identifying Key Nodes in Multilayer Networks based on Tensor Decomposition" +#' by Dingjie Wang, Haitao Wang, and Xiufen Zou, Chaos 27, 063108 (2017) [1]. +#' [1] https://doi.org/10.1063/1.4985185 #' -#' @param forth.order.tensor the given tensor +#' @param fourth.order.tensor the given tensor #' #' @return data frame with EDCPTD score for every author -calculate.EDCPTD.centrality = function(forth.order.tensor) { +calculate.EDCPTD.centrality = function(fourth.order.tensor) { - ## create data frame for results - results = data.frame(names = forth.order.tensor$get.authors(), EDCPTD.score = 0) + ## create data frame for results + results = data.frame(names = fourth.order.tensor$get.authors(), EDCPTD.score = 0) - ## decompose tensor - decomposition <-rTensor::cp(forth.order.tensor$get.tensor(), num_components = 1, max_iter = 50, tol = 1e-05) + ## decompose tensor. 'num_components = 1' needed for EDCPTD centrality. + ## 'max_iter' and 'tol' chosen from default in documentation. 
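+    ## (a rank-1 CP decomposition yields one factor vector per tensor mode,
+    ## here: authors, relations, authors, relations)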
+    decomposition = rTensor::cp(fourth.order.tensor$get.tensor(), num_components = 1, max_iter = 25, tol = 1e-05)
 
-    ## calculate EDCPTD centrality
-    for (y in 1:length(forth.order.tensor$get.relations())) {
-        results[["EDCPTD.score"]] = (results[["EDCPTD.score"]]
-                                    + abs(decomposition[["U"]][[1]][,1] * decomposition[["U"]][[2]][,1][y])
-                                    + abs(decomposition[["U"]][[3]][,1] * decomposition[["U"]][[4]][,1][y]))/2
-    }
+    ## calculate EDCPTD centrality
+    for (y in seq_along(fourth.order.tensor$get.relations())) {
+        results[["EDCPTD.score"]] = (results[["EDCPTD.score"]]
+                                    + abs(decomposition[["U"]][[1]][,1] * decomposition[["U"]][[2]][,1][y])
+                                    + abs(decomposition[["U"]][[3]][,1] * decomposition[["U"]][[4]][,1][y])) / 2
+    }
 
     return(results)
 }

From 09196498b68ca52ee1e3ebb41d324bc5a1f2a777 Mon Sep 17 00:00:00 2001
From: fehnkera
Date: Sun, 18 Oct 2020 14:18:32 +0200
Subject: [PATCH 05/37] Adjust the changelog file 'NEWS.md'

The changelog file 'NEWS.md' is adjusted. The section '## Unversioned'
is added above the latest release version. The new features added in
this pull request are described in the subsections '### Added' and
'### Changed/Improved'.

Signed-off-by: Anselm Fehnker
---
 NEWS.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 67c3c6e1..4478c76d 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,16 @@
 # coronet – Changelog
 
+## Unversioned
+
+### Added
+- Add a new file `util-tensor.R` containing the class `FourthOrderTensor` to create (author x relation x author x relation) tensors from a list of networks (with each network having a different relation) and its corresponding utility function `get.author.networks.for.multiple.relations` (PR #173, c136b1f6127d73c25f08ae2f317246747aa9ea2b, e4ee0dc926b22ff75d5fd801c1f131bcff4c22eb, 051a5f0287022f97e2367ed0e9591b9df9dbdb3d)
+- Add function `calculate.EDCPTD.centrality` for calculating the EDCPTD centrality for a fourth-order tensor in the above described form (c136b1f6127d73c25f08ae2f317246747aa9ea2b, e4ee0dc926b22ff75d5fd801c1f131bcff4c22eb, 051a5f0287022f97e2367ed0e9591b9df9dbdb3d)
+- Add new file `util-networks-misc.R` which contains miscellaneous functions for processing network data and creating and converting various kinds of adjacency matrices: `get.author.names.from.networks`, `get.author.names.from.data`, `get.expanded.adjacency`, `get.expanded.adjacency.matrices`, `get.expanded.adjacency.cumulated`, `convert.adjacency.matrix.list.to.array` (051a5f0287022f97e2367ed0e9591b9df9dbdb3d)
+
+### Changed/Improved
+- Adjust the function `get.authors.by.data.source`: Rename its single parameter to `data.sources` and change the function so that it can extract the authors for multiple data sources at once. The default value of the parameter is a vector containing all the available data sources (commits, mails, issues) (051a5f0287022f97e2367ed0e9591b9df9dbdb3d)
+
+
 ## 3.6
 
 ### Added

From 1ba036758a63767e2fcef525c98f5a4fd6938c39 Mon Sep 17 00:00:00 2001
From: Thomas Bock
Date: Sun, 8 Nov 2020 23:29:27 +0100
Subject: [PATCH 06/37] Adjust package installation script to fix travis builds

Travis builds have been failing for a few weeks due to two different
reasons:

On the one hand, the log output exceeds the maximum log length allowed
by travis, resulting in failing builds. This is prevented by setting
the parameters `verbose` and `quiet` to `TRUE`: the package compilation
log is then no longer printed, while the result of the package
installation remains clearly perceptible in the log.
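For illustration, the adjusted call in install.R ends up as follows
(see the diff below; the `dependencies` change is explained next):

    install.packages(p, dependencies = NA, verbose = TRUE, quiet = TRUE)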
On the other hand, travis installs lots of packages which are not
necessary, as they are only suggestions of other packages rather than
necessary dependencies. Installing such unnecessary packages may lead
to problems. This is circumvented by setting the `dependencies`
parameter to `NA`, as this results in installing only necessary
dependencies and imports, whereas the former value `TRUE` also led to
installing suggested but unnecessary packages.

Those two fixes hopefully make travis builds successful again.

Props to @clhunsen for experimenting with the parameters and proposing
the first part of this commit regarding the `verbose` and `quiet`
parameters of the `install.packages` function.

This commit addresses parts of #161.

Signed-off-by: Thomas Bock
---
 install.R | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/install.R b/install.R
index 3b4c4758..132879d9 100644
--- a/install.R
+++ b/install.R
@@ -16,6 +16,7 @@
 ## Copyright 2015 by Wolfgang Mauerer
 ## Copyright 2015-2017 by Claus Hunsen
 ## Copyright 2017 by Thomas Bock
+## Copyright 2020 by Thomas Bock
 ## Copyright 2019 by Anselm Fehnker
 ## All Rights Reserved.
 ##
@@ -55,5 +56,7 @@ filter.installed.packages = function(packageList) {
 p = filter.installed.packages(packages)
 if (length(p) > 0) {
     print(sprintf("Installing package '%s'.", p))
-    install.packages(p, dependencies = TRUE, verbose = FALSE, quiet = FALSE)
+
+    ## set dependencies to 'NA' to install only necessary dependencies (i.e., "Depends", "Imports", "LinkingTo")
+    install.packages(p, dependencies = NA, verbose = TRUE, quiet = TRUE)
 }

From 40aa0d80e2a94434a8be75925dbefbde6d3518b2 Mon Sep 17 00:00:00 2001
From: Thomas Bock
Date: Mon, 9 Nov 2020 00:09:22 +0100
Subject: [PATCH 07/37] Add R version 4.0 to test suite

This is related to issue #161.

Signed-off-by: Thomas Bock
---
 .travis.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index 2933e44c..a716c811 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,6 +12,7 @@
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ##
 ## Copyright 2017-2018,2020 by Claus Hunsen
+## Copyright 2020 by Thomas Bock
 ## All Rights Reserved.
 
 # TravisCI container
@@ -26,6 +27,7 @@ r:
   - 3.4
   - 3.5
   - 3.6
+  - 4.0
 cache: packages
 repos:
   CRAN: https://cloud.r-project.org

From 92be262514277acb774ab2885c1c0d1c10f03373 Mon Sep 17 00:00:00 2001
From: Thomas Bock
Date: Mon, 9 Nov 2020 01:03:21 +0100
Subject: [PATCH 08/37] Adjust recommended R version in README

As R version 3.3.1 is quite outdated, update the corresponding
statement in our README and recommend R version 3.6.3.

Signed-off-by: Thomas Bock
---
 README.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 13d7cf36..e8cbcbb1 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ If you wonder: The name `coronet` derives as an acronym from the words "configur
 
 - [Integration](#integration)
     * [Requirements](#requirements)
-        * [R](#r-331)
+        * [R](#r)
         * [packrat (recommended)](#packrat)
         * [Folder structure of the input data](#folder-structure-of-the-input-data)
         * [Needed R packages](#needed-r-packages)
@@ -53,9 +53,11 @@ If you wonder: The name `coronet` derives as an acronym from the words "configur
 
 While using the package, we require the following infrastructure.
-#### [`R`](https://www.r-project.org/) `3.3.1` +#### [`R`](https://www.r-project.org/) -Later `R` versions should work (and are tested using our TravisCI script), but, for reliability reasons and `packrat` compatibility, only version `3.3.1` is supported. +Minimum requirement is `R` version `3.3.1`. Hence, later `R` versions also work. + +We currently recommend version `3.6.3` for reliability reasons and `packrat` compatibility, but also later versions (`>=4`) should work (and are tested using our TravisCI script). #### [`packrat`](http://rstudio.github.io/packrat/) (recommended) From df367d29013d511f9be6077bdb67714d94775941 Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Mon, 9 Nov 2020 00:17:04 +0100 Subject: [PATCH 09/37] Update changelog Signed-off-by: Thomas Bock --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 4478c76d..b1bfc74b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,6 +9,8 @@ ### Changed/Improved - Adjust the function `get.authors.by.data.source`: Rename its single parameter to `data.sources` and change the function so that it can extract the authors for multiple data sources at once. The default value of the parameter is a vector containing all the available data sources (commits, mails, issues) (051a5f0287022f97e2367ed0e9591b9df9dbdb3d) +- Adjust recommended R version to 3.6.3 in README (92be262514277acb774ab2885c1c0d1c10f03373) +- Add R version 4.0 to test suite and adjust package installation in `install.R` to improve compatibility with Travis CI (40aa0d80e2a94434a8be75925dbefbde6d3518b2, 1ba036758a63767e2fcef525c98f5a4fd6938c39, #161) ## 3.6 From f8c7cd2b176e93f9cf037b7f8e872b8d71a3b95a Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Mon, 16 Nov 2020 15:15:35 +0100 Subject: [PATCH 10/37] Transfer repository from se-passau to se-sic The repository of coronet was transferred from the se-passau organization to the se-sic organization. Therefore, links in README.md and CONTRIBUTING.md are also updated. Signed-off-by: Thomas Bock --- CONTRIBUTING.md | 8 ++++---- README.md | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4823c8ff..44daa85e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,6 @@ # Contributing to the network library `coronet` -The following is a set of guidelines for contributing to the network library `coronet`, which is hosted in the [se-passau](https://github.com/se-passau) organization on GitHub. +The following is a set of guidelines for contributing to the network library `coronet`, which is hosted in the [se-sic](https://github.com/se-sic) organization on GitHub. These are mostly guidelines, not rules. Use your best judgment, and feel free to propose changes to this document in a pull request. #### Table Of Contents @@ -39,7 +39,7 @@ Before creating bug reports, please check [this list](#before-submitting-a-bug-r #### Before Submitting A Bug Report * **Check the code.** - You might be able to find the cause of the problem and fix things yourself. Most importantly, check if you can reproduce the problem in the latest version of the library (see [branch `dev`](https://github.com/se-passau/coronet/tree/dev)). + You might be able to find the cause of the problem and fix things yourself. Most importantly, check if you can reproduce the problem in the latest version of the library (see [branch `dev`](https://github.com/se-sic/coronet/tree/dev)).
* **Search for previous issues describing the same problem.** If an old issue includes also a fix or a workaround for your problem, you do not need to file a new issue. Although, if the problem still persists after applying potential fixes, please file a new issue including detailed information to reproduce the problem. If there is an old issue that is still open, add a comment to the existing issue instead of opening a new one. * **Run the test suite.** @@ -112,8 +112,8 @@ In our development process, we pursue the following idea: - The current development will be performed on the branch `dev`, i.e., all incoming pull requests are against this branch. The current build status is as follows: -- `master`: [![Build Status](https://travis-ci.com/se-passau/coronet.svg?token=8VFPdy2kjPXtstT72yww&branch=master)](https://travis-ci.com/se-passau/coronet) -- `dev`: [![Build Status](https://travis-ci.com/se-passau/coronet.svg?token=8VFPdy2kjPXtstT72yww&branch=dev)](https://travis-ci.com/se-passau/coronet) +- `master`: [![Build Status](https://travis-ci.com/se-sic/coronet.svg?token=8VFPdy2kjPXtstT72yww&branch=master)](https://travis-ci.com/se-sic/coronet) +- `dev`: [![Build Status](https://travis-ci.com/se-sic/coronet.svg?token=8VFPdy2kjPXtstT72yww&branch=dev)](https://travis-ci.com/se-sic/coronet) ### Pull Requests diff --git a/README.md b/README.md index e8cbcbb1..d09d62fe 100644 --- a/README.md +++ b/README.md @@ -638,4 +638,4 @@ This project is licensed under [GNU General Public License v2.0](LICENSE). ## Work in progress -To see what will be the next things to be implemented, please have a look at the [list of issues](https://github.com/se-passau/coronet/issues). +To see what will be the next things to be implemented, please have a look at the [list of issues](https://github.com/se-sic/coronet/issues). From 133675aaf023ab2c1a9f4ab6fd9c46983e86d7a7 Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Mon, 9 Nov 2020 20:35:34 +0100 Subject: [PATCH 11/37] Fix computation of overlapping ranges MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When computing overlapping ranges, sometimes we ended up with one more range in the end than needed. This led to having ranges covering almost the same time span. Consider the following example: When constructing overlapping three-month ranges which overlap fifty-fifty, we might end up in the following scenario: 2020-06-27 ——— 2020-09-26 — 2020-11-11 (2020-12-25) 2020-08-12 ————————— 2020-11-11 There is one range from 2020-06-27 to 2020-09-26, one range from 2020-08-12 to 2020-11-11, and one range from 2020-09-26 to 2020-12-25. However, when assuming that the last action in the data takes place on 2020-11-11, then we would have two ranges covering the last range: one complete range and half a range: 2020-08-12 to 2020-11-11 and 2020-09-26 to 2020-11-11. As can be clearly seen (also in the visual example above), the range 2020-09-26 to 2020-11-11 is superfluous, as it does not even represent a full range and fully overlaps with the previous range. Hence, we do not need this half range and can end up with just the following ranges: 2020-06-27 ——— 2020-09-26 2020-08-12 ————————— 2020-11-11 To achieve this, determine the number of complete ranges by rounding downwards instead of rounding to the nearest integer.
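To illustrate the rounding difference, consider a minimal sketch with hypothetical durations (the actual function operates on the raw time values derived from the configured dates and periods):

    ## hypothetical values: a time span of 130 days, split into three-month
    ## ranges (~91 days) that overlap fifty-fifty (negative overlap of ~45.5 days)
    bins.duration = 130
    overlap.negative = 45.5
    round(bins.duration / overlap.negative) ## 3: includes a superfluous half range
    floor(bins.duration / overlap.negative) ## 2: only complete ranges remain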
However, if the complete time span does not even include one complete range, set the number of complete ranges to 1, because multiplying with 0 further down in the function would end up with no range at all (not even an incomplete one). Nevertheless, this corner case is handled correctly, as there already exist some extra checks for this special case. Signed-off-by: Thomas Bock --- util-misc.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/util-misc.R b/util-misc.R index 26002921..0fc632d7 100644 --- a/util-misc.R +++ b/util-misc.R @@ -16,6 +16,7 @@ ## Copyright 2017 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock +## Copyright 2020 by Thomas Bock ## Copyright 2018-2019 by Jakob Kronawitter ## All Rights Reserved. @@ -568,7 +569,10 @@ construct.overlapping.ranges = function(start, end, time.period, overlap, imperf ## compute negative overlap overlap.negative = time.period - overlap ## compute number of complete bins - bins.number = round(bins.duration / overlap.negative) + bins.number = floor(bins.duration / overlap.negative) + if (bins.number < 1) { + bins.number = 1 + } ## generate a approximate sequence of dates which can be streamlined later seq.start = start.date + overlap From 6f3149e8c1dec9fb1c8465ca2da0e018ae958356 Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Mon, 9 Nov 2020 21:01:09 +0100 Subject: [PATCH 12/37] Fix off-by-one error in data cutting The function `get.data.cut.to.same.date` cuts the data of multiple data sources to a common date range for all the specified data sources. However, in the previous implementation, there was an off-by-one error, caused by excluding end dates in data splitting: If a range is specified, the end date of the range is exclusive (while the start of the range is inclusive). As the data cutting functionality uses the split functionality, the specified end date was excluded whereas it should be included, as the following example shows: Assume there are two data sources d1 and d2. Let d1 contain the time stamps t1 and t4, and let d2 contain the time stamps t2 and t3, with t1 < t2 < t3 < t4. Then a visualization of this example on a time line could be as follows: d1: | | d2: | | t1 t2 t3 t4 According to the procedure of data cutting, we look for a common data range. It can be easily seen that the interval [t2,t3] completely covers data from both data sources d1 and d2. Hence, all the data needs to be cut to this interval [t2,t3]. However, what the former implementation did was the following: t2 and t3 are determined as start and end of the range the data has to be cut to. As the cutting functionality uses the splitting functionality, t2 is used as the start of the range and t3 is used as the end of the range. This is where the error comes in: As the end of ranges is exclusive, t3 is not included in the cut data, resulting in the interval [t2, t3). To include t3 in the cut data, we need to use t3+1 as the end of the range to cut, as this effectively leads to [t2,t3]. In this commit, this off-by-one error at the end of the data cutting range is fixed by adding 1 second to the end. In addition, all the corresponding tests in the test suite (i.e., data cutting and network cutting) are adjusted to incorporate the end of the cutting range.
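A minimal sketch of the inclusive-end fix (with a hypothetical timestamp; the actual code operates on the timestamps obtained from the configured data sources):

    ## t3 is the latest common timestamp that must still be included in the cut data
    t3 = as.POSIXct("2016-07-12 16:05:37", tz = "UTC")
    end.exclusive = t3      ## yields [t2, t3): the event at t3 is dropped
    end.inclusive = t3 + 1  ## yields [t2, t3 + 1s): the event at t3 is kept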
Signed-off-by: Thomas Bock --- tests/test-data-cut.R | 17 +++++++++-------- tests/test-networks-cut.R | 17 +++++++++-------- util-data.R | 3 ++- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/tests/test-data-cut.R b/tests/test-data-cut.R index 6d85d18b..3234b786 100644 --- a/tests/test-data-cut.R +++ b/tests/test-data-cut.R @@ -16,6 +16,7 @@ ## Copyright 2018 by Claus Hunsen ## Copyright 2018 by Barbara Eckl ## Copyright 2018 by Thomas Bock +## Copyright 2020 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved. @@ -62,14 +63,14 @@ test_that("Cut commit and mail data to same date range.", { artifact.type = c("Feature", "Feature"), artifact.diff.size = as.integer(c(1, 1))) - mail.data.expected = data.frame(author.name = c("Thomas"), - author.email = c("thomas@example.org"), - message.id = c("<65a1sf31sagd684dfv31@mail.gmail.com>"), - date = get.date.from.string("2016-07-12 16:04:40"), - date.offset = as.integer(c(100)), - subject = c("Re: Fw: busybox 2 tab"), - thread = sprintf("", c(9)), - artifact.type = "Mail") + mail.data.expected = data.frame(author.name = c("Thomas", "Olaf"), + author.email = c("thomas@example.org", "olaf@example.org"), + message.id = c("<65a1sf31sagd684dfv31@mail.gmail.com>", "<9b06e8d20801220234h659c18a3g95c12ac38248c7e0@mail.gmail.com>"), + date = get.date.from.string(c("2016-07-12 16:04:40", "2016-07-12 16:05:37")), + date.offset = as.integer(c(100, 200)), + subject = c("Re: Fw: busybox 2 tab", "Re: Fw: busybox 10"), + thread = sprintf("", c(9, 9)), + artifact.type = c("Mail", "Mail")) commit.data = x.data$get.data.cut.to.same.date(data.sources = data.sources)$get.commits() rownames(commit.data) = 1:nrow(commit.data) diff --git a/tests/test-networks-cut.R b/tests/test-networks-cut.R index 12716572..95f7a891 100644 --- a/tests/test-networks-cut.R +++ b/tests/test-networks-cut.R @@ -14,6 +14,7 @@ ## Copyright 2017 by Christian Hechtl ## Copyright 2018 by Claus Hunsen ## Copyright 2018 by Thomas Bock +## Copyright 2020 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved. 
@@ -62,14 +63,14 @@ test_that("Cut commit and mail data to same date range.", { artifact.type = c("Feature", "Feature"), artifact.diff.size = as.integer(c(1, 1))) - mail.data.expected = data.frame(author.name = c("Thomas"), - author.email = c("thomas@example.org"), - message.id = c("<65a1sf31sagd684dfv31@mail.gmail.com>"), - date = get.date.from.string(c("2016-07-12 16:04:40")), - date.offset = as.integer(c(100)), - subject = c("Re: Fw: busybox 2 tab"), - thread = sprintf("", c(9)), - artifact.type = "Mail") + mail.data.expected = data.frame(author.name = c("Thomas", "Olaf"), + author.email = c("thomas@example.org", "olaf@example.org"), + message.id = c("<65a1sf31sagd684dfv31@mail.gmail.com>", "<9b06e8d20801220234h659c18a3g95c12ac38248c7e0@mail.gmail.com>"), + date = get.date.from.string(c("2016-07-12 16:04:40", "2016-07-12 16:05:37")), + date.offset = as.integer(c(100, 200)), + subject = c("Re: Fw: busybox 2 tab", "Re: Fw: busybox 10"), + thread = sprintf("", c(9, 9)), + artifact.type = c("Mail", "Mail")) commit.data = x$get.project.data()$get.commits() rownames(commit.data) = 1:nrow(commit.data) diff --git a/util-data.R b/util-data.R index 8694ab35..6b18f8f9 100644 --- a/util-data.R +++ b/util-data.R @@ -13,6 +13,7 @@ ## ## Copyright 2016-2019 by Claus Hunsen ## Copyright 2017-2019 by Thomas Bock +## Copyright 2020 by Thomas Bock ## Copyright 2017 by Raphael Nömmer ## Copyright 2017-2018 by Christian Hechtl ## Copyright 2017 by Felix Prasse @@ -1146,7 +1147,7 @@ ProjectData = R6::R6Class("ProjectData", ## get the timestamp data as vector timestamps.df = self$get.data.timestamps(data.sources = data.sources , simple = TRUE) - timestamps = c(start = timestamps.df[, "start"], end = timestamps.df[, "end"]) + timestamps = c(start = timestamps.df[, "start"], end = timestamps.df[, "end"] + 1) ## check consistency if (timestamps["start"] > timestamps["end"]) { From 835327e8c7c259163af66194d122e3c573dee069 Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Mon, 9 Nov 2020 23:33:33 +0100 Subject: [PATCH 13/37] Fix function 'split.networks.time.based' regarding sliding windows When calling the function `split.networks.time.based` (which splits several networks at the same time), the `sliding.window` parameter was useless: This function, in turn, calls `split.network.time.based` for each network, specifying fixed bins and passing the `sliding.window` parameter to it. However, if bins are given to this function, the `sliding.window` parameter is ignored. To fix this, create the overlapping ranges needed to get sliding windows and call `split.network.time.based.by.ranges` to get the networks split using sliding windows. With this fix, it is possible to use sliding windows when splitting multiple networks simultaneously. Props to @clhunsen for suggesting to use `construct.overlapping.ranges` and `split.network.time.based.by.ranges` for this fix. Signed-off-by: Thomas Bock Signed-off-by: Claus Hunsen --- util-split.R | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/util-split.R b/util-split.R index 0b2e4b66..bfa269e3 100644 --- a/util-split.R +++ b/util-split.R @@ -17,6 +17,7 @@ ## Copyright 2017-2018 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock +## Copyright 2020 by Thomas Bock ## All Rights Reserved.
@@ -629,7 +630,7 @@ split.networks.time.based = function(networks, time.period = "3 months", bins = if (!is.null(number.windows)) { ## reset bins for the later algorithm bins = NULL - ## remove sliding windows + ## ignore sliding windows sliding.window = FALSE } @@ -644,10 +645,20 @@ split.networks.time.based = function(networks, time.period = "3 months", bins = dates = get.date.from.unix.timestamp(dates) ## 2) get bin information - bins.info = split.get.bins.time.based(dates, time.period, number.windows) - bins.date = get.date.from.string(bins.info[["bins"]]) + if (sliding.window) { + ranges = construct.overlapping.ranges(start = min(dates), end = max(dates), + time.period = time.period, overlap = 0.5, raw = FALSE, + include.end.date = TRUE) + bins.info = construct.overlapping.ranges(start = min(dates), end = max(dates), + time.period = time.period, overlap = 0.5, raw = TRUE, + include.end.date = TRUE) + bins.date = unname(unique(get.date.from.unix.timestamp(unlist(bins.info)))) + } else { + bins.info = split.get.bins.time.based(dates, time.period, number.windows) + bins.date = get.date.from.string(bins.info[["bins"]]) + } } else { - ## remove sliding windows + ## specific bins are given, do not use sliding windows sliding.window = FALSE ## set the bins to use bins.date = bins @@ -655,8 +666,16 @@ split.networks.time.based = function(networks, time.period = "3 months", bins = ## split all networks to the extracted bins networks.split = lapply(networks, function(net) { - split.network.time.based(net, bins = bins.date, sliding.window = sliding.window, - remove.isolates = remove.isolates) + + if (sliding.window) { + nets = split.network.time.based.by.ranges(network = net, ranges = ranges, + remove.isolates = remove.isolates) + attr(nets, "bins") = sort(bins.date) + } else { + nets = split.network.time.based(network = net, bins = bins.date, sliding.window = sliding.window, + remove.isolates = remove.isolates) + } + return(nets) }) ## return the split networks From a3a8e6d5c0f80808515cc694419e7d274fdca400 Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Tue, 10 Nov 2020 00:30:47 +0100 Subject: [PATCH 14/37] Adjust sliding-window creation in 'split.network.time.based' Create the sliding windows in `split.network.time.based` exactly in the same way as in `split.networks.time.based`. 
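With this adjustment, both functions construct their sliding windows consistently. A short usage sketch (assuming `network` is an igraph network carrying the required "date" edge attribute):

    ## split into three-month windows that overlap fifty-fifty
    nets = split.network.time.based(network, time.period = "3 months",
                                    sliding.window = TRUE)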
Signed-off-by: Thomas Bock --- util-split.R | 56 +++++++++++++++------------------------------------- 1 file changed, 16 insertions(+), 40 deletions(-) diff --git a/util-split.R b/util-split.R index bfa269e3..72a4711e 100644 --- a/util-split.R +++ b/util-split.R @@ -520,7 +520,7 @@ split.network.time.based = function(network, time.period = "3 months", bins = NU if (!is.null(number.windows)) { ## reset bins for the later algorithm bins = NULL - ## remove sliding windows + ## ignore sliding windows sliding.window = FALSE } @@ -531,56 +531,32 @@ split.network.time.based = function(network, time.period = "3 months", bins = NU bins.vector = bins.info[["vector"]] bins.date = get.date.from.string(bins.info[["bins"]]) bins = head(bins.info[["bins"]], -1) - ## logging - logging::loginfo("Splitting network into time ranges [%s].", - paste(bins.info[["bins"]], collapse = ", ")) } else { - ## remove sliding windows + ## specific bins are given, do not use sliding windows sliding.window = FALSE ## find bins for dates bins.date = get.date.from.string(bins) bins.vector = findInterval(dates, bins.date, all.inside = FALSE) bins = 1:(length(bins.date) - 1) # the last item just closes the last bin - ## logging - logging::loginfo("Splitting network into bins [%s].", paste(bins.date, collapse = ", ")) } - nets = split.network.by.bins(network, bins, bins.vector, remove.isolates) - ## perform additional steps for sliding-window approach if (sliding.window) { - ## compute bins for sliding windows: pairwise middle between dates - bins.date.middle = mapply( - bins.date[1:(length(bins.date) - 1)], - bins.date[2:length(bins.date)], - FUN = function(d1, d2) d1 + ((d2 - d1) / 2) - ) - bins.date.middle = get.date.from.unix.timestamp(bins.date.middle) - - ## order edges by date - edges.all = igraph::E(network) - edges.dates = igraph::get.edge.attribute(network, "date") - - ## identify edges to cut for sliding-window approach - edges.cut = sapply(edges.dates, function(date) { - date < bins.date.middle[1] || date > bins.date.middle[length(bins.date.middle)] - }) - - ## delete edges from the network and create a new network - network.cut = igraph::delete.edges(network, edges.all[edges.cut]) - - ## split network for sliding windows - nets.sliding = split.network.time.based(network.cut, bins = bins.date.middle, sliding.window = FALSE) - - ## append data to normally-split data - nets = append(nets, nets.sliding) - - ## sort data object properly by bin starts - bins.ranges.start = c(head(bins.date, -1), head(bins.date.middle, -1)) - nets = nets[ order(bins.ranges.start) ] + ranges = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), + time.period = time.period, overlap = 0.5, raw = FALSE, + include.end.date = FALSE) # bins have already been prepared correctly + bins.info = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), + time.period = time.period, overlap = 0.5, raw = TRUE, + include.end.date = FALSE) # bins have already been prepared correctly + bins.date = sort(unname(unique(get.date.from.unix.timestamp(unlist(bins.info))))) - ## construct proper bin vectors for configuration - bins.date = sort(c(bins.date, bins.date.middle)) + logging::loginfo("Splitting network into time ranges [%s].", + paste(ranges, collapse = ", ")) + nets = split.network.time.based.by.ranges(network, ranges, remove.isolates) + } else { + logging::loginfo("Splitting network into bins [%s].", + paste(bins.date, collapse = ", ")) + nets = split.network.by.bins(network, bins, bins.vector, remove.isolates) 
} ## set bin attribute From 820f85002a37675fc811718ac048114152d71096 Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Tue, 10 Nov 2020 00:37:06 +0100 Subject: [PATCH 15/37] Fix wrong logging statements in 'split.network.by.bins' Signed-off-by: Thomas Bock --- util-split.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util-split.R b/util-split.R index 72a4711e..bb7f1ebe 100644 --- a/util-split.R +++ b/util-split.R @@ -842,7 +842,7 @@ split.data.by.bins = function(df, bins) { #' #' @return a list of networks, with the length of 'unique(bins.vector)' split.network.by.bins = function(network, bins, bins.vector, remove.isolates = TRUE) { - logging::logdebug("split.data.time.based: starting.") + logging::logdebug("split.network.by.bins: starting.") ## create a network for each bin of edges nets = parallel::mclapply(bins, function(bin) { logging::logdebug("Splitting network: bin %s", bin) @@ -852,7 +852,7 @@ split.network.by.bins = function(network, bins, bins.vector, remove.isolates = T g = igraph::subgraph.edges(network, edges, delete.vertices = remove.isolates) return(g) }) - logging::logdebug("split.data.time.based: finished.") + logging::logdebug("split.network.by.bins: finished.") return(nets) } From 606963a3f5af653604949b6ced70269a1d977099 Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Tue, 10 Nov 2020 00:47:29 +0100 Subject: [PATCH 16/37] Add missing default values to function documentations in util-split.R For some of the functions in the split module, some default values of parameters have been missing in the function documentation. This commit fixes that. Signed-off-by: Thomas Bock --- util-split.R | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/util-split.R b/util-split.R index bb7f1ebe..bb27277d 100644 --- a/util-split.R +++ b/util-split.R @@ -40,7 +40,7 @@ requireNamespace("lubridate") # for date conversion #' #' @param project.data the *Data object from which the data is retrieved #' @param time.period the time period describing the length of the ranges, a character string, -#' e.g., "3 mins" or "15 days" +#' e.g., "3 mins" or "15 days" [default: "3 months"] #' @param bins the date objects defining the start of ranges (the last date defines the end of the last range, in an #' *exclusive* manner). If set, the 'time.period' parameter is ignored; consequently, 'split.basis' and #' 'sliding.window' do not make sense then either. [default: NULL] @@ -49,7 +49,7 @@ requireNamespace("lubridate") # for date conversion #' consequently, 'split.basis' and 'sliding.window' do not make sense then either. 
#' [default: NULL] #' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues' -#' [default: commits] +#' [default: "commits"] #' @param sliding.window logical indicating whether the splitting should be performed using a sliding-window approach #' [default: FALSE] #' @@ -218,7 +218,7 @@ split.data.time.based = function(project.data, time.period = "3 months", bins = #' #' @param project.data the *Data object from which the data is retrieved #' @param activity.type the type of activity used for splitting, either 'commits', 'mails', or 'issues' -#' [default: commits] +#' [default: "commits"] #' @param activity.amount the amount of activity describing the size of the ranges, a numeric, further #' specified by 'activity.type' [default: 5000] #' @param number.windows the number of consecutive data objects to get from this function @@ -498,9 +498,10 @@ split.data.time.based.by.ranges = function(project.data, ranges) { #' #' @param network the igraph network to split, needs to have an edge attribute named "date" #' @param time.period the time period describing the length of the ranges, a character string, -#' e.g., "3 mins" or "15 days" +#' e.g., "3 mins" or "15 days" [default: "3 months"] #' @param bins the date objects defining the start of ranges (the last date defines the end of the last range, in an #' *exclusive* manner). If set, the 'time.period' and 'sliding.window' parameters are ignored. +#' [default: NULL] #' @param number.windows the number of consecutive networks to get from this function, implying equally #' time-sized windows for all ranges. If set, the 'time.period' and 'bins' parameters are ignored; #' consequently, 'sliding.window' does not make sense then either. @@ -584,9 +585,10 @@ split.network.time.based = function(network, time.period = "3 months", bins = NU #' #' @param networks the igraph networks to split, needs to have an edge attribute named "date" #' @param time.period the time period describing the length of the ranges, a character string, -#' e.g., "3 mins" or "15 days" +#' e.g., "3 mins" or "15 days" [default: "3 months"] #' @param bins the date objects defining the start of ranges (the last date defines the end of the last range, in an #' *exclusive* manner). If set, the 'time.period' and 'sliding.window' parameters are ignored. +#' [default: NULL] #' @param number.windows the number of consecutive networks to get for each network, implying equally #' time-sized windows for all ranges. If set, the 'time.period' and 'bins' parameters are ignored; #' consequently, 'sliding.window' does not make sense then either. 
@@ -601,7 +603,6 @@ split.networks.time.based = function(networks, time.period = "3 months", bins = number.windows = NULL, sliding.window = FALSE, remove.isolates = TRUE) { - ## number of windows given (ignoring time period and bins) if (!is.null(number.windows)) { ## reset bins for the later algorithm bins = NULL @@ -669,7 +670,7 @@ split.networks.time.based = function(networks, time.period = "3 months", bins = #' #' @param network the igraph network to split #' @param number.edges the amount of edges describing the size of the ranges -#' (implying an open number of resulting ranges) +#' (implying an open number of resulting ranges) [default: 5000] #' @param number.windows the number of consecutive networks to get from this function #' (implying an equally distributed amount of edges in each range and #' 'sliding.window = FALSE) [default: NULL] From 916127e9b5a63ee4bf88eb1761542d44ca02fc2d Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Tue, 10 Nov 2020 20:55:27 +0100 Subject: [PATCH 17/37] Adjust sliding-window creation in 'split.data.time.based' Create the sliding windows in `split.data.time.based` exactly in the same way as in `split.network.time.based` and `split.networks.time.based`. In addition, also streamline the code in `split.data.time.based` (e.g., don't execute steps that are only needed when no sliding windows are used). Signed-off-by: Thomas Bock --- util-split.R | 125 +++++++++++++++++++++++++-------------------------- 1 file changed, 60 insertions(+), 65 deletions(-) diff --git a/util-split.R b/util-split.R index bb27277d..409f36fb 100644 --- a/util-split.R +++ b/util-split.R
length(bins.ranges) <= 1) { + if ((length(bins.ranges) <= 1) && sliding.window) { logging::logwarn("Sliding-window approach does not apply for one range or less.") - } else if (sliding.window) { - ## compute bins for sliding windows: pairwise middle between dates - bins.date.middle = mapply( - bins.date[1:(length(bins.date) - 1)], - bins.date[2:length(bins.date)], - FUN = function(d1, d2) d1 + ((d2 - d1) / 2) - ) - bins.date.middle = get.date.from.unix.timestamp(bins.date.middle) + sliding.window = FALSE + } - ## split data for sliding windows - cf.data.sliding = split.data.time.based(project.data, bins = bins.date.middle, - split.basis = split.basis, sliding.window = FALSE) + if (!sliding.window) { + + ## split data + data.split = parallel::mclapply(split.data, function(df.name) { + logging::logdebug("Splitting %s.", df.name) + ## identify bins for data + df = data[[df.name]] + df.bins = findInterval(df[["date"]], bins.date, all.inside = FALSE) + ## split data according to df.bins + df.split = split(df, df.bins) + ## add proper labels/names + names(df.split) = sapply(as.integer(names(df.split)), function(bin) bins[bin]) + return(df.split) + }) - ## append data to normally-split data - cf.data = append(cf.data, cf.data.sliding) + ## re-arrange data to get the proper list of data per range + logging::logdebug("Re-arranging data.") + data.split = parallel::mclapply(bins.labels, function(bin) lapply(data.split, `[[`, bin)) + names(data.split) = bins.ranges + + ## adapt project configuration + project.data$get.project.conf()$set.revisions(bins, bins.date) + + ## construct RangeData objects + logging::logdebug("Constructing RangeData objects.") + cf.data = parallel::mclapply(bins.ranges, function(range) { + logging::logdebug("Constructing data for range %s.", range) + ## construct object for current range + cf.range.data = RangeData$new(project.data$get.project.conf(), range) + ## get data for current range + df.list = data.split[[range]] + + ## set main data sources: commits, mails, issues + for (data.source in split.data) { + setter.name = sprintf("set.%s", data.source) + cf.range.data[[setter.name]](df.list[[data.source]]) + } + ## set additional data sources: authors, pasta, synchronicity + for (data.source in additional.data.sources) { + setter.name = sprintf("set.%s", data.source) + cf.range.data[[setter.name]](additional.data[[data.source]]) + } + + return(cf.range.data) + }) - ## sort data object properly by bin starts - bins.ranges.start = c(head(bins.date, -1), head(bins.date.middle, -1)) - cf.data = cf.data[ order(bins.ranges.start) ] + } else { + ## perform different steps for sliding-window approach - ## construct proper bin vectors for configuration - bins.date = sort(c(bins.date, bins.date.middle)) + ranges = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), + time.period = time.period, overlap = 0.5, raw = FALSE, + include.end.date = FALSE) # bins have already been prepared correctly + bins.info = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), + time.period = time.period, overlap = 0.5, raw = TRUE, + include.end.date = FALSE) # bins have already been prepared correctly + bins.date = sort(unname(unique(get.date.from.unix.timestamp(unlist(bins.info))))) bins = get.date.string(bins.date) + logging::loginfo("Splitting data '%s' into time ranges using sliding windows [%s].", + project.data$get.class.name(), ranges) + cf.data = split.data.time.based.by.ranges(project.data, ranges) + ## update project configuration 
project.data$get.project.conf()$set.revisions(bins, bins.date, sliding.window = TRUE) for (cf in cf.data) { From 4f23701f762b0889a6b5823c809bb5897e9cddfd Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Wed, 11 Nov 2020 13:28:19 +0100 Subject: [PATCH 18/37] Adjust sliding-window creation in 'split.data.activity.based' Create the sliding windows in `split.data.activity.based` in a similar way as in `split.data.time.based`. In particular, fix some bugs in the sliding-window creation for activity-based data splitting and handle some special corner cases with respect to the last range. Signed-off-by: Thomas Bock --- util-split.R | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/util-split.R b/util-split.R index 409f36fb..b232047e 100644 --- a/util-split.R +++ b/util-split.R @@ -298,12 +298,21 @@ split.data.activity.based = function(project.data, activity.type = c("commits", ## offsets used for cropping (half the first/last bin) offset.start = floor(activity.amount / 2) - offset.end = floor((items.unique.count %% activity.amount) / 2) + offset.end = (items.unique.count - offset.start) %% activity.amount ## cut the data appropriately - items.cut = c( - items.unique[1:offset.start], - items.unique[(items.unique.count - offset.end):items.unique.count] - ) + if (offset.end > 0) { + items.cut = c( + items.unique[1:offset.start], + items.unique[(items.unique.count - offset.end + 1):items.unique.count] + ) + } else { + items.cut = items.unique[1:offset.start] + } + + ## determine end bin of last sliding-window range + end.event.id = items.unique[(items.unique.count - offset.end + 1)] + end.event.logical = data[[ activity.type ]][[ id.column[[activity.type]] ]] == end.event.id + end.event.date = data[[ activity.type ]][end.event.logical, ][[ "date" ]] ## store the data again data.to.cut = data[[ activity.type ]][[ id.column[[activity.type]] ]] %in% items.cut @@ -333,6 +342,28 @@ split.data.activity.based = function(project.data, activity.type = c("commits", bins.date = sort(c(bins.date, bins.date.middle)) bins = get.date.string(bins.date) + ## if last regular range and last sliding-window range end at the same time + ## and the data of the last regular range is contained in the last sliding-window range, then: + ## remove the last regular range as it is not complete and we don't lose data when removing it + last.regular.range = cf.data[[length(cf.data)]] + last.sliding.range = cf.data[[length(cf.data) - 1]] + get.activity.data = paste0("get.", activity.type) + last.regular.range.ids = (last.regular.range[[get.activity.data]]())[[ id.column[[activity.type]] ]] + last.sliding.range.ids = (last.sliding.range[[get.activity.data]]())[[ id.column[[activity.type]] ]] + if (bins.date[length(bins.date)] == bins.date.middle[length(bins.date.middle)] + && all(last.regular.range.ids %in% last.sliding.range.ids) ) { + + cf.data = cf.data[-length(cf.data)] + bins.date = bins.date[-length(bins.date)] + bins = bins[-length(bins)] + } else if (!(bins.date[length(bins.date)] == bins.date.middle[length(bins.date.middle)])) { + ## adjust the end date of the last sliding-window range + name.last.sliding.window = construct.ranges(c(bins[length(bins) - 3], get.date.string(end.event.date))) + names(cf.data)[length(cf.data) - 1] = name.last.sliding.window + bins.date[length(bins.date) - 1] = end.event.date + bins[length(bins) - 1] = get.date.string(end.event.date) + } + ## update project configuration project.data$get.project.conf()$set.revisions(bins, bins.date,
sliding.window = TRUE) for (cf in cf.data) { From 2129acad52f3a4ccbb3dc46961d83aa0933a7783 Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Wed, 11 Nov 2020 13:32:41 +0100 Subject: [PATCH 19/37] Adjust sliding-window creation in 'split.network.activity.based' Create the sliding windows in `split.network.activity.based` in a similar way as in `split.data.activity.based`. In particular, fix some bugs in the sliding-window creation for activity-based network splitting and handle some special corner cases with respect to the last range. Signed-off-by: Thomas Bock --- util-split.R | 45 ++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/util-split.R b/util-split.R index b232047e..8741ee11 100644 --- a/util-split.R +++ b/util-split.R @@ -756,27 +756,39 @@ split.network.activity.based = function(network, number.edges = 5000, number.win ## split network by bins networks = split.network.by.bins(network, bins, bins.vector, remove.isolates) + if (number.edges >= edge.count) { + logging::logwarn("Sliding-window approach does not apply: not enough edges (%s) for number of edges %s", + edge.count, number.edges) + sliding.window = FALSE + } + ## perform additional steps for sliding-window approach ## for activity-based sliding-window bins to work, we need to crop edges appropriately and, ## then, compute bins on the cropped networks if (sliding.window) { - ## order edges by date - edges.by.date = igraph::E(network)[ order(df[["date"]]) ] + + ## get edge ids ordered by date + edges.by.date = df[["my.unique.id"]] ## offsets used for cropping (half the first/last bin) offset.start = floor(number.edges / 2) - offset.end = floor((edge.count %% number.edges) / 2) + offset.end = (edge.count - offset.start) %% number.edges ## cut the data appropriately - edges.cut = c( - edges.by.date[1:offset.start], - edges.by.date[(edge.count - offset.end):edge.count] - ) + if (offset.end > 0) { + edges.cut = c( + edges.by.date[1:offset.start], + edges.by.date[(edge.count - offset.end + 1):edge.count] + ) + } else { + edges.cut = edges.by.date[1:offset.start] + } ## delete edges from the network and create a new network - network.cut = igraph::delete.edges(network, edges.cut) + network.cut = igraph::delete.edges(network, igraph::E(network)[edges.cut]) ## split network for sliding windows - networks.sliding = split.network.activity.based(network.cut, number.edges = number.edges, sliding.window = FALSE) + networks.sliding = split.network.activity.based(network.cut, number.edges = number.edges, + sliding.window = FALSE) ## append data to normally-split data networks = append(networks, networks.sliding) @@ -790,6 +802,21 @@ split.network.activity.based = function(network, number.edges = 5000, number.win ## construct proper bin vectors for configuration bins.date = sort(c(bins.date, bins.date.middle)) + + ## if last regular range and last sliding-window range end at the same time, + ## and the latter contains the former's edges, then: + ## remove the last regular range as it is not complete and we don't lose data when removing it + edges.last.regular = igraph::E(networks[[length(networks)]]) + edges.last.sliding = igraph::E(networks[[length(networks) - 1]]) + if (bins.date[length(bins.date)] == bins.date.middle[length(bins.date.middle)] + && all(edges.last.regular %in% edges.last.sliding) + && table(edges.last.regular$date) %in% table(edges.last.sliding$date) ) { + + networks = networks[-length(networks)] + bins.date = bins.date[-length(bins.date)] + bins =
bins[-length(bins)] + } } ## set bin attribute From 69e871eb50d17b6f20ecf17c1fdb09c9aa13cbca Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Wed, 11 Nov 2020 13:41:25 +0100 Subject: [PATCH 20/37] Add tests for sliding windows Add a completely new test module 'test-split-sliding-window.R', which is based on the existing test module 'test-split.R'. The new test module contains almost the same test setups as the existing test module, but it tests all the splitting functions only using sliding windows. In addition, some new test scenarios are added to test certain corner cases with respect to the last range of sliding-window splitting. In particular, the following functions are tested with sliding windows: - split.data.time.based - split.data.activity.based - split.network.time.based - split.network.activity.based Signed-off-by: Thomas Bock --- tests/test-split-sliding-window.R | 1287 +++++++++++++++++++++++++++++ tests/test-split.R | 1 - 2 files changed, 1287 insertions(+), 1 deletion(-) create mode 100644 tests/test-split-sliding-window.R diff --git a/tests/test-split-sliding-window.R b/tests/test-split-sliding-window.R new file mode 100644 index 00000000..311672a1 --- /dev/null +++ b/tests/test-split-sliding-window.R @@ -0,0 +1,1287 @@ +## This file is part of coronet, which is free software: you
+ + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Split data -------------------------------------------------------------- + +## * time-based ------------------------------------------------------------ + +## * * time period --------------------------------------------------------- + +## +## Tests for split.data.time.based(..., split.basis = 'commits'), using sliding windows +## + +test_that("Split a data object time-based (split.basis == 'commits', sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits(), + mails = project.data$get.mails(), + issues = project.data$get.issues(), + synchronicity = project.data$get.synchronicity(), + pasta = project.data$get.pasta() + ) + + ## split data + results = split.data.time.based(project.data, time.period = "3 min", + split.basis = "commits", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2016-07-12 15:58:59-2016-07-12 16:01:59", + "2016-07-12 16:00:29-2016-07-12 16:03:29", + "2016-07-12 16:01:59-2016-07-12 16:04:59", + "2016-07-12 16:03:29-2016-07-12 16:06:29", + "2016-07-12 16:04:59-2016-07-12 16:06:33" + ) + result = proj.conf$get.value("ranges") + + expect_equal(result, expected, info = "Time ranges.") + + ## check data for all ranges + expected.data = list( + commits = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$commits[1:2, ], + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$commits[2, ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$commits[0, ], + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$commits[3:5, ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commits[3:8, ] + ), + mails = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$mails[0, ], + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$mails[0, ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$mails[rownames(data$mails) == 16, ], + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$mails[rownames(data$mails) %in% c(16, 17), ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$mails[rownames(data$mails) == 17, ] + ), + issues = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(14, 20:22), ], + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$issues[rownames(data$issues) %in% c(14:15), ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(15, 29), ], + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$issues[rownames(data$issues) == 29, ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ] + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$synchronicity, + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$synchronicity, + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$synchronicity, + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$synchronicity, + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$synchronicity + ), + pasta = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$pasta, + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$pasta, + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$pasta, + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$pasta, + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$pasta + ) + ) + results.data = list( + commits = 
lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges.") + +}) + + +## +## Tests for split.data.time.based(..., split.basis = 'mails'), using sliding windows +## + +test_that("Split a data object time-based (split.basis == 'mails', sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits(), + mails = project.data$get.mails(), + issues = project.data$get.issues(), + synchronicity = project.data$get.synchronicity(), + pasta = project.data$get.pasta() + ) + + ## split data + results = split.data.time.based(project.data, time.period = "3 years", + split.basis = "mails", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2004-10-09 18:38:13-2007-10-10 12:38:13", + "2006-04-10 15:38:13-2009-04-10 09:38:13", + "2007-10-10 12:38:13-2010-10-10 06:38:13", + "2009-04-10 09:38:13-2012-04-10 03:38:13", + "2010-10-10 06:38:13-2013-10-10 00:38:13", + "2012-04-10 03:38:13-2015-04-10 21:38:13", + "2013-10-10 00:38:13-2016-07-12 16:05:38" + ) + result = proj.conf$get.value("ranges") + + expect_equal(result, expected, info = "Time ranges.") + + ## check data for all ranges + expected.data = list( + commits = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$commits[0, ], + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$commits[0, ], + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$commits[0, ], + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$commits[0, ], + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$commits[0, ], + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$commits[0, ], + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$commits[1:2, ] + ), + mails = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$mails[rownames(data$mails) %in% 1:2, ], + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$mails[0, ], + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$mails[rownames(data$mails) %in% 3:12, ], + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$mails[rownames(data$mails) %in% 3:12, ], + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$mails[0, ], + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$mails[0, ], + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 13:17, ] + ), + issues = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$issues[0, ], + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$issues[0, ], + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$issues[0, ], + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$issues[0, ], + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 27:29), ] + ), + synchronicity = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$synchronicity, + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$synchronicity, + "2007-10-10 
12:38:13-2010-10-10 06:38:13" = data$synchronicity, + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$synchronicity, + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$synchronicity, + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$synchronicity, + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$synchronicity + ), + pasta = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$pasta, + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$pasta, + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$pasta, + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$pasta, + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$pasta, + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$pasta, + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges.") + +}) + + +## +## Tests for split.data.time.based(..., split.basis = 'issues'), using sliding windows +## + +test_that("Split a data object time-based (split.basis == 'issues', sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits(), + mails = project.data$get.mails(), + issues = project.data$get.issues(), + synchronicity = project.data$get.synchronicity(), + pasta = project.data$get.pasta() + ) + + ## split data + results = split.data.time.based(project.data, time.period = "2 years", + split.basis = "issues", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2013-04-21 23:52:09-2015-04-22 11:52:09", + "2014-04-22 05:52:09-2016-04-21 17:52:09", + "2015-04-22 11:52:09-2017-04-21 23:52:09", + "2016-04-21 17:52:09-2017-05-23 12:32:40" + ) + result = proj.conf$get.value("ranges") + + expect_equal(result, expected, info = "Time ranges.") + + ## check data for all ranges + expected.data = list( + commits = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$commits[0, ], + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$commits[0, ], + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$commits, + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$commits + ), + mails = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$mails[0, ], + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$mails[0, ], + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$mails[rownames(data$mails) %in% 14:17, ], + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 14:17, ] + ), + issues = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$issues[rownames(data$issues) %in% 1:13, ], + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$issues[0, ], + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% 14:34, ], + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% 14:36, ] + ), + synchronicity = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$synchronicity, + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$synchronicity, + "2015-04-22 
11:52:09-2017-04-21 23:52:09" = data$synchronicity, + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$synchronicity + ), + pasta = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$pasta, + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$pasta, + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$pasta, + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges.") + +}) + +## * * bins ---------------------------------------------------------------- + +## +## Tests for split.data.time.based(..., bins = ...), sliding windows parameter ignored +## + +test_that("Split a data object time-based (bins == ... , sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits(), + mails = project.data$get.mails(), + issues = project.data$get.issues(), + synchronicity = project.data$get.synchronicity(), + pasta = project.data$get.pasta() + ) + + ## split data + results = split.data.time.based(project.data, bins = c("2016-01-01 00:00:00", "2016-12-31 23:59:59", + "2017-06-03 03:03:03"), + split.basis = "mails", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2016-01-01 00:00:00-2016-12-31 23:59:59", + "2016-12-31 23:59:59-2017-06-03 03:03:03" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges.") + + ## check data for all ranges + expected.data = list( + commits = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$commits, + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$commits[0, ] + ), + mails = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$mails[rownames(data$mails) %in% 13:17, ], + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$mails[0, ] + ), + issues = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% 14:34, ], + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$issues[rownames(data$issues) %in% 35:36, ] + ), + synchronicity = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$synchronicity, + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$synchronicity + ), + pasta = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$pasta, + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges.") + +}) + +## * activity-based -------------------------------------------------------- + +## +## Tests for split.data.activity.based(..., activity.type = 'commits') using sliding windows +## + +test_that("Split 
a data object activity-based (activity.type = 'commits', sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits(), + mails = project.data$get.mails(), + issues = project.data$get.issues(), + synchronicity = project.data$get.synchronicity(), + pasta = project.data$get.pasta() + ) + + ## split data + results = split.data.activity.based(project.data, activity.amount = 3, + activity.type = "commits", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2016-07-12 15:58:59-2016-07-12 16:06:10", + "2016-07-12 16:00:45-2016-07-12 16:06:20", + "2016-07-12 16:06:10-2016-07-12 16:06:32", + "2016-07-12 16:06:20-2016-07-12 16:06:33" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges (activity.amount).") + + ## check data for all ranges + expected.data = list( + commits = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commits[1:3, ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commits[2:4, ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commits[4:6, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ] + ), + mails = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ] + ), + issues = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29), ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 29), ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 23, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ] + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$synchronicity, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity + ), + pasta = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$pasta, + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$pasta, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$pasta, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges (activity.amount).") + + ## + ## split by too-large activity amount + ## + + ## split data + results = split.data.activity.based(project.data, activity.amount = nrow(data$commits) + 10, + activity.type = "commits", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2016-07-12 15:58:59-2016-07-12 16:06:33" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges 
(too-large activity amount).") + + ## check data for all ranges + expected.data = list( + commits = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commits + ), + mails = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$mails[rownames(data$mails) %in% 16:17, ] + ), + issues = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(14:15, 20:23, 29), ] + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$synchronicity + ), + pasta = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges for too-large activity amount (activity.amount).") + + ## + ## split by number of windows (i.e., ignoring sliding windows) + ## + + ## split data + results = split.data.activity.based(project.data, number.windows = 2, + activity.type = "commits", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2016-07-12 15:58:59-2016-07-12 16:06:20", + "2016-07-12 16:06:20-2016-07-12 16:06:33" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges (number.windows).") + + ## check data for all ranges + expected.data = list( + commits = list( + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$commits[1:4, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ] + ), + mails = list( + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ] + ), + issues = list( + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29), ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ] + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$synchronicity, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity + ), + pasta = list( + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$pasta, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") + + ## too large number of windows (i.e., ignoring sliding windows) + + expect_error( + split.data.activity.based(project.data, activity.type = "commits", + number.windows = nrow(project.data$get.commits()) + 10, sliding.window = TRUE), + info = "Error expected (number.windows) (1)." + ) + + expect_error( + split.data.activity.based(project.data, activity.type = "commits", number.windows = 0, sliding.window = TRUE), + info = "Error expected (number.windows) (2)." 
+ ) + +}) + +test_that("Split a data object activity-based (activity.type = 'commits', sliding.window = TRUE), continued.", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + + ## add one commit to the commit data having same date as latest commit + commit.data = project.data$get.commits() + latest.commit = commit.data[nrow(commit.data), ] + latest.commit[1, "commit.id"] = "" + latest.commit[1, "hash"] = "abcdefghijklmnopqrstuvxyz" + commit.data = rbind(commit.data, latest.commit) + project.data$set.commits(commit.data) + + data = list( + commits = project.data$get.commits(), + mails = project.data$get.mails(), + issues = project.data$get.issues(), + synchronicity = project.data$get.synchronicity(), + pasta = project.data$get.pasta() + ) + + ## split data + results = split.data.activity.based(project.data, activity.amount = 3, + activity.type = "commits", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2016-07-12 15:58:59-2016-07-12 16:06:10", + "2016-07-12 16:00:45-2016-07-12 16:06:20", + "2016-07-12 16:06:10-2016-07-12 16:06:32", + "2016-07-12 16:06:20-2016-07-12 16:06:33", + "2016-07-12 16:06:32-2016-07-12 16:06:33" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges (activity.amount).") + + ## check data for all ranges + expected.data = list( + commits = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commits[1:3, ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commits[2:4, ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commits[4:6, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ], + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commits[7:9, ] + ), + mails = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ], + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$mails[0, ] + ), + issues = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29), ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 29), ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 23, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ], + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$issues[0, ] + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$synchronicity, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity, + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$synchronicity + ), + pasta = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$pasta, + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$pasta, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$pasta, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta, + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = 
lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges (activity.amount).") + +}) + + +## +## Tests for split.data.activity.based(..., activity.type = 'mails') using sliding windows +## + +test_that("Split a data object activity-based (activity.type = 'mails', sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits(), + mails = project.data$get.mails(), + issues = project.data$get.issues(), + synchronicity = project.data$get.synchronicity(), + pasta = project.data$get.pasta() + ) + + ## split data + results = split.data.activity.based(project.data, activity.amount = 3, + activity.type = "mails", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2004-10-09 18:38:13-2010-07-12 11:05:35", + "2005-02-09 18:49:49-2010-07-12 12:05:34", + "2010-07-12 11:05:35-2010-07-12 12:05:41", + "2010-07-12 12:05:34-2010-07-12 12:05:42", + "2010-07-12 12:05:41-2010-07-12 12:05:44", + "2010-07-12 12:05:42-2010-07-12 12:05:45", + "2010-07-12 12:05:44-2016-07-12 15:58:40", + "2010-07-12 12:05:45-2016-07-12 15:58:50", + "2016-07-12 15:58:40-2016-07-12 16:05:37", + "2016-07-12 15:58:50-2016-07-12 16:05:38" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges.") + + ## check data for all ranges + expected.data = list( + commits = list( + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$commits[0, ], + "2005-02-09 18:49:49-2010-07-12 12:05:34" = data$commits[0, ], + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$commits[0, ], + "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$commits[0, ], + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$commits[0, ], + "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$commits[0, ], + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$commits[0, ], + "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$commits[0, ], + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$commits[1:2, ], + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$commits[1:2, ] + ), + mails = list( + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$mails[rownames(data$mails) %in% 1:3, ], + "2005-02-09 18:49:49-2010-07-12 12:05:34" = data$mails[rownames(data$mails) %in% 2:4, ], + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$mails[rownames(data$mails) %in% 4:6, ], + "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$mails[rownames(data$mails) %in% 5:7, ], + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$mails[rownames(data$mails) %in% 7:9, ], + "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$mails[rownames(data$mails) %in% 8:10, ], + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$mails[rownames(data$mails) %in% 10:12, ], + "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$mails[rownames(data$mails) %in% c(11:12, 14), ], + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$mails[rownames(data$mails) %in% 14:16, ], + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 15:17, ] + ), + issues = list( + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$issues[0, ], + 
"2005-02-09 18:49:49-2010-07-12 12:05:34" = data$issues[0, ], + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$issues[0, ], + "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$issues[0, ], + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$issues[0, ], + "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$issues[0, ], + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$issues[rownames(data$issues) %in% c(1:13, 27:28), ], + "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$issues[rownames(data$issues) %in% c(1:13, 27:28), ], + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29), ], + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29), ] + ), + synchronicity = list( + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$synchronicity, + "2005-02-09 18:49:49-2010-07-12 12:05:34" = data$synchronicity, + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$synchronicity, + "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$synchronicity, + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$synchronicity, + "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$synchronicity, + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$synchronicity, + "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$synchronicity, + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$synchronicity, + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$synchronicity + ), + pasta = list( + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$pasta, + "2005-02-09 18:49:49-2010-07-12 12:05:34" = data$pasta, + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$pasta, + "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$pasta, + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$pasta, + "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$pasta, + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$pasta, + "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$pasta, + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$pasta, + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges.") + + ## + ## split by too-large activity amount + ## + + ## split data + results = split.data.activity.based(project.data, activity.amount = nrow(data$mails) + 10, + activity.type = "mails", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2004-10-09 18:38:13-2016-07-12 16:05:38" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges (too-large activity amount).") + + ## check data for all ranges + expected.data = list( + commits = list( + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$commits[1:2, ] + ), + mails = list( + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$mails + ), + issues = list( + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29), ] + ), + synchronicity = list( + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$synchronicity + ), + pasta = list( + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = 
lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges (too-large activity amount).") + + ## + ## split by number of windows (i.e., ignoring sliding windows) + ## + + ## split data + results = split.data.activity.based(project.data, number.windows = 2, + activity.type = "mail", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2004-10-09 18:38:13-2010-07-12 12:05:43", + "2010-07-12 12:05:43-2016-07-12 16:05:38" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges (number.windows).") + + ## check data for all ranges + expected.data = list( + commits = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$commits[0, ], + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$commits[1:2, ] + ), + mails = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$mails[rownames(data$mails) %in% 1:8, ], + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 9:17, ] + ), + issues = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$issues[0, ], + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29), ] + ), + synchronicity = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$synchronicity, + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$synchronicity + ), + pasta = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$pasta, + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") + + ## too large number of windows (i.e., ignoring sliding windows) + + expect_error( + split.data.activity.based(project.data, activity.type = "mails", + number.windows = nrow(project.data$get.mails()) + 10, sliding.window = TRUE), + info = "Error expected (number.windows) (1)." + ) + + expect_error( + split.data.activity.based(project.data, activity.type = "mails", number.windows = 0, sliding.window = TRUE), + info = "Error expected (number.windows) (2)." 
+ ) +}) + + +## +## Tests for split.data.activity.based(..., activity.type = 'issues') using sliding windows +## + +test_that("Split a data object activity-based (activity.type = 'issues', sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits(), + mails = project.data$get.mails(), + issues = project.data$get.issues(), + synchronicity = project.data$get.synchronicity(), + pasta = project.data$get.pasta() + ) + + ## split data + results = split.data.activity.based(project.data, activity.amount = 9, + activity.type = "issues", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2013-04-21 23:52:09-2013-05-25 06:22:23", + "2013-05-06 01:04:34-2016-07-12 15:59:25", + "2013-05-25 06:22:23-2016-07-12 16:03:59", + "2016-07-12 15:59:25-2016-07-27 20:12:08", + "2016-07-12 16:03:59-2016-10-05 15:30:02", + "2016-07-27 20:12:08-2017-05-23 12:31:34", + "2016-10-05 15:30:02-2017-05-23 12:32:40" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges.") + + ## check data for all ranges + expected.data = list( + commits = list( + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$commits[0, ], + "2013-05-06 01:04:34-2016-07-12 15:59:25" = data$commits[1, ], + "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$commits[1:2, ], + "2016-07-12 15:59:25-2016-07-27 20:12:08" = data$commits[2:8, ], + "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$commits[3:8, ], + "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$commits[0, ], + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$commits[0, ] + ), + mails = list( + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$mails[0, ], + "2013-05-06 01:04:34-2016-07-12 15:59:25" = data$mails[rownames(data$mails) %in% 14:15, ], + "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$mails[rownames(data$mails) %in% 14:15, ], + "2016-07-12 15:59:25-2016-07-27 20:12:08" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$mails[0, ], + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$mails[0, ] + ), + issues = list( + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$issues[rownames(data$issues) %in% 1:10, ], + "2013-05-06 01:04:34-2016-07-12 15:59:25" = data$issues[rownames(data$issues) %in% c(6:13, 27:28), ], + "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$issues[rownames(data$issues) %in% c(11:15, 20:22, 27:28), ], + "2016-07-12 15:59:25-2016-07-27 20:12:08" = data$issues[rownames(data$issues) %in% c(14:17, 20:23, 29),], + "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$issues[rownames(data$issues) %in% c(16:19, 23:25, 29:30), ], + "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 30:34),], + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(26, 31:36), ] + ), + synchronicity = list( + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$synchronicity, + "2013-05-06 01:04:34-2016-07-12 15:59:25" = data$synchronicity, + "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$synchronicity, + "2016-07-12 15:59:25-2016-07-27 20:12:08" = data$synchronicity, + "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$synchronicity, + "2016-07-27 
20:12:08-2017-05-23 12:31:34" = data$synchronicity, + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$synchronicity + ), + pasta = list( + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$pasta, + "2013-05-06 01:04:34-2016-07-12 15:59:25" = data$pasta, + "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$pasta, + "2016-07-12 15:59:25-2016-07-27 20:12:08" = data$pasta, + "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$pasta, + "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$pasta, + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges.") + + ## + ## split by too-large activity amount + ## + + ## split data + results = split.data.activity.based(project.data, activity.amount = nrow(data$issues) + 10, + activity.type = "issues", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2013-04-21 23:52:09-2017-05-23 12:32:40" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges (too-large activity amount).") + + ## check data for all ranges + expected.data = list( + commits = list( + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$commits + ), + mails = list( + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 14:17, ] + ), + issues = list( + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$issues + ), + synchronicity = list( + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$synchronicity + ), + pasta = list( + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges (too-large activity amount).") + + ## + ## split by number of windows (i.e., ignoring sliding windows) + ## + + ## split data + results = split.data.activity.based(project.data, number.windows = 2, + activity.type = "issues", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2013-04-21 23:52:09-2016-07-12 16:02:30", + "2016-07-12 16:02:30-2017-05-23 12:32:40" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges (number.windows).") + + ## check data for all ranges + expected.data = list( + commits = list( + "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$commits[1:2, ], + "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$commits[3:8, ] + ), + mails = list( + "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$mails[rownames(data$mails) %in% 14:15, ], + "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 16:17, ] + ), + issues = list( + "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$issues[rownames(data$issues) %in% c(1:14, 20:22, 27:28), ], + "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(15:19, 23:26, 29:36), ] + ), + 
synchronicity = list( + "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$synchronicity, + "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$synchronicity + ), + pasta = list( + "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$pasta, + "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") + + ## too large number of windows (i.e., ignoring sliding windows) + + expect_error( + split.data.activity.based(project.data, activity.type = "issues", + number.windows = nrow(project.data$get.issues()) + 10, sliding.window = TRUE), + info = "Error expected (number.windows) (1)." + ) + + expect_error( + split.data.activity.based(project.data, activity.type = "issues", number.windows = 0, sliding.window = TRUE), + info = "Error expected (number.windows) (2)." + ) +}) + + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Split network ----------------------------------------------------------- + +## * time-based ------------------------------------------------------------ + +## * * time period --------------------------------------------------------- + +## +## Tests for split.network.time.based(..., time.period = ...) using sliding windows +## + +test_that("Split a network time-based (time.period = ... , sliding.window = TRUE).", { + + ## time period + time.period = "2 mins" + + ## configuration and data objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commits.filter.base.artifact", FALSE) + net.conf = NetworkConf$new() + net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) + project.data = ProjectData$new(proj.conf) + net.builder = NetworkBuilder$new(project.data, net.conf) + + ## + ## simplify = FALSE + ## + + ## retrieve author network + author.net = net.builder$get.author.network() + + expected = list( + "2016-07-12 15:58:59-2016-07-12 16:00:59" = igraph::subgraph.edges(author.net, c(1:2)), + "2016-07-12 15:59:59-2016-07-12 16:01:59" = igraph::subgraph.edges(author.net, c(2)), + "2016-07-12 16:00:59-2016-07-12 16:02:59" = igraph::subgraph.edges(author.net, c()), + "2016-07-12 16:01:59-2016-07-12 16:03:59" = igraph::subgraph.edges(author.net, c()), + "2016-07-12 16:02:59-2016-07-12 16:04:59" = igraph::subgraph.edges(author.net, c()), + "2016-07-12 16:03:59-2016-07-12 16:05:59" = igraph::subgraph.edges(author.net, c(3,5)), + "2016-07-12 16:04:59-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(3:8)) + ) + results = split.network.time.based(author.net, time.period = "2 mins", sliding.window = TRUE) + + ## check ranges (labels) + expect_equal(names(results), names(expected), info = "Time ranges.") + + ## check networks + check.identical = mapply(results, expected, FUN = function(r, e) { + igraph::identical_graphs(r, e) + }) + expect_true(all(check.identical), info = "Network equality.") + + ## + ## simplify = TRUE + ## + + ## update network configuration + net.builder$update.network.conf(list(author.relation = "cochange", simplify = TRUE)) + net.builder$reset.environment() + + ## retrieve author network + 
author.net = net.builder$get.author.network()
+
+    expect_error(split.network.time.based(author.net, time.period = time.period, sliding.window = TRUE), info = "Illegal split.")
+
+})
+
+## * activity-based ------------------------------------------------------------
+
+##
+## Tests for split.network.activity.based(...) using sliding windows
+##
+
+test_that("Split a network activity-based (number.edges, number.windows, sliding.window = TRUE).", {
+
+    ## configuration and data objects
+    proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT)
+    proj.conf$update.value("commits.filter.base.artifact", FALSE)
+    net.conf = NetworkConf$new()
+    net.conf$update.values(list(author.relation = "cochange", simplify = FALSE))
+    project.data = ProjectData$new(proj.conf)
+    net.builder = NetworkBuilder$new(project.data, net.conf)
+
+    ## retrieve author network
+    author.net = net.builder$get.author.network()
+
+    ##
+    ## number.edges (1)
+    ##
+
+    ## results
+    expected = list(
+        "2016-07-12 15:58:59-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(1, 2)),
+        "2016-07-12 16:00:45-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(2, 3)),
+        "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(3, 5)),
+        "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(5, 4)),
+        "2016-07-12 16:06:10-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(4, 7)),
+        "2016-07-12 16:06:10-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(7, 6)),
+        "2016-07-12 16:06:32-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(6, 8))
+    )
+    results = split.network.activity.based(author.net, number.edges = 2, sliding.window = TRUE)
+
+    ## check ranges (labels)
+    expect_equal(names(results), names(expected), info = "Time ranges (number.edges (1)).")
+
+    ## check networks
+    check.identical = mapply(results, expected, FUN = function(r, e) {
+        igraph::identical_graphs(r, e)
+    })
+    expect_true(all(check.identical), info = "Network equality (number.edges (1)).")
+
+    ##
+    ## number.edges (2)
+    ##
+
+    ## results
+    expected = list(
+        "2016-07-12 15:58:59-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(1:igraph::ecount(author.net)))
+    )
+    results = split.network.activity.based(author.net, number.edges = igraph::ecount(author.net) + 10,
+                                           sliding.window = TRUE)
+
+    ## check ranges (labels)
+    expect_equal(names(results), names(expected), info = "Time ranges (number.edges (2)).")
+
+    ## check networks
+    check.identical = mapply(results, expected, FUN = function(r, e) {
+        igraph::identical_graphs(r, e)
+    })
+    expect_true(all(check.identical), info = "Network equality (number.edges (2)).")
+
+    ##
+    ## number.windows (1) (i.e., ignoring sliding windows)
+    ##
+
+    ## results
+    expected = list(
+        "2016-07-12 15:58:59-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(1, 2, 3)),
+        "2016-07-12 16:05:41-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(4, 5, 7)),
+        "2016-07-12 16:06:32-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(6, 8))
+    )
+    results = split.network.activity.based(author.net, number.windows = 3, sliding.window = TRUE)
+
+    ## check ranges (labels)
+    expect_equal(names(results), names(expected), info = "Time ranges (number.windows (1)).")
+
+    ## check networks
+    check.identical = mapply(results, expected, FUN = function(r, e) {
+        igraph::identical_graphs(r, e)
+    })
+    expect_true(all(check.identical), info = "Network equality (number.windows (1)).")
+
+    ##
+    ## number.windows (2) (i.e., ignoring sliding windows)
+    ##
+
+    expect_error(
+        split.network.activity.based(author.net, number.windows = igraph::ecount(author.net) + 10,
+                                     sliding.window = TRUE),
+        info = "Error expected (number.windows (2))."
+    )
+
+})
+
+test_that("Split a network activity-based (number.edges, number.windows, sliding.window = TRUE), continued.", {
+
+    ## configuration and data objects
+    proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT)
+    proj.conf$update.value("commits.filter.base.artifact", FALSE)
+    net.conf = NetworkConf$new()
+    net.conf$update.values(list(author.relation = "cochange", simplify = FALSE))
+    project.data = ProjectData$new(proj.conf)
+    net.builder = NetworkBuilder$new(project.data, net.conf)
+
+    ## retrieve author network and add an additional edge in the end
+    author.net = net.builder$get.author.network()
+    author.net = igraph::add_edges(author.net, c("Olaf", "Thomas"),
+                                   attr = list(date = get.date.from.string("2020-02-20 20:20:20")))
+
+    ##
+    ## number.edges (1)
+    ##
+
+    ## results
+    expected = list(
+        "2016-07-12 15:58:59-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(1, 2)),
+        "2016-07-12 16:00:45-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(2, 3)),
+        "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(3, 5)),
+        "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(5, 4)),
+        "2016-07-12 16:06:10-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(4, 7)),
+        "2016-07-12 16:06:10-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(7, 6)),
+        "2016-07-12 16:06:32-2020-02-20 20:20:20" = igraph::subgraph.edges(author.net, c(6, 8)),
+        "2016-07-12 16:06:32-2020-02-20 20:20:21" = igraph::subgraph.edges(author.net, c(8, 9))
+    )
+    results = split.network.activity.based(author.net, number.edges = 2, sliding.window = TRUE)
+
+    ## check ranges (labels)
+    expect_equal(names(results), names(expected), info = "Time ranges (number.edges (1)).")
+
+    ## check networks
+    check.identical = mapply(results, expected, FUN = function(r, e) {
+        igraph::identical_graphs(r, e)
+    })
+    expect_true(all(check.identical), info = "Network equality (number.edges (1)).")
+
+})
diff --git a/tests/test-split.R b/tests/test-split.R
index 652cbecb..bf26d184 100644
--- a/tests/test-split.R
+++ b/tests/test-split.R
@@ -47,7 +47,6 @@ if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data")
 ## TODO
 ##
-## - sliding.window = TRUE
 ## - net.conf$update.values(list(pasta = TRUE, synchronicity = TRUE))

From f3b0ffd3390634a21393a7ed0e45bc22c9df2f7f Mon Sep 17 00:00:00 2001
From: Thomas Bock
Date: Wed, 11 Nov 2020 13:49:57 +0100
Subject: [PATCH 21/37] Add more sliding-window tests & add library for
 parameterized tests

In this commit, additional tests for sliding-window splitting are added,
for example, tests for the function 'split.networks.time.based'.

As these tests don't need their expected outcomes to be determined
manually, but rely on the outcomes of other, already tested functions,
it is not necessary to completely copy existing tests and have them
duplicated for different values of `sliding.window`. Hence, this is a
good example for having parameterized tests: Parameterized tests should
succeed independently of which value certain parameters take. Thus, we
can just implement the test once and run it twice, for example, the
first run with `sliding.window = TRUE` and the second run with
`sliding.window = FALSE`.
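As a rough sketch, such a parameterized test looks like this (using the
'patrick' library introduced below; the test body and the function
'split.somehow' are only hypothetical stand-ins, not actual parts of our
test suite):

    patrick::with_parameters_test_that("Split something, ", {
        ## 'test.sliding.window' is filled in by patrick, once per case
        ## defined below ('split.somehow' is a hypothetical stand-in)
        result = split.somehow(project.data, sliding.window = test.sliding.window)
        expect_true(length(result) > 0)
    }, patrick::cases(
        "sliding window: FALSE" = list(test.sliding.window = FALSE),
        "sliding window: TRUE" = list(test.sliding.window = TRUE)
    ))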
However, as our standard test library 'testthat' does not provide the
possibility to run parameterized tests, we need to add an additional
library for this purpose: the library 'patrick'. For more details on
'patrick', see: https://github.com/google/patrick

Signed-off-by: Thomas Bock
---
 README.md                                 |  1 +
 install.R                                 |  1 +
 tests.R                                   |  3 +-
 tests/test-networks-equal-constructions.R | 41 ++++++++++++++++-------
 tests/test-split.R                        | 24 ++++++++-----
 5 files changed, 48 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index d09d62fe..70e04636 100644
--- a/README.md
+++ b/README.md
@@ -124,6 +124,7 @@ Alternatively, you can run `Rscript install.R` to install the packages.
 - `logging`: Logging
 - `sqldf`: For advanced aggregation of `data.frame` objects
 - `testthat`: For the test suite
+- `patrick`: For the test suite
 - `ggplot2`: For plotting of data
 - `ggraph`: For plotting of networks (needs `udunits2` system library, e.g., `libudunits2-dev` on Ubuntu!)
 - `markovchain`: For core/peripheral transition probabilities
diff --git a/install.R b/install.R
index 132879d9..d796d8d0 100644
--- a/install.R
+++ b/install.R
@@ -33,6 +33,7 @@ packages = c(
     "logging",
     "sqldf",
     "testthat",
+    "patrick",
     "ggplot2",
     "ggraph",
     "markovchain",
diff --git a/tests.R b/tests.R
index 58724171..bf0e60dc 100644
--- a/tests.R
+++ b/tests.R
@@ -42,8 +42,9 @@ sessionInfo()

 logging::loginfo("Running test suite.")

-## load package 'testthat'
+## load packages 'testthat' and 'patrick'
 requireNamespace("testthat")
+requireNamespace("patrick")

 ## starting tests
 do.tests = function(dir) {
diff --git a/tests/test-networks-equal-constructions.R b/tests/test-networks-equal-constructions.R
index fc3e0a09..9f254b9a 100644
--- a/tests/test-networks-equal-constructions.R
+++ b/tests/test-networks-equal-constructions.R
@@ -13,6 +13,7 @@
 ##
 ## Copyright 2018 by Christian Hechtl
 ## Copyright 2018 by Claus Hunsen
+## Copyright 2020 by Thomas Bock
 ## All Rights Reserved.
@@ -86,7 +87,8 @@ compare.edge.and.vertex.lists = function(split.author.networks.one = NULL, split } } -test_that("Compare the bipartite and author network constructed in two ways with author/artifact relation 'cochange'", { +patrick::with_parameters_test_that("Compare the bipartite and author network constructed in two ways + with author/artifact relation 'cochange', ", { ## configuration object for the datapath proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) @@ -106,7 +108,7 @@ test_that("Compare the bipartite and author network constructed in two ways with ## split the networks split.networks = split.networks.time.based(networks = list(author.network, bipartite.network), - time.period = splitting.period, sliding.window = FALSE) + time.period = splitting.period, sliding.window = test.sliding.window) ## separate the author and bipartite networks split.author.networks.one = split.networks[[1]] @@ -116,7 +118,8 @@ test_that("Compare the bipartite and author network constructed in two ways with multi.network = network.builder$get.multi.network() ## split the network - multi.network.split = split.network.time.based(network = multi.network, time.period = splitting.period) + multi.network.split = split.network.time.based(network = multi.network, time.period = splitting.period, + sliding.window = test.sliding.window) split.author.networks.two = list() split.bipartite.networks.two = list() @@ -134,10 +137,13 @@ test_that("Compare the bipartite and author network constructed in two ways with ## created with different approaches compare.edge.and.vertex.lists(split.author.networks.one, split.author.networks.two, split.bipartite.networks.one, split.bipartite.networks.two) -}) +}, patrick::cases( + "sliding window: FALSE" = list(test.sliding.window = FALSE), + "sliding window: TRUE" = list(test.sliding.window = TRUE) +)) -test_that("Compare the bipartite and author network constructed in two ways with author relation 'mail' and artifact relation - 'cochange'", { +patrick::with_parameters_test_that("Compare the bipartite and author network constructed in two ways + with author relation 'mail' and artifact relation 'cochange', ", { ## configuration object for the datapath proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) @@ -158,7 +164,7 @@ test_that("Compare the bipartite and author network constructed in two ways with ## split the networks split.networks = split.networks.time.based(networks = list(author.network, bipartite.network), - time.period = splitting.period, sliding.window = FALSE) + time.period = splitting.period, sliding.window = test.sliding.window) ## separate the author and bipartite networks split.author.networks.one = split.networks[[1]] @@ -168,7 +174,8 @@ test_that("Compare the bipartite and author network constructed in two ways with multi.network = network.builder$get.multi.network() ## split the network - multi.network.split = split.network.time.based(network = multi.network, time.period = splitting.period) + multi.network.split = split.network.time.based(network = multi.network, time.period = splitting.period, + sliding.window = test.sliding.window) split.author.networks.two = list() split.bipartite.networks.two = list() @@ -187,9 +194,13 @@ test_that("Compare the bipartite and author network constructed in two ways with ## created with different approaches compare.edge.and.vertex.lists(split.author.networks.one, split.author.networks.two, split.bipartite.networks.one, split.bipartite.networks.two) -}) +}, 
patrick::cases( + "sliding window: FALSE" = list(test.sliding.window = FALSE), + "sliding window: TRUE" = list(test.sliding.window = TRUE) +)) -test_that("Compare the bipartite and author network constructed in two ways with author and artifact relation 'mail'", { +patrick::with_parameters_test_that("Compare the bipartite and author network constructed in two ways + with author and artifact relation 'mail', ", { ## configuration object for the datapath proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) @@ -210,7 +221,7 @@ test_that("Compare the bipartite and author network constructed in two ways with ## split the networks split.networks = split.networks.time.based(networks = list(author.network, bipartite.network), - time.period = splitting.period, sliding.window = FALSE) + time.period = splitting.period, sliding.window = test.sliding.window) ## separate the author and bipartite networks split.author.networks.one = split.networks[[1]] @@ -220,7 +231,8 @@ test_that("Compare the bipartite and author network constructed in two ways with multi.network = network.builder$get.multi.network() ## split the network - multi.network.split = split.network.time.based(network = multi.network, time.period = splitting.period) + multi.network.split = split.network.time.based(network = multi.network, time.period = splitting.period, + sliding.window = test.sliding.window) split.author.networks.two = list() split.bipartite.networks.two = list() @@ -239,4 +251,7 @@ test_that("Compare the bipartite and author network constructed in two ways with ## created with different approaches compare.edge.and.vertex.lists(split.author.networks.one, split.author.networks.two, split.bipartite.networks.one, split.bipartite.networks.two) -}) +}, patrick::cases( + "sliding window: FALSE" = list(test.sliding.window = FALSE), + "sliding window: TRUE" = list(test.sliding.window = TRUE) +)) diff --git a/tests/test-split.R b/tests/test-split.R index bf26d184..b20676b6 100644 --- a/tests/test-split.R +++ b/tests/test-split.R @@ -14,6 +14,7 @@ ## Copyright 2017-2019 by Claus Hunsen ## Copyright 2017 by Felix Prasse ## Copyright 2018 by Thomas Bock +## Copyright 2020 by Thomas Bock ## Copyright 2018 by Christian Hechtl ## Copyright 2018 by Jakob Kronawitter ## Copyright 2019 by Anselm Fehnker @@ -1085,7 +1086,7 @@ test_that("Split a network time-based (time.period = ...).", { ## Tests for split.networks.time.based(..., time.period = ...) 
 ##
-test_that("Split a list of networks time-based.", {
+patrick::with_parameters_test_that("Split a list of networks time-based, ", {

     ## time period
     time.period = "2 years"
@@ -1110,7 +1111,7 @@
     net.split = split.networks.time.based(
         networks = list(net.cochange, net.mail),
         time.period = time.period,
-        sliding.window = FALSE
+        sliding.window = test.sliding.window
     )

     ## check whether the splitting information of the two split networks are identical
@@ -1120,10 +1121,13 @@
     net.split = split.networks.time.based(
         networks = list(net.mail),
         time.period = time.period,
-        sliding.window = FALSE
+        sliding.window = test.sliding.window
     )

-})
+}, patrick::cases(
+    "sliding window: FALSE" = list(test.sliding.window = FALSE),
+    "sliding window: TRUE" = list(test.sliding.window = TRUE)
+))

 ## * * bins ----------------------------------------------------------------
@@ -1131,7 +1135,7 @@
 ## Tests for split.network.time.based(..., bins = ...)
 ##

-test_that("Split a network time-based (bins = ...).", {
+patrick::with_parameters_test_that("Split a network time-based (bins = ...), ", {

     ## bins
     bins = c("2016-07-12 15:58:00", "2016-07-12 16:00:59", "2016-07-12 16:02:59",
@@ -1159,7 +1163,7 @@
         "2016-07-12 16:02:59-2016-07-12 16:04:59" = igraph::subgraph.edges(author.net, c()),
         "2016-07-12 16:04:59-2016-07-12 17:21:43" = igraph::subgraph.edges(author.net, c(3:8))
     )
-    results = split.network.time.based(author.net, bins = bins)
+    results = split.network.time.based(author.net, bins = bins, sliding.window = test.sliding.window)

     ## check ranges (labels)
     expect_equal(names(results), names(expected), info = "Time ranges.")
@@ -1181,9 +1185,13 @@
     ## retrieve author network
     author.net = net.builder$get.author.network()

-    expect_error(split.network.time.based(author.net, bins = bins), info = "Illegal split.")
+    expect_error(split.network.time.based(author.net, bins = bins, sliding.window = test.sliding.window),
+                 info = "Illegal split.")

-})
+}, patrick::cases(
+    "sliding window (ignored): FALSE" = list(test.sliding.window = FALSE),
+    "sliding window (ignored): TRUE" = list(test.sliding.window = TRUE)
+))

 ## * * ranges --------------------------------------------------------------------

From 720cc7ba7bdb635129c7669911aef8e7c6200a6b Mon Sep 17 00:00:00 2001
From: Thomas Bock
Date: Sat, 14 Nov 2020 15:32:31 +0100
Subject: [PATCH 22/37] Fix missing layout when plotting networks

For unknown reasons, setting the layout for ggraph, which is used for
plotting networks, has changed. To avoid errors due to a missing or
wrongly set layout, the corresponding implementation in function
'plot.get.plot.for.network' is adjusted in two ways:

1) When creating a ggraph object, we already need to explicitly specify
the layout we want to use. As we rely on igraph and its layouts, we use
'layout = "igraph"' and then we have to specify the concrete layout
algorithm from igraph (as 'algorithm' parameter).

2) As opposed to the previous layout specification, a layout name now
has to be stated instead of naming a layout function. Unfortunately,
there is no official list of supported igraph layouts.
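To illustrate, the adjusted layout handling now looks roughly like this
(a condensed sketch of the relevant lines, assuming 'network' is an
igraph network object):

    ## fall back to the default layout "kk" if no layout is set
    if (!("layout" %in% igraph::list.graph.attributes(network))) {
        network = igraph::set.graph.attribute(network, "layout", "kk")
    }

    ## read the layout short name and hand it to ggraph explicitly
    layout.algorithm = igraph::get.graph.attribute(network, "layout")
    p = ggraph::ggraph(network, layout = "igraph", algorithm = layout.algorithm)

    ## to use a different igraph layout, set the attribute beforehand,
    ## e.g., "fr" (Fruchterman-Reingold); whether a given short name
    ## works for ggraph has to be checked individually
    network = igraph::set.graph.attribute(network, "layout", "fr")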
As there is no such official list, we need to rely on the following
table (which comes from Python and not from R):
https://igraph.org/python/doc/tutorial/tutorial.html#layout-algorithms
Notice that the layout algorithm needs to be given as a string in the
form of the 'short name' given in this table. However, if there are
multiple short names specified in the table, only one of them might
actually work for ggraph.

As before, the default layout is 'kk' (Kamada-Kawai). If one wants to
use another igraph layout, just set the graph attribute 'layout' to the
short name of the desired layout; this one will then be used when
creating the ggraph object (see the sketch above).

This commit fixes issue #186.

Signed-off-by: Thomas Bock
---
 util-plot.R | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/util-plot.R b/util-plot.R
index 0c009392..6bf105d9 100644
--- a/util-plot.R
+++ b/util-plot.R
@@ -46,11 +46,13 @@ PLOT.VERTEX.LABEL.COLOR = "gray60"

 #' Construct a ggplot2/ggraph plot object for the given network and print it directly.
 #'
-#' As a layout, by default, \code{igraph::layout.kamada.kawai} (also known as \code{igraph::layout_with_kk})
+#' As a layout, by default, the "kk" layout from igraph (also known as "layout_kamada_kawai")
 #' is used, unless a graph attribute "layout" is set. For a comprehensive list of layouts and more information
-#' on layouts in general, see \link{http://igraph.org/r/doc/layout_.html}.
+#' on layouts in general, see \link{https://igraph.org/python/doc/tutorial/tutorial.html#layout-algorithms}.
 #' To set the graph attribute on your network, run the following code while replacing \code{layout.to.set}
 #' to your liking: \code{network = igraph::set.graph.attribute(network, "layout", layout.to.set)}.
+#' Note that \code{layout.to.set} refers to one of the "short names" of the respective igraph layout, as
+#' specified on the Web site in the link given above.
 #'
 #' Note: The names for the vertex types are taken from the variables \code{PLOT.VERTEX.TYPE.AUTHOR} and
 #' \code{PLOT.VERTEX.TYPE.ARTIFACT}. The defaults are \code{"Developer"} and \code{TYPE.ARTIFACT}, respectively.
@@ -68,11 +70,13 @@ plot.network = function(network, labels = TRUE) {

 #' Construct a ggplot2/ggraph plot object for the given network and print it directly.
 #'
-#' As a layout, by default, \code{igraph::layout.kamada.kawai} (also known as \code{igraph::layout_with_kk})
+#' As a layout, by default, the "kk" layout from igraph (also known as "layout_kamada_kawai")
 #' is used, unless a graph attribute "layout" is set. For a comprehensive list of layouts and more information
-#' on layouts in general, see \link{http://igraph.org/r/doc/layout_.html}.
+#' on layouts in general, see \link{https://igraph.org/python/doc/tutorial/tutorial.html#layout-algorithms}.
 #' To set the graph attribute on your network, run the following code while replacing \code{layout.to.set}
 #' to your liking: \code{network = igraph::set.graph.attribute(network, "layout", layout.to.set)}.
+#' Note that \code{layout.to.set} refers to one of the "short names" of the respective igraph layout, as
+#' specified on the Web site in the link given above.
 #'
 #' Note: The names for the vertex types are taken from the variables \code{PLOT.VERTEX.TYPE.AUTHOR} and
 #' \code{PLOT.VERTEX.TYPE.ARTIFACT}. The defaults are \code{"Developer"} and \code{TYPE.ARTIFACT}, respectively.
@@ -91,11 +95,13 @@ plot.print.network = function(network, labels = TRUE) {

 #' Construct a ggplot2/ggraph plot object for the given network.
 #'
-#' As a layout, by default, \code{igraph::layout.kamada.kawai} (also known as \code{igraph::layout_with_kk})
+#' As a layout, by default, the "kk" layout from igraph (also known as "layout_kamada_kawai")
 #' is used, unless a graph attribute "layout" is set. For a comprehensive list of layouts and more information
-#' on layouts in general, see \link{http://igraph.org/r/doc/layout_.html}.
+#' on layouts in general, see \link{https://igraph.org/python/doc/tutorial/tutorial.html#layout-algorithms}.
 #' To set the graph attribute on your network, run the following code while replacing \code{layout.to.set}
 #' to your liking: \code{network = igraph::set.graph.attribute(network, "layout", layout.to.set)}.
+#' Note that \code{layout.to.set} refers to one of the "short names" of the respective igraph layout, as
+#' specified on the Web site in the link given above.
 #'
 #' Note: The names for the vertex types are taken from the variables \code{PLOT.VERTEX.TYPE.AUTHOR} and
 #' \code{PLOT.VERTEX.TYPE.ARTIFACT}. The defaults are \code{"Developer"} and \code{TYPE.ARTIFACT}, respectively.
@@ -123,13 +129,14 @@ plot.get.plot.for.network = function(network, labels = TRUE) {
     ## fix the type attributes (add new ones, also named)
     network = plot.fix.type.attributes(network)

-    ## set network layout
+    ## set igraph network layout if not layout is set yet
     if (!("layout" %in% igraph::list.graph.attributes(network))) {
-        network = igraph::set.graph.attribute(network, "layout", igraph::layout.kamada.kawai)
+        network = igraph::set.graph.attribute(network, "layout", "kk")
     }
+    layout.algorithm = igraph::get.graph.attribute(network, "layout")

-    ## create a ggraph object
-    p = ggraph::ggraph(network)
+    ## create a ggraph object using the specified igraph layout
+    p = ggraph::ggraph(network, layout = "igraph", algorithm = layout.algorithm)

     ## plot edges if there are any
     if (igraph::ecount(network) > 0) {

From f0b3c10e4825499619bfa912c20afb93e696f1ed Mon Sep 17 00:00:00 2001
From: Thomas Bock
Date: Fri, 27 Nov 2020 19:31:06 +0100
Subject: [PATCH 23/37] Fix minor issues from review in PR #184

Signed-off-by: Thomas Bock
---
 CONTRIBUTING.md                   |  2 +-
 tests.R                           |  1 +
 tests/test-split-sliding-window.R |  8 ++++----
 tests/test-split.R                |  8 ++++----
 util-networks-misc.R              |  4 ++--
 util-plot.R                       |  3 ++-
 util-split.R                      | 14 +++++++-------
 7 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 44daa85e..7191d35d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -129,7 +129,7 @@ The current build status is as follows:
 * Code must be reviewed by one other project member and, if needed, be properly adapted/fixed.
 * We add the `Reviewed-by` tag only for the merge commit.

-There will be another checklist for you when you open an actual pull request provided by [the corresponding template](.github/PULL_REQUEST_TEMPLATE/pull-request.md).
+There will be another checklist for you when you open an actual pull request provided by [the corresponding template](.github/PULL_REQUEST_TEMPLATE.md).
diff --git a/tests.R b/tests.R
index bf0e60dc..89981076 100644
--- a/tests.R
+++ b/tests.R
@@ -12,6 +12,7 @@
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ##
 ## Copyright 2017, 2019 by Claus Hunsen
+## Copyright 2020 by Thomas Bock
 ## All Rights Reserved.
## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / diff --git a/tests/test-split-sliding-window.R b/tests/test-split-sliding-window.R index 311672a1..ea9d712e 100644 --- a/tests/test-split-sliding-window.R +++ b/tests/test-split-sliding-window.R @@ -55,7 +55,7 @@ if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data") ## Tests for split.data.time.based(..., split.basis = 'commits'), using sliding windows ## -test_that("Split a data object time-based (split.basis == 'commits', sliding.window = TRUE).", { +test_that("Split a data object time-based (split.basis = 'commits', sliding.window = TRUE).", { ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) @@ -142,7 +142,7 @@ test_that("Split a data object time-based (split.basis == 'commits', sliding.win ## Tests for split.data.time.based(..., split.basis = 'mails'), using sliding windows ## -test_that("Split a data object time-based (split.basis == 'mails', sliding.window = TRUE).", { +test_that("Split a data object time-based (split.basis = 'mails', sliding.window = TRUE).", { ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) @@ -241,7 +241,7 @@ test_that("Split a data object time-based (split.basis == 'mails', sliding.windo ## Tests for split.data.time.based(..., split.basis = 'issues'), using sliding windows ## -test_that("Split a data object time-based (split.basis == 'issues', sliding.window = TRUE).", { +test_that("Split a data object time-based (split.basis = 'issues', sliding.window = TRUE).", { ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) @@ -323,7 +323,7 @@ test_that("Split a data object time-based (split.basis == 'issues', sliding.wind ## Tests for split.data.time.based(..., bins = ...), sliding windows parameter ignored ## -test_that("Split a data object time-based (bins == ... , sliding.window = TRUE).", { +test_that("Split a data object time-based (bins = ... 
, sliding.window = TRUE).", { ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) diff --git a/tests/test-split.R b/tests/test-split.R index b20676b6..b97926f5 100644 --- a/tests/test-split.R +++ b/tests/test-split.R @@ -62,7 +62,7 @@ if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data") ## Tests for split.data.time.based(..., split.basis = 'commits') ## -test_that("Split a data object time-based (split.basis == 'commits').", { +test_that("Split a data object time-based (split.basis = 'commits').", { ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) @@ -137,7 +137,7 @@ test_that("Split a data object time-based (split.basis == 'commits').", { ## Tests for split.data.time.based(..., split.basis = 'mails') ## -test_that("Split a data object time-based (split.basis == 'mails').", { +test_that("Split a data object time-based (split.basis = 'mails').", { ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) @@ -219,7 +219,7 @@ test_that("Split a data object time-based (split.basis == 'mails').", { ## Tests for split.data.time.based(..., split.basis = 'issues') ## -test_that("Split a data object time-based (split.basis == 'issues').", { +test_that("Split a data object time-based (split.basis = 'issues').", { ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) @@ -296,7 +296,7 @@ test_that("Split a data object time-based (split.basis == 'issues').", { ## Tests for split.data.time.based(..., bins = ...) ## -test_that("Split a data object time-based (bins == ... ).", { +test_that("Split a data object time-based (bins = ... ).", { ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) diff --git a/util-networks-misc.R b/util-networks-misc.R index a98e999f..1bd4eeeb 100644 --- a/util-networks-misc.R +++ b/util-networks-misc.R @@ -11,8 +11,8 @@ ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## -## Copyright 2016 by Sofie Kemper -## Copyright 2016 by Claus Hunsen +## Copyright 2016-2017 by Sofie Kemper +## Copyright 2016-2017 by Claus Hunsen ## Copyright 2016-2018 by Thomas Bock ## Copyright 2017 by Angelika Schmid ## Copyright 2019 by Jakob Kronawitter diff --git a/util-plot.R b/util-plot.R index 6bf105d9..e8c25859 100644 --- a/util-plot.R +++ b/util-plot.R @@ -14,6 +14,7 @@ ## Copyright 2017-2018 by Claus Hunsen ## Copyright 2018 by Barbara Eckl ## Copyright 2018 by Thomas Bock +## Copyright 2020 by Thomas Bock ## All Rights Reserved. 
@@ -129,7 +130,7 @@ plot.get.plot.for.network = function(network, labels = TRUE) {
     ## fix the type attributes (add new ones, also named)
     network = plot.fix.type.attributes(network)

-    ## set igraph network layout if not layout is set yet
+    ## set igraph network layout if no layout is set yet
     if (!("layout" %in% igraph::list.graph.attributes(network))) {
         network = igraph::set.graph.attribute(network, "layout", "kk")
     }
diff --git a/util-split.R b/util-split.R
index 8741ee11..9e523f21 100644
--- a/util-split.R
+++ b/util-split.R
@@ -293,7 +293,7 @@ split.data.activity.based = function(project.data, activity.type = c("commits",
     } else if (sliding.window) {
         ## get the list of unique items that are used for the bin computation and, thus, also the
         ## cropping of data
-        items.unique = unique(data[[ activity.type ]][[ id.column[[activity.type]] ]])
+        items.unique = unique(data[[activity.type]][[ id.column[[activity.type]] ]])
         items.unique.count = length(items.unique)

         ## offsets used for cropping (half the first/last bin)
@@ -311,12 +311,12 @@ split.data.activity.based = function(project.data, activity.type = c("commits",

         ## determine end bin of last sliding-window range
         end.event.id = items.unique[(items.unique.count - offset.end + 1)]
-        end.event.logical = data[[ activity.type ]][[ id.column[[activity.type]] ]] == end.event.id
-        end.event.date = data[[ activity.type ]][end.event.logical, ][[ "date" ]]
+        end.event.logical = (data[[activity.type]][[ id.column[[activity.type]] ]] == end.event.id)
+        end.event.date = data[[activity.type]][end.event.logical, ][["date"]]

         ## store the data again
-        data.to.cut = data[[ activity.type ]][[ id.column[[activity.type]] ]] %in% items.cut
-        data[[ activity.type ]] = data[[ activity.type ]][ !data.to.cut, ]
+        data.to.cut = data[[activity.type]][[ id.column[[activity.type]] ]] %in% items.cut
+        data[[activity.type]] = data[[activity.type]][ !data.to.cut, ]

         ## clone the project data and update raw data to split it again
         project.data.clone = project.data$clone()
@@ -342,7 +342,7 @@ split.data.activity.based = function(project.data, activity.type = c("commits",
             bins.date = sort(c(bins.date, bins.date.middle))
             bins = get.date.string(bins.date)

-            ## if last regular range and last sliding-window range end at the same time
+            ## if the last regular range and the last sliding-window range end at the same time
             ## and the data of the last regular range is contained in the last sliding-window range, then:
             ## remove the last regular range as it is not complete and we don't lose data when removing it
             last.regular.range = cf.data[[length(cf.data)]]
@@ -803,7 +803,7 @@ split.network.activity.based = function(network, number.edges = 5000, number.win
         ## construct proper bin vectors for configuration
         bins.date = sort(c(bins.date, bins.date.middle))

-        ## if last regular range and last sliding-window range end at the same time,
+        ## if the last regular range and the last sliding-window range end at the same time
         ## and the latter contains the former's edges, then:
         ## remove the last regular range as it is not complete and we don't lose data when removing it
         edges.last.regular = igraph::E(networks[[length(networks)]])

From 1ba55517486104098a03442d867ddb95b7292ead Mon Sep 17 00:00:00 2001
From: Thomas Bock
Date: Sat, 28 Nov 2020 16:07:48 +0100
Subject: [PATCH 24/37] Update changelog file

Signed-off-by: Thomas Bock
---
 NEWS.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index b1bfc74b..c131db68 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -6,12 +6,19 @@
 - Add a new file `util-tensor.R` containing the class `FourthOrderTensor` to create (author x relation x author x relation) tensors from a list of networks (with each network having a different relation) and its corresponding utility function `get.author.networks.for.multiple.relations` (PR #173, c136b1f6127d73c25f08ae2f317246747aa9ea2b, e4ee0dc926b22ff75d5fd801c1f131bcff4c22eb, 051a5f0287022f97e2367ed0e9591b9df9dbdb3d)
 - Add function `calculate.EDCPTD.centrality` for calculating the EDCPTD centrality for a fourth-order tensor in the above described form (c136b1f6127d73c25f08ae2f317246747aa9ea2b, e4ee0dc926b22ff75d5fd801c1f131bcff4c22eb, 051a5f0287022f97e2367ed0e9591b9df9dbdb3d)
 - Add new file `util-networks-misc.R` which contains miscellaneous functions for processing network data and creating and converting various kinds of adjacency matrices: `get.author.names.from.networks`, `get.author.names.from.data`, `get.expanded.adjacency`, `get.expanded.adjacency.matrices`, `get.expanded.adjacency.matrices.cumulated`, `convert.adjacency.matrix.list.to.array` (051a5f0287022f97e2367ed0e9591b9df9dbdb3d)
+- Add tests for sliding-window functionality and make parameterized tests possible (a3ad0a81015c7f23bce958d5c1922e3b82b28bda, 2ed84ac55d434f62341297b1aa9676c12e383491, PR #184)

 ### Changed/Improved
 - Adjust the function `get.authors.by.data.source`: Rename its single parameter to `data.sources` and change the function so that it can extract the authors for multiple data sources at once. The default value of the parameter is a vector containing all the available data sources (commits, mails, issues) (051a5f0287022f97e2367ed0e9591b9df9dbdb3d)
 - Adjust recommended R version to 3.6.3 in README (92be262514277acb774ab2885c1c0d1c10f03373)
 - Add R version 4.0 to test suite and adjust package installation in `install.R` to improve compatibility with Travis CI (40aa0d80e2a94434a8be75925dbefbde6d3518b2, 1ba036758a63767e2fcef525c98f5a4fd6938c39, #161)

+### Fixed
+
+- Fix sliding-window creation in various splitting functions (`split.network.time.based`, `split.networks.time.based`, `split.data.time.based`, `split.data.activity.based`, `split.network.activity.based`) and also fix the computation of overlapping ranges in the function `construct.overlapping.ranges` to make sure that the last and the second-last range do not cover the same range (1abc1b8dbfc65ccad0cbbc8e33b209e39d2f8118, c34c42aef32a30b82adc53384fd6a1b09fc75dee, 097cebcc477b1b65056d512124575f5a78229c3e, 9a1b6516f490b72b821be2d5365d98cac1907b2f, 0fc179e2735bec37d26a68c6c351ab43770007d2, cad28bf221f942eb25e997aaa2de553181956680, PR #184)
+- Fix off-by-1 error in the function `get.data.cut.to.same.date` (f0744c0e14543292cccb1aa9a61f822755ee7183)
+- Fix missing or wrongly set layout when plotting networks (#186, 720cc7ba7bdb635129c7669911aef8e7c6200a6b)
+
 ## 3.6

From 63382b446c37291a89d93284957f03aaeb341e67 Mon Sep 17 00:00:00 2001
From: Thomas Bock
Date: Mon, 30 Nov 2020 19:24:26 +0100
Subject: [PATCH 25/37] Fix typo in variable name 'adjacency.list'

In the function 'convert.adjacency.matrix.list.to.array', there was a
typo in a variable name which led to an error when calling the function.
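For illustration, a minimal sketch of the failure mode (the data below is
hypothetical, not part of this commit): `seq_along` evaluates its
argument, so a misspelled name raises an error as soon as it is used.

    adjacency.list = list(matrix(0, 2, 2), matrix(1, 2, 2))
    seq_along(adjacency.ist)   ## Error: object 'adjacency.ist' not found
    seq_along(adjacency.list)  ## 1 2, as intended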
Signed-off-by: Thomas Bock
---
 util-networks-misc.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/util-networks-misc.R b/util-networks-misc.R
index 1bd4eeeb..a9053220 100644
--- a/util-networks-misc.R
+++ b/util-networks-misc.R
@@ -14,6 +14,7 @@
 ## Copyright 2016-2017 by Sofie Kemper
 ## Copyright 2016-2017 by Claus Hunsen
 ## Copyright 2016-2018 by Thomas Bock
+## Copyright 2020 by Thomas Bock
 ## Copyright 2017 by Angelika Schmid
 ## Copyright 2019 by Jakob Kronawitter
 ## Copyright 2019-2020 by Anselm Fehnker
@@ -224,7 +225,7 @@ convert.adjacency.matrix.list.to.array = function(adjacency.list){
     colnames(array) = colnames(adjacency.list[[1]])

     ## copy the activity values from the adjacency matrices in the list to the corresponding array slices
-    for (i in seq_along(adjacency.ist)){
+    for (i in seq_along(adjacency.list)){
         adjacency = adjacency.list[[i]]

         activity.indices = which(adjacency != 0, arr.ind = TRUE)

From 7602af2cf46f699b2285d53819dec614c71754c6 Mon Sep 17 00:00:00 2001
From: Thomas Bock
Date: Tue, 1 Dec 2020 12:37:19 +0100
Subject: [PATCH 26/37] Fix minor issues from second review in PR #184

Add more documentation to activity-based data splitting and add a
`unique` call to make sure that there is only one date even if there
could be several identical dates in `end.event.date`. In addition, fix
some minor inconsistencies.

Signed-off-by: Thomas Bock
---
 util-split.R | 37 +++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/util-split.R b/util-split.R
index 9e523f21..778d6501 100644
--- a/util-split.R
+++ b/util-split.R
@@ -302,17 +302,17 @@ split.data.activity.based = function(project.data, activity.type = c("commits",
         ## cut the data appropriately
         if (offset.end > 0) {
             items.cut = c(
-                items.unique[1:offset.start],
-                items.unique[(items.unique.count - offset.end + 1):items.unique.count]
+                items.unique[seq_len(offset.start)],
+                items.unique[seq(from = (items.unique.count - offset.end + 1), to = items.unique.count)]
             )
         } else {
-            items.cut = items.unique[1:offset.start]
+            items.cut = items.unique[seq_len(offset.start)]
         }

         ## determine end bin of last sliding-window range
         end.event.id = items.unique[(items.unique.count - offset.end + 1)]
         end.event.logical = (data[[activity.type]][[ id.column[[activity.type]] ]] == end.event.id)
-        end.event.date = data[[activity.type]][end.event.logical, ][["date"]]
+        end.event.date = unique(data[[activity.type]][end.event.logical, ][["date"]])

         ## store the data again
         data.to.cut = data[[activity.type]][[ id.column[[activity.type]] ]] %in% items.cut
@@ -356,8 +356,17 @@ split.data.activity.based = function(project.data, activity.type = c("commits",
             cf.data = cf.data[-length(cf.data)]
             bins.date = bins.date[-length(bins.date)]
             bins = bins[-length(bins)]
-        } else if (!(bins.date[length(bins.date)] == bins.date.middle[length(bins.date.middle)])) {
-            ## adjust the end date of the last sliding-window range
+        } else if (bins.date[length(bins.date)] != bins.date.middle[length(bins.date.middle)]) {
+            ## adjust the end date of the last sliding-window range, as it might be shorter than it should be:
+            ## The end of the last range usually is one second after the last event (as end dates are exclusive).
+            ## In case of sliding windows, the end of the last sliding range needs to be extended to the date of the
+            ## next event after that range (as end dates are exclusive) to get a full range as for all the previous
+            ## ranges, which end at the beginning of the next range, i.e., at the date of the first event after the
+            ## actual range.
+
+            ## When we have sliding windows, there are at least three ranges (two regular ranges and one
+            ## sliding-window range). Hence, there are always more than three elements in the bins vector, so accessing
+            ## bins[length(bins) - 3] cannot throw errors in this case.
             name.last.sliding.window = construct.ranges(c(bins[length(bins) - 3], get.date.string(end.event.date)))
             names(cf.data)[length(cf.data) - 1] = name.last.sliding.window
             bins.date[length(bins.date) - 1] = end.event.date
@@ -564,7 +573,7 @@ split.network.time.based = function(network, time.period = "3 months", bins = NU
         ## find bins for dates
         bins.date = get.date.from.string(bins)
         bins.vector = findInterval(dates, bins.date, all.inside = FALSE)
-        bins = 1:(length(bins.date) - 1) # the last item just closes the last bin
+        bins = seq_len(length(bins.date) - 1) # the last item just closes the last bin
     }

     ## perform additional steps for sliding-window approach
@@ -655,7 +664,7 @@ split.networks.time.based = function(networks, time.period = "3 months", bins =
         bins.info = construct.overlapping.ranges(start = min(dates), end = max(dates),
                                                  time.period = time.period, overlap = 0.5, raw = TRUE,
                                                  include.end.date = TRUE)
-        bins.date = unname(unique(get.date.from.unix.timestamp(unlist(bins.info))))
+        bins.date = sort(unname(unique(get.date.from.unix.timestamp(unlist(bins.info)))))
     } else {
         bins.info = split.get.bins.time.based(dates, time.period, number.windows)
         bins.date = get.date.from.string(bins.info[["bins"]])
@@ -673,7 +682,7 @@ split.networks.time.based = function(networks, time.period = "3 months", bins =
         if (sliding.window) {
             nets = split.network.time.based.by.ranges(network = net, ranges = ranges,
                                                       remove.isolates = remove.isolates)
-            attr(nets, "bins") = sort(bins.date)
+            attr(nets, "bins") = bins.date
         } else {
             nets = split.network.time.based(network = net, bins = bins.date, sliding.window = sliding.window,
                                             remove.isolates = remove.isolates)
@@ -776,11 +785,11 @@ split.network.activity.based = function(network, number.edges = 5000, number.win
         ## cut the data appropriately
         if (offset.end > 0) {
             edges.cut = c(
-                edges.by.date[1:offset.start],
-                edges.by.date[(edge.count - offset.end + 1):edge.count]
+                edges.by.date[seq_len(offset.start)],
+                edges.by.date[seq(from = (edge.count - offset.end + 1), to = edge.count)]
             )
         } else {
-            edges.cut = edges.by.date[1:offset.start]
+            edges.cut = edges.by.date[seq_len(offset.start)]
         }

         ## delete edges from the network and create a new network
@@ -1018,7 +1027,7 @@ split.get.bins.activity.based = function(df, id, activity.amount, remove.duplica
     bins.number.complete = length(ids.unique) %/% activity.amount
     bins.number.incomplete = length(ids.unique) %% activity.amount
     bins.activity = c(
-        if (bins.number.complete != 0) rep(1:bins.number.complete, each = activity.amount),
+        if (bins.number.complete != 0) rep(seq_len(bins.number.complete), each = activity.amount),
         rep(bins.number.complete + 1, bins.number.incomplete)
     )
     bins.number = max(bins.activity)
@@ -1030,7 +1039,7 @@ split.get.bins.activity.based = function(df, id, activity.amount, remove.duplica
     )

     ## get the start (and end) date for all bins
-    bins.date = parallel::mclapply(1:bins.number, function(bin) {
+    bins.date = parallel::mclapply(seq_len(bins.number), function(bin) {
         ## get the ids in the bin
         ids = bins.mapping[ bins.mapping[["bin"]] == bin, "id"]
         ## grab dates for the ids

From 877931b94f87ca097c2f8f3c55e4b4bcc6087742 Mon Sep 17 00:00:00 2001
From: Claus Hunsen
Date: Tue, 1 Dec 2020 12:51:55 +0100
Subject: [PATCH 27/37] Fix setting of the layout when plotting networks

This is a follow-up commit for commit
d9e604e65cac610f97f9eb724528d3a8355f8af4.

In more recent package versions of igraph and ggraph, there is no
layout "igraph". Instead, the layout algorithm has to be set directly
to the "layout" parameter.

This fixes #186.

Signed-off-by: Claus Hunsen
Signed-off-by: Thomas Bock
---
 util-plot.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util-plot.R b/util-plot.R
index e8c25859..fdb60232 100644
--- a/util-plot.R
+++ b/util-plot.R
@@ -137,7 +137,7 @@ plot.get.plot.for.network = function(network, labels = TRUE) {
     layout.algorithm = igraph::get.graph.attribute(network, "layout")

     ## create a ggraph object using the specified igraph layout
-    p = ggraph::ggraph(network, layout = "igraph", algorithm = layout.algorithm)
+    p = ggraph::ggraph(network, layout = layout.algorithm)

     ## plot edges if there are any
     if (igraph::ecount(network) > 0) {

From 4afe8b8c358d56c8f4b2dbefcb17a09af7e979f3 Mon Sep 17 00:00:00 2001
From: Thomas Bock
Date: Tue, 1 Dec 2020 12:58:39 +0100
Subject: [PATCH 28/37] Update changelog file again

Signed-off-by: Thomas Bock
---
 NEWS.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index c131db68..5ae28b67 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -14,10 +14,9 @@
 - Add R version 4.0 to test suite and adjust package installation in `install.R` to improve compatibility with Travis CI (40aa0d80e2a94434a8be75925dbefbde6d3518b2, 1ba036758a63767e2fcef525c98f5a4fd6938c39, #161)

 ### Fixed
-
-- Fix sliding-window creation in various splitting functions (`split.network.time.based`, `split.networks.time.based`, `split.data.time.based`, `split.data.activity.based`, `split.network.activity.based`) and also fix the computation of overlapping ranges in the function `construct.overlapping.ranges` to make sure that the last and the second-last range do not cover the same range (1abc1b8dbfc65ccad0cbbc8e33b209e39d2f8118, c34c42aef32a30b82adc53384fd6a1b09fc75dee, 097cebcc477b1b65056d512124575f5a78229c3e, 9a1b6516f490b72b821be2d5365d98cac1907b2f, 0fc179e2735bec37d26a68c6c351ab43770007d2, cad28bf221f942eb25e997aaa2de553181956680, PR #184)
+- Fix sliding-window creation in various splitting functions (`split.network.time.based`, `split.networks.time.based`, `split.data.time.based`, `split.data.activity.based`, `split.network.activity.based`) and also fix the computation of overlapping ranges in the function `construct.overlapping.ranges` to make sure that the last and the second-last range do not cover the same range (1abc1b8dbfc65ccad0cbbc8e33b209e39d2f8118, c34c42aef32a30b82adc53384fd6a1b09fc75dee, 097cebcc477b1b65056d512124575f5a78229c3e, 9a1b6516f490b72b821be2d5365d98cac1907b2f, 0fc179e2735bec37d26a68c6c351ab43770007d2, cad28bf221f942eb25e997aaa2de553181956680, 7602af2cf46f699b2285d53819dec614c71754c6, PR #184)
 - Fix off-by-1 error in the function `get.data.cut.to.same.date` (f0744c0e14543292cccb1aa9a61f822755ee7183)
 - Fix missing or wrongly set layout when plotting networks (#186, 720cc7ba7bdb635129c7669911aef8e7c6200a6b, 877931b94f87ca097c2f8f3c55e4b4bcc6087742)

 ## 3.6

From 712bbafde3fb8f7b7c0fc847cb9c1838eb4cf86e Mon Sep 17 00:00:00 2001
From: Christian Hechtl
Date: Tue, 1 Dec 2020 15:37:31 +0100
Subject: [PATCH 29/37] Change the code for reading PaStA data to match the new file format

The PaStA data is now called 'patch-groups'. On top of that, the format
of the file has changed to include all unmatched commits. This leads to
a lot of log output, which is why we delete the logwarn call.

Signed-off-by: Christian Hechtl
---
 .../results/testing/test_pasta/{mbox-result => patch-groups} | 0
 util-read.R                                                  | 5 ++---
 2 files changed, 2 insertions(+), 3 deletions(-)
 rename tests/codeface-data/results/testing/test_pasta/{mbox-result => patch-groups} (100%)

diff --git a/tests/codeface-data/results/testing/test_pasta/mbox-result b/tests/codeface-data/results/testing/test_pasta/patch-groups
similarity index 100%
rename from tests/codeface-data/results/testing/test_pasta/mbox-result
rename to tests/codeface-data/results/testing/test_pasta/patch-groups
diff --git a/util-read.R b/util-read.R
index 9d6b3c27..5039544b 100644
--- a/util-read.R
+++ b/util-read.R
@@ -430,7 +430,7 @@ PASTA.LIST.DATA.TYPES = c(
     "character", "character", "character"
 )

-#' Read and parse the PaStA data from the 'mbox-result' file.
+#' Read and parse the PaStA data from the 'patch-groups' file.
 #' The form in the file is: <message-id> <message-id2> ... => commit.hash commit.hash2 ....
 #' The parsed form is a data frame with message IDs as keys, commit hashes as values, and a revision set id.
 #' If the message ID does not get mapped to a commit hash, the value for the commit hash is \code{NA}.
@@ -444,7 +444,7 @@ read.pasta = function(data.path) {
     KEY.SEPERATOR = " "

     ## get file name of PaStA data
-    filepath = file.path(data.path, "mbox-result")
+    filepath = file.path(data.path, "patch-groups")

     ## read data from disk [can be empty]
     lines = suppressWarnings(try(readLines(filepath), silent = TRUE))
@@ -463,7 +463,6 @@ read.pasta = function(data.path) {
         }

         if (!grepl("<", line)) {
-            logging::logwarn("Faulty line: %s", line)
             return(NULL)
         }

From de42eb24be131c261ccad7d807007f27d5559d68 Mon Sep 17 00:00:00 2001
From: Christian Hechtl
Date: Tue, 1 Dec 2020 15:39:25 +0100
Subject: [PATCH 30/37] Fix bug with using 'do.call(c, ...)'

Calling 'do.call(c, ...)' leads to an error if there is any variable
called 'c' in the R environment. This is why we change it to 'base::c'.
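A minimal sketch of the problem (the variable below is hypothetical, not
part of this commit): `do.call` evaluates its first argument, so any
non-function binding named 'c' shadows the combine function.

    c = 5                            ## some unrelated variable named 'c'
    do.call(c, list(1, 2, 3))        ## error: 'what' must be a function or character string
    do.call(base::c, list(1, 2, 3))  ## 1 2 3, regardless of local bindings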
Signed-off-by: Christian Hechtl
---
 util-conf.R                | 2 +-
 util-networks-covariates.R | 2 +-
 util-split.R               | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/util-conf.R b/util-conf.R
index 0aecfc43..c70d202c 100644
--- a/util-conf.R
+++ b/util-conf.R
@@ -559,7 +559,7 @@ ProjectConf = R6::R6Class("ProjectConf", inherit = Conf,
             ## convert columns accordingly
             revisions.cols = c(revision = "as.character", date = "get.date.from.string")
             for (i in 1:ncol(revisions.df)) {
-                revisions.df[i] = do.call(c, lapply(revisions.df[[i]], revisions.cols[i]))
+                revisions.df[i] = do.call(base::c, lapply(revisions.df[[i]], revisions.cols[i]))
                 colnames(revisions.df)[i] = names(revisions.cols)[i]
             }
             revisions = revisions.df[["revision"]]
diff --git a/util-networks-covariates.R b/util-networks-covariates.R
index 4c4a945a..d7dacb79 100644
--- a/util-networks-covariates.R
+++ b/util-networks-covariates.R
@@ -456,7 +456,7 @@ add.vertex.attribute.first.activity = function(list.of.networks, project.data,
     ## list(authorA = list(all.activities = 1), authorB = list(all.activities = 3))
     if (combine.activity.types) {
         data = parallel::mclapply(data, function(item.list) {
-            min.value = min(do.call(c, item.list), na.rm = TRUE)
+            min.value = min(do.call(base::c, item.list), na.rm = TRUE)
             return(list(all.activities = min.value))
         })
     }
diff --git a/util-split.R b/util-split.R
index 778d6501..bf441a02 100644
--- a/util-split.R
+++ b/util-split.R
@@ -1060,7 +1060,7 @@ split.get.bins.activity.based = function(df, id, activity.amount, remove.duplica
         ))
     })
     ## unlist bins
-    bins.date = do.call(c, bins.date)
+    bins.date = do.call(base::c, bins.date)

     ## convert to character strings
     bins.date.char = get.date.string(bins.date)

From 1797e0324c39ad7b88dc22a14391340f4d26aea8 Mon Sep 17 00:00:00 2001
From: Christian Hechtl
Date: Tue, 1 Dec 2020 15:42:52 +0100
Subject: [PATCH 31/37] Fix various bugs regarding the PaStA data

First, fix the copy-paste error where the existence of mail data was
checked when adding the PaStA data to the commit data.

Second, fix the duplication of revision.set.ids that happens when
merging the PaStA data to the commit or mail data.

Last, throw out PaStA items that contain message ids or commit hashes
that do not appear in the mail or commit data.
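A hedged usage sketch of the new cleanup step (the configuration names
are assumptions for illustration; the method itself is added in the
diff below):

    ## assuming 'proj.conf' is a configured ProjectConf, as in showcase.R
    project.data = ProjectData$new(project.conf = proj.conf)
    project.data$cleanup.pasta.data()  ## drops PaStA items unknown to the mail/commit data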
Signed-off-by: Christian Hechtl
---
 util-data.R | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/util-data.R b/util-data.R
index 6b18f8f9..660f6238 100644
--- a/util-data.R
+++ b/util-data.R
@@ -307,7 +307,7 @@ ProjectData = R6::R6Class("ProjectData",
             logging::logdebug("update.pasta.commit.data: starting.")

             ## return immediately if no commits available
-            if (!is.null(private$mails)) {
+            if (!is.null(private$commits)) {

                 ## remove previous PaStA data
                 private$commits["pasta"] = NULL
@@ -319,6 +319,11 @@ ProjectData = R6::R6Class("ProjectData",

                 ## sort by date again because 'merge' disturbs the order
                 private$commits = private$commits[order(private$commits[["date"]], decreasing = FALSE), ]
+
+                ## remove duplicated revision set ids
+                private$commits[["revision.set.id"]] = sapply(private$commits[["revision.set.id"]], function(rev.id) {
+                    return(unique(rev.id))
+                })
             }

             logging::logdebug("update.pasta.commit.data: finished.")
@@ -342,6 +347,11 @@ ProjectData = R6::R6Class("ProjectData",

                 ## sort by date again because 'merge' disturbs the order
                 private$mails = private$mails[order(private$mails[["date"]], decreasing = FALSE), ]
+
+                ## remove duplicated revision set ids
+                private$mails[["revision.set.id"]] = sapply(private$mails[["revision.set.id"]], function(rev.id) {
+                    return(unique(rev.id))
+                })
             }

             logging::logdebug("update.pasta.mail.data: finished.")
@@ -373,6 +383,8 @@ ProjectData = R6::R6Class("ProjectData",
                 private$update.pasta.commit.data()
             }

+            logging::logwarn("There might be PaStA data that does not appear in the mail or commit data.
+                             To clean this up you can call the function 'cleanup.pasta.data()'.")
             logging::logdebug("update.pasta.data: finished.")
         },

@@ -817,6 +829,31 @@ ProjectData = R6::R6Class("ProjectData",
             }
         },

+        #' Remove lines in the PaStA data that contain message ids or commit hashes
+        #' that don't appear in the commit or mail data.
+        cleanup.pasta.data = function() {
+            logging::loginfo("Cleaning up PaStA data")
+
+            ## remove message ids that don't appear in the mail data
+            if (!is.null(private$mails)) {
+                rev.id.contained = private$pasta[["revision.set.id"]] %in% private$mails[["revision.set.id"]]
+                private$pasta = private$pasta[rev.id.contained, ]
+            }
+
+            ## remove commit hashes that don't appear in the commit data
+            if (!is.null(private$commits)) {
+                pasta.commit.hashes = unlist(private$pasta[["commit.hash"]])
+                commit.hashes.contained = unlist(private$pasta[["commit.hash"]]) %in% private$commits[["hash"]]
+                commit.hashes.to.eliminate = pasta.commit.hashes[!commit.hashes.contained]
+                commit.hashes.to.eliminate = commit.hashes.to.eliminate[!is.na(commit.hashes.to.eliminate)]
+                rows.to.remove = unlist(private$pasta[["commit.hash"]]) %in% commit.hashes.to.eliminate
+                private$pasta = private$pasta[!rows.to.remove, ]
+            }
+
+            ## update pasta data again
+            private$update.pasta.data()
+        },
+
         #' Get the mail data.
         #' If it does not already exist call the read method.
         #' Call the setter function to set the data and add PaStA

From ee6a48c6239bfe31bd40cccc2dd64d24a09150be Mon Sep 17 00:00:00 2001
From: Thomas Bock
Date: Tue, 1 Dec 2020 14:07:50 +0100
Subject: [PATCH 32/37] Adjust outdated examples in showcase.R

As the edge attribute `edge.type.char` and the vertex attribute
`vertex.type.char` do not exist anymore, remove the outdated '.char'
suffixes. Then the corresponding plotting examples should work again.

Props to @clhunsen for pointing this out!
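A short, hedged example of the corrected usage (assuming 'network' is an
author network built via NetworkBuilder, as in showcase.R):

    p = plot.get.plot.for.network(network)          ## as defined in util-plot.R
    print(p + ggraph::facet_edges( ~ edge.type))    ## facet by the plain attribute name
    print(p + ggraph::facet_nodes( ~ vertex.type))  ## analogously for vertex types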
Signed-off-by: Thomas Bock Signed-off-by: Claus Hunsen --- showcase.R | 9 +++++---- util-plot.R | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/showcase.R b/showcase.R index 38bba0c5..e6a1d00c 100644 --- a/showcase.R +++ b/showcase.R @@ -11,11 +11,12 @@ ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## -## Copyright 2016-2018 by Claus Hunsen +## Copyright 2016-2018, 2020 by Claus Hunsen ## Copyright 2017 by Raphael Nömmer ## Copyright 2017 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock +## Copyright 2020 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter ## Copyright 2019 by Klara Schlueter ## Copyright 2020 by Anselm Fehnker @@ -331,10 +332,10 @@ y = NetworkBuilder$new(project.data = y.data, network.conf = net.conf) # panel.border = ggplot2::element_blank(), # legend.position = "none" # ) + -# ggraph::facet_edges( ~ edge.type.char) +# ggraph::facet_edges( ~ edge.type) # # ggraph::facet_edges( ~ weight) -# # ggraph::facet_nodes( ~ vertex.type.char) -# # ggraph::facet_graph(edge.type.char ~ vertex.type.char) +# # ggraph::facet_nodes( ~ vertex.type) +# # ggraph::facet_graph(edge.type ~ vertex.type) # print(p) # ## generate network plot from README file and save it to disk diff --git a/util-plot.R b/util-plot.R index fdb60232..ef978b23 100644 --- a/util-plot.R +++ b/util-plot.R @@ -11,7 +11,7 @@ ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## -## Copyright 2017-2018 by Claus Hunsen +## Copyright 2017-2018, 2020 by Claus Hunsen ## Copyright 2018 by Barbara Eckl ## Copyright 2018 by Thomas Bock ## Copyright 2020 by Thomas Bock From 9cd35a77d136d40da6ef061411cb84eebb2f56a4 Mon Sep 17 00:00:00 2001 From: Christian Hechtl Date: Tue, 1 Dec 2020 20:30:33 +0100 Subject: [PATCH 33/37] Add copyright headers Signed-off-by: Christian Hechtl --- util-conf.R | 1 + util-data.R | 1 + util-networks-covariates.R | 1 + util-read.R | 1 + util-split.R | 1 + 5 files changed, 5 insertions(+) diff --git a/util-conf.R b/util-conf.R index c70d202c..a9c61b35 100644 --- a/util-conf.R +++ b/util-conf.R @@ -15,6 +15,7 @@ ## Copyright 2016 by Wolfgang Mauerer ## Copyright 2017 by Raphael Nömmer ## Copyright 2017-2018 by Christian Hechtl +## Copyright 2020 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2019 by Thomas Bock ## Copyright 2018 by Barbara Eckl diff --git a/util-data.R b/util-data.R index 660f6238..c47048b0 100644 --- a/util-data.R +++ b/util-data.R @@ -16,6 +16,7 @@ ## Copyright 2020 by Thomas Bock ## Copyright 2017 by Raphael Nömmer ## Copyright 2017-2018 by Christian Hechtl +## Copyright 2020 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017 by Ferdinand Frank ## Copyright 2018-2019 by Jakob Kronawitter diff --git a/util-networks-covariates.R b/util-networks-covariates.R index d7dacb79..1aa00603 100644 --- a/util-networks-covariates.R +++ b/util-networks-covariates.R @@ -16,6 +16,7 @@ ## Copyright 2018-2019 by Thomas Bock ## Copyright 2018-2019 by Klara Schlüter ## Copyright 2018 by Jakob Kronawitter +## Copyright 2020 by Christian Hechtl ## All Rights Reserved. 
## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / diff --git a/util-read.R b/util-read.R index 5039544b..674cc90f 100644 --- a/util-read.R +++ b/util-read.R @@ -14,6 +14,7 @@ ## Copyright 2016-2019 by Claus Hunsen ## Copyright 2017 by Raphael Nömmer ## Copyright 2017-2018 by Christian Hechtl +## Copyright 2020 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter diff --git a/util-split.R b/util-split.R index bf441a02..aedc276b 100644 --- a/util-split.R +++ b/util-split.R @@ -15,6 +15,7 @@ ## Copyright 2017 by Sofie Kemper ## Copyright 2017 by Raphael Nömmer ## Copyright 2017-2018 by Christian Hechtl +## Copyright 2020 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock ## Copyright 2020 by Thomas Bock From 474ed761828c763c5f8772ac2f6f44c10724b51e Mon Sep 17 00:00:00 2001 From: Christian Hechtl Date: Wed, 2 Dec 2020 13:13:01 +0100 Subject: [PATCH 34/37] Update changelog file Signed-off-by: Christian Hechtl --- NEWS.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/NEWS.md b/NEWS.md index 5ae28b67..6fcbe109 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,6 +7,7 @@ - Add function `calculate.EDCPTD.centrality` for calculating the EDCPTD centrality for a fourth-order tensor in the above described form (c136b1f6127d73c25f08ae2f317246747aa9ea2b, e4ee0dc926b22ff75d5fd801c1f131bcff4c22eb, 051a5f0287022f97e2367ed0e9591b9df9dbdb3d) - Add new file `util-networks-misc.R` which contains miscellaneous functions for processing network data and creating and converting various kinds of adjacency matrices: `get.author.names.from.networks`, `get.author.names.from.data`, `get.expanded.adjacency`, `get.expanded.adjacency.matrices`, `get.expanded.adjacency.matrices.cumulated`, `convert.adjacency.matrix.list.to.array` (051a5f0287022f97e2367ed0e9591b9df9dbdb3d) - Add tests for sliding-window functionality and make parameterized tests possible (a3ad0a81015c7f23bce958d5c1922e3b82b28bda, 2ed84ac55d434f62341297b1aa9676c12e383491, PR #184) +- Add function `cleanup.pasta.data` to remove wrong commit hashes and message ids from the PaStA data (0d53142444ac06b605f3a92096697f3286b36103, PR #189) ### Changed/Improved - Adjust the function `get.authors.by.data.source`: Rename its single parameter to `data.sources` and change the function so that it can extract the authors for multiple data sources at once. 
The default value of the parameter is a vector containing all the available data sources (commits, mails, issues) (051a5f0287022f97e2367ed0e9591b9df9dbdb3d)
@@ -17,6 +18,10 @@
 - Fix sliding-window creation in various splitting functions (`split.network.time.based`, `split.networks.time.based`, `split.data.time.based`, `split.data.activity.based`, `split.network.activity.based`) and also fix the computation of overlapping ranges in the function `construct.overlapping.ranges` to make sure that the last and the second-last range do not cover the same range (1abc1b8dbfc65ccad0cbbc8e33b209e39d2f8118, c34c42aef32a30b82adc53384fd6a1b09fc75dee, 097cebcc477b1b65056d512124575f5a78229c3e, 9a1b6516f490b72b821be2d5365d98cac1907b2f, 0fc179e2735bec37d26a68c6c351ab43770007d2, cad28bf221f942eb25e997aaa2de553181956680, 7602af2cf46f699b2285d53819dec614c71754c6, PR #184)
 - Fix off-by-1 error in the function `get.data.cut.to.same.date` (f0744c0e14543292cccb1aa9a61f822755ee7183)
 - Fix missing or wrongly set layout when plotting networks (#186, 720cc7ba7bdb635129c7669911aef8e7c6200a6b, 877931b94f87ca097c2f8f3c55e4b4bcc6087742)
+- Fix reading of the PaStA data since the file format has changed (712bbafde3fb8f7b7c0fc847cb9c1838eb4cf86e, PR #189)
+- Fix bug that duplicates revision set ids in the mail and commit data when merging the PaStA data and also copy-paste error when merging PaStA data to commit data (1797e0324c39ad7b88dc22a14391340f4d26aea8, PR #189)
+- Fix bug that results in an error when there is a variable called 'c' in the R environment (de42eb24be131c261ccad7d807007f27d5559d68, PR #189)
+

 ## 3.6

From 82614754fb3d75b0e5856d1ef42ada737859ee37 Mon Sep 17 00:00:00 2001
From: Thomas Bock
Date: Wed, 2 Dec 2020 12:49:03 +0100
Subject: [PATCH 35/37] Fix filtering patchstack mails if there are no mails

If there are no mails (e.g., after splitting the data to a certain
range), the function `filter.patchstack.mails()` prints warnings and
sets the mail data to NULL, whereas the mail data should stay as
created by `create.empty.mails.list()`. When then accessing certain
columns of the mail data, this leads to an error.

To prevent this, don't perform the patchstack mail filtering if there
are no mails - just return the mail data as it was before.
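A minimal sketch of the guard pattern (the function and data below are
simplified assumptions, not the actual implementation in the diff):

    filter.mails.sketch = function(mails) {
        if (nrow(mails) == 0) {
            return(mails)  ## nothing to filter; keep the empty mail data as is
        }
        ## ... the actual patchstack filtering would happen here ...
        return(mails)
    }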
Signed-off-by: Thomas Bock Signed-off-by: Christian Hechtl --- util-data.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/util-data.R b/util-data.R index c47048b0..0531a204 100644 --- a/util-data.R +++ b/util-data.R @@ -162,6 +162,12 @@ ProjectData = R6::R6Class("ProjectData", filter.patchstack.mails = function() { logging::logdebug("filter.patchstack.mails: starting.") + ## return immediately if no mails are available + if (nrow(private$mails) == 0) { + private$mails.patchstacks = NULL + return(private$mails) + } + ## retrieve mails grouped by thread IDs thread.data = self$group.authors.by.data.column("mails", "thread") From cd40097f4bd403adf279a3b068c25582ce52da58 Mon Sep 17 00:00:00 2001 From: Christian Hechtl Date: Wed, 2 Dec 2020 14:03:00 +0100 Subject: [PATCH 36/37] Update changelog file again Signed-off-by: Christian Hechtl --- NEWS.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 6fcbe109..705a0fb0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,7 +7,7 @@ - Add function `calculate.EDCPTD.centrality` for calculating the EDCPTD centrality for a fourth-order tensor in the above described form (c136b1f6127d73c25f08ae2f317246747aa9ea2b, e4ee0dc926b22ff75d5fd801c1f131bcff4c22eb, 051a5f0287022f97e2367ed0e9591b9df9dbdb3d) - Add new file `util-networks-misc.R` which contains miscellaneous functions for processing network data and creating and converting various kinds of adjacency matrices: `get.author.names.from.networks`, `get.author.names.from.data`, `get.expanded.adjacency`, `get.expanded.adjacency.matrices`, `get.expanded.adjacency.matrices.cumulated`, `convert.adjacency.matrix.list.to.array` (051a5f0287022f97e2367ed0e9591b9df9dbdb3d) - Add tests for sliding-window functionality and make parameterized tests possible (a3ad0a81015c7f23bce958d5c1922e3b82b28bda, 2ed84ac55d434f62341297b1aa9676c12e383491, PR #184) -- Add function `cleanup.pasta.data` to remove wrong commit hashes and message ids from the PaStA data (0d53142444ac06b605f3a92096697f3286b36103, PR #189) +- Add function `cleanup.pasta.data` to remove wrong commit hashes and message ids from the PaStA data (1797e0324c39ad7b88dc22a14391340f4d26aea8, PR #189) ### Changed/Improved - Adjust the function `get.authors.by.data.source`: Rename its single parameter to `data.sources` and change the function so that it can extract the authors for multiple data sources at once. 
The default value of the parameter is a vector containing all the available data sources (commits, mails, issues) (051a5f0287022f97e2367ed0e9591b9df9dbdb3d)
@@ -19,8 +19,9 @@
 - Fix off-by-1 error in the function `get.data.cut.to.same.date` (f0744c0e14543292cccb1aa9a61f822755ee7183)
 - Fix missing or wrongly set layout when plotting networks (#186, 720cc7ba7bdb635129c7669911aef8e7c6200a6b, 877931b94f87ca097c2f8f3c55e4b4bcc6087742)
 - Fix reading of the PaStA data since the file format has changed (712bbafde3fb8f7b7c0fc847cb9c1838eb4cf86e, PR #189)
-- Fix bug that duplicates revision set ids in the mail and commit data when merging the PaStA data and also copy-paste error when merging PaStA data to commit data (0d53142444ac06b605f3a92096697f3286b36103, PR #189)
+- Fix bug that duplicates revision set ids in the mail and commit data when merging the PaStA data and also copy-paste error when merging PaStA data to commit data (1797e0324c39ad7b88dc22a14391340f4d26aea8, PR #189)
 - Fix bug that results in an error when there is a variable called 'c' in the R environment (de42eb24be131c261ccad7d807007f27d5559d68, PR #189)
+- Fix bug that when applying `filter.patchstack.mails()` to an environment with no mail data, the mail data gets set to `NULL` (82614754fb3d75b0e5856d1ef42ada737859ee37, PR #189)

From af4eaa63df57a04d4ee8f5687bda3a14504c05c7 Mon Sep 17 00:00:00 2001
From: Thomas Bock
Date: Wed, 2 Dec 2020 16:36:57 +0100
Subject: [PATCH 37/37] Version v3.7

Signed-off-by: Thomas Bock
---
 NEWS.md | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 705a0fb0..eb301587 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,6 +1,6 @@
 # coronet – Changelog

-## Unversioned
+## 3.7

 ### Added
 - Add a new file `util-tensor.R` containing the class `FourthOrderTensor` to create (author x relation x author x relation) tensors from a list of networks (with each network having a different relation) and its corresponding utility function `get.author.networks.for.multiple.relations` (PR #173, c136b1f6127d73c25f08ae2f317246747aa9ea2b, e4ee0dc926b22ff75d5fd801c1f131bcff4c22eb, 051a5f0287022f97e2367ed0e9591b9df9dbdb3d)
@@ -24,7 +24,6 @@
 - Fix bug that when applying `filter.patchstack.mails()` to an environment with no mail data, the mail data gets set to `NULL` (82614754fb3d75b0e5856d1ef42ada737859ee37, PR #189)
-

 ## 3.6

 ### Added
@@ -33,19 +32,16 @@
 - Add a new file `util-plot-evaluation.R` containing functions to plot commit edit types per author and project. (PR #171, d4af515f859ce16ffaa0963d6d3d4086bcbb7377, aa542a215f59bc3ed869cfefbc5a25fa050b1fc9, 0a0a5903e7c609dfe805a3471749eb2241efafe2)

 ### Changed/Improved
-
 - Add R version 3.6 to test suite (8b2a52d38475a59c55feb17bb54ed12b9252a937, #161)
 - Update `.travis.yml` to improve compatibility with Travis CI (41ce589b3b50fd581a10e6af33ac6b1bbea63bb8)

 ### Fixed
-
 - Ensure sorting of commit-count and LOC-count data.frames to fix tests with R 3.3 (33d63fd50c4b29d45a9ca586c383650f7d29efd5)

 ## 3.5

 ### Announcement
-
 - Rename project to `coronet` (#10, 929f8cec7b52adef1389ce1691b783c235eb815d, ac1ce80b9f5da812f90b5fed63f26dc8c812a4d6)
   * Be sure to update Git remotes and submodules to the new URL!