diff --git a/tests/test-misc.R b/tests/test-misc.R index 19f58a99..0252c121 100644 --- a/tests/test-misc.R +++ b/tests/test-misc.R @@ -5,7 +5,310 @@ ## (c) Thomas Bock, 2017 ## bockthom@fim.uni-passau.de + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Date handling ----------------------------------------------------------- + +## +## Parse date from a string. +## + +test_that("Parse date from a string.", { + + ## full date: + date.string = "2018-02-22 10:44:27" + date.posixct = as.POSIXct(strptime(date.string, format = "%Y-%m-%d %H:%M:%S")) + ## 1) from string + expect_equal(get.date.from.string(date.string), date.posixct, info = "From string.") + ## 2) from POSIXct + expect_equal(get.date.from.string(date.posixct), date.posixct, info = "From POSIXct.") + + ## partial date: + ## 1) no seconds + date.string = "2018-02-22 10:22" + date.posixct = as.POSIXct(strptime(date.string, format = "%Y-%m-%d %H:%M")) + expect_equal(get.date.from.string(date.string), date.posixct, info = "Partial date (no seconds).") + ## 2) no seconds, no minutes + date.string = "2018-02-22 10" + date.posixct = as.POSIXct(strptime(date.string, format = "%Y-%m-%d %H")) + expect_equal(get.date.from.string(date.string), date.posixct, info = "Partial date (no seconds, no minutes).") + ## 3) no time + date.string = "2018-02-22" + date.posixct = as.POSIXct(strptime(date.string, format = "%Y-%m-%d")) + expect_equal(get.date.from.string(date.string), date.posixct, info = "Partial date (no time).") + +}) + +## +## Parse date from a UNIX timestamp. +## + +test_that("Parse date from a UNIX timestamp.", { + + date.numeric = 1519296267 + date.string = "2018-02-22 10:44:27" + date.posixct = get.date.from.string(date.string) + + expect_equal(get.date.from.unix.timestamp(date.numeric), date.posixct, info = "From UNIX timestamp.") + +}) + +## +## Format a POSIXct object. 
+## + +test_that("Format a POSIXct object.", { + + ## full date: + date.string = "2018-02-22 10:44:27" + date.posixct = get.date.from.string(date.string) + ## 1) from string + expect_identical(get.date.string(date.string), date.string, info = "From string.") + ## 2) from POSIXct + expect_identical(get.date.string(date.posixct), date.string, info = "From POSIXct.") + + ## partial date: + ## 1) no seconds + date.string = "2018-02-22 10:44" + date.string.formatted = "2018-02-22 10:44:00" + date.posixct = get.date.from.string(date.string) + expect_equal(get.date.string(date.posixct), date.string.formatted, info = "Partial date (no seconds).") + ## 2) no seconds, no minutes + date.string = "2018-02-22 10" + date.string.formatted = "2018-02-22 10:00:00" + date.posixct = get.date.from.string(date.string) + expect_equal(get.date.string(date.posixct), date.string.formatted, info = "Partial date (no seconds, no minutes).") + ## 3) no time + date.string = "2018-02-22" + date.string.formatted = "2018-02-22 00:00:00" + date.posixct = get.date.from.string(date.string) + expect_equal(get.date.string(date.posixct), date.string.formatted, info = "Partial date (no time).") + +}) + +## +## Generate a date sequence. +## + +test_that("Generate a date sequence.", { + + ## parameter configuration + time.period = "2 hours" + time.period.duration = lubridate::duration(time.period) + + start.date = "2018-02-22 00:00:00" + start.date.posixct = get.date.from.string(start.date) + + ## short last range (end date lies between two 2-hour steps): + end.date.short = "2018-02-22 06:05:01" + end.date.short.posixct = get.date.from.string(end.date.short) + ## 1) expected results + expected = get.date.from.string(c("2018-02-22 00:00:00", "2018-02-22 02:00:00", + "2018-02-22 04:00:00", "2018-02-22 06:00:00", + "2018-02-22 06:05:01")) + ## 2) From string. + result = generate.date.sequence(start.date, end.date.short, time.period) + expect_equal(result, expected, info = "Date sequence from strings.") + ## 3) From POSIXct. 
+ result = generate.date.sequence(start.date.posixct, end.date.short.posixct, time.period) + expect_equal(result, expected, info = "Date sequence from dates.") + ## 4) With lubridate::duration. + result = generate.date.sequence(start.date.posixct, end.date.short.posixct, time.period.duration) + expect_equal(result, expected, info = "Date sequence with lubridate::duration") + + ## precise last range (end date coincides with a 2-hour step): + end.date.precise = "2018-02-22 06:00:00" + end.date.precise.posixct = get.date.from.string(end.date.precise) + ## 1) expected results + expected = get.date.from.string(c("2018-02-22 00:00:00", "2018-02-22 02:00:00", + "2018-02-22 04:00:00", "2018-02-22 06:00:00")) + ## 2) From string. + result = generate.date.sequence(start.date, end.date.precise, time.period) + expect_equal(result, expected, info = "Date sequence from strings.") + ## 3) From POSIXct. + result = generate.date.sequence(start.date.posixct, end.date.precise.posixct, time.period) + expect_equal(result, expected, info = "Date sequence from dates.") + ## 4) With lubridate::duration. + result = generate.date.sequence(start.date.posixct, end.date.precise.posixct, time.period.duration) + expect_equal(result, expected, info = "Date sequence with lubridate::duration") + +}) + + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Range construction and handling ----------------------------------------- + +## +## Construct consecutive and overlapping ranges. 
+## + +test_that("Construct consecutive and overlapping ranges.", { + + start = ("2018-01-01 00:00:00") + start.date = get.date.from.string(start) + end = ("2018-01-01 06:05:00") + end.date = get.date.from.string(end) + end.including = end.date + 1 + + ## standard overlapping ranges: + ## 1) expected results + expected.formatted = c( + "2018-01-01 00:00:00-2018-01-01 02:00:00", + "2018-01-01 01:30:00-2018-01-01 03:30:00", + "2018-01-01 03:00:00-2018-01-01 05:00:00", + "2018-01-01 04:30:00-2018-01-01 06:05:01" + ) + expected.raw = lapply(expected.formatted, get.range.bounds) + names(expected.raw) = expected.formatted + ## 2) formatted + result.formatted = construct.overlapping.ranges(start, end, time.period = "2 hours", overlap = "30 minutes", raw = FALSE) + expect_identical(result.formatted, expected.formatted, info = "Standard overlapping ranges (formatted).") + ## 3) raw + result.raw = construct.overlapping.ranges(start, end, time.period = "2 hours", overlap = "30 minutes", raw = TRUE) + expect_equal(result.raw, expected.raw, info = "Standard overlapping ranges (raw).") + ## TODO use expect_identical here? why failing? 
+ + ## non-overlapping/consecutive ranges: + ## 1) expected results + expected.formatted = c( + "2018-01-01 00:00:00-2018-01-01 02:00:00", + "2018-01-01 02:00:00-2018-01-01 04:00:00", + "2018-01-01 04:00:00-2018-01-01 06:00:00", + "2018-01-01 06:00:00-2018-01-01 06:05:01" + ) + expected.raw = lapply(expected.formatted, get.range.bounds) + names(expected.raw) = expected.formatted + ## 2) formatted + result.formatted = construct.overlapping.ranges(start.date, end.date, time.period = "2 hours", overlap = 0, raw = FALSE) + expect_identical(result.formatted, expected.formatted, info = "Non-overlapping ranges (formatted).") + ## 3) raw + result.raw = construct.overlapping.ranges(start.date, end.date, time.period = "2 hours", overlap = 0, raw = TRUE) + expect_equal(result.raw, expected.raw, info = "Non-overlapping ranges (raw).") + ## TODO use expect_identical here? why failing? + ## 4) matching with consecutive ranges (raw result of construct.consecutive.ranges must equal the zero-overlap result) + result.raw = construct.consecutive.ranges(start.date, end.date, time.period = "2 hours", raw = TRUE) + expect_equal(result.raw, expected.raw, info = "Non-overlapping ranges (consecutive).") + + ## illegal overlap + expect_error( + construct.overlapping.ranges(start.date, end.date, time.period = "2 hours", overlap = "1 year", raw = FALSE), + info = "Error expected (illegal overlap)." + ) +}) + +## +## Construct cumulative ranges. 
+## + +test_that("Construct cumulative ranges.", { + + start = ("2018-01-01 00:00:00") + start.date = get.date.from.string(start) + end = ("2018-01-01 06:05:00") + end.date = get.date.from.string(end) + end.including = end.date + 1 + + ## cumulative ranges (every range starts at the overall start date): + ## 1) expected results + expected.formatted = c( + "2018-01-01 00:00:00-2018-01-01 02:00:00", + "2018-01-01 00:00:00-2018-01-01 04:00:00", + "2018-01-01 00:00:00-2018-01-01 06:00:00", + "2018-01-01 00:00:00-2018-01-01 06:05:01" + ) + expected.raw = lapply(expected.formatted, get.range.bounds) + names(expected.raw) = expected.formatted + ## 2) formatted + result.formatted = construct.cumulative.ranges(start, end, time.period = "2 hours", raw = FALSE) + expect_identical(result.formatted, expected.formatted, info = "Cumulative ranges (formatted).") + ## 3) raw + result.raw = construct.cumulative.ranges(start, end, time.period = "2 hours", raw = TRUE) + expect_equal(result.raw, expected.raw, info = "Cumulative ranges (raw).") + ## TODO use expect_identical here? why failing? +}) + +## +## Aggregate ranges. 
+## + +test_that("Aggregate ranges.", { + + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete") + + project.start = get.date.from.string("2017-12-01 00:00:00") + start = ("2018-01-01 00:00:00") + start.date = get.date.from.string(start) + end = ("2018-01-01 06:05:00") + end.date = get.date.from.string(end) + end.including = end.date + 1 + project.end = get.date.from.string("2019-04-04 00:00:00") + + ## construct ranges to aggregate (NOTE(review): 'ranges.raw' is built with raw = FALSE and is not used below) + ranges = construct.consecutive.ranges(start.date, end.date, time.period = "2 hours", raw = FALSE) + ranges.raw = construct.consecutive.ranges(start.date, end.date, time.period = "2 hours", raw = FALSE) + + ## get results + results = lapply(aggregation.level, function(level) + aggregate.ranges(ranges, project.start, project.end, level, raw = FALSE) ## FIXME raw = TRUE + ) + names(results) = aggregation.level + + ## expected results + expected = list( + range = c( + "2018-01-01 00:00:00-2018-01-01 02:00:00", + "2018-01-01 02:00:00-2018-01-01 04:00:00", + "2018-01-01 04:00:00-2018-01-01 06:00:00", + "2018-01-01 06:00:00-2018-01-01 06:05:01" + ), + cumulative = c( + "2018-01-01 00:00:00-2018-01-01 02:00:00", + "2018-01-01 00:00:00-2018-01-01 04:00:00", + "2018-01-01 00:00:00-2018-01-01 06:00:00", + "2018-01-01 00:00:00-2018-01-01 06:05:01" + ), + all.ranges = c( + "2018-01-01 00:00:00-2018-01-01 06:05:01", + "2018-01-01 00:00:00-2018-01-01 06:05:01", + "2018-01-01 00:00:00-2018-01-01 06:05:01", + "2018-01-01 00:00:00-2018-01-01 06:05:01" + ), + project.cumulative = c( + "2017-12-01 00:00:00-2018-01-01 02:00:00", + "2017-12-01 00:00:00-2018-01-01 04:00:00", + "2017-12-01 00:00:00-2018-01-01 06:00:00", + "2017-12-01 00:00:00-2018-01-01 06:05:01" + ), + project.all.ranges = c( + "2017-12-01 00:00:00-2018-01-01 06:05:01", + "2017-12-01 00:00:00-2018-01-01 06:05:01", + "2017-12-01 00:00:00-2018-01-01 06:05:01", + "2017-12-01 00:00:00-2018-01-01 06:05:01" + ), + complete = c( + 
"2017-12-01 00:00:00-2019-04-04 00:00:01", + "2017-12-01 00:00:00-2019-04-04 00:00:01", + "2017-12-01 00:00:00-2019-04-04 00:00:01", + "2017-12-01 00:00:00-2019-04-04 00:00:01" + ) + ) + + lapply(aggregation.level, function(aggregation.level) { + expected.ranges = expected[[aggregation.level]] + results.ranges = results[[aggregation.level]] + + expect_identical(results.ranges, expected.ranges) + }) +}) + +## +## Parse ranges. +## + test_that("Parse range", { + range.input = c("2012-07-10 15:58:00-2012-07-15 16:02:00", "2012-07-10-2012-07-15 16:02:00", "2012-07-10 15:58:00-2012-07-15", @@ -24,3 +327,4 @@ test_that("Parse range", { expect_equal(actual.output, expected.output, "Parsed Range") }) + diff --git a/tests/test-split.R b/tests/test-split.R index 9c613196..ae95112d 100644 --- a/tests/test-split.R +++ b/tests/test-split.R @@ -34,6 +34,13 @@ if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data") ## - net.conf$update.values(list(pasta = TRUE, synchronicity = TRUE)) +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Split data -------------------------------------------------------------- + +## * time-based ------------------------------------------------------------ + +## * * time period --------------------------------------------------------- + ## ## Tests for split.data.time.based(..., split.basis = 'commits') ## @@ -135,10 +142,10 @@ test_that("Split a data object time-based (split.basis == 'mails').", { ## check time ranges expected = c( - "2004-10-09 18:38:13-2007-10-09 18:38:13", - "2007-10-09 18:38:13-2010-10-09 18:38:13", - "2010-10-09 18:38:13-2013-10-09 18:38:13", - "2013-10-09 18:38:13-2016-07-12 16:05:38" + "2004-10-09 18:38:13-2007-10-10 12:38:13", + "2007-10-10 12:38:13-2010-10-10 06:38:13", + "2010-10-10 06:38:13-2013-10-10 00:38:13", + "2013-10-10 00:38:13-2016-07-12 16:05:38" ) result = proj.conf$get.value("ranges") expect_equal(result, expected, info = "Time ranges.") @@ -146,34 +153,34 @@ 
test_that("Split a data object time-based (split.basis == 'mails').", { ## check data for all ranges expected.data = list( commits = list( - "2004-10-09 18:38:13-2007-10-09 18:38:13" = data.frame(), - "2007-10-09 18:38:13-2010-10-09 18:38:13" = data.frame(), - "2010-10-09 18:38:13-2013-10-09 18:38:13" = data.frame(), - "2013-10-09 18:38:13-2016-07-12 16:05:38" = data$commits[1:4, ] + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data.frame(), + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data.frame(), + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data.frame(), + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$commits[1:4, ] ), mails = list( - "2004-10-09 18:38:13-2007-10-09 18:38:13" = data$mails[rownames(data$mails) %in% 1:2, ], - "2007-10-09 18:38:13-2010-10-09 18:38:13" = data$mails[rownames(data$mails) %in% 3:12, ], - "2010-10-09 18:38:13-2013-10-09 18:38:13" = data.frame(), - "2013-10-09 18:38:13-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 13:17, ] + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$mails[rownames(data$mails) %in% 1:2, ], + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$mails[rownames(data$mails) %in% 3:12, ], + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data.frame(), + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 13:17, ] ), issues = list( - "2004-10-09 18:38:13-2007-10-09 18:38:13" = data.frame(), - "2007-10-09 18:38:13-2010-10-09 18:38:13" = data.frame(), - "2010-10-09 18:38:13-2013-10-09 18:38:13" = data$issues[rownames(data$issues) %in% 1:6, ], - "2013-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(8:9, 18:21), ] + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data.frame(), + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data.frame(), + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$issues[rownames(data$issues) %in% 1:6, ], + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(8:9, 18:21), ] ), synchronicity = list( - 
"2004-10-09 18:38:13-2007-10-09 18:38:13" = data$synchronicity, - "2007-10-09 18:38:13-2010-10-09 18:38:13" = data$synchronicity, - "2010-10-09 18:38:13-2013-10-09 18:38:13" = data$synchronicity, - "2013-10-09 18:38:13-2016-07-12 16:05:38" = data$synchronicity + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$synchronicity, + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$synchronicity, + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$synchronicity, + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$synchronicity ), pasta = list( - "2004-10-09 18:38:13-2007-10-09 18:38:13" = data$pasta, - "2007-10-09 18:38:13-2010-10-09 18:38:13" = data$pasta, - "2010-10-09 18:38:13-2013-10-09 18:38:13" = data$pasta, - "2013-10-09 18:38:13-2016-07-12 16:05:38" = data$pasta + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$pasta, + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$pasta, + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$pasta, + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$pasta ) ) results.data = list( @@ -215,8 +222,8 @@ test_that("Split a data object time-based (split.basis == 'issues').", { ## check time ranges expected = c( - "2013-04-21 23:52:09-2015-04-21 23:52:09", - "2015-04-21 23:52:09-2017-04-21 23:52:09", + "2013-04-21 23:52:09-2015-04-22 11:52:09", + "2015-04-22 11:52:09-2017-04-21 23:52:09", "2017-04-21 23:52:09-2017-05-23 12:32:40" ) result = proj.conf$get.value("ranges") @@ -225,28 +232,28 @@ test_that("Split a data object time-based (split.basis == 'issues').", { ## check data for all ranges expected.data = list( commits = list( - "2013-04-21 23:52:09-2015-04-21 23:52:09" = data.frame(), - "2015-04-21 23:52:09-2017-04-21 23:52:09" = data$commits, + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data.frame(), + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$commits, "2017-04-21 23:52:09-2017-05-23 12:32:40" = data.frame() ), mails = list( - "2013-04-21 23:52:09-2015-04-21 23:52:09" = data.frame(), - "2015-04-21 23:52:09-2017-04-21 23:52:09" 
= data$mails[rownames(data$mails) %in% 14:17, ], + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data.frame(), + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$mails[rownames(data$mails) %in% 14:17, ], "2017-04-21 23:52:09-2017-05-23 12:32:40" = data.frame() ), issues = list( - "2013-04-21 23:52:09-2015-04-21 23:52:09" = data$issues[rownames(data$issues) %in% 1:6, ], - "2015-04-21 23:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% 7:33, ], + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$issues[rownames(data$issues) %in% 1:6, ], + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% 7:33, ], "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% 34:36, ] ), synchronicity = list( - "2013-04-21 23:52:09-2015-04-21 23:52:09" = data$synchronicity, - "2015-04-21 23:52:09-2017-04-21 23:52:09" = data$synchronicity, + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$synchronicity, + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$synchronicity, "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$synchronicity ), pasta = list( - "2013-04-21 23:52:09-2015-04-21 23:52:09" = data$pasta, - "2015-04-21 23:52:09-2017-04-21 23:52:09" = data$pasta, + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$pasta, + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$pasta, "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$pasta ) ) @@ -261,6 +268,7 @@ test_that("Split a data object time-based (split.basis == 'issues').", { }) +## * * bins ---------------------------------------------------------------- ## ## Tests for split.data.time.based(..., bins = ...) @@ -323,6 +331,118 @@ test_that("Split a data object time-based (bins == ... ).", { }) +## * * ranges -------------------------------------------------------------- + +## +## Test splitting data by network names. 
+## + +test_that("Test splitting data by networks", { + ## configuration and data objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("artifact.filter.base", FALSE) + net.conf = NetworkConf$new() + net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) + + ## construct project data + project.data = ProjectData$new(proj.conf) + + ## split data + mybins = get.date.from.string(c("2016-07-12 15:00:00", "2016-07-12 16:00:00", + "2016-07-12 16:05:00", "2016-10-05 09:00:00")) + input.data = split.data.time.based(project.data, bins = mybins) + input.data.network = lapply(input.data, function(d) NetworkBuilder$new(d, net.conf)$get.author.network()) + + ## split data by networks + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete") + results = lapply(aggregation.level, function(level) + split.data.by.networks(input.data.network, project.data, level) + ) + names(results) = aggregation.level + + ## construct expected ranges + expected.ranges = list( + range = c("2016-07-12 15:00:00-2016-07-12 16:00:00", + "2016-07-12 16:00:00-2016-07-12 16:05:00", + "2016-07-12 16:05:00-2016-10-05 09:00:00"), + cumulative = c("2016-07-12 15:00:00-2016-07-12 16:00:00", + "2016-07-12 15:00:00-2016-07-12 16:05:00", + "2016-07-12 15:00:00-2016-10-05 09:00:00"), + all.ranges = c("2016-07-12 15:00:00-2016-10-05 09:00:00", + "2016-07-12 15:00:00-2016-10-05 09:00:00", + "2016-07-12 15:00:00-2016-10-05 09:00:00"), + project.cumulative = c("2004-10-09 18:38:13-2016-07-12 16:00:00", + "2004-10-09 18:38:13-2016-07-12 16:05:00", + "2004-10-09 18:38:13-2016-10-05 09:00:00"), + project.all.ranges = c("2004-10-09 18:38:13-2016-10-05 09:00:00", + "2004-10-09 18:38:13-2016-10-05 09:00:00", + "2004-10-09 18:38:13-2016-10-05 09:00:00"), + complete = c("2004-10-09 18:38:13-2017-05-23 12:32:40", + "2004-10-09 18:38:13-2017-05-23 12:32:40", + "2004-10-09 
18:38:13-2017-05-23 12:32:40") + ) + + ## test the ranges + test.each.network = function(aggregation.level) { + result.data = results[[aggregation.level]] + expected.range.names = expected.ranges[[aggregation.level]] + + lapply(seq_along(result.data), function(i) { + result.entry = result.data[[i]] + + expect_true(igraph::identical_graphs(result.entry[["network"]], input.data.network[[i]])) + expect_equal(result.entry[["data"]]$get.range(), expected.range.names[[i]]) + }) + } + lapply(aggregation.level, test.each.network) +}) + +## +## Test splitting data by ranges. +## + +test_that("Test splitting data by ranges", { + ## configuration and data objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("artifact.filter.base", FALSE) + net.conf = NetworkConf$new() + net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) + + ## construct project data + project.data = ProjectData$new(proj.conf) + + ## split data twice: by bins (reference result) and by the ranges constructed from the same bins + my.bins = get.date.from.string(c("2016-07-12 15:00:00", "2016-07-12 16:00:00", + "2016-07-12 16:05:00", "2016-10-05 09:00:00")) + my.ranges = construct.ranges(my.bins, sliding.window = FALSE) + expected.results = split.data.time.based(project.data, bins = my.bins) + results = split.data.time.based.by.ranges(project.data, my.ranges) + + ## check time ranges + expect_equal(names(results), my.ranges, info = "Time ranges.") + + ## check data for all ranges + expected.data = list( + commits = lapply(expected.results, function(cf.data) cf.data$get.commits()), + mails = lapply(expected.results, function(cf.data) cf.data$get.mails()), + issues = lapply(expected.results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(expected.results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(expected.results, function(cf.data) cf.data$get.pasta()) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = 
lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges.") + +}) + +## * activity-based -------------------------------------------------------- ## ## Tests for split.data.activity.based(..., activity.type = 'commits') @@ -874,6 +994,13 @@ test_that("Split a data object activity-based (activity.type = 'issues').", { }) +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Split network ----------------------------------------------------------- + +## * time-based ------------------------------------------------------------ + +## * * time period --------------------------------------------------------- + ## ## Tests for split.network.time.based(..., time.period = ...) ## @@ -930,6 +1057,44 @@ test_that("Split a network time-based (time.period = ...).", { }) +## +## Tests for split.networks.time.based(..., time.period = ...) 
+## + +test_that("Split a list of networks time-based.", { + + ## time period + time.period = "2 years" + + ## configuration and data objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("artifact.filter.base", FALSE) + net.conf = NetworkConf$new() + net.conf$update.values(list(simplify = FALSE, author.directed = TRUE)) + project.data = ProjectData$new(proj.conf) + net.builder = NetworkBuilder$new(project.data, net.conf) + + ## obtain networks: + ## 1) co-change network + net.builder$update.network.conf(list(author.relation = "cochange")) + net.cochange = net.builder$get.author.network() + ## 2) mail network + net.builder$update.network.conf(list(author.relation = "mail")) + net.mail = net.builder$get.author.network() + + ## split networks + net.split = split.networks.time.based( + networks = list(net.cochange, net.mail), + time.period = time.period, + sliding.window = FALSE + ) + + ## check whether the splitting information (attributes) of the two split networks is identical + expect_identical(attributes(net.split[[1]]), attributes(net.split[[2]]), info = "Splitting information.") + +}) + +## * * bins ---------------------------------------------------------------- ## ## Tests for split.network.time.based(..., bins = ...) @@ -989,44 +1154,44 @@ test_that("Split a network time-based (bins = ...).", { }) +## * * ranges -------------------------------------------------------------------- ## -## Tests for split.networks.time.based(..., time.period = ...) +## Test splitting network by ranges. 
## -test_that("Split a list of networks time-based.", { +test_that("Test splitting network by ranges", { - ## time period - time.period = "2 years" + + ## bins + bins = c("2016-07-12 15:58:00", "2016-07-12 16:00:59", "2016-07-12 16:02:59", + "2016-07-12 16:04:59", "2016-07-12 17:21:43") + ranges = construct.ranges(bins, sliding.window = FALSE, raw = TRUE) ## configuration and data objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) proj.conf$update.value("artifact.filter.base", FALSE) net.conf = NetworkConf$new() - net.conf$update.values(list(simplify = FALSE, author.directed = TRUE)) + net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) project.data = ProjectData$new(proj.conf) net.builder = NetworkBuilder$new(project.data, net.conf) - ## obtain networks: - ## 1) co-change network - net.builder$update.network.conf(list(author.relation = "cochange")) - net.cochange = net.builder$get.author.network() - ## 2) mail network - net.builder$update.network.conf(list(author.relation = "mail")) - net.mail = net.builder$get.author.network() - - ## split networks - net.split = split.networks.time.based( - networks = list(net.cochange, net.mail), - time.period = time.period, - sliding.window = FALSE - ) + ## retrieve author network + author.net = net.builder$get.author.network() + expected.results = split.network.time.based(author.net, bins = bins) + results = split.network.time.based.by.ranges(author.net, ranges) - ## check whether the splitting information of the two split networks are identical - expect_identical(attributes(net.split[[1]]), attributes(net.split[[2]]), info = "Splitting information.") + ## check time ranges + expect_equal(names(results), names(ranges), info = "Time ranges.") + ## check data for all ranges + check.identical = mapply(results, expected.results, FUN = function(r, e) { + return(igraph::identical_graphs(r, e)) + }) + expect_true(all(check.identical), info = "Network equality (split by 
ranges).") }) +## * activity-based ------------------------------------------------------------ ## ## Tests for split.network.activity.based(...) @@ -1119,6 +1284,9 @@ test_that("Split a network activity-based (number.edges, number.windows).", { }) +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Split raw data (data and networks by bins) ------------------------------ + ## ## Tests for split.data.by.bins and split.network.by.bins ## @@ -1203,6 +1371,9 @@ test_that("Split network and data on low level (split.data.by.bins, split.networ }) +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Bin identification ------------------------------------------------------ + ## ## Tests for split.get.bins.time.based and split.get.bins.activity.based ## @@ -1296,6 +1467,9 @@ test_that("Get bins for network and data on low level (split.get.bins.time.based }) +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Consistency tests ------------------------------------------------------- + ## ## Tests for consistency of data and network time-based splitting ## @@ -1343,6 +1517,9 @@ test_that("Check consistency of data and network time-based splitting.", { }) +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Unification of range names ---------------------------------------------- + ## ## Tests for duplicate range names ## @@ -1462,69 +1639,3 @@ test_that("Check and correct duplicate range names during network activity-based expect_identical(result, expected, info = "Removal of duplicate ranges.") }) - - -## -## Test splitting data by network names. 
-## -test_that("Test splitting data by networks", { - ## configuration and data objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("artifact.filter.base", FALSE) - net.conf = NetworkConf$new() - net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) - - ## construct project data - project.data = ProjectData$new(proj.conf) - - ## split data - mybins = get.date.from.string(c("2016-07-12 15:00:00", "2016-07-12 16:00:00", - "2016-07-12 16:05:00", "2016-10-05 09:00:00")) - input.data = split.data.time.based(project.data, bins = mybins) - input.data.network = lapply(input.data, function(d) NetworkBuilder$new(d, net.conf)$get.author.network()) - - ## split data by networks - aggregation.level = c("range", "cumulative", "all.ranges", - "project.cumulative", "project.all.ranges", - "complete") - results = lapply(aggregation.level, function(level) - split.data.by.networks(input.data.network, project.data, level) - ) - names(results) = aggregation.level - - ## construct expected ranges - expected.ranges = list( - range = c("2016-07-12 15:00:00-2016-07-12 16:00:00", - "2016-07-12 16:00:00-2016-07-12 16:05:00", - "2016-07-12 16:05:00-2016-10-05 09:00:00"), - cumulative = c("2016-07-12 15:00:00-2016-07-12 16:00:00", - "2016-07-12 15:00:00-2016-07-12 16:05:00", - "2016-07-12 15:00:00-2016-10-05 09:00:00"), - all.ranges = c("2016-07-12 15:00:00-2016-10-05 09:00:00", - "2016-07-12 15:00:00-2016-10-05 09:00:00", - "2016-07-12 15:00:00-2016-10-05 09:00:00"), - project.cumulative = c("2004-10-09 18:38:13-2016-07-12 16:00:00", - "2004-10-09 18:38:13-2016-07-12 16:05:00", - "2004-10-09 18:38:13-2016-10-05 09:00:00"), - project.all.ranges = c("2004-10-09 18:38:13-2016-10-05 09:00:00", - "2004-10-09 18:38:13-2016-10-05 09:00:00", - "2004-10-09 18:38:13-2016-10-05 09:00:00"), - complete = c("2004-10-09 18:38:13-2017-05-23 12:32:39", - "2004-10-09 18:38:13-2017-05-23 12:32:39", - "2004-10-09 
18:38:13-2017-05-23 12:32:39") - ) - - ## test the ranges - test.each.network = function(aggregation.level) { - result.data = results[[aggregation.level]] - expected.range.names = expected.ranges[[aggregation.level]] - - lapply(seq_along(result.data), function(i) { - result.entry = result.data[[i]] - - expect_true(igraph::identical_graphs(result.entry[["network"]], input.data.network[[i]])) - expect_equal(result.entry[["data"]]$get.range(), expected.range.names[[i]]) - }) - } - lapply(aggregation.level, test.each.network) -}) diff --git a/util-conf.R b/util-conf.R index 6fb70ef9..b39fe2b6 100644 --- a/util-conf.R +++ b/util-conf.R @@ -759,32 +759,6 @@ NetworkConf = R6::R6Class("NetworkConf", inherit = Conf, ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Helper functions -------------------------------------------------------- -#' Construct the range strings. -#' -#' @param revs the revisions -#' @param sliding.window whether sliding window splitting is enabled or not -#' default: 'FALSE' -#' -#' @return the ranges as strings -construct.ranges = function(revs, sliding.window = FALSE) { - ## setting offset to construct ranges, i.e., - ## combine each $offset revisions - offset = 1 - - ## with sliding window, we combine each second revision - if (sliding.window) - offset = 2 - - ## extract sequences of revisions - seq1 = revs[ 1:(length(revs) - offset) ] - seq2 = revs[ (offset + 1):length(revs) ] - - ## construct ranges - ranges = paste(seq1, seq2, sep = "-") - - return(ranges) -} - #' Constructs a string representing a configuration (i.e., a potentially nested list). 
#' #' @param conf the configuration list to represent as string diff --git a/util-misc.R b/util-misc.R index a204e930..1e00393b 100644 --- a/util-misc.R +++ b/util-misc.R @@ -117,7 +117,7 @@ get.date.from.string = function(input) { return(result) } -#' Convert unix timestamp to POSIXct +#' Convert UNIX timestamp to POSIXct #' #' @param timestmap The timestamp #' @@ -136,6 +136,12 @@ get.date.string = function(input) { ## re-usable function to parse date strings with lubridate convert.date.to.text = function(date) { + + ## if we do not have a POSIXct object here, do not convert + if (!lubridate::is.POSIXct(date)) { + return(date) + } + text = strftime(date, format = "%Y-%m-%d %H:%M:%S") return(text) } @@ -151,6 +157,411 @@ get.date.string = function(input) { return(result) } +#' Construct a date sequence on the given start time, end time, and time period between the +#' sequentially generated dates. +#' +#' Note: You may want to use the function \code{ProjectData$get.data.timestamps} with this +#' function here. 
#' Construct a date sequence based on the given start time, end time, and time period
#' between the sequentially generated dates.
#'
#' The sequence starts at \code{start.date} and is extended in steps of \code{by} as long as
#' the next step is strictly smaller than \code{end.date}. Finally, \code{end.date} itself is
#' appended, i.e., the last step of the sequence may be shorter than \code{by}.
#'
#' Note: You may want to use the function \code{ProjectData$get.data.timestamps} with this
#' function here.
#'
#' @param start.date The start time as string or POSIXct object
#' @param end.date The end time as string or POSIXct object
#' @param by The time period describing the length of time between dates, a character
#'           string, e.g., "3 mins" or "15 days"
#'
#' @return the sequential dates as a vector
generate.date.sequence = function(start.date, end.date, by) {

    ## convert dates
    start.date = get.date.from.string(start.date)
    end.date = get.date.from.string(end.date)

    ## convert time period to duration
    time.period = lubridate::duration(by)

    ## a non-positive time period never reaches the end date, i.e., the loop below would
    ## not terminate; fail loudly instead
    if (isTRUE(as.numeric(time.period) <= 0) && isTRUE(start.date < end.date)) {
        logging::logerror("The time period (%s) needs to be positive to generate a date sequence.",
                          as.character(by))
        stop("Stopping due to illegally specified time period for a date sequence.")
    }

    ## generate dates before end date:
    ## 1) initialize date sequence with first date
    dates = c(start.date)
    ## 2) compute the first candidate for the next date (computed only once per step)
    next.date = start.date + time.period
    ## 3) iterate while the candidate is smaller than the end date
    while (next.date < end.date) {
        ## add candidate to sequence
        dates = c(dates, next.date)
        ## advance to the next candidate
        next.date = next.date + time.period
    }
    ## 4) add end date to sequence
    dates = c(dates, end.date)

    return(dates)
}
#' Construct ranges from the given list/vector of revisions. If \code{raw} is
#' \code{FALSE} (the default), the ranges are constructed in the format "rev[n]-rev[n+1]".
#' Otherwise, pairs of range bounds are returned in a list.
#'
#' If there are too few revisions to form at least one range (i.e., fewer than two
#' revisions, or fewer than three revisions when \code{sliding.window} is enabled),
#' an empty result is returned.
#'
#' @param revs the revisions
#' @param sliding.window whether sliding window splitting is enabled or not
#'                       [default: FALSE]
#' @param raw whether to return pairs of POSIXct objects or strings rather than
#'            formatted strings [default: FALSE]
#'
#' @return the constructed ranges, either formatted or raw; the raw ranges are a named list,
#'         for which the formatted ranges are the names
construct.ranges = function(revs, sliding.window = FALSE, raw = FALSE) {
    ## setting offset to construct ranges, i.e.,
    ## combine each $offset revisions
    offset = 1

    ## with sliding window, we combine each second revision
    if (sliding.window) {
        offset = 2
    }

    ## without enough revisions, no range can be constructed; return early, as the
    ## index sequences below would count downwards (e.g., 1:0) and select wrong items
    number.revs = length(revs)
    if (number.revs <= offset) {
        if (raw) {
            return(list())
        }
        return(character(0))
    }

    ## extract sequences of revisions
    seq1 = revs[ seq_len(number.revs - offset) ]
    seq2 = revs[ seq(offset + 1, number.revs) ]

    ## construct ranges
    ranges = mapply(seq1, seq2, SIMPLIFY = FALSE, FUN = function(start, end) {
        start.string = get.date.string(start)
        end.string = get.date.string(end)
        range = paste(start.string, end.string, sep = "-")
        return(range)
    })
    ranges = unlist(ranges, use.names = FALSE)

    ## if raw is enabled, we need to compose seq1 and
    ## seq2 to appropriate tuples
    if (raw) {
        ## compose tuples of range start and range end
        ranges.raw = mapply(seq1, seq2, FUN = c, SIMPLIFY = FALSE)
        ## add formatted ranges as names
        names(ranges.raw) = ranges
        ## set as return value
        ranges = ranges.raw
    }

    return(ranges)
}
#' Build back-to-back ranges of length \code{time.period} between the given start time and
#' end time. No two ranges overlap: each range ends where the following one starts.
#'
#' With this function, it is possible to construct ranges like this:
#' > ++...
#' > ..++.
#' > ....+
#'
#' Important: The start of a range is inclusive and its end is exclusive. For this reason,
#' 1 second is added to \code{end}, so that the date \code{end} is *included* in the
#' last range.
#'
#' Note: You may want to use the function \code{ProjectData$get.data.timestamps} with this
#' function here.
#'
#' @param start The start time as string or POSIXct object
#' @param end The end time as string or POSIXct object; the last time to be *included* in the
#'            last range (see above)
#' @param time.period The time period describing the length of the ranges, a character
#'                    string, e.g., "3 mins" or "15 days"
#' @param raw whether to return pairs of POSIXct objects or strings rather than
#'            formatted strings [default: FALSE]
#'
#' @return the constructed ranges, either formatted or raw; the raw ranges are a named list,
#'         for which the formatted ranges are the names
construct.consecutive.ranges = function(start, end, time.period, raw = FALSE) {
    ## consecutive ranges are nothing else than overlapping ranges with an overlap of zero
    return(construct.overlapping.ranges(start, end, time.period, overlap = 0, raw = raw))
}
#' Construct ranges based on the given start time, end time, time period, and overlap.
#'
#' With this function, it is possible to construct ranges like this:
#' > ++++
#' > .++++
#' > ..++++
#'
#' With \code{overlap} being the half of \code{time.period}, we basically obtain half-
#' overlapping ranges as in the function \code{construct.ranges} when \code{sliding.window}
#' is set to \code{TRUE}.
#'
#' Important: As the start of each range is supposed to be inclusive and the end of each range
#' exclusive, 1 second is added to \code{end}. This way, the date \code{end} will be *included*
#' in the last range.
#'
#' Note: You may want to use the function \code{ProjectData$get.data.timestamps} with this
#' function here.
#'
#' @param start The start time as string or POSIXct object
#' @param end The end time as string or POSIXct object; the last time to be *included* in the
#'            last range (see above)
#' @param time.period The time period describing the length of the ranges, a character
#'                    string, e.g., "3 mins" or "15 days"
#' @param overlap The time period describing the length of the overlap, a character string
#'                (e.g., "3 mins" or "15 days") or a numeric indicating the fraction of
#'                \code{time.period} to use as overlap (e.g., 1/4). An overlap of 0 yields
#'                consecutive ranges. The overlap must be smaller than the given
#'                \code{time.period}; otherwise, an error is raised.
#' @param raw whether to return pairs of POSIXct objects or strings rather than
#'            formatted strings [default: FALSE]
#'
#' @return the constructed ranges, either formatted or raw; the raw ranges are a named list,
#'         for which the formatted ranges are the names
construct.overlapping.ranges = function(start, end, time.period, overlap, raw = FALSE) {

    ## convert given periods to lubridate stuff:
    ## 1) time period
    time.period = lubridate::duration(time.period)
    ## 2) overlap as character string or fraction of time.period
    if (is.character(overlap)) {
        overlap = lubridate::duration(overlap)
    } else {
        overlap = time.period * overlap
    }
    ## 3) the dates themselves
    start.date = get.date.from.string(start)
    end.date = get.date.from.string(end) + 1 ## add 1 for inclusion of end.date

    ## check the breaking case (an overlap equal to the time period would result in a
    ## step width of zero below)
    if (overlap >= time.period) {
        logging::logerror("The overlap (%s) is exceeding the given time period (%s).",
                          overlap, time.period)
        stop("Stopping due to illegally specified overlap for overlapping ranges.")
    }

    ## compute overall duration
    bins.duration = lubridate::as.duration(lubridate::interval(start.date, end.date))
    ## compute negative overlap, i.e., the distance between the starts of two subsequent ranges
    overlap.negative = time.period - overlap
    ## compute number of complete bins
    bins.number = round(bins.duration / overlap.negative)

    ## generate an approximate sequence of dates which can be streamlined later
    seq.start = start.date + overlap
    seq.end = seq.start + (bins.number) * overlap.negative
    ranges.approx = generate.date.sequence(seq.start, seq.end, by = overlap.negative)

    ## handle end date properly
    if (end.date > seq.end) {
        bins.number = bins.number + 1
        ranges.approx = c(ranges.approx, end.date)
    }

    ## construct the raw ranges from the approximate ones
    ranges.raw = lapply(seq_len(bins.number), function(bin.index) {
        ## combine start and end dates
        bin.start = ranges.approx[[bin.index]] - overlap
        bin.end = ranges.approx[[bin.index + 1]]

        ## check if we hit the end already
        if (bin.end > end.date) {
            bin.end = end.date
        }

        ## return the tuple of bin start and bin end
        return(c(bin.start, bin.end))
    })

    ## construct actual range strings (without names); each pair of bounds yields exactly
    ## one string, which is enforced here to always obtain a character vector
    ranges = vapply(ranges.raw, function(range.bounds) {
        return(construct.ranges(range.bounds, sliding.window = FALSE, raw = FALSE))
    }, FUN.VALUE = character(1), USE.NAMES = FALSE)

    ## if raw is enabled, we need to attach proper names
    if (raw) {
        ## add formatted ranges as names
        names(ranges.raw) = ranges
        ## set as return value
        ranges = ranges.raw
    }

    return(ranges)
}
#' Construct cumulative ranges based on the given start time, end time, and time period.
#' Each range starts at \code{start}; the first range lasts exactly \code{time.period}-long,
#' the second two times as long, etc.
#'
#' With this function, it is possible to construct ranges like this:
#' > +...
#' > ++..
#' > +++.
#' > ++++
#'
#' Important: As the start of each range is supposed to be inclusive and the end of each range
#' exclusive, 1 second is added to \code{end}. This way, the date \code{end} will be *included*
#' in the last range.
#'
#' Note: You may want to use the function \code{ProjectData$get.data.timestamps} with this
#' function here.
#'
#' @param start The start time as string or POSIXct object
#' @param end The end time as string or POSIXct object; the last time to be *included* in the
#'            last range (see above)
#' @param time.period The time period describing the length of the ranges, a character
#'                    string, e.g., "3 mins" or "15 days"
#' @param raw whether to return pairs of POSIXct objects or strings rather than
#'            formatted strings [default: FALSE]
#'
#' @return the constructed ranges, either formatted or raw; the raw ranges are a named list,
#'         for which the formatted ranges are the names
construct.cumulative.ranges = function(start, end, time.period, raw = FALSE) {

    ## convert the start date the same way as all other range functions do, so that
    ## (partial) date strings are not coerced implicitly when assigning them below
    start.date = get.date.from.string(start)

    ## get the consecutive ranges to alter them afterwards
    ranges.consecutive = construct.overlapping.ranges(start, end, time.period, overlap = 0, raw = TRUE)

    ## set the start of each range to global start date
    ranges.raw = lapply(ranges.consecutive, function(range.bounds) {
        ## start of each range is the global start date
        range.bounds[1] = start.date
        return(range.bounds)
    })

    ## construct actual range strings (without names); each pair of bounds yields exactly
    ## one string, which is enforced here to always obtain a character vector
    ranges = vapply(ranges.raw, function(range.bounds) {
        return(construct.ranges(range.bounds, sliding.window = FALSE, raw = FALSE))
    }, FUN.VALUE = character(1), USE.NAMES = FALSE)

    ## if raw is enabled, we need to attach proper names
    if (raw) {
        ## add formatted ranges as names
        names(ranges.raw) = ranges
        ## set as return value
        ranges = ranges.raw
    }

    return(ranges)
}
#' Aggregate a given list/vector of ranges to specific levels, configurable through
#' the parameter \code{aggregation.level} (see below for more details).
#'
#' Using different aggregation levels given by the parameter \code{aggregation.level},
#' it is possible to configure the exact treatment of range bounds and, thus, the
#' re-arrangement of the given list of ranges. The various aggregation levels work
#' as follows:
#' - \code{"range"}: The ranges will be kept exactly as given.
#' - \code{"cumulative"}: The ranges will be re-arranged in a cumulative manner.
#' - \code{"all.ranges"}: The ranges will be re-arranged to exactly the time range
#'                  specified by the start of the first range and end of the last
#'                  range. All ranges will be exactly the same.
#' - \code{"project.cumulative"}: The same re-arrangement as for \code{"cumulative"}, but
#'                  all ranges will start at \code{project.start} and *not* at the
#'                  beginning of the first range.
#' - \code{"project.all.ranges"}: The same re-arrangement as for \code{"all.ranges"}, but
#'                  all ranges will start at \code{project.start} and *not* at
#'                  the beginning of the first range. All ranges will be exactly the same.
#' - \code{"complete"}: The same re-arrangement as for \code{"all.ranges"}, but all ranges
#'                  will start at \code{project.start} and end at \code{project.end}. All
#'                  ranges will be exactly the same.
#'
#' Note: You may want to use the function \code{ProjectData$get.data.timestamps} with this
#' function here, to pass proper values for \code{project.start} and \code{project.end}.
#'
#' Important: As the start of each range is supposed to be inclusive and the end of each range
#' exclusive, 1 second is added to \code{project.end}. All other range bounds are supposed to
#' be correctly constructed upfront, but if \code{project.end} comes from the function
#' \code{ProjectData$get.data.timestamps}, this is not respected directly. This way, the date
#' \code{project.end} will be *included* in the last range for the aggregation level
#' \code{"complete"}.
#'
#' @param ranges the list or vector of ranges to aggregate
#' @param project.start the project start time as string or POSIXct object
#' @param project.end the project end time as string or POSIXct object
#' @param aggregation.level One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"},
#'                          \code{"project.cumulative"}, \code{"project.all.ranges"}, and
#'                          \code{"complete"}. See above for more details.
#' @param raw whether to return pairs of POSIXct objects or strings rather than
#'            formatted strings [default: FALSE]
#'
#' @return the constructed ranges, either formatted or raw; the raw ranges are a named list,
#'         for which the ranges from \code{ranges} are the names
aggregate.ranges = function(ranges, project.start, project.end,
                            aggregation.level = c("range", "cumulative", "all.ranges",
                                                  "project.cumulative", "project.all.ranges",
                                                  "complete"),
                            raw = FALSE) {

    ## get the chosen aggregation level
    aggregation.level = match.arg(aggregation.level)

    ## convert the project bounds (needed for some aggr. levels)
    project.start = get.date.from.string(project.start)
    project.end = get.date.from.string(project.end) + 1 ## add 1 for inclusion of project.end
                                                        ## with aggregation level "complete"

    ## compute the bounds of all ranges once; they serve both as the basis for the
    ## adjusted bounds of each range and as reference for the first and the last range
    list.of.range.bounds = lapply(ranges, get.range.bounds)
    number.of.ranges = length(list.of.range.bounds)

    ## loop over all range bounds and adjust them for the respective aggregation level
    ## (if nothing else is stated below, the respective range bounds stay unchanged)
    ranges.raw = lapply(list.of.range.bounds, function(start.end) {

        switch(aggregation.level,

            range = {
                ## use the exact range bounds
            },
            cumulative = {
                ## the start is always at the first range's start bound
                start.end[1] = list.of.range.bounds[[1]][1]
            },
            all.ranges = {
                ## the start is always at the first range's start bound
                start.end[1] = list.of.range.bounds[[1]][1]
                ## the end is always at the last range's ending bound
                start.end[2] = list.of.range.bounds[[number.of.ranges]][2]
            },
            project.cumulative = {
                ## the start is always at the project data's start
                start.end[1] = project.start
            },
            project.all.ranges = {
                ## the start is always at the project data's start
                start.end[1] = project.start
                ## the end is always at the last range's ending bound
                start.end[2] = list.of.range.bounds[[number.of.ranges]][2]
            },
            complete = {
                ## the start is always at the project data's start
                start.end[1] = project.start
                ## the end is always at the project data's ending
                start.end[2] = project.end
            }
        )

        return(start.end)
    })

    ## construct actual range strings (without names); each pair of bounds yields exactly
    ## one string, which is enforced here to always obtain a character vector
    ranges.new = vapply(ranges.raw, function(range.bounds) {
        return(construct.ranges(range.bounds, sliding.window = FALSE, raw = FALSE))
    }, FUN.VALUE = character(1), USE.NAMES = FALSE)

    ## if raw is enabled, we need to attach proper names
    if (raw) {
        ## add formatted original(!) ranges as names
        if (is.list(ranges)) {
            names(ranges.raw) = names(ranges)
        } else {
            names(ranges.raw) = ranges
        }
        ## set as return value
        ranges.new = ranges.raw
    }

    return(ranges.new)
}
#' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues' #' [default: commits] #' @param sliding.window logical indicating whether the splitting should be performed using a sliding-window approach @@ -360,14 +361,15 @@ split.data.activity.based = function(project.data, activity.type = c("commits", #' #' @param list.of.networks The network list #' @param project.data The entire project data -#' @param aggregation.level Determines the data to use for the attribute calculation. -#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' @param aggregation.level One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, #' \code{"project.cumulative"}, \code{"project.all.ranges"}, and #' \code{"complete"}. See above for more details. #' #' @return A list containing tuples with the keys "network" and "data", where, under "network", are #' the respective networks passed via \code{list.of.networks} and, under "data", are the #' split data instances of type \code{RangeData}. +#' +#' @seealso \code{aggregate.ranges} split.data.by.networks = function(list.of.networks, project.data, aggregation.level = c("range", "cumulative", "all.ranges", "project.cumulative", "project.all.ranges", @@ -378,66 +380,65 @@ split.data.by.networks = function(list.of.networks, project.data, ## get the timestamp data from the project data (needed for some aggr. 
levels) project.timestamps = project.data$get.data.timestamps(outermost = TRUE) - ## loop over all ranges and split the data for each range accordingly: + ## get the list of ranges list.of.ranges = names(list.of.networks) - list.of.range.bounds = lapply(list.of.ranges, get.range.bounds) - net.to.range.list = lapply(list.of.ranges, function(range) { - ## 1) get the range bounds to work with - start.end = get.range.bounds(range) - - ## 2) adjust the range bounds for the respective aggregation levels - ## (if nothing else is stated below, the respective range bounds stay unchanged) - switch(aggregation.level, - - range = { - ## use the exact range bounds - }, - cumulative = { - ## the start is always at the first network's start bound - start.end[1] = list.of.range.bounds[[1]][1] - }, - all.ranges = { - ## the start is always at the first network's start bound - start.end[1] =list.of.range.bounds[[1]][1] - ## the end is always at the last network's ending bound - start.end[2] = list.of.range.bounds[[length(list.of.ranges)]][2] - }, - project.cumulative = { - ## the start is always at the project data's start - start.end[1] = project.timestamps[["start"]] - }, - project.all.ranges = { - ## the start is always at the project data's start - start.end[1] = project.timestamps[["start"]] - ## the end is always at the last network's ending bound - start.end[2] = list.of.range.bounds[[length(list.of.ranges)]][2] - }, - complete = { - ## the start is always at the project data's start - start.end[1] = project.timestamps[["start"]] - ## the start is always at the project data's ending - start.end[2] = project.timestamps[["end"]] - } - ) - - ## 3) split the data to the ranges - range.data = split.data.time.based(project.data, bins = start.end, sliding.window = FALSE)[[1]] - - ## 4) construct return value - net.to.range.entry = list( - "network" = list.of.networks[[range]], - "data" = range.data - ) + ## aggregate ranges + ranges.bounds = aggregate.ranges( + list.of.ranges, 
#' Cut the given project data into one piece per given range and return all pieces as a list.
#'
#' Note: You may want to use any function \code{construct.*.ranges} to obtain
#' an appropriate sequence of ranges to pass to this function.
#'
#' @param project.data the \code{ProjectData} instance to be split
#' @param ranges the ranges to be used for splitting
#'
#' @return a list of \code{RangeData} instances, each representing one of the
#'         given ranges; the ranges are used as names for the list
split.data.time.based.by.ranges = function(project.data, ranges) {

    ## helper splitting the data to a single range; the range itself is only passed
    ## along so that the ranges end up as names of the resulting list
    split.to.single.range = function(range, start.end) {
        data.per.bin = split.data.time.based(project.data, bins = start.end, sliding.window = FALSE)
        ## the two bounds describe exactly one bin, so take the first entry
        return(data.per.bin[[1]])
    }

    ## determine the bounds of each range
    bounds.per.range = lapply(ranges, get.range.bounds)

    ## split the data range by range
    return(Map(split.to.single.range, ranges, bounds.per.range))
}
project.data, #' @param network the igraph network to split, needs to have an edge attribute named "date" #' @param time.period the time period describing the length of the ranges, a character string, #' e.g., "3 mins" or "15 days" -#' @param bins the date objects defining the start of ranges (the last date defines the end of the last range). -#' If set, the 'time.period' parameter is ignored. +#' @param bins the date objects defining the start of ranges (the last date defines the end of the last range, in an +#' *exclusive* manner). If set, the 'time.period' parameter is ignored. #' @param sliding.window logical indicating whether the splitting should be performed using a sliding-window approach #' [default: FALSE] #' @@ -703,6 +704,36 @@ split.network.activity.based = function(network, number.edges = 5000, number.win return(networks) } +#' Split the given network to the given ranges and return the resulting list. +#' +#' Note: You may want to use any function \code{construct.*.ranges} to obtain +#' an appropriate sequence of ranges to pass to this function. 
#' Cut the given network into one sub-network per given range and return all of them as a list.
#'
#' Note: You may want to use any function \code{construct.*.ranges} to obtain
#' an appropriate sequence of ranges to pass to this function.
#'
#' @param network the network to be split
#' @param ranges the ranges to be used for splitting
#'
#' @return a list of networks, each representing one of the given ranges; the
#'         ranges are used as names for the list
split.network.time.based.by.ranges = function(network, ranges) {

    ## helper splitting the network to a single range; the range itself is only passed
    ## along so that the ranges end up as names of the resulting list
    split.to.single.range = function(range, start.end) {
        nets.per.bin = split.network.time.based(network, bins = start.end, sliding.window = FALSE)
        ## the two bounds describe exactly one bin, so take the first entry
        return(nets.per.bin[[1]])
    }

    ## determine the bounds of each range
    bounds.per.range = lapply(ranges, get.range.bounds)

    ## split the network range by range
    return(Map(split.to.single.range, ranges, bounds.per.range))
}