From 10696e4cf4ae92371917ed8ccaec2b0183da145c Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Thu, 19 Sep 2024 18:20:52 +0200 Subject: [PATCH 1/4] Ensure correct data type of NA timestamps Up until now, `get.data.cut.to.same.date(data.sources = c("issues", "mails", "commits"))` failed if the first data source was empty, but not if the second one was empty. The reason was that `NA` values introduced by empty data sources at the beginning of the data frame turned the data frame into a data frame of numeric objects instead of POSIXct objects. If there were already POSIXct objects in the data frame, this did not happen. To prevent the timestamps from being interpreted as numeric values, make sure that the `NA` values are always POSIXct objects. This fixes #269. Signed-off-by: Thomas Bock --- util-data.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util-data.R b/util-data.R index 90c01ca4..6edc899b 100644 --- a/util-data.R +++ b/util-data.R @@ -797,8 +797,8 @@ ProjectData = R6::R6Class("ProjectData", } ## NAs otherwise else { - source.date.min = NA - source.date.max = NA + source.date.min = as.POSIXct(NA) + source.date.max = as.POSIXct(NA) } ## remove old line if existing From 646c01a42ad8decfbc9040030e790e51cb65cffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Wed, 16 Oct 2024 16:33:33 +0200 Subject: [PATCH 2/4] Test 'get.data.cut.to.same.date' in presence of an empty data source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This test fails without the previous fix by Thomas Bock but does not fail when the fix is in place. This works towards fixing #269. 
Signed-off-by: Maximilian Löffler --- tests/test-data-cut.R | 61 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tests/test-data-cut.R b/tests/test-data-cut.R index d1f3ef2a..26b1bf08 100644 --- a/tests/test-data-cut.R +++ b/tests/test-data-cut.R @@ -18,6 +18,7 @@ ## Copyright 2018 by Thomas Bock ## Copyright 2020 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter +## Copyright 2024 by Maximilian Löffler ## All Rights Reserved. @@ -82,3 +83,63 @@ test_that("Cut commit and mail data to same date range.", { expect_identical(mail.data, mail.data.expected, info = "Cut mail data.") }) + +test_that("Cut data to same date range with one empty data source.", { + + ## configurations + + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + + ## in order to properly test whether the data types of timestamps originating from empty data are correct, + ## ensure that the first provided data source contains empty data. This is important as R usually uses the + ## first entry of a data frame to determine the data type of it. However, the data type of the project timestamps + ## should be properly set regardless of the empty data. 
+ proj.conf$update.value("issues.locked", TRUE) + data.sources = c("issues", "mails", "commits") + + ## construct objects + + x.data = ProjectData$new(proj.conf) + x.data$set.issues(NULL) + + commit.data.expected = data.frame(commit.id = sprintf("<commit-%s>", c(32712, 32713)), + date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:45")), + author.name = c("Björn", "Olaf"), + author.email = c("bjoern@example.org", "olaf@example.org"), + committer.date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-20 10:00:44")), + committer.name = c("Björn", "Björn"), + committer.email = c("bjoern@example.org", "bjoern@example.org"), + hash = c("72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "5a5ec9675e98187e1e92561e1888aa6f04faa338"), + changed.files = as.integer(c(1, 1)), + added.lines = as.integer(c(1, 1)), + deleted.lines = as.integer(c(1, 0)), + diff.size = as.integer(c(2, 1)), + file = c("test.c", "test.c"), + artifact = c("A", "A"), + artifact.type = c("Feature", "Feature"), + artifact.diff.size = as.integer(c(1, 1))) + + mail.data.expected = data.frame(author.name = c("Thomas", "Olaf"), + author.email = c("thomas@example.org", "olaf@example.org"), + message.id = c("<65a1sf31sagd684dfv31@mail.gmail.com>", "<9b06e8d20801220234h659c18a3g95c12ac38248c7e0@mail.gmail.com>"), + date = get.date.from.string(c("2016-07-12 16:04:40", "2016-07-12 16:05:37")), + date.offset = as.integer(c(100, 200)), + subject = c("Re: Fw: busybox 2 tab", "Re: Fw: busybox 10"), + thread = sprintf("<thread-%s>", c("13#9", "13#9")), + artifact.type = c("Mail", "Mail")) + + issue.data.expected = create.empty.issues.list() + + commit.data = x.data$get.data.cut.to.same.date(data.sources = data.sources)$get.commits.unfiltered() + rownames(commit.data) = 1:nrow(commit.data) + + mail.data = x.data$get.data.cut.to.same.date(data.sources = data.sources)$get.mails() + rownames(mail.data) = 1:nrow(mail.data) + + issue.data = x.data$get.data.cut.to.same.date(data.sources = data.sources)$get.issues() + + 
expect_identical(commit.data, commit.data.expected, info = "Cut Raw commit data.") + expect_identical(mail.data, mail.data.expected, info = "Cut mail data.") + expect_identical(issue.data, issue.data.expected, info = "Cut issue data (empty).") + +}) From 7c525b9c4f18dd1eb0e961d4fb47c34adffcb804 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Wed, 16 Oct 2024 16:52:08 +0200 Subject: [PATCH 3/4] Define 'PATCHSTACK.MAIL.DECAY.THRESHOLD' as Duration object MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- util-data.R | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/util-data.R b/util-data.R index 6edc899b..f137a928 100644 --- a/util-data.R +++ b/util-data.R @@ -25,7 +25,7 @@ ## Copyright 2021 by Johannes Hostert ## Copyright 2021 by Mirabdulla Yusifli ## Copyright 2022 by Jonathan Baumann -## Copyright 2022-2023 by Maximilian Löffler +## Copyright 2022-2024 by Maximilian Löffler ## Copyright 2024 by Leo Sendelbach ## All Rights Reserved. 
@@ -113,7 +113,7 @@ DATASOURCE.TO.ARTIFACT.COLUMN = list( ## the maximum time difference between subsequent mails of a patchstack -PATCHSTACK.MAIL.DECAY.THRESHOLD = "30 seconds" +PATCHSTACK.MAIL.DECAY.THRESHOLD = lubridate::as.duration("30 seconds") ## configuration parameters that do not reset the environment when changed CONF.PARAMETERS.NO.RESET.ENVIRONMENT = c("commit.messages", @@ -283,8 +283,7 @@ ProjectData = R6::R6Class("ProjectData", ## of 'PATCHSTACK.MAIL.DECAY.THRESHOLD' while (i < nrow(thread) && running) { if (thread[1, "author.name"] == thread[i + 1, "author.name"] && - thread[i + 1, "date"] - thread[i, "date"] <= - lubridate::as.duration(PATCHSTACK.MAIL.DECAY.THRESHOLD)) { + thread[i + 1, "date"] - thread[i, "date"] <= PATCHSTACK.MAIL.DECAY.THRESHOLD) { i = i + 1 } else { running = FALSE From 1d1fe7fc0d12c1a9a7c1c1d843a4cdb7e28d96a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Fri, 18 Oct 2024 19:14:03 +0200 Subject: [PATCH 4/4] Update 'NEWS.md' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index 3756263b..d4e457bc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -27,6 +27,7 @@ ### Fixed - Fix the creation of edgelists for issue-based artifact-networks by correctly iterating over the issue data (PR #264, 321d85043112971c04998249c14a0677a32c9004) +- Fix a bug in `extract.timestamps` that occurs when the first `data.source` contains empty data and that leads to a return value of type numeric which should be POSIXct (PR #270, 10696e4cf4ae92371917ed8ccaec2b0183da145c, 646c01a42ad8decfbc9040030e790e51cb65cffd) ## 4.4