From be4cd48c07b7c0ce41d85763cda8ce41754f3d97 Mon Sep 17 00:00:00 2001
From: Johannes Zauner <112665672+JZauner@users.noreply.github.com>
Date: Wed, 9 Oct 2024 13:51:24 +0200
Subject: [PATCH 1/2] Increment version number to 0.4.2

---
 DESCRIPTION   |  3 ++-
 NEWS.md       |  2 ++
 R/import_LL.R | 18 ++++++++++++++++++
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 37dbd85..e589d28 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: LightLogR
 Title: Process Data from Wearable Light Loggers and Optical Radiation Dosimeters
-Version: 0.4.1
+Version: 0.4.2
 Authors@R: c(
     person("Johannes", "Zauner", 
       email = "johannes.zauner@tum.de", role = c("aut", "cre"),
@@ -32,6 +32,7 @@ Imports:
     ggsci,
     ggtext,
     hms,
+    janitor,
     lubridate,
     magrittr,
     pkgload,
diff --git a/NEWS.md b/NEWS.md
index 76f5f95..79399b6 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,5 @@
+# LightLogR 0.4.2
+
 # LightLogR 0.4.1
 
 * added support for OcuWEAR devices
diff --git a/R/import_LL.R b/R/import_LL.R
index 790bf1b..dd9c18b 100644
--- a/R/import_LL.R
+++ b/R/import_LL.R
@@ -44,6 +44,7 @@
 #'   column. If the column is not present it will add this column and fill it
 #'   with the filename of the importfile (see param `auto.id`).
 #' * `print_n` can be used if you want to see more rows from the observation intervals
+#' * `remove_duplicates` can be used if identical observations are present within or across multiple files. The default is `FALSE`. The function keeps only unique observations (=rows) if set to `TRUE`. This is a convenience implementation of [dplyr::distinct()].
 #'
 #' @param ... Parameters that get handed down to the specific import functions
 #' @param device From what device do you want to import? For a few devices,
@@ -282,6 +283,7 @@ imports <- function(device,
       locale = readr::default_locale(),
       silent = FALSE,
       print_n = 10,
+      remove_duplicates = FALSE,
       ... =
     ),
     #function expression
@@ -353,6 +355,22 @@ imports <- function(device,
         )
       }
       
+      #if there are duplicate rows, remove them and print an info message
+      duplicates <- suppressMessages(janitor::get_dupes(data, -file.name) %>% nrow())
+      orig_rows <- data %>% nrow()
+      
+      if(duplicates > 0 & remove_duplicates) {
+        data <- data %>% dplyr::distinct(dplyr::pick(-file.name),.keep_all = TRUE)
+        cat(paste0(format(orig_rows - nrow(data), big.mark = "'"), " duplicate rows were removed during import.\n"))
+      }
+      
+      #if there are untreated duplicate rows, give a warning
+      if(duplicates > 0 & !remove_duplicates) {
+        messages <- paste0(format(duplicates, big.mark = "'"), " rows in your dataset(s) are identical to at least one other row. This causes problems during analysis. Please set `remove_duplicates = TRUE` during import. \nIf you still want to import the data as is and it failed with an error, try setting `auto.plot = FALSE`. You may want to do this to find out which entries are duplicates. Use `{replace_with_data_object} %>% janitor::get_dupes(-file.name) on your imported dataset.\n")
+        cat(messages)
+        warning(messages)
+      }
+      
       #if dst_adjustment is TRUE, adjust the datetime column
       if(dst_adjustment) {
         data <- data %>% dst_change_handler(filename.colname = file.name)

From efaa09832b4a0556bfb85dcfef37af3e397bb6de Mon Sep 17 00:00:00 2001
From: Johannes Zauner <112665672+JZauner@users.noreply.github.com>
Date: Wed, 9 Oct 2024 14:00:51 +0200
Subject: [PATCH 2/2] import update to deal with identical observations

---
 NEWS.md       | 2 ++
 R/aaa.r       | 2 +-
 R/import_LL.R | 4 ++--
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 79399b6..5fe2546 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,7 @@
 # LightLogR 0.4.2
 
+* `import` functions will now give a warning message about identical observations in the provided data files, stop the import process, and return a tibble with the duplicate rows. Through the `remove_duplicates` parameter, the user can decide to automatically remove these duplicates during import. **Note: "identical observations" refers to rows that are identical when the filename is disregarded.**
+
 # LightLogR 0.4.1
 
 * added support for OcuWEAR devices
diff --git a/R/aaa.r b/R/aaa.r
index 31a27f1..73dc65d 100644
--- a/R/aaa.r
+++ b/R/aaa.r
@@ -1,4 +1,4 @@
-Time <- mEDI <- Time.data <- Datetime <- timestamp <- tz <- Day.data <- `DATE/TIME` <- n <- Datetime.rounded <- id <- sleep.colname.string <- file.name <- Interval <- original.datapoints.fleeting <- MEDI <- State.Brown <- Reference <- Reference.check <- Id <- Start.date.shift <- data <- Shift <- `MELANOPIC EDI` <- State <- group <- End <- Start <- Quant.x <- Quant.y <- is.implicit <- group.indices <- Id2 <- gap.id <- start <- end <- path <- auto.id <- n_max <- manual.id <- silent <- Light <- Day <- N <- is_missing <- Hour <- .change <- dst_start <- .dst <- .dst2 <- dst_adjustment <- auto.plot <- group.1 <- group.2 <- group.indices2 <- cluster_start <- cluster_end <- row_idx <- is_cluster <- cluster_idx <- is_pulse <- pulse_idx <- light <- time <- level <- duration <- mean_duration <- onset <- midpoint <- offset <- mean_onset <- mean_midpoint <- mean_offset <-  Date.data <- print_n <-  NULL
+Time <- mEDI <- Time.data <- Datetime <- timestamp <- tz <- Day.data <- `DATE/TIME` <- n <- Datetime.rounded <- id <- sleep.colname.string <- file.name <- Interval <- original.datapoints.fleeting <- MEDI <- State.Brown <- Reference <- Reference.check <- Id <- Start.date.shift <- data <- Shift <- `MELANOPIC EDI` <- State <- group <- End <- Start <- Quant.x <- Quant.y <- is.implicit <- group.indices <- Id2 <- gap.id <- start <- end <- path <- auto.id <- n_max <- manual.id <- silent <- Light <- Day <- N <- is_missing <- Hour <- .change <- dst_start <- .dst <- .dst2 <- dst_adjustment <- auto.plot <- group.1 <- group.2 <- group.indices2 <- cluster_start <- cluster_end <- row_idx <- is_cluster <- cluster_idx <- is_pulse <- pulse_idx <- light <- time <- level <- duration <- mean_duration <- onset <- midpoint <- offset <- mean_onset <- mean_midpoint <- mean_offset <-  Date.data <- print_n <- remove_duplicates <-  NULL
 
 empty_function <- function() {
   rsconnect::accountInfo()
diff --git a/R/import_LL.R b/R/import_LL.R
index dd9c18b..a911d93 100644
--- a/R/import_LL.R
+++ b/R/import_LL.R
@@ -366,9 +366,9 @@ imports <- function(device,
       
       #if there are untreated duplicate rows, give a warning
       if(duplicates > 0 & !remove_duplicates) {
-        messages <- paste0(format(duplicates, big.mark = "'"), " rows in your dataset(s) are identical to at least one other row. This causes problems during analysis. Please set `remove_duplicates = TRUE` during import. \nIf you still want to import the data as is and it failed with an error, try setting `auto.plot = FALSE`. You may want to do this to find out which entries are duplicates. Use `{replace_with_data_object} %>% janitor::get_dupes(-file.name) on your imported dataset.\n")
-        cat(messages)
+        messages <- paste0(format(duplicates, big.mark = "'"), " rows in your dataset(s) are identical to at least one other row. This causes problems during analysis. Please set `remove_duplicates = TRUE` during import. Import will be stopped now and a dataframe with the duplicate rows returned \nIf you want to find out which entries are duplicates. Use `{replace_with_data_object} %>% janitor::get_dupes(-file.name) on your imported dataset.\n")
         warning(messages)
+        return(janitor::get_dupes(data, -file.name))
       }
       
       #if dst_adjustment is TRUE, adjust the datetime column