Skip to content

Commit

Permalink
Merge pull request #38 from tscnlab/dev
Browse files Browse the repository at this point in the history
remove duplicates
  • Loading branch information
JZauner authored Oct 9, 2024
2 parents 15db999 + efaa098 commit 73b3621
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 2 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: LightLogR
Title: Process Data from Wearable Light Loggers and Optical Radiation Dosimeters
Version: 0.4.1
Version: 0.4.2
Authors@R: c(
person("Johannes", "Zauner",
email = "[email protected]", role = c("aut", "cre"),
Expand Down Expand Up @@ -32,6 +32,7 @@ Imports:
ggsci,
ggtext,
hms,
janitor,
lubridate,
magrittr,
pkgload,
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# LightLogR 0.4.2

* `import` functions will now give a warning message about identical observations in the provided data files, stop the import process, and return a tibble with the duplicate rows. Through the `remove_duplicates` parameter, the user can decide to automatically remove these duplicates during import. **Note: identical observations refer to identical rows when disregarding the filename.**

# LightLogR 0.4.1

* added support for OcuWEAR devices
Expand Down
2 changes: 1 addition & 1 deletion R/aaa.r
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Time <- mEDI <- Time.data <- Datetime <- timestamp <- tz <- Day.data <- `DATE/TIME` <- n <- Datetime.rounded <- id <- sleep.colname.string <- file.name <- Interval <- original.datapoints.fleeting <- MEDI <- State.Brown <- Reference <- Reference.check <- Id <- Start.date.shift <- data <- Shift <- `MELANOPIC EDI` <- State <- group <- End <- Start <- Quant.x <- Quant.y <- is.implicit <- group.indices <- Id2 <- gap.id <- start <- end <- path <- auto.id <- n_max <- manual.id <- silent <- Light <- Day <- N <- is_missing <- Hour <- .change <- dst_start <- .dst <- .dst2 <- dst_adjustment <- auto.plot <- group.1 <- group.2 <- group.indices2 <- cluster_start <- cluster_end <- row_idx <- is_cluster <- cluster_idx <- is_pulse <- pulse_idx <- light <- time <- level <- duration <- mean_duration <- onset <- midpoint <- offset <- mean_onset <- mean_midpoint <- mean_offset <- Date.data <- print_n <- NULL
Time <- mEDI <- Time.data <- Datetime <- timestamp <- tz <- Day.data <- `DATE/TIME` <- n <- Datetime.rounded <- id <- sleep.colname.string <- file.name <- Interval <- original.datapoints.fleeting <- MEDI <- State.Brown <- Reference <- Reference.check <- Id <- Start.date.shift <- data <- Shift <- `MELANOPIC EDI` <- State <- group <- End <- Start <- Quant.x <- Quant.y <- is.implicit <- group.indices <- Id2 <- gap.id <- start <- end <- path <- auto.id <- n_max <- manual.id <- silent <- Light <- Day <- N <- is_missing <- Hour <- .change <- dst_start <- .dst <- .dst2 <- dst_adjustment <- auto.plot <- group.1 <- group.2 <- group.indices2 <- cluster_start <- cluster_end <- row_idx <- is_cluster <- cluster_idx <- is_pulse <- pulse_idx <- light <- time <- level <- duration <- mean_duration <- onset <- midpoint <- offset <- mean_onset <- mean_midpoint <- mean_offset <- Date.data <- print_n <- remove_duplicates <- NULL

empty_function <- function() {
rsconnect::accountInfo()
Expand Down
18 changes: 18 additions & 0 deletions R/import_LL.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#' column. If the column is not present it will add this column and fill it
#' with the filename of the importfile (see param `auto.id`).
#' * `print_n` can be used if you want to see more rows from the observation intervals
#' * `remove_duplicates` can be used if identical observations are present within or across multiple files. The default is `FALSE`. The function keeps only unique observations (=rows) if set to `TRUE`. This is a convenience implementation of [dplyr::distinct()].
#'
#' @param ... Parameters that get handed down to the specific import functions
#' @param device From what device do you want to import? For a few devices,
Expand Down Expand Up @@ -282,6 +283,7 @@ imports <- function(device,
locale = readr::default_locale(),
silent = FALSE,
print_n = 10,
remove_duplicates = FALSE,
... =
),
#function expression
Expand Down Expand Up @@ -353,6 +355,22 @@ imports <- function(device,
)
}

#check for duplicate rows, i.e. rows that are identical in every column except
#the file name. The duplicate rows are collected only once and reused for both
#the count and the early return below, so the (potentially large) dataset is
#not scanned twice.
duplicate_rows <- suppressMessages(janitor::get_dupes(data, -file.name))
duplicates <- nrow(duplicate_rows)
orig_rows <- nrow(data)

#if there are duplicate rows and the user opted in via `remove_duplicates`,
#keep only the first occurrence of each and print how many rows were dropped.
#`&&` is used because both operands are scalars inside an `if` condition.
if(duplicates > 0 && remove_duplicates) {
  data <- data %>% dplyr::distinct(dplyr::pick(-file.name), .keep_all = TRUE)
  cat(paste0(format(orig_rows - nrow(data), big.mark = "'"), " duplicate rows were removed during import.\n"))
}

#if there are untreated duplicate rows, give a warning, stop the import and
#return the duplicate rows instead of the imported data
if(duplicates > 0 && !remove_duplicates) {
  messages <- paste0(format(duplicates, big.mark = "'"), " rows in your dataset(s) are identical to at least one other row. This causes problems during analysis. Please set `remove_duplicates = TRUE` during import. Import will be stopped now and a dataframe with the duplicate rows returned.\nIf you want to find out which entries are duplicates, use `{replace_with_data_object} %>% janitor::get_dupes(-file.name)` on your imported dataset.\n")
  warning(messages)
  return(duplicate_rows)
}

#if dst_adjustment is TRUE, adjust the datetime column
if(dst_adjustment) {
data <- data %>% dst_change_handler(filename.colname = file.name)
Expand Down

0 comments on commit 73b3621

Please sign in to comment.