
Commit

Merge branch 'dev_general' into main
JZauner authored Jun 18, 2024
2 parents 8e559c9 + b695dc5 commit c8282c4
Showing 22 changed files with 524 additions and 10 deletions.
Binary file removed .DS_Store
6 changes: 6 additions & 0 deletions NEWS.md
@@ -2,6 +2,12 @@

* `bright_dark_period()` now maintains the date when looping the data.

* Added articles on `Import & Cleaning`, `Metrics`, and `Visualizations` to the website.

* Added the option for more print rows of observation intervals during `import`.

* Added the option to set a length for the dataset starting from the end in `filter_Datetime()` and family.

# LightLogR 0.3.5

* Added the function `aggregate_Date()` to aggregate long datasets to one day per group.
2 changes: 1 addition & 1 deletion R/aaa.r
@@ -1,4 +1,4 @@
Time <- mEDI <- Time.data <- Datetime <- timestamp <- tz <- Day.data <- `DATE/TIME` <- n <- Datetime.rounded <- id <- sleep.colname.string <- file.name <- Interval <- original.datapoints.fleeting <- MEDI <- State.Brown <- Reference <- Reference.check <- Id <- Start.date.shift <- data <- Shift <- `MELANOPIC EDI` <- State <- group <- End <- Start <- Quant.x <- Quant.y <- is.implicit <- group.indices <- Id2 <- gap.id <- start <- end <- path <- auto.id <- n_max <- manual.id <- silent <- Light <- Day <- N <- is_missing <- Hour <- .change <- dst_start <- .dst <- .dst2 <- dst_adjustment <- auto.plot <- group.1 <- group.2 <- group.indices2 <- cluster_start <- cluster_end <- row_idx <- is_cluster <- cluster_idx <- is_pulse <- pulse_idx <- light <- time <- level <- duration <- mean_duration <- onset <- midpoint <- offset <- mean_onset <- mean_midpoint <- mean_offset <- Date.data <- NULL
Time <- mEDI <- Time.data <- Datetime <- timestamp <- tz <- Day.data <- `DATE/TIME` <- n <- Datetime.rounded <- id <- sleep.colname.string <- file.name <- Interval <- original.datapoints.fleeting <- MEDI <- State.Brown <- Reference <- Reference.check <- Id <- Start.date.shift <- data <- Shift <- `MELANOPIC EDI` <- State <- group <- End <- Start <- Quant.x <- Quant.y <- is.implicit <- group.indices <- Id2 <- gap.id <- start <- end <- path <- auto.id <- n_max <- manual.id <- silent <- Light <- Day <- N <- is_missing <- Hour <- .change <- dst_start <- .dst <- .dst2 <- dst_adjustment <- auto.plot <- group.1 <- group.2 <- group.indices2 <- cluster_start <- cluster_end <- row_idx <- is_cluster <- cluster_idx <- is_pulse <- pulse_idx <- light <- time <- level <- duration <- mean_duration <- onset <- midpoint <- offset <- mean_onset <- mean_midpoint <- mean_offset <- Date.data <- print_n <- NULL

empty_function <- function() {
rsconnect::accountInfo()
24 changes: 21 additions & 3 deletions R/filter_Datetime.R
@@ -37,6 +37,10 @@
#' is FALSE). This is useful, e.g., when the first observation in the dataset
#' is slightly after midnight. If TRUE, it will count the length from midnight
#' on to avoid empty days in plotting with [gg_day()].
#' @param length_from_start A `logical` indicating whether the `length` argument
#' should be applied to the start (default, TRUE) or the end of the data
#' (FALSE). Only relevant if neither the `start` nor the `end` arguments are
#' provided.
#' @param only_Id An expression of `ids` where the filtering should be applied
#' to. If `NULL` (the default), the filtering will be applied to all `ids`.
#' Based on this expression, the dataset will be split in two and only
@@ -93,6 +97,7 @@ filter_Datetime <- function(dataset,
start = NULL,
end = NULL,
length = NULL,
length_from_start = TRUE,
full.day = FALSE,
tz = NULL,
only_Id = NULL,
@@ -156,12 +161,22 @@
start <- dataset[[Datetime.colname.defused]] %>% min()
}

#calculate end time if length is given


#calculate end time if length is given & length_from_start is TRUE
if(is.null(end) & !is.null(length)) {
if(length_from_start) {
end <- switch (full.day %>% as.character(),
"TRUE" = lubridate::as_date(start, tz = tz),
"FALSE" = lubridate::as_datetime(start, tz = tz)
) + length
} else {
end <- dataset[[Datetime.colname.defused]] %>% max()
start <- switch (full.day %>% as.character(),
"TRUE" = lubridate::as_date(end, tz = tz),
"FALSE" = lubridate::as_datetime(end, tz = tz)
) - length
}
}

#calculate end time if NULL
@@ -233,7 +248,10 @@ filter_Date <- function(...,
#' to be quoted with [quote()] or [rlang::expr()].
#' @param filter_function The function to be used for filtering, either
#' `filter_Datetime` (the default) or `filter_Date`
#' @param ... Additional arguments passed to the filter function
#' @param ... Additional arguments passed to the filter function. If the
#' `length` argument is provided here instead of in the `arguments` list, it
#' has to be written as a string, e.g., `length = "1 day"`, instead of
#' `length = lubridate::days(1)`.
#'
#' @return A dataframe with the filtered data
#' @export
@@ -245,7 +263,7 @@ filter_Date <- function(...,
#' #compare the unfiltered dataset
#' sample.data.environment %>% gg_overview(Id.colname = Id)
#' #compare the filtered dataset
#' sample.data.environment %>%
#' sample.data.environment %>%
#' filter_Datetime_multiple(arguments = arguments, filter_Date) %>%
#' gg_overview(Id.colname = Id)
filter_Datetime_multiple <- function(dataset,
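As a rough sketch of the new behaviour added here (using the packaged `sample.data.environment` dataset from the examples above), the filtering window can now be anchored at the end of the data rather than at its start:

```r
library(LightLogR)

# keep only the final two days of data, counted backwards from the last
# observation instead of forwards from the first one
filter_Datetime(sample.data.environment,
                length = lubridate::days(2),
                length_from_start = FALSE)

# through filter_Datetime_multiple(), `length` is instead passed as a string,
# e.g. length = "2 days", length_from_start = FALSE
```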
5 changes: 4 additions & 1 deletion R/import_LL.R
@@ -43,6 +43,7 @@
#' If the `Id` column is already part of the `dataset` it will just use
#' this column. If the column is not present it will add this column and fill
#' it with the filename of the importfile (see param `auto.id`).
#' `print_n` can be used if you want to see more rows from the observation intervals.
#'
#' @param ... Parameters that get handed down to the specific import functions
#' @param device From what device do you want to import? For a few devices,
@@ -153,6 +154,7 @@ imports <- function(device,
auto.plot = TRUE,
locale = readr::default_locale(),
silent = FALSE,
print_n = 10,
... =
),
#function expression
@@ -237,7 +239,8 @@
Id.colname = Id, #the id column name
dst_adjustment = dst_adjustment, #whether there is a dst adjustment
filename = filename, #what the filename(s) is/are
na.count = na.count #how many NA values were dropped
na.count = na.count, #how many NA values were dropped
print_n = print_n #how many rows to print for observation intervals
)

#if autoplot is TRUE, make a plot
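A minimal sketch of the new `print_n` option (the file path below is hypothetical; any supported device works the same way):

```r
library(LightLogR)

# print up to 20 rows of the observation-interval summary during import
data <- import$ActLumus("data/201_actlumus_log.txt",
                        tz = "Europe/Berlin",
                        print_n = 20)
```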
6 changes: 4 additions & 2 deletions R/import_helper.r
@@ -6,7 +6,8 @@ import.info <- function(data,
dst_adjustment,
dst_info = TRUE,
filename,
na.count) {
na.count,
print_n = 10) {
#give info about the file
min.time <- min(data$Datetime)
max.time <- max(data$Datetime)
@@ -72,7 +73,8 @@ import.info <- function(data,
"Timespan: " , diff(c(min.time, max.time)) %>% format(digits = 2), "\n\n",
"Observation intervals: \n",
sep = "")
utils::capture.output(interval.time)[c(-1,-2,-4)] %>% cat(sep = "\n")
utils::capture.output(interval.time %>% print(n=print_n))[c(-1,-2,-4)] %>%
cat(sep = "\n")
}

#This internal helper function looks for the starting row of an import file based on a vector of column names in order.
Binary file removed inst/.DS_Store
Binary file removed inst/extdata/.DS_Store
Binary file removed man/.DS_Store
Binary file removed man/figures/.DS_Store
6 changes: 6 additions & 0 deletions man/filter_Datetime.Rd


6 changes: 4 additions & 2 deletions man/filter_Datetime_multiple.Rd


1 change: 1 addition & 0 deletions man/import_Dataset.Rd


Binary file removed vignettes/.DS_Store
Binary file removed vignettes/articles/.DS_Store
2 changes: 1 addition & 1 deletion vignettes/articles/Day.Rmd
@@ -1,5 +1,5 @@
---
title: "What's in a Day?"
title: "The whole game"
---

```{r, include = FALSE}
142 changes: 142 additions & 0 deletions vignettes/articles/Import.Rmd
@@ -0,0 +1,142 @@
---
title: "Import & cleaning"
---

```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
```

This article focuses on importing data from multiple files and participants, as well as on cleaning the data. We need the following packages:

```{r setup, message = FALSE}
library(LightLogR)
library(tidyverse)
library(gghighlight)
```

# Importing Data

The first step in every analysis is data import. We will work with data collected as part of the Master Thesis *Insights into real-world human light exposure: relating self-report with eye-level light logging* by Carolina Guidolin (2023). The data is stored in 17 text files in the *data/* folder. You can access the data yourself through the [LightLogR GitHub repository](https://github.com/tscnlab/LightLogR/tree/main/vignettes/articles/data).

```{r, files}
path <- "data"
files <- list.files(path, full.names = TRUE)
#show how many files are listed
length(files)
```

Next, we require the time zone of data collection. If you are uncertain which time zones are valid, use the `OlsonNames()` function. Our data was collected in the "Europe/Berlin" time zone.

```{r, tz}
#first six time zones from OlsonNames()
head(OlsonNames())
#our time zone
tz <- "Europe/Berlin"
```

Lastly, the participant Ids are stored in the file names. We will extract them and store them in a column called `Id`. The following code defines the pattern as a *regular expression*, which will extract the first three digits from the file name.

```{r, Id pattern}
pattern <- "^(\\d{3})"
```
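
To check what the pattern captures, we can test it on an example file name (the name below is made up, but it follows the same naming scheme as the real files):

```{r, pattern check}
#test the regular expression on a made-up file name
stringr::str_extract("201_actlumus_log.txt", pattern)
```

This returns `"201"`, i.e., the three-digit Id at the start of the file name.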

Now we can import the data. The data were collected with the ActLumus device by Condor Instruments, which is specified through the `import$ActLumus()` function.

```{r, import}
data <- import$ActLumus(files, tz = tz, auto.id = pattern, print_n=33)
```

# Data cleaning #1

Before we can dive into the analysis part, we need to make sure we have a clean dataset. The import summary shows us two problems with the data:

- Two files contain data that crosses a daylight saving time (DST) change. Because the ActLumus device does not adjust for DST, we need to correct for this.
- Multiple Ids have single data points at the beginning of the dataset, with gaps before actual data collection starts. These are test measurements to check the equipment, but they must be removed from the dataset.

Let us first deal with the DST change. LightLogR has a built-in way to correct for this during import. We will thus re-import the data, but make the import silent so as not to clutter the output.

```{r, dst change}
data <-
import$ActLumus(files, tz = tz, auto.id = pattern, dst_adjustment = TRUE,
auto.plot = FALSE, silent = TRUE)
```

The second problem requires the filtering of certain Ids. The `filter_Datetime_multiple()` function is ideal for this. We can provide a length (1 week), starting from the end of data collection and going backwards. The `arguments` parameter provides variable arguments to the filter function; they have to be provided in list form, and expressions have to be quoted through `quote()`. Fixed arguments, like `length` and `length_from_start`, are provided as named arguments and only have to be specified once, as they are the same for all Ids.

```{r, start shift}
data <-
data %>%
filter_Datetime_multiple(
arguments = list(
list(only_Id = quote(Id == 216)),
list(only_Id = quote(Id == 219)),
list(only_Id = quote(Id == 214)),
list(only_Id = quote(Id == 206))
), length = "1 week", length_from_start = FALSE)
```

Let's have a look at the data again with the `gg_overview()` function.

```{r, overview}
data %>% gg_overview()
```

This looks much better now. Also, because there is no longer a hint about gaps in the lower right corner, we can be sure that all gaps have been removed. The function `gap_finder()` shows us, however, that there are still irregularities in the data, and the function `count_difftime()` reveals where they are.

```{r, irregularities}
data %>% gap_finder()
data %>% count_difftime() %>% print(n=22)
```

This means we have to look at and take care of the irregularities for the Ids 215, 218, and 221.

# Data cleaning #2

Let us first visualize where the irregularities are. We can use `gg_days()` for that.

```{r}
#create two columns to show the irregularities and gaps for relevant ids
difftimes <-
data %>%
filter(Id %in% c(215, 218, 221)) %>%
mutate(difftime = difftime(lead(Datetime), Datetime, units = "secs"),
end = Datetime + seconds(difftime))
#visualize where those points are
difftimes %>%
gg_days(geom = "point",
x.axis.breaks = ~Datetime_breaks(.x, by = "2 days" )
) +
geom_rect(data = difftimes %>% filter(difftime !=10),
aes(xmin = Datetime, xmax = end, ymin = -Inf, ymax = Inf),
fill = "red", col = "red", linewidth = 0.2, alpha = 0.2) +
gghighlight(difftime != 10 | lag(difftime !=10))
```

All irregular data appear at the very beginning of the data collection. As we are interested in one whole week of data, we can similarly apply a one-week filter to these Ids and check whether that removes the irregular data points.

```{r}
data <-
data %>%
filter_Datetime_multiple(
arguments = list(
list(only_Id = quote(Id == 215)),
list(only_Id = quote(Id == 218)),
list(only_Id = quote(Id == 221))
), length = "1 week", length_from_start = FALSE)
data %>% gap_finder()
data %>% count_difftime() %>% print(n=17)
```

The data is now clean and we can proceed with the analysis. This dataset will be needed in other articles, so we will save it as an RDS file.

```{r}
# saveRDS(data, "cleaned_data/ll_data.rds")
```
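
In the follow-up articles, the cleaned dataset can then be restored with a matching `readRDS()` call (a sketch assuming the same relative path):

```{r}
# data <- readRDS("cleaned_data/ll_data.rds")
```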