repo skeleton

Closes #2
nlsy-links · Jun 16, 2017 · 68950db · 68950db
1 parent 026e222
commit 68950db
Show file tree

Hide file tree

Showing 25 changed files with 10,224 additions and 0 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,28 @@
+# Auto detect text files and perform LF normalization
+* text=auto
+
+#So HTML, MD, & CSS files aren't considered as code when determining language of repo.
+# https://github.com/github/linguist#using-gitattributes
+*.html linguist-documentation
+*.md linguist-documentation
+*.css linguist-documentation
+
+# Custom for Visual Studio
+*.cs     diff=csharp
+*.sln    merge=union
+*.csproj merge=union
+*.vbproj merge=union
+*.fsproj merge=union
+*.dbproj merge=union
+
+# Standard to msysgit
+*.doc	 diff=astextplain
+*.DOC	 diff=astextplain
+*.docx diff=astextplain
+*.DOCX diff=astextplain
+*.dot  diff=astextplain
+*.DOT  diff=astextplain
+*.pdf  diff=astextplain
+*.PDF	 diff=astextplain
+*.rtf	 diff=astextplain
+*.RTF	 diff=astextplain
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,41 @@
+# ---- R and RStudio files -----------------------------------------------
+.Rhistory
+.Rapp.history
+.RData
+.Rproj.user
+.Rproj.user/
+
+# Output files from R CMD build
+/*.tar.gz
+
+# Output files from R CMD check
+/*.Rcheck/
+
+
+# ---- Protected Information -----------------------------------------------
+# see cubuspl42's answer in http://stackoverflow.com/questions/2415873/exceptions-in-gitignore
+/unshared-material/*
+/data-unshared/*
+
+# Dataset Exceptions
+!utility/package-dependency-list.csv
+
+# Keep the README files in the unshared directories.  They provide documentation, and also force the directories to be created on other machines.
+# Depending on how you do it, you might need to add w/ the command line (eg, git add -f ./data-unshared/raw/README.md)
+!README.md
+!data-unshared/contents.md
+!data-unshared/raw/README.md
+!data-unshared/derived/README.md
+
+
+# ---- Windows -----------------------------------------------
+# Cached small image files
+Thumbs.db
+
+# Folder config file
+Desktop.ini
+
+# ---- Visual Studio -----------------------------------------------
 ## Ignore Visual Studio temporary files, build results, and
 ## files generated by popular Visual Studio add-ons.
 ##

diff --git a/analysis/README.md b/analysis/README.md
@@ -0,0 +1,4 @@
+`./analysis/` Directory
+=========
+
+Files in this directory statistically analyze the data.  If multiple lines of analysis are needed, they should be contained in separate subfolders.
diff --git a/analysis/report-1/report-1.R b/analysis/report-1/report-1.R
@@ -0,0 +1,193 @@
+rm(list=ls(all=TRUE)) #Clear the memory of variables from previous run. This is not called by knitr, because it's above the first chunk.
+
+# ---- load-sources ------------------------------------------------------------
+#Load any source files that contain/define functions, but that don't load any other types of variables
+#   into memory.  Avoid side effects and don't pollute the global environment.
+# source("./SomethingSomething.R")
+
+# ---- load-packages -----------------------------------------------------------
+library(magrittr) #Pipes
+library(ggplot2) #For graphing
+requireNamespace("dplyr")
+# requireNamespace("tidyr") #For converting wide to long
+# requireNamespace("RColorBrewer")
+# requireNamespace("scales") #For formating values in graphs
+# requireNamespace("mgcv) #For the Generalized Additive Model that smooths the longitudinal graphs.
+# requireNamespace("TabularManifest") # devtools::install_github("Melinae/TabularManifest")
+
+# ---- declare-globals ---------------------------------------------------------
+options(show.signif.stars=F) #Turn off the annotations on p-values
+
+path_input <- "./data-public/derived/motor-trend-car-test.rds"
+
+# The two graphing functions are copied from https://github.com/Melinae/TabularManifest.
+histogram_discrete <- function(
+  d_observed,
+  variable_name,
+  levels_to_exclude   = character(0),
+  main_title          = variable_name,
+  x_title             = NULL,
+  y_title             = "Number of Included Records",
+  text_size_percentage= 6,
+  bin_width           = 1L,
+  font_base_size      = 12
+) {
+
+  # Ungroup, in case it comes in grouped.
+  d_observed <- d_observed %>%
+    dplyr::ungroup()
+
+  if( !base::is.factor(d_observed[[variable_name]]) )
+    d_observed[[variable_name]] <- base::factor(d_observed[[variable_name]])
+
+  d_observed$iv <- base::ordered(d_observed[[variable_name]], levels=rev(levels(d_observed[[variable_name]])))
+
+  d_count <- dplyr::count_(d_observed, vars ="iv" )
+  # if( base::length(levels_to_exclude)>0 ) { }
+  d_count <- d_count[!(d_count$iv %in% levels_to_exclude), ]
+
+  d_summary <- d_count %>%
+    dplyr::rename_(
+      "count"    =  "n"
+    ) %>%
+    dplyr::mutate(
+      proportion = count / sum(count)
+    )
+  d_summary$percentage <- base::paste0(base::round(d_summary$proportion*100), "%")
+
+  y_title <- base::paste0(y_title, " (n=", scales::comma(base::sum(d_summary$count)), ")")
+
+  g <- ggplot(d_summary, aes_string(x="iv", y="count", fill="iv", label="percentage")) +
+    geom_bar(stat="identity") +
+    geom_text(stat="identity", size=text_size_percentage, hjust=.8) +
+    scale_y_continuous(labels=scales::comma_format()) +
+    labs(title=main_title, x=x_title, y=y_title) +
+    coord_flip()
+
+  theme  <- theme_light(base_size=font_base_size) +
+    theme(legend.position       =  "none") +
+    theme(panel.grid.major.y    =  element_blank()) +
+    theme(panel.grid.minor.y    =  element_blank()) +
+    theme(axis.text.y           =  element_text(size=font_base_size + 2L)) +
+    theme(axis.text.x           =  element_text(colour="gray40")) +
+    theme(axis.title.x          =  element_text(colour="gray40")) +
+    theme(panel.border          =  element_rect(colour="gray80")) +
+    theme(axis.ticks            =  element_blank())
+
+  return( g + theme )
+}
+histogram_continuous <- function(
+  d_observed,
+  variable_name,
+  bin_width               = NULL,
+  main_title              = base::gsub("_", " ", variable_name, perl=TRUE),
+  x_title                 = paste0(variable_name, "\n(each bin is ", scales::comma(bin_width), " units wide)"),
+  y_title                 = "Frequency",
+  rounded_digits          = 0L,
+  font_base_size          = 12
+) {
+
+  if( !inherits(d_observed, "data.frame") )
+    stop("`d_observed` should inherit from the data.frame class.")
+
+  d_observed <- d_observed[!base::is.na(d_observed[[variable_name]]), ]
+
+  ds_mid_points <- base::data.frame(label=c("italic(X)[50]", "bar(italic(X))"), stringsAsFactors=FALSE)
+  ds_mid_points$value <- c(stats::median(d_observed[[variable_name]]), base::mean(d_observed[[variable_name]]))
+  ds_mid_points$value_rounded <- base::round(ds_mid_points$value, rounded_digits)
+
+  if( ds_mid_points$value[1] < ds_mid_points$value[2] ) {
+    h_just <- c(1, 0)
+  } else {
+    h_just <- c(0, 1)
+  }
+
+  g <- ggplot2::ggplot(d_observed, ggplot2::aes_string(x=variable_name))
+  g <- g + ggplot2::geom_histogram(binwidth=bin_width, position=ggplot2::position_identity(), fill="gray70", color="gray90", alpha=.7)
+  g <- g + ggplot2::geom_vline(xintercept=ds_mid_points$value, color="gray30")
+  g <- g + ggplot2::geom_text(data=ds_mid_points, ggplot2::aes_string(x="value", y=0, label="value_rounded"), color="tomato", hjust=h_just, vjust=.5)
+  g <- g + ggplot2::scale_x_continuous(labels=scales::comma_format())
+  g <- g + ggplot2::scale_y_continuous(labels=scales::comma_format())
+  g <- g + ggplot2::labs(title=main_title, x=x_title, y=y_title)
+
+  g <- g + ggplot2::theme_light(base_size = font_base_size) +
+    ggplot2::theme(axis.ticks             = ggplot2::element_blank())
+
+  ds_mid_points$top <- stats::quantile(ggplot2::ggplot_build(g)$layout$panel_ranges[[1]]$y.range, .8)
+  g <- g + ggplot2::geom_text(data=ds_mid_points, ggplot2::aes_string(x="value", y="top", label="label"), color="tomato", hjust=h_just, parse=TRUE)
+  return( g )
+}
+
+# ---- load-data ---------------------------------------------------------------
+ds <- readr::read_rds(path_input) # 'ds' stands for 'datasets'
+
+# ---- tweak-data --------------------------------------------------------------
+# ds <- ds %>%
+#   tidyr::drop_na(infant_weight_for_gestational_age_category) %>%
+#   dplyr::mutate(
+#     clinic = droplevels(clinic)
+#   ) %>%
+#   dplyr::filter(
+#     !ds$premature_infant
+#   ) %>%
+#   dplyr::select(
+#     -premature_infant
+#   )
+
+# ---- marginals ---------------------------------------------------------------
+# Inspect continuous variables
+histogram_continuous(d_observed=ds, variable_name="quarter_mile_in_seconds", bin_width=.5, rounded_digits=1)
+histogram_continuous(d_observed=ds, variable_name="displacement_inches_cubed", bin_width=50, rounded_digits=1)
+
+# Inspect discrete/categorical variables
+histogram_discrete(d_observed=ds, variable_name="carburetor_count_f")
+histogram_discrete(d_observed=ds, variable_name="forward_gear_count_f")
+
+# This helps start the code for graphing each variable.
+#   - Make sure you change it to `histogram_continuous()` for the appropriate variables.
+#   - Make sure the graph doesn't reveal PHI.
+#   - Don't graph the IDs (or other uinque values) of large datasets.  The graph will be worth and could take a long time on large datasets.
+# for(column in colnames(ds)) {
+#   cat('TabularManifest::histogram_discrete(ds, variable_name="', column,'")\n', sep="")
+# }
+
+# ---- scatterplots ------------------------------------------------------------
+g1 <- ggplot(ds, aes(x=gross_horsepower, y=quarter_mile_in_seconds, color=forward_gear_count_f)) +
+  geom_smooth(method="loess", span=2) +
+  geom_point(shape=1) +
+  theme_light() +
+  theme(axis.ticks = element_blank())
+g1
+
+g1 %+% aes(color=cylinder_count)
+g1 %+% aes(color=factor(cylinder_count))
+
+# ---- models ------------------------------------------------------------------
+cat("============= Simple model that's just an intercept. =============")
+m0 <- lm(quarter_mile_in_seconds ~ 1, data=ds)
+summary(m0)
+
+cat("============= Model includes one predictor. =============")
+m1 <- lm(quarter_mile_in_seconds ~ 1 + miles_per_gallon, data=ds)
+summary(m1)
+
+cat("The one predictor is significantly tighter.")
+anova(m0, m1)
+
+cat("============= Model includes two predictors. =============")
+m2 <- lm(quarter_mile_in_seconds ~ 1 + miles_per_gallon + forward_gear_count_f, data=ds)
+summary(m2)
+
+cat("The two predictor is significantly tighter.")
+anova(m1, m2)
+
+# ---- model-results-table  -----------------------------------------------
+
+summary(m2)$coef %>%
+  knitr::kable(
+    digits      = 2,
+    format      = "markdown"
+  )
+
+# Uncomment the next line for a dynamic, JavaScript [DataTables](https://datatables.net/) table.
+# DT::datatable(round(summary(m2)$coef, digits = 2), options = list(pageLength = 2))
diff --git a/analysis/report-1/report-1.Rmd b/analysis/report-1/report-1.Rmd
@@ -0,0 +1,123 @@
+---
+title: Skeleton Report 1
+date: "Date: `r Sys.Date()`"
+output:
+  html_document:
+    keep_md: yes
+    toc: 4
+    toc_float: true
+    number_sections: true
+---
+
+This report covers the analyses used in the ZZZ project (Marcus Mark, PI).
+
+<!--  Set the working directory to the repository's base directory; this assumes the report is nested inside of two directories.-->
+```{r, echo=F, message=F} 
+# cat("Working directory: ", getwd())
+library(knitr)
+opts_knit$set(root.dir='../../')  #Don't combine this call with any other chunk -especially one that uses file paths.
+```
+
+<!-- Set the report-wide options, and point to the external code file. -->
+```{r set-options, echo=F}
+# cat("Working directory: ", getwd())
+report_render_start_time <- Sys.time()
+opts_chunk$set(
+  results      = 'show', 
+  comment      = NA, 
+  tidy         = FALSE,
+  # dpi        = 400,
+  # out.width  = "650px", #This affects only the markdown, not the underlying png file.  The height will be scaled appropriately.
+  fig.width    = 4, 
+  fig.height   = 4, 
+  fig.path     = 'figure-png/'
+)
+
+echo_chunks    <- FALSE #Toggle for debugging.
+message_chunks <- FALSE #Toggle for debugging.
+options(width=100) #So the output is 25% wider than the default.
+read_chunk("./analysis/report-1/report-1.R") #This allows knitr to call chunks tagged in the underlying *.R file.
+```
+
+<!-- Load 'sourced' R files.  Suppress the output when loading sources. --> 
+```{r load-sources, echo=echo_chunks, message=message_chunks}
+```
+
+<!-- Load packages, or at least verify they're available on the local machine.  Suppress the output when loading packages. --> 
+```{r load-packages, echo=echo_chunks, message=message_chunks}
+```
+
+<!-- Load any global functions and variables declared in the R file.  Suppress the output. --> 
+```{r declare-globals, echo=echo_chunks, results='show', message=message_chunks}
+```
+
+<!-- Declare any global functions specific to a Rmd output.  Suppress the output. --> 
+```{r rmd-specific, echo=echo_chunks, message=message_chunks}
+# Put presentation-specific code in here.  It doesn't call a chunk in the codebehind file.
+#   It should be rare (and used cautiously), but sometimes it makes sense to include code in Rmd 
+#   that doesn't live in the codebehind R file.
+```
+
+<!-- Load the datasets.   -->
+```{r load-data, echo=echo_chunks, results='show', message=message_chunks}
+```
+
+<!-- Tweak the datasets.   -->
+```{r tweak-data, echo=echo_chunks, results='show', message=message_chunks}
+```
+
+# Summary {.tabset .tabset-fade .tabset-pills}
+
+## Notes 
+1. The current report covers `r nrow(ds)` cars, with `r dplyr::n_distinct(ds$carburetor_count_f)` unique values for `carburetor_count`.
+1. The Seattle track's phluguerstometer was producing flaky negative values; it's measurements have been dropped.
+
+## Unanswered Questions
+1. What does `VS` stand for?  How was it measured?
+1. Where the cars at the Philly track measured with the same phluguerstometer and the Cleveland track?
+
+## Answered Questions
+1. The Seattle track's phluguerstometer was producing flaky negative values; it's measurements have been dropped.
+
+# Graphs
+
+## Marginals
+```{r marginals, echo=echo_chunks, message=message_chunks}
+```
+
+## Scatterplots
+```{r scatterplots, echo=echo_chunks, message=message_chunks, fig.width=7}
+```
+
+# Models
+## Model Exploration
+```{r models, echo=echo_chunks, message=message_chunks}
+```
+
+## Final Model
+```{r model-results-table, echo=echo_chunks, message=message_chunks, warning=TRUE}
+```
+
+In the model that includes two predictors, the slope coefficent of `Miles per gallon` is `r summary(m2)$coefficients[2,1]`.
+
+
+# Session Information
+For the sake of documentation and reproducibility, the current report was rendered in the following environment.  Click the line below to expand.
+
+<details>
+  <summary>Environment <span class="glyphicon glyphicon-plus-sign"></span></summary>
+```{r session-info, echo=FALSE}
+if( requireNamespace("devtools", quietly = TRUE) ) {
+  devtools::session_info()
+} else {
+  sessionInfo()
+} 
+```
+</details>
+
+```{r session-duration, echo=FALSE}
+report_render_duration_in_seconds <- round(as.numeric(difftime(Sys.time(), report_render_start_time, units="secs")))
+```
+
+Report rendered by `r Sys.info()["user"]` at `r strftime(Sys.time(), "%Y-%m-%d, %H:%M %z")` in `r report_render_duration_in_seconds` seconds.
+