From 3ec046aab1f31bc0c9316d69d5a71098f5d40f40 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Sun, 14 Oct 2018 12:36:00 -0500 Subject: [PATCH] splitting tagset utility closes #60 --- utility/migrate-variablest-new-tagset.R | 97 +++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 utility/migrate-variablest-new-tagset.R diff --git a/utility/migrate-variablest-new-tagset.R b/utility/migrate-variablest-new-tagset.R new file mode 100644 index 0000000..4fbedee --- /dev/null +++ b/utility/migrate-variablest-new-tagset.R @@ -0,0 +1,97 @@ +# knitr::stitch_rmd(script="./manipulation/te-ellis.R", output="./stitched-output/manipulation/te-ellis.md") # dir.create("./stitched-output/manipulation/", recursive=T) +# For a brief description of this file see the presentation at +# - slides: https://rawgit.com/wibeasley/RAnalysisSkeleton/master/documentation/time-and-effort-synthesis.html#/ +# - code: https://github.com/wibeasley/RAnalysisSkeleton/blob/master/documentation/time-and-effort-synthesis.Rpres +rm(list=ls(all=TRUE)) #Clear the variables from previous runs. + +# ---- load-sources ------------------------------------------------------------ +# Call `base::source()` on any repo file that defines functions needed below. Ideally, no real operations are performed. + +# ---- load-packages ----------------------------------------------------------- +# Attach these package(s) so their functions don't need to be qualified: http://r-pkgs.had.co.nz/namespace.html#search-path +library(magrittr , quietly=TRUE) +library(DBI , quietly=TRUE) + +# Verify these packages are available on the machine, but their functions need to be qualified: http://r-pkgs.had.co.nz/namespace.html#search-path +requireNamespace("readr" ) +requireNamespace("tidyr" ) +requireNamespace("dplyr" ) # Avoid attaching dplyr, b/c its function names conflict with a lot of packages (esp base, stats, and plyr). +requireNamespace("testit" ) # For asserting conditions meet expected patterns/conditions. +requireNamespace("checkmate" ) # For asserting conditions meet expected patterns/conditions. # remotes::install_github("mllg/checkmate") +requireNamespace("RSQLite" ) # Lightweight database for non-PHI data. +# requireNamespace("RODBC" ) # For communicating with SQL Server over a locally-configured DSN. Uncomment if you use 'upload-to-db' chunk. +requireNamespace("OuhscMunge" ) # remotes::install_github(repo="OuhscBbmc/OuhscMunge") + +# ---- declare-globals --------------------------------------------------------- +# Constant values that won't change. +directory_in <- "data-public/metadata/tagsets-79" +path_in_tagset_new <- file.path(directory_in, "Gen2FatherFromGen1Death.NLSY79" ) +path_in_tagset_old <- file.path(directory_in, "Gen2FatherFromGen1.NLSY79" ) +path_out_tagset_old <- path_in_tagset_old # In this case, overwrite the tagset. Make sure things are committed to Git first + +variables_to_duplicate <- c("R0000100") # For Nlsy79 Gen1 +# variables_to_duplicate <- c("C0000100", "C0000200") # For Nlsy79 Gen2 +# variables_to_duplicate <- c("R0000100") # For Nlsy97 + + +col_types <- readr::cols_only( # readr::spec_csv(path_in_oklahoma) + `id` = readr::col_character() +) + +# ---- load-data --------------------------------------------------------------- +# Read the CSVs +ds_new <- readr::read_csv(path_in_tagset_new , col_types=col_types, col_names="id") +ds_old <- readr::read_csv(path_in_tagset_old , col_types=col_types, col_names="id") + +rm(path_in_tagset_new, path_in_tagset_old) + +ds_new +ds_old + +# ---- tweak-data -------------------------------------------------------------- +# OuhscMunge::column_rename_headstart(ds) #Spit out columns to help write call ato `dplyr::rename()`. +ds_to_duplicate <- + tibble::tibble( + id = variables_to_duplicate + ) + +row_count_new_start <- nrow(ds_new) +row_count_old_start <- nrow(ds_old) +row_count_duplicate <- nrow(ds_to_duplicate) + +# ---- anti-join ---------------------------------------------------------- +ds_old <- + ds_old %>% + dplyr::anti_join(ds_new, by="id") %>% + dplyr::union_all( + ds_to_duplicate + ) %>% + dplyr::arrange(id) + +# ---- verify-values ----------------------------------------------------------- +# Sniff out problems +# OuhscMunge::verify_value_headstart(ds_old) +checkmate::assert_character(ds_new$id , any.missing=F , pattern="^[A-Z]\\d{7}$" , unique=T) +checkmate::assert_character(ds_old$id , any.missing=F , pattern="^[A-Z]\\d{7}$" , unique=T) + +row_count_old_end <- nrow(ds_old) +row_count_old_expected <- row_count_old_start - row_count_new_start + row_count_duplicate + +testit::assert(row_count_old_end == row_count_old_expected) + +# ---- specify-columns-to-upload ----------------------------------------------- +# dput(colnames(ds)) # Print colnames for line below. +columns_to_write <- c( + "id" +) +ds_slim <- + ds_old %>% + dplyr::select_(.dots=columns_to_write) %>% + # dplyr::slice(1:100) %>% + dplyr::mutate_if(is.logical, as.integer) # Some databases & drivers need 0/1 instead of FALSE/TRUE. +ds_slim + +rm(columns_to_write) + +# ---- save-to-disk ------------------------------------------------------------ +readr::write_csv(ds_slim, path_out_tagset_old, col_names=F)