This report was automatically generated with the R package knitr (version 1.20).
# knitr::stitch_rmd(script="./dal/import-79-metadata.R", output="./stitched-output/dal/import-metadata.md") # dir.create(output="./stitched-output/dal/", recursive=T)
rm(list=ls(all=TRUE)) #Clear the variables from previous runs.
# Call `base::source()` on any repo file that defines functions needed below. Ideally, no real operations are performed.
base::source("utility/connectivity.R")
# Attach these package(s) so their functions don't need to be qualified: http://r-pkgs.had.co.nz/namespace.html#search-path
library(magrittr , quietly=TRUE)
# Verify these packages are available on the machine, but their functions need to be qualified: http://r-pkgs.had.co.nz/namespace.html#search-path
requireNamespace("readr" )
requireNamespace("tidyr" )
requireNamespace("tibble" )
requireNamespace("purrr" )
requireNamespace("dplyr" ) #Avoid attaching dplyr, b/c its function names conflict with a lot of packages (esp base, stats, and plyr).
requireNamespace("testit" ) #For asserting conditions meet expected patterns.
requireNamespace("odbc" ) #For communicating with SQL Server over a locally-configured DSN. Uncomment if you use 'upload-to-db' chunk.
# Constant values that won't change.
config <- config::get()
directory_in <- "data-public/metadata/tables-97"
study <- "97"
shallow_only <- F # If TRUE, update only the metadata tables that won't delete any other database tables.
col_types_minimal <- readr::cols_only(
ID = readr::col_integer(),
Label = readr::col_character(),
Active = readr::col_logical(),
Notes = readr::col_character()
)
# The order of this list matters.
# - Tables are WRITTEN from top to bottom.
# - Tables are DELETED from bottom to top.
lst_col_types <- list(
ArchiveDescription = readr::cols_only(
AlgorithmVersion = readr::col_integer(),
Description = readr::col_character(),
Date = readr::col_date()
),
item = readr::cols_only(
ID = readr::col_integer(),
Label = readr::col_character(),
MinValue = readr::col_integer(),
MinNonnegative = readr::col_integer(),
MaxValue = readr::col_integer(),
Active = readr::col_logical(),
Notes = readr::col_character()
),
LUExtractSource = col_types_minimal,
LUMarkerEvidence = col_types_minimal,
LUGender = col_types_minimal,
LUMarkerType = readr::cols_only(
ID = readr::col_integer(),
Label = readr::col_character(),
Explicit = readr::col_integer(),
Active = readr::col_logical(),
Notes = readr::col_character()
),
LUMultipleBirth = col_types_minimal,
LURaceCohort = col_types_minimal,
LURoster = col_types_minimal,
LUTristate = col_types_minimal,
LUYesNo = col_types_minimal,
MzManual = readr::cols_only(
ID = readr::col_integer(),
SubjectTag_S1 = readr::col_integer(),
SubjectTag_S2 = readr::col_integer(),
MultipleBirthIfSameSex = readr::col_integer(),
IsMz = readr::col_integer(),
Undecided = readr::col_integer(),
Related = readr::col_integer(),
Notes = readr::col_character()
),
RosterAssignment = readr::cols_only(
ID = readr::col_integer(),
ResponseLower = readr::col_integer(),
ResponseUpper = readr::col_integer(),
Freq = readr::col_integer(),
Resolved = readr::col_integer(),
R = readr::col_double(),
RBoundLower = readr::col_double(),
RBoundUpper = readr::col_double(),
SameGeneration = readr::col_double(),
ShareBiodad = readr::col_integer(),
ShareBiomom = readr::col_integer(),
ShareBiograndparent = readr::col_integer(),
Inconsistent = readr::col_integer(),
Notes = readr::col_character(),
ResponseLowerLabel = readr::col_character(),
ResponseUpperLabel = readr::col_character()
),
variable = readr::cols_only(
# ID = readr::col_integer(),
VariableCode = readr::col_character(),
Item = readr::col_integer(),
ExtractSource = readr::col_integer(),
SurveyYear = readr::col_integer(),
LoopIndex1 = readr::col_integer(),
LoopIndex2 = readr::col_integer(),
Translate = readr::col_integer(),
Active = readr::col_integer(),
Notes = readr::col_character(),
QuestionName = readr::col_character(),
VariableTitle = readr::col_character()
)
)
col_types_mapping <- readr::cols_only(
table_name = readr::col_character(),
schema_name = readr::col_character(),
enum_name = readr::col_character(),
# enum_file = readr::col_character(),
c_sharp_type = readr::col_character(),
convert_to_enum = readr::col_logical(),
shallow = readr::col_logical()
)
ds_file <- lst_col_types %>%
tibble::enframe(value = "col_types") %>%
dplyr::mutate(
path = file.path(directory_in, paste0(name, ".csv")),
exists = purrr::map_lgl(path, file.exists)
# col_types = purrr::map(name, function(x) lst_col_types[[x]]),
) %>%
dplyr::select(name, path, dplyr::everything())
ds_file
## # A tibble: 14 x 4
## name path col_types exists
## <chr> <chr> <list> <lgl>
## 1 ArchiveDescription data-public/metadata/tables-97/A~ <S3: col_s~ TRUE
## 2 item data-public/metadata/tables-97/i~ <S3: col_s~ TRUE
## 3 LUExtractSource data-public/metadata/tables-97/L~ <S3: col_s~ TRUE
## 4 LUMarkerEvidence data-public/metadata/tables-97/L~ <S3: col_s~ TRUE
## 5 LUGender data-public/metadata/tables-97/L~ <S3: col_s~ TRUE
## 6 LUMarkerType data-public/metadata/tables-97/L~ <S3: col_s~ TRUE
## 7 LUMultipleBirth data-public/metadata/tables-97/L~ <S3: col_s~ TRUE
## 8 LURaceCohort data-public/metadata/tables-97/L~ <S3: col_s~ TRUE
## 9 LURoster data-public/metadata/tables-97/L~ <S3: col_s~ TRUE
## 10 LUTristate data-public/metadata/tables-97/L~ <S3: col_s~ TRUE
## 11 LUYesNo data-public/metadata/tables-97/L~ <S3: col_s~ TRUE
## 12 MzManual data-public/metadata/tables-97/M~ <S3: col_s~ TRUE
## 13 RosterAssignment data-public/metadata/tables-97/R~ <S3: col_s~ TRUE
## 14 variable data-public/metadata/tables-97/v~ <S3: col_s~ TRUE
start_time <- Sys.time()
ds_mapping <- readr::read_csv(file.path(directory_in, "_mapping.csv"), col_types=col_types_mapping)
testit::assert("All metadata files must exist.", all(ds_file$exists))
ds_entries <- ds_file %>%
# dplyr::slice(14) %>%
dplyr::select(name, path, col_types) %>%
dplyr::mutate(
entries = purrr::pmap(list(file=.$path, col_types=.$col_types), readr::read_csv, comment = "#")
)
ds_entries
## # A tibble: 14 x 4
## name path col_types entries
## <chr> <chr> <list> <list>
## 1 ArchiveDescription data-public/metadata/table~ <S3: col_s~ <tibble [8 ~
## 2 item data-public/metadata/table~ <S3: col_s~ <tibble [26~
## 3 LUExtractSource data-public/metadata/table~ <S3: col_s~ <tibble [6 ~
## 4 LUMarkerEvidence data-public/metadata/table~ <S3: col_s~ <tibble [8 ~
## 5 LUGender data-public/metadata/table~ <S3: col_s~ <tibble [3 ~
## 6 LUMarkerType data-public/metadata/table~ <S3: col_s~ <tibble [28~
## 7 LUMultipleBirth data-public/metadata/table~ <S3: col_s~ <tibble [5 ~
## 8 LURaceCohort data-public/metadata/table~ <S3: col_s~ <tibble [4 ~
## 9 LURoster data-public/metadata/table~ <S3: col_s~ <tibble [92~
## 10 LUTristate data-public/metadata/table~ <S3: col_s~ <tibble [3 ~
## 11 LUYesNo data-public/metadata/table~ <S3: col_s~ <tibble [6 ~
## 12 MzManual data-public/metadata/table~ <S3: col_s~ <tibble [90~
## 13 RosterAssignment data-public/metadata/table~ <S3: col_s~ <tibble [31~
## 14 variable data-public/metadata/table~ <S3: col_s~ <tibble [55~
# d <- readr::read_csv("data-public/metadata/tables/variable_97.csv", col_types=lst_col_types$variable_97, comment = "#")
# readr::problems(d)
# ds_entries$entries[15]
ds_table <- database_inventory(study)
ds_table
## # A tibble: 32 x 6
## schema_name table_name row_count column_count space_total_kb
## * <chr> <chr> <int> <int> <int>
## 1 Archive tblArchiveDescription 8 3 72
## 2 Archive tblRelatedValuesArch~ 27709 23 2392
## 3 dbo sysdiagrams 0 5 0
## 4 Enum tblLUExtractSource 6 4 72
## 5 Enum tblLUGender 3 4 72
## 6 Enum tblLUMarkerEvidence 8 4 72
## 7 Enum tblLUMarkerType 28 5 72
## 8 Enum tblLUMultipleBirth 5 4 72
## 9 Enum tblLURaceCohort 4 4 72
## 10 Enum tblLURoster 92 4 72
## # ... with 22 more rows, and 1 more variable: space_used_kb <int>
rm(directory_in) # rm(col_types_tulsa)
# OuhscMunge::column_rename_headstart(ds_county) #Spit out columns to help write call to `dplyr::rename()`.
if( shallow_only ) {
ds_mapping <- ds_mapping %>%
dplyr::filter(.data$shallow)
}
ds_mapping
## # A tibble: 14 x 6
## table_name schema_name enum_name c_sharp_type convert_to_enum shallow
## <chr> <chr> <chr> <chr> <lgl> <lgl>
## 1 ArchiveDesc~ Archive NA_chara~ NA_character FALSE TRUE
## 2 item Metadata Item short TRUE FALSE
## 3 LUExtractSo~ Enum ExtractS~ byte TRUE FALSE
## 4 LUGender Enum Gender byte TRUE FALSE
## 5 LUMarkerEvi~ Enum MarkerEv~ byte TRUE FALSE
## 6 LUMarkerType Enum MarkerTy~ byte TRUE FALSE
## 7 LUMultipleB~ Enum Multiple~ byte TRUE FALSE
## 8 LURaceCohort Enum RaceCoho~ byte TRUE FALSE
## 9 LURoster Enum RosterGe~ short TRUE FALSE
## 10 LUTristate Enum Tristate byte TRUE FALSE
## 11 LUYesNo Enum YesNo short TRUE FALSE
## 12 MzManual Metadata NA_chara~ NA_character FALSE TRUE
## 13 RosterAssig~ Metadata NA_chara~ NA_character FALSE FALSE
## 14 variable Metadata NA_chara~ NA_character FALSE FALSE
ds_file <- ds_file %>%
dplyr::inner_join(ds_mapping, by=c("name"="table_name")) %>%
dplyr::mutate(
table_name = paste0("tbl", name),
sql_delete = glue::glue("DELETE FROM {schema_name}.{table_name};")
) %>%
dplyr::left_join(
ds_entries %>%
dplyr::select(name, entries)
, by="name"
)
rm(ds_entries)
ds_file$entries %>%
purrr::walk(print)
## # A tibble: 8 x 3
## AlgorithmVersion Description Date
## <int> <chr> <date>
## 1 1 naive roster 2018-01-17
## 2 2 account for twins 2018-01-18
## 3 3 same sib full twins are R=.5 by default, an~ 2018-02-14
## 4 7 allows nulls for RFull 2018-06-19
## 5 8 recover different-sex full sibs 2018-06-19
## 6 9 recover same-sex full sibs 2018-06-19
## 7 10 allow nonsibs to still be r>0 2018-06-19
## 8 11 refresh 2018-06-20
## # A tibble: 26 x 7
## ID Label MinValue MinNonnegative MaxValue Active Notes
## <int> <chr> <int> <int> <int> <lgl> <chr>
## 1 1 subject_id 1 1 9022 TRUE <NA>
## 2 2 extended_famil~ 1 1 9022 TRUE <NA>
## 3 3 hh_internal_id 1 1 5 TRUE <NA>
## 4 10 gender 1 1 2 TRUE <NA>
## 5 11 DateOfBirthMon~ 1 1 12 TRUE <NA>
## 6 12 DateOfBirthYear 1980 1980 1984 TRUE <NA>
## 7 13 cross_sectiona~ 0 0 1 TRUE <NA>
## 8 14 race_cohort 1 1 4 TRUE race-eth~
## 9 20 InterviewDateD~ -7 1 31 TRUE <NA>
## 10 21 InterviewDateM~ -7 1 12 TRUE <NA>
## # ... with 16 more rows
## # A tibble: 6 x 4
## ID Label Active Notes
## <int> <chr> <lgl> <chr>
## 1 1 97-demographics TRUE <NA>
## 2 2 97-roster TRUE <NA>
## 3 3 97-survey-time TRUE <NA>
## 4 4 97-links-explicit TRUE <NA>
## 5 5 97-links-implicit TRUE <NA>
## 6 6 97-twins TRUE <NA>
## # A tibble: 8 x 4
## ID Label Active Notes
## <int> <chr> <lgl> <chr>
## 1 0 Irrelevant TRUE <NA>
## 2 1 StronglySupports TRUE <NA>
## 3 2 Supports TRUE <NA>
## 4 3 Consistent TRUE <NA>
## 5 4 Ambiguous TRUE <NA>
## 6 5 Missing TRUE <NA>
## 7 6 Unlikely TRUE <NA>
## 8 7 Disconfirms TRUE <NA>
## # A tibble: 3 x 4
## ID Label Active Notes
## <int> <chr> <lgl> <chr>
## 1 1 Male TRUE <NA>
## 2 2 Female TRUE <NA>
## 3 255 InvalidSkipGen2 TRUE <NA>
## # A tibble: 28 x 5
## ID Label Explicit Active Notes
## <int> <chr> <int> <lgl> <chr>
## 1 1 Roster 1 TRUE <NA>
## 2 2 ShareBiomom 1 TRUE <NA>
## 3 3 ShareBiodad 1 TRUE <NA>
## 4 5 DobSeparation 0 FALSE <NA>
## 5 6 GenderAgreement 0 FALSE <NA>
## 6 10 FatherAsthma 0 FALSE <NA>
## 7 11 BabyDaddyAsthma 0 FALSE <NA>
## 8 12 BabyDaddyLeftHHDate 0 FALSE <NA>
## 9 13 BabyDaddyDeathDate 0 FALSE <NA>
## 10 14 BabyDaddyAlive 0 FALSE <NA>
## # ... with 18 more rows
## # A tibble: 5 x 4
## ID Label Active Notes
## <int> <chr> <lgl> <chr>
## 1 0 No TRUE <NA>
## 2 2 Twin TRUE <NA>
## 3 3 Trip TRUE <NA>
## 4 4 TwinOrTrip FALSE Currently Then Gen1 algorithm doesn't distingui~
## 5 255 DoNotKnow TRUE <NA>
## # A tibble: 4 x 4
## ID Label Active Notes
## <int> <chr> <lgl> <chr>
## 1 1 Black TRUE <NA>
## 2 2 Hispanic TRUE <NA>
## 3 3 Mixed TRUE <NA>
## 4 4 Nbnh TRUE <NA>
## # A tibble: 92 x 4
## ID Label Active Notes
## <int> <chr> <lgl> <chr>
## 1 -4 valid_skip TRUE <NA>
## 2 -2 do_not_know TRUE <NA>
## 3 -1 refusal TRUE <NA>
## 4 0 self TRUE <NA>
## 5 1 wife TRUE <NA>
## 6 2 husband TRUE <NA>
## 7 3 mother TRUE <NA>
## 8 4 father TRUE <NA>
## 9 7 mother_step TRUE <NA>
## 10 8 father_step TRUE <NA>
## # ... with 82 more rows
## # A tibble: 3 x 4
## ID Label Active Notes
## <int> <chr> <lgl> <chr>
## 1 0 No TRUE <NA>
## 2 1 Yes TRUE <NA>
## 3 255 DoNotKnow TRUE <NA>
## # A tibble: 6 x 4
## ID Label Active Notes
## <int> <chr> <lgl> <chr>
## 1 -6 ValidSkipOrNoInterviewOrNotInSurvey TRUE <NA>
## 2 -3 InvalidSkip TRUE <NA>
## 3 -2 DoNotKnow TRUE <NA>
## 4 -1 Refusal TRUE <NA>
## 5 0 No TRUE <NA>
## 6 1 Yes TRUE <NA>
## # A tibble: 90 x 8
## ID SubjectTag_S1 SubjectTag_S2 MultipleBirthIfSameS~ IsMz Undecided
## <int> <int> <int> <int> <int> <int>
## 1 1 66 67 0 0 0
## 2 2 75 76 2 1 0
## 3 3 116 117 2 1 0
## 4 4 222 223 2 0 0
## 5 5 343 344 3 255 1
## 6 6 343 345 3 255 1
## 7 7 344 345 3 255 1
## 8 8 351 352 2 1 0
## 9 9 447 448 2 1 0
## 10 10 588 589 2 0 0
## # ... with 80 more rows, and 2 more variables: Related <int>, Notes <chr>
## # A tibble: 31 x 16
## ID ResponseLower ResponseUpper Freq Resolved R RBoundLower
## <int> <int> <int> <int> <int> <dbl> <dbl>
## 1 1 -2 -1 2 0 NA 0
## 2 2 -1 -1 2 0 NA 0
## 3 3 13 13 1034 0 0.5 0.5
## 4 4 13 14 2034 1 0.5 0.5
## 5 5 14 14 1154 0 0.5 0.5
## 6 6 15 15 48 1 0.25 0.25
## 7 7 15 18 132 1 0.25 0.25
## 8 8 16 19 2 1 0.25 0.25
## 9 9 18 18 62 1 0.25 0.25
## 10 10 19 19 8 1 0.25 0.25
## # ... with 21 more rows, and 9 more variables: RBoundUpper <dbl>,
## # SameGeneration <dbl>, ShareBiodad <int>, ShareBiomom <int>,
## # ShareBiograndparent <int>, Inconsistent <int>, Notes <chr>,
## # ResponseLowerLabel <chr>, ResponseUpperLabel <chr>
## # A tibble: 551 x 11
## VariableCode Item ExtractSource SurveyYear LoopIndex1 LoopIndex2
## <chr> <int> <int> <int> <int> <int>
## 1 R0000100 1 1 1997 1 1
## 2 R1193000 2 1 1997 1 1
## 3 R0533400 3 1 1997 1 1
## 4 R0536300 10 1 1997 1 1
## 5 R0536401 11 1 1997 1 1
## 6 R0536402 12 1 1997 1 1
## 7 R1235800 13 1 1997 1 1
## 8 R1482600 14 1 1997 1 1
## 9 R1097800 101 2 1997 1 1
## 10 R1097900 101 2 1997 2 1
## # ... with 541 more rows, and 5 more variables: Translate <int>,
## # Active <int>, Notes <chr>, QuestionName <chr>, VariableTitle <chr>
# ds_file %>%
# dplyr::group_by(name) %>%
# dplyr::mutate(
# a = purrr::map_int(entries, ~max(nchar(.), na.rm=T))
# ) %>%
# dplyr::ungroup() %>%
# dplyr::pull(a)
# ds_file %>%
# dplyr::select(name, entries) %>%
# tibble::deframe() %>%
# purrr::map(~max(nchar(.), na.rm=T))
# lst_ds %>%
# purrr::map(nrow)
# lst_ds %>%
# purrr::map(readr::spec)
ds_file$table_name
## [1] "tblArchiveDescription" "tblitem"
## [3] "tblLUExtractSource" "tblLUMarkerEvidence"
## [5] "tblLUGender" "tblLUMarkerType"
## [7] "tblLUMultipleBirth" "tblLURaceCohort"
## [9] "tblLURoster" "tblLUTristate"
## [11] "tblLUYesNo" "tblMzManual"
## [13] "tblRosterAssignment" "tblvariable"
ds_file
## # A tibble: 14 x 12
## name path col_types exists schema_name enum_name c_sharp_type
## <chr> <chr> <list> <lgl> <chr> <chr> <chr>
## 1 Archiv~ data-publi~ <S3: col~ TRUE Archive NA_chara~ NA_character
## 2 item data-publi~ <S3: col~ TRUE Metadata Item short
## 3 LUExtr~ data-publi~ <S3: col~ TRUE Enum ExtractS~ byte
## 4 LUMark~ data-publi~ <S3: col~ TRUE Enum MarkerEv~ byte
## 5 LUGend~ data-publi~ <S3: col~ TRUE Enum Gender byte
## 6 LUMark~ data-publi~ <S3: col~ TRUE Enum MarkerTy~ byte
## 7 LUMult~ data-publi~ <S3: col~ TRUE Enum Multiple~ byte
## 8 LURace~ data-publi~ <S3: col~ TRUE Enum RaceCoho~ byte
## 9 LURost~ data-publi~ <S3: col~ TRUE Enum RosterGe~ short
## 10 LUTris~ data-publi~ <S3: col~ TRUE Enum Tristate byte
## 11 LUYesNo data-publi~ <S3: col~ TRUE Enum YesNo short
## 12 MzManu~ data-publi~ <S3: col~ TRUE Metadata NA_chara~ NA_character
## 13 Roster~ data-publi~ <S3: col~ TRUE Metadata NA_chara~ NA_character
## 14 variab~ data-publi~ <S3: col~ TRUE Metadata NA_chara~ NA_character
## # ... with 5 more variables: convert_to_enum <lgl>, shallow <lgl>,
## # table_name <chr>, sql_delete <chr>, entries <list>
create_enum_body <- function( d ) {
tab_spaces <- " "
labels <- dplyr::if_else( d$Active , d$Label, paste("//", d$Label))
comments <- dplyr::if_else(is.na(d$Notes ), "" , paste("//", d$Notes))
paste0(sprintf("%s%-60s = %5s, %s\n", tab_spaces, labels, d$ID, comments), collapse="")
}
# ds_file %>%
# dplyr::filter(name=="LURelationshipPath") %>%
# dplyr::pull(entries)
ds_enum <- ds_file %>%
dplyr::filter(convert_to_enum) %>%
dplyr::select(enum_name, entries, c_sharp_type) %>%
dplyr::mutate(
enum_header = paste0("\npublic enum ", .$enum_name, " {\n"),
enum_body = purrr::map_chr(.$entries, create_enum_body),
enum_footer = "}\n",
enum_cs = paste0(enum_header, enum_body, enum_footer)
) %>%
dplyr::select(-enum_header, -enum_body, -enum_footer)
ds_enum %>%
dplyr::pull(enum_cs) %>%
cat()
##
## public enum Item {
## subject_id = 1,
## extended_family_id = 2,
## hh_internal_id = 3,
## gender = 10,
## DateOfBirthMonth = 11,
## DateOfBirthYear = 12,
## cross_sectional_cohort = 13,
## race_cohort = 14, // race-ethnicity
## InterviewDateDay = 20,
## InterviewDateMonth = 21,
## InterviewDateYear = 22,
## AgeAtInterviewDateMonths = 23,
## AgeAtInterviewDateYears = 24,
## roster_crosswalk = 101,
## hh_member_id = 102,
## hh_informant = 103,
## // roster_relationship_2_dim = 104, // 16 x 16 square
## roster_relationship_1_dim = 105, // 1 x 16 vector
## hh_unique_id = 106, // HHI2: People living in the Household - sorted, UID; HH member's unique ID
## pair_multiple_birth = 121,
## pair_twins_mz = 122,
## pair_sister_same_bioparent = 123,
## pair_brother_same_bioparent = 124,
## // InterviewDateDayParent_NOTUSED = 1020,
## // InterviewDateMonthParent_NOTUSED = 1021,
## // InterviewDateYearParent_NOTUSED = 1022,
## }
##
## public enum ExtractSource {
## 97-demographics = 1,
## 97-roster = 2,
## 97-survey-time = 3,
## 97-links-explicit = 4,
## 97-links-implicit = 5,
## 97-twins = 6,
## }
##
## public enum MarkerEvidence {
## Irrelevant = 0,
## StronglySupports = 1,
## Supports = 2,
## Consistent = 3,
## Ambiguous = 4,
## Missing = 5,
## Unlikely = 6,
## Disconfirms = 7,
## }
##
## public enum Gender {
## Male = 1,
## Female = 2,
## InvalidSkipGen2 = 255,
## }
##
## public enum MarkerType {
## Roster = 1,
## ShareBiomom = 2,
## ShareBiodad = 3,
## // DobSeparation = 5,
## // GenderAgreement = 6,
## // FatherAsthma = 10,
## // BabyDaddyAsthma = 11,
## // BabyDaddyLeftHHDate = 12,
## // BabyDaddyDeathDate = 13,
## // BabyDaddyAlive = 14,
## // BabyDaddyInHH = 15,
## // BabyDaddyDistanceFromHH = 16,
## // Gen2CFatherAlive = 17,
## // Gen2CFatherInHH = 18,
## // Gen2CFatherDistanceFromHH = 19,
## // Gen1BiodadInHH = 30,
## // Gen1BiodadDeathAge = 31,
## // Gen1BiodadBirthYear = 32,
## // Gen1BiodadInHH1979 = 33,
## // Gen1BiodadBirthCountry = 34,
## // Gen1BiodadBirthState = 35,
## // Gen1BiomomInHH = 40,
## // Gen1BiomomDeathAge = 41,
## // Gen1BiomomBirthYear = 42,
## // Gen1BiomomInHH1979 = 43,
## // Gen1BiomomBirthCountry = 44,
## // Gen1BiomomBirthState = 45,
## // Gen1AlwaysLivedWithBothBioparents = 50,
## }
##
## public enum MultipleBirth {
## No = 0,
## Twin = 2,
## Trip = 3,
## // TwinOrTrip = 4, // Currently Then Gen1 algorithm doesn't distinguish.
## DoNotKnow = 255,
## }
##
## public enum RaceCohort {
## Black = 1,
## Hispanic = 2,
## Mixed = 3,
## Nbnh = 4,
## }
##
## public enum RosterGen1 {
## valid_skip = -4,
## do_not_know = -2,
## refusal = -1,
## self = 0,
## wife = 1,
## husband = 2,
## mother = 3,
## father = 4,
## mother_step = 7,
## father_step = 8,
## mother_adoptive = 5,
## father_adoptive = 6,
## mother_foster = 9,
## father_foster = 10,
## mother_in_law = 11,
## father_in_law = 12,
## sister_full = 13,
## brother_full = 14,
## sister_half_same_mother = 15,
## sister_half_same_father = 16,
## sister_half_unsure = 17,
## brother_half_same_mother = 18,
## brother_half_same_father = 19,
## brother_half_unsure = 20,
## sister_step = 21,
## brother_step = 22,
## sister_adoptive = 23,
## brother_adoptive = 24,
## sister_foster = 25,
## brother_foster = 26,
## brother_in_law = 27,
## sister_in_law = 28,
## grandmother_maternal = 29,
## grandmother_paternal = 30,
## grandmother_social = 31,
## grandmother_unsure = 32,
## grandfather_maternal = 33,
## grandfather_paternal = 34,
## grandfather_social = 35,
## grandfather_unsure = 36,
## great_grandmother = 37,
## great_grandfather = 38,
## great_grandmother_social = 39,
## great_grandmother_unsure = 40,
## great_grandfather_maternal = 41,
## great_grandfather_paternal = 42,
## great_grandfather_social = 43,
## great_grandfather_unsure = 44,
## great_great_grandmother = 45,
## great_great_grandfather = 46,
## granddaughter = 47,
## grandson = 48,
## daughter_bio = 49,
## son_bio = 50,
## daughter_step = 51,
## son_step = 52,
## daughter_adoptive = 53,
## son_adoptive = 54,
## daughter_foster = 55,
## son_foster = 56,
## daughter_of_partner = 57,
## son_of_partner = 58,
## daughter_in_law = 59,
## son_in_law = 60,
## grandmother_in_law = 61,
## grandfather_in_law = 62,
## aunt_in_law = 63,
## uncle_in_law = 64,
## cousin_in_law = 65,
## great_grandmother_in_law = 66,
## great_grandfather_in_law = 67,
## roommate = 68,
## partner = 69,
## aunt_unsure = 70,
## great_aunt = 71,
## uncle_unsure = 72,
## great_uncle = 73,
## niece_unsure = 74,
## niece_step = 75,
## niece_foster = 76,
## niece_adoptive = 77,
## nephew_unsure = 78,
## nephew_step = 79,
## nephew_foster = 80,
## nephew_adoptive = 81,
## cousin_female_unsure = 82,
## cousin_male_unsure = 83,
## relative_other = 84,
## nonrelative_other = 85,
## great_grandson = 86,
## great_granddaughter = 87,
## relationship_missing = 99,
## }
##
## public enum Tristate {
## No = 0,
## Yes = 1,
## DoNotKnow = 255,
## }
##
## public enum YesNo {
## ValidSkipOrNoInterviewOrNotInSurvey = -6,
## InvalidSkip = -3,
## DoNotKnow = -2,
## Refusal = -1,
## No = 0,
## Yes = 1,
## }
# Sniff out problems
if( !shallow_only ) {
d_extract_source <- ds_file %>%
dplyr::filter(name=="LUExtractSource") %>%
dplyr::pull(entries) %>%
purrr::flatten_df()
d_item <- ds_file %>%
dplyr::filter(name=="item") %>%
dplyr::pull(entries) %>%
purrr::flatten_df()
checkmate::assert_integer( d_item$ID , lower=1, upper=2^15 , any.missing=F, unique=T)
checkmate::assert_character(d_item$Label , pattern="^\\w+" , any.missing=F, unique=T)
d_variable <- ds_file %>%
dplyr::filter(name=="variable") %>%
dplyr::pull(entries) %>%
purrr::flatten_df() %>%
dplyr::mutate(
item_found = (ExtractSource %in% d_extract_source$ID),
extract_found = (Item %in% d_item$ID),
unique_index = paste(Item, SurveyYear, LoopIndex1, LoopIndex2)
) %>%
dplyr::group_by(unique_index) %>%
dplyr::mutate(
unique_index_violation = (1L < n())
) %>%
dplyr::ungroup()
pattern_unique_index <- "^\\d{1,5} \\d{4} \\d{1,2} \\d{1,2}$"
checkmate::assert_character(d_variable$VariableCode , pattern="^[A-Z]\\d{7}$" , any.missing=F, unique=T)
checkmate::assert_integer( d_variable$Item , lower=0 , any.missing=F)
checkmate::assert_logical( d_variable$item_found , any.missing=F)
testit::assert("All items referenced from the variables should be in the item table.", all(d_variable$item_found))
testit::assert("All extract sources referenced from the variables should be in the item table.", all(d_variable$extract_found))
checkmate::assert_character(d_variable$unique_index , pattern=pattern_unique_index , any.missing=F, unique=T)
rm(d_item, d_variable)
}
# lst_ds %>%
# purrr::map(function(x)paste(names(x)))
ds_table_process <- ds_table %>%
dplyr::filter(schema_name == "Process") %>%
dplyr::mutate(
# sql_truncate = glue::glue("TRUNCATE TABLE {schema_name}.{table_name};")
sql_truncate = glue::glue("DELETE FROM {schema_name}.{table_name};")
)
# Open channel
channel <- open_dsn_channel_odbc(study)
DBI::dbGetInfo(channel)
## $dbname
## [1] "NlsyLinks97"
##
## $dbms.name
## [1] "Microsoft SQL Server"
##
## $db.version
## [1] "13.00.4206"
##
## $username
## [1] "dbo"
##
## $host
## [1] ""
##
## $port
## [1] ""
##
## $sourcename
## [1] "local-nlsy-links-97"
##
## $servername
## [1] "GIMBLE\\EXPRESS_2016"
##
## $drivername
## [1] "msodbcsql17.dll"
##
## $odbc.version
## [1] "03.80.0000"
##
## $driver.version
## [1] "17.01.0000"
##
## $odbcdriver.version
## [1] "03.80"
##
## $supports.transactions
## [1] TRUE
##
## attr(,"class")
## [1] "Microsoft SQL Server" "driver_info" "list"
channel_rodbc <- open_dsn_channel_rodbc(study)
RODBC::odbcGetInfo(channel_rodbc)
## DBMS_Name DBMS_Ver Driver_ODBC_Ver
## "Microsoft SQL Server" "13.00.4206" "03.80"
## Data_Source_Name Driver_Name Driver_Ver
## "local-nlsy-links-97" "msodbcsql17.dll" "17.01.0000"
## ODBC_Ver Server_Name
## "03.80.0000" "GIMBLE\\EXPRESS_2016"
if( !shallow_only ){
# Clear process tables
delete_results_process <- ds_table_process$sql_truncate %>%
purrr::set_names(ds_table_process$table_name) %>%
rev() %>%
purrr::map(DBI::dbGetQuery, conn=channel)
delete_results_process
}
## $tblSurveyTime
## data frame with 0 columns and 0 rows
##
## $tblSubjectDetails
## data frame with 0 columns and 0 rows
##
## $tblSubject
## data frame with 0 columns and 0 rows
##
## $tblRoster
## data frame with 0 columns and 0 rows
##
## $tblResponse
## data frame with 0 columns and 0 rows
##
## $tblRelatedValues
## data frame with 0 columns and 0 rows
##
## $tblRelatedStructure
## data frame with 0 columns and 0 rows
##
## $tblParentsOfGen1Current
## data frame with 0 columns and 0 rows
##
## $tblOutcome
## data frame with 0 columns and 0 rows
##
## $tblMarker
## data frame with 0 columns and 0 rows
# Delete metadata tables
# delete_result <- RODBC::sqlQuery(channel, "DELETE FROM [NlsLinks].[Metadata].[tblVariable]", errors=FALSE)
delete_results_metadata <- ds_file$sql_delete %>%
purrr::set_names(ds_file$table_name) %>%
rev() %>%
purrr::map(DBI::dbGetQuery, conn=channel)
# DBI::dbGetQuery(conn=channel, ds_file$sql_delete[15])
delete_results_metadata
## $tblvariable
## data frame with 0 columns and 0 rows
##
## $tblRosterAssignment
## data frame with 0 columns and 0 rows
##
## $tblMzManual
## data frame with 0 columns and 0 rows
##
## $tblLUYesNo
## data frame with 0 columns and 0 rows
##
## $tblLUTristate
## data frame with 0 columns and 0 rows
##
## $tblLURoster
## data frame with 0 columns and 0 rows
##
## $tblLURaceCohort
## data frame with 0 columns and 0 rows
##
## $tblLUMultipleBirth
## data frame with 0 columns and 0 rows
##
## $tblLUMarkerType
## data frame with 0 columns and 0 rows
##
## $tblLUGender
## data frame with 0 columns and 0 rows
##
## $tblLUMarkerEvidence
## data frame with 0 columns and 0 rows
##
## $tblLUExtractSource
## data frame with 0 columns and 0 rows
##
## $tblitem
## data frame with 0 columns and 0 rows
##
## $tblArchiveDescription
## data frame with 0 columns and 0 rows
# d <- ds_file %>%
# dplyr::select(table_name, entries) %>%
# dplyr::filter(table_name=="Enum.tblLURosterGen1") %>%
# tibble::deframe() %>%
# .[[1]]
# d2 <- d[, 1:16]
# RODBC::sqlSave(channel, dat=d, tablename="Enum.tblLURosterGen1", safer=TRUE, rownames=FALSE, append=TRUE)
# ds_file <- ds_file %>%
# dplyr::slice(1)
# Upload metadata tables
# i <- 2L
# OuhscMunge::upload_sqls_odbc(
# d = ds_file$entries[[i]] %>%
# dplyr::mutate_if(is.logical, as.character),
# schema_name = ds_file$schema_name[[i]],
# table_name = ds_file$table_name[[i]],
# dsn_name = dsn_name(study),
# clear_table = F,
# create_table = FALSE,
# convert_logical_to_integer = F
# )
purrr::pmap_int(
list(
ds_file$entries,
ds_file$table_name,
ds_file$schema_name
# seq_len(nrow(ds_file))
),
function( d, table_name, schema_name ) {
message("Writing to table ", table_name)
# OuhscMunge::upload_sqls_odbc(
# d = d,
# schema_name = schema_name,
# table_name = table_name,
# dsn_name = dsn_name(study),
# clear_table = TRUE,
# create_table = FALSE,
# convert_logical_to_integer = TRUE
# )
# browser()
# DBI::dbWriteTable(
# conn = channel,
# name = table_name,
# schema = schema_name,
# value = d,
#
# append = F
# )
RODBC::sqlSave(
channel = channel_rodbc,
dat = d,
# tablename = table_name,
tablename = paste0(schema_name, ".", table_name),
safer = TRUE, # Don't keep the existing table.
rownames = FALSE,
append = TRUE
)
}
) #%>%
## Writing to table tblArchiveDescription
## Writing to table tblitem
## Writing to table tblLUExtractSource
## Writing to table tblLUMarkerEvidence
## Writing to table tblLUGender
## Writing to table tblLUMarkerType
## Writing to table tblLUMultipleBirth
## Writing to table tblLURaceCohort
## Writing to table tblLURoster
## Writing to table tblLUTristate
## Writing to table tblLUYesNo
## Writing to table tblMzManual
## Writing to table tblRosterAssignment
## Writing to table tblvariable
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# purrr::set_names(ds_file$table_name)
# a <- ds_file$entries[[13]]
# table(a$ID)
# odbc::dbWriteTable(
# conn = channel,
# name = DBI::SQL("Metadata.tblRosterAssignment"),
# # name = "tblvariable_97",
# # schema = "Metadata",
# value = ds_file$entries[[13]],
# append = T
# )
# RODBC::sqlSave(
# channel = channel_rodbc,
# dat = ds_file$entries[[13]][1:1, ],
# # tablename = table_name,
# tablename = "Metadata.tblRosterAssignment",
# safer = TRUE, # Don't keep the existing table.
# rownames = FALSE,
# append = TRUE
# )
# for( i in seq_len(nrow(ds_file)) ) {
# message(glue::glue("Uploading from `{ basename(ds_file$path)[i]}` to `{ds_file$table_name[i]}`."))
#
# d <- ds_file$entries[[i]]
# print(d)
#
# # RODBC::sqlQuery(channel, ds_extract$sql_truncate[i], errors=FALSE)
#
# # d_peek <- RODBC::sqlQuery(channel, ds_extract$sql_select[i], errors=FALSE)
# #
# # missing_in_extract <- setdiff(colnames(d_peek), colnames(d))
# # missing_in_database <- setdiff(colnames(d), colnames(d_peek))
# #
# # d_column <- tibble::tibble(
# # db = colnames(d),
# # extract = colnames(d_peek)
# # ) %>%
# # dplyr::filter(db != extract)
# #
# # RODBC::sqlSave(
# # channel = channel,
# # dat = d,
# # tablename = ds_extract$table_name[i],
# # safer = TRUE, # Don't keep the existing table.
# # rownames = FALSE,
# # append = TRUE
# # ) %>%
# # print()
#
# OuhscMunge::upload_sqls_rodbc(
# d = d,
# table_name = ds_file$table_name[i] ,
# dsn_name = "local-nlsy-links",
# clear_table = T,
# create_table = F
# )
#
#
# message(glue::glue("{format(object.size(d), units='MB')}"))
# }
# Close channels
DBI::dbDisconnect(channel); rm(channel)
RODBC::odbcClose(channel_rodbc); rm(channel_rodbc)
duration_in_seconds <- round(as.numeric(difftime(Sys.time(), start_time, units="secs")))
cat("`import-97-metadata.R` file completed by `", Sys.info()["user"], "` at ", strftime(Sys.time(), "%Y-%m-%d, %H:%M %z"), " in ", duration_in_seconds, " seconds.", sep="")
## `import-97-metadata.R` file completed by `Will` at 2018-06-27, 10:59 -0500 in 13 seconds.
The R session information (including the OS info, R version and all packages used):
sessionInfo()
## R version 3.5.0 Patched (2018-05-14 r74725)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows >= 8 x64 (build 9200)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_2.2.1 DBI_1.0.0 bindrcpp_0.2.2 magrittr_1.5
## [5] knitr_1.20
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.4 purrr_0.2.5 colorspace_1.3-2
## [4] testthat_2.0.0 htmltools_0.3.6 viridisLite_0.3.0
## [7] yaml_2.1.19 chron_2.3-52 utf8_1.1.4
## [10] blob_1.1.1 rlang_0.2.1 pillar_1.2.3
## [13] glue_1.2.0 withr_2.1.2 bit64_0.9-7
## [16] gsubfn_0.7 bindr_0.1.1 plyr_1.8.4
## [19] stringr_1.3.1 munsell_0.5.0 gtable_0.2.0
## [22] rvest_0.3.2 devtools_1.13.5 kableExtra_0.9.0
## [25] memoise_1.1.0 evaluate_0.10.1 labeling_0.3
## [28] OuhscMunge_0.1.9.9008 markdown_0.8 highr_0.7
## [31] proto_1.0.0 Rcpp_0.12.17 readr_1.2.0
## [34] scales_0.5.0 backports_1.1.2 checkmate_1.8.6
## [37] config_0.3 bit_1.1-14 testit_0.8
## [40] hms_0.4.2.9000 digest_0.6.15 stringi_1.2.3
## [43] dplyr_0.7.5 rprojroot_1.3-2 grid_3.5.0
## [46] cli_1.0.0 odbc_1.1.6 tools_3.5.0
## [49] sqldf_0.4-11 lazyeval_0.2.1 tibble_1.4.2
## [52] RSQLite_2.1.1 crayon_1.3.4 tidyr_0.8.1
## [55] pkgconfig_2.0.1 RODBC_1.3-15 xml2_1.2.0
## [58] assertthat_0.2.0 rmarkdown_1.10 httr_1.3.1
## [61] rstudioapi_0.7 R6_2.2.2 compiler_3.5.0
Sys.time()
## [1] "2018-06-27 10:59:20 CDT"