From 9d4ed54cbc01ffb9fdbbf5f1e0b01dba5d9616cb Mon Sep 17 00:00:00 2001
From: Lars Vilhuber
Date: Fri, 13 Sep 2024 08:12:51 -0400
Subject: [PATCH] Adding an Rscript to create and verify checksums

---
 checksums.R                   | 127 +++++++++++++++++++++++++++++++++++
 data/metadata/consistency.csv |  16 ++--
 2 files changed, 135 insertions(+), 8 deletions(-)
 create mode 100644 checksums.R

diff --git a/checksums.R b/checksums.R
new file mode 100644
index 0000000..c424c88
--- /dev/null
+++ b/checksums.R
@@ -0,0 +1,127 @@
+# Create and verify SHA256 checksums of the files listed in the metadata CSV.
+#
+# From Stata
+
+# if $redownload == 1 {
+#     copy "https://datahub.io/core/country-codes/r/country-codes.csv" "data/raw/country-codes.csv", replace
+#     // create checksum of file
+#     // Aug 2023 version: 2295658388
+#     global countrycksum 2295658388
+#     checksum "data/raw/country-codes.csv", save
+#     assert $countrycksum == r(checksum)
+#     // This will fail if the files are not identical
+#     // Provide a verbose message if we get past this point
+#     disp in green "Country codes file downloaded successfully"
+# }
+
+library(dplyr)
+library(openssl)
+library(tools)
+
+# Calculate the SHA256 hash of a file.
+#
+# filepath: path of the file to hash.
+# Returns the hash as a single hexadecimal string, or NA_character_ (with a
+# warning) when the hash cannot be computed.
+calculate_sha256 <- function(filepath) {
+  # `filepath` is a lazy argument that is first evaluated inside tryCatch(),
+  # so an error raised while building the path is handled here as well.
+  tryCatch({
+    if (!file.exists(filepath)) {
+      stop("file not found: ", filepath, call. = FALSE)
+    }
+    hash <- sha256(file(filepath))
+    # Convert the raw vector to a single hexadecimal string
+    as.character(hash, sep = "")
+  }, error = function(e) {
+    warning("Could not compute SHA256: ", conditionMessage(e), call. = FALSE)
+    NA_character_
+  })
+}
+
+# Verify a file against its recorded SHA256 hash.
+#
+# filepath: path of the file to check.
+# expected_hash: recorded hexadecimal SHA256 string.
+# Returns TRUE when the file's hash equals expected_hash (letter case and
+# surrounding whitespace ignored); FALSE otherwise, including when the file
+# cannot be hashed or expected_hash is missing.
+verify_checksum <- function(filepath, expected_hash) {
+  calculated_hash <- calculate_sha256(filepath)
+  if (is.na(calculated_hash) || is.na(expected_hash)) {
+    return(FALSE)
+  }
+  tolower(calculated_hash) == tolower(trimws(as.character(expected_hash)))
+}
+
+# Flags
+#   generate:  recompute the hashes and rewrite the metadata file
+#   reprocess: run the (placeholder) processing step at the end
+
+reprocess <- TRUE
+generate  <- TRUE
+
+# filepaths - generalized - this could be in an externally sourced file treated as confidential
+# The `path` column of the metadata file selects one of the data.path.<path>
+# variables below.
+
+data.path <- "data/"
+data.path.external <- file.path(data.path,"external")
+data.path.registry <- file.path(data.path,"registry")
+data.path.metadata <- file.path(data.path,"metadata")
+
+metadata.file <- file.path(data.path.metadata, "consistency.csv")
+
+if (generate) {
+  metadata <- read.csv(metadata.file, stringsAsFactors = FALSE)
+  # Add SHA256 hash column; the directory is resolved through the
+  # data.path.<path> variables, the same way as in the verification below
+  metadata <- metadata %>%
+    rowwise() %>%
+    mutate(sha256sum = calculate_sha256(
+             file.path(get(paste0("data.path.", path)), filename)),
+           asofdate = date()) %>%
+    ungroup() %>%
+    select(filename, path, sha256sum, asofdate)
+  # Write the updated metadata back to a CSV file
+  write.csv(metadata, metadata.file, row.names = FALSE)
+}
+
+# check the checksums before proceeding
+
+# Verify all checksums in the metadata.
+# The metadata file lists itself, but it is rewritten after its own hash was
+# computed, so that recorded hash can never match; skip that row.
+metadata <- read.csv(metadata.file, stringsAsFactors = FALSE)
+metadata <- metadata %>%
+  filter(!(path %in% "metadata" & filename %in% basename(metadata.file))) %>%
+  rowwise() %>%
+  mutate(checksum_verified =
+    verify_checksum(file.path(get(paste0("data.path.", path)), filename),
+                    sha256sum)
+  ) %>%
+  ungroup()
+
+# Display results
+message("Verification results for files")
+print(metadata %>% select(filename, sha256sum, checksum_verified))
+
+# Count of verified and failed checksums
+# (not named `summary`, which would mask base::summary)
+verification_summary <- metadata %>%
+  summarise(
+    total_files = n(),
+    verified = sum(checksum_verified),
+    failed = sum(!checksum_verified)
+  )
+print(verification_summary)
+
+# Flag failures loudly, but do not abort the script
+if (verification_summary$failed > 0) {
+  warning(verification_summary$failed,
+          " file(s) failed checksum verification", call. = FALSE)
+}
+
+if (reprocess) {
+  # Do stuff here
+}
diff --git a/data/metadata/consistency.csv b/data/metadata/consistency.csv
index 1e59e37..0a2a8d2 100644
--- a/data/metadata/consistency.csv
+++ b/data/metadata/consistency.csv
@@ -1,8 +1,8 @@
-filename,path,checksum,date
-anes_timeseries_2020_stata_20220210_hill_roberts_subset.csv,external,,2024-09-11
-China2021Data.csv,external,,2024-09-11
-LucidPlotText.csv,external,,2024-09-11
-LucidWideFile.csv,external,,2024-09-11
-Pre2020Surveys_Pooled.csv,external,,2024-09-11
-trials.Rds,registry,,2024-09-11
-consistency.csv,metadata,,2024-09-11
+"filename","path","sha256sum","asofdate"
+"anes_timeseries_2020_stata_20220210_hill_roberts_subset.csv","external","75d050817b922b02b444b58c279f5d770f01376321d894c80b76364063248b63","Fri Sep 13 07:53:19 2024"
+"China2021Data.csv","external","2ce75c757229ddb631cadccc9c4063537de313d3a754fed13f93054656261bba","Fri Sep 13 07:53:19 2024"
+"LucidPlotText.csv","external","992daee8392ebeee510290a7b5895d53028c358b892a9d7c10a6391b5d979194","Fri Sep 13 07:53:19 2024"
+"LucidWideFile.csv","external","767eb7b05fc1910603780680476cf32fee845728b4c7346e9f0ada6ed0c84f04","Fri Sep 13 07:53:19 2024"
+"Pre2020Surveys_Pooled.csv","external","17b6cc33b8bdc3938aab5b59de0dc4fa2b724d1ad997060385466d0899dae82e","Fri Sep 13 07:53:19 2024"
+"trials.Rds","registry","07e0a48dc2bb5896fd89540915f431eb6e1b36b374c848c1edfbabee8d5e6ad7","Fri Sep 13 07:53:19 2024"
+"consistency.csv","metadata","2732f4a0fa6a580fc601266ed34e76d6b276a256a09e95232022b7b53f4a9025","Fri Sep 13 07:53:19 2024"