Skip to content

Commit

Permalink
Adding an Rscript to create and verify checksums
Browse files Browse the repository at this point in the history
  • Loading branch information
larsvilhuber committed Sep 13, 2024
1 parent d5cb6d9 commit 9d4ed54
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 8 deletions.
94 changes: 94 additions & 0 deletions checksums.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Reference: the equivalent download-and-verify logic in Stata, kept for comparison

# if $redownload == 1 {
# copy "https://datahub.io/core/country-codes/r/country-codes.csv" "data/raw/country-codes.csv", replace
# // create checksum of file
# // Aug 2023 version: 2295658388
# global countrycksum 2295658388
# checksum "data/raw/country-codes.csv", save
# assert $countrycksum == r(checksum)
# // This will fail if the files are not identical
# // Provide a verbose message if we get past this point
# disp in green "Country codes file downloaded successfully"
# }

library(dplyr)
library(openssl)
library(tools)

# Calculate the SHA256 hash of a file.
#
# Args:
#   filepath: path of the file to hash.
#
# Returns:
#   The digest as a single hexadecimal string, or NA_character_ when the file
#   cannot be opened or hashed (any error is caught and converted to NA).
calculate_sha256 <- function(filepath) {
  hash_file <- function() {
    # Open the connection ourselves, in binary mode, so that it is always
    # closed again. Passing an unopened file() connection to sha256() leaves
    # the connection object dangling when the open fails (e.g. missing file).
    con <- file(filepath, open = "rb")
    on.exit(close(con), add = TRUE)
    # Convert the raw digest to a single hexadecimal string
    as.character(sha256(con), sep = "")
  }
  # `filepath` is first evaluated inside hash_file(), i.e. inside tryCatch(),
  # so an error raised while computing the argument is also mapped to NA.
  tryCatch(
    hash_file(),
    error = function(e) {
      NA_character_
    }
  )
}

# Verify a file against an expected SHA256 checksum.
#
# Args:
#   filepath: path of the file to check.
#   expected_hash: expected digest as a hexadecimal string.
#
# Returns:
#   TRUE when the calculated digest equals `expected_hash` (compared
#   case-insensitively, since hexadecimal digests carry no case information);
#   FALSE otherwise, including when the file cannot be hashed or when
#   `expected_hash` is missing (NA).
verify_checksum <- function(filepath, expected_hash) {
  calculated_hash <- calculate_sha256(filepath)
  # isTRUE() collapses an NA comparison (unreadable file or missing expected
  # hash) to FALSE, so callers can safely sum() or negate the result.
  isTRUE(tolower(calculated_hash) == tolower(expected_hash))
}

# Flags ----

# reprocess: run the downstream processing step at the end of this script.
reprocess <- TRUE
# generate: recompute the checksums and overwrite the manifest before verifying.
# NOTE(review): while this is TRUE the manifest is rewritten immediately before
# it is verified, so the check compares files against hashes computed moments
# earlier. Set it to FALSE to verify against the stored hashes - confirm intent.
generate <- TRUE

# File paths ----
# Generalized - this could be in an externally sourced file treated as
# confidential.

data.path <- "data/"
data.path.external <- file.path(data.path, "external")
data.path.registry <- file.path(data.path, "registry")
data.path.metadata <- file.path(data.path, "metadata")

# Maps the `path` key stored in the manifest to the directory it stands for.
# Both the generate and the verify step resolve directories through this
# lookup, so they cannot disagree about where a file lives.
data_dirs <- c(
  external = data.path.external,
  registry = data.path.registry,
  metadata = data.path.metadata
)

manifest_file <- file.path(data.path.metadata, "consistency.csv")

# Build the full path of a manifest entry. An unknown `path` key yields an NA
# directory, which produces a non-existent file path and therefore an NA hash
# or a failed verification rather than an error.
manifest_filepath <- function(path, filename) {
  file.path(unname(data_dirs[as.character(path)]), filename)
}

# Generate checksums ----

if (generate) {
  metadata <- read.csv(manifest_file)
  # Add SHA256 hash column (calculate_sha256() handles one file at a time,
  # hence the row-wise evaluation).
  # NOTE(review): the manifest lists itself; its own hash is computed before
  # the file is rewritten below, so that row cannot verify afterwards.
  metadata <- metadata %>%
    rowwise() %>%
    mutate(
      sha256sum = calculate_sha256(manifest_filepath(path, filename)),
      asofdate = date()
    ) %>%
    ungroup() %>%
    select(filename, path, sha256sum, asofdate)
  # Write the updated metadata back to a CSV file
  write.csv(metadata, manifest_file, row.names = FALSE)
}

# Verify checksums ----
# Check the checksums before proceeding.

metadata <- read.csv(manifest_file)

# Fail early with a clear message when the manifest has not been generated in
# the expected layout yet.
missing_cols <- setdiff(c("filename", "path", "sha256sum"), names(metadata))
if (length(missing_cols) > 0) {
  stop(
    "Manifest ", manifest_file, " lacks column(s): ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

# Verify all checksums in the metadata. isTRUE() keeps the result strictly
# TRUE/FALSE even when a stored hash is missing (NA).
metadata <- metadata %>%
  rowwise() %>%
  mutate(
    checksum_verified = isTRUE(
      verify_checksum(manifest_filepath(path, filename), sha256sum)
    )
  ) %>%
  ungroup()

# Display results
message("Verification results for files")
print(metadata %>% select(filename, sha256sum, checksum_verified))

# Count of verified and failed checksums
summary <- metadata %>%
  summarise(
    total_files = n(),
    verified = sum(checksum_verified),
    failed = sum(!checksum_verified)
  )
print(summary)

# Make failures visible instead of silently continuing.
if (summary$failed > 0) {
  warning(
    summary$failed, " of ", summary$total_files,
    " file(s) failed checksum verification.",
    call. = FALSE
  )
}

if (reprocess) {
  # Do stuff here
}



16 changes: 8 additions & 8 deletions data/metadata/consistency.csv
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
filename,path,checksum,date
anes_timeseries_2020_stata_20220210_hill_roberts_subset.csv,external,,2024-09-11
China2021Data.csv,external,,2024-09-11
LucidPlotText.csv,external,,2024-09-11
LucidWideFile.csv,external,,2024-09-11
Pre2020Surveys_Pooled.csv,external,,2024-09-11
trials.Rds,registry,,2024-09-11
consistency.csv,metadata,,2024-09-11
"filename","path","sha256sum","asofdate"
"anes_timeseries_2020_stata_20220210_hill_roberts_subset.csv","external","75d050817b922b02b444b58c279f5d770f01376321d894c80b76364063248b63","Fri Sep 13 07:53:19 2024"
"China2021Data.csv","external","2ce75c757229ddb631cadccc9c4063537de313d3a754fed13f93054656261bba","Fri Sep 13 07:53:19 2024"
"LucidPlotText.csv","external","992daee8392ebeee510290a7b5895d53028c358b892a9d7c10a6391b5d979194","Fri Sep 13 07:53:19 2024"
"LucidWideFile.csv","external","767eb7b05fc1910603780680476cf32fee845728b4c7346e9f0ada6ed0c84f04","Fri Sep 13 07:53:19 2024"
"Pre2020Surveys_Pooled.csv","external","17b6cc33b8bdc3938aab5b59de0dc4fa2b724d1ad997060385466d0899dae82e","Fri Sep 13 07:53:19 2024"
"trials.Rds","registry","07e0a48dc2bb5896fd89540915f431eb6e1b36b374c848c1edfbabee8d5e6ad7","Fri Sep 13 07:53:19 2024"
"consistency.csv","metadata","2732f4a0fa6a580fc601266ed34e76d6b276a256a09e95232022b7b53f4a9025","Fri Sep 13 07:53:19 2024"

0 comments on commit 9d4ed54

Please sign in to comment.