-
Notifications
You must be signed in to change notification settings - Fork 0
/
checksums.R
94 lines (76 loc) · 2.55 KB
/
checksums.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Reference: the equivalent download-and-checksum check in the original Stata code
# if $redownload == 1 {
# copy "https://datahub.io/core/country-codes/r/country-codes.csv" "data/raw/country-codes.csv", replace
# // create checksum of file
# // Aug 2023 version: 2295658388
# global countrycksum 2295658388
# checksum "data/raw/country-codes.csv", save
# assert $countrycksum == r(checksum)
# // This will fail if the files are not identical
# // Provide a verbose message if we get past this point
# disp in green "Country codes file downloaded successfully"
# }
library(dplyr)
library(openssl)
library(tools)
#' Calculate the SHA256 hash of a file.
#'
#' @param filepath Path of the file to hash.
#' @return A single hexadecimal string, or `NA_character_` (with a warning)
#'   when the file cannot be opened or hashed.
calculate_sha256 <- function(filepath) {
  tryCatch(
    {
      # Open the connection ourselves, in binary mode, so that a failed open
      # does not leave an unopened connection object behind and so that the
      # connection is always closed, even if hashing fails part-way through.
      con <- file(filepath, open = "rb")
      hash <- tryCatch(sha256(con), finally = close(con))
      # Convert the raw digest to one hexadecimal string; unclass() makes sure
      # a plain character scalar is returned regardless of any S3 class.
      unclass(as.character(hash, sep = ""))
    },
    error = function(e) {
      # Keep the best-effort NA result, but say why the hash is missing.
      warning(
        "Could not compute SHA256 for '", filepath, "': ",
        conditionMessage(e),
        call. = FALSE
      )
      NA_character_
    }
  )
}
#' Verify a file against an expected SHA256 hash.
#'
#' @param filepath Path of the file to check.
#' @param expected_hash Expected SHA256 hash as a hexadecimal string.
#' @return `TRUE` when the file's hash matches `expected_hash`; `FALSE` when it
#'   differs, when the file cannot be hashed, or when `expected_hash` is `NA`.
verify_checksum <- function(filepath, expected_hash) {
  calculated_hash <- calculate_sha256(filepath)
  if (is.na(calculated_hash)) {
    return(FALSE)
  }
  # Hex digests are case-insensitive, and values read back from a CSV may
  # carry stray whitespace, so normalise both sides before comparing.
  matches <- tolower(calculated_hash) ==
    tolower(trimws(as.character(expected_hash)))
  # A missing expected hash compares as NA; report it as "not verified" so
  # callers always receive TRUE/FALSE.
  matches %in% TRUE
}
# Flags ----
# reprocess: run the processing step at the end of this script.
# generate:  recompute the SHA256 hashes and overwrite consistency.csv.
# NOTE(review): with generate = TRUE the reference hashes are refreshed right
# before they are verified, so verification can only fail for files that
# cannot be read. Set generate to FALSE to check files against previously
# recorded hashes.
reprocess <- TRUE
generate <- TRUE

# File paths ----
# Generalized - this could be in an externally sourced file treated as
# confidential.
data.path <- "data/"
data.path.external <- file.path(data.path, "external")
data.path.registry <- file.path(data.path, "registry")
data.path.metadata <- file.path(data.path, "metadata")

# Single definition of the metadata file used for reading and writing.
metadata_file <- file.path(data.path.metadata, "consistency.csv")

# Resolve a `path` entry from the metadata to the directory it names.
# Accepts either the short key ("external") or the full variable name
# ("data.path.external"), so the generate and verify steps below resolve
# directories identically.
resolve_data_dir <- function(key) {
  key <- as.character(key)
  var_name <- if (startsWith(key, "data.path.")) {
    key
  } else {
    paste0("data.path.", key)
  }
  get(var_name, mode = "character")
}

# Generate reference hashes ----
if (generate) {
  metadata <- read.csv(metadata_file)

  # Add SHA256 hash column and the date the hash was taken
  metadata <- metadata %>%
    rowwise() %>%
    mutate(
      sha256sum = calculate_sha256(
        file.path(resolve_data_dir(path), filename)
      ),
      asofdate = date()
    ) %>%
    ungroup() %>%
    select(filename, path, sha256sum, asofdate)

  # Write the updated metadata back to the CSV file
  write.csv(metadata, metadata_file, row.names = FALSE)
}

# Verify checksums ----
# Check every checksum in the metadata before proceeding.
metadata <- read.csv(metadata_file)

metadata <- metadata %>%
  rowwise() %>%
  mutate(
    checksum_verified = verify_checksum(
      file.path(resolve_data_dir(path), filename),
      sha256sum
    )
  ) %>%
  ungroup()

# Display results
message("Verification results for files")
print(metadata %>% select(filename, sha256sum, checksum_verified))

# Count of verified and failed checksums. Anything that is not TRUE
# (including NA) counts as failed, so the totals always add up.
summary <- metadata %>%
  summarise(
    total_files = n(),
    verified = sum(checksum_verified %in% TRUE),
    failed = total_files - verified
  )
print(summary)

# Like the `assert` in the Stata original, do not continue with files whose
# contents could not be verified.
if (summary$failed > 0) {
  failed_files <- metadata$filename[!(metadata$checksum_verified %in% TRUE)]
  stop(
    "Checksum verification failed for: ",
    paste(failed_files, collapse = ", "),
    call. = FALSE
  )
}

if (reprocess) {
  # Do stuff here
}