Rebuilding Cached Random CDISC Data
rebuild_cached_data.Rmd
Getting Started
The following script is used to create, compare and save cached data
to the data/
directory.
Starting in R 3.6.0
the default kind of under-the-hood
random-number generator was changed. Now, in order to get the results
from set.seed()
to match, you have to first call the
function RNGkind(sample.kind = "Rounding")
.
It throws the expected warning:
Warning: non-uniform 'Rounding' sampler used
Code Maintenance
Currently, when a random.cdisc.data
data-generating
function is created or modified, then the below code chunk must be run
to build the new/updated cached dataset and add it to the
data/
directory. If a dataset that is a dependency for
another dataset has been updated then the dependent dataset will also be
updated. To manually specify which datasets should be updated, edit the
data_to_update
vector below, entering the desired dataset
names.
Update Cached Data
Note: Prior to running the following code chunk,
please ensure that you have reinstalled the
random.cdisc.data
package after completing all dataset
modifications.
# Helper functions
#
flatten_list_of_deps <- function(updated_data, data_deps) {
# Get higher deps fnc
get_higher_deps <- function(cur_dep, data_deps) {
sapply(seq_along(data_deps), function(x) {
if (any(cur_dep %in% data_deps[[x]])) {
names(data_deps)[x]
}
})
}
# Get lower deps fnc
get_lower_deps <- function(cur_dep, data_deps) {
data_deps[sapply(cur_dep, function(x) which(x == names(data_deps)))]
}
# Sort data_deps
sort_data_deps <- function(upd_data, data_deps) {
iup <- upd_data
for (ud in upd_data) {
up <- unlist(get_lower_deps(ud, data_deps))
if (any(up %in% upd_data)) {
iup <- unique(unlist(c(up[up %in% upd_data], iup)))
}
}
iup
}
# Firstly, lets sort by dependencies the initial updated data
fin_up <- sort_data_deps(updated_data, data_deps)
# Extracting higher dependencies for each value
cnt <- 1
while (cnt <= length(fin_up)) {
cur_deps <- unlist(
get_higher_deps(fin_up[cnt], data_deps)
)
if (!is.null(cur_deps)) {
cur_deps <- sort_data_deps(cur_deps, data_deps)
fin_up <- unique(c(fin_up[seq_len(cnt)], cur_deps, fin_up[-seq_len(cnt)]))
}
cnt <- cnt + 1
}
fin_up
}
library(random.cdisc.data)
library(diffdf)
library(dplyr)
# Call function to match random number generation from previous R versions
RNGkind(sample.kind = "Rounding")
# Datasets must be listed after all of their dependencies
# e.g. adsl is a dependency for all other datasets so it is listed first.
pkg_dir <- dirname(getwd())
# Listing source files and extraction of datasets' names
src_files <- list.files(paste0(pkg_dir, "/R"))
data_nms <- src_files[grepl("^ra*", src_files)] %>%
stringr::str_remove(pattern = "^r") %>%
stringr::str_remove(pattern = ".R$") %>%
sort()
# Exception handling
data_nms <- data_nms[data_nms != "adsaftte"] # Unbuilt for now
# Construction of dependency tree based on formals
data_deps <- sapply(
data_nms,
function(x) {
dat_args <- names(formals(paste0("r", x)))
dat_args[dat_args %in% data_nms]
}
)
git_call <- "git diff origin/main --name-only"
updated_files <- tryCatch(
system(git_call, intern = TRUE),
error = function(e) e
)
status_uf <- attr(updated_files, "status")
if (is(updated_files, "error") || (!is.null(status_uf) && status_uf == 1)) {
message("Found following error in git call: ", git_call)
message(e)
message(
"The calculation continues as default by recreating all datasets ",
"and updating the cached data if any change is found."
)
updated_data <- data_nms
} else {
updated_data <- updated_files[grepl("^R\\/", updated_files)] %>%
stringr::str_remove("^R\\/") %>%
stringr::str_remove(pattern = "^r") %>%
stringr::str_remove(pattern = ".R$")
}
if (length(updated_data) != 0) {
stopifnot(all(updated_data %in% names(data_deps)))
data_to_update <- flatten_list_of_deps(updated_data, data_deps)
default_args <- list(seed = 1, na_vars = list(), who_coding = TRUE, percent = 80, number = 2)
# Generate and save updated cached datasets
for (dat in data_to_update) {
# Match arguments with defaults
dat_args <- default_args[names(default_args) %in% names(formals(paste0("r", dat)))]
# Get the data deps cache that is already there (if adsl returns list())
dat_deps <- lapply(data_deps[[dat]], function(x) get(paste0("c", x)))
# Main call to creation function
cdataset <- do.call(paste0("r", dat), c(dat_args, dat_deps))
# Preview differences
cat("\nSaving cached data for dataset", paste0("*", dat, "*"), "with the following changes found (diffdf):\n")
diff_test <- diffdf(get(paste0("c", dat)), cdataset)
print(diff_test)
# Check if there is any actual change to the data
if (length(diff_test) > 0) { # If no difference -> list()
# Save new cached dataset
assign(paste0("c", dat), cdataset)
fl_save <- paste0(dirname(getwd()), "/data/c", dat, ".RData")
attr(cdataset, "creation date") <- lubridate::date() # This should NOT be updated if no changes in diffdf
save(list = paste0("c", dat), file = fl_save, compress = "xz")
cat("Cached dataset updated for", paste0("*", dat, "*"), "in", paste0("data/", basename(fl_save), "."), "\n")
} else {
message("No update detected on the final data. No cached data was updated for *", dat, "*.")
}
}
} else {
message("No source files changed: no cached datasets currently require updates.")
}