Skip to contents

When creating apps that do not use delayed data loading (DDL), some pre-processing is often required after the datasets are created and before the teal app is initialized. Similarly, for delayed data, additional pre-processing code can be attached to DDL objects; this code runs after the data is loaded, which may happen after the shiny app is launched or when the pull() method is called.

  • mutate_dataset: Individual datasets can be processed using the mutate_dataset function. To maintain reproducibility with mutate_dataset, each piece of pre-processing code should modify only one dataset at a time.
library(teal.data)
library(magrittr)

adsl_cf <- callable_function(teal.data::example_cdisc_data) %>%
  set_args(list(dataname = "ADSL"))
adsl <- cdisc_dataset_connector(
  dataname = "ADSL",
  pull_callable = adsl_cf,
  keys = get_cdisc_keys("ADSL")
) %>%
  mutate_dataset("ADSL$SEX <- as.factor(ADSL$SEX)")


adae_cf <- callable_function(teal.data::example_cdisc_data) %>%
  set_args(list(dataname = "ADAE"))
adae <- cdisc_dataset_connector(
  dataname = "ADAE",
  pull_callable = adae_cf,
  keys = get_cdisc_keys("ADAE")
) %>%
  mutate_dataset("ADAE$X <- rep(ADSL$SEX[1])", vars = list(ADSL = adsl))

adsl$pull() %>%
  get_raw_data() %>%
  head(n = 2)
##   STUDYID               USUBJID SUBJID SITEID AGE  AGEU SEX
## 1 AB12345  AB12345-CHN-3-id-128 id-128  CHN-3  32 YEARS   M
## 2 AB12345 AB12345-CHN-15-id-262 id-262 CHN-15  35 YEARS   M
##                        RACE                 ETHNIC COUNTRY DTHFL         INVID
## 1                     ASIAN     HISPANIC OR LATINO     CHN     Y  INV ID CHN-3
## 2 BLACK OR AFRICAN AMERICAN NOT HISPANIC OR LATINO     CHN     N INV ID CHN-15
##           INVNAM            ARM ARMCD         ACTARM ACTARMCD         TRT01P
## 1  Dr. CHN-3 Doe      A: Drug X ARM A      A: Drug X    ARM A      A: Drug X
## 2 Dr. CHN-15 Doe C: Combination ARM C C: Combination    ARM C C: Combination
##           TRT01A     TRT02P         TRT02A REGION1 STRATA1 STRATA2    BMRKR1
## 1      A: Drug X B: Placebo      A: Drug X    Asia       C      S2 14.424934
## 2 C: Combination B: Placebo C: Combination    Asia       C      S1  4.055463
##   BMRKR2 ITTFL SAFFL BMEASIFL BEP01FL AEWITHFL     RANDDT             TRTSDTM
## 1 MEDIUM     Y     Y        Y       Y        N 2019-02-22 2019-02-24 11:09:25
## 2    LOW     Y     Y        N       N        Y 2019-02-26 2019-02-26 09:05:10
##               TRTEDTM           TRT01SDTM           TRT01EDTM
## 1 2022-02-12 04:28:08 2019-02-24 11:09:25 2021-02-11 22:28:08
## 2 2022-02-26 03:05:10 2019-02-26 09:05:10 2021-02-25 21:05:10
##             TRT02SDTM           TRT02EDTM            AP01SDTM
## 1 2021-02-11 22:28:08 2022-02-12 04:28:08 2019-02-24 11:09:25
## 2 2021-02-25 21:05:10 2022-02-26 03:05:10 2019-02-26 09:05:10
##              AP01EDTM            AP02SDTM            AP02EDTM       EOSSTT
## 1 2021-02-11 22:28:08 2021-02-11 22:28:08 2022-02-12 04:28:08 DISCONTINUED
## 2 2021-02-25 21:05:10 2021-02-25 21:05:10 2022-02-26 03:05:10    COMPLETED
##         EOTSTT      EOSDT EOSDY DCSREAS      DTHDT       DTHCAUS        DTHCAT
## 1 DISCONTINUED 2022-02-12  1084   DEATH 2022-03-06 ADVERSE EVENT ADVERSE EVENT
## 2    COMPLETED 2022-02-26  1096    <NA>       <NA>          <NA>          <NA>
##   LDDTHELD LDDTHGR1   LSTALVDT DTHADY ADTHAUT
## 1       22     <=30 2022-03-06   1105     Yes
## 2       NA     <NA> 2022-03-17     NA    <NA>
adae$pull() %>%
  get_raw_data() %>%
  head(n = 2)
##   STUDYID              USUBJID SUBJID SITEID AGE  AGEU SEX  RACE
## 1 AB12345 AB12345-BRA-1-id-134 id-134  BRA-1  47 YEARS   M WHITE
## 2 AB12345 AB12345-BRA-1-id-134 id-134  BRA-1  47 YEARS   M WHITE
##                   ETHNIC COUNTRY DTHFL        INVID        INVNAM       ARM
## 1 NOT HISPANIC OR LATINO     BRA     Y INV ID BRA-1 Dr. BRA-1 Doe A: Drug X
## 2 NOT HISPANIC OR LATINO     BRA     Y INV ID BRA-1 Dr. BRA-1 Doe A: Drug X
##   ARMCD    ACTARM ACTARMCD    TRT01P    TRT01A     TRT02P    TRT02A
## 1 ARM A A: Drug X    ARM A A: Drug X A: Drug X B: Placebo A: Drug X
## 2 ARM A A: Drug X    ARM A A: Drug X A: Drug X B: Placebo A: Drug X
##         REGION1 STRATA1 STRATA2   BMRKR1 BMRKR2 ITTFL SAFFL BMEASIFL BEP01FL
## 1 South America       B      S2 6.462991    LOW     Y     Y        Y       N
## 2 South America       B      S2 6.462991    LOW     Y     Y        Y       N
##   AEWITHFL     RANDDT             TRTSDTM             TRTEDTM
## 1        N 2020-11-03 2020-11-04 04:08:58 2022-02-20 03:33:55
## 2        N 2020-11-03 2020-11-04 04:08:58 2022-02-20 03:33:55
##             TRT01SDTM           TRT01EDTM           TRT02SDTM
## 1 2020-11-04 04:08:58 2021-02-19 21:33:55 2021-02-19 21:33:55
## 2 2020-11-04 04:08:58 2021-02-19 21:33:55 2021-02-19 21:33:55
##             TRT02EDTM            AP01SDTM            AP01EDTM
## 1 2022-02-20 03:33:55 2020-11-04 04:08:58 2021-02-19 21:33:55
## 2 2022-02-20 03:33:55 2020-11-04 04:08:58 2021-02-19 21:33:55
##              AP02SDTM            AP02EDTM       EOSSTT       EOTSTT      EOSDT
## 1 2021-02-19 21:33:55 2022-02-20 03:33:55 DISCONTINUED DISCONTINUED 2022-02-20
## 2 2021-02-19 21:33:55 2022-02-20 03:33:55 DISCONTINUED DISCONTINUED 2022-02-20
##   EOSDY DCSREAS      DTHDT       DTHCAUS        DTHCAT LDDTHELD LDDTHGR1
## 1   473   DEATH 2022-03-16 ADVERSE EVENT ADVERSE EVENT       24     <=30
## 2   473   DEATH 2022-03-16 ADVERSE EVENT ADVERSE EVENT       24     <=30
##     LSTALVDT DTHADY ADTHAUT ASEQ AESEQ        AETERM         AELLT
## 1 2022-03-16    496     Yes    1     1 trm B.2.1.2.1 llt B.2.1.2.1
## 2 2022-03-16    496     Yes    2     2 trm D.1.1.4.2 llt D.1.1.4.2
##         AEDECOD       AEHLT     AEHLGT AEBODSYS AESOC    AESEV AESER
## 1 dcd B.2.1.2.1 hlt B.2.1.2 hlgt B.2.1   cl B.2  cl B MODERATE     N
## 2 dcd D.1.1.4.2 hlt D.1.1.4 hlgt D.1.1   cl D.1  cl D MODERATE     N
##              AEACN AEREL                AEOUT AESDTH AESCONG AESDISAB AESHOSP
## 1 DOSE NOT CHANGED     N RECOVERING/RESOLVING      N       N        Y       N
## 2 DOSE NOT CHANGED     N RECOVERING/RESOLVING      N       N        Y       N
##   AESLIFE AESMIE TRTEMFL AECONTRT              ASTDTM              AENDTM ASTDY
## 1       N      N       Y        Y 2021-04-15 04:08:58 2021-10-04 04:08:58   162
## 2       N      N       Y        N 2021-05-19 04:08:58 2021-10-31 04:08:58   196
##   AENDY            LDOSEDTM    LDRELTM AETOXGR SMQ01NAM SMQ02NAM SMQ01SC
## 1   334 2020-11-07 09:05:04 228663.896       3     <NA>     <NA>    <NA>
## 2   361 2021-05-17 07:21:09   2687.814       3     <NA>     <NA>    <NA>
##   SMQ02SC CQ01NAM ANL01FL           AERELNST          AEACNOTH X
## 1    <NA>    <NA>       Y               NONE PROCEDURE/SURGERY M
## 2    <NA>    <NA>       Y CONCURRENT ILLNESS        MEDICATION M
  • mutate_data: Collections of datasets should only be processed using the mutate_data function:
cdisc_data(adsl, adae, check = TRUE) %>%
  mutate_data("ADAE$x <- ADSL$SUBJID[1]")

The code is processed in the order in which the datasets are pulled. If there are dependencies between datasets, the order in which pre-processing code is added to the CDISCTealData object therefore matters, just as the order of the arguments passed to the cdisc_data function matters when creating the CDISCTealData object.

Finally, the code argument of the teal_data and cdisc_data calls does not need to be used with DDL, because data loaded with DDL are reproducible by design. For this reason, it is recommended to set the argument check = TRUE in the cdisc_data function when creating apps with DDL.

Processing dependencies

It may be required to generate a delayed data object that is dependent on some other delayed object or some constant value.

For this, when creating your delayed data object, it’s possible to supply the additional variables that need to be accessed during data loading (pull) as additional arguments passed through ...:

get_code(adsl)
## [1] "ADSL <- teal.data::example_cdisc_data(dataname = \"ADSL\")\nADSL$SEX <- as.factor(ADSL$SEX)"
pull_fun_adae <- callable_function(teal.data::example_cdisc_data) %>%
  set_args(list(dataname = "ADAE"))
adae <- dataset_connector(
  dataname = "ADAE",
  pull_callable = pull_fun_adae,
  keys = get_cdisc_keys("ADAE")
)

get_code(adae)
## [1] "ADAE <- teal.data::example_cdisc_data(dataname = \"ADAE\")"

It’s also possible to supply these additional variables after creating your object using the mutate_dataset function.

last_run <- Sys.Date() # constant value stored as a variable in the current session

adsl_cf <- callable_function(teal.data::example_cdisc_data) %>%
  set_args(list(dataname = "ADSL"))
adsl <- cdisc_dataset_connector(
  dataname = "ADSL",
  pull_callable = adsl_cf,
  keys = get_cdisc_keys("ADSL")
) %>%
  mutate_dataset("ADSL$last_run <- last_run", vars = list(last_run = last_run))

cat(get_code(adsl))
## ADSL <- teal.data::example_cdisc_data(dataname = "ADSL")
## last_run <- structure(19583, class = "Date")
## ADSL$last_run <- last_run
# compared to evaluating the variable at the time of loading
adsl_cf <- callable_function(teal.data::example_cdisc_data) %>%
  set_args(list(dataname = "ADSL"))
adsl <- cdisc_dataset_connector(
  dataname = "ADSL",
  pull_callable = adsl_cf,
  keys = get_cdisc_keys("ADSL")
) %>%
  mutate_dataset("last_run <- Sys.Date()\nADSL$last_run <- last_run")

adsl %>%
  get_code() %>%
  cat()
## ADSL <- teal.data::example_cdisc_data(dataname = "ADSL")
## last_run <- Sys.Date()
## ADSL$last_run <- last_run

This is also required when the object being created depends on another delayed data object:

adsl <- teal.data::example_cdisc_data("ADSL")
adae_cf <- callable_function(teal.data::example_cdisc_data) %>%
  set_args(list(dataname = "ADAE"))
adae <- cdisc_dataset_connector(
  dataname = "ADAE",
  pull_callable = adae_cf,
  keys = get_cdisc_keys("ADAE")
) %>%
  mutate_dataset("ADAE$n <- nrow(ADSL)")

cat(get_code(adae)) # the code returned by `adae` is not sufficient to reproduce `adae`
## ADAE <- teal.data::example_cdisc_data(dataname = "ADAE")
## ADAE$n <- nrow(ADSL)
adsl_cf <- callable_function(teal.data::example_cdisc_data) %>%
  set_args(list(dataname = "ADSL"))
adsl <- cdisc_dataset_connector(
  dataname = "ADSL",
  pull_callable = adsl_cf,
  keys = get_cdisc_keys("ADSL")
)
adae_cf <- callable_function(teal.data::example_cdisc_data) %>%
  set_args(list(dataname = "ADAE"))
adae <- cdisc_dataset_connector(
  dataname = "ADAE",
  pull_callable = adae_cf,
  keys = get_cdisc_keys("ADAE")
) %>%
  mutate_dataset("ADAE$n <- nrow(ADSL)", vars = list(ADSL = adsl))

cat(get_code(adae)) # this code can be run independently
## ADAE <- teal.data::example_cdisc_data(dataname = "ADAE")
## ADSL <- teal.data::example_cdisc_data(dataname = "ADSL")
## ADAE$n <- nrow(ADSL)

Related to this idea, it is possible to provide the code at the Data level. However, retrieving the code will then always return all the code used to generate all the datasets in the object, regardless of which dataset is requested:

adsl_adae <- cdisc_data(
  adsl,
  adae
) %>% mutate_data("ADAE$avg_age <- mean(ADAE$AGE)")

# the output of all 3 calls is the same
adsl_adae %>%
  get_code() %>%
  cat()
## ADSL <- teal.data::example_cdisc_data(dataname = "ADSL")
## ADAE <- teal.data::example_cdisc_data(dataname = "ADAE")
## ADAE$n <- nrow(ADSL)
## ADAE$avg_age <- mean(ADAE$AGE)
adsl_adae %>%
  get_code(dataname = "ADAE") %>%
  cat()
## ADSL <- teal.data::example_cdisc_data(dataname = "ADSL")
## ADAE <- teal.data::example_cdisc_data(dataname = "ADAE")
## ADAE$n <- nrow(ADSL)
## ADAE$avg_age <- mean(ADAE$AGE)
adsl_adae %>%
  get_code(dataname = "ADSL") %>%
  cat()
## ADSL <- teal.data::example_cdisc_data(dataname = "ADSL")
## ADAE <- teal.data::example_cdisc_data(dataname = "ADAE")
## ADAE$n <- nrow(ADSL)
## ADAE$avg_age <- mean(ADAE$AGE)

The better approach is to supply the code at the Dataset level. This ensures that the code retrieved for a given dataset contains only the snippets that pertain to that dataset (including any datasets it depends on):

adsl_adae <- cdisc_data(
  adsl,
  adae %>% mutate_dataset("ADAE$avg_age <- mean(ADAE$AGE)")
)

adsl_adae %>%
  get_code() %>%
  cat()
## ADSL <- teal.data::example_cdisc_data(dataname = "ADSL")
## ADAE <- teal.data::example_cdisc_data(dataname = "ADAE")
## ADAE$n <- nrow(ADSL)
## ADAE$avg_age <- mean(ADAE$AGE)
adsl_adae %>%
  get_code("ADAE") %>%
  cat()
## ADSL <- teal.data::example_cdisc_data(dataname = "ADSL")
## ADAE <- teal.data::example_cdisc_data(dataname = "ADAE")
## ADAE$n <- nrow(ADSL)
## ADAE$avg_age <- mean(ADAE$AGE)
adsl_adae %>%
  get_code("ADSL") %>%
  cat()
## ADSL <- teal.data::example_cdisc_data(dataname = "ADSL")

Related to this idea, a dataset object created from already-loaded data (as opposed to a delayed data connector) needs to be supplied with the code needed to reproduce the data. This can be provided at the Dataset level or the Data level.

Below is a comparison of these two approaches:

adsl <- teal.data::example_cdisc_data("ADSL")
cdisc_dataset("ADSL", adsl) %>% get_code() # no reproducible code
## [1] ""
# provide the code to reproduce the data:
cdisc_dataset("ADSL", adsl,
  code = "ADSL <- teal.data::example_cdisc_data(\"ADSL\")"
) %>%
  get_code()
## [1] "ADSL <- teal.data::example_cdisc_data(\"ADSL\")"
# it's possible to supply the code at the `Data` level:
adae <- teal.data::example_cdisc_data("ADAE")
adsl_adae <- cdisc_data(
  cdisc_dataset("ADSL", adsl),
  cdisc_dataset("ADAE", adae),
  code = "ADSL <- teal.data::example_cdisc_data(\"ADSL\")\nADAE <- teal.data::example_cdisc_data(\"ADAE\")"
)

adsl_adae %>%
  get_code() %>%
  cat()
## ADSL <- teal.data::example_cdisc_data("ADSL")
## ADAE <- teal.data::example_cdisc_data("ADAE")
# but it's not possible then to access the code at a `Dataset` level:
adsl_adae %>%
  get_code("ADSL") %>%
  cat()
## ADSL <- teal.data::example_cdisc_data("ADSL")
## ADAE <- teal.data::example_cdisc_data("ADAE")
# this can be avoided by storing the code like so:
adsl_adae <- cdisc_data(
  cdisc_dataset("ADSL", adsl, code = "ADSL <- teal.data::example_cdisc_data(\"ADSL\")"),
  cdisc_dataset("ADAE", adae, code = "ADAE <- teal.data::example_cdisc_data(\"ADAE\")")
)

adsl_adae %>%
  get_code("ADSL") %>%
  cat()
## ADSL <- teal.data::example_cdisc_data("ADSL")