head(simdat)
22 hdPS with a time-to-event outcome
22.1 Step 0: Analytic data
To demonstrate the use of the hdPS analysis with a time-to-event outcome, we will use a simulated dataset. The example is to explore the relationship between arthritis (binary exposure) and CVD (time-to-event outcome).
The simulated dataset contains information on 3,000 individuals with the following variables:
22.2 Step 1: Proxy sources
22.2.1 Data with investigator-specified covariates
Let us check the summary statistics of the investigator-specified covariates, stratified by the exposure variable (arthritis).
# Table 1
<- CreateTableOne(vars = c("age", "sex", "comorbidity"),
tab1 strata = "arthritis",
data = simdat,
test = FALSE)
print(tab1, showAllLevels = TRUE, noSpaces = TRUE, quote = FALSE, smd = TRUE)
#> Stratified by arthritis
#> level No Yes SMD
#> n 2143 857
#> age (mean (SD)) 48.88 (9.53) 53.65 (10.03) 0.487
#> sex (%) Female 1189 (55.5) 592 (69.1) 0.283
#> Male 954 (44.5) 265 (30.9)
#> comorbidity (%) No 1629 (76.0) 596 (69.5) 0.146
#> Yes 514 (24.0) 261 (30.5)
# Bivariate table
round(prop.table(table(arthritis = simdat$arthritis, CVD = simdat$cvd),
margin = 1)*100, 2)
#> CVD
#> arthritis 0 1
#> No 70.70 29.30
#> Yes 43.52 56.48
22.2.2 Proxy data
In this example, we will use four data dimensions:
- 3-digit diagnostic codes from hospital database (diag)
- 3-digit procedure codes from hospital database (proc)
- 3-digit icd codes from physician claim database (msp)
- DINPIN from drug dispensation database (din)
table(dat.proxy$dim)
#>
#> diag din msp proc
#> 20000 269 44179 321
<- dat.proxy[order(dat.proxy$studyid),]
dat.proxy 5001:5010,] dat.proxy[
22.3 Step 2: Empirical covariates
The same as before, top 200 covariates with highest prevalence are chosen.
library(autoCovariateSelection)
<- simdat$studyid
id
<- get_candidate_covariates(df = dat.proxy, domainVarname = "dim",
step1 eventCodeVarname = "code",
patientIdVarname = "studyid",
patientIdVector = id,
n = 200,
min_num_patients = 20)
<- step1$covars_data
out1 head(out1)
22.4 Step 3: Recurrence
In this step, we generate a maximum of 3 binary recurrence covariates for each of the candidate proxy/code. We observed 401 recurrence covariates in this analysis.
22.4.1 Assessing recurrence of codes
all.equal(id, step1$patientIds)
#> [1] TRUE
<- get_recurrence_covariates(df = out1,
step2 eventCodeVarname = "code",
patientIdVarname = "studyid",
patientIdVector = id)
<- step2$recurrence_data
out2 dim(out2)
#> [1] 3000 402
22.4.2 Recurrence covariates
<- names(out2)[-1]
vars.empirical head(vars.empirical)
#> [1] "rec_diag_C44_once" "rec_diag_C81_once" "rec_diag_C83_once"
#> [4] "rec_diag_E08_once" "rec_diag_E09_once" "rec_diag_E10_once"
22.4.3 Merging all recurrence covariates with the analytic dataset
<- merge(simdat, out2, by = "studyid", all.x = T)
hdps.data dim(hdps.data)
#> [1] 3000 408
22.5 Step 4: Prioritize
The Bross formula requires the exposure, outcome, and proxy covariates to be binary. With the time-to-event outcome, we will use Cox-PH with LASSO regularization to prioritize the empirical covariates. The hyperparameter (\(\lambda\)) will be selected using 5-fold cross-validation.
22.5.1 Hyperparameter tuning
# Formula with only empirical covariates
<- as.formula(paste("Surv(follow_up, cvd) ~ ",
formula.out paste(vars.empirical, collapse = " + ")))
# Model matrix for fitting Cox with LASSO regularization
<- model.matrix(formula.out, data = hdps.data)[,-1]
X <- as.matrix(data.frame(time = hdps.data$follow_up, status = hdps.data$cvd))
Y
# Detect the number of cores
<- parallel::detectCores()
n_cores
# Create a cluster of cores
<- makeCluster(n_cores - 1)
cl
# Register the cluster for parallel processing
registerDoParallel(cl)
# Hyperparameter tuning with 5-fold cross-validation
set.seed(123)
<- cv.glmnet(x = X, y = Y, nfolds = 5, parallel = T, alpha = 1,
fit.lasso family = "cox")
stopCluster(cl)
plot(fit.lasso)
## Best lambda
$lambda.min
fit.lasso#> [1] 0.03093478
22.5.2 Variable ranking based on Cox-LASSO
<- coef(fit.lasso, s = fit.lasso$lambda.min)
empvars.lasso <- data.frame(as.matrix(empvars.lasso))
empvars.lasso <- data.frame(vars = rownames(empvars.lasso),
empvars.lasso coef = empvars.lasso)
colnames(empvars.lasso) <- c("vars", "coef")
rownames(empvars.lasso) <- NULL
# Number of non-zero coefficients
table(empvars.lasso$coef != 0)
#>
#> FALSE
#> 401
Since proxies were random and unrelated to the simulated data, LASSO produced all zero coefficients. Let choose an arbitrary value as to demonstrate the process of variable selection.
<- coef(fit.lasso, s = exp(-6))
empvars.lasso <- data.frame(as.matrix(empvars.lasso))
empvars.lasso <- data.frame(vars = rownames(empvars.lasso),
empvars.lasso coef = empvars.lasso)
colnames(empvars.lasso) <- c("vars", "coef")
rownames(empvars.lasso) <- NULL
head(empvars.lasso)
# Number of non-zero coefficients
table(empvars.lasso$coef != 0)
#>
#> FALSE TRUE
#> 71 330
22.5.3 Rank empirical covariates
Now we will rank the empirical covariates based on absolute value of log hazard ratio.
$coef.abs <- abs(empvars.lasso$coef)
empvars.lasso<- empvars.lasso[order(empvars.lasso$coef.abs, decreasing = T),]
empvars.lasso head(empvars.lasso)
22.6 Step 5: Covariates
We used all investigator-specified covariates and the top 200 empirical covariates for the PS model. Again, this is a simplistic scenario where we only consider the main effects of the covariates.
# Investigator-specified covariates
<- c("age", "sex", "comorbidity")
investigator.vars
# Top 200 empirical covariates section based on Cox-LASSO
<- empvars.lasso$vars[1:200]
empirical.vars.lasso
# Investigator-specified and empirical covariates
<- c(investigator.vars, empirical.vars.lasso)
vars.hsps head(vars.hsps)
#> [1] "age" "sex" "comorbidity"
#> [4] "rec_diag_V03_once" "rec_diag_S24_once" "rec_diag_W61_once"
22.7 Step 6: Propensity score
22.7.1 Create propensity score formula
<- as.formula(paste0("I(arthritis == 'Yes') ~ ",
ps.formula paste(vars.hsps, collapse = "+")))
22.7.2 Fit PS model
require(WeightIt)
<- weightit(ps.formula,
W.out data = hdps.data,
estimand = "ATE",
method = "ps",
stabilize = T)
22.7.3 Obtain PS
$ps <- W.out$ps hdps.data
22.7.4 Obtain weights
$w <- W.out$weights hdps.data
22.7.5 Assessing balance
22.8 Step 7: Association
22.8.1 Obtain HR
library(survival)
library(Publish)
<- coxph(Surv(follow_up, cvd) ~ arthritis,
fit.hdps weights = w,
data = hdps.data)
publish(fit.hdps, pvalue.method = "robust", confint.method = "robust",
print = F)$regressionTable[1:2,]
22.8.2 Obtain HR with survey
package
library(survey)
# Create a design
<- svydesign(id = ~1, weights = ~w, data = hdps.data)
svy.design
# Model
<- svycoxph(Surv(follow_up, cvd) ~ arthritis,
fit.hdps1 design = svy.design)
publish(fit.hdps1, print = F)$regressionTable[1:2,]
#> Independent Sampling design (with replacement)
#> svydesign(id = ~1, weights = ~w, data = hdps.data)