Mediation Example

# Load required packages
knitr::opts_chunk$set(echo = TRUE)
require(survey)
require(DiagrammeR)
require(DiagrammeRsvg)
require(rsvg)
library(magrittr)
library(svglite)
library(png)
require(Publish)

We want to decompose of the “total effect” of a given exposure OA (\(A\)) on the outcome CVD (\(Y\)) into

a natural direct effect (NDE; \(A \rightarrow Y\)) and
a natural indirect effect (NIE) through a mediator pain medication (\(M\)) through (\(A \rightarrow M \rightarrow Y\)).

Step 0: Build data first

load("Data/mediation/cchs123pain.RData")
source("Data/mediation/medFunc.R")
ls()
#> [1] "analytic.cc"        "analytic.miss"      "doEffectDecomp"    
#> [4] "doEffectDecomp.int"

varlist <- c("age", "sex", "income", "race", "bmi", "edu", "phyact", "smoke", "fruit", "diab")
analytic.miss$mediator <- ifelse(analytic.miss$painmed == "Yes", 1, 0)
analytic.miss$exposure <- ifelse(analytic.miss$OA == "OA", 1, 0)
analytic.miss$outcome <- ifelse(analytic.miss$CVD == "event", 1, 0)

Pre-run step 3 model

We will utilize this fit in step 3

# A = actual exposure (without any change)
analytic.miss$exposureTemp <- analytic.miss$exposure

# Design
w.design0 <- svydesign(id=~1, weights=~weight, data=analytic.miss)
w.design <- subset(w.design0, miss == 0)

# Replace exposure with exposureTemp. This will be necessary in step 3
fit.m <- svyglm(mediator ~ exposureTemp + 
                 age + sex + income + race + bmi + edu + phyact + smoke + fruit + diab,
                design = w.design, family = binomial("logit"))
#> Warning in eval(family$initialize): non-integer #successes in a binomial glm!
publish(fit.m)
#>      Variable             Units OddsRatio       CI.95     p-value 
#>  exposureTemp                        2.43 [2.06;2.86]     < 1e-04 
#>           age       20-29 years       Ref                         
#>                     30-39 years      1.00 [0.88;1.13]   0.9442989 
#>                     40-49 years      0.93 [0.82;1.06]   0.2651302 
#>                     50-59 years      0.66 [0.58;0.76]     < 1e-04 
#>                     60-64 years      0.61 [0.51;0.72]     < 1e-04 
#>               65 years and over      0.61 [0.52;0.71]     < 1e-04 
#>           sex            Female       Ref                         
#>                            Male      0.50 [0.46;0.55]     < 1e-04 
#>        income   $29,999 or less       Ref                         
#>                 $30,000-$49,999      1.20 [1.06;1.35]   0.0043533 
#>                 $50,000-$79,999      1.21 [1.08;1.37]   0.0014914 
#>                 $80,000 or more      1.28 [1.14;1.45]     < 1e-04 
#>          race         Non-white       Ref                         
#>                           White      1.81 [1.62;2.02]     < 1e-04 
#>           bmi       Underweight       Ref                         
#>                  healthy weight      1.09 [0.82;1.44]   0.5631582 
#>                      Overweight      1.33 [1.01;1.77]   0.0449616 
#>           edu          < 2ndary       Ref                         
#>                       2nd grad.      1.13 [0.98;1.30]   0.1014986 
#>                 Other 2nd grad.      1.30 [1.08;1.55]   0.0050596 
#>                  Post-2nd grad.      1.25 [1.10;1.42]   0.0008252 
#>        phyact            Active       Ref                         
#>                        Inactive      1.12 [1.02;1.23]   0.0184447 
#>                        Moderate      1.12 [1.01;1.25]   0.0364592 
#>         smoke      Never smoker       Ref                         
#>                  Current smoker      1.29 [1.16;1.44]     < 1e-04 
#>                   Former smoker      1.28 [1.17;1.40]     < 1e-04 
#>         fruit 0-3 daily serving       Ref                         
#>               4-6 daily serving      0.92 [0.83;1.02]   0.0976967 
#>                6+ daily serving      0.80 [0.71;0.90]   0.0001979 
#>          diab                No       Ref                         
#>                             Yes      1.23 [0.99;1.52]   0.0626501

Step 1 and 2: Replicate data with different exposures

We manipulate and duplicate data here

dim(analytic.miss)
#> [1] 397173     28
dim(analytic.cc)
#> [1] 28734    23
nrow(analytic.miss) - nrow(analytic.cc)
#> [1] 368439

# Create counterfactual data. This will be necessary in step 3
d1 <- d2 <- analytic.miss

# Exposed = Exposed
d1$exposure.counterfactual <- d1$exposure

# Exposed = Not exposed
d2$exposure.counterfactual <- !d2$exposure 

# duplicated data (double the amount)
newd <- rbind(d1, d2)
newd <- newd[order(newd$ID), ]
dim(newd)
#> [1] 794346     29

Step 3: Compute weights for the mediation

Weight is computed by

\(W^{M|C} = \frac{P(M|A^*, C)}{P(M|A, C)}\)

in all data newd (fact d1 + alternative fact d2).

\(P(M|A, C)\) is computed from a logistic regression of \(M\) on \(A\) + \(C\).
- \(logit [P(M=1 | C = c]) = \beta_0 + \beta_1 a + \beta_3 c\)
\(P(M|A^{*}, C)\) is computed from a logistic regression of \(M\) on \(A^*\) + \(C\).
- \(logit [P(M=1 | C = c]) = \beta_0 + \beta'_1 a^* + \beta'_3 c\)

# First, use original exposure (all A + all A):
# A = actual exposure (without any change)
# A = exposure
newd$exposureTemp <- newd$exposure

# Probability of M given A + C
w <- predict(fit.m, newdata=newd, type='response') 
direct <- ifelse(newd$mediator, w, 1-w)

# Second, use counterfactual exposures (all A + all !A):
# A* = Opposite (counterfactual) values of the exposure
# A* = exposure.counterfactual
newd$exposureTemp <- newd$exposure.counterfactual

# Probability of M given A* + C
w <- predict(fit.m, newdata=newd, type='response') 
indirect <- ifelse(newd$mediator, w, 1-w)

# Mediator weights
newd$W.mediator <- indirect/direct
summary(newd$W.mediator)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
#>     0.4     1.0     1.0     1.0     1.2     2.2  670758
hist(newd$W.mediator)

Incorporating the survey weights:

Note: scaling can often be helpful if there exists extreme weights.

# scale survey weights
#newd$S.w <- with(newd,(weight)/mean(weight))
newd$S.w <- with(newd,weight)
newd$S.w[is.na(newd$S.w)]
#> numeric(0)
summary(newd$S.w)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>    0.39   21.76   42.21   66.70   81.07 2384.98

# Multiply mediator weights with scaled survey weights
newd$SM.w <- with(newd,(W.mediator * S.w))

summary(newd$SM.w)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
#>     0.4    24.7    46.4    73.9    88.8  3531.6  670758
table(newd$miss[is.na(newd$SM.w)])
#> 
#>      1 
#> 670758
newd$SM.w[is.na(newd$SM.w)] <- 0
summary(newd$SM.w)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>     0.0     0.0     0.0    11.5     0.0  3531.6

hist(newd$SM.w, main = "", xlab = "Combined weights", 
     ylab = "Frequency", freq = TRUE)

Here all missing weights are associated with incomplete cases (miss==1)! Hence, doesn’t matter if they are missing or other value (0) in them.

Step 4: Weighted outcome Model

Outcome model is

\(logit [P(Y_{a,M(a^*)}=1 | C = c)] = \theta_0 + \theta_1 a + \theta_2 a^* + \theta_3 c\)

after weighting (combination of mediator weight + sampling weight).

# Outcome analysis
w.design0 <- svydesign(id=~1, weights=~SM.w, data=newd)
w.design <- subset(w.design0, miss == 0)

# Fit Y on (A + A* + C)
fit <- svyglm(outcome ~ exposure + exposure.counterfactual + 
                age + sex + income + race + bmi + edu + phyact + smoke + fruit + diab, 
             design = w.design, family = binomial("logit"))
#> Warning in eval(family$initialize): non-integer #successes in a binomial glm!

Point estimates

Following are the conditional ORs:

\(OR_{TE}(C=c) = \exp(\theta_1 + \theta_2)\)
\(OR_{NDE}(A=1,M=0,C=c) = \exp(\theta_1)\)
\(OR_{NIE}(A^{*}=1,M=0,C=c) = \exp(\theta_2)\)

# total effect of A-> Y + A -> M -> Y
TE <- exp(sum(coef(fit)[c('exposure', 'exposure.counterfactual')])) 
TE 
#> [1] 1.544694

# direct effect of A-> Y (not through M)
DE <- exp(unname(coef(fit)['exposure']))
DE 
#> [1] 1.488554

# indirect effect of A-> Y (A -> M -> Y)
IE <- exp(unname(coef(fit)[c('exposure.counterfactual')])) 
IE 
#> [1] 1.037714

# Product of ORs; same as TE 
DE * IE 
#> [1] 1.544694

# Proportion mediated
PM <- log(IE) / log(TE)
PM 
#> [1] 0.08513902

Obtaining results fast

User-written function doEffectDecomp() (specific to OA-CVD problem):

doEffectDecomp(analytic.miss, ind = NULL, varlist = varlist)
#>         TE         DE         IE         PM 
#> 1.54469393 1.48855395 1.03771444 0.08513902
# function is provided in the appendix

Confidence intervals

Standard errors and confidence intervals are determined by bootstrap methods.

require(boot)
# I ran the computation on a 24 core computer,
# hence set ncpus = 5 (keep some free). 
# If you have more / less cores, adjust accordingly. 
# Try parallel package to find how many cores you have.
# library(parallel)
# detectCores()
# doEffectDecomp is a user-written function
# See appendix for the function
set.seed(504)
bootresBin <- boot(data=analytic.miss, statistic=doEffectDecomp, 
                R = 5, parallel = "multicore", ncpus=5,
                varlist = varlist)

R = 5 is not reliable for bootstrap. In real applications, try 250 at least.

bootci1b <- boot.ci(bootresBin,type = "perc",index=1)
#> Warning in norm.inter(t, alpha): extreme order statistics used as endpoints
bootci2b <- boot.ci(bootresBin,type = "perc",index=2)
#> Warning in norm.inter(t, alpha): extreme order statistics used as endpoints
bootci3b <- boot.ci(bootresBin,type = "perc",index=3)
#> Warning in norm.inter(t, alpha): extreme order statistics used as endpoints
bootci4b <- boot.ci(bootresBin,type = "perc",index=4)
#> Warning in norm.inter(t, alpha): extreme order statistics used as endpoints

# Number of bootstraps
bootresBin$R
#> [1] 5

# Total Effect
c(bootresBin$t0[1], bootci1b$percent[4:5])
#>       TE                   
#> 1.544694 1.293208 1.894417

# Direct Effect
c(bootresBin$t0[2], bootci2b$percent[4:5])
#>       DE                   
#> 1.488554 1.303554 1.876916

# Indirect Effect
c(bootresBin$t0[3], bootci3b$percent[4:5])
#>        IE                     
#> 1.0377144 0.9738072 1.0093246

# Proportion Mediated
c(bootresBin$t0[4], bootci4b$percent[4:5])
#>          PM                         
#>  0.08513902 -0.08360848  0.01655013

The proportion mediated through pain medication was about 8.51% on the log odds ratio scale.

Visualization for main effects

require(plotrix)
TEc <- c(bootresBin$t0[1], bootci1b$percent[4:5])
DEc <- c(bootresBin$t0[2], bootci2b$percent[4:5])
IEc <- c(bootresBin$t0[3], bootci3b$percent[4:5])
mat <- rbind(TEc,DEc,IEc)
colnames(mat) <- c("Point", "2.5%", "97.5%")
mat
#>        Point      2.5%    97.5%
#> TEc 1.544694 1.2932078 1.894417
#> DEc 1.488554 1.3035540 1.876916
#> IEc 1.037714 0.9738072 1.009325

plotCI(1:3, mat[,1], ui=mat[,3], li=mat[,2], xlab = "Estimates", ylab = "", xaxt="n")
axis(1, at=1:3,labels=c("TE","NDE","NIE"))
abline(h=1, lty = 2)

Non-linearity

Consider

non-linear relationships (polynomials) and interactions between exposure, demographic/baseline covariates and mediators,
Is misclassification of the mediators possible?

Here we are again using a user-written function doEffectDecomp.int() (including interaction phyact*diab in the mediation model as well as the outcome model):

# doEffectDecomp.int is a user-written function
# See appendix for the function
doEffectDecomp.int(analytic.miss, ind = NULL, varlist = varlist)
#>         TE         DE         IE         PM 
#> 1.54472021 1.48868864 1.03763821 0.08496674
# try bootstrap on it?

Visualization for main + interactions

set.seed(504)
bootresInt <- boot(data=analytic.miss, statistic=doEffectDecomp.int,
                R = 5, parallel = "multicore", ncpus=5,
                varlist = varlist)

R = 5 is not reliable for bootstrap. In real applications, try 250 at least.

bootci1i <- boot.ci(bootresInt,type = "perc",index=1)
#> Warning in norm.inter(t, alpha): extreme order statistics used as endpoints
bootci2i <- boot.ci(bootresInt,type = "perc",index=2)
#> Warning in norm.inter(t, alpha): extreme order statistics used as endpoints
bootci3i <- boot.ci(bootresInt,type = "perc",index=3)
#> Warning in norm.inter(t, alpha): extreme order statistics used as endpoints
bootci4i <- boot.ci(bootresInt,type = "perc",index=4)
#> Warning in norm.inter(t, alpha): extreme order statistics used as endpoints

bootresInt$R
#> [1] 5
# from saved boostrap results: bootresInt 
# (similar as before)
TEc <- c(bootresInt$t0[1], bootci1i$percent[4:5])
DEc <- c(bootresInt$t0[2], bootci2i$percent[4:5])
IEc <- c(bootresInt$t0[3], bootci3i$percent[4:5])
mat<- rbind(TEc,DEc,IEc)
colnames(mat) <- c("Point", "2.5%", "97.5%")
mat
#>        Point      2.5%    97.5%
#> TEc 1.544720 1.2931358 1.893575
#> DEc 1.488689 1.3040953 1.876521
#> IEc 1.037638 0.9742042 1.009088
plotCI(1:3, mat[,1], ui=mat[,3], li=mat[,2],
       xlab = "Estimates", ylab = "", xaxt="n")
axis(1, at=1:3,labels=c("TE","NDE","NIE"))
abline(h=1, lty = 2)

Appendix: OA-CVD Functions for bootstrap

These functions are written basically for performing bootstrap for the OA-CVD analysis. However, changing the covariates names/model-specifications should not be too hard, once you understand the basic steps.

# without interactions (binary mediator)
doEffectDecomp
#> function (dat, ind = NULL, varlist) 
#> {
#>     if (is.null(ind)) 
#>         ind <- 1:nrow(dat)
#>     d <- dat[ind, ]
#>     d$mediator <- ifelse(as.character(d$painmed) == "Yes", 1, 
#>         0)
#>     d$exposure <- ifelse(as.character(d$OA) == "OA", 1, 0)
#>     d$outcome <- ifelse(as.character(d$CVD) == "event", 1, 0)
#>     d$exposureTemp <- d$exposure
#>     w.design0 <- svydesign(id = ~1, weights = ~weight, data = d)
#>     w.design <- subset(w.design0, miss == 0)
#>     fit.m <- svyglm(as.formula(paste0(paste0("mediator ~ exposureTemp  + "), 
#>         paste0(varlist, collapse = "+"))), design = w.design, 
#>         family = quasibinomial("logit"))
#>     d1 <- d2 <- d
#>     d1$exposure.counterfactual <- d1$exposure
#>     d2$exposure.counterfactual <- !d2$exposure
#>     newd <- rbind(d1, d2)
#>     newd <- newd[order(newd$ID), ]
#>     newd$exposureTemp <- newd$exposure
#>     w <- predict(fit.m, newdata = newd, type = "response")
#>     direct <- ifelse(newd$mediator, w, 1 - w)
#>     newd$exposureTemp <- newd$exposure.counterfactual
#>     w <- predict(fit.m, newdata = newd, type = "response")
#>     indirect <- ifelse(newd$mediator, w, 1 - w)
#>     newd$W.mediator <- indirect/direct
#>     summary(newd$W.mediator)
#>     newd$S.w <- with(newd, weight)
#>     summary(newd$S.w)
#>     newd$SM.w <- with(newd, (W.mediator * S.w))
#>     newd$SM.w[is.na(newd$SM.w)] <- 0
#>     summary(newd$SM.w)
#>     w.design0 <- svydesign(id = ~1, weights = ~SM.w, data = newd)
#>     w.design <- subset(w.design0, miss == 0)
#>     fit <- svyglm(as.formula(paste0(paste0("outcome ~ exposure + exposure.counterfactual +"), 
#>         paste0(varlist, collapse = "+"))), design = w.design, 
#>         family = quasibinomial("logit"))
#>     TE <- exp(sum(coef(fit)[c("exposure", "exposure.counterfactual")]))
#>     DE <- exp(unname(coef(fit)["exposure"]))
#>     IE <- exp(unname(coef(fit)[c("exposure.counterfactual")]))
#>     PM <- log(IE)/log(TE)
#>     return(c(TE = TE, DE = DE, IE = IE, PM = PM))
#> }
#> <bytecode: 0x000001999861a6a8>

# with interactions (binary mediator)
doEffectDecomp.int
#> function (dat, ind = NULL, varlist) 
#> {
#>     if (is.null(ind)) 
#>         ind <- 1:nrow(dat)
#>     d <- dat[ind, ]
#>     d$exposureTemp <- d$exposure
#>     w.design0 <- svydesign(id = ~1, weights = ~weight, data = d)
#>     w.design <- subset(w.design0, miss == 0)
#>     fit.m <- svyglm(as.formula(paste0(paste0("mediator ~ exposureTemp + phyact*diab +"), 
#>         paste0(varlist, collapse = "+"))), design = w.design, 
#>         family = quasibinomial("logit"))
#>     d1 <- d2 <- d
#>     d1$exposure.counterfactual <- d1$exposure
#>     d2$exposure.counterfactual <- !d2$exposure
#>     newd <- rbind(d1, d2)
#>     newd <- newd[order(newd$ID), ]
#>     newd$exposureTemp <- newd$exposure
#>     w <- predict(fit.m, newdata = newd, type = "response")
#>     direct <- ifelse(newd$mediator, w, 1 - w)
#>     newd$exposureTemp <- newd$exposure.counterfactual
#>     w <- predict(fit.m, newdata = newd, type = "response")
#>     indirect <- ifelse(newd$mediator, w, 1 - w)
#>     newd$W.mediator <- indirect/direct
#>     summary(newd$W.mediator)
#>     newd$S.w <- with(newd, weight)
#>     summary(newd$S.w)
#>     newd$SM.w <- with(newd, (W.mediator * S.w))
#>     newd$SM.w[is.na(newd$SM.w)] <- 0
#>     summary(newd$SM.w)
#>     w.design0 <- svydesign(id = ~1, weights = ~SM.w, data = newd)
#>     w.design <- subset(w.design0, miss == 0)
#>     fit <- svyglm(as.formula(paste0(paste0("outcome ~ exposure + exposure.counterfactual +"), 
#>         paste0(varlist, collapse = "+"))), design = w.design, 
#>         family = quasibinomial("logit"))
#>     TE <- exp(sum(coef(fit)[c("exposure", "exposure.counterfactual")]))
#>     DE <- exp(unname(coef(fit)["exposure"]))
#>     IE <- exp(unname(coef(fit)[c("exposure.counterfactual")]))
#>     PM <- log(IE)/log(TE)
#>     return(c(TE = TE, DE = DE, IE = IE, PM = PM))
#> }
#> <bytecode: 0x00000199d22d4188>

Video content (optional)

Tip

For those who prefer a video walkthrough, feel free to watch the video below, which offers a description of an earlier version of the above content.