# ---------------------------------------------------------------------------
# Download NHANES 2013-2014 ("_H" cycle) component files with nhanesA and
# merge them into one dataset, `dat.full`, keyed on the respondent sequence
# number SEQN. `dat.full` is the starting point for all later chunks.
# ---------------------------------------------------------------------------
library(nhanesA)
library(SASxport) # NOTE(review): not referenced directly below — confirm it is still needed
library(plyr) # for join_all() at the end of this chunk
# Demographic data
demo <- nhanes('DEMO_H') # Both males and females: 0 - 150 YEARS
demo1 <- demo[c("SEQN", # Respondent sequence number
"RIDAGEYR", # Age in years at screening
"RIAGENDR", # gender
"DMDEDUC2", # Education level - Adults 20+
"RIDRETH3", # Race/Hispanic origin w/ NH Asian
"RIDEXPRG", # Pregnancy status at exam
"WTINT2YR", # Full sample 2 year weights
"SDMVPSU", # Masked variance pseudo-PSU
"SDMVSTRA")] # Masked variance pseudo-stratum
demo_vars <- names(demo1)
# Translate numeric response codes into labelled categories
demo2 <- nhanesTranslate('DEMO_H', demo_vars, data = demo1)
# BMI
bmx <- nhanes('BMX_H')
bmx1 <- bmx[c("SEQN", # Respondent sequence number
"BMXBMI")] # Body Mass Index (kg/m**2): 2 YEARS - 150 YEARS
bmx_vars <- names(bmx1)
bmx2 <- nhanesTranslate('BMX_H', bmx_vars, data = bmx1)
# Smoking
smq <- nhanes('SMQ_H')
smq1 <- smq[c("SEQN", # Respondent sequence number
"SMQ020", # Smoked at least 100 cigarettes in life
"SMQ040")] # Do you now smoke cigarettes?: 18 YEARS - 150 YEARS
smq_vars <- names(smq1)
smq2 <- nhanesTranslate('SMQ_H', smq_vars, data = smq1)
# Physical activity
paq <- nhanes('PAQ_H')
paq1 <- paq[c("SEQN", # Respondent sequence number
"PAQ605")] # Vigorous work activity
paq_vars <- names(paq1)
paq2 <- nhanesTranslate('PAQ_H', paq_vars, data = paq1)
# Sleep
slq <- nhanes('SLQ_H')
slq1 <- slq[c("SEQN", # Respondent sequence number
"SLD010H")] # Hours of sleep
slq_vars <- names(slq1)
slq2 <- nhanesTranslate('SLQ_H', slq_vars, data = slq1)
# High blood pressure
bpq <- nhanes('BPQ_H')
bpq1 <- bpq[c("SEQN", # Respondent sequence number
"BPQ020")] # Ever told you had high blood pressure
bpq_vars <- names(bpq1)
bpq2 <- nhanesTranslate('BPQ_H', bpq_vars, data = bpq1)
# General health condition
huq <- nhanes('HUQ_H')
huq1 <- huq[c("SEQN", # Respondent sequence number
"HUQ010")] # General health condition
huq_vars <- names(huq1)
huq2 <- nhanesTranslate('HUQ_H', huq_vars, data = huq1)
# Combined data
# Full (outer) join on SEQN keeps every respondent from every component file,
# so respondents missing from a component get NA in that component's columns.
dat.full <- join_all(list(demo2, bmx2, smq2, paq2, slq2, bpq2, huq2), by = "SEQN",
type='full')
dim(dat.full) # N = 10,175
Exercise (L)
You can download all of the related files in a zip file machinelearningEx.zip from the GitHub folder, or just by clicking this link directly.
- Navigate to the GitHub folder (above link) where the ZIP file is located.
- Click on the file name (above zip file) to open its preview window.
- Click on the Download button to download the file. If you can’t see the Download button, click on “Download Raw File” link that should appear on the page.
Problem Statement
We will revisit the article by Flegal et al. (2016). We will use the same dataset as in the previous lab exercise on survey data analysis, with some additional predictors in predicting obesity.
Our primary aim is to predict grade 3 obesity with the following predictors:
- Age: Age in years at screening
- Gender
- Race: Race/ethnicity
- Education: Education level
- Smoking: Smoking status
- Physical activity: Level of vigorous work activity
- Sleep: Hours of sleep
- High blood pressure: Ever doctor told a high blood pressure
- General health: General health condition
Question 1: Creating data
1(a) Downloading the datasets
You can see how datasets are downloaded and merged:
1(b) Recoding
Let us recode the outcome and predictors to make them suitable for analysis:
# ---------------------------------------------------------------------------
# Recode the outcome and predictors on `dat.full`. The quoted category
# strings below must match the NHANES labels produced by nhanesTranslate()
# exactly (several labels are truncated by NHANES, e.g. 'Multi-Rac').
# ---------------------------------------------------------------------------
# Survey design
dat.full$survey.weight <- dat.full$WTINT2YR
dat.full$psu <- dat.full$SDMVPSU
dat.full$strata <- dat.full$SDMVSTRA
# Class 3 obesity - BMI >= 40 kg/m^2
summary(dat.full$BMXBMI)
# NA BMI stays NA in the outcome (ifelse propagates NA)
dat.full$obesity <- ifelse(dat.full$BMXBMI >= 40, 1, 0)
table(dat.full$obesity, useNA = "always")
# Age
# right = FALSE gives intervals [20,40), [40,60), [60,Inf);
# ages below 20 fall outside the breaks and become NA
dat.full$age.cat <- cut(dat.full$RIDAGEYR, c(20, 40, 60, Inf), right = FALSE)
table(dat.full$age.cat, useNA = "always")
# Gender
dat.full$gender <- dat.full$RIAGENDR
table(dat.full$gender, useNA = "always")
# Race/Hispanic origin group
dat.full$race <- dat.full$RIDRETH3
# NOTE(review): this cross-tabulates age.cat by race; the parallel sections
# tabulate the single variable — possibly table(dat.full$race, ...) was
# intended here. Confirm.
table(dat.full$age.cat, dat.full$race, useNA = "always")
dat.full$race <- car::recode(dat.full$race,
" 'Non-Hispanic White'='White';
'Non-Hispanic Black' = 'Black';
c('Mexican American','Other Hispanic')='Hispanic';
c('Non-Hispanic Asian',
'Other Race - Including Multi-Rac')= 'Other';
else=NA",
levels = c("White", "Black", "Hispanic", "Other"))
table(dat.full$race, useNA = "always")
# Education
dat.full$education <- dat.full$DMDEDUC2
dat.full$education <- car::recode(dat.full$education,
" c('Some college or AA degree',
'College graduate or above') = '>High school';
'High school graduate/GED or equi' = 'High school';
c('Less than 9th grade',
'9-11th grade (Includes 12th grad') =
'<High school';
else = NA",
levels = c("<High school", "High school",
">High school"))
table(dat.full$education, useNA = "always")
# Smoking status
# First pass: ever-smokers (SMQ020 == Yes) are provisionally 'Current smoker',
# never-smokers are 'Never smoker'
dat.full$smoking <- dat.full$SMQ020
table(dat.full$smoking, useNA = "always")
dat.full$smoking <- car::recode(dat.full$smoking, " 'Yes'='Current smoker';
'No'='Never smoker'; else=NA ",
levels = c("Never smoker", "Former smoker",
"Current smoker"))
# Second pass: ever-smokers who currently smoke 'Not at all' are reclassified
# as former smokers
dat.full$smoking[dat.full$SMQ040 == "Not at all"] <- "Former smoker"
table(dat.full$smoking, useNA = "always")
# Physical activity
dat.full$physical.activity <- dat.full$PAQ605
table(dat.full$physical.activity, useNA = "always")
dat.full$physical.activity <- car::recode(dat.full$physical.activity,
" 'Yes'='Yes'; 'No'='No'; else=NA ",
levels = c("No", "Yes"))
table(dat.full$physical.activity, useNA = "always")
# Sleep
# Hours of sleep (SLD010H) is numeric; bin into three categories.
dat.full$sleep <- dat.full$SLD010H
dat.full$sleep <- car::recode(dat.full$sleep, " 1:6 = 'Less than 7'; 7:9 = '7-9';
10:24 = 'More than 9'; else=NA ",
levels = c("Less than 7", "7-9", "More than 9"))
table(dat.full$sleep, useNA = "always")
# High blood pressure
dat.full$high.blood.pressure <- dat.full$BPQ020
table(dat.full$high.blood.pressure, useNA = "always")
dat.full$high.blood.pressure <- car::recode(dat.full$high.blood.pressure,
" 'Yes'='Yes'; 'No'='No'; else=NA ",
levels = c("No", "Yes"))
table(dat.full$high.blood.pressure, useNA = "always")
# General health condition
# The HUQ010 labels carry trailing punctuation ('Excellent,', 'Poor?', ...)
# because NHANES stores the truncated question text as the option label.
dat.full$general.health <- dat.full$HUQ010
table(dat.full$general.health, useNA = "always")
dat.full$general.health <- car::recode(dat.full$general.health,
"c('Excellent,', 'Very good,')=
'Very good or Excellent';
'Good,'='Good';
c('Fair, or', 'Poor?') ='Poor or Fair';
else=NA ",
levels = c("Poor or Fair", "Good",
"Very good or Excellent"))
table(dat.full$general.health, useNA = "always")
1(c) Keep relevant variables
Let’s keep only the relevant variables for this exercise:
# Restrict the full dataset to only the variables used in this exercise.
# The selection is ordered as: unique identifier, survey design features,
# eligibility criteria, the outcome, and finally the predictors.
vars <- c("SEQN",                                   # unique identifier
          "survey.weight", "psu", "strata",         # survey features
          "RIDAGEYR", "BMXBMI", "RIDEXPRG",         # eligibility
          "obesity",                                # outcome
          "age.cat", "gender", "race", "education", # predictors
          "smoking", "physical.activity", "sleep",
          "high.blood.pressure", "general.health")
dat.full2 <- dat.full[, vars]
1(d) Weight normalization
Large weights can cause issues when evaluating model performance. Let’s normalize the survey weights to address this problem:
1(e) Analytic dataset
The authors restricted their study to - adults aged 20 years and more, - non-missing body mass index, and - non-pregnant
# Build the analytic dataset by applying the eligibility criteria in turn.
# Note subset() drops rows where the condition evaluates to NA.
# Aged 20 years or more
dat.analytic <- subset(dat.full2, RIDAGEYR>=20) # N = 5,769
# Non-missing outcome
dat.analytic <- subset(dat.analytic, !is.na(BMXBMI)) # N = 5,520
# Non-pregnant
table(dat.analytic$RIDEXPRG)
# Exclude only confirmed pregnancies; respondents with missing or other
# RIDEXPRG categories (e.g. status could not be ascertained) are retained
dat.analytic <- subset(dat.analytic, is.na(RIDEXPRG) | RIDEXPRG !=
"Yes, positive lab pregnancy test") # N = 5,455
nrow(dat.analytic)
# Drop irrelevant variables
# The eligibility columns have served their purpose; remove them so only the
# identifier, survey features, outcome, and predictors remain
dat.analytic$RIDAGEYR <- dat.analytic$BMXBMI <- dat.analytic$RIDEXPRG <- NULL
1(f) Complete case data
Below is the code for creating the complete case dataset (no missing for the outcome or predictors):
1(g) Save datasets
Question 2: Importing data and creating Table 1
2(a) Importing dataset
Let’s load the dataset:
Here,
- dat.full: the full dataset with all variables
- dat.full2: the full dataset with only relevant variables for this exercise
- dat.analytic: the analytic dataset with only adults aged 20 years and more, non-missing BMI, and non-pregnant
- dat.complete: the complete case dataset without missing values in the outcome and predictors
2(b) Creating Table 1
Let’s create Table 1 for the complete case dataset with unweighted frequencies:
library(tableone)
# Predictors displayed in Table 1
predictors <- c("age.cat", "gender", "race", "education", "smoking",
                "physical.activity", "sleep", "high.blood.pressure",
                "general.health")
# Unweighted Table 1 on the complete-case data, stratified by the obesity
# outcome, with an Overall column and no significance tests.
# Fix: spell out TRUE/FALSE instead of T/F (T and F are ordinary variables
# in R and can be reassigned, silently changing behavior).
tab1 <- CreateTableOne(vars = predictors, strata = "obesity",
                       data = dat.complete, test = FALSE, addOverall = TRUE)
print(tab1, format = "f") # Showing only frequencies
#> Stratified by obesity
#> Overall 0 1
#> n 5433 5023 410
#> age.cat
#> [20,40) 1806 1659 147
#> [40,60) 1892 1728 164
#> [60,Inf) 1735 1636 99
#> gender = Female 2807 2531 276
#> race
#> White 2333 2157 176
#> Black 1109 976 133
#> Hispanic 1208 1125 83
#> Other 783 765 18
#> education
#> <High school 1171 1092 79
#> High school 1216 1116 100
#> >High school 3046 2815 231
#> smoking
#> Never smoker 3054 2828 226
#> Former smoker 1261 1159 102
#> Current smoker 1118 1036 82
#> physical.activity = Yes 984 917 67
#> sleep
#> 7-9 3174 2971 203
#> Less than 7 2084 1896 188
#> More than 9 175 156 19
#> high.blood.pressure = Yes 2037 1810 227
#> general.health
#> Poor or Fair 1260 1084 176
#> Good 2065 1911 154
#> Very good or Excellent 2108 2028 80
Question 3: Prediction using split sample approach
In this exercise, we will use the split-sample approach to predict obesity. We will create our training and test data using a 60-40 split. We will use the following two methods to predict obesity:
- Design-adjusted logistic with all survey features (psu, strata, and survey weights)
- LASSO with survey weights
3(a) Split the data into training and test
Let us create our training and test data using the split-sample approach:
# Split the complete-case data into training and test sets.
# Each row is assigned independently via a Bernoulli(0.6) draw, so the split
# is ~60/40 in expectation rather than exactly 60/40; the seed makes the
# assignment reproducible.
set.seed(900)
dat.complete$datasplit <- rbinom(nrow(dat.complete), size = 1, prob = 0.6)
table(dat.complete$datasplit)
#>
#> 0 1
#> 2130 3303
# Training data
dat.train <- dat.complete[dat.complete$datasplit == 1,]
dim(dat.train)
#> [1] 3303 16
# Test data
dat.test <- dat.complete[dat.complete$datasplit == 0,]
dim(dat.test)
#> [1] 2130 16
3(b) Prediction with design-adjusted logistic
We will use the design-adjusted logistic regression to predict obesity with the following predictors:
- age.cat, gender, race, high.blood.pressure, general.health
Instructions:
- 1: Create the survey design on the full data and subset the design for those individuals in the training data.
- 2: Use the training data design created in step 1 to fit the model
- 3: Use the test data to predict the probability of obesity.
- 4: Calculate AUC on the test data.
- 5: Calculate calibration slope with 95% confidence interval on the test data.
Hints:
- `WeightedAUC` and `WeightedROC` are helpful functions in calculating AUC.
- The `Logit` function from the `DescTools` package is helpful in calculating the logit of predicted probabilities for calculating the calibration slope.
- Use the normalized weight variable to calculate the AUC and calibration slope.
Simpler Model
library(survey)
library(DescTools)
library(WeightedROC)
library(Publish)
library(boot)
library(scoring)
# Design
# Flag the rows NOT in the training data, build the survey design on the
# FULL data, then subset the design to the training rows. Subsetting the
# design object (rather than the data frame) preserves the full design
# information needed for correct variance estimation.
dat.full2$miss <- 1
dat.full2$miss[dat.full2$SEQN %in% dat.train$SEQN] <- 0
svy.design0 <- svydesign(strata = ~strata, id = ~psu, weights = ~survey.weight,
data = dat.full2, nest = TRUE)
svy.design <- subset(svy.design0, miss == 0)
# Formula
# NOTE(review): `education` is included here although the 3(b) predictor
# list omits it and 3(c) asks to ADD it — confirm which model this is.
predictors <- c("age.cat", "gender", "race", "education",
"high.blood.pressure", "general.health")
Formula <- formula(paste("obesity ~ ", paste(predictors, collapse=" + ")))
# Model
# Design-adjusted logistic regression fitted on the training-data design
fit.glm <- svyglm(Formula, design = svy.design, family = binomial)
publish(fit.glm)
#> Variable Units OddsRatio CI.95 p-value
#> age.cat [20,40) Ref
#> [40,60) 0.63 [0.42;0.95] 0.091799
#> [60,Inf) 0.31 [0.17;0.57] 0.018946
#> gender Male Ref
#> Female 1.90 [1.32;2.73] 0.026347
#> race White Ref
#> Black 1.56 [1.12;2.16] 0.056561
#> Hispanic 0.91 [0.53;1.57] 0.758727
#> Other 0.37 [0.18;0.75] 0.050646
#> education <High school Ref
#> High school 1.34 [0.87;2.07] 0.252853
#> >High school 1.97 [1.13;3.46] 0.076212
#> high.blood.pressure No Ref
#> Yes 2.20 [1.61;3.02] 0.007835
#> general.health Poor or Fair Ref
#> Good 0.52 [0.36;0.74] 0.022344
#> Very good or Excellent 0.22 [0.14;0.36] 0.003885
# Prediction on the test set
# Predicted probability of obesity for each test-set respondent
dat.test$pred.glm <- predict(fit.glm, newdata = dat.test, type = "response")
summary(dat.test$pred.glm)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.003889 0.027737 0.052118 0.073342 0.094836 0.413143
# AUC on the test set with sampling weights
# unfortunately strata and cluster omitted
# NOTE(review): `wgt` is presumably the normalized weight created in the
# omitted 1(d) chunk — confirm it exists on dat.test
auc.glm <- WeightedAUC(WeightedROC(dat.test$pred.glm, dat.test$obesity,
weight = dat.test$wgt))
auc.glm
#> [1] 0.6903502
# Function to calculate AUC for bootstrap samples
# unfortunately strata and cluster omitted
# Statistic function for boot(): weighted AUC computed on one bootstrap
# resample of the test data. `indices` is the row resample supplied by
# boot(); strata and cluster are (unfortunately) still omitted here.
calc_auc <- function(data, indices) {
  resample <- data[indices, ]
  WeightedAUC(WeightedROC(resample$pred.glm, resample$obesity,
                          weight = resample$wgt))
}
# Perform bootstrapping
# R = 150 replicates is small; the printed boot.ci output below warns that
# the percentile intervals may be unstable
set.seed(123)
boot_obj <- boot(data = dat.test, statistic = calc_auc, R = 150)
# Get 95% confidence intervals
ci_auc <- boot.ci(boot_obj, type = "perc")
ci_auc
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 150 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = boot_obj, type = "perc")
#>
#> Intervals :
#> Level Percentile
#> 95% ( 0.6357, 0.7582 )
#> Calculations and Intervals on Original Scale
#> Some percentile intervals may be unstable
# Weighted calibration slope
# unfortunately strata and cluster omitted
# Regress the observed outcome on the logit of the predicted probability;
# quasibinomial is used because the survey weights are non-integer.
# Per the interpretation table, a slope significantly below 1 suggests
# overfitting.
dat.test$pred.glm.logit <- DescTools::Logit(dat.test$pred.glm)
slope.glm <- glm(obesity ~ pred.glm.logit, data = dat.test, family = quasibinomial,
weights = wgt)
publish(slope.glm)
#> Variable Units Coefficient CI.95 p-value
#> pred.glm.logit 0.79 [0.61;0.98] < 1e-04
# Calculate the weighted Brier score
# brierscore() returns per-observation scores; these are averaged with mean().
# NOTE(review): the outer mean() is unweighted — confirm that passing `wt`
# to brierscore() produces the intended survey-weighted Brier score.
brier_score <- mean(brierscore(dat.test$obesity ~ dat.test$pred.glm,
data = dat.test,
wt = dat.test$wgt))
brier_score
#> [1] 0.06878706
3(c) Prediction with design-adjusted logistic with added covariates [90% grade]
Add the following variables in the existing model, and assess the model performance in terms of the same 3 performance measures:
- education, smoking, physical.activity, sleep
Model with more variables
# Formula
# predictors2 <- ...
# Formula2 <- ...
# Model
# fit.glm2 <- ...
# Prediction on the test set
# AUC on the test set with sampling weights
# unfortunately strata and cluster omitted
# auc.glm2 <-...
# Function to calculate AUC for bootstrap samples
# unfortunately strata and cluster omitted
# Get 95% confidence intervals from bootstrapping
# ci_auc2 <- ...
# Weighted calibration slope
# unfortunately strata and cluster omitted
# slope.glm2 ...
# Calculate the weighted Brier score
# brier_score2 <- ...
3(d) Interpretation [10%]
Interpret the AUC, calibration slope and Brier Score based on the following criteria, and suggest which model you would choose:
AUC | Interpretation |
---|---|
0.50 | No better than a random chance |
0.51-0.70 | Poor discrimination ability |
0.71-0.80 | Acceptable discrimination ability |
0.81-0.90 | Excellent discrimination ability |
0.90-1.00 | Outstanding discrimination ability |
Calibration slope | Interpretation |
---|---|
1 and 95% CI includes 1 | Well-calibrated |
Significantly less than 1 | Overfitting |
Significantly greater than 1 | Underfitting |
Brier Score | Interpretation |
---|---|
0 | Perfect prediction |
0.01-0.1 | Very good model performance |
0.11-0.2 | Good model performance |
0.21-0.3 | Fair model performance |
0.31-0.5 | Poor model performance |
> 0.5 | Very poor model performance (no skill) |
- AUC: Higher is better (closer to 1).
- Calibration Slope: Closer to 1 is better.
- Brier Score: Lower is better (closer to 0).
Response:
3(e) Prediction with LASSO [Optional]
Now we will use the LASSO method to predict obesity. We will incorporate sampling weights in the model to account for survey data (no psu or strata). Note that we are not interested in the statistical significance of the beta coefficients. Hence, not utilizing psu and strata should not be an issue in this prediction problem.
Instructions:
- 1: Use the training data with normalized weight to fit the model.
- 2: Find the optimum lambda using 5-fold cross-validation. Consider the lambda value that gives the minimum prediction error.
- 3: Predict the probability of obesity on the test set
- 3: Calculate AUC on the test data.
- 4: Calculate calibration slope with 95% confidence interval on the test data.
Model
# LASSO with survey weights on the split-sample training data.
# NOTE(review): Formula2 is the larger-model formula from 3(c), which is
# left for the student to define — this chunk assumes it exists.
library(glmnet)
library(DescTools)
library(WeightedROC)
# Training data - X: predictor, y: outcome
# model.matrix() expands factors to dummies; [,-1] drops the intercept column
X.train <- model.matrix(Formula2, dat.train)[,-1]
y.train <- as.matrix(dat.train$obesity)
# Test data - X: predictor, y: outcome
X.test <- model.matrix(Formula2, dat.test)[,-1]
y.test <- as.matrix(dat.test$obesity)
# Find the best lambda using 5-fold CV
# alpha = 1 selects the LASSO penalty; the (normalized) survey weights enter
# through `weights`
fit.cv.lasso <- cv.glmnet(x = X.train, y = y.train, nfolds = 5, alpha = 1,
family = "binomial", weights = dat.train$wgt)
# Prediction on the test set
# s = lambda.min uses the lambda with minimum cross-validated error
dat.test$pred.lasso <- predict(fit.cv.lasso, newx = X.test, type = "response",
s = fit.cv.lasso$lambda.min)
# AUC on the test set with sampling weights
auc.lasso <- WeightedAUC(WeightedROC(dat.test$pred.lasso, dat.test$obesity,
weight = dat.test$wgt))
auc.lasso
# Weighted calibration slope
# quasibinomial because the survey weights are non-integer
dat.test$pred.lasso.logit <- DescTools::Logit(dat.test$pred.lasso)
slope.lasso <- glm(obesity ~ pred.lasso.logit, data = dat.test,
family = quasibinomial, weights = wgt)
publish(slope.lasso)
Interpretation [optional]
Interpret the AUC and calibration slope.
Question 4: Prediction using cross-validation approach [optional]
Use LASSO with 5-fold cross-validation to predict obesity with the same set of predictors (from larger model) used in Question 2. Report the average AUC and average calibration slope with 95% confidence interval over 5 folds.
library(glmnet)
library(DescTools)
library(WeightedROC)

# 5-fold cross-validated LASSO: fit on k-1 folds, evaluate weighted AUC and
# calibration slope on the held-out fold, then average across folds.
k <- 5
set.seed(604)
# Randomly assign every row of the complete-case data to one of k folds
nfolds <- sample(1:k, size = nrow(dat.complete), replace = TRUE)
table(nfolds)

# Preallocate per-fold result vectors (avoids growing vectors inside the loop)
auc.lasso <- numeric(k)
cal.slope.lasso <- numeric(k)
cal.slope.se.lasso <- numeric(k)

for (fold in seq_len(k)) {
  # Training data: every fold except the current one
  dat.train <- dat.complete[nfolds != fold, ]
  X.train <- model.matrix(Formula2, dat.train)[, -1]
  y.train <- as.matrix(dat.train$obesity)
  # Test data: the held-out fold
  dat.test <- dat.complete[nfolds == fold, ]
  X.test <- model.matrix(Formula2, dat.test)[, -1]
  y.test <- as.matrix(dat.test$obesity)
  # Inner 5-fold CV to find the optimum lambda (minimum prediction error),
  # with the normalized survey weights
  fit.cv.lasso <- cv.glmnet(x = X.train, y = y.train, nfolds = 5, alpha = 1,
                            family = "binomial", weights = dat.train$wgt)
  # Predicted probability of obesity on the held-out fold
  dat.test$pred.lasso <- predict(fit.cv.lasso, newx = X.test, type = "response",
                                 s = fit.cv.lasso$lambda.min)
  # Weighted AUC on the held-out fold
  auc.lasso[fold] <- WeightedAUC(WeightedROC(dat.test$pred.lasso,
                                             dat.test$obesity,
                                             weight = dat.test$wgt))
  # Weighted calibration slope.
  # Fix: use quasibinomial rather than binomial — the survey weights are
  # non-integer, so binomial emits "non-integer #successes" warnings and
  # misstates the dispersion (and hence the slope SE). This also matches
  # the calibration model used in 3(b).
  dat.test$pred.lasso.logit <- DescTools::Logit(dat.test$pred.lasso)
  mod.cal <- glm(obesity ~ pred.lasso.logit, data = dat.test,
                 family = quasibinomial, weights = wgt)
  cal.slope.lasso[fold] <- summary(mod.cal)$coef[2, 1]
  cal.slope.se.lasso[fold] <- summary(mod.cal)$coef[2, 2]
}

# Average AUC across folds
mean(auc.lasso)
# Average calibration slope across folds
mean(cal.slope.lasso)
# Approximate 95% CI for the calibration slope (normal approximation using
# the average of the per-fold standard errors)
cbind(mean(cal.slope.lasso) - 1.96 * mean(cal.slope.se.lasso),
      mean(cal.slope.lasso) + 1.96 * mean(cal.slope.se.lasso))
Knit your file
Please knit your file once you finished and submit the knitted PDF or doc file. Please also fill-up the following table:
Group name: Put the group name here
Student initial | % contribution |
---|---|
Put Student 1 initial here | Put % contribution for Student 1 here |
Put Student 2 initial here | Put % contribution for Student 2 here |
Put Student 3 initial here | Put % contribution for Student 3 here |