Chapter 3 Data splitting
3.1 Read previously saved data
# Load the analytic data set saved in an earlier step.
ObsData <- readRDS(file = "data/rhcAnalytic.RDS")
# Using a seed to randomize in a reproducible way
set.seed(123)
# library() stops with an error if caret is not installed, whereas
# require() would only return FALSE and let the script fail later.
library(caret)
# Row indices for the training partition: 70% of the observations,
# returned as a one-column integer matrix because list = FALSE.
split <- createDataPartition(y = ObsData$Length.of.Stay,
                             p = 0.7, list = FALSE)
str(split)
## int [1:4017, 1] 1 2 3 4 5 6 7 8 9 10 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr "Resample1"
# Size of the index matrix: one row per observation placed in the training set.
dim(split)
## [1] 4017 1
# Sanity check of the partition sizes. dim() returns rows and columns, so only
# the first element (the row count) is meaningful in the two results below.
dim(ObsData)*.7 # approximate train data
## [1] 4014.5 36.4
dim(ObsData)*(1-.7) # approximate test data
## [1] 1720.5 15.6
3.2 Split the data
# create train data: keep the rows whose indices are in `split`
train.data <- ObsData[split, ]
dim(train.data)
## [1] 4017 52
# create test data: negative indexing drops the training rows,
# leaving the held-out observations
test.data <- ObsData[-split, ]
dim(test.data)
## [1] 1718 52
3.3 Train the model
# Load the previously saved model formula and print it for reference.
out.formula1 <- readRDS(file = "data/form1.RDS")
out.formula1
## Length.of.Stay ~ Disease.category + Cancer + Cardiovascular +
## Congestive.HF + Dementia + Psychiatric + Pulmonary + Renal +
## Hepatic + GI.Bleed + Tumor + Immunosupperssion + Transfer.hx +
## MI + age + sex + edu + DASIndex + APACHE.score + Glasgow.Coma.Score +
## blood.pressure + WBC + Heart.rate + Respiratory.rate + Temperature +
## PaO2vs.FIO2 + Albumin + Hematocrit + Bilirubin + Creatinine +
## Sodium + Potassium + PaCo2 + PH + Weight + DNR.status + Medical.insurance +
## Respiratory.Diag + Cardiovascular.Diag + Neurological.Diag +
## Gastrointestinal.Diag + Renal.Diag + Metabolic.Diag + Hematologic.Diag +
## Sepsis.Diag + Trauma.Diag + Orthopedic.Diag + race + income +
## RHC.use
# Fit the linear model on the training partition only, so that the test
# partition stays untouched for evaluating performance.
fit.train1 <- lm(out.formula1, data = train.data)
# summary(fit.train1)
3.3.1 Function that gives performance measures
#' Performance measures of a fitted model on a data set
#'
#' Predicts the outcome for `new.data` from `model.fit` and summarises how
#' well the predictions agree with the observed outcome.
#'
#' @param new.data Data frame to evaluate on. It must contain the predictors
#'   used by `model.fit` and the outcome column named by `y.name`.
#' @param model.formula Not used by the function; kept so that existing calls
#'   that pass it keep working.
#' @param model.fit Fitted model supporting `model.matrix()` and `predict()`
#'   (for example the result of `lm()`).
#' @param y.name Name of the outcome column in `new.data`.
#' @param digits Number of decimal places the results are rounded to.
#' @return A one-row numeric matrix with columns `n` (number of predictions),
#'   `p` (number of columns of the model matrix), `R2`, `adjR2` and `RMSE`.
perform <- function(new.data,
                    model.formula = NULL,
                    model.fit,
                    y.name = "Y",
                    digits = 3) {
  # data dimension: columns of the design matrix of the fitted model
  p <- ncol(model.matrix(model.fit))
  # predicted value
  pred.y <- predict(model.fit, new.data)
  # sample size
  n <- length(pred.y)
  # outcome; `[[` extracts a plain vector from data frames and tibbles alike
  new.data.y <- as.numeric(new.data[[y.name]])
  # R2 (R2 and RMSE are exported by caret, so `::` is sufficient)
  R2 <- caret::R2(pred.y, new.data.y)
  # adj R2 using alternate formula
  df.residual <- n - p
  adjR2 <- 1 - (1 - R2) * ((n - 1) / df.residual)
  # RMSE
  RMSE <- caret::RMSE(pred.y, new.data.y)
  # combine all of the results; cbind() takes the column names from the
  # variable names, so these names define the output header
  res <- round(cbind(n, p, R2, adjR2, RMSE), digits)
  # returning object
  return(res)
}
3.4 Extract performance measures
# Training partition: the data the model was fitted on
perform(
  new.data = train.data,
  model.fit = fit.train1,
  y.name = "Length.of.Stay"
)
## n p R2 adjR2 RMSE
## [1,] 4017 64 0.081 0.067 24.647
# Test partition: observations held out from model fitting
perform(
  new.data = test.data,
  model.fit = fit.train1,
  y.name = "Length.of.Stay"
)
## n p R2 adjR2 RMSE
## [1,] 1718 64 0.056 0.02 25.488
# Full data set: training and test observations together
perform(
  new.data = ObsData,
  model.fit = fit.train1,
  y.name = "Length.of.Stay"
)
## n p R2 adjR2 RMSE
## [1,] 5735 64 0.073 0.063 24.902
In this chapter, we will describe the ideas of internal validation.