Data splitting

Important

This tutorial is very similar to one of the previous tutorials, but uses different data (here we use the RHC data). We revisit concepts related to prediction before introducing ideas related to machine learning.

Load dataset

# Read the prepared RHC analytic data from its RDS file
ObsData <- readRDS("Data/machinelearning/rhcAnalytic.RDS")
# Preview the first rows to check that the data loaded as expected
head(ObsData, n = 6L)

See (KDnuggets 2023; Kuhn 2023)

# Fix the RNG seed so that the random partition below is reproducible
set.seed(123)
# library() stops with an error when caret is not installed; require() would
# only return FALSE and let createDataPartition() fail later
library(caret)
# Draw the row indices for the training partition (70% of the rows),
# based on the outcome Length.of.Stay. With list = FALSE the indices are
# returned as a one-column integer matrix (see str() below).
split <- createDataPartition(y = ObsData$Length.of.Stay,
                             p = 0.7, list = FALSE)
str(split)
#>  int [1:4017, 1] 1 2 3 4 5 6 7 8 9 10 ...
#>  - attr(*, "dimnames")=List of 2
#>   ..$ : NULL
#>   ..$ : chr "Resample1"
dim(split)
#> [1] 4017    1
# Only the row count is scaled: the split acts on rows, not on columns
nrow(ObsData) * 0.7 # approximate size of the train data
#> [1] 4014.5
nrow(ObsData) * (1 - 0.7) # approximate size of the test data
#> [1] 1720.5

Split the data

# Training set: the rows whose indices were drawn into the partition.
# as.vector() flattens the one-column index matrix into a plain index vector.
train.data <- ObsData[as.vector(split), ]
dim(train.data)
#> [1] 4017   52
# Test set: every remaining row (negative indices drop the training rows)
test.data <- ObsData[-as.vector(split), ]
dim(test.data)
#> [1] 1718   52

Train the model

# Load the stored model formula (outcome: Length.of.Stay) and display it
out.formula1 <- readRDS("Data/machinelearning/form1.RDS")
out.formula1
#> Length.of.Stay ~ Disease.category + Cancer + Cardiovascular + 
#>     Congestive.HF + Dementia + Psychiatric + Pulmonary + Renal + 
#>     Hepatic + GI.Bleed + Tumor + Immunosupperssion + Transfer.hx + 
#>     MI + age + sex + edu + DASIndex + APACHE.score + Glasgow.Coma.Score + 
#>     blood.pressure + WBC + Heart.rate + Respiratory.rate + Temperature + 
#>     PaO2vs.FIO2 + Albumin + Hematocrit + Bilirubin + Creatinine + 
#>     Sodium + Potassium + PaCo2 + PH + Weight + DNR.status + Medical.insurance + 
#>     Respiratory.Diag + Cardiovascular.Diag + Neurological.Diag + 
#>     Gastrointestinal.Diag + Renal.Diag + Metabolic.Diag + Hematologic.Diag + 
#>     Sepsis.Diag + Trauma.Diag + Orthopedic.Diag + race + income + 
#>     RHC.use
# Fit the linear model on the training rows only; the test rows stay unseen
fit.train1 <- lm(formula = out.formula1, data = train.data)
# summary(fit.train1)

Function that gives performance measures

# Compute prediction performance measures of a fitted model on a data set.
#
# Arguments:
#   new.data      data on which the model is evaluated; must contain the
#                 outcome column named by `y.name` and the predictors the
#                 model needs for predict().
#   model.fit     fitted model object that supports model.matrix() and
#                 predict() (e.g. the result of lm()).
#   model.formula unused; kept so that existing calls passing it still work.
#   y.name        name of the outcome column in `new.data`.
#   digits        number of decimal places kept in the returned values.
#
# Returns:
#   A one-row numeric matrix with columns
#     n     - number of predictions made on `new.data`
#     p     - number of columns of the model matrix of `model.fit`
#     R2    - caret::R2() of predicted vs. observed outcome
#     adjR2 - 1 - (1 - R2) * (n - 1) / (n - p)
#     RMSE  - caret::RMSE() of predicted vs. observed outcome
perform <- function(new.data,
                    model.fit, model.formula = NULL,
                    y.name = "Y",
                    digits = 3) {
  # fail early with a clear message when the outcome column is absent
  # (easy to hit, because the default y.name is "Y")
  if (!y.name %in% names(new.data)) {
    stop("Column '", y.name, "' not found in new.data", call. = FALSE)
  }
  # data dimension: number of columns of the fitted model's design matrix
  p <- ncol(model.matrix(model.fit))
  # predicted value
  pred.y <- predict(model.fit, new.data)
  # sample size
  n <- length(pred.y)
  # outcome; [[ ]] always extracts the column as a vector, whereas
  # new.data[, y.name] does not for tibbles or lists
  new.data.y <- as.numeric(new.data[[y.name]])
  # R2 (R2 and RMSE are exported by caret, so `::` is sufficient)
  R2 <- caret::R2(pred.y, new.data.y)
  # adj R2 using alternate formula
  df.residual <- n - p
  adjR2 <- 1 - (1 - R2) * ((n - 1) / df.residual)
  # RMSE
  RMSE <- caret::RMSE(pred.y, new.data.y)
  # combine all of the results; cbind() takes the column names from the
  # variable names, so these names must stay as they are
  round(cbind(n, p, R2, adjR2, RMSE), digits)
}

Extract performance measures

# Performance on the training data (the rows used to fit the model)
perform(new.data = train.data,
        model.fit = fit.train1,
        y.name = "Length.of.Stay")
#>         n  p    R2 adjR2   RMSE
#> [1,] 4017 64 0.081 0.067 24.647
# Performance on the held-out test data (rows not used for fitting)
perform(new.data = test.data,
        model.fit = fit.train1,
        y.name = "Length.of.Stay")
#>         n  p    R2 adjR2   RMSE
#> [1,] 1718 64 0.056  0.02 25.488
# Performance on the full data (training and test rows together)
perform(new.data = ObsData,
        model.fit = fit.train1,
        y.name = "Length.of.Stay")
#>         n  p    R2 adjR2   RMSE
#> [1,] 5735 64 0.073 0.063 24.902

Video content (optional)

Tip

For those who prefer a video walkthrough, feel free to watch the video below, which offers a description of an earlier version of the above content.

References

KDnuggets. 2023. “Dataset Splitting Best Practices in Python.” https://www.kdnuggets.com/2020/05/dataset-splitting-best-practices-python.html.

Kuhn, Max. 2023. “Data Splitting.” https://topepo.github.io/caret/data-splitting.html.