
Adam J Sullivan
Assistant Professor of Biostatistics
Brown University
caret

The caret package is short for Classification And REgression Training.

library(tidyverse)
library(dslabs)
data("mnist_27")
train

The train() function lets us train different algorithms using similar syntax.

library(caret)
train_glm <- train(y ~ ., method = "glm", data = mnist_27$train)
train_knn <- train(y ~ ., method = "knn", data = mnist_27$train)
## Generalized Linear Model
##
## 800 samples
## 2 predictor
## 2 classes: '2', '7'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 800, 800, 800, 800, 800, 800, ...
## Resampling results:
##
## Accuracy Kappa
## 0.795 0.588
## k-Nearest Neighbors
##
## 800 samples
## 2 predictor
## 2 classes: '2', '7'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 800, 800, 800, 800, 800, 800, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.797 0.592
## 7 0.812 0.623
## 9 0.818 0.634
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
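The same two lines of syntax work for any of the 200+ methods caret supports; only the method string changes. A sketch with random forests, assuming the randomForest package is installed (this example is not in the original slides):

# identical interface, different algorithm (sketch; needs the randomForest package)
train_rf <- train(y ~ ., method = "rf", data = mnist_27$train)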
To make predictions, we can use the output of train directly, without needing to learn the specifics of predict.glm and predict.knn. Instead, caret provides a common interface through predict.train:

y_hat_glm <- predict(train_glm, mnist_27$test, type = "raw")
y_hat_knn <- predict(train_knn, mnist_27$test, type = "raw")
confusionMatrix(y_hat_glm, mnist_27$test$y)$overall[["Accuracy"]]
confusionMatrix(y_hat_knn, mnist_27$test$y)$overall[["Accuracy"]]
## [1] 0.75
## [1] 0.84
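Instead of class labels, predict.train can also return the estimated conditional probabilities, one column per class:

# estimated conditional probabilities rather than class labels
p_hat_knn <- predict(train_knn, mnist_27$test, type = "prob")
head(p_hat_knn)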
When an algorithm includes a tuning parameter, train automatically uses cross-validation to decide among a few default values. To see which parameters are tuned for a given method, we can use getModelInfo("knn") or, more simply, modelLookup:

modelLookup("knn")
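For kNN, the lookup reports a single tuning parameter, k:

##   model parameter      label forReg forClass probModel
## 1   knn         k #Neighbors   TRUE     TRUE      TRUE

Re-running train with the defaults: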
train_knn <- train(y ~ ., method = "knn", data = mnist_27$train)
For the kNN method, the default is to try \(k=5,7,9\). We can change this with the tuneGrid parameter. The grid of values must be supplied by a data frame with the parameter names as given in the modelLookup output. Here the parameter is k, so we use data.frame(k = seq(2, 251, 2)):

set.seed(2008)
train_knn <- train(y ~ ., method = "knn",
                   data = mnist_27$train,
                   tuneGrid = data.frame(k = seq(2, 251, 2)))
We can plot the resampling results, with the best-performing k highlighted:

ggplot(train_knn, highlight = TRUE)
The parameter that maximized the estimated accuracy is in bestTune:

train_knn$bestTune
##     k
## 17 34
The best-performing model is in finalModel:

train_knn$finalModel
## 34-nearest neighbor model
## Training set outcome distribution:
##
## 2 7
## 379 421
The predict function will use this best-performing model:

confusionMatrix(predict(train_knn, mnist_27$test, type = "raw"),
                mnist_27$test$y)$overall["Accuracy"]
## Accuracy
## 0.85
We can change how cross-validation is performed with the trainControl function. Here we use 10-fold cross-validation:

control <- trainControl(method = "cv", number = 10, p = .9)
train_knn_cv <- train(y ~ ., method = "knn",
                      data = mnist_27$train,
                      tuneGrid = data.frame(k = seq(9, 71, 2)),
                      trControl = control)
ggplot(train_knn_cv, highlight = TRUE)
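trainControl supports other resampling schemes as well. A minimal sketch using repeated 10-fold cross-validation (the number of repeats here is illustrative, not from the original slides):

# repeated 10-fold cross-validation (illustrative settings)
control_rep <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
train_knn_rep <- train(y ~ ., method = "knn",
                       data = mnist_27$train,
                       tuneGrid = data.frame(k = seq(9, 71, 2)),
                       trControl = control_rep)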
We can also plot the accuracy estimates with standard deviation bars obtained from the resamples:

train_knn$results %>%
  ggplot(aes(x = k, y = Accuracy)) +
  geom_line() +
  geom_point() +
  geom_errorbar(aes(x = k,
                    ymin = Accuracy - AccuracySD,
                    ymax = Accuracy + AccuracySD))
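The numbers behind this plot live in train_knn$results, so the row for the selected k can be pulled out directly (a sketch using dplyr, which tidyverse already loaded):

# resampled accuracy and its standard deviation for the chosen k
train_knn$results %>% filter(k == train_knn$bestTune$k)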
The gamLoess method requires the gam package. modelLookup shows that it has two tuning parameters:

modelLookup("gamLoess")
##      model parameter  label forReg forClass probModel
## 1 gamLoess      span   Span   TRUE     TRUE      TRUE
## 2 gamLoess    degree Degree   TRUE     TRUE      TRUE
To tune only the span while holding degree fixed at 1, the grid still needs a degree column:

grid <- expand.grid(span = seq(0.15, 0.65, len = 10), degree = 1)
grid
## span degree
## 1 0.150 1
## 2 0.206 1
## 3 0.261 1
## 4 0.317 1
## 5 0.372 1
## 6 0.428 1
## 7 0.483 1
## 8 0.539 1
## 9 0.594 1
## 10 0.650 1
We then train with this grid, using the default cross-validation control parameters:

train_loess <- train(y ~ .,
                     method = "gamLoess",
                     tuneGrid = grid,
                     data = mnist_27$train)
ggplot(train_loess, highlight = TRUE)
The final fit performs similarly to kNN:

confusionMatrix(data = predict(train_loess, mnist_27$test),
                reference = mnist_27$test$y)$overall["Accuracy"]
## Accuracy
## 0.845
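To compare all three fitted models on the test set in one place, a small sketch using only objects created above:

# test-set accuracy for each fitted model
fits <- list(glm = train_glm, knn = train_knn, loess = train_loess)
sapply(fits, function(fit)
  confusionMatrix(predict(fit, mnist_27$test),
                  mnist_27$test$y)$overall["Accuracy"])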