Classification, Performance Estimation and Regularization


Recorded Stream



Tutorials by Sofia Nomikou

In this lab we are going to explore Classification techniques and apply them to biomedical data. We will study Logistic Regression and K-Nearest Neighbors. Let’s begin!


Classification


For our lab today we are going to use the Wisconsin Breast Cancer data set.

library(mlbench)
data(BreastCancer)
str(BreastCancer)
## 'data.frame':    699 obs. of  11 variables:
##  $ Id             : chr  "1000025" "1002945" "1015425" "1016277" ...
##  $ Cl.thickness   : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ...
##  $ Cell.size      : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ...
##  $ Cell.shape     : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ...
##  $ Marg.adhesion  : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ...
##  $ Epith.c.size   : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ...
##  $ Bare.nuclei    : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ...
##  $ Bl.cromatin    : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ...
##  $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ...
##  $ Mitoses        : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ...
##  $ Class          : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
BreastCancer$Cl.thickness <- as.numeric(as.character(BreastCancer$Cl.thickness))
BreastCancer$Cell.size <- as.numeric(as.character(BreastCancer$Cell.size))
BreastCancer$Cell.shape <- as.numeric(as.character(BreastCancer$Cell.shape))
BreastCancer$Marg.adhesion <- as.numeric(as.character(BreastCancer$Marg.adhesion))
BreastCancer$Epith.c.size <- as.numeric(as.character(BreastCancer$Epith.c.size))
BreastCancer$Bare.nuclei <- as.numeric(as.character(BreastCancer$Bare.nuclei))
BreastCancer$Bl.cromatin <- as.numeric(as.character(BreastCancer$Bl.cromatin))
BreastCancer$Normal.nucleoli <- as.numeric(as.character(BreastCancer$Normal.nucleoli))
BreastCancer$Mitoses <- as.numeric(as.character(BreastCancer$Mitoses))

str(BreastCancer)
## 'data.frame':    699 obs. of  11 variables:
##  $ Id             : chr  "1000025" "1002945" "1015425" "1016277" ...
##  $ Cl.thickness   : num  5 5 3 6 4 8 1 2 2 4 ...
##  $ Cell.size      : num  1 4 1 8 1 10 1 1 1 2 ...
##  $ Cell.shape     : num  1 4 1 8 1 10 1 2 1 1 ...
##  $ Marg.adhesion  : num  1 5 1 1 3 8 1 1 1 1 ...
##  $ Epith.c.size   : num  2 7 2 3 2 7 2 2 2 2 ...
##  $ Bare.nuclei    : num  1 10 2 4 1 10 10 1 1 1 ...
##  $ Bl.cromatin    : num  3 3 3 3 3 9 3 3 1 2 ...
##  $ Normal.nucleoli: num  1 2 1 7 1 7 1 1 1 1 ...
##  $ Mitoses        : num  1 1 1 1 1 1 1 1 5 1 ...
##  $ Class          : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
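
The nine conversions above can also be written as a loop over the predictor columns (columns 2 to 10 of the data frame); a minimal equivalent sketch:

# Equivalent, more compact conversion of the nine predictor columns
for (col in 2:10) {
  BreastCancer[, col] <- as.numeric(as.character(BreastCancer[, col]))
}
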
# Split our data into train and test sets - 80% train, 20% test
index <- sample(1:nrow(BreastCancer),round(0.80*nrow(BreastCancer)))
train <- BreastCancer[index,]
test <- BreastCancer[-index,]

train <- train[complete.cases(train),]
test <- test[complete.cases(test),]
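
Note that sample() draws a random split, so the exact counts reported below will vary from run to run. If you want a reproducible split, set a seed before sampling; for example (the seed value 123 is an arbitrary choice):

# Optional: fix the random seed so the train/test split is reproducible
set.seed(123)
index <- sample(1:nrow(BreastCancer), round(0.80*nrow(BreastCancer)))
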

Logistic Regression


# Fit logistic regression on the train data
glm.fit <- glm(Class ~ Cell.size + Cell.shape + Bare.nuclei + Cl.thickness + Marg.adhesion + Epith.c.size + Bl.cromatin + Normal.nucleoli + Mitoses , data = train, family = binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = Class ~ Cell.size + Cell.shape + Bare.nuclei + 
##     Cl.thickness + Marg.adhesion + Epith.c.size + Bl.cromatin + 
##     Normal.nucleoli + Mitoses, family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3923  -0.0991  -0.0545   0.0201   2.6784  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -10.79400    1.46509  -7.367 1.74e-13 ***
## Cell.size         0.07782    0.23365   0.333 0.739102    
## Cell.shape        0.32807    0.25987   1.262 0.206787    
## Bare.nuclei       0.34216    0.10211   3.351 0.000805 ***
## Cl.thickness      0.55983    0.16233   3.449 0.000563 ***
## Marg.adhesion     0.32212    0.13044   2.469 0.013531 *  
## Epith.c.size      0.21798    0.17319   1.259 0.208164    
## Bl.cromatin       0.52036    0.20599   2.526 0.011533 *  
## Normal.nucleoli   0.10232    0.11971   0.855 0.392683    
## Mitoses           0.55189    0.37324   1.479 0.139236    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 713.141  on 544  degrees of freedom
## Residual deviance:  77.386  on 535  degrees of freedom
## AIC: 97.386
## 
## Number of Fisher Scoring iterations: 8
# family = binomial tells R to perform logistic regression rather than another type of 
# generalized linear model

# Variable importance
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
varImp(glm.fit)
##                   Overall
## Cell.size       0.3330425
## Cell.shape      1.2624504
## Bare.nuclei     3.3510511
## Cl.thickness    3.4486444
## Marg.adhesion   2.4694797
## Epith.c.size    1.2586292
## Bl.cromatin     2.5261158
## Normal.nucleoli 0.8547621
## Mitoses         1.4786408
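
For a glm fit, caret's varImp() reports the absolute value of each coefficient's z statistic, so these numbers match the z values in the model summary above; a quick check under that assumption:

# The same importance values, read directly from the model summary (absolute z statistics)
abs(summary(glm.fit)$coefficients[-1, "z value"])
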
# How well does the model perform on the test data?
# Find probabilities for each data point in the test set
probabilities <- predict(glm.fit, newdata = test,type = "response")

# Convert to predictions
contrasts(test$Class)
##           malignant
## benign            0
## malignant         1
predictions <- rep("benign", nrow(test))
predictions[probabilities >0.5] = "malignant"

# Create confusion matrix
confusion_matrix <- table(predictions, test$Class)
confusion_matrix
##            
## predictions benign malignant
##   benign        94         4
##   malignant      2        38
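
From this confusion matrix we can derive the usual summary measures, treating "malignant" as the positive class; a minimal sketch:

# Accuracy, sensitivity and specificity from the confusion matrix
accuracy <- sum(diag(confusion_matrix))/sum(confusion_matrix)
sensitivity <- confusion_matrix["malignant","malignant"]/sum(confusion_matrix[,"malignant"])
specificity <- confusion_matrix["benign","benign"]/sum(confusion_matrix[,"benign"])
c(accuracy = accuracy, sensitivity = sensitivity, specificity = specificity)
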
#AUC
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Compute AUC for predicting Class with the model
prob <- predict(glm.fit, newdata=test, type="response")
pred <- prediction(prob, test$Class)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)

auc <- performance(pred, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.9957837

K-Nearest Neighbors


library(class)
# knn has no separate training step: it predicts the test labels directly from the training data
prediction_knn <- knn(train[,2:10], test[,2:10], train$Class, k=4)
# Confusion matrix
table(prediction_knn, test$Class)
##               
## prediction_knn benign malignant
##      benign        95         3
##      malignant      1        39
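
The choice k = 4 above is arbitrary. A simple way to explore the effect of k is to compare accuracy over a range of values; a sketch (in practice you would tune k by cross-validation on the training data rather than on the test set):

# Try several values of k and record the test-set accuracy of each
k_values <- 1:15
accuracies <- sapply(k_values, function(k) {
  pred <- knn(train[,2:10], test[,2:10], train$Class, k = k)
  mean(pred == test$Class)
})
plot(k_values, accuracies, type = "b", xlab = "k", ylab = "Test accuracy")
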

Performance Estimation


Here we study performance estimation through sampling methods.

# If you do not have some of these packages installed, run the install.packages() command for them before the corresponding library() command the first time you run this code. 
  library(RCurl)
## Loading required package: bitops
  library(tidyverse)
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## complete(): tidyr, RCurl
## filter():   dplyr, stats
## lag():      dplyr, stats
## lift():     purrr, caret
  #install.packages("rafalib")
  library(rafalib)
# Download data
  URL_text_1 <- "https://raw.githubusercontent.com/kannan-kasthuri/kannan-kasthuri.github.io"
  URL_text_2 <- "/master/Datasets/HANES/NYC_HANES_DIAB.csv"
  URL <- paste(URL_text_1,URL_text_2, sep="")
  HANES <- read.csv(text=getURL(URL))
  HANES$GENDER <- factor(HANES$GENDER, labels=c("M","F"))
  HANES$AGEGROUP <- factor(HANES$AGEGROUP, labels=c("20-39","40-59","60+"))
  HANES$HSQ_1 <- factor(HANES$HSQ_1, labels=c("Excellent","Very Good","Good", "Fair", "Poor"))
  HANES$DX_DBTS <- factor(HANES$DX_DBTS, labels=c("DIAB","DIAB NO_DX","NO DIAB"))
  HANES <- na.omit(HANES)
  str(HANES)
## 'data.frame':    1112 obs. of  23 variables:
##  $ KEY              : Factor w/ 1527 levels "133370A","133370B",..: 28 43 44 53 55 70 84 90 100 107 ...
##  $ GENDER           : Factor w/ 2 levels "M","F": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SPAGE            : int  29 28 27 24 30 26 31 32 34 32 ...
##  $ AGEGROUP         : Factor w/ 3 levels "20-39","40-59",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ HSQ_1            : Factor w/ 5 levels "Excellent","Very Good",..: 2 2 2 1 1 3 1 2 1 3 ...
##  $ UCREATININE      : int  105 53 314 105 163 150 46 36 177 156 ...
##  $ UALBUMIN         : num  0.707 1 8 4 3 2 2 0.707 4 3 ...
##  $ UACR             : num  0.00673 2 3 4 2 ...
##  $ MERCURYU         : num  0.37 0.106 0.487 2.205 0.979 ...
##  $ DX_DBTS          : Factor w/ 3 levels "DIAB","DIAB NO_DX",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ A1C              : num  5 5.2 4.8 5.1 4.3 5.2 4.8 5.2 4.8 5.2 ...
##  $ CADMIUM          : num  0.2412 0.1732 0.0644 0.0929 0.1202 ...
##  $ LEAD             : num  1.454 1.019 0.863 1.243 0.612 ...
##  $ MERCURYTOTALBLOOD: num  2.34 2.57 1.32 14.66 2.13 ...
##  $ HDL              : int  42 51 42 61 52 50 57 56 42 44 ...
##  $ CHOLESTEROLTOTAL : int  184 157 145 206 120 155 156 235 156 120 ...
##  $ GLUCOSESI        : num  4.61 4.77 5.16 5 5.11 ...
##  $ CREATININESI     : num  74.3 73 80 84.9 66 ...
##  $ CREATININE       : num  0.84 0.83 0.91 0.96 0.75 0.99 0.9 0.84 0.93 1.09 ...
##  $ TRIGLYCERIDE     : int  156 43 108 65 51 29 31 220 82 35 ...
##  $ GLUCOSE          : int  83 86 93 90 92 85 72 87 96 92 ...
##  $ COTININE         : num  31.5918 0.0635 0.035 0.0514 0.035 ...
##  $ LDLESTIMATE      : int  111 97 81 132 58 99 93 135 98 69 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:415] 2 15 16 24 26 28 33 34 35 39 ...
##   .. ..- attr(*, "names")= chr [1:415] "2" "15" "16" "24" ...

Sampling Methods


The validation set approach

set.seed(1) # for reproducibility
# Choose the training set as half of the data
train=sample(1112,556)

# Train linear regression only on the train set using the "subset" option
lm.fit = lm(CHOLESTEROLTOTAL~ LDLESTIMATE ,data=HANES,subset=train)

# Estimate the MSE on the test set
# attach(HANES) needs to be run once so that the columns can be referenced by name below
attach(HANES) 

mean((CHOLESTEROLTOTAL-predict(lm.fit,HANES))[-train]^2)
## [1] 295.9421
# What if we choose a different train set?
set.seed(2)
train=sample(1112,556)
lm.fit=lm(CHOLESTEROLTOTAL~ LDLESTIMATE ,data=HANES,subset=train)
#attach(HANES)
mean((CHOLESTEROLTOTAL-predict(lm.fit,HANES))[-train]^2)
## [1] 278.2441

It can be observed that with this method the test-set MSE is highly variable and depends directly on which observations end up in the initial half-and-half split of the data.
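
To see this variability directly, you can repeat the random split several times and look at the spread of the resulting test MSEs; a sketch:

# Repeat the half/half split a few times and collect the test MSEs
mse_values <- sapply(1:10, function(i) {
  tr <- sample(1112, 556)
  fit <- lm(CHOLESTEROLTOTAL ~ LDLESTIMATE, data = HANES, subset = tr)
  mean((HANES$CHOLESTEROLTOTAL - predict(fit, HANES))[-tr]^2)
})
summary(mse_values)
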


Leave-One-Out Cross-Validation (LOOCV)


library(boot)
## 
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
## 
##     melanoma
# Fit a linear regression model, using the glm() function this time
glm.fit=glm(CHOLESTEROLTOTAL~ LDLESTIMATE ,data=HANES)
# Notice that there is no need to use a train and a test set this time. Why?

# Perform LOOCV 
cv.err=cv.glm(HANES,glm.fit) #might take some time
cv.err$delta # delta contains the LOOCV estimates of the test MSE (raw and adjusted)
## [1] 290.2582 290.2577

The first number is the raw LOOCV estimate of the test MSE and the second is a bias-corrected version of it.
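
LOOCV is just K-fold cross-validation with K equal to the number of observations, so the same estimate can be requested explicitly; a sketch:

# LOOCV written as K-fold CV with K = n (should reproduce cv.err$delta; slow, like the default call)
cv.glm(HANES, glm.fit, K = nrow(HANES))$delta
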


K-fold Cross-Validation


# 10-fold CV
set.seed(17)
cv.error.10=rep(0,10) # create an empty vector to store the error values
for (i in 1:10){ # Try different order polynomials
  glm.fit=glm(CHOLESTEROLTOTAL~ poly(LDLESTIMATE,i) ,data=HANES)
  cv.error.10[i]=cv.glm(HANES,glm.fit,K=10)$delta[1] #extract the MSE error
}
#cv.error.10

plot(seq(1,10,1),cv.error.10, xlab = "Order of polynomial",ylab = "MSE")
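
The polynomial order with the smallest cross-validated error can be read off directly, e.g.:

# Order of the polynomial with the lowest 10-fold CV error
which.min(cv.error.10)
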


Regularization


We will now study two regularization methods: ridge regression and the lasso.

library(glmnet) # Library for regularization
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded glmnet 2.0-13
# Make sure you have removed missing values from the data
# We have already done this at the beginning

# Select train and test sets
# Split our data into train and test sets - 90% train, 10% test
index <- sample(1:nrow(HANES),round(0.90*nrow(HANES)))
train <- HANES[index,]
test <- HANES[-index,]

Ridge Regression


# glmnet uses a special syntax: a predictor matrix x and a response vector y
train$KEY <- NULL

x=model.matrix(CHOLESTEROLTOTAL~.,train)[,-1] # includes the predictors
y=train$CHOLESTEROLTOTAL # includes the variable we wish to predict

# Set lambda grid to explore all the models
grid=10^seq(10,-2,length=100)
ridge.mod<- glmnet(x, y, alpha=0, lambda = grid) 
# alpha = 0 -> L2 penalty -> ridge regression
# (for a binary outcome you would also pass family = "binomial")

dim(coef(ridge.mod))
## [1]  27 100
# Plot coefficients
plot(ridge.mod, xvar="lambda",main='Ridge')

set.seed(1)
# Perform cross validation to find the best value of lambda
cv.ridge <- cv.glmnet(x, y, alpha=0,nfolds=10)
plot(cv.ridge)

bestlam=cv.ridge$lambda.min
bestlam
## [1] 3.710213
# Compute the MSE on the test set using the best lambda
test$KEY <- NULL
x.test <- model.matrix(CHOLESTEROLTOTAL~.,test)[,-1] # includes the predictors
y.test <- test$CHOLESTEROLTOTAL

ridge.pred=predict(ridge.mod,s=bestlam,newx=x.test)
mean((ridge.pred-y.test)^2)
## [1] 12.33512
# What if we use a non-optimal lambda?
ridge.pred=predict(ridge.mod,s=15,newx=x.test)
mean((ridge.pred-y.test)^2)
## [1] 111.0305
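
Unlike the lasso below, ridge regression shrinks coefficients towards zero but does not set any of them exactly to zero; you can check this by inspecting the coefficients at the selected lambda. A sketch:

# Ridge coefficients at the cross-validated lambda: shrunken, but none exactly zero
ridge.coef <- predict(ridge.mod, type = "coefficients", s = bestlam)
sum(as.numeric(ridge.coef) != 0) # expected to equal the total number of coefficients (27 here)
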

Lasso


lasso.mod<- glmnet(x, y, alpha=1, lambda = grid) 
# Plot coefficients
plot(lasso.mod, xvar="lambda",main='Lasso')

set.seed(1)
# Perform cross validation to find the best value of lambda
cv.lasso <- cv.glmnet(x, y, alpha=1,nfolds=10)
plot(cv.lasso)

bestlam=cv.lasso$lambda.min
bestlam
## [1] 0.6189008
# Compute the MSE on the test set using the best lambda
lasso.pred=predict(lasso.mod,s=bestlam,newx=x.test)
mean((lasso.pred-y.test)^2)
## [1] 1.156176
# What are the coefficients of the best model? (only the first 20 of the 27 are printed here)
lasso.coef=predict(lasso.mod,type="coefficients",s=bestlam)[1:20,] 
lasso.coef
##       (Intercept)           GENDERF             SPAGE     AGEGROUP40-59 
##         6.5668852         0.0000000         0.0000000         0.0000000 
##       AGEGROUP60+    HSQ_1Very Good         HSQ_1Good         HSQ_1Fair 
##         0.0000000         0.0000000         0.0000000         0.0000000 
##         HSQ_1Poor       UCREATININE          UALBUMIN              UACR 
##         0.0000000         0.0000000         0.0000000         0.0000000 
##          MERCURYU DX_DBTSDIAB NO_DX    DX_DBTSNO DIAB               A1C 
##         0.0000000         0.0000000         0.0000000         0.0000000 
##           CADMIUM              LEAD MERCURYTOTALBLOOD               HDL 
##         0.0000000         0.0000000         0.0000000         0.9413246
lasso.coef[lasso.coef!=0]
## (Intercept)         HDL 
##   6.5668852   0.9413246
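
Besides lambda.min, cv.glmnet also reports lambda.1se, the largest lambda whose cross-validated error is within one standard error of the minimum; it is a common, more conservative choice that typically yields an even sparser model:

# A more heavily regularized (and usually sparser) alternative to lambda.min
cv.lasso$lambda.1se
predict(lasso.mod, type = "coefficients", s = cv.lasso$lambda.1se)
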

In this particular example, if you compute the mean squared error using plain linear regression you will see that the regularization methods did not really improve on it. This is due to the particular data set used in this lab: the number of samples (1001 in the train set) is more than enough to estimate the coefficients (21 variables) accurately without overfitting. In more “real life” applications, where the number of predictors is large relative to the number of samples, regularization methods can substantially reduce overfitting and improve your model’s performance.
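
If you want to verify this, here is a minimal sketch of the ordinary least squares baseline on the same train/test split (factor levels that appear only in the small test set could, in principle, make predict() complain):

# Baseline: unregularized linear regression on the same split
lm.full <- lm(CHOLESTEROLTOTAL ~ ., data = train)
lm.pred <- predict(lm.full, newdata = test)
mean((lm.pred - y.test)^2)
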