In this lab we are going to explore classification techniques and apply them to biomedical data. We will study Logistic Regression and K-Nearest Neighbors. Let’s begin!
For our lab today we are going to use the Wisconsin Breast Cancer data set.
library(mlbench)
data(BreastCancer)
str(BreastCancer)
## 'data.frame': 699 obs. of 11 variables:
## $ Id : chr "1000025" "1002945" "1015425" "1016277" ...
## $ Cl.thickness : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ...
## $ Cell.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ...
## $ Cell.shape : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ...
## $ Marg.adhesion : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ...
## $ Epith.c.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.nuclei : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ...
## $ Bl.cromatin : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ...
## $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ...
## $ Mitoses : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
# The predictors are stored as (ordered) factors; convert them to numeric values
BreastCancer$Cl.thickness <- as.numeric(as.character(BreastCancer$Cl.thickness))
BreastCancer$Cell.size <- as.numeric(as.character(BreastCancer$Cell.size))
BreastCancer$Cell.shape <- as.numeric(as.character(BreastCancer$Cell.shape))
BreastCancer$Marg.adhesion <- as.numeric(as.character(BreastCancer$Marg.adhesion))
BreastCancer$Epith.c.size <- as.numeric(as.character(BreastCancer$Epith.c.size))
BreastCancer$Bare.nuclei <- as.numeric(as.character(BreastCancer$Bare.nuclei))
BreastCancer$Bl.cromatin <- as.numeric(as.character(BreastCancer$Bl.cromatin))
BreastCancer$Normal.nucleoli <- as.numeric(as.character(BreastCancer$Normal.nucleoli))
BreastCancer$Mitoses <- as.numeric(as.character(BreastCancer$Mitoses))
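The nine conversions above can also be written more compactly; a minimal sketch that applies the same as.numeric(as.character(...)) transformation to columns 2 through 10 in one step (equivalent to the assignments above):
# Convert all nine predictor columns (2 to 10) from factor to numeric in one step
BreastCancer[, 2:10] <- lapply(BreastCancer[, 2:10], function(col) as.numeric(as.character(col)))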
str(BreastCancer)
## 'data.frame': 699 obs. of 11 variables:
## $ Id : chr "1000025" "1002945" "1015425" "1016277" ...
## $ Cl.thickness : num 5 5 3 6 4 8 1 2 2 4 ...
## $ Cell.size : num 1 4 1 8 1 10 1 1 1 2 ...
## $ Cell.shape : num 1 4 1 8 1 10 1 2 1 1 ...
## $ Marg.adhesion : num 1 5 1 1 3 8 1 1 1 1 ...
## $ Epith.c.size : num 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.nuclei : num 1 10 2 4 1 10 10 1 1 1 ...
## $ Bl.cromatin : num 3 3 3 3 3 9 3 3 1 2 ...
## $ Normal.nucleoli: num 1 2 1 7 1 7 1 1 1 1 ...
## $ Mitoses : num 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
# Separate our data to train and test set - 80% train, 20% test
index <- sample(1:nrow(BreastCancer),round(0.80*nrow(BreastCancer)))
train <- BreastCancer[index,]
test <- BreastCancer[-index,]
train <- train[complete.cases(train),]
test <- test[complete.cases(test),]
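Note that sample() produces a different split on every run; if you want a reproducible split, call set.seed() with an arbitrary seed before sampling. It is also a good habit to check the class balance of the two parts, for example:
# set.seed(123)  # arbitrary seed; place before the call to sample() for reproducibility
table(train$Class)  # class counts in the training set
table(test$Class)   # class counts in the test set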
# Fit logistic regression on the train data
glm.fit <- glm(Class ~ Cell.size + Cell.shape + Bare.nuclei + Cl.thickness + Marg.adhesion + Epith.c.size + Bl.cromatin + Normal.nucleoli + Mitoses , data = train, family = binomial)
summary(glm.fit)
##
## Call:
## glm(formula = Class ~ Cell.size + Cell.shape + Bare.nuclei +
## Cl.thickness + Marg.adhesion + Epith.c.size + Bl.cromatin +
## Normal.nucleoli + Mitoses, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3923 -0.0991 -0.0545 0.0201 2.6784
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.79400 1.46509 -7.367 1.74e-13 ***
## Cell.size 0.07782 0.23365 0.333 0.739102
## Cell.shape 0.32807 0.25987 1.262 0.206787
## Bare.nuclei 0.34216 0.10211 3.351 0.000805 ***
## Cl.thickness 0.55983 0.16233 3.449 0.000563 ***
## Marg.adhesion 0.32212 0.13044 2.469 0.013531 *
## Epith.c.size 0.21798 0.17319 1.259 0.208164
## Bl.cromatin 0.52036 0.20599 2.526 0.011533 *
## Normal.nucleoli 0.10232 0.11971 0.855 0.392683
## Mitoses 0.55189 0.37324 1.479 0.139236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 713.141 on 544 degrees of freedom
## Residual deviance: 77.386 on 535 degrees of freedom
## AIC: 97.386
##
## Number of Fisher Scoring iterations: 8
# family = binomial tells R to fit a logistic regression rather than some other type of
# generalized linear model
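Because the coefficients are on the log-odds scale, exponentiating them gives odds ratios, which are often easier to interpret; a quick sketch:
# Odds ratios: the multiplicative change in the odds of malignancy per unit increase in each predictor
exp(coef(glm.fit))
# exp(confint(glm.fit))  # approximate 95% confidence intervals (may be slow and emit warnings)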
# Variable importance
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
varImp(glm.fit)
## Overall
## Cell.size 0.3330425
## Cell.shape 1.2624504
## Bare.nuclei 3.3510511
## Cl.thickness 3.4486444
## Marg.adhesion 2.4694797
## Epith.c.size 1.2586292
## Bl.cromatin 2.5261158
## Normal.nucleoli 0.8547621
## Mitoses 1.4786408
# How well does the model perform on the test data?
# Find probabilities for each data point in the test set
probabilities <- predict(glm.fit, newdata = test,type = "response")
# Convert to predictions
contrasts(test$Class)
## malignant
## benign 0
## malignant 1
predictions <- rep("benign", nrow(test))
predictions[probabilities >0.5] = "malignant"
# Create confusion matrix
confusion_matrix <- table(predictions, test$Class)
confusion_matrix
##
## predictions benign malignant
## benign 94 4
## malignant 2 38
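From this confusion matrix we can compute standard summary measures such as accuracy, sensitivity and specificity; a minimal sketch, treating "malignant" as the positive class (the caret function confusionMatrix() reports the same quantities):
# Overall accuracy: fraction of test observations classified correctly
mean(predictions == test$Class)
# Sensitivity: fraction of malignant tumors correctly flagged as malignant
sum(predictions == "malignant" & test$Class == "malignant") / sum(test$Class == "malignant")
# Specificity: fraction of benign tumors correctly flagged as benign
sum(predictions == "benign" & test$Class == "benign") / sum(test$Class == "benign")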
#AUC
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# Compute AUC for predicting Class with the model
prob <- predict(glm.fit, newdata=test, type="response")
pred <- prediction(prob, test$Class)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)
auc <- performance(pred, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.9957837
library(class)
# KNN has no separate training step: knn() predicts the labels of the test set directly
prediction_knn <- knn(train[,2:10], test[,2:10], train$Class, k=4)
# Confusion matrix
table(prediction_knn, test$Class)
##
## prediction_knn benign malignant
## benign 95 3
## malignant 1 39
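The value k = 4 above was chosen arbitrarily. A small sketch that tries a range of k values and records the resulting test-set accuracy (in a real analysis you would tune k by cross-validation on the training data rather than on the test set):
# Test-set accuracy of KNN for k = 1, ..., 15
k_values <- 1:15
accuracy_knn <- sapply(k_values, function(k) {
  pred_k <- knn(train[, 2:10], test[, 2:10], train$Class, k = k)
  mean(pred_k == test$Class)
})
plot(k_values, accuracy_knn, type = "b", xlab = "k", ylab = "Test set accuracy")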
Here we study performance estimation through resampling methods: the validation set approach, leave-one-out cross-validation, and k-fold cross-validation.
# If you do not have some of these packages installed, run the install.packages() command
# before the corresponding library() command the first time you run this code.
library(RCurl)
## Loading required package: bitops
library(tidyverse)
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## complete(): tidyr, RCurl
## filter(): dplyr, stats
## lag(): dplyr, stats
## lift(): purrr, caret
#install.packages("rafalib")
library(rafalib)
# Download data
URL_text_1 <- "https://raw.githubusercontent.com/kannan-kasthuri/kannan-kasthuri.github.io"
URL_text_2 <- "/master/Datasets/HANES/NYC_HANES_DIAB.csv"
URL <- paste(URL_text_1,URL_text_2, sep="")
HANES <- read.csv(text=getURL(URL))
HANES$GENDER <- factor(HANES$GENDER, labels=c("M","F"))
HANES$AGEGROUP <- factor(HANES$AGEGROUP, labels=c("20-39","40-59","60+"))
HANES$HSQ_1 <- factor(HANES$HSQ_1, labels=c("Excellent","Very Good","Good", "Fair", "Poor"))
HANES$DX_DBTS <- factor(HANES$DX_DBTS, labels=c("DIAB","DIAB NO_DX","NO DIAB"))
HANES <- na.omit(HANES)
str(HANES)
## 'data.frame': 1112 obs. of 23 variables:
## $ KEY : Factor w/ 1527 levels "133370A","133370B",..: 28 43 44 53 55 70 84 90 100 107 ...
## $ GENDER : Factor w/ 2 levels "M","F": 1 1 1 1 1 1 1 1 1 1 ...
## $ SPAGE : int 29 28 27 24 30 26 31 32 34 32 ...
## $ AGEGROUP : Factor w/ 3 levels "20-39","40-59",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ HSQ_1 : Factor w/ 5 levels "Excellent","Very Good",..: 2 2 2 1 1 3 1 2 1 3 ...
## $ UCREATININE : int 105 53 314 105 163 150 46 36 177 156 ...
## $ UALBUMIN : num 0.707 1 8 4 3 2 2 0.707 4 3 ...
## $ UACR : num 0.00673 2 3 4 2 ...
## $ MERCURYU : num 0.37 0.106 0.487 2.205 0.979 ...
## $ DX_DBTS : Factor w/ 3 levels "DIAB","DIAB NO_DX",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ A1C : num 5 5.2 4.8 5.1 4.3 5.2 4.8 5.2 4.8 5.2 ...
## $ CADMIUM : num 0.2412 0.1732 0.0644 0.0929 0.1202 ...
## $ LEAD : num 1.454 1.019 0.863 1.243 0.612 ...
## $ MERCURYTOTALBLOOD: num 2.34 2.57 1.32 14.66 2.13 ...
## $ HDL : int 42 51 42 61 52 50 57 56 42 44 ...
## $ CHOLESTEROLTOTAL : int 184 157 145 206 120 155 156 235 156 120 ...
## $ GLUCOSESI : num 4.61 4.77 5.16 5 5.11 ...
## $ CREATININESI : num 74.3 73 80 84.9 66 ...
## $ CREATININE : num 0.84 0.83 0.91 0.96 0.75 0.99 0.9 0.84 0.93 1.09 ...
## $ TRIGLYCERIDE : int 156 43 108 65 51 29 31 220 82 35 ...
## $ GLUCOSE : int 83 86 93 90 92 85 72 87 96 92 ...
## $ COTININE : num 31.5918 0.0635 0.035 0.0514 0.035 ...
## $ LDLESTIMATE : int 111 97 81 132 58 99 93 135 98 69 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:415] 2 15 16 24 26 28 33 34 35 39 ...
## .. ..- attr(*, "names")= chr [1:415] "2" "15" "16" "24" ...
The validation set approach
set.seed(1) # for reproducibility
# Choose the training set as half of the data
train=sample(1112,556)
# Train linear regression only on the train set using the "subset" option
lm.fit = lm(CHOLESTEROLTOTAL~ LDLESTIMATE ,data=HANES,subset=train)
# Predict the MSE on the test set
# attach(HANES) makes the columns of HANES accessible by name; run it once before the next command
attach(HANES)
mean((CHOLESTEROLTOTAL-predict(lm.fit,HANES))[-train]^2)
## [1] 295.9421
# What if we choose a different training set?
set.seed(2)
train=sample(1112,556)
lm.fit=lm(CHOLESTEROLTOTAL~ LDLESTIMATE ,data=HANES,subset=train)
#attach(HANES)
mean((CHOLESTEROLTOTAL-predict(lm.fit,HANES))[-train]^2)
## [1] 278.2441
It can be observed that with this method the test-set MSE is highly variable: it depends directly on which observations happen to fall into the half of the data used for training.
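We can see this variability directly by repeating the random split several times and recording the test MSE of each repetition; a brief sketch:
# Repeat the 50/50 split ten times and store the resulting test MSE
val_mse <- rep(0, 10)
for (i in 1:10) {
  set.seed(i)
  train_i <- sample(1112, 556)
  fit_i <- lm(CHOLESTEROLTOTAL ~ LDLESTIMATE, data = HANES, subset = train_i)
  val_mse[i] <- mean((HANES$CHOLESTEROLTOTAL - predict(fit_i, HANES))[-train_i]^2)
}
val_mse  # the spread of these values shows how much the estimate depends on the split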
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
#Fit a linear regression model using glm function this time
glm.fit=glm(CHOLESTEROLTOTAL~ LDLESTIMATE ,data=HANES)
# Notice that there is no need to use a train and a test set this time. Why?
# Perform LOOCV
cv.err=cv.glm(HANES,glm.fit) #might take some time
cv.err$delta # delta holds the cross-validation estimates of the MSE (here, for LOOCV)
## [1] 290.2582 290.2577
The first number is the raw LOOCV estimate of the MSE and the second is a bias-adjusted version of it.
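To see what cv.glm() computes, LOOCV can also be written as an explicit loop that leaves out one observation at a time, refits the model, and predicts the held-out point; a sketch (much slower than cv.glm, shown for illustration only):
# Manual leave-one-out cross-validation for the same model
n <- nrow(HANES)
loocv_errors <- rep(0, n)
for (i in 1:n) {
  fit_i <- lm(CHOLESTEROLTOTAL ~ LDLESTIMATE, data = HANES[-i, ])  # fit without observation i
  pred_i <- predict(fit_i, newdata = HANES[i, ])                   # predict the held-out observation
  loocv_errors[i] <- (HANES$CHOLESTEROLTOTAL[i] - pred_i)^2
}
mean(loocv_errors)  # should agree with the first (raw) delta value above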
# 10-fold CV
set.seed(17)
cv.error.10=rep(0,10) # create an empty vector to store the error values
for (i in 1:10){ # Try polynomials of order 1 to 10
glm.fit=glm(CHOLESTEROLTOTAL~ poly(LDLESTIMATE,i) ,data=HANES)
cv.error.10[i]=cv.glm(HANES,glm.fit,K=10)$delta[1] #extract the MSE error
}
#cv.error.10
plot(seq(1,10,1),cv.error.10, xlab = "Order of polynomial",ylab = "MSE")
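The polynomial order with the lowest estimated test MSE can be read off the plot, or extracted directly:
which.min(cv.error.10)              # order of the polynomial with the smallest CV error
cv.error.10[which.min(cv.error.10)] # the corresponding 10-fold CV estimate of the MSE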
We will now study regularization methods - Ridge regression and Lasso
library(glmnet) # Library for regularization
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded glmnet 2.0-13
# Make sure you have removed missing values from the data
# We have already done this at the beginning
# Select train and test sets
# Separate our data to train and test set - 90% train, 10% test
index <- sample(1:nrow(HANES),round(0.90*nrow(HANES)))
train <- HANES[index,]
test <- HANES[-index,]
# glmnet uses a special syntax: a matrix x of predictors and a response vector y
# Remove the ID column before building the model matrix
train$KEY <- NULL
x=model.matrix(CHOLESTEROLTOTAL~.,train)[,-1] # includes the predictors
y=train$CHOLESTEROLTOTAL # includes the variable we wish to predict
# Set lambda grid to explore all the models
grid=10^seq(10,-2,length=100)
ridge.mod<- glmnet(x, y, alpha=0, lambda = grid)
# alpha = 0 -> use the L2 penalty -> ridge regression
# (for a classification outcome you would also set family = "binomial")
dim(coef(ridge.mod))
## [1] 27 100
# Plot coefficients
plot(ridge.mod, xvar="lambda",main='Ridge')
set.seed(1)
# Perform cross validation to find the best value of lambda
cv.ridge <- cv.glmnet(x, y, alpha=0,nfolds=10)
plot(cv.ridge)
bestlam=cv.ridge$lambda.min
bestlam
## [1] 3.710213
# Predict the MSE on the test set using the best lambda
test$KEY <- NULL
x.test <- model.matrix(CHOLESTEROLTOTAL~.,test)[,-1] # includes the predictors
y.test <- test$CHOLESTEROLTOTAL
ridge.pred=predict(ridge.mod,s=bestlam,newx=x.test)
mean((ridge.pred-y.test)^2)
## [1] 12.33512
# What if we had used a non-optimal lambda?
ridge.pred=predict(ridge.mod,s=15,newx=x.test)
mean((ridge.pred-y.test)^2)
## [1] 111.0305
# alpha = 1 -> use the L1 penalty -> the lasso
lasso.mod<- glmnet(x, y, alpha=1, lambda = grid)
# Plot coefficients
plot(lasso.mod, xvar="lambda",main='Lasso')
set.seed(1)
# Perform cross validation to find the best value of lambda
cv.lasso <- cv.glmnet(x, y, alpha=1,nfolds=10)
plot(cv.lasso)
bestlam=cv.lasso$lambda.min
bestlam
## [1] 0.6189008
# Predict the MSE on the test set using the best lambda
lasso.pred=predict(lasso.mod,s=bestlam,newx=x.test)
mean((lasso.pred-y.test)^2)
## [1] 1.156176
# What are the coefficients of the best model?
# (only the first 20 of the 27 coefficients are displayed below; drop the [1:20,] subset to see all of them)
lasso.coef=predict(lasso.mod,type="coefficients",s=bestlam)[1:20,]
lasso.coef
## (Intercept) GENDERF SPAGE AGEGROUP40-59
## 6.5668852 0.0000000 0.0000000 0.0000000
## AGEGROUP60+ HSQ_1Very Good HSQ_1Good HSQ_1Fair
## 0.0000000 0.0000000 0.0000000 0.0000000
## HSQ_1Poor UCREATININE UALBUMIN UACR
## 0.0000000 0.0000000 0.0000000 0.0000000
## MERCURYU DX_DBTSDIAB NO_DX DX_DBTSNO DIAB A1C
## 0.0000000 0.0000000 0.0000000 0.0000000
## CADMIUM LEAD MERCURYTOTALBLOOD HDL
## 0.0000000 0.0000000 0.0000000 0.9413246
lasso.coef[lasso.coef!=0]
## (Intercept) HDL
## 6.5668852 0.9413246
In this particular example, if you compute the mean squared error using simple linear regression you will see that the regularization methods did not really improve on it. This is because of the particular data used in this lab: the number of samples (1001 in the train set) is more than enough to estimate the coefficients (21 variables) accurately without overfitting. In more “real life” applications, with fewer samples or many more predictors, regularization methods can substantially reduce overfitting and improve your model’s performance.
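To check this for yourself, fit an ordinary least squares model on the same training set and compare its test MSE with the ridge and lasso results above; a minimal sketch using the train/test split already defined (the KEY column has already been removed from both parts):
# Ordinary (unregularized) linear regression on all predictors
lm.full <- lm(CHOLESTEROLTOTAL ~ ., data = train)
lm.pred <- predict(lm.full, newdata = test)
mean((lm.pred - test$CHOLESTEROLTOTAL)^2)  # test MSE to compare with the ridge and lasso values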