---
title: |
  | STAT 408 - Statistical Learning
  | Clustering
date: "April 3, 2018"
output:
  beamer_presentation:
    theme: "PaloAlto"
    fonttheme: "structuresmallcapsserif"
---

```{r setup, include=FALSE}
library(ggplot2)
library(dplyr)
library(knitr)
library(randomForest)
library(maps)
library(plotrix)
library(mnormt)
library(rpart)
knitr::opts_chunk$set(echo = TRUE)
# Chunk hook used via `mysize=TRUE, size='\\scriptsize'` (etc.) in chunk
# headers: before the chunk is rendered it returns the chunk's `size` option
# so that text is written into the output ahead of the chunk.
knitr::knit_hooks$set(mysize = function(before, options, envir) {
  if (before) {
    return(options$size)
  }
})
```

# Unsupervised Learning

## Supervised vs. Unsupervised Learning
```{r, echo=FALSE, fig.align='center'}
# Simulate three bivariate-normal clusters (25 + 25 + 50 points).
# cluster3 uses the same covariance scale (.03) as the labelled plot on the
# next slide so that both slides show exactly the same points.
set.seed(10)
cluster1 <- rmnorm(n = 25, mean = c(.3, .2), varcov = diag(2) * .025)
cluster2 <- rmnorm(n = 25, mean = c(.15, .75), varcov = diag(2) * .025)
cluster3 <- rmnorm(n = 50, mean = c(.75, .6), varcov = diag(2) * .03)

# Unlabelled view: every point drawn in the same translucent grey.
plot(rbind(cluster1, cluster2, cluster3), pch = 16,
     col = rgb(175, 175, 175, 150, maxColorValue = 255),
     axes = FALSE, xlab = '', ylab = '', cex = 1.4)
box()
```

## Supervised
```{r, echo=FALSE, fig.align='center'}
# Reuses cluster1-cluster3 from the previous chunk (same seed, same draws).
# Labelled view: cluster1 and cluster2 are plotted as '1', cluster3 as '0'.
plot(rbind(cluster1, cluster2, cluster3), type = 'n', axes = FALSE,
     xlab = '', ylab = '')
points(cluster1, pch = '1', col = 'dodgerblue')
points(cluster2, pch = '1', col = 'dodgerblue')
points(cluster3, pch = '0', col = 'firebrick4')
box()
```

## Unsupervised - How many clusters?
```{r, echo=FALSE, fig.align='center'}
# Simulate the three bivariate-normal clusters (25 + 25 + 50 points) that the
# remaining slides cluster. cluster3 uses covariance scale .03 here as well,
# so this grey plot shows the same points as the coloured plots that follow.
set.seed(10)
cluster1 <- rmnorm(n = 25, mean = c(.3, .2), varcov = diag(2) * .025)
cluster2 <- rmnorm(n = 25, mean = c(.15, .75), varcov = diag(2) * .025)
cluster3 <- rmnorm(n = 50, mean = c(.75, .6), varcov = diag(2) * .03)
# Rows 1-25 are cluster1, 26-50 cluster2, 51-100 cluster3; the colour vectors
# used on later slides rely on this row order.
combined <- rbind(cluster1, cluster2, cluster3)

# Unlabelled view: every point drawn in the same translucent grey.
plot(combined, pch = 16,
     col = rgb(175, 175, 175, 150, maxColorValue = 255),
     axes = FALSE, xlab = '', ylab = '', cex = 1.4)
box()
```

## Unsupervised
```{r, echo=FALSE, fig.align='center'}
# Same points as the previous slide, coloured by the cluster each point was
# simulated from.
plot(combined, type = 'n', axes = FALSE, xlab = '', ylab = '')
points(cluster1, pch = 16, col = 'dodgerblue')
points(cluster2, pch = 16, col = 'forestgreen')
points(cluster3, pch = 16, col = 'firebrick4')
box()
```

## k-means clustering
```{r, echo=FALSE}
km <- kmeans(combined, 3)
plot(combined, type = 'n', axes = FALSE, xlab = '', ylab = '')
box()
# Plotting symbol = k-means label; colour = cluster the point was simulated
# from.
points(combined, pch = as.character(km$cluster),
       col = c(rep('dodgerblue', 25),
               rep('forestgreen', 25),
               rep('firebrick4', 50)))
# Hard-coded circles (x, y, radius), one per colour.
# NOTE(review): positions look hand-tuned for this seed - re-check them if the
# simulated data change.
draw.circle(.31, -0.1, .335, border = 'dodgerblue')
draw.circle(.79, .65, .3, border = 'firebrick4')
draw.circle(.14, 1.05, .3, border = 'forestgreen')
```

## k-means clustering
```{r, mysize=TRUE, size='\\scriptsize', echo=FALSE}
# Print the fitted k-means object from the previous slide.
km
```

## k-means clustering - code
```{r, eval=FALSE}
km <- kmeans(combined, 3)
plot(combined, type = 'n', axes = FALSE, xlab = '', ylab = '')
box()
points(combined, pch = as.character(km$cluster),
       col = c(rep('dodgerblue', 25),
               rep('forestgreen', 25),
               rep('firebrick4', 50)))
draw.circle(.31, -0.1, .335, border = 'dodgerblue')
draw.circle(.79, .65, .3, border = 'firebrick4')
draw.circle(.14, 1.05, .3, border = 'forestgreen')
```

## Hierarchical clustering
```{r, mysize=TRUE, size='\\footnotesize', echo=FALSE}
# Hierarchical clustering of the simulated points using the default settings
# of dist() and hclust(); `hc` is reused by the next two slides.
hc <- hclust(dist(combined))
plot(hc, hang = -1)
```

## Hierarchical clustering - with 3 clusters
```{r, mysize=TRUE, size='\\footnotesize', echo=FALSE}
plot(combined, type = 'n', axes = FALSE, xlab = '', ylab = '')
box()
# Plotting symbol = label after cutting the tree into 3 groups; colour =
# cluster the point was simulated from.
points(combined, pch = as.character(cutree(hc, 3)),
       col = c(rep('dodgerblue', 25),
               rep('forestgreen', 25),
               rep('firebrick4', 50)))
```

## Hierarchical clustering - with 4 clusters
```{r, mysize=TRUE, size='\\footnotesize', echo=FALSE}
plot(combined, type = 'n', axes = FALSE, xlab = '', ylab = '')
box()
# Same display with the tree cut into 4 groups.
points(combined, pch = as.character(cutree(hc, 4)),
       col = c(rep('dodgerblue', 25),
               rep('forestgreen', 25),
               rep('firebrick4', 50)))
```

## Hierarchical clustering - code
```{r, mysize=TRUE, size='\\footnotesize', eval=FALSE}
hc <- hclust(dist(combined))
plot(hc, hang = -1)
plot(combined, type = 'n', axes = FALSE, xlab = '', ylab = '')
box()
points(combined, pch = as.character(cutree(hc, 4)),
       col = c(rep('dodgerblue', 25),
               rep('forestgreen', 25),
               rep('firebrick4', 50)))
```

## How to choose the number of clusters?

Given these plots that we have seen, how do we choose the \emph{appropriate} number of clusters?

## How to choose the number of clusters? - Scree plot
```{r, echo=FALSE}
# Total within-cluster sum of squares for k = 1, ..., 15.
# nstart = 10 keeps the best of several random starts for each k, so the
# curve is not distorted by a single poor k-means start.
wss <- rep(0, 15)
for (i in 1:15) {
  wss[i] <- kmeans(combined, centers = i, nstart = 10)$tot.withinss
}
plot(1:15, wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")
```

## Scree plot - code
```{r, eval=FALSE}
wss <- rep(0, 15)
for (i in 1:15) {
  wss[i] <- kmeans(combined, centers = i, nstart = 10)$tot.withinss
}
plot(1:15, wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")
```

## Data with more than 2 dimensions
```{r, echo=FALSE, mysize=TRUE, size='\\scriptsize'}
animals <- cluster::animals
colnames(animals) <- c("warm-blooded", "can fly", "vertebrate",
                       "endangered", "live in groups", "have hair")
# Drop the fifth column ("live in groups") and five of the rows.
animals.cluster <- animals[, -(5)]
animals.cluster <- animals.cluster[-c(4, 5, 12, 16, 18), ]
# Overwrite two "endangered" entries.
# NOTE(review): presumably these fill in missing values - confirm against
# cluster::animals.
animals.cluster[10, 4] <- 2
animals.cluster[14, 4] <- 1

# Display copy with the numeric codes replaced by labels (1 -> 'No',
# 2 -> 'Yes'); the numeric `animals.cluster` is kept for the distance
# calculations on the following slides.
animals.cluster.char <- animals.cluster
animals.cluster.char[animals.cluster.char == 1] <- 'No'
animals.cluster.char[animals.cluster.char == 2] <- 'Yes'
kable(animals.cluster.char)
```

## Multidimensional Scaling
```{r, echo=FALSE}
d <- dist(animals.cluster) # euclidean distances between the rows
fit <- cmdscale(d, k = 2) # k is the number of dim
# Jitter the coordinates before labelling them.
# length(fit) (rows x 2) draws an independent offset for every coordinate;
# drawing only nrow(fit) values would recycle the same offset into x and y
# and shift each label along the diagonal only.
fit.jitter <- fit + runif(length(fit), -.15, .15)

# plot solution
plot(fit.jitter[, 1], fit.jitter[, 2], xlab = "", ylab = "", main = "",
     type = "n", axes = FALSE)
box()
text(fit.jitter[, 1], fit.jitter[, 2],
     labels = row.names(animals.cluster), cex = 1.3)
```

## MDS - Code
```{r, echo=TRUE, mysize=TRUE, size='\\scriptsize', eval=FALSE}
animals <- cluster::animals
colnames(animals) <- c("warm-blooded", "can fly", "vertebrate",
                       "endangered", "live in groups", "have hair")
animals.cluster <- animals[, -(5)]
animals.cluster <- animals.cluster[-c(4, 5, 12, 16, 18), ]
animals.cluster[10, 4] <- 2
animals.cluster[14, 4] <- 1

d <- dist(animals.cluster)
fit <- cmdscale(d, k = 2)
fit.jitter <- fit + runif(length(fit), -.15, .15)
plot(fit.jitter[, 1], fit.jitter[, 2], xlab = "", ylab = "", main = "",
     type = "n", axes = FALSE)
box()
text(fit.jitter[, 1], fit.jitter[, 2],
     labels = row.names(animals.cluster), cex = 1.3)
```

## Hierarchical Clustering of Animals
```{r, echo=FALSE}
# Dendrogram for the animals data; note this overwrites the `hc` object
# fitted to the simulated points earlier.
hc <- hclust(dist(animals.cluster))
plot(hc, hang = -1)
```

## Lecture Exercise: Clustering Zoo Animals
Use the dataset created below for the following questions.
```{r, echo=TRUE, mysize=TRUE, size='\\tiny', eval=FALSE}
zoo.data <- read.csv('http://www.math.montana.edu/ahoegh/teaching/stat408/datasets/ZooClean.csv')
rownames(zoo.data) <- zoo.data[,1]
zoo.data <- zoo.data[,-1]
```

- Use multidimensional scaling to visualize the data in two dimensions.
- What are two animals that are very similar and two that are very different?
- Create a hierarchical clustering object for this dataset. Why are a leopard and raccoon clustered together for any cluster size?
- Now add colors corresponding to four different clusters to your MDS plot. Interpret what each of the four clusters corresponds to.