--- title: | | STAT 408 - Week 7 | Advanced R Graphics and ggplot2 output: beamer_presentation: theme: "PaloAlto" fonttheme: "structuresmallcapsserif" date: "February 22, 2018" --- ```{r setup, include=FALSE} library(ggplot2) library(dplyr) library(knitr) knitr::opts_chunk$set(echo = TRUE) knitr::knit_hooks$set(mysize = function(before, options, envir) { if (before) return(options$size) }) knitr::opts_chunk$set(fig.width=4, fig.height=3, fig.align = 'center') ``` # Advanced R Graphics ## NCAA Basketball data We will use data from the NCAA basketball tournament from 2011 - 2016. ```{r, mysize=TRUE, size='\\tiny'} url <- 'http://www.math.montana.edu/ahoegh/teaching/stat408/datasets/TourneyDetailedResults.csv' hoops <- read.csv(url,stringsAsFactors = FALSE) hoops.2011 <- filter(hoops, Season >= 2011) head(hoops.2011) ``` ## Compute annual averages ```{r, mysize=TRUE, size='\\footnotesize'} points <- hoops.2011 %>% group_by(Season) %>% summarise(Win.Points = mean(Wscore), Lose.Points = mean(Lscore), Win.3Pt = mean(Wfgm3), Lose.3pt = mean(Lfgm3)) head(points) ``` ## Plot types ```{r, mysize=TRUE, size='\\tiny', echo=F} plot(points$Lose.Points~points$Season,ylim=c(0,max(points$Lose.Points)), ylab='Average Points Scored', xlab='Season', main='Average Points Scored by Losing Teams \n in NCAA Tournament',cex.lab=.8, cex.main = .8) ``` ## Plot types ```{r, mysize=TRUE, size='\\tiny', eval=F} plot(points$Lose.Points~points$Season,ylim=c(0,max(points$Lose.Points)), ylab='Average Points Scored', xlab='Season', main='Average Points Scored by Losing Teams \n in NCAA Tournament', cex.lab=.8, cex.main = .8) ``` ## Plot types ```{r, mysize=TRUE, size='\\tiny', echo=F} plot(points$Lose.Points~points$Season,ylim=c(0,max(points$Lose.Points)), ylab='Average Points Scored', xlab='Season', type='h',lwd=3, main='Average Points Scored by Losing Teams \n in NCAA Tournament',cex.lab=.8, cex.main = .8) ``` ## Plot types ```{r, mysize=TRUE, size='\\tiny', eval=F} plot(points$Lose.Points~points$Season,ylim=c(0,max(points$Lose.Points)), ylab='Average Points Scored', xlab='Season', type='h',lwd=3, main='Average Points Scored by Losing Teams \n in NCAA Tournament',cex.lab=.8, cex.main = .8) ``` ## Lines ```{r, mysize=TRUE, size='\\tiny', echo=F} plot(points$Lose.Points~points$Season,ylim=c(0,max(points$Lose.Points)), ylab='Average Points Scored', xlab='Season', type='n', main='Average Points Scored by Losing Teams \n in NCAA Tournament',cex.lab=.8, cex.main = .8) lines(points$Lose.Points~points$Season,col='red',lwd=1) lines(points$Win.Points~points$Season,col='blue',lwd=1,lty=2) ``` ## Lines ```{r, mysize=TRUE, size='\\tiny', eval=F} plot(points$Lose.Points~points$Season,ylim=c(0,max(points$Lose.Points)), ylab='Average Points Scored', xlab='Season', type='n', main='Average Points Scored by Losing Teams \n in NCAA Tournament',cex.lab=.8, cex.main = .8) lines(points$Lose.Points~points$Season,col='red',lwd=1) lines(points$Win.Points~points$Season,col='blue',lwd=1,lty=2) ``` ## Lines and Legends ```{r, mysize=TRUE, size='\\tiny', echo=F} plot(points$Lose.Points~points$Season,ylim=c(0,max(points$Win.Points)), ylab='Average Points Scored', xlab='Season', type='n', main='Average Points Scored in NCAA Tournament',cex.lab=.8, cex.main = .8) lines(points$Lose.Points~points$Season,col='red',lwd=1) lines(points$Win.Points~points$Season,col='blue',lwd=1,lty=2) legend('bottomright',legend=c('Winning Team','Losing Team'),col=c('blue','red'), lwd=c(1,1), lty=c(2,1), cex = .7) ``` ## Lines and Legends ```{r, mysize=TRUE, size='\\tiny', eval=F} plot(points$Lose.Points~points$Season,ylim=c(0,max(points$Win.Points)), ylab='Average Points Scored', xlab='Season', type='n', main='Average Points Scored in NCAA Tournament',cex.lab=.8, cex.main = .8) lines(points$Lose.Points~points$Season,col='red',lwd=1) lines(points$Win.Points~points$Season,col='blue',lwd=1,lty=2) legend('bottomright',legend=c('Winning Team','Losing Team'),col=c('blue','red'), lwd=c(1,1), lty=c(2,1), cex = .7) ``` ## Points ```{r, mysize=TRUE, size='\\tiny', echo=F} plot(points$Lose.Points~points$Season,ylim=c(0,max(points$Win.Points)), ylab='Average Points Scored', xlab='Season', type='l', lwd=1,col='red', main='Average Points Scored in NCAA Tournament',cex.lab=.8, cex.main = .8) lines(points$Win.Points~points$Season,col='blue',lwd=1,lty=2) points(hoops.2011$Wscore~hoops.2011$Season,pch=16,col=rgb(0,0,.5,.1)) legend('bottomright',legend=c('Average Winning Team','Average Losing Team', 'Individual Winning Team'),col=c('blue','red',rgb(0,0,.5)), lwd=c(1,1,NA), lty=c(2,1,NA), pch=c(NA,NA,16), cex=.7) ``` ## Points ```{r, mysize=TRUE, size='\\tiny', eval=F} plot(points$Lose.Points~points$Season,ylim=c(0,max(points$Win.Points)), ylab='Average Points Scored', xlab='Season', type='l', lwd=1,col='red', main='Average Points Scored in NCAA Tournament',cex.lab=.8, cex.main = .8) lines(points$Win.Points~points$Season,col='blue',lwd=1,lty=2) points(hoops.2011$Wscore~hoops.2011$Season,pch=16,col=rgb(0,0,.5,.1)) legend('bottomright',legend=c('Average Winning Team','Average Losing Team', 'Individual Winning Team'),col=c('blue','red',rgb(0,0,.5)), lwd=c(1,1,NA), lty=c(2,1,NA), pch=c(NA,NA,16), cex=.7) ``` ## Annotation ```{r, mysize=TRUE, size='\\tiny',echo=F} plot(points$Lose.Points~points$Season,ylim=c(0,max(points$Win.Points)), ylab='Average Points Scored', xlab='Season', type='l', lwd=1,col='red', main='Average Points Scored in NCAA Tournament',cex.lab=.8, cex.main = .8) lines(points$Win.Points~points$Season,col='blue',lwd=1,lty=2) points(hoops.2011$Wscore~hoops.2011$Season,pch=16,col=rgb(0,0,.5,.1)) text(2015,40,'Shot Clock Shortened', cex=.6) arrows(x0=2015, y0=42, x1=2016, y1=70, length=0.1, lwd=2) legend('bottomright',legend=c('Average Winning Team','Average Losing Team', 'Individual Winning Team'),col=c('blue','red',rgb(0,0,.5)), lwd=c(1,1,NA), lty=c(2,1,NA), pch=c(NA,NA,16), cex=.6) ``` ## Annotation ```{r, mysize=TRUE, size='\\tiny', eval=F} plot(points$Lose.Points~points$Season,ylim=c(0,max(points$Win.Points)), ylab='Average Points Scored', xlab='Season', type='l', lwd=1,col='red', main='Average Points Scored in NCAA Tournament', cex.lab=.8, cex.main = .8) lines(points$Win.Points~points$Season,col='blue',lwd=1,lty=2) points(hoops.2011$Wscore~hoops.2011$Season,pch=16,col=rgb(0,0,.5,.1)) text(2015,40,'Shot Clock Shortened', cex=.6) arrows(x0=2015, y0=42, x1=2016, y1=70, length=0.1, lwd=2) legend('bottomright',legend=c('Average Winning Team','Average Losing Team', 'Individual Winning Team'),col=c('blue','red',rgb(0,0,.5)), lwd=c(1,1,NA), lty=c(2,1,NA), pch=c(NA,NA,16), cex=.6) ``` ## Axes ```{r, mysize=TRUE, size='\\tiny', echo=F} plot(points$Lose.Points~points$Season,ylim=c(0,max(points$Win.Points)), axes=F, ylab='Average Points Scored', xlab='Season', type='b',pch=16, lwd=1,col='red', main='Average Points Scored in NCAA Tournament', cex.main=.75, cex.lab=.75) lines(points$Win.Points~points$Season,col='blue',lwd=1,lty=2,type='b',pch=17) legend('bottomright',legend=c('Average Winning Team','Average Losing Team'),col=c('blue','red'), lwd=1, lty=c(2,1),pch=c(17,16), cex=.7) ``` ## Axes ```{r, mysize=TRUE, size='\\tiny', eval=F} plot(points$Lose.Points~points$Season,ylim=c(0,max(points$Win.Points)), axes=F, ylab='Average Points Scored', xlab='Season', type='b',pch=16, lwd=1,col='red', main='Average Points Scored in NCAA Tournament', cex.main=.75, cex.lab=.75) lines(points$Win.Points~points$Season,col='blue',lwd=1,lty=2,type='b',pch=17) legend('bottomright',legend=c('Average Winning Team','Average Losing Team'),col=c('blue','red'), lwd=1, lty=c(2,1),pch=c(17,16), cex=.7) ``` ## Axes ```{r, mysize=TRUE, size='\\tiny', echo=F} plot(points$Lose.Points~points$Season,ylim=c(0,max(points$Win.Points)), axes=F, ylab='Average Points Scored', xlab='Season', type='b',pch=16, lwd=1,col='red', main='Average Points Scored in NCAA Tournament', cex.main=.75, cex.lab=.75) lines(points$Win.Points~points$Season,col='blue',lwd=3,lty=2,type='b',pch=17) legend('bottomleft',legend=c('Average Winning Team','Average Losing Team'),col=c('blue','red'), lwd=1, lty=c(2,1),pch=c(17,16), cex=.7) axis(4) axis(1, at = 2011:2016,labels=c('10-11','11-12','12-13','13-14','14-15','15-16')) ``` ## Axes ```{r, mysize=TRUE, size='\\tiny', eval=F} plot(points$Lose.Points~points$Season,ylim=c(0,max(points$Win.Points)), axes=F, ylab='Average Points Scored', xlab='Season', type='b',pch=16, lwd=1,col='red', main='Average Points Scored in NCAA Tournament', cex.main=.75, cex.lab=.75) lines(points$Win.Points~points$Season,col='blue',lwd=3,lty=2,type='b',pch=17) legend('bottomleft',legend=c('Average Winning Team','Average Losing Team'),col=c('blue','red'), lwd=1, lty=c(2,1),pch=c(17,16), cex=.7) axis(4) axis(1, at = 2011:2016,labels=c('10-11','11-12','12-13','13-14','14-15','15-16')) ``` ## Axes ```{r, mysize=TRUE, size='\\tiny', echo=F} plot(points$Lose.Points~points$Season,ylim=c(0,max(points$Win.Points)), axes=F, ylab='Average Points Scored', xlab='Season', type='b',pch=16, lwd=1,col='red', main='Average Points Scored in NCAA Tournament', cex.main=.75, cex.lab=.75) lines(points$Win.Points~points$Season,col='blue',lwd=3,lty=2,type='b',pch=17) legend('bottomleft',legend=c('Average Winning Team','Average Losing Team'),col=c('blue','red'), lwd=1, lty=c(2,1),pch=c(17,16), cex=.7) axis(4) axis(1, at = 2011:2016,labels=c('10-11','11-12','12-13','13-14','14-15','15-16')) box() ``` ## Axes ```{r, mysize=TRUE, size='\\tiny', eval=F} plot(points$Lose.Points~points$Season,ylim=c(0,max(points$Win.Points)), axes=F, ylab='Average Points Scored', xlab='Season', type='b',pch=16, lwd=1,col='red', main='Average Points Scored in NCAA Tournament', cex.main=.75, cex.lab=.75) lines(points$Win.Points~points$Season,col='blue',lwd=3,lty=2,type='b',pch=17) legend('bottomleft',legend=c('Average Winning Team','Average Losing Team'),col=c('blue','red'), lwd=1, lty=c(2,1),pch=c(17,16), cex=.7) axis(4) axis(1, at = 2011:2016,labels=c('10-11','11-12','12-13','13-14','14-15','15-16')) box() ``` ## Superimposed Plots ```{r, echo=T, mysize=TRUE, size='\\tiny'} plot(density(hoops.2011$Wscore),xlab='points',ylab='', main='histogram and superimposed density curve \n for points scored by winning team',lwd=3, cex.main=.8, cex.lab=.8) hist(hoops.2011$Wscore,add=T,probability = T) ``` ## Expression ```{r, echo=T, mysize=TRUE, size='\\tiny'} plot(density(hoops.2011$Wscore),ylab=expression(beta[2]),xlab='', main='Examples with Expression',axes=F, type='n') box() text(70,.023, expression(sum(theta[i]^2, i=1, n)),cex=2) ``` ## R-Markdown Captions ```{r, echo=F, mysize=TRUE, size='\\tiny',fig.align='center',fig.cap = 'Write caption here'} plot(density(hoops.2011$Wscore),xlab='',ylab='', main='Captions in R Markdown',axes=F, type='n') box() text(70,.023, 'caption in R header',cex=1.5) ``` ## Exercise: Advanced Plotting Use the Seattle Housing Data Set [http://math.montana.edu/ahoegh/teaching/stat408/datasets/SeattleHousing.csv](http://math.montana.edu/ahoegh/teaching/stat408/datasets/SeattleHousing.csv) to create an interesting graphic, include informative titles, labels, and add an annotation. ## Solution: Advanced Plotting ```{r, echo=F,fig.align='center'} Seattle.in <- read.csv('http://math.montana.edu/ahoegh/teaching/stat408/datasets/SeattleHousing.csv', stringsAsFactors = F) hist(Seattle.in$price,prob=T,breaks="FD", ylab='', col='forestgreen',xlab='Sales Price (million $)', main='Houses Sold in Seattle', axes=F) axis(1, at = c(0,500000,1000000,2500000,4000000,5500000,7000000), labels =c('0','.5','1','2.5','4','5.5','7')) arrows(x0=2500000, y0=1.5e-6, x1=1500000, y1=.5e-6, length=0.1, lwd=2) text(2500000,1.8e-6,'Most homes sell for \n less than one million',cex=.8) ``` ## Solution: Advanced Plotting ```{r, eval=F,mysize=TRUE, size='\\tiny'} Seattle.in <- read.csv( 'http://math.montana.edu/ahoegh/teaching/stat408/datasets/SeattleHousing.csv', stringsAsFactors = F) hist(Seattle.in$price,prob=T,breaks="FD", ylab='', col='forestgreen', xlab='Sales Price (million $)', main='Houses Sold in Seattle', axes=F) axis(1, at = c(0,500000,1000000,2500000,4000000,5500000, 7000000), labels =c('0','.5','1','2.5','4','5.5','7')) arrows(x0=2500000, y0=1.5e-6, x1=1500000, y1=.5e-6, length=0.1, lwd=2) text(2500000,1.8e-6,'Most homes sell for \n less than one million',cex=.8) ``` # ggplot2 ## ggplot2 Overview Why ggplot2? ### Advantages of ggplot2 - consistent underlying grammar of graphics (Wilkinson, 2005) - plot specification at a high level of abstraction - very flexible - theme system for polishing plot appearance ## Grammar of Graphics The basic idea: independently specify plot building blocks and combine them to create just about any kind of graphical display you want. Building blocks of a graph include: - data - aesthetic mapping - geometric object - statistical transformations - scales - coordinate system - position adjustments - faceting ## ggplot2 VS Base Graphics Compared to base graphics, ggplot2 - is more verbose for simple / canned graphics - is less verbose for complex / custom graphics - does not have methods (data should always be in a data.frame) - uses a different system for adding plot elements ## Aesthetic Mapping Aesthetics are things that you can see. Examples include: - position (i.e., on the x and y axes) - color ("outside" color) - fill ("inside" color) - shape (of points) - linetype - size Aesthetic mappings are set with the aes() function. ## Geometic Objects (geom) Geometric objects are the actual marks we put on a plot. Examples include: - points (`geom_point`) - lines (`geom_line`) - boxplot (`geom_boxplot`) A plot must have at least one geom; there is no upper limit. You can add a geom to a plot using the + operator ## Graphical Primitives/ ggplot ```{r, mysize=TRUE, size='\\tiny',fig.align='center', fig.height=2.75} graph.a <- ggplot(data = hoops.2011, aes(Lfgm,Wfgm)) graph.a ``` ## Adding Geoms: `geom_point()` ```{r, mysize=TRUE, size='\\tiny',fig.align='center', fig.height=2.75} graph.a + geom_point() ``` ## Adding Geoms: `geom_smooth()` ```{r, mysize=TRUE, size='\\tiny',fig.align='center', fig.height=2.75} graph.a + geom_point() + geom_smooth(method = 'loess') ``` ## Adding Geoms: `geom_rug()` ```{r, mysize=TRUE, size='\\tiny',fig.align='center', fig.height=2.75} graph.a + geom_point() + geom_smooth(method = 'loess') + geom_rug() ``` ## Adding Geoms: `geom_density2d()` ```{r, mysize=TRUE, size='\\tiny',fig.align='center', fig.height=2.75} graph.a + geom_point() + geom_smooth(method = 'loess') + geom_rug() + geom_density2d() ``` ## Adding Geoms: `geom_jitter()` ```{r, mysize=TRUE, size='\\tiny',fig.align='center', fig.height=2.75} graph.a + geom_rug() + geom_density2d() + geom_jitter() ``` ## Adding Geoms: `labs()` ```{r, mysize=TRUE, size='\\tiny',fig.align='center', fig.height=2.65} graph.a + geom_rug() + geom_density2d() + geom_jitter() + labs(x='Losing Team Field Goals Made', y = 'Winning Team Field Goals Made') ``` ## Scales: `xlim()` and `ylim()` ```{r, mysize=TRUE, size='\\tiny',fig.align='center',warning=FALSE, fig.height=2.6} graph.a + geom_rug() + geom_density2d() + geom_jitter() + labs(x='Losing Team Field Goals Made', y = 'Winning Team Field Goals Made') + xlim(c(0,max(hoops.2011$Wfgm))) + ylim(c(0,max(hoops.2011$Wfgm))) ``` ## Themes ```{r, mysize=TRUE, size='\\tiny',fig.align='center', fig.height=2.75} graph.a + geom_point() + theme_bw() + labs(x='Losing Team Field Goals Made', y = 'Winning Team Field Goals Made') ``` ## More about aes ```{r, mysize=TRUE, size='\\tiny',fig.align='center', fig.height=2.75} graph.a + geom_jitter(col = 'firebrick4') ``` ## More about aes ```{r, mysize=TRUE, eval=F, size='\\tiny',fig.align='center'} graph.a + geom_jitter(aes(col = as.factor(Season))) ``` ## More about aes ```{r, mysize=TRUE, eval=T, size='\\tiny',fig.align='center', fig.height=2.75} graph.a + geom_jitter(aes(col = as.factor(Season))) ``` ## More about aes ```{r, mysize=TRUE, eval=T, size='\\tiny',fig.align='center', fig.height=2.75} graph.a + geom_jitter(aes(col = as.factor(Season)), size=3,alpha=.4) ``` ## More about aes ```{r, mysize=TRUE, eval=T, size='\\tiny',fig.align='center', fig.height=2.75} graph.a + geom_jitter(aes(shape = as.factor(Season),col=Wscore), size=3,alpha=.4) ``` ## Faceting ```{r, mysize=TRUE, eval=F, size='\\tiny',fig.align='center', fig.height=2.75} graph.a + facet_wrap(~Season) ``` ## Faceting ```{r, mysize=TRUE, eval=T, size='\\tiny',fig.align='center', fig.height=2.75} graph.a + facet_wrap(~Season) + geom_jitter(alpha=.5, aes(color=Wfgm3)) ``` ## Maps ```{r, mysize=TRUE, eval=T, size='\\tiny',fig.align='center'} library(maps) usa <- map_data("usa") usa.map <- ggplot() + geom_polygon(data = usa, aes(x=long, y = lat, group = group)) + coord_fixed(1.3) usa.map ``` ## Maps ```{r, mysize=TRUE, eval=T, size='\\tiny',fig.align='center'} labs <- data.frame(long = c(-111.0429, -105.2211, -80.4139, -91.7857,-121.7405), stringsAsFactors = FALSE, lat = c(45.6770, 39.7555, 37.2296,43.3033,38.5449), names = c("Bozeman, MT", "Golden, CO", "Blacksburg, VA",'Decorah, IA', "Davis, CA") ) usa.map + geom_point(data = labs, aes(x = long, y = lat), color = "yellow", size = 4) + annotate(geom='text', y=44,x=-111,label='Bozeman,MT',col='white') ``` ## More Maps: `ggmap` ```{r, mysize=TRUE, eval=T, size='\\tiny',fig.align='center'} library(ggmap) myMap <- get_map(location = c(lon = - 110.8281,lat = 44.4605), source = "google", maptype = "terrain", crop = FALSE, zoom = 7) # plot map yellowstone.map <- ggmap(myMap) ``` ## More Maps: `ggmap` ```{r, mysize=TRUE, eval=T, size='\\tiny',fig.align='center', fig.height=2.75} yellowstone.map ``` ## Exercise: ggplot2 Now use `ggplot2` to create an interesting graph using the Seattle Housing data set. ## Solution: ggplot2 ```{r,fig.align='center',echo=F} library(ggplot2) Seattle.in$zipcode <- as.factor(Seattle.in$zipcode) graph.a <- ggplot(data = Seattle.in, aes(sqft_living,price)) graph.a + geom_jitter(aes(col = zipcode))+ theme(plot.title = element_text(size=8), text = element_text(size=6)) + geom_smooth(method='loess')+ggtitle('Seattle Housing Sales: Price vs. Square Footage Living Space') + ylab('Sales Price (million dollars)') + xlab('Living Space (square foot)')+scale_y_continuous(breaks=c(seq(0,7000000,by=1000000)), labels=as.character(0:7)) + annotate('text',3500,6000000, label = 'Housing price depends on zipcode', size=2) +annotate("rect", xmin = 0, xmax = 7250, ymin = 5500000, ymax = 6500000, alpha = .6) + geom_segment(aes(x=3500, xend=3500, y=5500000, yend=3000000), arrow = arrow(length = unit(0.5, "cm"))) ``` ## Solution: ggplot2 ```{r,eval=F, mysize=T, size='\\tiny'} library(ggplot2) Seattle.in$zipcode <- as.factor(Seattle.in$zipcode) graph.a <- ggplot(data = Seattle.in, aes(sqft_living,price)) graph.a + geom_jitter(aes(col = zipcode)) + theme(plot.title = element_text(size=20))+ geom_smooth(method='loess')+ ggtitle('Seattle Housing Sales: Price vs. Square Footage Living Space') + ylab('Sales Price (million dollars)') + xlab('Living Space (square foot)')+ scale_y_continuous(breaks=c(seq(0,7000000,by=1000000)), labels=as.character(0:7)) + annotate('text',3500,6000000, label = 'Housing price depends on zipcode') + annotate("rect", xmin = 0, xmax = 7250, ymin = 5500000, ymax = 6500000, alpha = .6) + geom_segment(aes(x=3500, xend=3500, y=5500000, yend=3000000), arrow = arrow(length = unit(0.5, "cm"))) ``` ## Solution: map ```{r,echo=F, mysize=T, size='\\tiny', fig.align='center', fig.height=2.5} myMap <- get_map(location = 'Seattle', source = "google", maptype = "terrain", crop = FALSE, zoom = 8) Seattle.map <- ggmap(myMap) Seattle.map + geom_point(data=Seattle.in, aes(x=long, y=lat, size=price), alpha=.25) + labs(title = 'Location of Housing Sales in Seattle, WA') + theme(plot.title = element_text(size=9)) ``` ## Solution: map ```{r, eval=F, mysize=T, size='\\tiny', fig.height=2.5} myMap <- get_map(location = 'Seattle', source = "google", maptype = "terrain", crop = FALSE, zoom = 8) Seattle.map <- ggmap(myMap) Seattle.map + geom_point(data=Seattle.in, aes(x=long, y=lat, size=price), alpha=.25) + labs(title = 'Location of Housing Sales in Seattle, WA') + theme(plot.title = element_text(size=9)) ```