## Exercise - Prediction for Capital Bike Share

## Load the Capital Bike Share data and create a 70/30 train/test split.
bikes <- read.csv('http://www.math.montana.edu/ahoegh/teaching/stat408/datasets/Bike.csv')
set.seed(11142017)  # fix the RNG so the split is reproducible
num.obs <- nrow(bikes)
## Sample 30% of row indices (rounded) for the test set.
test.ids <- base::sample(seq_len(num.obs), size = round(num.obs * .3))
test.bikes <- bikes[test.ids, ]
## Negative indexing drops the test rows — identical to, but clearer than,
## (1:num.obs)[!(1:num.obs) %in% test.ids].
train.bikes <- bikes[-test.ids, ]
dim(bikes)
## [1] 10886    12
dim(test.bikes)
## [1] 3266   12
dim(train.bikes)
## [1] 7620   12

## Exercise - Prediction for Capital Bike Share

## Fit a linear model for ride count and score it on the held-out test set
## using mean absolute deviation (MAD).
lm.bikes <- lm(count ~ holiday + atemp, data = train.bikes)
lm.mad <- mean(abs(test.bikes$count - predict(lm.bikes, test.bikes)))

## Exercise: Create another predictive model and compare the results to the
## MAD of the linear model above ($$129$$). However, don't use casual and
## registered in your model as those two will sum to the total count.

## Exercise: Predict Titanic Survival

## Load the Titanic data, drop passengers with missing Age, and create a
## 70/30 train/test split.
titanic <- read.csv(
  'http://www.math.montana.edu/ahoegh/teaching/stat408/datasets/titanic.csv')
set.seed(11142017)  # fix the RNG so the split is reproducible
titanic <- titanic %>% filter(!is.na(Age))
num.pass <- nrow(titanic)
test.ids <- base::sample(seq_len(num.pass), size = round(num.pass * .3))
test.titanic <- titanic[test.ids, ]
train.titanic <- titanic[-test.ids, ]
dim(titanic)
## [1] 714 12
dim(test.titanic)
## [1] 214 12
dim(train.titanic)
## [1] 500 12

## Exercise: Predict Titanic Survival
## See if you can improve the classification error from the model below.
glm.titanic <- glm(Survived ~ Age, data = train.titanic, family = binomial)
## Predicted probabilities are rounded to 0/1 and compared against the
## observed outcome to get the misclassification rate on the test set.
Class.Error <- mean(test.titanic$Survived !=
                      round(predict(glm.titanic, test.titanic,
                                    type = 'response')))

The logistic regression model using only age is wrong $$40$$% of the time.