\documentclass[12pt,titlepage]{article}
\usepackage{amsmath}
\usepackage{graphicx}
\usepackage{verbatim}
\allowdisplaybreaks

\jot=.2in \pagestyle{plain} \setlength{\topmargin}{-0.5in}
\setlength{\textheight}{9.5in} \setlength{\oddsidemargin}{-0.1in}
\setlength{\evensidemargin}{-0.1in} \setlength{\textwidth}{6.7in}
\font\heada=cmbx10 scaled\magstep3 \font\headb=cmsl10
scaled\magstep1 \font\headc=cmr8 \pretolerance=10000
\setlength{\parindent}{2 em}

\begin{document}
\noindent {\heada Chapter 5 Revisited - Bivariate Numerical Data}

\subsection*{5.1 Correlation}
\noindent \underline{\bf Scatterplot} - a graphical display of the
relationship between two numerical variables.

\bigskip

\noindent \underline{\bf How to Describe a Relationship}:
\begin{enumerate}
\item {\bf Form} - linear, non-linear (curved), clustered, etc.
\item {\bf Direction} - positive or negative.
\item {\bf Strength} - strong, moderate, or weak.
\end{enumerate}

\noindent \underline{\bf Sample Correlation Coefficient} - A
statistic which estimates the {\it direction} and {\it strength} of
a linear relationship between two variables.



$$r = \frac{1}{n-1}\sum_{i=1}^n \left(\frac{x_i - \bar x}{s_x}\right) \left(\frac{y_i - \bar y}{s_y}\right)$$

\subsection*{5.2 Least Squares Regression}

\begin{itemize}
\item When there is a linear relationship between two numerical
variables, one is interested in determining the equation of the line
which describes the relationship.

\item The true line which describes the relationship, called the
{\bf Population Regression Line}, is
    $$y=\beta_0 + \beta_1x.$$

\begin{itemize}
\item The parameter $\beta_0$ is the true {\em y-intercept} of the
line.

\item The parameter $\beta_1=\frac{{\rm change~in~}y}{{\rm change~in~}x}$ is the true {\em slope} of the
line.  The slope can be interpreted as the average change in $y$ for
each unit increase in $x$.

\item Your textbook uses the notation $y=\alpha + \beta x$.
\end{itemize}

\item From an SRS of bivariate data, the {\bf least-squares regression line}, which estimates the true
line, can be calculated:

$$\hat y = b_0 + b_1x$$

\begin{itemize}
\item $\hat y$ is the {\em predicted} or {\em fitted} value.
\item The statistic $b_1=r\frac{s_y}{s_x}$, the estimated slope, is an unbiased estimator of $\beta_1$.
\item The statistic $b_0=\bar y-b_1\bar x$, the estimated $y$-intercept, is an unbiased
estimator of $\beta_0$.
\item $x$ is the explanatory, or {\em predictor}, variable.

\item Your textbook uses the notation $\hat y=a + b x$.
\vspace{0.1in}
\end{itemize}

\item The least-squares regression line is the one which is ``closest to the data'' when compared to any other line.

\newpage
\noindent \underline{EXAMPLE}:
\begin{enumerate}
\item From $n=5$ skeletons for a particular species of dinosaur, femur measurements and humerus
measurements (in inches) are taken.

\bigskip

\begin{tabular}{c|c}
  % after \\: \hline or \cline{col1-col2} \cline{col3-col4} ...
$X$=femur & $Y$=humerus\\\hline
  38 & 41 \\
  50 & 63 \\
  59 & 70 \\
  64 & 72 \\
  74 & 84 \\
\end{tabular}

\bigskip

Calculate the least-squares regression line which describes the
relationship between $X$ and $Y$.  Use the statistics $\bar x=57$,
$s_x=13.71$, $\bar y=66$, $s_y=15.89$, and $r=.9776$.

\vspace{2in}

\item Sketch this line over the scatterplot for this data.

\bigskip

\item For a dinosaur with a 55-inch femur, what is the predicted humerus
length?

\vspace{1in}

\item Interpret the slope of the least-squares regression line.

\vspace{.5in}

\end{enumerate}

\item {\bf Extrapolation} - the use of a regression line to predict $y$ at an $x$ value
outside the range of $x$'s used to calculate the line. DO NOT
EXTRAPOLATE! Extrapolation often results in very inaccurate
predictions.

\bigskip

\noindent \underline{EXAMPLE}:

\end{itemize}


\newpage
\subsection*{5.3 Assessing the Fit of a Line}
\begin{itemize}
\item The vertical deviation of a data point $(x_i,y_i)$ from the line is the
residual (just as in ANOVA)
    $$e_i = y_i - \hat y_i.$$

\item The least-squares regression line is the line that minimizes the sum of the squared
residuals or $SSE$ (just as in ANOVA) when compared to any other
line
    $$SSE = \sum_i e_i^2 = \sum_i (y_i - \hat y_i)^2.$$
Your textbook calls this quantity $SSResid$; that is, $SSResid=SSE$.  Thus, a line with the minimal
$SSE$ is closest to the data when compared to any other line.

\vspace{0.1in}



\noindent \underline{EXAMPLE}:  Calculate the residuals for each
data point and calculate the $SSE$.

\bigskip

\begin{tabular}{ccccc}
$i$ &\hspace{0.2in} $x_i$ \hspace{0.2in} & \hspace{0.2in} $y_i$
\hspace{0.2in} & \hspace{1in} $\hat y_i$ \hspace{1in} & $e_i = y_i - \hat y_i$ \\
\hline
 & & & \\
1&  38 & 41 \\
2&  50 & 63 \\
3&  59 & 70 \\
4&  64 & 72 \\
5&  74 & 84 \\
\end{tabular}


\vspace{1in}

$SSE =$ \underline{\hspace{1in}}.

\bigskip

\bigskip
\end{itemize}


\noindent \underline{\bf Partition the Variability}:

\smallskip

\noindent Just as in ANOVA, the total sample variability of the
response $y$ can be partitioned into two pieces,

$$SSTo = SSM + SSE$$

\noindent

\begin{itemize}
\item SSTo, the {\em Total Sum of Squares}, is
    $SSTo = \sum(y_i - \bar y)^2.$
$SSTo$ represents the total variability of the response $y$,
    $$s^2_y = \frac{1}{n-1}\sum_i(y_i-\bar y)^2=\frac{SSTo}{n-1}.$$

\item SSM, the {\em Model Sum of Squares}, is
    $$SSM = \sum (\hat y_i - \bar y)^2.$$
$SSM$ represents the part of the variability of the response
which is explained by the linear-regression line.

\item SSE, the {\em Residual Sum of Squares} (or {\em Sum of Squares
Error}), is
    $$SSE = \sum (y_i - \hat y_i)^2.$$
$SSE$ represents the part of the variability of the response which
is \underline{not} explained by the linear-regression line.

\item The variance of the residuals around the least-squares regression
line is

$$\sigma^2\approx S^2 = MSE = \frac{SSE}{n-2}.$$ \vspace{0.1in}

\end{itemize}




\noindent \underline{\bf Tools to Assess the Fit of a Line}
\begin{itemize}
\item Plot the residuals versus the fitted values, $(\hat y_i,e_i)$.  The line is a good fit if the points form no pattern on the
plot.





\item The \underline{\bf Coefficient of Determination} is the proportion of the total sample variability of $y$ that is
explained by the linear-regression line between $x$ and $y$:
$$r^2 = \frac{SSM}{SSTo}.$$

\begin{itemize}
\item The notation $r^2$ is used because the coefficient of
determination is the correlation coefficient squared.

\item $r^2$ is a proportion, and so is a value between 0 and 1.

\item The line is a good model if $r^2$ is large.


\end{itemize}

\end{itemize}


\noindent\underline{EXAMPLE}:

\begin{enumerate}
\item Create the residual versus fits graph for the dinosaur data.

\vspace{2.5in}

\item Are there any patterns which indicate that the linear model is
not appropriate?

\vspace{.5in}


\item Calculate $r^2$.

\vspace{.5in}

\item Interpret $r^2$.

\end{enumerate}


\vspace{1in}


\section*{R code}

\verbatiminput{Chapter5Rcode.txt}

\section*{Exercises}
5.1 on p194: 1-11 odd, 15

\noindent 5.2 on p205: 17*, 19*, 21*, 23, 25*

\noindent 5.3 on p219: 33, 35*, 39-43 odd

\bigskip

\noindent A ``*'' indicates that you may want to use R to assist with
the calculations.

\section*{Reading}
Sections 5.1-5.3


\end{document}

