\documentclass[11pt]{article}
\usepackage{amsmath}
\usepackage{graphicx}
\usepackage{verbatim}
\allowdisplaybreaks

\pagestyle{empty} \setlength{\topmargin}{-0.5in}
\setlength{\textheight}{9.5in} \setlength{\oddsidemargin}{-0.2in}
\setlength{\evensidemargin}{-0.2in} \setlength{\textwidth}{6.9in}
\font\heada=cmbx10 scaled\magstep3 \font\headb=cmsl10
scaled\magstep1 \font\headc=cmr8 \pretolerance=10000 \raggedright
\def\ds{\displaystyle}

\begin{document}
\noindent{\heada Chapter 15 - One-Way Analysis of Variance}

\bigskip

\noindent{\bf \underline{COMPARING MANY POPULATION MEANS}}


\begin{itemize}
\item {\em Analysis of Variance} is often abbreviated as ANOVA.

\item A one-way ANOVA considers $k>2$ populations.  The mean of the $i^{th}$
population is $\mu_i$.  The variance of the $i^{th}$ population is
$\sigma^2_i$.

\item An ANOVA is used to compare the $k$ population means, $\mu_1, \mu_2, ..., \mu_k$.

\item ``One-way'' means that the levels of a single factor define the populations
being compared.  In other words, the categories of a categorical
variable define the populations.

\end{itemize}
\vspace{0.2in}


\noindent \underline{\bf Setting}:
\begin{enumerate}
\item A SRS of size $n_i$ has been chosen from population $i$ for $i=1,2, ...,
k$.

\item Each SRS is independent of the others.


\item {\em Homogeneity of Variance}: $\sigma^2=\sigma^2_1 =
\sigma^2_2 = \hdots = \sigma^2_k$.

\item Each population is normally distributed, so the distribution
of the $i^{th}$ population is $N(\mu_i,\sigma)$.


\end{enumerate}
\vspace{0.1in}



\noindent {\bf \underline{Note about Study Design}:}
\begin{itemize}
\item Completely Randomized Design Experiment

\begin{itemize}
\item The $k$ treatment groups from a CRD are considered to be independent samples from $k$ populations.

\item In a CRD, if there is a difference among the $k$ means, it is appropriate to claim that the factor (treatment) caused the
difference in means.

\item If the groups of individuals in the CRD were chosen from a SRS, then
conclusions about $k$ means can be extended to the populations from
which the individuals were drawn. However, if the individuals were
not from a SRS, then conclusions to larger populations are dubious.
\end{itemize}

\item Observational Study
\begin{itemize}
\item The $k$ samples to be compared in an observational study are considered
independent if individuals in each sample were randomly chosen from
each respective population.

\item Do not claim that the factor (explanatory variable) caused the difference in
means.
\end{itemize}

\end{itemize}


\newpage

\noindent {\bf \underline{The one-way ANOVA Model}} \\
\vspace{0.1in} \hspace{0.2in} $X_{ij} = \mu_i + \epsilon_{ij}$ ~~
where $\epsilon_{ij} \sim N(0,\sigma)$

\begin{itemize}
\item $X_{ij}$ is the response of the $j^{th}$ individual
in the SRS from the $i^{th}$ population.

\item $\mu_i$ is the mean of the $i^{th}$ population.

\item $\epsilon_{ij}$ is the error term, $X_{ij} - \mu_i$, also called the {\em deviation} of $X_{ij}$ from $\mu_i$.

\item $\epsilon_{ij} \sim N(0,\sigma)$ is equivalent to the
assumption that the $i^{th}$ population is normal, $X_{ij}\sim
N(\mu_i,\sigma)$.

\end{itemize}


\bigskip

\noindent {\bf \underline{The ``Estimated'' one-way ANOVA Model}} \\
\vspace{0.1in} \hspace{0.2in} $X_{ij} = \overline X_i + e_{ij}$

\begin{itemize}

\item $\overline X_i$ is the sample mean of the SRS from the $i^{th}$ population, an unbiased point estimator of $\mu_i$.

\item  $e_{ij}$ = $X_{ij} - \overline X_i$ is the residual, the deviation of $X_{ij}$ from $\overline X_i$

\end{itemize}

\bigskip

\noindent \underline{\bf HYPOTHESIS TEST TO COMPARE $k$ POPULATION
MEANS}


\bigskip

\noindent {\bf The Overall Test}:

\bigskip

\noindent {\bf The Idea:} We will compare the variability {\bf
between} the sample means ($MSTr$) to the variability {\bf within}
each sample ($MSE$).

\begin{itemize}

\item If the variability between the $\bar x_i$'s is {\bf large} relative to
the variability within each sample ($MSTr \gg MSE$), then we will
claim that there is a difference between the $\mu_i$'s.

\item If the variability among the $\bar x_i$'s is {\bf not large} relative
to the variability within each sample, then we will fail to claim
that there is a difference between the $\mu_i$'s.

\item The statistic $MSE$ is an unbiased estimator of the constant variance
$\sigma^2$.
\end{itemize}


\begin{enumerate}
\item \underline{Hypotheses}:

${\rm H}_{\rm 0}\colon \mu_1 = \mu_2 = \hdots = \mu_k$ \\
${\rm H}_{a}\colon$ $\mu_i\ne \mu_j$ for some $i$ and $j$  ~~(at
least one of the $\mu_i$ is different from the
others)\vspace{0.2in}



\item \underline{Check Assumptions}:
\begin{enumerate}
\item Independent SRS's have been chosen, and so $.05N_i \ge n_i$ for each $i=1, 2, ..., k$.

\item Check normal probability plots to determine if the residuals are not normally distributed.

\item Assume that the constant variance
assumption holds if $\frac{\rm largest~s}{\rm smallest~s} < 2$.

\end{enumerate}

\bigskip

\newpage
{\bf Perform Steps 3 and 4 assuming that ${\rm H}_0$ is true!}

\item \underline{Test Statistic}: \\ \vspace{0.1in}
$F = \frac{\rm MSTr}{\rm MSE}$, where $MSTr$ and $MSE$ are from the
following ANOVA table:

\bigskip

\centerline{\bf One-way ANOVA Table}  \vspace{0.1in}
\begin{tabular}{llllll}\hline
Source & DF & Sum of Squares (SS) & Mean Squares (MS) & F & p-value \\ \hline
Treatments & DFTr=$k-1$ & SSTr=$\sum_{i=1}^k n_i (\bar x_i - \bar{\bar x})^2$ & MSTr = $\frac{\rm SSTr}{\rm DFTr}$ & $F^*=\frac{\rm MSTr}{\rm MSE}$ & $P(F > F^*)$ \\
Error & DFE=$N-k$ & SSE=$\sum_{i=1}^k(n_i-1)s_i^2$ & MSE=$\frac{\rm SSE}{\rm DFE}$ & & \\ \hline
Total & DFTo=$N-1$ & SSTo=$\sum_{\rm all~x} (x_{ij} - \bar{\bar x})^2$ & & & \\ \hline
\end{tabular}
\vspace{0.1in}

where $N=\sum n_i$ is the {\em total sample size} and $\bar{\bar
x}=\frac{\sum n_i\bar x_i}{\sum n_i}$ is the {\em grand mean}.
Observe that $DFTo=DFTr + DFE$ and $SSTo=SSTr + SSE$. \vspace{0.1in}


\begin{itemize}
\item When ${\rm H}_0: \mu_1 = \mu_2 = \hdots = \mu_k$ is true, $\mu_{_{\rm MSTr}} = \mu_{_{\rm MSE}}$ and therefore $F = \frac{\rm MSTr}{\rm MSE} \approx 1$.
\item When ${\rm H}_0$ is false, $\mu_{_{\rm MSTr}} > \mu_{_{\rm MSE}}$ and therefore $F= \frac{\rm MSTr}{\rm MSE} \gg 1$.
\item Large $F = \frac{\rm MSTr}{\rm MSE}$ values are strong evidence against ${\rm H}_0$ and for ${\rm H}_a$.
\end{itemize}


\item \underline{$p$-value}:
 The $p$-value $=P(F > F^*).$

\begin{itemize}

\item The test statistic has an $F$ distribution, $F\sim F(DFTr, DFE)$, when ${\rm H}_0$ is true.

\item An $F$ distribution $F(DFTr, DFE)$ has two parameters, the
{\em numerator degrees of freedom} $DFTr$ and the {\em denominator
degrees of freedom} $DFE$.

\item An $F$ distribution is a beautiful right-skewed distribution.
 Probabilities can be found in Table 7 on pages 738-741 of your
 textbook, or by using R's pf(F, df1=\#, df2=\#,lower.tail=FALSE) function.

\end{itemize}


\vspace{0.1in}



\item and 6. Make a \underline{Decision} and give a
\underline{Conclusion}.

\vspace{0.2in}

\end{enumerate}


\newpage

\noindent \underline{\bf MULTIPLE COMPARISONS FOLLOW-UP TEST}

\noindent If we reject ${\rm H}_0$ in the overall test and conclude
that at least one of the $\mu_i$'s is different from the others,
then we should ask ``{\it Which} population means are different?''

\bigskip

\underline{Only do a follow-up test if you REJECT ${\rm H}_0: \mu_1
= \mu_2 = \hdots = \mu_k$!}

\vspace{0.1in}

\noindent {\bf \underline{Tukey's Method}}

Tukey's Method calculates a family of CI's for all possible pairwise
differences of the means $\mu_1, \mu_2, \ldots, \mu_k$.   The overall
family-wise confidence level is held at some confidence level
$C=1-\alpha$ (which means that the confidence level for each
individual CI is more than $C$). For a given pair $\mu_i$ and
$\mu_j$ (with $i\ne j$):

\begin{itemize}
\item The Tukey's confidence interval for $\mu_i - \mu_j$ is:

\begin{center}
\fbox{$\bar x_i - \bar x_j \pm q_{1-\alpha,k,DFE}
\sqrt{\frac{MSE}{2}(\frac{1}{n_i} + \frac{1}{n_j})}$}
\end{center}
\vspace{-0.05in}

\item The Tukey critical value $q_{1-\alpha,k,DFE}$ can be found:

\begin{itemize}
\item  In Table 8 on page 742 of your textbook.  If you cannot find
the appropriate DFE in Table 8, ROUND DOWN to be conservative.

\item Using R's qtukey(C,nmeans=k,df=DFE)
\end{itemize}

\item The Tukey's CI for $\mu_i - \mu_j$ can be used to test the hypothesis ${\rm H}_0\colon
\mu_i-\mu_j=0$ versus ${\rm H}_a\colon \mu_i-\mu_j\ne0$.
\begin{itemize}
\item If 0 {\bf is} in the CI, then fail to reject ${\rm H}_0$.
\item If 0 {\bf is not} in the CI, then reject ${\rm H}_0$.
\end{itemize}
\end{itemize}




\newpage


\noindent \underline{EXAMPLE}:

\noindent An archeologist is interested in studying skull breadths
of humans from different epochs.  Significant changes in head shape
over time would suggest that interbreeding occurred with immigrant
human populations.  A sample of 27 head breadths was obtained by
measuring skulls of Egyptian males from three different epochs:
4000BC, 1850BC, and 150AD.  The data are from {\it Ancient Races of
the Thebaid}, by Thomson and Randall-Maciver.

\bigskip

\noindent {\bf Display Your Data} \\
{\small {\small
\begin{verbatim}
> D = read.table("headbreadth.txt",header=TRUE)
> attach(D)
> Epoch = factor(as.character(Epoch),
  levels = c("4000BC","1850BC","150AD"))
> boxplot(HeadBreadth ~ Epoch,ylab="Head Breadth")


\end{verbatim}
}}



\vspace{-1.9in}

\hspace{3.8in}
\includegraphics[angle=0,width=3in,height=2.5in]{Chapter15.headbreadth.boxplot.ps}


\vspace{-.8in}

\noindent{\bf Fit the One-way ANOVA Model} \\
{\small
\begin{verbatim}
> library(MASS)
> library(pastecs)
> tapply(HeadBreadth,Epoch,mean)
  4000BC   1850BC    150AD
132.6667 134.4444 138.1111

> hb.aov=aov(HeadBreadth ~ Epoch)
> summary(hb.aov)   # Prints ANOVA table
\end{verbatim}}


\begin{center}
\begin{tabular}{lccccc}\hline
Source & $DF$ & $SS$ & $MS$ & $F$ & $p$-value \\ \hline
Treatment & 2 & 138.74 & 69.37 & 4.0497 & 0.03052 \\
Error & 24 & 411.11 & 17.13 & & \\ \hline Total & 26 & 549.85 & & &
\\ \hline
\end{tabular}
\end{center}
\vspace{0.15in}



\noindent{\bf Check Your Assumptions} \\
\noindent{\it 1. Independent Random Samples}

\noindent{\it 2. Normal Distribution} \\
{\small {\small
\begin{verbatim}
> par(mfrow=c(1,2))
> qqnorm(studres(hb.aov))
> qqline(studres(hb.aov))
> hist(studres(hb.aov),freq=FALSE,ylim=c(0,0.45),main="Density Plot of Residuals",
  xlab="Studentized Residuals")
> lines(density(studres(hb.aov)))
> xy=qqnorm(studres(hb.aov))
> cor(xy$x,xy$y)
[1] 0.9722762
\end{verbatim}
}} \vspace{-0.5in}


\begin{center}
\includegraphics[angle=0,width=4.75in,height=2.5in]{Chapter15.headbreadth.normal.ps}
\end{center}



\noindent{\it 3. Check Constant Variance} \\
{\small {\small
\begin{verbatim}
> tapply(HeadBreadth,Epoch,sd)
  4000BC   1850BC    150AD
4.183300 3.358240 4.755114
\end{verbatim}
}}

$\frac{{\rm largest~s}}{{\rm smallest~s}} \approx \frac{4.76}{3.36}
\approx 1.42 < 2 $, so the constant variance assumption appears to
hold. \vspace{0.2in}






\begin{enumerate}
\item Perform the {\bf overall Hypothesis Test}

\begin{enumerate}

 \item \underline{Hypotheses}:

\vspace{.5in}

\item \underline{Test statistic value}: \vspace{.2in}

\item \underline{Distribution of the test statistic} given that ${\rm H}_0$ is true: \vspace{.2in}

\item \underline{$p$-value}: \vspace{.2in}

\item  \underline{Decision} at $\alpha=.05$: \vspace{.5in}

 \item \underline{Conclusion}: \vspace{.5in}

\end{enumerate}

\item Give an unbiased estimate of the constant variance $\sigma^2$.
\vspace{.2in}

\item Perform {\bf Tukey's Multiple Comparison Test}
{\small {\small
\begin{verbatim}
> TukeyHSD(hb.aov,which="Epoch",conf.level=0.95)
\end{verbatim}
}}

\begin{center}
\begin{tabular}{||c|c|c|c||}\hline
Comparison & Estimate & Lower & Upper \\ \hline
$\mu_{_{\rm 1850BC}} - \mu_{_{\rm 4000BC}}$ & 1.7778 & -3.0945 & 6.6501 \\
$\mu_{_{\rm 150AD}} - \mu_{_{\rm 4000BC}}$ & 5.4444 & 0.5721 & 10.3168 \\
$\mu_{_{\rm 150AD}} - \mu_{_{\rm 1850BC}}$ & 3.6667 & -1.2057 &
8.5390 \\ \hline
\end{tabular}
\end{center}

\begin{enumerate}

\item  {\bf \underline{Conclusions}:}\vspace{.5in}

\item Which epoch appears to have the largest mean head breadth?
How much larger is the head breadth during this epoch?

\end{enumerate}
\end{enumerate}

\newpage
\section*{Exercises}

\noindent 15.1 on p676: 1-15 odd

\noindent 15.2 on p684: 19-23 odd


\section*{Reading} Sections 15.1-15.2.   Unfortunately, we do not
have time to cover ANOVA for RBD (15.3) and Two-way ANOVA (15.4),
but you may want to read about these topics if they are applicable
to your research.

\end{document}

