# Introduction to R and Running an ANOVA by Jouke Postma

#To cite R in publications, use
(bibtext format)

#

# @Manual{,

# title = {R: A Language and Environment
for Statistical

# Computing},

# author = {{R Development Core Team}},

#
organization = {R Foundation for Statistical Computing},

# address = {Vienna,
Austria},

# year = 2005,

# note = {{ISBN} 3-900051-07-0},

# url =
{http://www.R-project.org}

# }

#======================R-demonstration script================

#--------------------------start and
quit---------------------------------

R

quit()

save.image() #save data,
although I would just copy code into a text file. Same as quit("yes")

#---------------------------comments--------------------------------------

#Text
after pound sign is ignored (considered to be comments)

#----------------------------help-----------------------------------------

help("matrix")

#or

?matrix

#or
search in help with

help.search("matrix")

#-----------------------Simple
calculation--------------------------------

1+1

2^3 #following notation is
also excepted: 2**3

#----------------------More
complex functions-----------------------------

#function are of form
functionname() like the help function we saw
earlier

sum(3,6)

sin(3)

#you may(for clarity) include argument
labels

help(topic="matrix")

#-------------------------variables---------------------------------------

#assignment
sign is -> or <- . Variables maybe created in
assignment

k<-2

#variables maybe listed with ls

ls()

#variable
maybe deleted with rm()

rm(k)

ls()

#-------------------------vector/array------------------------------------

1:9

c("jouke","eric","raul")

array(0,5)

example.vector<-c(3,5,6,7)

example.vector

#---------------------------matrix----------------------------------------

example.matrix<-matrix(data=c(3,5,8,6),nrow=3,ncol=4)

example.matrix

#----------------------vector
& matrix calculations-----------------------

#vector

3*1:3

#watch
out for the following:

example.matrix*c(2,2,2)

#instead what you probably
want

crossprod(example.matrix,c(2,2,2))

#--------------------------graphics---------------------------------------

#a
simple plot

x<-1:10

y<-1/x

plot(x,y)

#demo of graphic
capability of R:

demo(graphics)

#more advance settings can be set with
par() or in the plot comand

par() #for a list of current settings

#plot
works with many objects as demonstrate below

#-----------------------load data
from file-------------------------------

#working
directory

getwd()

setwd("/home/jap441/Desktop/R-demonstration")

#list
files

list.files() #or dir()

#read data into
tabel

dataTabel<-read.table("crop-simulation-sample-data.txt",
header=TRUE, na.strings = c("NAN","NA","na","nan"))

#show
data

dataTabel

summary(dataTabel)

plot(dataTabel)demo

# Read manual
for import from EpiInfo, Minitab, S-PLUS, SAS, SPSS, Stata, Systat

#---------------------tables/matrix
indexes & subsets---------------------

#colum time

dataTabel["TIME"]

dataTabel$TIME #which is equal to dataTabel[,"TIME"] watch the comma
here!!!!!!!!!!

dataTabel[2]

#row 3

dataTabel[3,]

#number 4 of colum
3 (Note that since this colum contains texts, it is considered nominal and
therefor levels are shown.

dataTabel[4,3]

#selection of a dataset with ==
>= <= != (note: this works with integers. Use for floating point
comparison
all.equal()

1:5>=3

(1:5)[1:5>=3]

(1:5)*(1:5>=3)

dataTabel["LAI"][dataTabel["LAI"]>=0]

#plot
a selection

plot(dataTabel$TIME,dataTabel$LAI) #plot LAI over time

#create a temporal reference to a colum by attaching it (until
detach)

LAI #gives error

attach(dataTabel)

LAI #is
present

detach(dataTabel)

LAI #gives error

#-----------------------data
types---------------------------------

#R uses several data types of which
the following are of importance

#logical, numeric, integer, complex,
character

#vector, factor, matrix, table,
list

factor(c(1,5,1,7,5,7))

array(c(1,5,1,7,5,7))

#Since R is Object
oriented, anything is really a datatype like a plot, or an anova

#Using the
wrong data type can give you warning/error or inexpected
results

8*array(1:5)

8*factor(1:5)

plot(array(1:5),c(5,3,7,8,4))

plot(factor(1:5),c(5,3,7,8,4))

#======================the
anova=========================

#---------------let's set up an
experiment

#Design: factorial design, 3 genotypes, 2 phosphorus treatments, 4
reps = 24

niveau<-gl(1,24,24) #gl(number of levels, number replica's,
length)

genotype<-gl(3,1,24,label=c("inefficient","efficient","landrace"))

phosphorus<-gl(2,3,24,label=c("low","high"))

reps<-gl(4,6,24)

#let's fabricate
data

g<-array(c(-6,9,-3),24)

p<-
array(c(-15,-15,-15,15,15,15),24)

gp<-array(c(-8,2,6,8,-2,-6),24)

error<-rnorm(24,
mean=0, sd=1)

DMroot<-50+g+p+gp+error

#all data in one tabel:
normally this is what you would get when you

#read your data form a
file

expdata<-data.frame(niveau,genotype,phosphorus,reps,DMroot)

#-----------------------check
your data!!!!!-------------------

#1) are missing values NA?

#pitfalls:
zero or blancs where there are missing values: replace with
NA!!!!!!!!!!

summary(dataTabel$LAI==0)

dataTabel$LAI[dataTabel["LAI"]==0]<-"NA"

summary(dataTabel$LAI==0)

#2) are nominal variables
factor

is.factor(genotype) #if false use factor(data=,...)
function

summary(dataTabel)

#3) are all factors and data
correct length, and in same range

length(genotype)

#4) are the numbers in the
right range

summary (genotype) # nominal

summary (DMroot) #
continues

#or just all in one

summary(expdata)

#----------------------do the
analysis of
variance-------------------------

anova1<-aov(DMroot~genotype*phosphorus)

#Model includes all interaction terms. Explicit notation is
DMroot~genotype+phosphorus+genotype:phosphorus

#-------------------------study
data----------------------------------

anova(anova1)
#coefficients

summary(anova1) #anova tabel

names(modelfit) #see what is
available example: fit$fit will give fitted values

names(summary(modelfit)
#see what is available in the summary

anova1$res #residuals

#-------------------check for
normality and
outlayers----------------

plot(anova1)

boxplot(DMroot~genotype*phosphorus)

hist(anova1$res)

#-----------------------Type I,
II, III sums of square--------------------

#Quote R website:

#"7.18 Why
does the output from anova() depend on the order of factors in the
model?

#

#In a model such as ~A+B+A:B, R will report the difference in
sums of squares between

#the models ~1, ~A, ~A+B and ~A+B+A:B. If the model
were ~B+A+A:B, R would report

#differences between ~1, ~B, ~A+B, and
~A+B+A:B . In the first case the sum of squares

#for A is comparing ~1 and
~A, in the second case it is comparing ~B and ~B+A. In a

#non-orthogonal
design (i.e., most unbalanced designs) these comparisons are

#(conceptually
and numerically) different.

#

#Some packages report instead the sums of
squares based on comparing the full model to

#the models with each factor
removed one at a time (the famous `Type III sums of squares'

#from SAS, for
example). These do not depend on the order of factors in the model.

#The
question of which set of sums of squares is the Right Thing provokes low-level

#holy wars on R-help from time to time.

#There is no need to be
agitated about the particular sums of squares that R reports.

#You can
compute your favorite sums of squares quite easily. Any two models can be

#compared with anova(model1, model2), and drop1(model1) will show the sums
of squares

#resulting from dropping single terms."

#In statistics class we all
learn type 1 -> it's R's default

anova(anova1)

#SAS and SPSS use type
III sums of square as default for purely factorial designs!

#R' library car
can produce those for you with the warning that you must deside whether they
make sense:

library(car)

Anova(anova1) #for type II, watch the capital
wich is used to avoid conflict with the base
library

Anova(anova1,type="III")

#-------------------------------post hoc
tests-----------------------

#TukeyHSD (Tukey Honest Significant
Difference)

TukeyHSD(anova1,ordered=TRUE,conf.level=0.95) #plot() will except
the result.

#-------------------------------a little
about the formula---------------------------

# this is just taken from the
help files: ?formula

# + = additive

# * = factor crossing (interaction
term)

# : = main effects plus all interaction terms

# ^n = only n level
interaction terms i.e. (a+b+c)^2 will exclude three way interaction

# %in% =
nested designs i.e. a+b %in% a

# - = remove a term from the model, also -1 is
a no intercept model, which can also be noted as y~0+x

# I() = when instead
of symbolic functions, arithmatics

#

#

#--------------------------------bloc design--------------------------

#
example modified from help files

N <-
as.factor(c(0,1,0,1,1,1,0,0,0,1,1,0,1,1,0,0,1,0,1,0,1,1,0,0))

P <-
as.factor(c(1,1,0,0,0,1,0,1,1,1,0,0,0,1,0,1,1,0,0,1,0,1,1,0))

K <-
as.factor(c(1,0,0,1,0,1,1,0,0,1,0,1,0,1,1,0,0,0,1,1,1,0,1,0))

yield <-
c(49.5,62.8,46.8,57.0,59.8,58.5,55.5,56.0,62.8,55.8,69.5,55.0,

62.0,48.8,45.5,44.2,52.0,51.5,49.8,48.8,57.2,59.0,53.2,56.0)

block<-gl(6,4)

npk
<- data.frame(block,N,P,K,yield)

aov(yield ~ block + N * P + K, npk) # or
lm, main difference is formatting of output

#???? random versus fixed
effects

#---------------------------------splitplot--------------------------------

#for
a split-plot design define the error term in the model with Error()

#note
here you can not use lm, since in lm you can not use the Error()
function

yield <- rnorm(20, mean=5, sd=1)

fertilizer <-
gl(2,2,20)

genotype <- gl(2,1,20)

h <- aov(yield ~ fertilizer +
genotype + fertilizer*genotype + Error(fertilizer))

summary(h)

#=============================Regression analysis===========

#let's create some perfect data
(ofcourse you will measure
things)

x<-1:15

y<-10-1.5*x+0.2*x^2+rnorm(15, mean=0,
sd=1)

#check data as before

#Do the regression analyses:
this case it's a linear model

x2<-x^2 #including the square in the model
will give you wrong results without
warning!

modelfit<-lm(y~x+x2)

modelfit #see the estimated
coefficients

summary(modelfit) #see the
statistics

plot(x,y)

lines(x,modelfit$fit)

# check afterwards: normality,
outlayers, no systematic pattern in residuals, equal variance in all
treatments

#1) outlayers

plot(fit)

#based on cook's distance we may
want to look at piont 9: colour it red

plot(x,y)

lines(x,modelfit$fit)

points(x[9],y[9],col=2)

#histogram
of residuals (for whoever likes
histograms)

hist(ra$res)

#boxplot

boxplot(..)

#stepwise regression

#let's
put some extra parameters in the
model

modelfit<-lm(y~x^4+x+x2+x^3)

step(modelfit,direction="forward")

#================transposing
data and different distributions========

#Generalized linear
model allows for analysis of data that is not normal distributed

#Typically
count data has a poisson distribution

#Following example came from the R help
files

## Dobson (1990) Page 93: Randomized Controlled Trial :

counts <-
c(18,17,15,20,10,20,25,13,12)

outcome <- gl(3,1,9)

treatment <-
gl(3,3)

print(d.AD <- data.frame(treatment, outcome, counts))

glm.D93
<- glm(counts ~ outcome + treatment,
family=poisson())

anova(glm.D93)

summary(glm.D93)

#Typically density-respons data
has a gamma distribution

#Following example came from the R help files

# A
Gamma example, from McCullagh & Nelder (1989, pp. 300-2)

clotting <-
data.frame(

u = c(5,10,15,20,30,40,60,80,100),

lot1 =
c(118,58,42,35,27,25,21,19,18),

lot2 =
c(69,35,26,21,18,16,13,12,12))

summary(glm(lot1 ~ log(u), data=clotting,
family=Gamma))

summary(glm(lot2 ~ log(u), data=clotting,
family=Gamma))

#===============================saving
my work============

#copy / paste form the terminal
with right mouse click

#graphs are plotted to the
selected device (normally the screen). You can choose to

#plot to for
example a PDF by selecting that device. See ?device for possible
options.

#write function will write text files. This is mainly handy when automating. Otherwise just use copy paste

#load and save workspace with save(file="someting.rdata") load(file="something.rdata")

#================================advanced:
example=========

#mixed models (example from help
files)

library(lme4)

lmer(decrease ~ treatment + (1|rowpos) +
(1|colpos),OrchardSprays)

#Principle component analysis
(PCA)

library(stats)

prcomp(USArrests, scale =
TRUE)

plot(prcomp(USArrests))

summary(prcomp(USArrests, scale =
TRUE))

biplot(prcomp(USArrests, scale = TRUE))

#================================Automation
of data analysis===

#you can create your own with
the function "function"

sum<-function(x1,x2){return(x1+x2)}

#Test the
function

sum(1,5)

sum(1:3,4:6)

#How error prone is the
function?

sum(1:3,1:4)

#can we see the function's code?

sum

#can we
see other functions code, i.e. matrix? Yes, we can learn from build in
functions

matrix

#Ofcourse their is a basic set which is not expressed in
R language. For example dimnames is such a primitive

dimnames

#Many
libraries are available with preprogrammed functions, for you to
use

library() #lists all libraries installed on your system

library(base)
#loads the base library (default on start up)

#(yes you can create your own
library and share it with the world)

#==== how to summarize
data=========

#tapply

a<-1:10

b<-gl(3,1,10)

tapply(a,b,mean)

library("stats")

tapply(a,b,sd)

#aggregate is similar to tapply

#model.tables for means and
standard errors.

#given anova object m

model.tables(m,se=T)