# Introduction to R and Running an ANOVA by Jouke Postma

#To cite R in publications, use (bibtext format)
#
# @Manual{,
# title = {R: A Language and Environment for Statistical
# Computing},
# author = {{R Development Core Team}},
# organization = {R Foundation for Statistical Computing},
# year = 2005,
# note = {{ISBN} 3-900051-07-0},
# url = {http://www.R-project.org}
# }

#======================R-demonstration script================

#--------------------------start and quit---------------------------------
#NOTE(review): `R` below is the command typed at the OS shell to launch R; it is
#not valid R code and will error if this file is sourced. These demo lines are
#meant to be entered interactively, one at a time.
R
quit()
save.image() #saves the workspace to .RData without quitting; quit("yes") saves AND quits

#Text after a pound sign (#) is ignored by R (treated as a comment)

#----------------------------help-----------------------------------------
help("matrix")
#or use the shorthand
?matrix
#or do a full-text search of the help system with
help.search("matrix")

#-----------------------Simple calculation--------------------------------
1+1
2^3 #the following notation is also accepted: 2**3

#----------------------More complex functions-----------------------------
#functions are of the form functionname(), like the help function we saw earlier
sum(3,6)
sin(3)
#you may (for clarity) include argument labels
help(topic="matrix")

#-------------------------variables---------------------------------------
#assignment operators are -> and <- (prefer <-). Variables may be created on assignment
k<-2
#variables may be listed with ls()
ls()
#a variable may be deleted with rm()
rm(k)
ls()

#-------------------------vector/array------------------------------------
1:9
c("jouke","eric","raul")
array(0,5)
example.vector<-c(3,5,6,7)
example.vector

#---------------------------matrix----------------------------------------
#note: the 4 data values are recycled column-wise to fill the 3x4 matrix
example.matrix<-matrix(data=c(3,5,8,6),nrow=3,ncol=4)
example.matrix

#----------------------vector & matrix calculations-----------------------
#vector
3*1:3
#watch out for the following: this is ELEMENT-WISE multiplication with
#c(2,2,2) recycled down the columns -- NOT matrix multiplication
example.matrix*c(2,2,2)
#crossprod(X,y) computes t(X) %*% y, i.e. true matrix multiplication
crossprod(example.matrix,c(2,2,2))

#--------------------------graphics---------------------------------------
#a simple plot
x<-1:10
y<-1/x
plot(x,y)
#demo of the graphic capabilities of R:
demo(graphics)
#more advanced settings can be set with par() or in the plot command itself
par() #for a list of current settings
#plot() works with many kinds of objects, as demonstrated below

#working directory
getwd()
setwd("/home/jap441/Desktop/R-demonstration") #adjust this path to your own machine
#list files
list.files() #or dir()
#show data
#NOTE(review): dataTabel is assumed to have been loaded already (e.g. with
#read.table()/read.csv()) -- confirm where it comes from
dataTabel
summary(dataTabel)
plot(dataTabel) #fixed: a stray "demo" token after this call was a syntax error
# Read manual for import from EpiInfo, Minitab, S-PLUS, SAS, SPSS, Stata, Systat

#---------------------tables/matrix indexes & subsets---------------------
#the TIME column (returned as a one-column data frame)
dataTabel["TIME"]
dataTabel$TIME #which is equal to dataTabel[,"TIME"] -- watch the comma here!
dataTabel[2]
#row 3
dataTabel[3,]
#element 4 of column 3 (Note: since this column contains text it is considered
#nominal, and therefore its levels are shown)
dataTabel[4,3]
#selection of a dataset with == >= <= != (note: this works with integers;
#for floating point comparison use all.equal())
1:5>=3
(1:5)[1:5>=3]
(1:5)*(1:5>=3)
dataTabel["LAI"][dataTabel["LAI"]>=0]
#plot a selection
plot(dataTabel$TIME,dataTabel$LAI) #plot LAI over time
#create a temporary reference to a column by attaching it (until detach)
LAI #gives error (deliberately: LAI is not visible yet)
attach(dataTabel)
LAI #is present while attached
detach(dataTabel)
LAI #gives error again

#-----------------------data types---------------------------------
#R uses several data types, of which the following are of importance:
#logical, numeric, integer, complex, character
#vector, factor, matrix, table, list
factor(c(1,5,1,7,5,7))
array(c(1,5,1,7,5,7))
#Since R is object oriented, almost anything is really a datatype, like a plot or an anova
#Using the wrong data type can give you a warning/error or unexpected results
8*array(1:5)
8*factor(1:5) #deliberate mistake: arithmetic on a factor yields NAs plus a warning
plot(array(1:5),c(5,3,7,8,4)) #numeric x-axis: scatter plot
plot(factor(1:5),c(5,3,7,8,4)) #factor x-axis: one boxplot per level instead

#======================the anova=========================

#---------------let's set up an experiment
#Design: factorial design, 3 genotypes, 2 phosphorus treatments, 4 reps = 24 observations
#gl(n, k, length, labels) generates factor levels: n levels, each repeated k
#times in a row, recycled to the requested length
niveau<-gl(1,24,24) #single-level placeholder factor
genotype<-gl(3,1,24,labels=c("inefficient","efficient","landrace")) #fixed: spell out labels= (label= relied on partial matching)
phosphorus<-gl(2,3,24,labels=c("low","high"))
reps<-gl(4,6,24)

#let's fabricate data: main effects, interaction, and random noise
g<-array(c(-6,9,-3),24)               #genotype effect, recycled over 24 observations
p<- array(c(-15,-15,-15,15,15,15),24) #phosphorus effect
gp<-array(c(-8,2,6,8,-2,-6),24)       #genotype x phosphorus interaction
error<-rnorm(24, mean=0, sd=1)        #random error term
DMroot<-50+g+p+gp+error               #response: 50 is the overall mean
#all data in one table: normally this is what you would get when you import your measurements
expdata<-data.frame(niveau,genotype,phosphorus,reps,DMroot)

#1) are missing values NA?
#pitfall: zeros or blanks where there are missing values: replace them with NA!
summary(dataTabel$LAI==0)
#fixed: assign the real missing-value constant NA, not the character string "NA"
#(assigning "NA" would silently coerce a numeric column to character)
dataTabel$LAI[dataTabel["LAI"]==0]<-NA
summary(dataTabel$LAI==0)

#2) are nominal variables factors?
is.factor(genotype) #if FALSE use the factor() function
summary(dataTabel)

#3) are all factors and data of the correct length, and in the same range?
length(genotype)

#4) are the numbers in the right range?
summary (genotype) # nominal
summary (DMroot) # continuous
#or just all in one
summary(expdata)

#----------------------do the analysis of variance-------------------------
anova1<-aov(DMroot~genotype*phosphorus)
#Model includes all interaction terms. Explicit notation is DMroot~genotype+phosphorus+genotype:phosphorus

#-------------------------study data----------------------------------
anova(anova1) #ANOVA table (sequential, Type I sums of squares)
summary(anova1) #ANOVA table
names(anova1) #see what is available; example: anova1$fitted.values gives the fitted values
names(summary(anova1)) #see what is available in the summary (fixed: closing parenthesis was missing)
anova1$res #residuals (partial matching of anova1$residuals)

#-------------------check for normality and outliers----------------
plot(anova1) #diagnostic plots
boxplot(DMroot~genotype*phosphorus)
hist(anova1$res)

#-----------------------Type I, II, III sums of square--------------------
#Quote R website:
#"7.18 Why does the output from anova() depend on the order of factors in the model?
#
#In a model such as ~A+B+A:B, R will report the difference in sums of squares between
#the models ~1, ~A, ~A+B and ~A+B+A:B. If the model were ~B+A+A:B, R would report
#differences between ~1, ~B, ~A+B, and ~A+B+A:B . In the first case the sum of squares
#for A is comparing ~1 and ~A, in the second case it is comparing ~B and ~B+A. In a
#non-orthogonal design (i.e., most unbalanced designs) these comparisons are
#(conceptually and numerically) different.
#
#Some packages report instead the sums of squares based on comparing the full model to
#the models with each factor removed one at a time (the famous `Type III sums of squares'
#from SAS, for example). These do not depend on the order of factors in the model.
#The question of which set of sums of squares is the Right Thing provokes low-level
#holy wars on R-help from time to time.

#There is no need to be agitated about the particular sums of squares that R reports.
#You can compute your favorite sums of squares quite easily. Any two models can be
#compared with anova(model1, model2), and drop1(model1) will show the sums of squares
#resulting from dropping single terms."

#In statistics class we all learn Type I -> it's R's default
anova(anova1)
#SAS and SPSS use Type III sums of squares as default for purely factorial designs!
#R's library "car" can produce those for you, with the warning that you must decide whether they make sense:
library(car)
Anova(anova1) #Type II by default; watch the capital A, which is used to avoid a conflict with the base function
Anova(anova1,type="III")

#-------------------------------post hoc tests-----------------------
#TukeyHSD (Tukey Honest Significant Difference)
TukeyHSD(anova1,ordered=TRUE,conf.level=0.95) #plot() will accept the result.

# this is just taken from the help files: ?formula
# * = factor crossing: a*b expands to a + b + a:b (main effects plus their interaction)
# : = interaction term only, e.g. a:b
# ^n = only n level interaction terms i.e. (a+b+c)^2 will exclude three way interaction
# %in% = nested designs i.e. a+b %in% a
# - = remove a term from the model, also -1 is a no intercept model, which can also be noted as y~0+x
# I() = when instead of symbolic functions, arithmatics
#
#
#--------------------------------bloc design--------------------------
# example modified from help files (the classic NPK factorial experiment)
N <- as.factor(c(0,1,0,1,1,1,0,0,0,1,1,0,1,1,0,0,1,0,1,0,1,1,0,0))
P <- as.factor(c(1,1,0,0,0,1,0,1,1,1,0,0,0,1,0,1,1,0,0,1,0,1,1,0))
K <- as.factor(c(1,0,0,1,0,1,1,0,0,1,0,1,0,1,1,0,0,0,1,1,1,0,1,0))
yield <- c(49.5,62.8,46.8,57.0,59.8,58.5,55.5,56.0,62.8,55.8,69.5,55.0,
62.0,48.8,45.5,44.2,52.0,51.5,49.8,48.8,57.2,59.0,53.2,56.0)
block<-gl(6,4) #6 blocks of 4 plots each
npk <- data.frame(block,N,P,K,yield)
aov(yield ~ block + N * P + K, npk) # or lm; the main difference is the formatting of the output
#???? random versus fixed effects (open question: block could arguably be treated as random)

#---------------------------------splitplot--------------------------------
#for a split-plot design define the error term in the model with Error()
#note: here you cannot use lm, since lm does not support the Error() function
yield <- rnorm(20, mean=5, sd=1)
fertilizer <- gl(2,2,20)
genotype <- gl(2,1,20)
#NOTE(review): fertilizer*genotype already expands to the main effects plus the
#interaction, so the explicit "fertilizer + genotype +" terms are redundant (harmless)
h <- aov(yield ~ fertilizer + genotype + fertilizer*genotype + Error(fertilizer))
summary(h)

#=============================Regression analysis===========

#let's create some almost-perfect data (of course you would normally measure things)
x<-1:15
y<-10-1.5*x+0.2*x^2+rnorm(15, mean=0, sd=1)

#check data as before

#Do the regression analysis: in this case it's a linear model
#NOTE: inside a formula ^ means factor crossing, so including the square directly
#in the model would give you wrong results without warning! Hence the helper column:
x2<-x^2
modelfit<-lm(y~x+x2) #equivalently: lm(y ~ x + I(x^2))
modelfit #see the estimated coefficients
summary(modelfit) #see the statistics
plot(x,y)
lines(x,modelfit$fit) #modelfit$fit partially matches modelfit$fitted.values

# check afterwards: normality, outliers, no systematic pattern in residuals, equal variance in all treatments
#1) outliers
plot(modelfit) #fixed: was plot(fit), but the fitted object is called modelfit
#based on Cook's distance we may want to look at point 9: colour it red
plot(x,y)
lines(x,modelfit$fit)
points(x[9],y[9],col=2)
#histogram of residuals (for whoever likes histograms)
hist(modelfit$res) #fixed: was hist(ra$res); the residuals live in modelfit
#boxplot of the residuals
boxplot(modelfit$res) #fixed: was the invalid placeholder boxplot(..)

#stepwise regression
#let's put some extra parameters in the model
#NOTE: in a formula x^3 and x^4 are NOT powers (see above) -- wrap arithmetic in I()
modelfit<-lm(y~x+x2+I(x^3)+I(x^4))
#forward selection needs an explicit scope: start from the intercept-only model
step(lm(y~1),scope=~x+x2+I(x^3)+I(x^4),direction="forward")

#================transposing data and different distributions========
#Generalized linear models allow for analysis of data that is not normally distributed
#Typically count data has a Poisson distribution
#Following example came from the R help files
## Dobson (1990) Page 93: Randomized Controlled Trial :
counts <- c(18,17,15,20,10,20,25,13,12)
outcome <- gl(3,1,9)
treatment <- gl(3,3)
glm.D93 <- glm(counts ~ outcome + treatment, family=poisson())
anova(glm.D93)
summary(glm.D93)

#Typically density-response data has a gamma distribution
#Following example came from the R help files
# A Gamma example, from McCullagh & Nelder (1989, pp. 300-2)
clotting <- data.frame(
u = c(5,10,15,20,30,40,60,80,100),
lot1 = c(118,58,42,35,27,25,21,19,18),
lot2 = c(69,35,26,21,18,16,13,12,12))
summary(glm(lot1 ~ log(u), data=clotting, family=Gamma))
summary(glm(lot2 ~ log(u), data=clotting, family=Gamma))

#===============================saving my work============
#copy / paste from the terminal with the right mouse click

#graphs are plotted to the selected device (normally the screen). You can choose to
#plot to for example a PDF by selecting that device. See ?Devices for the possible options.

#write function will write text files. This is mainly handy when automating. Otherwise just use copy paste

#mixed models (example from the help files; lme4 is a contributed package -- install it first)
library(lme4)
lmer(decrease ~ treatment + (1|rowpos) + (1|colpos),OrchardSprays)

#Principal component analysis (PCA)
library(stats) #stats is attached by default; shown here for completeness
prcomp(USArrests, scale = TRUE)
plot(prcomp(USArrests)) #screeplot of the component variances
summary(prcomp(USArrests, scale = TRUE))
biplot(prcomp(USArrests, scale = TRUE))

#================================Automation of data analysis===

#you can create your own functions with the function "function"
#WARNING: naming it "sum" masks base::sum for the rest of the session (demo only!)
sum<-function(x1,x2){return(x1+x2)}
#Test the function
sum(1,5)
sum(1:3,4:6)
#How error prone is the function? (unequal lengths: the shorter vector is recycled, with a warning)
sum(1:3,1:4)
#can we see the function's code? Yes: type its name without parentheses
sum
#can we see other functions' code, i.e. matrix? Yes, we can learn from built-in functions
matrix
#Of course there is a basic set which is not expressed in R language. For example dimnames is such a primitive
dimnames
#Many libraries are available with preprogrammed functions, for you to use
library() #lists all libraries installed on your system
library(base) #loads the base library (default on start up)
#(yes you can create your own library and share it with the world)
#(yes you can create your own library and share it with the world)

#==== how to summarize data=========
#tapply applies a function to groups: tapply(data, grouping factor, function)
a<-1:10
b<-gl(3,1,10)
tapply(a,b,mean)
#library("stats") is not strictly needed (stats is attached by default) but is harmless
library("stats")
tapply(a,b,sd)

#aggregate is similar to tapply

#model.tables for means and standard errors.
#given anova object m (NOTE(review): m is not defined in this file -- supply your own aov fit)
model.tables(m,se=TRUE) #fixed: use TRUE rather than T (T can be reassigned)