#To cite R in publications, use (BibTeX format)

#

# @Manual{,

# title = {R: A Language and Environment for Statistical

# Computing},

# author = {{R Development Core Team}},

# organization = {R Foundation for Statistical Computing},

# address = {Vienna, Austria},

# year = 2005,

# note = {{ISBN} 3-900051-07-0},

# url = {http://www.R-project.org}

# }

#======================R-demonstration script================

#--------------------------start and quit---------------------------------

# 'R' is typed at the SHELL prompt (not inside R) to start an interactive session
R

# leave the session; you will be asked whether to save the workspace
quit()

save.image() #saves the workspace to .RData without quitting; quit("yes") saves AND exits

#---------------------------comments--------------------------------------

#Text after pound sign is ignored (considered to be comments)

#----------------------------help-----------------------------------------

# open the help page for matrix()
help("matrix")

#or the shorthand form

?matrix

#or do a full-text search across all installed help pages

help.search("matrix")

#-----------------------Simple calculation--------------------------------

# expressions typed at the prompt are evaluated and the result is printed
1+1

2^3 #the following notation is also accepted: 2**3

#----------------------More complex functions-----------------------------

#functions are of the form functionname(), like the help function we saw earlier

sum(3,6)

sin(3) # argument is in radians

#you may (for clarity) include argument labels

help(topic="matrix")

#-------------------------variables---------------------------------------

#assignment operators are <- and -> (prefer <-); variables are created on assignment

k<-2

#list the variables in the workspace with ls()

ls()

#delete a variable with rm()

rm(k)

ls() # k is gone

#-------------------------vector/array------------------------------------

# integer sequence 1..9
1:9

# character vector; c() concatenates values into a vector
c("jouke","eric","raul")

# length-5 array filled with zeros
array(0,5)

example.vector<-c(3,5,6,7)

example.vector # typing the name autoprints the value

#---------------------------matrix----------------------------------------

# the 4 data values are recycled column-wise to fill the 3x4 = 12 cells
# (silently, because 4 divides 12 evenly)
example.matrix<-matrix(data=c(3,5,8,6),nrow=3,ncol=4)

example.matrix

#----------------------vector & matrix calculations-----------------------

#vector

# elementwise: multiplies each of 1,2,3 by 3
3*1:3

#watch out for the following:

# * is ELEMENTWISE multiplication with recycling, NOT matrix multiplication
example.matrix*c(2,2,2)

#instead what you probably want

# crossprod(X, y) computes t(X) %*% y, a true matrix product
crossprod(example.matrix,c(2,2,2))

#--------------------------graphics---------------------------------------

#a simple plot

x<-1:10

y<-1/x

plot(x,y)

#demo of the graphic capability of R:

demo(graphics)

#more advanced settings can be set with par() or in the plot command

par() #for a list of current settings

#plot is generic and works with many object types, as demonstrated below

#plot works with many objects as demonstrate below

#-----------------------load data from file-------------------------------

#working directory

getwd()

# NOTE(review): hard-coded absolute path -- adjust to your own machine
setwd("/home/jap441/Desktop/R-demonstration")

#list files

list.files() #or dir()

#read data into a table; header=TRUE takes column names from the first line,
#and any of the listed strings are converted to NA on import
dataTabel<-read.table("crop-simulation-sample-data.txt", header=TRUE, na.strings = c("NAN","NA","na","nan"))

#show data

dataTabel

summary(dataTabel)

plot(dataTabel) #scatterplot matrix of all columns (a stray "demo" token after the call made the original line a syntax error)

# Read manual for import from EpiInfo, Minitab, S-PLUS, SAS, SPSS, Stata, Systat

#---------------------tables/matrix indexes & subsets---------------------

#column TIME, returned as a one-column data frame

dataTabel["TIME"]

dataTabel$TIME #returns a vector; equal to dataTabel[,"TIME"] -- watch the comma here!

dataTabel[2] # second column, also as a one-column data frame

#row 3

dataTabel[3,]

#element in row 4, column 3 (Note: a column containing text is read as a factor
#(nominal), and therefore its levels are shown)

dataTabel[4,3]

#selection of a dataset with == >= <= != (note: this works with integers. Use
#all.equal() for floating point comparison)

1:5>=3 # logical vector

(1:5)[1:5>=3] # logical subsetting keeps the TRUE positions

(1:5)*(1:5>=3) # TRUE/FALSE coerce to 1/0 in arithmetic

dataTabel["LAI"][dataTabel["LAI"]>=0] # LAI values that are >= 0

#plot a selection

plot(dataTabel$TIME,dataTabel$LAI) #plot LAI over time

#create a temporary reference to the columns by attaching the data frame (until detach)
#NOTE(review): attach() is widely discouraged -- it makes it easy to mask variables

LAI #gives an error: not visible before attach()

attach(dataTabel)

LAI #is present

detach(dataTabel)

LAI #gives an error again

#-----------------------data types---------------------------------

#R uses several data types of which the following are of importance

#logical, numeric, integer, complex, character

#vector, factor, matrix, table, list

factor(c(1,5,1,7,5,7)) # nominal variable with levels 1, 5, 7

array(c(1,5,1,7,5,7)) # plain numeric array

#Since R is object oriented, almost anything is a data type, e.g. a plot or an anova

#Using the wrong data type can give you a warning/error or unexpected results

8*array(1:5) # fine: elementwise arithmetic

8*factor(1:5) # arithmetic on a factor: returns NAs with a warning

plot(array(1:5),c(5,3,7,8,4)) # numeric x-axis -> scatter plot

plot(factor(1:5),c(5,3,7,8,4)) # factor x-axis -> box plots per level

#======================the anova=========================

#---------------let's set up an experiment

#Design: factorial design, 3 genotypes, 2 phosphorus treatments, 4 reps = 24

# gl(n, k, length) generates a factor with n levels, each repeated in blocks of
# k, recycled out to 'length' values
niveau<-gl(1,24,24)

# spell the argument out as 'labels=' (plural); the original 'label=' only
# worked through partial argument matching
genotype<-gl(3,1,24,labels=c("inefficient","efficient","landrace"))

phosphorus<-gl(2,3,24,labels=c("low","high"))

reps<-gl(4,6,24)

#let's fabricate data

set.seed(1) # make the simulated errors reproducible across runs

g<-array(c(-6,9,-3),24) # genotype main effects, recycled over the 24 plots

p<- array(c(-15,-15,-15,15,15,15),24) # phosphorus main effects

gp<-array(c(-8,2,6,8,-2,-6),24) # interaction effects

error<-rnorm(24, mean=0, sd=1) # random noise

DMroot<-50+g+p+gp+error # response: root dry matter

#all data in one table: normally this is what you would get when you

#read your data from a file

expdata<-data.frame(niveau,genotype,phosphorus,reps,DMroot)

#-----------------------check your data!!!!!-------------------

#1) are missing values NA?

#pitfalls: zeros or blanks where there are missing values: replace with NA!!!

summary(dataTabel$LAI==0) # how many zeros are there?

# Use the missing-value constant NA, not the string "NA": assigning the string
# silently coerces the whole column to character. Index with the column vector
# (dataTabel$LAI), not the one-column data frame, so lengths line up.
dataTabel$LAI[dataTabel$LAI==0]<-NA

summary(dataTabel$LAI==0) # zeros are gone (now NA)

#2) are nominal variables factors?

is.factor(genotype) #if FALSE use the factor() function

summary(dataTabel)

#3) are all factors and data of the correct length, and in the same range?

length(genotype)

#4) are the numbers in the right range?

summary (genotype) # nominal: counts per level

summary (DMroot) # continuous: five-number summary plus the mean

#or just all in one

summary(expdata)

#----------------------do the analysis of variance-------------------------

# two-way ANOVA with interaction
anova1<-aov(DMroot~genotype*phosphorus)

#The model includes all interaction terms. Explicit notation is DMroot~genotype+phosphorus+genotype:phosphorus

#-------------------------study data----------------------------------

anova(anova1) #ANOVA table (sums of squares and F tests, not coefficients)

summary(anova1) #ANOVA table as well (aov's summary method)

# use the anova object defined above; no object named 'modelfit' exists yet
names(anova1) #see what is available, e.g. anova1$fitted.values gives fitted values

names(summary(anova1)) #see what is available in the summary (original was missing a closing parenthesis)

anova1$res #residuals ($res partially matches $residuals)

#-------------------check for normality and outliers----------------

plot(anova1) # standard diagnostic plots: residuals vs fitted, Q-Q, etc.

boxplot(DMroot~genotype*phosphorus) # spread per treatment combination

hist(anova1$res) # histogram of the residuals

#-----------------------Type I, II, III sums of square--------------------

#Quote R website:

#"7.18 Why does the output from anova() depend on the order of factors in the model?

#

#In a model such as ~A+B+A:B, R will report the difference in sums of squares between

#the models ~1, ~A, ~A+B and ~A+B+A:B. If the model were ~B+A+A:B, R would report

#differences between ~1, ~B, ~A+B, and ~A+B+A:B . In the first case the sum of squares

#for A is comparing ~1 and ~A, in the second case it is comparing ~B and ~B+A. In a

#non-orthogonal design (i.e., most unbalanced designs) these comparisons are

#(conceptually and numerically) different.

#

#Some packages report instead the sums of squares based on comparing the full model to

#the models with each factor removed one at a time (the famous 'Type III sums of squares'

#from SAS, for example). These do not depend on the order of factors in the model.

#The question of which set of sums of squares is the Right Thing provokes low-level

#holy wars on R-help from time to time.

#There is no need to be agitated about the particular sums of squares that R reports.

#You can compute your favorite sums of squares quite easily. Any two models can be

#compared with anova(model1, model2), and drop1(model1) will show the sums of squares

#resulting from dropping single terms."

#In statistics class we all learn Type I -> it's R's default

anova(anova1) # sequential (Type I) sums of squares

#SAS and SPSS use type III sums of squares as default for purely factorial designs!

#R's library car can produce those for you, with the warning that you must decide whether they make sense:

library(car)

Anova(anova1) #Type II is the default; watch the capital A, used to avoid conflict with the base anova()

# the original call ended in a dangling comma; pass the type explicitly for Type III
Anova(anova1, type="III")

#-------------------------------post hoc tests-----------------------

#TukeyHSD (Tukey Honest Significant Difference)

TukeyHSD(anova1,ordered=TRUE,conf.level=0.95) #plot() will accept the result.

#-------------------------------a little about the formula---------------------------

# this is just taken from the help files: ?formula

# + = additive

# * = factor crossing: main effects plus all interaction terms (a*b = a + b + a:b)

# : = interaction term only (a:b)

# ^n = only n level interaction terms i.e. (a+b+c)^2 will exclude three way interaction

# %in% = nested designs i.e. a+b %in% a

# - = remove a term from the model, also -1 is a no intercept model, which can also be noted as y~0+x

# I() = evaluate the enclosed expression arithmetically instead of as formula notation

#

#

#--------------------------------bloc design--------------------------

# example modified from help files (the classic NPK factorial experiment)

N <- as.factor(c(0,1,0,1,1,1,0,0,0,1,1,0,1,1,0,0,1,0,1,0,1,1,0,0))

P <- as.factor(c(1,1,0,0,0,1,0,1,1,1,0,0,0,1,0,1,1,0,0,1,0,1,1,0))

K <- as.factor(c(1,0,0,1,0,1,1,0,0,1,0,1,0,1,1,0,0,0,1,1,1,0,1,0))

yield <- c(49.5,62.8,46.8,57.0,59.8,58.5,55.5,56.0,62.8,55.8,69.5,55.0,

62.0,48.8,45.5,44.2,52.0,51.5,49.8,48.8,57.2,59.0,53.2,56.0)

# six blocks of four plots each
block<-gl(6,4)

npk <- data.frame(block,N,P,K,yield)

aov(yield ~ block + N * P + K, npk) # or lm; the main difference is the formatting of the output

#???? random versus fixed effects

#---------------------------------splitplot--------------------------------

#for a split-plot design, define the error term in the model with Error()

#note: you can not use lm here, since lm does not understand the Error() function

yield <- rnorm(20, mean=5, sd=1) # NOTE(review): unseeded random response -- results differ per run

fertilizer <- gl(2,2,20) # whole-plot factor

genotype <- gl(2,1,20) # sub-plot factor

# NOTE(review): fertilizer*genotype already implies both main effects, so the
# explicit + fertilizer + genotype terms are redundant (harmless, but noisy)
h <- aov(yield ~ fertilizer + genotype + fertilizer*genotype + Error(fertilizer))

summary(h)

#=============================Regression analysis===========

#let's create some perfect data (of course, you would normally measure things)

x<-1:15

y<-10-1.5*x+0.2*x^2+rnorm(15, mean=0, sd=1) # quadratic trend plus noise

#check data as before

#Do the regression analysis: in this case it's a linear model

x2<-x^2 #including the square directly in the formula (y~x+x^2) will give you wrong results without warning! (^ means crossing inside a formula; alternatively use I(x^2))

modelfit<-lm(y~x+x2)

modelfit #see the estimated coefficients

summary(modelfit) #see the statistics

plot(x,y)

lines(x,modelfit$fit) #$fit partially matches modelfit$fitted.values

# check afterwards: normality, outliers, no systematic pattern in residuals, equal variance in all treatments

#1) outliers

# diagnostic plots for the fitted model; the original called plot(fit), but no
# object named 'fit' exists -- the model was stored in 'modelfit'
plot(modelfit)

#based on Cook's distance we may want to look at point 9: colour it red

plot(x,y)

lines(x,modelfit$fit)

points(x[9],y[9],col=2) #col=2 is red in the default palette

#histogram of residuals (for whoever likes histograms)

# the original referenced 'ra', which is never defined; the residuals live on 'modelfit'
hist(modelfit$res)

#boxplot of the residuals

# the original 'boxplot(..)' was a syntax-error placeholder; a boxplot of the
# model residuals was presumably intended -- TODO confirm
boxplot(modelfit$res)

#stepwise regression

#let's put some extra parameters in the model

# powers must be wrapped in I(): inside a formula ^ means factor crossing, so
# the original x^4 and x^3 silently collapsed to just x
modelfit<-lm(y~x+x2+I(x^3)+I(x^4))

# starting from the full model, terms are dropped one at a time; "forward" from
# a full model with no scope is a no-op, so step backward instead
step(modelfit,direction="backward")

#================transposing data and different distributions========

#Generalized linear model allows for analysis of data that is not normal distributed

#Typically count data has a poisson distribution

#Following example came from the R help files

## Dobson (1990) Page 93: Randomized Controlled Trial :

counts <- c(18,17,15,20,10,20,25,13,12)

outcome <- gl(3,1,9)

treatment <- gl(3,3)

print(d.AD <- data.frame(treatment, outcome, counts))

glm.D93 <- glm(counts ~ outcome + treatment, family=poisson())

anova(glm.D93)

summary(glm.D93)

#Density-response data typically follows a Gamma distribution

#Following example came from the R help files

# A Gamma example, from McCullagh & Nelder (1989, pp. 300-2)

# clotting times for two lots of plasma at nine dilutions u
clotting <- data.frame(
  u    = c(5, 10, 15, 20, 30, 40, 60, 80, 100),
  lot1 = c(118, 58, 42, 35, 27, 25, 21, 19, 18),
  lot2 = c(69, 35, 26, 21, 18, 16, 13, 12, 12)
)

# Gamma GLM (canonical inverse link) with log(u) as the predictor, one fit per lot
summary(glm(lot1 ~ log(u), data=clotting, family=Gamma))

summary(glm(lot2 ~ log(u), data=clotting, family=Gamma))

#===============================saving my work============

#copy / paste form the terminal with right mouse click

#graphs are plotted to the selected device (normally the screen). You can choose to

#plot to for example a PDF by selecting that device. See ?device for possible options.

#write function will write text files. This is mainly handy when automating. Otherwise just use copy paste

#load and save the workspace with save(file="something.rdata") and load(file="something.rdata")

#================================advanced: example=========

#mixed models (example from help files)

library(lme4)

# fixed treatment effect, random intercepts for row and column position
lmer(decrease ~ treatment + (1|rowpos) + (1|colpos),OrchardSprays)

#Principal component analysis (PCA)

library(stats)

prcomp(USArrests, scale = TRUE) # scale=TRUE standardises the variables first

plot(prcomp(USArrests)) # scree plot of the component variances

summary(prcomp(USArrests, scale = TRUE))

biplot(prcomp(USArrests, scale = TRUE)) # observations and variable loadings in one plot

#================================Automation of data analysis===

#you can create your own functions with the keyword "function"

# adds its two arguments elementwise (NOTE: this masks base::sum in the workspace)
sum <- function(x1, x2) {
  x1 + x2
}

#Test the function

sum(1,5)

sum(1:3,4:6) # vectorised: elementwise sums

#How error prone is the function?

sum(1:3,1:4) # unequal lengths: recycles, with a warning

#can we see the function's code?

sum # typing the name without () prints the definition

#can we see other functions' code, i.e. matrix? Yes, we can learn from built-in functions

matrix

#Of course there is a basic set that is not expressed in the R language. For example, dimnames is such a primitive

dimnames

#Many libraries are available with preprogrammed functions for you to use

library() #lists all libraries installed on your system

library(base) #loads the base library (default on start up)

#(yes you can create your own library and share it with the world)

#(yes you can create your own library and share it with the world)

#==== how to summarize data=========

#tapply applies a function to each group of values

a <- 1:10

b <- gl(3, 1, 10)   # group labels 1,2,3 recycled across the ten values

# mean of a within each level of b
print(tapply(a, b, mean))

library("stats")    # sd() lives in stats (attached by default anyway)

# standard deviation per group
print(tapply(a, b, sd))

#aggregate is similar to tapply

#model.tables gives means and standard errors

#given an anova object m (e.g. m <- aov(...); not defined in this script)

# spell out TRUE: T is an ordinary variable that can be reassigned
model.tables(m,se=TRUE)