##http://www.statmethods.net/stATS/index.html##
library(dplyr)
library(ggplot2)
#today we'll be playing with iris
iris
#t.test - independant : one species vs another
virginica <- iris %>%
filter(Species == "virginica") %>%
select(Sepal.Length)
virginica <- virginica$Sepal.Length
setosa <- iris %>%
filter(Species == "setosa") %>%
select(Sepal.Length)
setosa <- setosa$Sepal.Length
hist(virginica)
hist(setosa)
t.test(virginica, setosa)
#t.test - paired : different populations of the same species
vlow <- iris %>%
filter(Species == "virginica", Petal.Length <5.55) %>%
select(Sepal.Length)
vlow <- vlow$Sepal.Length
vhi <- iris %>%
filter(Species == "virginica", Petal.Length >5.55) %>%
select(Sepal.Length)
vhi <- vhi$Sepal.Length
hist(vlow)
hist(vhi)
t.test(vlow, vhi, paired=TRUE)
#RESAMPLING to compare distributions
r.dat <- virginica
#MAKING REPLICATED DATASETS (lots of "Obs")
resamples = 1000
matrix = replicate(resamples, sample(r.dat, length(r.dat), replace = T))
hist(matrix)
#BINNING AND COUNTING OCCURANCES PER BIN IN EACH REPLICATE
#lets try a forloop so we can go through it automatically
number = 1000
low = NULL
for (i in seq(1:number)){
low = c(low, length(matrix[which(matrix[,i]<6)]))
}
med = NULL
for (i in seq(1:number)){
med = c(med, length(matrix[which(matrix[,i]>6&matrix[,i]<7)]))
}
hi = NULL
for (i in seq(1:number)){
hi = c(hi, length(matrix[which(matrix[,i]>7)]))
}
#PUTTING THE REPLICATES INTO A DATABASE
counts = NULL
counts$low = low
counts$med = med
counts$hi = hi
counts
#GETTING THE MEAN FOR EACH CATEGORY
mean.low = mean(counts$low)
mean.med = mean(counts$med)
mean.hi = mean(counts$hi )
#DETERMINING HOW MANY OVERLAP WITH EXPECTED
#expected is even across all bins
expect = length(Dat)/3
#how many times did we get that or over?
over.low = sum(counts$low >= expect)
over.med = sum(counts$med >= expect)
under.hi = sum(counts$hi <= expect)
#DETERMINING P-VALUE
#What fraction of the time are my numbers over the expected values? That number divided by the number of resamples is my p-value
#p-values are the % of occurances equal or over the expected value, divided by the number of replicates
p.low = over.low/resamples
p.med = over.med/resamples
p.hi = under.hi/resamples
p.low
p.med
p.hi
#Generating Plot
par(mfrow = c(1, 1))
means = c(mean.low, mean.med, mean.hi)
plot.1 <- barplot(means, axes=FALSE, axisnames=FALSE, ylim=c(0, 10+max(counts$hi)),
main="Sepal Evenness", xlab="Bin", ylab="Frequency")
# The x-axis with labels for each group
# The labels are drawn at the bar midpoints
axis(1, labels=c("0-5", "6-7", "7-higer"), at = plot.1)
# The y-axis with the age going from 0 to 60
axis(2, at=seq(0 , max(counts$hi+10), by=5))
# Put the plot in a box
# Get standard deviation of each group
# The standard deviations are saved in a matrix of same size
# as the matrix with midpoints, this is useful for plotting
# the error bars
stDevs <- matrix(c(sd(low), sd(med), sd(hi)), 3)
# Plot the vertical lines of the error bars
# The vertical bars are plotted at the midpoints
segments(plot.1, means - stDevs, plot.1, means + stDevs, lwd=2)
# Now plot the horizontal bounds for the error bars
# 1. The lower bar
segments(plot.1 - 0.1, means - stDevs, plot.1 + 0.1, means - stDevs, lwd=2)
# 2. The upper bar
segments(plot.1 - 0.1, means + stDevs, plot.1 + 0.1, means + stDevs, lwd=2)
#adding stars where significant
#change the "x= " to change position to the center of the box to make it look good.
#low star
text(x= .7, y=max(low)+20,"*",cex=2)
#med star
text(x= 1.9, y=max(med)+20,"*",cex=2)
#hi star
text(x= 3.1, y=max(hi)+20,"*",cex=2)
#using ggplot to do this
library(ggplot2)
average.all <- data.frame(means)
average.all$bin <- c("0-5", "6-7", "7-higher")
ggplot.resample <-
ggplot(average.all) + aes(bin, means) + geom_bar(stat = "identity") + theme_bw()
#making the standard deviation bars
#stDevs is the deviation
average.all$stDev <- stDevs[,1]
average.all$upper <- average.all$means + average.all$stDev
average.all$lower <- average.all$means - average.all$stDev
#plot with error bars
resample.plot <- ggplot.resample + geom_errorbar(aes(
ymax = upper,
ymin = lower),
data = average.all) +
labs(x = "Sepal Evenness", y = "Frequency") +
theme(plot.title = element_text(size = 25))
resample.plot