##http://www.statmethods.net/stATS/index.html##

library(dplyr)
library(ggplot2)

#today we'll be playing with iris 
iris

#t.test - independant : one species vs another

virginica <- iris %>%
  filter(Species == "virginica") %>%
  select(Sepal.Length)
virginica <- virginica$Sepal.Length

setosa <- iris %>%
  filter(Species == "setosa") %>%
  select(Sepal.Length)
setosa <- setosa$Sepal.Length

hist(virginica)
hist(setosa)

t.test(virginica, setosa)

#t.test - paired : different populations of the same species 

vlow <- iris %>%
  filter(Species == "virginica", Petal.Length <5.55) %>%
  select(Sepal.Length)
vlow <- vlow$Sepal.Length

vhi <- iris %>%
  filter(Species == "virginica", Petal.Length >5.55) %>%
  select(Sepal.Length)
vhi <- vhi$Sepal.Length

hist(vlow)
hist(vhi)

t.test(vlow, vhi, paired=TRUE)



#RESAMPLING to compare distributions 

r.dat <- virginica 

#MAKING REPLICATED DATASETS (lots of "Obs")

resamples = 1000
matrix = replicate(resamples, sample(r.dat, length(r.dat), replace = T))

hist(matrix)

#BINNING AND COUNTING OCCURANCES PER BIN IN EACH REPLICATE
#lets try a forloop so we can go through it automatically 
number = 1000

low = NULL 
for (i in seq(1:number)){
  low = c(low, length(matrix[which(matrix[,i]<6)]))
}

med = NULL
for (i in seq(1:number)){
  med = c(med, length(matrix[which(matrix[,i]>6&matrix[,i]<7)]))
}

hi = NULL
for (i in seq(1:number)){
  hi = c(hi, length(matrix[which(matrix[,i]>7)]))
}

#PUTTING THE REPLICATES INTO A DATABASE
counts = NULL
counts$low = low
counts$med = med
counts$hi  = hi

counts


#GETTING THE MEAN FOR EACH CATEGORY  
mean.low = mean(counts$low) 
mean.med = mean(counts$med) 
mean.hi  = mean(counts$hi ) 

#DETERMINING HOW MANY OVERLAP WITH EXPECTED 
#expected is even across all bins
expect = length(Dat)/3

#how many times did we get that or over? 
over.low = sum(counts$low >= expect)
over.med = sum(counts$med >= expect)
under.hi  = sum(counts$hi  <= expect) 

#DETERMINING P-VALUE 
#What fraction of the time are my numbers over the expected values? That number divided by the number of resamples is my p-value
#p-values are the % of occurances equal or over the expected value, divided by the number of replicates

p.low = over.low/resamples
p.med = over.med/resamples
p.hi  = under.hi/resamples

p.low
p.med
p.hi


#Generating Plot

par(mfrow = c(1, 1))

means = c(mean.low, mean.med, mean.hi)

plot.1 <- barplot(means, axes=FALSE, axisnames=FALSE, ylim=c(0, 10+max(counts$hi)),
                  main="Sepal Evenness", xlab="Bin", ylab="Frequency")


# The x-axis with labels for each group
# The labels are drawn at the bar midpoints
axis(1, labels=c("0-5", "6-7", "7-higer"), at = plot.1)
# The y-axis with the age going from 0 to 60 
axis(2, at=seq(0 , max(counts$hi+10), by=5))
# Put the plot in a box


# Get standard deviation of each group
# The standard deviations are saved in a matrix of same size 
# as the matrix with midpoints, this is useful for plotting 
# the error bars
stDevs <- matrix(c(sd(low), sd(med), sd(hi)), 3)
# Plot the vertical lines of the error bars
# The vertical bars are plotted at the midpoints
segments(plot.1, means - stDevs, plot.1, means + stDevs, lwd=2)
# Now plot the horizontal bounds for the error bars
# 1. The lower bar
segments(plot.1 - 0.1, means - stDevs, plot.1 + 0.1, means - stDevs, lwd=2)
# 2. The upper bar
segments(plot.1 - 0.1, means + stDevs, plot.1 + 0.1, means + stDevs, lwd=2)

#adding stars where significant 
#change the "x= " to change position to the center of the box to make it look good. 
#low star
text(x= .7, y=max(low)+20,"*",cex=2)
#med star
text(x= 1.9, y=max(med)+20,"*",cex=2)
#hi star
text(x= 3.1, y=max(hi)+20,"*",cex=2)


#using ggplot to do this 

library(ggplot2)

average.all <-  data.frame(means)
average.all$bin <-  c("0-5", "6-7", "7-higher")

ggplot.resample <-  
  ggplot(average.all) + aes(bin, means) + geom_bar(stat = "identity") + theme_bw() 


#making the standard deviation bars
#stDevs is the deviation 

average.all$stDev <-  stDevs[,1]

average.all$upper <-  average.all$means + average.all$stDev
average.all$lower <-  average.all$means - average.all$stDev

#plot with error bars 

resample.plot <-  ggplot.resample + geom_errorbar(aes(
  ymax = upper, 
  ymin = lower), 
  data = average.all) + 
  labs(x = "Sepal Evenness", y = "Frequency") + 
  theme(plot.title = element_text(size = 25))

resample.plot