##http://www.statmethods.net/stATS/index.html## library(dplyr) library(ggplot2) #today we'll be playing with iris iris #t.test - independant : one species vs another virginica <- iris %>% filter(Species == "virginica") %>% select(Sepal.Length) virginica <- virginica$Sepal.Length setosa <- iris %>% filter(Species == "setosa") %>% select(Sepal.Length) setosa <- setosa$Sepal.Length hist(virginica) hist(setosa) t.test(virginica, setosa) #t.test - paired : different populations of the same species vlow <- iris %>% filter(Species == "virginica", Petal.Length <5.55) %>% select(Sepal.Length) vlow <- vlow$Sepal.Length vhi <- iris %>% filter(Species == "virginica", Petal.Length >5.55) %>% select(Sepal.Length) vhi <- vhi$Sepal.Length hist(vlow) hist(vhi) t.test(vlow, vhi, paired=TRUE) #RESAMPLING to compare distributions r.dat <- virginica #MAKING REPLICATED DATASETS (lots of "Obs") resamples = 1000 matrix = replicate(resamples, sample(r.dat, length(r.dat), replace = T)) hist(matrix) #BINNING AND COUNTING OCCURANCES PER BIN IN EACH REPLICATE #lets try a forloop so we can go through it automatically number = 1000 low = NULL for (i in seq(1:number)){ low = c(low, length(matrix[which(matrix[,i]<6)])) } med = NULL for (i in seq(1:number)){ med = c(med, length(matrix[which(matrix[,i]>6&matrix[,i]<7)])) } hi = NULL for (i in seq(1:number)){ hi = c(hi, length(matrix[which(matrix[,i]>7)])) } #PUTTING THE REPLICATES INTO A DATABASE counts = NULL counts$low = low counts$med = med counts$hi = hi counts #GETTING THE MEAN FOR EACH CATEGORY mean.low = mean(counts$low) mean.med = mean(counts$med) mean.hi = mean(counts$hi ) #DETERMINING HOW MANY OVERLAP WITH EXPECTED #expected is even across all bins expect = length(Dat)/3 #how many times did we get that or over? over.low = sum(counts$low >= expect) over.med = sum(counts$med >= expect) under.hi = sum(counts$hi <= expect) #DETERMINING P-VALUE #What fraction of the time are my numbers over the expected values? That number divided by the number of resamples is my p-value #p-values are the % of occurances equal or over the expected value, divided by the number of replicates p.low = over.low/resamples p.med = over.med/resamples p.hi = under.hi/resamples p.low p.med p.hi #Generating Plot par(mfrow = c(1, 1)) means = c(mean.low, mean.med, mean.hi) plot.1 <- barplot(means, axes=FALSE, axisnames=FALSE, ylim=c(0, 10+max(counts$hi)), main="Sepal Evenness", xlab="Bin", ylab="Frequency") # The x-axis with labels for each group # The labels are drawn at the bar midpoints axis(1, labels=c("0-5", "6-7", "7-higer"), at = plot.1) # The y-axis with the age going from 0 to 60 axis(2, at=seq(0 , max(counts$hi+10), by=5)) # Put the plot in a box # Get standard deviation of each group # The standard deviations are saved in a matrix of same size # as the matrix with midpoints, this is useful for plotting # the error bars stDevs <- matrix(c(sd(low), sd(med), sd(hi)), 3) # Plot the vertical lines of the error bars # The vertical bars are plotted at the midpoints segments(plot.1, means - stDevs, plot.1, means + stDevs, lwd=2) # Now plot the horizontal bounds for the error bars # 1. The lower bar segments(plot.1 - 0.1, means - stDevs, plot.1 + 0.1, means - stDevs, lwd=2) # 2. The upper bar segments(plot.1 - 0.1, means + stDevs, plot.1 + 0.1, means + stDevs, lwd=2) #adding stars where significant #change the "x= " to change position to the center of the box to make it look good. #low star text(x= .7, y=max(low)+20,"*",cex=2) #med star text(x= 1.9, y=max(med)+20,"*",cex=2) #hi star text(x= 3.1, y=max(hi)+20,"*",cex=2) #using ggplot to do this library(ggplot2) average.all <- data.frame(means) average.all$bin <- c("0-5", "6-7", "7-higher") ggplot.resample <- ggplot(average.all) + aes(bin, means) + geom_bar(stat = "identity") + theme_bw() #making the standard deviation bars #stDevs is the deviation average.all$stDev <- stDevs[,1] average.all$upper <- average.all$means + average.all$stDev average.all$lower <- average.all$means - average.all$stDev #plot with error bars resample.plot <- ggplot.resample + geom_errorbar(aes( ymax = upper, ymin = lower), data = average.all) + labs(x = "Sepal Evenness", y = "Frequency") + theme(plot.title = element_text(size = 25)) resample.plot