# ---------------------------------------------------------------------
#  R Working Group, WSU: Using R on air quality datasets, 2016-10-24
#
#  by Tsengel Nergui, PhD Candidate, Laboratory for Atmospheric Research, WSU
#
#  Contents:
#         A. Determining the distribution of data (some looping)
#         B. A ploblem with a donut plot
#         C. PCA application on air quality dataset
# ----------------------------------------------------------------------

#--- Basic libraries 
library(stats)
library(plyr)  # dplyr must be called before dplyr
library(dplyr) # data manipulation
library(ggplot2) # plotting

#--- clean up data in virtual memory
rm(list = ls())

#--- Working directory
wrkdir <- "C:/Users/Ts/My_R_Workplace/R-WorkingGroup/"
setwd(wrkdir)

list.files(wrkdir)


# A. Determining the distribution of data ----------------------------------------

# input and output directories
#indir  <- "C:/Users/Ts/3_Wavelet/wave_matlab -Ndep/detrended_nadp4wavelet_201312b/"  # laptop directory
#indir  <- "//fs1.cee.wsu.edu/students$/home/ntsengel/Tsengel/Wavelet/wave_matlab -Ndep/detrended_nadp4wavelet_201312/" # desktop directory
indir  <- wrkdir

list.files(indir)

site.list <-c('AL10','AZ03','CA76','CO00');

#diagnosis of input data
site.list[1]
length(site.list); str(site.list)

# --- Examining distribution of precip and Ndep at each site -------------------------------

# #read example input file
fname = paste(indir,site.list[2],"_nadp4wavelet.txt",sep="")
nadp<-read.table(fname,header=T)

head(nadp)

dev.off()
hist(nadp$Ppt)

col2run = 2  # 2 for Ppt, 3 for NH4-N, 4 for NO3-N, 5 for TotalN-N
nadp[col2run]

# test example-----------
## http://stats.stackexchange.com/questions/132652/how-to-determine-which-distribution-fits-my-data-best
library(fitdistrplus)

fit1 <- fitdist(nadp[,col2run],"norm") # write to an object
ls(fit1)
fit1$aic  # the Akaike Information Criteria (AIC) developed by Akaike (1973)
fit2 <- fitdist(nadp[,col2run],"gamma") # write to an object
ls(fit2)
fit2$aic

#Decision: the lower the absolute value of loglik or AIC values indicate a better fit:

library(moments)
skewness(nadp$Ppt)
kurtosis(nadp$Ppt)

# Creating explortary matrix on obs ---
library(fitdistrplus)
rm(EDA.NH4)

EDA.NH4 <- data.frame(site_id=character(),
                        gaus=numeric(), 
                        exp=numeric(),
                        gamma=numeric(),
                        lnorm = numeric(),
                        best = character(),
                        length_dat=numeric(),
                        skewness=numeric(),
                        kurtosis=numeric(),
                        nrmality=character(),
                        stringsAsFactors = FALSE)

# loopint through sites---
for (i in 1:length(site.list)) {
  
  fname = paste(indir,site.list[i],"_nadp4wavelet.txt",sep="")
  dat_tmp <- read.table(fname,header=T)

  # NH4 statistics 
  EDA.NH4[i,1] <- site.list[i]
  EDA.NH4[i,2] <- fitdist(1+dat_tmp$NH4, "norm")$aic
  EDA.NH4[i,3] <- fitdist(1+dat_tmp$NH4, "exp")$aic   
  EDA.NH4[i,4] <- fitdist(1+dat_tmp$NH4, "gamma")$aic
  EDA.NH4[i,5] <- fitdist(1+dat_tmp$NH4, "lnorm")$aic
  EDA.NH4[i,6] <- names(which.min(apply(EDA.NH4[i,2:5], MARGIN = 2, min))) # MARGIN = 2 is for getting column name
  EDA.NH4[i,7] <- as.integer(length(dat_tmp$NH4)/4)
  library(moments)
  EDA.NH4[i,8] <- skewness(dat_tmp$NH4)
  EDA.NH4[i,9] <- kurtosis(dat_tmp$NH4)
  if (shapiro.test(dat_tmp$NH4)$p.value > 0.05) EDA.NH4[i,10] <- "Yes,norm"  #Yes, there is >0.05 evidence to support H0: x~N(mu,sigma)
  if (shapiro.test(dat_tmp$NH4)$p.value <= 0.05) EDA.NH4[i,10] <- "No,not-norm"
  
  # clearing tmp_dat 
  rm(dat_tmp)
  
} # end of site loop


library(dplyr)
glimpse(EDA.NH4)
glimpse(data1)

write.table(EPA.NH4.N,"EDA_NH4_delme.csv",row.names=F,col.names=T,sep=",", quote=FALSE)







# B. A problem with a Donut plot------------------------------------------------------------

rm(datain)
# following csv file was created based on C:\Users\Ts\My_R_Workplace\NEI2002_emis_compare\3states_1997_Ndep_LUgroups_SPC_summary.txt file, which was prapared by Fortran codes 
# in /fastscratch/tnergui/post/cctm/deposition/sum_grids4LU foder in aeolus.
#datain <- read.table(file="C:/Users/Ts/My_R_Workplace/NEI2002_emis_compare/LUgroups_NdepSPC_1997.csv", skip=0, header = TRUE, sep=",", quote = "\"'", stringsAsFactors=TRUE,
#                         na.strings = "NA" )

# read input file
fname2 = paste(indir,"LUgroups_NdepSPC_1997.csv",sep = "")
datain <- read.table(fname2,
               skip=0, header = TRUE, sep=",", quote = "\"'", stringsAsFactors=TRUE, na.strings = "NA" )

head(datain)
str(datain)
library(dplyr)

dat1 <- datain %>%
  filter(year == 1997 & month == 12 ) %>%
  select(LUgroup,SPC,Flux_GgN)

colnames(dat1) <- c("LandUse","species","Ndep")
head(dat1)

# Donut plot, version 1----------------------------------------------------------------

donuts_plot <- function(
  panel = runif(3), # counts
  pctr = c(.5,.2,.9), # percentage in count
  legend.label='',
  cols = c('chartreuse', 'chocolate','deepskyblue'), # colors
  outradius = 1, # outter radius
  radius = .7,   # 1-width of the donus 
  add = F,
  innerradius = .5, # innerradius, if innerradius==innerradius then no suggest line
  legend = F,
  pilabels=F,
  legend_offset=.25, # non-negative number, legend right position control
  borderlit=c(T,F,T,T)
  
){ 
  par(new=add)
  if(sum(legend.label=='')>=1) legend.label=paste("Series",1:length(pctr))
  if(pilabels){
    idx <- which(panel <= 0.05)
    labeltext=paste(legend.label,round(panel,1),sep=": ")
    labeltext[idx] = ' '
    #pie(panel, col=cols,border = borderlit[1], labels = paste(legend.label,format(panel,scientific=TRUE,digits = 2),sep=": "), radius = outradius)
    pie(panel, col=cols, border = borderlit[1], labels = labeltext, radius = outradius)
  }
  
  
  panel = panel/sum(panel)
  
  pctr2= panel*(1 - pctr)
  pctr3 = c(pctr,pctr)
  pctr_indx=2*(1:length(pctr))
  pctr3[pctr_indx]=pctr2
  pctr3[-pctr_indx]=panel*pctr 
  cols_fill = c(cols,cols)
  cols_fill[pctr_indx]='white'
  cols_fill[-pctr_indx]=cols
  par(new=TRUE)
  pie(pctr3, col=cols_fill,border = borderlit[2],labels = '',radius = outradius)  # radius =1
  par(new=TRUE)
  pie(panel, col='white',border = borderlit[3],labels = '',radius = radius)     # radius =0.7
  par(new=TRUE)
  pie(1, col='white',border = borderlit[4],labels = '', radius = innerradius)    # radius =0.5
  
  if(legend){
    # par(mar=c(5.2, 4.1, 4.1, 8.2), xpd=TRUE)
    legend("topright",inset=c(-legend_offset,0),legend=legend.label, pch=rep(15,'.',length(pctr)), 
           col=cols,bty='n')
  }
  par(new=FALSE)
}

## col- > subcor(change hue/alpha)
subcolors <- function(.dta,main,mainCol){
  tmp_dta = cbind(.dta,1,'col')
  tmp1 = unique(.dta[[main]])
  for (i in 1:length(tmp1)){
    tmp_dta$"col"[.dta[[main]] == tmp1[i]] = mainCol[i]
  }
  u <- unlist(by(tmp_dta$"1",tmp_dta[[main]],cumsum))
  n <- dim(.dta)[1]
  subcol=rep(rgb(0,0,0),n);
  for(i in 1:n){
    t1 = col2rgb(tmp_dta$col[i])/256
    subcol[i]=rgb(t1[1],t1[2],t1[3],1/(1+u[i]))
  }
  return(subcol);
}
# end of function

# preparing data for plotting -------------
colnames(dat1) <- c("LandUse","species","Ndep")
dat2 <- dat1
dat2
dat2$ymax=cumsum(dat2$Ndep)
dat2$ymin=cumsum(dat2$Ndep)-dat2$Ndep
colnames(dat2)

# parameters for the dunut_plot funtion--------------
kk <- dim(dat2)[1] # total number of colors
mm <- length(unique(dat2$LandUse)) # total number of LU groups
xx <- kk-1  # total number of borderlit

dat2=dat2[order(dat2$LandUse,-dat2$Ndep),] # ordering Ndep values by reverse so that less contribtion will get ligher color scheme.
arr=aggregate(Ndep~LandUse,dat2,sum)

### choose your cols
# order of LU groups: ("crop", "forest", "othWet", "range", "spVeg", "tundra", "urban")
mainCol =  c('goldenrod3','forestgreen', 'deepskyblue2','aquamarine3','darkkhaki','snow4','gray40')  #chartreuse3 =lime color

# ploting outer pie
dev.off()
donuts_plot(dat2$Ndep,rep(1,kk),dat2$species,
            cols=subcolors(dat2,"LandUse",mainCol),outradius = 1.0,
            legend=F,pilabels = T,border=rep("white",xx))


#ploting inner pie only
donuts_plot(arr$Ndep,rep(1,mm),arr$LandUse,
            cols=mainCol,pilabels=T,legend=F,legend_offset=-.052,
            outradius = .4,radius = .0,innerradius=.0,add=T, border =rep("white",xx) )

# A thing I wanted on this plot is to have inner labels are printed inside of the pies.
# ... I gave up after spending 2 weeks on it, and endedup plotting in Excel.



# C. PCA application on air quality dataset -------------------------------

# hourly O3 measurement along with meteorological variables in Maryland
raw =read.table("BEL116_hourly_O3_met_2012.txt",na.strings = 99999, header=T)

glimpse(raw)
dim(raw)


# ...Coming soon