# --------------------------------------------------------------------- # R Working Group, WSU: Using R on air quality datasets, 2016-10-24 # # by Tsengel Nergui, PhD Candidate, Laboratory for Atmospheric Research, WSU # # Contents: # A. Determining the distribution of data (some looping) # B. A ploblem with a donut plot # C. PCA application on air quality dataset # ---------------------------------------------------------------------- #--- Basic libraries library(stats) library(plyr) # dplyr must be called before dplyr library(dplyr) # data manipulation library(ggplot2) # plotting #--- clean up data in virtual memory rm(list = ls()) #--- Working directory wrkdir <- "C:/Users/Ts/My_R_Workplace/R-WorkingGroup/" setwd(wrkdir) list.files(wrkdir) # A. Determining the distribution of data ---------------------------------------- # input and output directories #indir <- "C:/Users/Ts/3_Wavelet/wave_matlab -Ndep/detrended_nadp4wavelet_201312b/" # laptop directory #indir <- "//fs1.cee.wsu.edu/students$/home/ntsengel/Tsengel/Wavelet/wave_matlab -Ndep/detrended_nadp4wavelet_201312/" # desktop directory indir <- wrkdir list.files(indir) site.list <-c('AL10','AZ03','CA76','CO00'); #diagnosis of input data site.list[1] length(site.list); str(site.list) # --- Examining distribution of precip and Ndep at each site ------------------------------- # #read example input file fname = paste(indir,site.list[2],"_nadp4wavelet.txt",sep="") nadp<-read.table(fname,header=T) head(nadp) dev.off() hist(nadp$Ppt) col2run = 2 # 2 for Ppt, 3 for NH4-N, 4 for NO3-N, 5 for TotalN-N nadp[col2run] # test example----------- ## http://stats.stackexchange.com/questions/132652/how-to-determine-which-distribution-fits-my-data-best library(fitdistrplus) fit1 <- fitdist(nadp[,col2run],"norm") # write to an object ls(fit1) fit1$aic # the Akaike Information Criteria (AIC) developed by Akaike (1973) fit2 <- fitdist(nadp[,col2run],"gamma") # write to an object ls(fit2) fit2$aic #Decision: the lower the absolute value of loglik or AIC values indicate a better fit: library(moments) skewness(nadp$Ppt) kurtosis(nadp$Ppt) # Creating explortary matrix on obs --- library(fitdistrplus) rm(EDA.NH4) EDA.NH4 <- data.frame(site_id=character(), gaus=numeric(), exp=numeric(), gamma=numeric(), lnorm = numeric(), best = character(), length_dat=numeric(), skewness=numeric(), kurtosis=numeric(), nrmality=character(), stringsAsFactors = FALSE) # loopint through sites--- for (i in 1:length(site.list)) { fname = paste(indir,site.list[i],"_nadp4wavelet.txt",sep="") dat_tmp <- read.table(fname,header=T) # NH4 statistics EDA.NH4[i,1] <- site.list[i] EDA.NH4[i,2] <- fitdist(1+dat_tmp$NH4, "norm")$aic EDA.NH4[i,3] <- fitdist(1+dat_tmp$NH4, "exp")$aic EDA.NH4[i,4] <- fitdist(1+dat_tmp$NH4, "gamma")$aic EDA.NH4[i,5] <- fitdist(1+dat_tmp$NH4, "lnorm")$aic EDA.NH4[i,6] <- names(which.min(apply(EDA.NH4[i,2:5], MARGIN = 2, min))) # MARGIN = 2 is for getting column name EDA.NH4[i,7] <- as.integer(length(dat_tmp$NH4)/4) library(moments) EDA.NH4[i,8] <- skewness(dat_tmp$NH4) EDA.NH4[i,9] <- kurtosis(dat_tmp$NH4) if (shapiro.test(dat_tmp$NH4)$p.value > 0.05) EDA.NH4[i,10] <- "Yes,norm" #Yes, there is >0.05 evidence to support H0: x~N(mu,sigma) if (shapiro.test(dat_tmp$NH4)$p.value <= 0.05) EDA.NH4[i,10] <- "No,not-norm" # clearing tmp_dat rm(dat_tmp) } # end of site loop library(dplyr) glimpse(EDA.NH4) glimpse(data1) write.table(EPA.NH4.N,"EDA_NH4_delme.csv",row.names=F,col.names=T,sep=",", quote=FALSE) # B. A problem with a Donut plot------------------------------------------------------------ rm(datain) # following csv file was created based on C:\Users\Ts\My_R_Workplace\NEI2002_emis_compare\3states_1997_Ndep_LUgroups_SPC_summary.txt file, which was prapared by Fortran codes # in /fastscratch/tnergui/post/cctm/deposition/sum_grids4LU foder in aeolus. #datain <- read.table(file="C:/Users/Ts/My_R_Workplace/NEI2002_emis_compare/LUgroups_NdepSPC_1997.csv", skip=0, header = TRUE, sep=",", quote = "\"'", stringsAsFactors=TRUE, # na.strings = "NA" ) # read input file fname2 = paste(indir,"LUgroups_NdepSPC_1997.csv",sep = "") datain <- read.table(fname2, skip=0, header = TRUE, sep=",", quote = "\"'", stringsAsFactors=TRUE, na.strings = "NA" ) head(datain) str(datain) library(dplyr) dat1 <- datain %>% filter(year == 1997 & month == 12 ) %>% select(LUgroup,SPC,Flux_GgN) colnames(dat1) <- c("LandUse","species","Ndep") head(dat1) # Donut plot, version 1---------------------------------------------------------------- donuts_plot <- function( panel = runif(3), # counts pctr = c(.5,.2,.9), # percentage in count legend.label='', cols = c('chartreuse', 'chocolate','deepskyblue'), # colors outradius = 1, # outter radius radius = .7, # 1-width of the donus add = F, innerradius = .5, # innerradius, if innerradius==innerradius then no suggest line legend = F, pilabels=F, legend_offset=.25, # non-negative number, legend right position control borderlit=c(T,F,T,T) ){ par(new=add) if(sum(legend.label=='')>=1) legend.label=paste("Series",1:length(pctr)) if(pilabels){ idx <- which(panel <= 0.05) labeltext=paste(legend.label,round(panel,1),sep=": ") labeltext[idx] = ' ' #pie(panel, col=cols,border = borderlit[1], labels = paste(legend.label,format(panel,scientific=TRUE,digits = 2),sep=": "), radius = outradius) pie(panel, col=cols, border = borderlit[1], labels = labeltext, radius = outradius) } panel = panel/sum(panel) pctr2= panel*(1 - pctr) pctr3 = c(pctr,pctr) pctr_indx=2*(1:length(pctr)) pctr3[pctr_indx]=pctr2 pctr3[-pctr_indx]=panel*pctr cols_fill = c(cols,cols) cols_fill[pctr_indx]='white' cols_fill[-pctr_indx]=cols par(new=TRUE) pie(pctr3, col=cols_fill,border = borderlit[2],labels = '',radius = outradius) # radius =1 par(new=TRUE) pie(panel, col='white',border = borderlit[3],labels = '',radius = radius) # radius =0.7 par(new=TRUE) pie(1, col='white',border = borderlit[4],labels = '', radius = innerradius) # radius =0.5 if(legend){ # par(mar=c(5.2, 4.1, 4.1, 8.2), xpd=TRUE) legend("topright",inset=c(-legend_offset,0),legend=legend.label, pch=rep(15,'.',length(pctr)), col=cols,bty='n') } par(new=FALSE) } ## col- > subcor(change hue/alpha) subcolors <- function(.dta,main,mainCol){ tmp_dta = cbind(.dta,1,'col') tmp1 = unique(.dta[[main]]) for (i in 1:length(tmp1)){ tmp_dta$"col"[.dta[[main]] == tmp1[i]] = mainCol[i] } u <- unlist(by(tmp_dta$"1",tmp_dta[[main]],cumsum)) n <- dim(.dta)[1] subcol=rep(rgb(0,0,0),n); for(i in 1:n){ t1 = col2rgb(tmp_dta$col[i])/256 subcol[i]=rgb(t1[1],t1[2],t1[3],1/(1+u[i])) } return(subcol); } # end of function # preparing data for plotting ------------- colnames(dat1) <- c("LandUse","species","Ndep") dat2 <- dat1 dat2 dat2$ymax=cumsum(dat2$Ndep) dat2$ymin=cumsum(dat2$Ndep)-dat2$Ndep colnames(dat2) # parameters for the dunut_plot funtion-------------- kk <- dim(dat2)[1] # total number of colors mm <- length(unique(dat2$LandUse)) # total number of LU groups xx <- kk-1 # total number of borderlit dat2=dat2[order(dat2$LandUse,-dat2$Ndep),] # ordering Ndep values by reverse so that less contribtion will get ligher color scheme. arr=aggregate(Ndep~LandUse,dat2,sum) ### choose your cols # order of LU groups: ("crop", "forest", "othWet", "range", "spVeg", "tundra", "urban") mainCol = c('goldenrod3','forestgreen', 'deepskyblue2','aquamarine3','darkkhaki','snow4','gray40') #chartreuse3 =lime color # ploting outer pie dev.off() donuts_plot(dat2$Ndep,rep(1,kk),dat2$species, cols=subcolors(dat2,"LandUse",mainCol),outradius = 1.0, legend=F,pilabels = T,border=rep("white",xx)) #ploting inner pie only donuts_plot(arr$Ndep,rep(1,mm),arr$LandUse, cols=mainCol,pilabels=T,legend=F,legend_offset=-.052, outradius = .4,radius = .0,innerradius=.0,add=T, border =rep("white",xx) ) # A thing I wanted on this plot is to have inner labels are printed inside of the pies. # ... I gave up after spending 2 weeks on it, and endedup plotting in Excel. # C. PCA application on air quality dataset ------------------------------- # hourly O3 measurement along with meteorological variables in Maryland raw =read.table("BEL116_hourly_O3_met_2012.txt",na.strings = 99999, header=T) glimpse(raw) dim(raw) # ...Coming soon