# --------------------------------------------------------------------- # R Working Group, WSU: Using R on air quality datasets, 2017-02-15 # # by Tsengel Nergui, Laboratory for Atmospheric Research, WSU # # Contents: # A. Determining the distribution of data (some looping) # B. A ploblem with a donut plot # # C. PCA application on air quality dataset # D. Displaying an image in R graphic device # ---------------------------------------------------------------------- #--- Basic libraries library(stats) library(plyr) # dplyr must be called before dplyr library(dplyr) # data manipulation library(ggplot2) # plotting #--- clean up data in virtual memory rm(list = ls()) #--- Working directory wrkdir <- "C:/Users/Ts/My_R_Workplace/R-WorkingGroup/PCAonAQ" setwd(wrkdir) list.files(wrkdir) # C. PCA application on air quality dataset ------------------------------- # # Data Prep: hourly O3 concnetration along with meteorological variables in Maryland for 2012 # rm(raw) # raw =read.table("BEL116_hourly_O3_met_2012.txt",na.strings = 99999, header=T,stringsAsFactors = FALSE) # # dim(raw) # glimpse(raw) # summary(raw) # str(raw) # # library(lubridate) # ## Note: parse_date_time set an input vector into POSIXct date-time object, at which requires timezone infor for pase output. # #raw$date <- lubridate::parse_date_time(raw$DATE, orders = "%m/%d/Y%", tz = "GMT") # raw$date <- as.POSIXct(base::strptime(raw$DATE, format="%m/%d/%Y", tz = "GMT")) # str(raw) # # # dim(raw) # glimpse(raw) # # # select subsection of (rows & columns) of the data # rm(dat, dat1) # # select subsetting by columns of the data # #extracting 2 summer months # dat <- raw %>% dplyr::filter(date >= "2012-07-01",date <= "2012-08-31") # dat1 <- subset(dat,select=-c(SITE_ID,DATE,TIME,date)) # # write.table(dat1, "BEL116_hourly_O3_met_2012Summer.txt") #R session start from hereon rm(dat1) dat1 <- read.table("BEL116_hourly_O3_met_2012Summer.txt",header=T,stringsAsFactors = FALSE) summary(dat1);dim(dat1) str(dat1);colnames(dat1) # a new df for analysis dat1[complete.cases(dat1),]->mydat # writing complete cases to dat2 file dim(mydat) # PCA starts here... --------------------------------------------------------- # Application: # 1. Data reduction # 2. Exploring/examine covariance structure of multi-variate #Correlated variables will captured by a new variable called a principal component # testings for assumptions #Library for KMO test for measuring sampling adequecy library(psych) psych::KMO(cor(mydat)) # or KMO(mydat) both works #the Kaiser-Meyer-Olkin (KMO) index KMO(mydat) #Rule of thumb: Measure of Sampling Adequacy (MSA) >0.5 good to go # Bartlett's test that a correlation matrix is an identity matrix cortest.bartlett(mydat,n = dim(mydat)[1]) ?cortest.bartlett #Conc: if p.value is very small (0 than alpha level) # indicates there are significant relationships among variables # Eigen val and vector on Correlation matrix of data dim(mydat) # When is better to use standardized/ raw datasets #sd.dat<-(scale(mydat));sd.dat #standardizing data #mydat<-sd.dat cor<- cor(mydat);round(cor,2); eigen(cor(mydat)) # What is eigen value and vector??? eval<-eigen(cor(mydat))$values;eval # 13 eigenvalues round(eval,2) evec<-eigen(cor(mydat))$vectors;evec rm(pc.aq) # PCs from the correlation matrix pc.aq <- stats::princomp(mydat, cor = TRUE) # use this for cases obs < variables ls (pc.aq) # Scree plot and biplot dev.off() par(mfrow = c(1,2)) require(stats) # Scree plot represents eigenvalue distribution (explained variance) of principal components screeplot(pc.aq, npcs = 10, type = "lines", main = "air quality dataset") # npcs refers to PCs biplot(pc.aq) biplot(pc.aq,choices=c(1,3)) # use "choices" to plot other # Print variance accounted by PCs summary(pc.aq) pc.aq$loadings # predicting y_hat #(PC1) predict(pc.qa)[,1:2] # predict by the first 2 PCs write.table(pc.aq$loadings, file = "airquality_PCs.txt", append = FALSE, quote = TRUE, sep = " ", eol = "\n", na = "NA", dec = ".", row.names = TRUE, col.names = TRUE, qmethod = c("escape", "double"), fileEncoding = "") pc.load <- read.table("airquality_PCs.txt",header=T,stringsAsFactors = FALSE) list.files(wrkdir) # D. Displaying an image in R graphic device --------------------------------- install.packages("jpeg") ## if necessary library(jpeg) img <- readJPEG("PCloadings.jpg",native = TRUE) # http://stackoverflow.com/questions/9543343/plot-a-jpg-image-using-base-graphics-in-r plot_jpeg = function(path, add=FALSE) { require('jpeg') jpg = readJPEG(path, native=T) # read the file res = dim(jpg)[2:1] # get the resolution if (!add) # initialize an empty plot area if add==FALSE plot(1,1,xlim=c(1,res[1]),ylim=c(1,res[2]),asp=1,type='n',xaxs='i',yaxs='i',xaxt='n',yaxt='n',xlab='',ylab='',bty='n') rasterImage(jpg,1,1,res[1],res[2]) } dev.off() plot_jpeg("PCloadings.jpg") plot_jpeg("PCloadings_SO2.jpg") # Data Reduction ---------------------------------------------------------- head(dat1) colnames(dat1) rm(dat2) dat2 <- base::subset(dat1,select=c(TEMPERATURE,RELATIVE_HUMIDITY,SOLAR_RADIATION,SIGMA_THETA,WINDSPEED,WINDSPEED_SCALAR)) dim(dat2) # a new df for analysis dat2[complete.cases(dat2),]->mydat2 # writing complete cases to dat2 file dim(mydat2) rm(pc.aq2) pc.aq2 <- princomp(mydat2, cor = FALSE) # use this for cases obs < variables ls(pc.aq2) dev.off() par(mfrow = c(1,2)) # Scree plot and biplot screeplot(pc.aq2, npcs = 6, type = "lines", main = "meteorology dataset") # npcs refers to PCs biplot(pc.aq2) # Print variance accounted by PCs summary(pc.aq2) ## Summary: ## PC1 captures 99% of total variance of 6 meteorological variable. ## In other words, we can use PC1 representing meteorological condition ## that originally was characterized by variables on statistical model development such as in PCR or PFA. ## Tsengel Nergui