# --------------------------------------------------------------------- # R Working Group, WSU: Generalized Linear Mixed Models and Predictions in a Wolf-livestock System, 2017-03-22 # # by Zoe Hanley, Large Carnivore Conservation Laboratory, WSU # # Contents: # A. Assess full model fit (outliers, homogeneity, autocorrelation) # B. Determine best model using Akaike Information Criterion (AIC) # C. Ten-fold cross validation for Idaho and Montana (Internal Validation) # D. Model-averaged predictions for Washington (External Validation) # ---------------------------------------------------------------------- #---- Download packages library(glmmADMB) #Generalized Linear Mixed Modeling (GLMMs). Includes zero-inflated distributions. #Use download instructions from:http://glmmadmb.r-forge.r-project.org/ library(graphics) #temporal autocorrelation graphs library(lattice) #PACK vs. YEAR graphs library(bbmle) #AIC table library(plyr) #create cross-validation progress bar #---- Set working directory wrkdir <- "C:/Users/zlhan/Documents/WSU_GIS/R_Group" setwd(wrkdir) list.files(wrkdir) #---- Import GLMM dataset (Idaho and Montana data) GLMM =read.table("RGroup_GLMM.txt", header=T) #specify factors GLMM$YR_1DEP = as.factor(GLMM$YR_1DEP) GLMM$REPRODUCED = as.factor(GLMM$REPRODUCED) str(GLMM) #---- Prior to analyses: #1. Collinearity (corrplot package)/Multicollinearity (usdm package) - A predictor variable (or fixed effect) #was included in a multivariate model if it was not collinear (r > 0.70) with a stronger predictor. #Surviving predictors were tested for multicollinearity and removed if the Variance Inflation Factor #(VIF) > 3.0 (Cade, 2015; Zuur et al., 2010). #2. Standardize variables (x - mean)/(2*sd) - Continuous predictor variables were standardized using centering #procedures from Gelman (2008) to ease interpretation of interactions by placing binary and continuous #predictors on a common scale. #---- A. Run full GLMM (includes all variables) #What is a mixed model? #Includes fixed effects (predictor variables), and random effects (grouping variables). #Generalized linear mixed models (GLMMs) combine two statistical frameworks widely used in ecology, linear mixed #models(which include both fixed and random effects) and generalized mixed models (which handle non-normal #stochastic distributions; Bolker et al. 2009). A mixed modeling framework was chosen over the traditional logistic #regression because inter-annual observations within a wolf pack territory were not independent, therefore, #wolf pack was included as a random effect in the intercept. head(GLMM) mod_FULL <- glmmadmb(DEP_PACK ~ YR_1DEP + REPRODUCED + PACK_TOTALz + REPRODUCED + MADULTz + MBREEDz + FORESTz + CATTLE_abundz + PREYz + (1 | PACK), data = GLMM, zeroInflation = FALSE, family = "binomial") summary(mod_FULL) #---- Outliers (residuals vs. fitted values) #extract Pearsons residuals GLMM$resids = residuals(mod_FULL) hist(GLMM$resids, label = T, freq = T, main = "Histogram of full model residuals (frequency)", xlab = "resids" ) #---- Homogeneity #extract fitted and predicted values GLMM$fitted = fitted(mod_FULL) #"raw" values GLMM$predict = predict(mod_FULL, type = "response") #values in on the same scale as the response variable #plot plot(GLMM$fitted, GLMM$resids, main = "FULL fitted vs. resids") #----Autocorrelation #Spatial autocorrelation (Global Morans I) - ArcGIS #Temporal autocorrelation(graphics package) #create dataframe of YEAR and resids acf_resids = subset(GLMM, select = c(YEAR, resids)) #turn into a time series object acf_residsTS = ts(acf_resids) #run autocorrelation function acf(acf_residsTS, type = "correlation") #Another way to view residuals (lattice package) xyplot(resids~YEAR | factor(PACK), data = GLMM) #---- B. GLMM model set #Run set of hyothesis-based models including groups of variables which may influence cattle depredatio risk. #Include the top model in this model set! Assess which model of the set best fits these data using Akaike's #Information Criterion (AIC), which balances goodness of fit and model complexity. To learn more about AIC and #the information-theoretic approach, see Burnham and Anderson (2002). #PACK CATTLE MODEL: mod_pc <- glmmadmb(DEP_PACK ~ CATTLE_abundz + PACK_TOTALz + (1 | PACK), data = GLMM, zeroInflation = FALSE, family = "binomial") #FOREST-CATTLE MODEL: mod_fc <- glmmadmb(DEP_PACK ~ FORESTz + CATTLE_abundz + (1 | PACK), data = GLMM, zeroInflation = FALSE, family = "binomial") #PACK-FOREST MODEL: mod_pf <- glmmadmb(DEP_PACK ~ PACK_TOTALz + REPRODUCED + FORESTz + (1 | PACK), data = GLMM, zeroInflation = FALSE, family = "binomial") #CATTLE MORTALITY MODEL: mod_cm <- glmmadmb(DEP_PACK ~ MADULTz + CATTLE_abundz + YR_1DEP + (1 | PACK), data = GLMM, zeroInflation = FALSE, family = "binomial") #CATTLE MODEL: mod_c <- glmmadmb(DEP_PACK ~ CATTLE_abundz + (1 | PACK), data = GLMM, zeroInflation = FALSE, family = "binomial") #Which models were best (weight >= 0.90)? (bbmle package) modAIC = AICtab(mod_FULL, mod_pc, mod_cm, mod_fc, mod_c, base=TRUE, nobs=308, weights=TRUE) modAIC #None. Model average predictions for all models with sum weigh >= 0.90. (mod_cm, mod_fc, and mod_FULL) summary(mod_cm) summary(mod_fc) summary(mod_FULL) #----Calculate confidence intervals #for fixed effects - CATTLE-MORTALITY cbind(coef(mod_cm), confint(mod_cm, level = 0.95)) #for fixed effects - FOREST-CATTLE cbind(coef(mod_fc), confint(mod_fc, level = 0.95)) #for fixed effects - FULL cbind(coef(mod_FULL), confint(mod_FULL, level = 0.95)) #----- C. 10-fold cross validation (Internal Validation - Idaho and Montana) #K-fold cross validation is one way to assess how the results of a statistical analysis will #generalize to an independent data set. One round of cross-validation involves partitioning a #sample of data into complementary subsets, performing the analysis on one subset (called the training set), #and validating the analysis on the other subset (called the validation set or testing set). In k-fold #cross-validation, the original sample is randomly partitioned into k equal sized subsamples (or folds). #Because many packages don't accept the glmmadmb model objects, cross-validation code from: # http://diggdata.in/post/58333540883/k-fold-cross-validation-in-r #Make a new dataframe of necessary variables to put values into CV_binom <- subset(GLMM, select = c("PACK", "MADULTz","CATTLE_abundz","DEP_PACK", "YR_1DEP")) #reorder with response variable (DEP_PACK as column 1) CV_binom <- CV_binom[c("DEP_PACK", "PACK", "MADULTz", "CATTLE_abundz", "YR_1DEP")] #change factors to integer CV_binom$DEP_PACK = ifelse(CV_binom$DEP_PACK == "Y", 1,0) CV_binom$YR_1DEP = as.character(CV_binom$YR_1DEP) CV_binom$YR_1DEP = as.integer(CV_binom$YR_1DEP) #10-fold cross-validation k = 10 # sample from 1 to k, nrow times (the number of observations in the data) CV_binom$id <- sample(1:k, nrow(CV_binom), replace = TRUE) list <- 1:k # prediction and testset data frames that we add to with each iteration over # the 10 folds predictionB <- data.frame() testsetCopyB <- data.frame() #Creating a progress bar to know the status of CV (plyr package) progress.bar <- create_progress_bar("text") progress.bar$init(k) for (i in 1:k){ # remove rows with id i from dataframe to create training set # select rows with id i to create test set trainingsetB <- subset(CV_binom, id %in% list[-i]) testsetB <- subset(CV_binom, id %in% c(i)) # run BEST MODEL CVmodelB <- glmmadmb(DEP_PACK ~ MADULTz + CATTLE_abundz + YR_1DEP + (1 | PACK), data = CV_binom, zeroInflation = FALSE, family = "binomial") # remove response column 1, DEP_PACK tempB <- as.data.frame(predict(CVmodelB, type = "response", testsetB[,-1])) # append this iteration's predictions to the end of the prediction data frame predictionB <- rbind(predictionB, tempB) # append this iteration's test set to the test set copy data frame # keep all columns testsetCopyB <- rbind(testsetCopyB, as.data.frame(testsetB[, c(1,6)])) progress.bar$step() } # add predictions, actual values, and id resultB <- cbind(predictionB, testsetCopyB) names(resultB) <- c("Predicted", "Actual") resultB$Difference <- abs(resultB$Actual - resultB$Predicted) # As an example use Mean Absolute Error as Evalution summary(resultB$Difference) #---- D. Validate models for Washington #Calculate model-averaged predictions (using methods from Burnham and Anderson, 2002) using all models with #AIC weights >=0.90, and use mean absolute error to evaluate training model on a new dataset. #import new dataset WA = read.table("RGroup_MAP.txt", header=T) str(WA) #use standardized coefficients from training models to predict depredation risk for Washington summary(mod_cm) #reference top model WA$CM <- predict(mod_cm, newdata = WA, type = "response") WA$FC <- predict(mod_fc, newdata = WA, type = "response") WA$FULL <- predict(mod_FULL, newdata = WA, type = "response") #Model average predictions modAIC #reference model weights #model average WA$mod_av = (0.573*WA$CM) + (0.299*WA$FC) + (0.063*WA$FULL) #calculate percent WA$Pmod_av = WA$mod_av * 100 #no digits WA$Pmod_av = round(WA$Pmod_av, digits = 0) #How well do these models predict for Washington? #new dataframe of only depredation packs WA$DEP_PACK = ifelse(WA$DEP_PACK == "Y", 1,0) #calculate mean error WAvalid = abs(WA$DEP_PACK - WA$mod_av) #actual - predicted summary(WAvalid) #REFERENCES #Burnham, K., Anderson, D., 2002. Model selection and multimodel inference: a practical information-theoretic #approach, Second Ed. ed. Springer-Verlag, New York, New York, USA. #Bolker, B.M., Brooks, M.E., Clark, C.J., Geange, S.W., Poulsen, J.R., Stevens, M.H.H., White, J.S.S., 2009. #Generalized linear mixed models: a practical guide for ecology and evolution. Trends Ecol. Evol. 24, 127–135. #Cade, B.S., 2015. Model averaging and muddled multimodel inference. Ecology 96, 2370–2382. #Gelman, A., 2008. Scaling regression inputs by dividing by two standard deviations. Stat. Med. 27, 2865–2873. #Zuur, A.F., Ieno, E.N., Elphick, C.S., 2010. A protocol for data exploration to avoid common statistical problems. #Methods Ecol. Evol. 1, 3–14. #SUGGESTED READING #Fox, G.A., Negrete-Yankelevich, S., Sosa, V.J., 2015. Ecological statistics: contemporary theory and application. #Oxford University Press, Oxford, UK. #Zuur, A.F., Ieno, E.N., Walker, N., Savaliev, A.A., Smith, G., 2009. Mixed effects models and extensions in ecology with R. #Springer-Verlag New York Inc., New York, New York, USA.