#####################################################################
# NAME:  Tom Loughin                                                #
# DATE:  08-21-2013                                                 #
# PURPOSE: Detailed analysis of the alcohol consumption example     #
#                                                                   #
# NOTES:                                                            #
#####################################################################

options(width = 60)

#####################################################################
# Read the data

dehart <- read.table("C:\\Data\\DeHartSimplified.csv", header = TRUE,
  sep = ",", na.strings = " ")
head(dehart)

# Reduce the data to what is needed for the examples
saturday <- dehart[dehart$dayweek == 6,]
head(round(x = saturday, digits = 3))
dim(saturday)

# Set the row labels to the subject ID values for easier identification
row.names(saturday) <- saturday$id

# Summarize the variables. The first three columns are ID and day info, no longer needed
summary(saturday[,-c(1:3,9)])
tabulate(saturday[,9])

# Code adapted from http://stackoverflow.com/questions/11346880/r-plot-multiple-box-plots-using-columns-from-data-frame
plotdata <- saturday[,-c(1:3,9)]
x11(height = 10, width = 10)
# pdf(file = "c:\\figures\\Figure5.15BW.pdf", width = 10, height = 10, colormodel = "cmyk", pointsize = 20)  # Create plot for book
par(mfrow = c(3,3), mai = c(0.5, 0.5, 0.5, 0.5))
for (i in 1:ncol(plotdata)) {
  boxplot(plotdata[,i], main = names(plotdata[i]), cex.axis = 1.5,
    cex.main = 1.5)
}
# dev.off()  # Create plot for book

# Look at a scatterplot matrix
x11()
pairs(saturday[,-c(1:3,9)])

# Enhanced scatterplot matrix available from the car package's spm()
library(car)
x11(height = 12, width = 15)
# pdf(file = "c:\\figures\\Figure5.16color.pdf", width = 10, height = 8, colormodel = "cmyk", pointsize = 14)  # Create plot for book
spm(saturday[,-c(1:3,9)], cex.labels = 1.4, cex.axis = 1.4)
# dev.off()  # Create plot for book

#####################################################################
# Model searches using BMA through glmulti()

############### First a main-effects-only search. An exhaustive search is feasible.

# Unfortunately, a system variable has to be deactivated in order to get rJava to work.
# See http://stackoverflow.com/questions/7019912/using-the-rjava-package-on-win7-64-bit-with-r
# True for 32-bit as well
if (Sys.getenv("JAVA_HOME") != "") Sys.setenv(JAVA_HOME = "")
library(glmulti)

search.1.aicc <- glmulti(y = numall ~ ., data = saturday[,-c(1:3)],
  fitfunction = "glm", level = 1, method = "h", crit = "aicc",
  family = poisson(link = "log"))
print(search.1.aicc)

# Look at the top 6 models
head(weightable(search.1.aicc))

x11(width = 7, height = 5, pointsize = 14)
# pdf(file = "c:\\figures\\Figure5.17-1color.pdf", width = 7, height = 5, colormodel = "cmyk", pointsize = 16)  # Create plot for book
plot(search.1.aicc, type = "w")  # Plot of model weights
# dev.off()  # Create plot for book

x11(width = 7, height = 5, pointsize = 14)
plot(search.1.aicc, type = "p")  # Plot of IC values

x11(width = 7, height = 5, pointsize = 14)
# pdf(file = "c:\\figures\\Figure5.17-2color.pdf", width = 7, height = 5, colormodel = "cmyk", pointsize = 16)  # Create plot for book
plot(search.1.aicc, type = "s")  # Plot of variable weights
# dev.off()  # Create plot for book

# Note: The best model has 5 variables; the same variables stand out as the top
# ones in BMA with > 90% weight. All other variables have weight < 30%.
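# Side note (a minimal sketch; assumes weightable() returns columns named
# "aicc" and "weights", per glmulti's conventions): the evidence weights can be
# reproduced by hand from the AICc values via
# w_i = exp(-Delta_i/2) / sum_j exp(-Delta_j/2) (Burnham and Anderson).
w.tab <- weightable(search.1.aicc)
delta <- w.tab$aicc - min(w.tab$aicc)
round(exp(-delta/2)/sum(exp(-delta/2)), digits = 3)  # should match w.tab$weights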
############### Now second order with ALL variables

# Enumerate the exhaustive search size first
glmulti(y = numall ~ ., data = saturday[,-c(1:3)], fitfunction = "glm",
  level = 2, marginality = TRUE, method = "d", crit = "aicc",
  family = poisson(link = "log"))
# "Your candidate set contains more than 1 billion (1e9) models."
# We will use the genetic algorithm (GA) instead.
# Run it multiple times in case one run gets stuck at a sub-optimal model.
set.seed(129981872)  # Unfortunately, glmulti() does not heed the seed, so this does not force the results to be the same run after run.
search.2g0.aicc <- glmulti(y = numall ~ ., data = saturday[,-c(1:3)],
  fitfunction = "glm", level = 2, marginality = TRUE, method = "g",
  crit = "aicc", family = poisson(link = "log"))
search.2g1.aicc <- glmulti(y = numall ~ ., data = saturday[,-c(1:3)],
  fitfunction = "glm", level = 2, marginality = TRUE, method = "g",
  crit = "aicc", family = poisson(link = "log"))
search.2g2.aicc <- glmulti(y = numall ~ ., data = saturday[,-c(1:3)],
  fitfunction = "glm", level = 2, marginality = TRUE, method = "g",
  crit = "aicc", family = poisson(link = "log"))
search.2g3.aicc <- glmulti(y = numall ~ ., data = saturday[,-c(1:3)],
  fitfunction = "glm", level = 2, marginality = TRUE, method = "g",
  crit = "aicc", family = poisson(link = "log"))

# Look at the top models from all 4 runs. They are not always the same!
head(weightable(search.2g0.aicc))
head(weightable(search.2g1.aicc))
head(weightable(search.2g2.aicc))
head(weightable(search.2g3.aicc))

# These should be combined into a "best of" listing, which is what consensus() does.
search.2allg.aicc <- consensus(xs = list(search.2g0.aicc, search.2g1.aicc,
  search.2g2.aicc, search.2g3.aicc), confsetsize = 100)
head(weightable(search.2allg.aicc))
print(search.2allg.aicc)

x11(width = 9, height = 15, pointsize = 11)
plot(search.2allg.aicc, type = "w")
x11(width = 9, height = 15, pointsize = 11)
plot(search.2allg.aicc, type = "p")
x11(width = 12, height = 15, pointsize = 12)
# par(mai = c(1,1,1,1))
par(mai = c(1, 1, .7, .5))
plot(search.2allg.aicc, type = "s")
# Note that the top model is also the one containing all variables of high importance.

# Analysis of parameter estimates
parms <- coef(search.2allg.aicc)
# Rename the columns to fit the book's output display
colnames(parms) <- c("Estimate", "Variance", "n.Models", "Probability", "95%CI +/-")
# Reorder the parameters in decreasing order of probability
parms.ord <- parms[order(parms[,4], decreasing = TRUE),]
round(parms.ord, digits = 3)

# Confidence intervals for parameters (coef() contains a column with the confidence interval add-ons)
ci.parms <- cbind(lower = parms.ord[,1] - parms.ord[,5],
  upper = parms.ord[,1] + parms.ord[,5])
round(cbind(parms.ord[,1], ci.parms), digits = 3)

##############
# What would happen if we instead ran an exhaustive search on only the
# important main-effect variables and their interactions?
search.2e.aicc <- glmulti(y = numall ~ desired + negevent + state + nrel + age,
  data = saturday[,-c(1:3)], fitfunction = "glm", level = 2,
  marginality = TRUE, method = "h", crit = "aicc",
  family = poisson(link = "log"))
print(search.2e.aicc)

# Look at the top 6 models
head(weightable(search.2e.aicc))
# The top model has AICc = 421.2, not nearly as good as the top model from the
# expanded search using the GA.
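# A quick numeric confirmation of that comparison (a sketch; assumes the
# criterion column in the weightable() output is named "aicc"):
min(weightable(search.2e.aicc)$aicc)     # best AICc, restricted exhaustive search
min(weightable(search.2allg.aicc)$aicc)  # best AICc, GA consensus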
x11(width = 7, height = 4, pointsize = 12)
plot(search.2e.aicc, type = "w")
x11(width = 7, height = 4, pointsize = 12)
plot(search.2e.aicc, type = "p")
x11(width = 7, height = 4, pointsize = 12)
plot(search.2e.aicc, type = "s")

######################################################################
# Fit the best second-order model from the GA. Use this as the working model.

mod.fit <- glm(formula = numall ~ 1 + prel + negevent + gender + rosn + age +
    desired + state + rosn:prel + age:rosn + desired:gender + desired:age +
    state:negevent, family = poisson(link = "log"), data = saturday)

############### Analyze the model fit with various functions
summary(mod.fit)

# Check the fit statistic: deviance/df against rough thresholds
deviance(mod.fit)/mod.fit$df.residual
1 + 2*sqrt(2/mod.fit$df.residual)
1 + 3*sqrt(2/mod.fit$df.residual)
# Somewhat of an indication of lack of fit.

############### Is there zero inflation? Create a Pearson statistic to check.
mu.poi <- exp(predict(mod.fit))
zero.poi <- sum(exp(-mu.poi))          # Expected number of zeroes under Poisson
zero.obs <- sum(saturday$numall == 0)  # Total zeroes in the data
# The comparison below is a minimal sketch of such a check (an assumed one-df
# Pearson-type statistic; not necessarily the exact form behind the reported p-value):
zi.stat <- (zero.obs - zero.poi)^2/zero.poi
1 - pchisq(zi.stat, df = 1)
# p = 0.97, NO EVIDENCE OF ZERO INFLATION, AS EXPECTED.

# Check the overall goodness of fit and look at the Pearson residuals from it.
# Useful for checking link issues.
# Assumes that this file resides in the same folder as the current program.
source("PostFitGOFTest.R")
GOFtest <- PostFitGOFTest(obs = saturday$numall, pred = mu.poi, g = 0)

############### Examine residuals
saturday$mu.hat <- predict(mod.fit, type = "response")
saturday$p.res <- residuals(mod.fit, type = "pearson")
saturday$s.res <- rstandard(mod.fit, type = "pearson")
saturday$lin.pred <- mod.fit$linear.predictors
## saturday$cookd <- cooks.distance(mod.fit)
## saturday$hat <- mod.fit$hat

resid.plot <- function(y, x, x.label, color1 = "blue", color2 = "red") {
  ord.x <- order(x)
  plot(x = x, y = y, xlab = x.label, ylab = "Standardized residuals",
    ylim = c(min(-3, y), max(3, y)))
  abline(h = c(3, 2, 0, -2, -3), lty = 3, col = color1)
  smooth.stand <- loess(formula = y ~ x)
  lines(x = x[ord.x], y = predict(smooth.stand)[ord.x], lty = 2, col = color2)
  invisible()
}

x11(width = 7, height = 6, pointsize = 14)
# pdf(file = "c:\\figures\\Figure5.19color.pdf", width = 7, height = 6, colormodel = "cmyk", pointsize = 14)  # Create plot for book
par(mfrow = c(3,3), mar = c(5, 4, 1, 2))  # Removed most of the top margin because the plots have no titles
resid.plot(y = saturday$s.res, x = saturday$prel, x.label = "Positive relations")
resid.plot(y = saturday$s.res, x = saturday$negevent, x.label = "Negative Events")
resid.plot(y = saturday$s.res, x = saturday$gender, x.label = "Gender")  # Not sure the plot is worthwhile for a binary variable
resid.plot(y = saturday$s.res, x = saturday$rosn, x.label = "Rosenberg Self Esteem")
resid.plot(y = saturday$s.res, x = saturday$state, x.label = "State Self Esteem")
resid.plot(y = saturday$s.res, x = saturday$desired, x.label = "Desire to Drink")
resid.plot(y = saturday$s.res, x = saturday$age, x.label = "Age")
resid.plot(y = saturday$s.res, x = saturday$mu.hat, x.label = "Estimated mean")
resid.plot(y = saturday$s.res, x = saturday$lin.pred, x.label = "Linear predictor")
# dev.off()  # Create plot for book

# Black-and-white version of the plot
# pdf(file = "c:\\figures\\Figure5.19BW.pdf", width = 7, height = 6, colormodel = "cmyk", pointsize = 14)  # Create plot for book
par(mfrow = c(3,3), mar = c(5, 4, 1, 2))  # Removed most of the top margin because the plots have no titles
resid.plot(y = saturday$s.res, x = saturday$prel, x.label = "Positive relations",
  color1 = "black", color2 = "black")
resid.plot(y = saturday$s.res, x = saturday$negevent, x.label = "Negative Events",
  color1 = "black", color2 = "black")
resid.plot(y = saturday$s.res, x = saturday$gender, x.label = "Gender",
  color1 = "black", color2 = "black")  # Not sure the plot is worthwhile for a binary variable
resid.plot(y = saturday$s.res, x = saturday$rosn, x.label = "Rosenberg Self Esteem",
  color1 = "black", color2 = "black")
resid.plot(y = saturday$s.res, x = saturday$state, x.label = "State Self Esteem",
  color1 = "black", color2 = "black")
resid.plot(y = saturday$s.res, x = saturday$desired, x.label = "Desire to Drink",
  color1 = "black", color2 = "black")
resid.plot(y = saturday$s.res, x = saturday$age, x.label = "Age",
  color1 = "black", color2 = "black")
resid.plot(y = saturday$s.res, x = saturday$mu.hat, x.label = "Estimated mean",
  color1 = "black", color2 = "black")
resid.plot(y = saturday$s.res, x = saturday$lin.pred, x.label = "Linear predictor",
  color1 = "black", color2 = "black")
# dev.off()  # Create plot for book

######## Summary:
# The plots do not show any clear problems with a poor choice of link or clear
# curvature in any predictor. The splines generally wiggle around the center
# line. Only negevent *possibly* shows some curvature, but that could just be
# the skewness of negevent creating influential values that attract the spline
# toward their residual values on the right. There is one very large
# standardized residual (> 4) and a small number between 2 and 3 (few enough to
# believe they are due to random chance).
# We should check influence diagnostics.

############### Default plot produced by plot.glm()
x11()
par(mfrow = c(2,2))
plot(mod.fit)

############### Influence diagnostics via our function. Assumes that the file resides in the same folder as the current program.
source("glmDiagnostics.R")
save.info <- glmInflDiag(mod.fit = mod.fit)

###### SUMMARY:
# There are a few large hat values, one fairly large Cook's distance, and one
# somewhat outlying value. In addition, there is one observation with both
# moderate influence and moderate outlyingness. Looking at those observations
# in the listing, the outlier (largest Delta.X2 and Delta.dev) belongs to
# someone who had 9 drinks but was predicted to have just 2.5. The moderate
# outlier/influential point is a person who had 21 drinks and was predicted to
# have 14. We wonder whether the 21 can be accurate. This can be explored:
# does the same model get selected without this possibly erroneous value?
# The large hat values are IDs 113, 118, 129.
summary(mod.fit$data)

# In which percentile of the data does each of these three people fall?
pcts <- matrix(NA, nrow = 3, ncol = 9)
colnames(pcts) <- colnames(saturday[,5:13])
rownames(pcts) <- c(113, 118, 129)
counter <- 1
for (j in c(113, 118, 129)) {
  for (k in c(5:13)) {
    pcts[counter, k-4] <- sum(saturday[,k] <= saturday[saturday$id == j, k])/length(saturday[,k])
  }
  counter <- counter + 1
}
pcts
###### 113: high nrel, desired, posevent, state; low rosn
###### 118: high prel, posevent; low state, rosn
###### 129: high nrel; low prel

# Assess this visually with a parallel coordinate plot
library(MASS)
table(saturday$numall)

# Plot highlighting observations.
# It may be good to have the printed information from glmInflDiag() also
# returned in its result object, so that one would not need to type in the ID
# numbers below manually.
cols <- c("gray50", "red")
for (i in c(2, 113, 118, 129, 137, 58)) {
  pot.infl <- (saturday$id == i)
  x11(width = 11, height = 6, pointsize = 14)
  # pdf(file = "c:\\figures\\Figure5.21color.pdf", width = 11, height = 6, colormodel = "cmyk", pointsize = 14)  # Create plot for book
  parcoord(x = saturday[, c(4:8, 10:13, 9)], col = cols[pot.infl + 1],
    lwd = 1 + 3*pot.infl,
    main = paste("Potentially influential observation ", i,
      " identified by thick red lines"))
  # dev.off()  # Create plot for book
}

# Black-and-white version of the plot
def.par <- par()
par(col.axis = "white", col.lab = "white")
for (i in 58) {
  # pdf(file = "c:\\figures\\Figure5.21BW.pdf", width = 11, height = 6, colormodel = "cmyk", pointsize = 14)  # Create plot for book
  # Re-order the observations so that the highlighted observation gets plotted last
  n <- nrow(saturday)
  set1 <- saturday[(saturday$id != i), c(4:8, 10:13, 9)]
  set2 <- saturday[(saturday$id == i), c(4:8, 10:13, 9)]
  set3 <- rbind(set1, set2)
  print(table(set3$numall))
  parcoord(x = set3, col = c(rep("gray50", times = n-1), "black"),
    lwd = c(rep(1, times = n-1), 1 + 3))
  # lty = c(rep(1, times = n-1), 2)
  # dev.off()  # Create plot for book
}
par(def.par)

# A variant of MASS::parcoord() that draws its axis in white, so that cleaner
# axis colors can be controlled through par()
parcoord2 <- function (x, col = 1, lty = 1, var.label = FALSE, ...) {
  rx <- apply(x, 2L, range, na.rm = TRUE)
  x <- apply(x, 2L, function(x) (x - min(x, na.rm = TRUE))/(max(x, na.rm = TRUE) -
    min(x, na.rm = TRUE)))
  matplot(1L:ncol(x), t(x), type = "l", col = col, lty = lty,
    xlab = "", ylab = "", axes = FALSE, ...)
  axis(1, at = 1L:ncol(x), labels = colnames(x), col = "white")
  for (i in 1L:ncol(x)) {
    lines(c(i, i), c(0, 1), col = "grey70")
    if (var.label)
      text(c(i, i), c(0, 1), labels = format(rx[, i], digits = 3),
        xpd = NA, offset = 0.3, pos = c(1, 3), cex = 0.7)
  }
  invisible()
}

x11(width = 11, height = 6, pointsize = 14)
par(col.axis = "white", col.lab = "white")
for (i in 58) {
  # pdf(file = "c:\\figures\\Figure5.21BW.pdf", width = 11, height = 6, colormodel = "cmyk", pointsize = 14)  # Create plot for book
  # Re-order the observations so that the highlighted observation gets plotted last
  n <- nrow(saturday)
  set1 <- saturday[(saturday$id != i), c(4:8, 10:13, 9)]
  set2 <- saturday[(saturday$id == i), c(4:8, 10:13, 9)]
  set3 <- rbind(set1, set2)
  print(table(set3$numall))
  parcoord2(x = set3, col = c(rep("lightgray", times = n-1), "darkblue"),
    lwd = c(rep(1, times = n-1), 1 + 3))
  # lty = c(rep(1, times = n-1), 2)
  # dev.off()  # Create plot for book
}
par(def.par)
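# Note on the scaling in parcoord()/parcoord2(): each column is rescaled to
# [0,1] via (x - min)/(max - min), so vertical positions are comparable only
# within a column, not across columns. A quick sketch verifying the rescaling
# for one column:
prel.scaled <- (saturday$prel - min(saturday$prel))/
  (max(saturday$prel) - min(saturday$prel))
range(prel.scaled)  # should be 0 and 1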
############### Is the influential outlier with 21 drinks the main cause of the lack of fit?
# To check, rerun the selected model without it.
mod.fit.1 <- glm(formula = numall ~ 1 + prel + negevent + gender + rosn + age +
    desired + state + rosn:prel + age:rosn + desired:gender + desired:age +
    state:negevent, family = poisson(link = "log"),
  data = saturday[saturday$numall < 20,])
summary(mod.fit.1)

# Check the fit statistic
deviance(mod.fit.1)/mod.fit.1$df.residual
1 + 2*sqrt(2/mod.fit.1$df.residual)
1 + 3*sqrt(2/mod.fit.1$df.residual)
######## Little change in the results.

### Might consider rerunning the variable selection without this point.
search.3g0.aicc <- glmulti(y = numall ~ ., data = saturday[saturday$numall < 20, c(4:13)],
  fitfunction = "glm", level = 2, marginality = TRUE, method = "g",
  crit = "aicc", family = poisson(link = "log"))
search.3g1.aicc <- glmulti(y = numall ~ ., data = saturday[saturday$numall < 20, c(4:13)],
  fitfunction = "glm", level = 2, marginality = TRUE, method = "g",
  crit = "aicc", family = poisson(link = "log"))
search.3g2.aicc <- glmulti(y = numall ~ ., data = saturday[saturday$numall < 20, c(4:13)],
  fitfunction = "glm", level = 2, marginality = TRUE, method = "g",
  crit = "aicc", family = poisson(link = "log"))
search.3g3.aicc <- glmulti(y = numall ~ ., data = saturday[saturday$numall < 20, c(4:13)],
  fitfunction = "glm", level = 2, marginality = TRUE, method = "g",
  crit = "aicc", family = poisson(link = "log"))

search.3allg.aicc <- consensus(xs = list(search.3g0.aicc, search.3g1.aicc,
  search.3g2.aicc, search.3g3.aicc), confsetsize = 100)
head(weightable(search.3allg.aicc))
print(search.3allg.aicc)

x11(width = 7, height = 6, pointsize = 12)
plot(search.3allg.aicc, type = "w")
x11(width = 7, height = 6, pointsize = 12)
plot(search.3allg.aicc, type = "p")
x11(width = 7, height = 6, pointsize = 12)
plot(search.3allg.aicc, type = "s")
x11(width = 15, height = 8, pointsize = 12)
plot(search.3allg.aicc, type = "r")

###
# Briefly check the fit of the new model without the extreme observation
mod.fit.2 <- glm(formula = numall ~ 1 + prel + negevent + rosn + age + desired +
    state + rosn:prel + age:rosn + desired:age + state:negevent,
  family = poisson(link = "log"), data = saturday[saturday$numall < 20,])
summary(mod.fit.2)

# Check the fit statistic
deviance(mod.fit.2)/mod.fit.2$df.residual
1 + 2*sqrt(2/mod.fit.2$df.residual)
1 + 3*sqrt(2/mod.fit.2$df.residual)
######## Little change in the results. Still evidence of a poor fit.

##### Summary: The gender and desired:gender terms are no longer very important,
# moving from nearly 100% to < 40% evidence weight.
# All other variables remain in the model and no new ones are added.
# We will stick with the original model, as we have no reason to remove this
# observation. Examination of the larger data file reveals one other observation
# at 18 drinks, and one each at 15, 14, and 13.
# hist(x = dehart$numall, breaks = 25)
# Perhaps it is a real value? A question for the researcher.

############### Investigate overdispersion in the original model
# No reason was found for the poor deviance/df statistic, and it is not
# entirely caused by the one outlier. Conclude that there is overdispersion.
# Which is better: quasi-Poisson or negative binomial?
res.sq <- residuals(object = mod.fit, type = "response")^2
set1 <- data.frame(res.sq, mu.hat = saturday$mu.hat)
fit.quad <- lm(formula = res.sq ~ mu.hat + I(mu.hat^2), data = set1)
anova(fit.quad)
# The quadratic term is not at all significant.
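# Why this regression can discriminate between the two candidates (standard
# variance functions): quasi-Poisson assumes Var(Y) = phi*mu, linear in mu,
# while the negative binomial assumes Var(Y) = mu + mu^2/theta, quadratic in
# mu. Squared residuals estimate Var(Y) at each fitted mean, so a clearly
# significant quadratic term would favor the negative binomial.
# (A direct negative binomial fit could also be compared; a hedged sketch:
# mod.fit.nb <- MASS::glm.nb(formula(mod.fit), data = saturday) )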
fit.lin <- lm(formula = res.sq ~ mu.hat, data = set1)

x11(height = 7, width = 6)
plot(x = set1$mu.hat, y = set1$res.sq, xlab = "Predicted count",
  ylab = "Squared residual")
curve(expr = predict(object = fit.lin, newdata = data.frame(mu.hat = x),
    type = "response"), col = "blue", add = TRUE, lty = "solid")
curve(expr = predict(object = fit.quad, newdata = data.frame(mu.hat = x),
    type = "response"), col = "red", add = TRUE, lty = "dashed")
legend(x = 50, y = 1000, legend = c("Linear", "Quadratic"),
  col = c("blue", "red"), lty = c("solid", "dashed"), bty = "n")
# No curvature is apparent. Conclusion: use quasi-Poisson.

##################################################################
# Since we are changing models, we should start over with variable selection.
# In order to use quasi-Poisson with glmulti(), we need to provide a dispersion
# parameter estimate through the "glmulti-cvalue" option. This comes from a
# previous quasi-Poisson fit.
# We would normally use a saturated model for this, so that we can be sure that
# no important variables have been omitted. However, the previous working
# model seemed to clearly indicate which variables were important (at least
# in the Poisson model), and the saturated model with all pairwise interactions
# among all variables gives a much larger dispersion parameter estimate than
# the reduced model, so we will use the variables from the previous working
# model in a new quasi-Poisson fit.
mq <- glm(formula = numall ~ prel + negevent + gender + rosn + age + desired +
    state + rosn:prel + age:rosn + desired:gender + desired:age + state:negevent,
  family = quasipoisson(link = "log"), data = saturday)
names(summary(mq))

# We will use the genetic algorithm on the model with interactions.
# Run it multiple times in case one run gets stuck at a sub-optimal model.
# The code below is needed in order to use the quasi-Poisson model in glmulti().
library(R.utils)
setOption("glmulti-cvalue", summary(mq)$dispersion)
getOption("glmulti-cvalue")

search.4g1.qaicc <- glmulti(y = numall ~ ., data = saturday[,c(4:13)],
  fitfunction = "glm", level = 2, marginality = TRUE, method = "g",
  crit = "qaicc", family = poisson(link = "log"))
search.4g2.qaicc <- glmulti(y = numall ~ ., data = saturday[,c(4:13)],
  fitfunction = "glm", level = 2, marginality = TRUE, method = "g",
  crit = "qaicc", family = poisson(link = "log"))
search.4g3.qaicc <- glmulti(y = numall ~ ., data = saturday[,c(4:13)],
  fitfunction = "glm", level = 2, marginality = TRUE, method = "g",
  crit = "qaicc", family = poisson(link = "log"))
search.4g4.qaicc <- glmulti(y = numall ~ ., data = saturday[,c(4:13)],
  fitfunction = "glm", level = 2, marginality = TRUE, method = "g",
  crit = "qaicc", family = poisson(link = "log"))

search.4allg.qaicc <- consensus(xs = list(search.4g4.qaicc, search.4g1.qaicc,
  search.4g2.qaicc, search.4g3.qaicc), confsetsize = 100)
head(weightable(search.4allg.qaicc))
print(search.4allg.qaicc)

x11(width = 7, height = 6, pointsize = 12)
plot(search.4allg.qaicc, type = "w")
x11(width = 7, height = 6, pointsize = 12)
plot(search.4allg.qaicc, type = "p")
x11(width = 7, height = 6, pointsize = 12)
plot(search.4allg.qaicc, type = "s")

##### RESULTS: The same variables are clearly more important than the others.
# One variable slides slightly below 80% weight, but it is still clearly more
# important than the remaining ones. Choosing to leave the model as is.
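# Side check (a sketch): summary.glm() estimates the quasi-Poisson dispersion
# as the Pearson statistic divided by the residual degrees of freedom; we can
# verify that by hand.
sum(residuals(mq, type = "pearson")^2)/mq$df.residual
summary(mq)$dispersion  # should match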
# Analysis of parameter estimates
parms <- coef(search.4allg.qaicc)
# Rename the columns to fit the book's output display
colnames(parms) <- c("Estimate", "Variance", "n.Models", "Probability", "95%CI +/-")
round(parms, digits = 3)
# Reorder the parameters in decreasing order of probability
parms.ord <- parms[order(parms[,4], decreasing = TRUE),]
# Confidence intervals for parameters (coef() contains a column with the confidence interval add-ons)
ci.parms <- cbind(lower = parms.ord[,1] - parms.ord[,5],
  upper = parms.ord[,1] + parms.ord[,5])
round(cbind(parms.ord[,1], ci.parms), digits = 3)
# Expressed as ratios of means (exponentiated coefficients in a log-link count model)
round(exp(cbind(MR = parms.ord[,1], ci.parms))[-1,], digits = 2)

##################################################################
# Analysis of parameter estimates
# The final model is the object mq: quasi-Poisson on the original variable set
# from the first set of genetic algorithm runs.
summary(mq)
round(summary(mq)$coefficients, 3)

############### We could look at LR tests for each term, but these are not
# meaningful following a variable selection (all terms have already been
# determined to be important by the AICc criterion).
# library(car)
# Anova(mq)

# Analyze the interactions using plots of the fitted surface. Every variable is
# involved in at least one interaction. Showing the example for prel:rosn; the
# others work the same way. Start by making a grid of points, then predict
# values onto the grid, and finally plot the surface.

# Tell us where to make the grid: approximately the ranges of the explanatory variables
summary(saturday[,c(4:13)])

# Set the grid values for each variable
p1 <- seq(from = 0, to = 9, by = .05)
n1 <- seq(from = 0, to = 2.5, by = .01)
g1 <- seq(from = 1, to = 2, by = 1)  # Only two possible levels of gender
r1 <- seq(from = 2, to = 4, by = .01)
a1 <- seq(from = 20, to = 45, by = .1)
d1 <- seq(from = 0, to = 8, by = .05)
s1 <- seq(from = 2.5, to = 5, by = .01)

# Combine the grids for two variables into a data frame, as required for predict().
# Set the other explanatory variables to their average values.
# (Note gender = 1.5! The advantage of having the binary explanatory variable
# treated as numerical is that we can conceive of an average value!)
pr1 <- data.frame(expand.grid(prel = p1, rosn = r1),
  negevent = mean(saturday$negevent), gender = 1.5,
  age = mean(saturday$age), desired = mean(saturday$desired),
  state = mean(saturday$state))

# Predict values and store them in a matrix whose dimensions follow the two variables' grids
surface.pr <- matrix(predict(object = mq, newdata = pr1, type = "response"),
  nrow = length(p1))

# 3D plot to show the surface more clearly
library(rgl)
open3d()
persp3d(x = p1, y = r1, z = surface.pr, col = "red",
  xlab = "Positive Relationships", ylab = "Rosn", zlab = "Predicted Drinks")
# Could add points if desired:
# points3d(x = saturday$prel, y = saturday$rosn, z = saturday$numall, col = "blue")
aspect3d(x = 1, y = 1, z = 1)

### Could repeat for the other interaction pairs...
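# A flat alternative to the rgl surface (a sketch using base graphics) on the
# same prediction grid:
contour(x = p1, y = r1, z = surface.pr, xlab = "Positive Relationships",
  ylab = "Rosenberg Self Esteem", main = "Predicted drinks")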
# Numerical approach: get estimates of the effect of each variable on the mean
# count. This is complicated by the fact that there are so many interactions:
# the effect of one variable changes depending on the level of another, so one
# estimate does not suffice. We need to compute the estimate for one variable
# at different levels of the other, using the method suggested by Milliken and
# Johnson: estimate the effects of one variable separately at the three
# quartiles of the other variable.
# We will do this for each variable: estimate the ratio of means for a 1-unit
# change in the target variable, separately at the quartiles of the interacting
# variable, holding the other variables constant.
# For the book, the prel results are displayed first.

# Save the estimated coefficients into bhat
bhat <- mq$coefficients
round(bhat, digits = 3)

# First compute and store the quartiles for each variable.
# Note: the three quartiles are stored in the 2nd, 3rd, and 5th positions of the summary().
rosn.quart <- summary(saturday$rosn)[c(2,3,5)]
rosn.quart

# Compute the linear combinations of parameters that estimate the mean ratios,
# and exponentiate. These are the estimated mean ratios for the first variable
# at the three quartiles of the interacting variable.
# Note: For variables involved in more than one interaction, we need to
# consider values for ALL interacting variables at once. Otherwise, those
# values are implicitly set to 0, which may not make sense (e.g., setting
# age = 0...). This applies to rosn, age, and desired. The others need to
# consider only the one variable they interact with.

# Single-interaction variables
mprel.rosn <- exp(bhat["prel"] + rosn.quart*bhat["prel:rosn"])
mprel.rosn
100*(mprel.rosn - 1)

# Remaining variables
prel.quart <- summary(saturday$prel)[c(2,3,5)]
negev.quart <- summary(saturday$negevent)[c(2,3,5)]
state.quart <- summary(saturday$state)[c(2,3,5)]
age.quart <- summary(saturday$age)[c(2,3,5)]
des.quart <- summary(saturday$desired)[c(2,3,5)]
rosn.quart <- summary(saturday$rosn)[c(2,3,5)]
# "Quartiles" for gender are senseless; there are only two values to consider.
gen.quart <- c(1,2)

# Same computation for the remaining single-interaction variables
mnegev.state <- exp(bhat["negevent"] + state.quart*bhat["negevent:state"])
mstate.negev <- exp(bhat["state"] + negev.quart*bhat["negevent:state"])
mgen.des <- exp(bhat["gender"] + des.quart*bhat["gender:desired"])

# Interpret as the percentage by which the mean changes for a 1-unit increase
# in the first variable, at quartiles of the second variable
mean.ratio1 <- 100*t(cbind(mprel.rosn, mnegev.state, mgen.des, mstate.negev) - 1)
round(mean.ratio1, digits = 1)

# For variables involved in two interactions, create a grid of combined quartiles (3x3 = 9 rows)
# For rosn, this involves prel and age
p.a.grid <- as.matrix(expand.grid(prel = prel.quart, age = age.quart))
p.a.grid
# For age, this involves rosn and desired
r.d.grid <- as.matrix(expand.grid(rosn = rosn.quart, desired = des.quart))
r.d.grid
# For desired, this involves age and gender. Recall there are only two values for gender!
a.g.grid <- as.matrix(expand.grid(age = age.quart, gender = gen.quart))
a.g.grid
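# Worked form behind these computations (log link): with
#   log(mu) = ... + beta_x*x + beta_xz*x*z + ...,
# a 1-unit increase in x at a fixed value of z multiplies the mean by
#   mu(x+1, z)/mu(x, z) = exp(beta_x + z*beta_xz),
# which is why each quartile of z yields a different estimated mean ratio.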
# Then multiply by the two interaction coefficients (get the order right!) and
# add to the main-effect estimate
p.a.coef <- c(bhat["prel:rosn"], bhat["rosn:age"])
p.a.coef
r.d.coef <- c(bhat["rosn:age"], bhat["age:desired"])
r.d.coef
a.g.coef <- c(bhat["age:desired"], bhat["gender:desired"])
a.g.coef

mrosn.prel.age <- exp(bhat["rosn"] + p.a.grid %*% p.a.coef)
mage.rosn.des <- exp(bhat["age"] + r.d.grid %*% r.d.coef)
mdes.age.gen <- exp(bhat["desired"] + a.g.grid %*% a.g.coef)

# Express as percentage changes
data.frame(p.a.grid, mratio.rosn.pa = round(100*(mrosn.prel.age - 1), digits = 2))
data.frame(r.d.grid, mratio.age.rd = round(100*(mage.rosn.des - 1), digits = 2))
data.frame(a.g.grid, mratio.desired.ag = round(100*(mdes.age.gen - 1), digits = 2))

# Profile LR intervals using mcprofile
library(mcprofile)

# Create coefficient matrices as 1*(target coefficient) + quartile*(interaction(s)).
# Look at bhat for the ordering of the coefficients.

# CI for prel, controlling for rosn
K.prel <- matrix(data = c(0, 1, 0, 0, 0, 0, 0, 0, rosn.quart[1], 0, 0, 0, 0,
                          0, 1, 0, 0, 0, 0, 0, 0, rosn.quart[2], 0, 0, 0, 0,
                          0, 1, 0, 0, 0, 0, 0, 0, rosn.quart[3], 0, 0, 0, 0),
  nrow = 3, byrow = TRUE)
# mcprofile() does work with quasi-models; it produces wider intervals than it otherwise would.
profile.prel <- mcprofile(object = mq, CM = K.prel)  # Calculate -2log(Lambda)
ci.prel <- confint(object = profile.prel, level = 0.95)
# ci.prel$confint
100*(exp(ci.prel$estimate) - 1)  # Verifies that we got the same answer as above
100*(exp(ci.prel$confint) - 1)

# CI for state, controlling for negevent
K.state <- matrix(data = c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, negev.quart[1],
                           0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, negev.quart[2],
                           0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, negev.quart[3]),
  nrow = 3, byrow = TRUE)
profile.state <- mcprofile(object = mq, CM = K.state)  # Calculate -2log(Lambda)
ci.state <- confint(object = profile.state, level = 0.95)
# ci.state$confint
100*(exp(ci.state$estimate) - 1)  # Verifies that we got the same answer as above
100*(exp(ci.state$confint) - 1)

# CI for negevent, controlling for state
K.negev <- matrix(data = c(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, state.quart[1],
                           0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, state.quart[2],
                           0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, state.quart[3]),
  nrow = 3, byrow = TRUE)
profile.negev <- mcprofile(object = mq, CM = K.negev)  # Calculate -2log(Lambda)
ci.negev <- confint(object = profile.negev, level = 0.95)
# ci.negev$confint
round(data.frame(100*(exp(ci.negev$estimate) - 1),
  100*(exp(ci.negev$confint) - 1)), digits = 2)
100*(exp(ci.negev$estimate) - 1)  # Verifies that we got the same answer as above
100*(exp(ci.negev$confint) - 1)

# CI for gender, controlling for desired
K.gen <- matrix(data = c(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, des.quart[1], 0, 0,
                         0, 0, 0, 1, 0, 0, 0, 0, 0, 0, des.quart[2], 0, 0,
                         0, 0, 0, 1, 0, 0, 0, 0, 0, 0, des.quart[3], 0, 0),
  nrow = 3, byrow = TRUE)
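# Sketch of an alternative: build the same contrast matrix programmatically
# from the coefficient names instead of typing the zeroes by hand (assumes the
# columns follow the ordering of names(coef(mq)), as the hand-typed matrices do).
K.gen2 <- matrix(0, nrow = 3, ncol = length(coef(mq)),
  dimnames = list(NULL, names(coef(mq))))
K.gen2[, "gender"] <- 1
K.gen2[, "gender:desired"] <- des.quart
all.equal(unname(K.gen2), K.gen)  # should agree with the hand-typed K.gen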
# mcprofile() does work with quasi-models; it produces wider intervals than it otherwise would.
profile.gen <- mcprofile(object = mq, CM = K.gen)  # Calculate -2log(Lambda)
ci.gen <- confint(object = profile.gen, level = 0.95)
# ci.gen$confint
100*(exp(ci.gen$estimate) - 1)  # Verifies that we got the same answer as above
100*(exp(ci.gen$confint) - 1)

# Now for the variables involved in two interactions

# CI for rosn, controlling for prel and age
K.rosn <- matrix(data = c(
    0, 0, 0, 0, 1, 0, 0, 0, prel.quart[1], age.quart[1], 0, 0, 0,
    0, 0, 0, 0, 1, 0, 0, 0, prel.quart[1], age.quart[2], 0, 0, 0,
    0, 0, 0, 0, 1, 0, 0, 0, prel.quart[1], age.quart[3], 0, 0, 0,
    0, 0, 0, 0, 1, 0, 0, 0, prel.quart[2], age.quart[1], 0, 0, 0,
    0, 0, 0, 0, 1, 0, 0, 0, prel.quart[2], age.quart[2], 0, 0, 0,
    0, 0, 0, 0, 1, 0, 0, 0, prel.quart[2], age.quart[3], 0, 0, 0,
    0, 0, 0, 0, 1, 0, 0, 0, prel.quart[3], age.quart[1], 0, 0, 0,
    0, 0, 0, 0, 1, 0, 0, 0, prel.quart[3], age.quart[2], 0, 0, 0,
    0, 0, 0, 0, 1, 0, 0, 0, prel.quart[3], age.quart[3], 0, 0, 0),
  nrow = 9, byrow = TRUE)
profile.rosn <- mcprofile(object = mq, CM = K.rosn)  # Calculate -2log(Lambda)
ci.rosn <- confint(object = profile.rosn, level = 0.95)
# ci.rosn$confint
100*(exp(ci.rosn$estimate) - 1)  # Verifies that we got the same answer as above
100*(exp(ci.rosn$confint) - 1)

# CI for age, controlling for rosn and desired
K.age <- matrix(data = c(
    0, 0, 0, 0, 0, 1, 0, 0, 0, rosn.quart[1], 0, des.quart[1], 0,
    0, 0, 0, 0, 0, 1, 0, 0, 0, rosn.quart[1], 0, des.quart[2], 0,
    0, 0, 0, 0, 0, 1, 0, 0, 0, rosn.quart[1], 0, des.quart[3], 0,
    0, 0, 0, 0, 0, 1, 0, 0, 0, rosn.quart[2], 0, des.quart[1], 0,
    0, 0, 0, 0, 0, 1, 0, 0, 0, rosn.quart[2], 0, des.quart[2], 0,
    0, 0, 0, 0, 0, 1, 0, 0, 0, rosn.quart[2], 0, des.quart[3], 0,
    0, 0, 0, 0, 0, 1, 0, 0, 0, rosn.quart[3], 0, des.quart[1], 0,
    0, 0, 0, 0, 0, 1, 0, 0, 0, rosn.quart[3], 0, des.quart[2], 0,
    0, 0, 0, 0, 0, 1, 0, 0, 0, rosn.quart[3], 0, des.quart[3], 0),
  nrow = 9, byrow = TRUE)
profile.age <- mcprofile(object = mq, CM = K.age)  # Calculate -2log(Lambda)
ci.age <- confint(object = profile.age, level = 0.95)
# ci.age$confint
100*(exp(ci.age$estimate) - 1)  # Verifies that we got the same answer as above
100*(exp(ci.age$confint) - 1)

# CI for desired, controlling for age and gender
K.des <- matrix(data = c(
    0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, age.quart[1], 0,
    0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, age.quart[2], 0,
    0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, age.quart[3], 0,
    0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, age.quart[1], 0,
    0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, age.quart[2], 0,
    0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, age.quart[3], 0),
  nrow = 6, byrow = TRUE)
profile.des <- mcprofile(object = mq, CM = K.des)  # Calculate -2log(Lambda)
ci.des <- confint(object = profile.des, level = 0.95)
# ci.des$confint
100*(exp(ci.des$estimate) - 1)  # Verifies that we got the same answer as above
100*(exp(ci.des$confint) - 1)
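# Closing cross-check (a sketch): Wald intervals for the same contrasts from
# the estimated coefficient covariance matrix, for comparison with the profile
# LR intervals above (they should be similar but not identical).
est.des <- drop(K.des %*% coef(mq))
se.des <- sqrt(diag(K.des %*% vcov(mq) %*% t(K.des)))
round(100*(exp(cbind(estimate = est.des,
  lower = est.des - qnorm(0.975)*se.des,
  upper = est.des + qnorm(0.975)*se.des)) - 1), digits = 1)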