# Horseshoe crab example for Chapter 4 ##################################################################### # Read in data and fit initial model # Read in data crab <- read.csv(file = "C:\\chris\\unl\\Dropbox\\NEW\\STAT875_2ndEd\\Chapter4\\horseshoe.csv") head(crab) # Fit model mod.fit <- glm(formula = satellite ~ width, data = crab, family = poisson(link = log)) summary(mod.fit) # LRT library(package = car) Anova(mod.fit, test = "LR") mod.fit.Ho <- glm(formula = satellite ~ 1, data = crab, family = poisson(link = log)) anova(mod.fit.Ho, mod.fit, test = "Chisq") G.sq.Ho <- mod.fit.Ho$deviance G.sq.Ha <- mod.fit$deviance G.sq <- G.sq.Ho-G.sq.Ha p.value <- 1-pchisq(q = G.sq, df = 1) data.frame(G.sq.Ho, G.sq.Ha, G.sq, p.value, df = 1) # Predict number of satellites lin.pred <- mod.fit$coefficients[1]+mod.fit$coefficients[2]*23 as.numeric(exp(lin.pred)) predict(object = mod.fit, newdata = data.frame(width = 23), type = "response") # Wald confidence interval for mu save.mu.hat <- predict(object = mod.fit, newdata = data.frame(width = 23), type = "response", se = TRUE) save.mu.hat alpha <- 0.05 lower <- save.mu.hat$fit - qnorm(1-alpha/2)*save.mu.hat$se upper <- save.mu.hat$fit + qnorm(1-alpha/2)*save.mu.hat$se data.frame(width = 23, mu.hat = round(save.mu.hat$fit,4), lower = round(lower,4), upper = round(upper,4)) # Alternative way to Wald confidence interval for mu - guarantee > 0 lin.pred.hat <- predict(object = mod.fit, newdata = data.frame(width = 23), type = "link", se = TRUE) alpha <- 0.05 lower <- exp(lin.pred.hat$fit - qnorm(1-alpha/2)*lin.pred.hat$se) upper <- exp(lin.pred.hat$fit + qnorm(1-alpha/2)*lin.pred.hat$se) data.frame(width = 23, mu.hat = round(exp(lin.pred.hat$fit),4), lower = round(lower,4), upper = round(upper,4)) # emmeans - same as the alternative way library(package = emmeans) predict.data <- list(width = 23) emmeans(object = mod.fit, specs = ~ width, at = predict.data, type = "response", level = 0.95) # Profile LR interval for mu library(package = mcprofile) K <- matrix(data = c(1, 23), nrow = 1, ncol = 2) K linear.combo <- mcprofile(object = mod.fit, CM = K) #Calculate -2log(Lambda) ci.logmu.profile <- confint(object = linear.combo, level = 0.95) #CI for beta_0 + beta_1 * x ci.logmu.profile ci.logmu.profile$confint exp(ci.logmu.profile) # Estimate covariance matrix vcov(mod.fit) ################################################################## # Plot # Plot of data and estimated model plot(x = crab$width, y = crab$satellite, xlab = "Width (cm)", ylab = "Number of satellites", main = "Horseshoe crab data set \n with Poisson regression model fit", panel.first = grid()) curve(expr = exp(mod.fit$coefficients[1]+mod.fit$coefficients[2]*x), col = "red", add = TRUE, lty = "solid") # Could also use this to plot the model # curve(expr = predict(object = mod.fit, newdata = data.frame(width = x), type ="response"), # col = "red", add = TRUE, lty = 1) # Function to find confidence interval ci.mu <- function(newdata, mod.fit.obj, alpha) { lin.pred.hat <- predict(object = mod.fit.obj, newdata = newdata, type = "link", se = TRUE) lower <- exp(lin.pred.hat$fit - qnorm(1-alpha/2)*lin.pred.hat$se) upper <- exp(lin.pred.hat$fit + qnorm(1-alpha/2)*lin.pred.hat$se) list(lower = lower, upper = upper) } # Test ci.mu(newdata = data.frame(width = 23), mod.fit.obj = mod.fit, alpha = 0.05) # Add confidence interval bands curve(expr = ci.mu(newdata = data.frame(width = x), mod.fit.obj = mod.fit, alpha = 0.05)$lower, col = "blue", add = TRUE, lty = "dotdash") curve(expr = ci.mu(newdata = data.frame(width = x), mod.fit.obj = mod.fit, alpha = 0.05)$upper, col = "blue", add = TRUE, lty = "dotdash") legend(x = 21, y = 14, legend = c("Poisson regression model", "95% individual C.I."), bty = "n", lty = c("solid", "dotdash"), col = c("red", "blue")) # Put the data into groups min(crab$width) max(crab$width) crab$groups <- cut(x = crab$width, c(20, seq(from = 23.25, to = 29.25, by = 1), 34)) head(crab) tail(crab) # Find the average number of satellites per group and plot ybar <- aggregate(x = satellite ~ groups, data = crab, FUN = mean) xbar <- aggregate(x = width ~ groups, data = crab, FUN = mean) count <- aggregate(x = satellite ~ groups, data = crab, FUN = length) data.frame(ybar, mean.width = xbar$width, count = count$satellite) points(x = xbar$width, y = ybar$satellite, pch = 17, col = "darkgreen", cex = 2) legend(x = 21, y = 14, legend = c("Poisson regression model", "95% individual C.I.", "Sample mean"), bty = "n", lty = c("solid", "dotdash", NA), col = c("red", "blue", "darkgreen"), pch = c(NA, NA, 17)) # Is there a way to magnify just the plotting point and not the corresponding character label? ################################################################## # Another version of the plot where groups are automatically chosen plot(x = crab$width, y = crab$satellite, xlab = "Width (cm)", ylab = "Number of satellites", main = "Horseshoe crab data set \n with Poisson regression model fit", panel.first = grid()) curve(expr = exp(mod.fit$coefficients[1]+mod.fit$coefficients[2]*x), col = "red", add = TRUE, lty = "solid") curve(expr = ci.mu(newdata = data.frame(width = x), mod.fit.obj = mod.fit, alpha = 0.05)$lower, col = "blue", add = TRUE, lty = "dotdash") curve(expr = ci.mu(newdata = data.frame(width = x), mod.fit.obj = mod.fit, alpha = 0.05)$upper, col = "blue", add = TRUE, lty = "dotdash") # Find 8 (9 quantiles) groups cutoff <- quantile(crab$width, probs = 0:8/8, na.rm = F) cutoff groups2 <- cut(x = crab$width, cutoff) # Find the average number of satellites per group and plot ybar <- aggregate(x = satellite ~ groups2, data = crab, FUN = mean) xbar <- aggregate(x = width ~ groups2, data = crab, FUN = mean) count <- aggregate(x = satellite ~ groups2, data = crab, FUN = length) data.frame(ybar, mean.width = xbar$width, count = count$satellite) points(x = xbar$width, y = ybar$satellite, pch = 17, col = "darkgreen", cex = 2) legend(x = 21, y = 14, legend = c("Poisson regression model", "95% individual C.I.", "Sample mean"), bty = "n", lty = c("solid", "dotdash", NA), col = c("red", "blue", "darkgreen"), pch = c(NA, NA, 17)) ################################################################## # Model interpretation # 1 cm increase exp(mod.fit$coefficients[2]) 100*(exp(mod.fit$coefficients[2]) - 1) # c unit increase c.unit <- sd(crab$width) c.unit 100*(exp(c.unit*mod.fit$coefficients[2]) - 1) # Show that for a constant change, we obtain the same values for mu(x+c)/mu(x) mu.hat <- predict(object = mod.fit, newdata = data.frame(width = 21:35), type = "response") data.frame(x.plus.c = 22:35, x = 21:34, change = round(mu.hat[-1]/mu.hat[-15],2)) # Wald interval with c = 1 beta.ci <- confint.default(object = mod.fit, parm = "width", level = 0.95) beta.ci exp(beta.ci) 100*(exp(beta.ci) - 1) # Calculation without confint.default vcov(mod.fit) # Var^(beta^_1) is in the (2,2) element of the matrix beta.ci <- mod.fit$coefficients[2] + qnorm(p = c(0.025, 0.975))*sqrt(vcov(mod.fit)[2,2]) 100*(exp(beta.ci) - 1) # Profile likelihood interval beta.ci <- confint(object = mod.fit, parm = "width", level = 0.95) beta.ci exp(beta.ci) 100*(exp(beta.ci) - 1) # emmeans library(package = emmeans) calc.est <- emmeans(object = mod.fit, specs = ~ width, at = list(width = c(31, 30)), type = "response") test.info <- contrast(object = calc.est, method = "pairwise") confint(object = test.info, adjust = "none", level = 0.95) # Profile LR interval using mcprofile library(package = mcprofile) K <- matrix(data = c(0, 1), nrow = 1, ncol = 2) linear.combo <- mcprofile(object = mod.fit, CM = K) # Calculate -2log(Lambda) ci.beta <- confint(object = linear.combo, level = 0.95) ci.beta$confint 100*(exp(ci.beta$confint) - 1) ################################################################## # Rate data # Convert data total.sat <- aggregate(x = satellite ~ width, data = crab, FUN = sum) # Number of satellites per unique width numb.crab <- aggregate(x = satellite ~ width, data = crab, FUN = length) # Number of crabs per unique width rate.data <- data.frame(total.sat, numb.crab = numb.crab$satellite) head(rate.data) # Estimate model mod.fit.rate <- glm(formula = satellite ~ width + offset(log(numb.crab)), data = rate.data, family = poisson(link = log)) summary(mod.fit.rate) predict(object = mod.fit.rate, newdata = data.frame(width = c(23, 23), numb.crab = c(1, 2)), type = "response") # Find the number of unique values of t and put into a vector plot.char.numb <- as.numeric(names(table(rate.data$numb.crab))) # Do this first to get gridlines BEHIND the plotting points :) plot(x = rate.data$width, y = rate.data$satellite, xlab = "Width (cm)", ylab = "Number of satellites per distinct width", type = "n", panel.first = grid(), main = "Horseshoe crab data set \n with Poisson regression model fit (rate data)") # Put observed values and estimated model on plot by values of t for (t in plot.char.numb) { # t<-plot.char.numb[1] width.t <- rate.data$width[rate.data$numb.crab == plot.char.numb[t]] satellite.t <- rate.data$satellite[rate.data$numb.crab == plot.char.numb[t]] points(x = width.t, y = satellite.t, pch = as.character(plot.char.numb[t]), cex = 0.5, col = t) curve(expr = t * exp(mod.fit.rate$coefficients[1] + mod.fit.rate$coefficients[2]*x), xlim = c(min(width.t), max(width.t)), lty = "solid", col = t, add = TRUE) } # Same values Anova(mod.fit.rate) Anova(mod.fit) #