#Bird data and hypothesis testing ##################################################################### # Data in a contigency table format #Create contingency table - notice the data is entered by columns c.table <- array(data = c(251, 48, 34, 5), dim = c(2,2), dimnames = list(First = c("made", "missed"), Second = c("made", "missed"))) c.table #Convert to raw form set1 <- as.data.frame(as.table(c.table)) set1 set2 <- set1[rep(1:nrow(set1), times = set1$Freq), -3] head(set2) tail(set2) table(set2) #table(set2$First, set2$Second) chisq.test(x = set2[,1], y = set2[,2], correct = FALSE) #This also works: chisq.test(x = c.table, correct = FALSE) ######################################################################### # Bootstrap approach library(boot) #Construct form of data set needed set3 <- rbind(data.frame(index = set2[,1], table.part = "X"), data.frame(index = set2[,2], table.part = "Y")) head(set3) tail(set3) #Perform the test calc.t <- function(data, i, dim.table) { d <- data[i,] x <- d[d$table.part == "X",] #Notice use of == y <- d[d$table.part == "Y",] #Note: chisq.test() will return an error message if I<2 or J<2 x.sq <- chisq.test(x = x$index, y = y$index, correct = FALSE) ck.row <- nrow(x.sq$observed) == dim.table[1] ck.col <- ncol(x.sq$observed) == dim.table[2] c(x.sq$statistic, ck.row, ck.col) } #Test calc.t with observed data calc.t(data = set3, i = 1:nrow(set3), dim.table = c(nrow(c.table), ncol(c.table))) set.seed(9180) R <- 4999 boot.res <- boot(data = set3, statistic = calc.t, R = R, sim = "ordinary", strata = set3$table.part, dim.table = c(nrow(c.table), ncol(c.table))) boot.res #Plot method function for boot class plot(boot.res, qdist = "chisq", df = (nrow(c.table)-1)*(ncol(c.table)-1)) #p-value (1 + sum(boot.res$t[,1]>=boot.res$t0[1]))/(R + 1) #Reform a contingency table* save.index <- boot.array(boot.res, indices = TRUE) i <- save.index[3,] d <- set3[i,] x <- d[d$table.part == "X",] y <- d[d$table.part == "Y",] x.sq <- chisq.test(x = x$index, y = y$index, correct = FALSE) x.sq$observed x.sq$statistic boot.res$t[3,] #table(x$index, y$index) #Also works but does not give X and Y labels c.table #Original summarize <- function(result.set, statistic, df, R, color.line = "red") { par(mfrow = c(1,3), mar = c(5,4,4,0.5)) # Histogram hist(x = result.set, main = "Histogram", freq = FALSE, xlab = expression(X^{"2*"})) curve(expr = dchisq(x = x, df = df), col = color.line, add = TRUE, lwd = 2) segments(x0 = statistic, y0 = -10, x1 = statistic, y1 = 10) # Compare CDFs plot.ecdf(x = result.set, verticals = TRUE, do.p = FALSE, main = "CDFs", lwd = 2, col = "black", xlab = expression(X^"2*"), ylab = "CDF") curve(expr = pchisq(q = x, df = df), col = color.line, add = TRUE, lwd = 2, lty = "dotted") legend(x = df, y = 0.4, legend = c(expression(Perm.), substitute(chi[df1]^2, list(df1 = df))), lwd = c(2,2), col = c("black", color.line), lty = c("solid", "dotted"), bty = "n") # When expression() is removed, the substitute() part oddly does not work # QQ-Plot chi.quant <- qchisq(p = seq(from = 1/(R+1), to = 1-1/(R+1), by = 1/(R+1)), df = df) plot(x = sort(result.set), y = chi.quant, main = "QQ-plot", xlab = expression(X^{"2*"}), ylab = "Chi-square quantiles") abline(a = 0, b = 1) par(mfrow = c(1,1)) # p-value (1 + sum(result.set >= statistic))/(R + 1) } win.graph(width = 9, height = 6, pointsize = 20) summarize(result.set = boot.res$t[,1], statistic = boot.res$t0[1], R = R, df = (nrow(c.table)-1)*(ncol(c.table)-1)) ######################################################################### # Permutation approach # Perform the test x.sq <- function(data, i, row.var) { col.var <- data[i] chisq.test(x = row.var, y = col.var, correct = FALSE)$statistic } x.sq(data = set2[,2], i = 1:nrow(set2), row.var = set2[,1]) set.seed(1198) perm.res <- boot(data = set2[,2], statistic = x.sq, R = R, sim = "permutation", row.var = set2[,1]) perm.res #p-value (1 + sum(perm.res$t[,1] >= perm.res$t0[1]))/(R + 1) win.graph(width = 9, height = 6, pointsize = 20) summarize(result.set = perm.res$t[,1], statistic = perm.res$t0[1], R = R, df = (nrow(c.table)-1)*(ncol(c.table)-1)) #Summary of all x^2* values table(round(perm.res$t,2)) #Show the margins are the same as observed save.index <- boot.array(boot.out = perm.res, indices = TRUE) i <- save.index[3,] d <- set2[i,2] x.sq <- chisq.test(x = set2[,1], y = d, correct = FALSE) x.sq$observed x.sq$statistic perm.res$t[3] rowSums(x.sq$observed) colSums(x.sq$observed) rowSums(c.table) colSums(c.table) ######################################################################### # Simpler permutation test implementation # In the chisq.test() code, it uses the following for the p-value: PVAL <- (1 + sum(ss >= STATISTIC))/(B + 1) set.seed(6611) chisq.test(c.table, correct = FALSE, simulate.p.value = TRUE, B = 4999) #