#Bird data and hypothesis testing

#####################################################################
#  Data in a contigency table format

  #Create contingency table - notice the data is entered by columns
  c.table <- array(data = c(251, 48, 34, 5), dim = c(2,2), dimnames = list(First = c("made", "missed"),
                 Second = c("made", "missed")))
  c.table

  #Convert to raw form
  set1 <- as.data.frame(as.table(c.table))
  set1
  set2 <- set1[rep(1:nrow(set1), times = set1$Freq), -3]
  head(set2)
  tail(set2)
  table(set2)
  #table(set2$First, set2$Second)

  chisq.test(x = set2[,1], y = set2[,2], correct = FALSE)

  #This also works:
  chisq.test(x = c.table, correct = FALSE)


#########################################################################
# Bootstrap approach

  library(boot)

  #Construct form of data set needed
  set3 <- rbind(data.frame(index = set2[,1], table.part = "X"),
              data.frame(index = set2[,2], table.part = "Y"))
  head(set3)
  tail(set3)

  #Perform the test
  calc.t <- function(data, i, dim.table) {
    d <- data[i,]
    x <- d[d$table.part == "X",]  #Notice use of ==
    y <- d[d$table.part == "Y",]

    #Note: chisq.test() will return an error message if I<2 or J<2
    x.sq <- chisq.test(x = x$index, y = y$index, correct = FALSE)
    ck.row <- nrow(x.sq$observed) == dim.table[1]
    ck.col <- ncol(x.sq$observed) == dim.table[2]

    c(x.sq$statistic, ck.row, ck.col)
  }

  #Test calc.t with observed data
  calc.t(data = set3, i = 1:nrow(set3), dim.table = c(nrow(c.table), ncol(c.table)))

  set.seed(9180)
  R <- 4999
  boot.res <- boot(data = set3, statistic = calc.t, R = R, sim = "ordinary", strata = set3$table.part,
    dim.table = c(nrow(c.table), ncol(c.table)))
  boot.res

  #Plot method function for boot class
  plot(boot.res, qdist = "chisq", df = (nrow(c.table)-1)*(ncol(c.table)-1))

  #p-value
  (1 + sum(boot.res$t[,1]>=boot.res$t0[1]))/(R + 1)

  #Reform a contingency table*
  save.index <- boot.array(boot.res, indices = TRUE)
  i <- save.index[3,]
  d <- set3[i,]
  x <- d[d$table.part == "X",]
  y <- d[d$table.part == "Y",]
  x.sq <- chisq.test(x = x$index, y = y$index, correct = FALSE)
  x.sq$observed
  x.sq$statistic
  boot.res$t[3,]

  #table(x$index, y$index)  #Also works but does not give X and Y labels
  c.table  #Original

  summarize <- function(result.set, statistic, df, R, color.line = "red") {

    par(mfrow = c(1,3), mar = c(5,4,4,0.5))

    # Histogram
    hist(x = result.set, main = "Histogram", freq = FALSE,
     xlab = expression(X^{"2*"}))
    curve(expr = dchisq(x = x, df = df), col = color.line, add = TRUE, lwd = 2)
    segments(x0 = statistic, y0 = -10, x1 = statistic, y1 = 10)

    # Compare CDFs
    plot.ecdf(x = result.set, verticals = TRUE, do.p = FALSE, main = "CDFs", lwd = 2, col = "black",
      xlab = expression(X^"2*"),
      ylab = "CDF")
    curve(expr = pchisq(q = x, df = df), col = color.line, add = TRUE, lwd = 2, lty = "dotted")
    legend(x = df, y = 0.4, legend = c(expression(Perm.), substitute(chi[df1]^2, list(df1 = df))), lwd = c(2,2),
      col = c("black", color.line), lty = c("solid", "dotted"), bty = "n")  # When expression() is removed, the substitute() part oddly does not work

    # QQ-Plot
    chi.quant <- qchisq(p = seq(from = 1/(R+1), to = 1-1/(R+1), by = 1/(R+1)), df = df)
    plot(x = sort(result.set), y = chi.quant, main = "QQ-plot",
      xlab = expression(X^{"2*"}), ylab = "Chi-square quantiles")
    abline(a = 0, b = 1)

    par(mfrow = c(1,1))

    # p-value
    (1 + sum(result.set >= statistic))/(R + 1)
  }

  win.graph(width = 9, height = 6, pointsize = 20)
  summarize(result.set = boot.res$t[,1], statistic = boot.res$t0[1], R = R,
    df = (nrow(c.table)-1)*(ncol(c.table)-1))


#########################################################################
# Permutation approach

  # Perform the test
  x.sq <- function(data, i, row.var) {
    col.var <- data[i]
    chisq.test(x = row.var, y = col.var, correct = FALSE)$statistic
  }

  x.sq(data = set2[,2], i = 1:nrow(set2), row.var = set2[,1])

  set.seed(1198)
  perm.res <- boot(data = set2[,2], statistic = x.sq, R = R, sim = "permutation", row.var = set2[,1])
  perm.res

  #p-value
  (1 + sum(perm.res$t[,1] >= perm.res$t0[1]))/(R + 1)

  win.graph(width = 9, height = 6, pointsize = 20)
  summarize(result.set = perm.res$t[,1], statistic = perm.res$t0[1], R = R,
    df = (nrow(c.table)-1)*(ncol(c.table)-1))

  #Summary of all x^2* values
  table(round(perm.res$t,2))

  #Show the margins are the same as observed
  save.index <- boot.array(boot.out = perm.res, indices = TRUE)
  i <- save.index[3,]
  d <- set2[i,2]
  x.sq <- chisq.test(x = set2[,1], y = d, correct = FALSE)
  x.sq$observed
  x.sq$statistic
  perm.res$t[3]
  rowSums(x.sq$observed)
  colSums(x.sq$observed)
  rowSums(c.table)
  colSums(c.table)


#########################################################################
# Simpler permutation test implementation
#   In the chisq.test() code, it uses the following for the p-value: PVAL <- (1 + sum(ss >= STATISTIC))/(B + 1)

  set.seed(6611)
  chisq.test(c.table, correct = FALSE, simulate.p.value = TRUE, B = 4999)

  


 
  
#