######################################################################
# Show CLT does work #
# #
######################################################################
#Plot of distribution for Y_i
# Let X = 4*W where W~beta(5,2) (for those who have had a mathematical statistics class)
curve(expr = 15/2048 * x^4 * (4 - x), from = 0, to = 4, col = "red",
main = "Probability distribution for GPAs", ylab = "f(y)", xlab = "y")
abline(h = 0)
#Mean and variance
mu <- 2.8571 #4 * 5/(5+2)
sigma.sq <- 0.4082 #See formula for Var(W) from a book like Casella and Berger (2001) and then multiply it by 4^2
#####################################################################
# Simulate the samples
n <- 20
set.seed(8129)
#Simulated samples from a beta probability distribution (4*W where W~beta(5,2))
# Students are not responsible for how I simulated data.
set1 <- 4*matrix(data = rbeta(n = 1000*n, shape1 = 5, shape2 = 2), nrow = 1000, ncol = n)
head(round(set1,2))
ybar <- rowSums(set1)/n
head(round(ybar,2))
#Histogram
hist(x = ybar, main = "Histogram of sample means", xlab = expression(bar(y)))
#Histogram with normal distribution overlay
hist(x = ybar, main = "Histogram of sample means", xlab = expression(bar(y)), freq = FALSE)
curve(expr = dnorm(x = x, mean = mu, sd = sqrt(sigma.sq/n)), add = TRUE, col = "red")
#Same plot, but with more bars
hist(x = ybar, main = "Histogram of sample means", xlab = expression(bar(y)), freq = FALSE, breaks = 15)
curve(expr = dnorm(x = x, mean = mu, sd = sqrt(sigma.sq/n)), add = TRUE, col = "red")
#Estimate of mu
mean(ybar)
#Estimate of sigma/sqrt(n)
sd(ybar)
#Actual sigma/sqrt(n)
sqrt(sigma.sq/n)
#Estimate of P(3 < Ybar < 4)
sum(ybar <= 4)/1000 - sum(ybar <= 3)/1000 #Also could just do sum(ybar > 3) since all GPAs <= 4 in data
#Estimate of P(3 < Ybar < 4) using a normal distribution approximation
pnorm(q = 4, mean = mu, sd = sqrt(sigma.sq/n)) - pnorm(q = 3, mean = mu, sd = sqrt(sigma.sq/n))
#