# Regression model for GPA data
# Read in the comma-delimited data file - change the file location for your own computer.
# A backslash inside an R string must be doubled because a single backslash
# starts an escape sequence.
gpa <- read.csv(file = "C:\\data\\GPA.csv")
# Print the data (typing an object name at the top level prints it)
gpa
# Alternative ways to read in the file; each result is stored in its own
# object (gpa2, ..., gpa5) so that gpa itself is left untouched
gpa2 <- read.csv(file = "C:/data/GPA.csv") # Use one forward slash rather than two backslashes
gpa3 <- read.table(file = "C:\\data\\GPA.csv", header = TRUE, sep = ",") # This is a more general R function for reading in data
# NOTE: setwd() changes the working directory for the remainder of the R
# session, so the relative file name on the next line is resolved against it
setwd(dir = "C:\\data") # This sets the "working directory" to a file location
gpa4 <- read.csv(file = "GPA.csv") # Relative file name - relies on the setwd() call above
gpa5 <- read.csv(file = file.choose()) # This will prompt the user to locate the file in a window
# Read in the space-delimited data file
# sep = "" tells read.table() to treat any white space as the delimiter
gpa6 <- read.table(file = "C:\\data\\GPA.txt", header = TRUE, sep = "")
gpa6
#########################################################################
# Examples of how to read in an Excel file
# Each example below overwrites set1 (and set1.df), so only the result of the
# last example run remains available afterward
library(package = readxl)
# The data is read in as a tibble, which is a different type of data frame
# Tibbles are used with the tidyverse paradigm of R
# Example 1: read_xls() for a file with the .xls extension
set1 <- read_xls(path = "C:\\data\\GPA.xls")
# While not usually needed, we transform the tibble to a regular data frame
set1.df <- as.data.frame(set1)
# Example 2: read_xlsx() for a file with the .xlsx extension
set1 <- read_xlsx(path = "C:\\data\\GPA.xlsx")
# While not usually needed, we transform the tibble to a regular data frame
set1.df <- as.data.frame(set1)
# Example 3: read_excel() applied to the same .xlsx file
set1 <- read_excel(path = "C:\\data\\GPA.xlsx")
# While not usually needed, we transform the tibble to a regular data frame
set1.df <- as.data.frame(set1)
# Example 4: the xlsx package
# This package requires having Java on a computer
library(package = xlsx)
# The worksheet is selected by name; "sheet1" must match the name of the
# worksheet inside the file
set1 <- read.xlsx(file = "C:\\data\\GPA.xlsx", sheetName = "sheet1")
# In the first edition of the book, we used the RODBC package to read in data
# from Excel files. This package only works for 32-bit Excel, so we no longer
# include the code here.
#########################################################################
# Continue with the introductory code
# Summary statistics for variables
summary(object = gpa) # summary(gpa) works as well
# Simple plot: scatter plot of college GPA (y-axis) against high school GPA (x-axis)
plot(x = gpa$HSGPA, y = gpa$CollegeGPA)
# Better plot: axis labels, title, fixed axis limits, red open circles (pch = 1),
# and a dotted gray grid passed through panel.first
# Uncomment the pdf() and dev.off() lines to send the plot to a PDF file instead of the screen
# pdf(file = "c:\\figures\\FigureA.5color.pdf", width = 7, height = 6, colormodel = "cmyk") # Create plot for book
plot(x = gpa$HSGPA, y = gpa$CollegeGPA, xlab = "HS GPA", ylab = "College GPA", main = "College GPA vs. HS GPA",
xlim = c(0,4.5), ylim = c(0,4.0), col = "red", pch = 1, cex = 1.0, panel.first = grid(col = "gray", lty = "dotted"))
# Code used 238 characters
# dev.off() # Close the PDF file created for the book
# Black-and-white version of plot (same call with col = "black")
# pdf(file = "c:\\figures\\FigureA.5BW.pdf", width = 7, height = 6, colormodel = "cmyk") # Create plot for book
plot(x = gpa$HSGPA, y = gpa$CollegeGPA, xlab = "HS GPA", ylab = "College GPA", main = "College GPA vs. HS GPA",
xlim = c(0,4.5), ylim = c(0,4.0), col = "black", pch = 1, cex = 1.0, panel.first = grid(col = "gray", lty = "dotted"))
# dev.off() # Close the PDF file created for the book
# Simple plot using the ggplot2 package
library(package = ggplot2)
# save.plot holds only the data and the aesthetic mapping; layers are added with +
save.plot <- ggplot(data = gpa, mapping = aes(x = HSGPA, y = CollegeGPA))
save.plot + geom_point() # Add points to the plot; could have included geom_point() in previous line
# Better plot using the ggplot2 package
theme_set(new = theme_bw()) # Removes awful gray background!
# NOTE: the y-axis here runs from 0 to 4.5, whereas the base R "better plot" above uses 0 to 4.0
save.plot + geom_point(color = "red", shape = 1) + xlim(0, 4.5) + ylim(0,4.5) +
ggtitle(label = "College GPA vs. HS GPA") + xlab(label = "HS GPA") +
ylab(label = "College GPA") + theme(plot.title = element_text(hjust = 0.5))
# Code used 293 characters
# Can also use for plotting labels: labs(title = "College GPA vs. HS GPA", x = "HS GPA", y = "College GPA")
# The title is no longer centered with ggplot2 2.2.0 and higher, so we used theme(plot.title = element_text(hjust = 0.5))
# Print just one variable
# options() returns the previous settings, so we save them and restore the
# user's actual console width afterward rather than assuming it was 80
old.opt <- options(width = 60) # Used in book to limit width of line in R Console window
gpa$HSGPA
options(old.opt) # Set back to the width in effect before this section
# Two equivalent ways to extract the same variable with [row, column] indexing
gpa[,1] # By column number (assumes HSGPA is the first column)
gpa[, "HSGPA"] # By column name
# Other parts of the data frame
gpa[1,1] # row 1 and column 1 value
gpa[1:10,1] # first 10 observations of variable 1
gpa[, c("HSGPA", "CollegeGPA")] # Both variables, all rows (the whole data set if these are its only columns)
########################################################################
# Find estimated simple linear regression model
# Fit the simple linear regression model and save the results in mod.fit
# The formula CollegeGPA ~ HSGPA means CollegeGPA is the response and HSGPA is
# the explanatory variable; both are looked up in the gpa data frame
# mod.fit is reused by the plotting sections later in this program
mod.fit <- lm(formula = CollegeGPA ~ HSGPA, data = gpa)
# A very brief look at what is inside of mod.fit - see the summary function for a better way
mod.fit
# See the names of all of the object components
names(mod.fit)
# Components are extracted with the $ operator
mod.fit$coefficients
round(mod.fit$residuals[1:5], digits = 2) # First five residuals, rounded to two decimal places
# Put some of the components into a data.frame object
# (the original data plus the rounded fitted values and residuals as new columns)
save.fit <- data.frame(gpa, C.GPA.hat = round(mod.fit$fitted.values, digits = 2), residuals = round(mod.fit$residuals, digits = 2))
# We could instead put C.GPA.hat and residuals directly in the gpa data frame
# using the code below
# gpa$C.GPA.hat <- round(mod.fit$fitted.values, digits = 2)
# gpa$residuals <- round(mod.fit$residuals, digits = 2)
# Print the contents of save.fit
head(save.fit) # First few rows only
save.fit # All rows
# Summarize the information stored in mod.fit
summary(object = mod.fit)
# Class of objects
class(mod.fit)
class(gpa)
# Method functions for a class of type lm
# options(width = 70) # Used for book to control width displayed
methods(class = "lm")
# Method functions for the summary generic function
methods(generic.function = "summary")
########################################################################
# Put regression line on plot
# This section uses mod.fit, the lm() fit of CollegeGPA on HSGPA found earlier
# Open a new graphics window - there are a number of ways
# win.graph(width = 8, height = 6, pointsize = 10) # Windows
# windows(width = 8, height = 6, pointsize = 10) # Windows
# quartz(width = 8, height = 6, pointsize = 10) # Mac
dev.new(width = 8, height = 6, pointsize = 10) # Originally for Linux, but works for other operating systems
# pdf(file = "c:\\figures\\FigureA.6color.pdf", width = 8, height = 6, colormodel = "cmyk", pointsize = 10) # Create plot for book in a PDF file
# 1 row and 2 columns of plots
# This layout stays in effect on the current device until par(mfrow = ...) is changed again
par(mfrow = c(1,2))
# Left plot: same scatter plot as before
plot(x = gpa$HSGPA, y = gpa$CollegeGPA, xlab = "HS GPA", ylab = "College GPA", main = "College GPA vs. HS GPA",
xlim = c(0,4.5), ylim = c(0,4.0), panel.first =grid(col = "gray", lty = "dotted"))
# Puts the line y = a + bx on the plot
# abline() draws the line across the whole plotting region
abline(a = mod.fit$coefficients[1], b = mod.fit$coefficients[2], lty = "solid", col = "blue", lwd = 2)
# Right plot: same scatter plot as before
plot(x = gpa$HSGPA, y = gpa$CollegeGPA, xlab = "HS GPA", ylab = "College GPA", main = "College GPA vs. HS GPA",
xlim = c(0,4.5), ylim = c(0,4.0), panel.first = grid(col = "gray", lty = "dotted"))
# Add line
# curve() evaluates the expression in x only between the smallest and largest
# observed HSGPA, so the line is not extended beyond the range of the data
curve(expr = mod.fit$coefficients[1] + mod.fit$coefficients[2]*x, xlim = c(min(gpa$HSGPA),max(gpa$HSGPA)),
col= "blue", add = TRUE, lwd = 2)
# dev.off() # End creating plot for book in a PDF file
# Alternative to curve(): draw a line from (x0, y0) to (x1, y1)
# segments(x0 = min(gpa$HSGPA), y0 = mod.fit$coefficients[1] + mod.fit$coefficients[2]*min(gpa$HSGPA),
# x1 = max(gpa$HSGPA), y1 = mod.fit$coefficients[1] + mod.fit$coefficients[2]*max(gpa$HSGPA),
# lty = 1, col = "blue", lwd = 2)
# Black-and-white version of plot (same two plots with black lines)
# pdf(file = "c:\\figures\\FigureA.6BW.pdf", width = 8, height = 6, colormodel = "cmyk", pointsize = 10) # Create plot for book
par(mfrow = c(1,2))
# Left plot: line drawn with abline()
plot(x = gpa$HSGPA, y = gpa$CollegeGPA, xlab = "HS GPA", ylab = "College GPA", main = "College GPA vs. HS GPA",
xlim = c(0,4.5), ylim = c(0,4.0), panel.first=grid(col = "gray", lty = "dotted"))
abline(a = mod.fit$coefficients[1], b = mod.fit$coefficients[2], lty = "solid", col = "black", lwd = 2)
# Right plot: line drawn with curve() over the observed range of HSGPA
plot(x = gpa$HSGPA, y = gpa$CollegeGPA, xlab = "HS GPA", ylab = "College GPA", main = "College GPA vs. HS GPA",
xlim = c(0,4.5), ylim = c(0,4.0), panel.first=grid(col = "gray", lty = "dotted"))
curve(expr = mod.fit$coefficients[1] + mod.fit$coefficients[2]*x, xlim = c(min(gpa$HSGPA),max(gpa$HSGPA)),
col= "black", add = TRUE, lwd = 2)
# dev.off() # End creating plot for book in a PDF file
########################################################################
# Create a function to find the estimated simple linear regression model
# and put the line on a scatter plot
# First, return the current graphics device to one plot per page
# (the previous section set a 1 row by 2 column layout)
par(mfrow = c(1,1))
my.reg.func <- function(x, y, data) {
  # Purpose: fit the simple linear regression of y on x, draw a scatter plot
  #   of the data with the estimated regression line in a new graphics
  #   window, and return the fitted model.
  # Arguments:
  #   x    - vector of explanatory variable values
  #   y    - vector of response values (same length as x)
  #   data - kept so that existing calls continue to work; it is not used in
  #          the fit. Previously it was given to lm(), where a column named
  #          "x" or "y" in data would silently replace the x and y arguments
  #          in the fit while the plot still showed the arguments.
  # Returns: the object of class "lm" from fitting y ~ x.

  stopifnot(length(x) == length(y))

  # Fit simple linear regression model using exactly the values passed in
  xy <- data.frame(x = x, y = y)
  mod.fit <- lm(formula = y ~ x, data = xy)

  # Open a new graphics window - do not need to
  dev.new(width = 6, height = 6, pointsize = 10)

  # Scatter plot of the data with a dotted gray grid
  plot(x = x, y = y, xlab = "x", ylab = "y", main = "y vs. x",
    panel.first = grid(col = "gray", lty = "dotted"))

  # Include regression model, drawn only over the observed range of x.
  # Observations with a missing x or y are left out of the range so that
  # curve() is not given NA limits.
  x.range <- range(xy$x[complete.cases(xy)])
  beta.hat <- mod.fit$coefficients
  # Inside expr, x is the grid of values generated by curve(), not the
  # function argument
  curve(expr = beta.hat[1] + beta.hat[2]*x, xlim = x.range, col = "blue",
    add = TRUE, lwd = 2)

  # This is the object returned
  mod.fit
}
# Use the function with the GPA data; the plot appears in a new graphics
# window and the fitted model returned by the function is saved in save.it
save.it <- my.reg.func(x = gpa$HSGPA, y = gpa$CollegeGPA, data = gpa)
# The returned object can be used in the same ways as mod.fit
names(save.it)
summary(save.it)
#########################################################################
# Specific x-axis values
# Note that xaxt = "n" tells R to not give any labels on the x-axis (yaxt = "n" works for y-axis)
plot(x = gpa$HSGPA, y = gpa$CollegeGPA, xlab = "HS GPA", ylab = "College GPA", main = "College GPA vs. HS GPA",
xaxt = "n", xlim = c(0, 4.5), ylim = c(0,4.5), col = "red", pch = 1, cex = 1.0)
# The x-axis is then drawn by hand with axis(); side = 1 is the bottom of the plot
axis(side = 1, at = seq(from = 0, to = 4.5, by = 0.5)) # Major tick marks (labeled) at every 0.5
axis(side = 1, at = seq(from = 0, to = 4.5, by = 0.1), tck = 0.01, labels = FALSE) # Minor tick marks (unlabeled) at every 0.1
########################################################################
# Example of getting mathematical characters on a plot
# The title is given through expression(): hat() puts a hat over a symbol,
# [ ] gives a subscript, and == displays an equal sign
plot(x = gpa$HSGPA, y = gpa$CollegeGPA, xlab = "HS GPA", ylab = "College GPA",
main = expression(hat(Y) == hat(beta)[0] + hat(beta)[1]*x),
xlim = c(0,4.5), ylim = c(0,4.5), col = "red", pch = 1, cex = 1.0, panel.first=grid(col = "gray", lty = "dotted"))
# Draw a line from (x0, y0) to (x1, y1)
# The end points are the fitted values from mod.fit at the smallest and largest observed HSGPA
segments(x0 = min(gpa$HSGPA), y0 = mod.fit$coefficients[1] + mod.fit$coefficients[2]*min(gpa$HSGPA),
x1 = max(gpa$HSGPA), y1 = mod.fit$coefficients[1] + mod.fit$coefficients[2]*max(gpa$HSGPA),
lty = 1, col = "blue", lwd = 2)
# Second example: paste() inside expression() combines ordinary text with
# mathematical notation; widehat() puts a wide hat over a longer name
# College.GPA and HS.GPA in the title are only symbols to display, not variables in gpa
plot(x = gpa$HSGPA, y = gpa$CollegeGPA, xlab = "HS GPA", ylab = "College GPA",
main = expression(paste("College GPA vs. HS GPA and ", widehat(College.GPA) == hat(beta)[0] + hat(beta)[1]*HS.GPA)),
xlim = c(0,4.5), ylim = c(0,4.5), col = "red", pch = 1, cex = 1.0, panel.first=grid(col = "gray", lty = "dotted"))
# Draw a line from (x0, y0) to (x1, y1) - same regression line as in the previous plot
segments(x0 = min(gpa$HSGPA), y0 = mod.fit$coefficients[1] + mod.fit$coefficients[2]*min(gpa$HSGPA),
x1 = max(gpa$HSGPA), y1 = mod.fit$coefficients[1] + mod.fit$coefficients[2]*max(gpa$HSGPA),
lty = 1, col = "blue", lwd = 2)
demo(plotmath) # Run this to see examples of the available mathematical notation
########################################################################
# Open R Commander
# The Rcmdr package needs to be installed before this line will work
library(package = Rcmdr)
#