basic statistics in R
Some simple statistics functions in R (based on the course MATH204).
# read data
data <- read.csv("data.csv")
# utilities for inspecting and summarizing data
ncol(data) # number of columns
nrow(data) # number of rows
colnames(data) # column names
rownames(data) # row names (if they exist)
head(data) # get the first "few" rows of a dataframe
tail(data) # get the last "few" rows of a dataframe
summary(data) # get more information than you'd ever need
data[data$x == "something", ] # get all rows of data matching a condition
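# conditions can be combined with & (and) and | (or); for example, assuming a numeric column y:
data[data$x == "something" & data$y > 10, ] # rows where both conditions hold
subset(data, x == "something" & y > 10) # same thing with subset()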
data.lm <- lm(y ~ x, data = data) # fit a linear model
# working with the model (also applies to non-linear models made this way)
data.lm # just displays the coefficients
summary(data.lm) # displays coefficients, their standard errors, t-statistics, and p-values, etc.
anova(data.lm) # analysis of variance; F-statistic, etc
confint(data.lm, level=1-0.01) # confidence intervals for the coefficients (level=0.99, i.e. 99%)
predict(data.lm, newdata = data.frame(x = 1:10)) # predict y values for x = 1:10
# make sure the column in the newdata is named the same as the column passed to lm!
# using predict, you can either predict for an individual response or mean response
# just change the "interval" argument to "prediction" or "confidence" respectively
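# for example (assuming the predictor column is named x, as above):
predict(data.lm, newdata = data.frame(x = 1:10), interval = "confidence") # CI for the mean response
predict(data.lm, newdata = data.frame(x = 1:10), interval = "prediction") # PI for an individual response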
# a linear model can be added as a line to a plot
plot(y ~ x, data = data)
abline(data.lm, col = "red")
# a non-linear model's fitted curve can also be added; sort by x first so the line draws left to right
ix <- order(data$x) # indices that put x in increasing order
lines(data$x[ix], fitted(data.lm)[ix], col="red")
# this can be made a little fancier with ggplot to add the confidence intervals
library(ggplot2)
ggplot(data, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE, color = "red")
# lm can also be used to fit a polynomial
lm(y ~ x + I(x^2), data = data) # I() makes ^ behave as arithmetic rather than formula syntax; a plotting sketch follows below
# or covariates
lm(y ~ x + z, data = data)
# or interactions
lm(y ~ x*z, data = data) # note that this is the same as lm(y ~ x + z + x:z, data = data)
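# a sketch of plotting a fitted curve such as the polynomial above (data.qm is a made-up name):
data.qm <- lm(y ~ x + I(x^2), data = data)
xs <- seq(min(data$x), max(data$x), length.out = 100) # a fine grid of x values
plot(y ~ x, data = data)
lines(xs, predict(data.qm, newdata = data.frame(x = xs)), col = "red")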
# test statistics can also be manually found
alpha = 0.05
df = 10
# t-test
t.test(y ~ x, data=data) # Welch two-sample t-test (two-sided, unequal variances by default)
t.test(y ~ x, data=data, var.equal=T) # pooled (Student's) t-test; assumes equal variances
t.test(y ~ x, data=data, alternative="greater") # one-tailed version
qt(1 - alpha/2, df = df) # critical t-value for a 95% confidence interval with 10 degrees of freedom
2*pt(-abs(2.228), df = df) # two-tailed p-value for a t-statistic of 2.228 with 10 degrees of freedom
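# a worked sketch: one-sample t-statistic computed by hand (mu0 is a made-up null value)
mu0 <- 0
t.stat <- (mean(data$y) - mu0) / (sd(data$y) / sqrt(nrow(data)))
2*pt(-abs(t.stat), df = nrow(data) - 1) # two-tailed p-value; matches t.test(data$y, mu = mu0)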
# Z-test
qnorm(1 - alpha/2) # critical z-value for a 95% confidence interval (about 1.96)
2*pnorm(-abs(1.96)) # p-value for a z-statistic of 1.96; two-tailed
# F-test
qf(1 - alpha, df1 = 1, df2 = df) # critical F-value at the 5% significance level with 1 and 10 degrees of freedom
aov(y ~ x, data=data) # another way of doing a global F-test; wrap with summary() to get more information
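# a sketch of using the critical value directly: reject the null hypothesis when the
# observed F exceeds it (assumes the simple model data.lm from above)
f.obs <- anova(data.lm)["x", "F value"] # observed F-statistic for predictor x
f.obs > qf(1 - alpha, df1 = 1, df2 = nrow(data) - 2) # TRUE means significant at the 5% level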
# residuals
residuals(data.lm) # get the residuals of a model
data.lm$residuals # same as above
library(MASS) # provides stdres() (and studres()) for residual diagnostics
data.lm.stdres <- stdres(data.lm) # standardized residuals (stdres takes the fitted model, not the residuals)
par(mfrow = c(1, 2))
hist(data.lm.stdres) # plot the standardized residuals
qqnorm(data.lm.stdres) # normal Q-Q plot of the standardized residuals
qqline(data.lm.stdres) # add the reference line to the Q-Q plot
# plotting these side-by-side is a good way to check if the residuals are normally distributed
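# base R's built-in diagnostics cover similar ground in one call
par(mfrow = c(2, 2))
plot(data.lm) # residuals vs fitted, normal Q-Q, scale-location, residuals vs leverage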
library(interactions)
interact_plot(data.lm, pred = x.1, modx = x.2) # plot interactions (assuming data.lm is a model with an interaction)
boxplot(y ~ x, data=data) # box-plot, helpful when x is qualitative
aggregate(x = data$y, by = list(data$x), FUN = mean) # get the mean of y for each x
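# the same summary via the formula interface
aggregate(y ~ x, data = data, FUN = mean)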
anova(mod_1, mod_2) # compare two nested models with a partial F-test; mod_1 nested within mod_2
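# for example, testing whether the interaction above is needed (data.add/data.int are made-up names):
data.add <- lm(y ~ x + z, data = data)
data.int <- lm(y ~ x*z, data = data)
anova(data.add, data.int) # a small p-value supports keeping the interaction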
chisq.test(x_1, x_2) # chi-squared test, for categorical data
qchisq(1 - alpha, df = df, lower.tail=T) # chi-squared critical value
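# chisq.test also accepts a contingency table built with table()
# (assuming x_1 and x_2 are categorical vectors, as above)
chisq.test(table(x_1, x_2))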
fisher.test(x_1, x_2) # fisher's exact test, for categorical data
mcnemar.test(data, correct = FALSE) # mcnemar's test, for paired categorical data (expects a 2x2 contingency table)
wilcox.test(x_1, x_2, alternative="greater", correct = F) # wilcoxon rank-sum test (one-sided)
wilcox.test(x_1, x_2, alternative="greater", correct = F, exact = F) # same test using the normal approximation, better suited to larger samples