Useful Functions
• cor(x, y, method="pearson") - correlation between x and y
- default method = pearson
- other methods: "spearman" and "kendall"
• aggregate(x ~ y, data=df, FUN=summary) - splits data into subsets, computes summary statistics for each, and returns the result in a convenient form
- x grouped by y
- x: var you want to summarise
- y: grouping var
- summary gives Min, Q1, Median, Mean, Q3, Max
- can also use FUN=sd, FUN=mean, FUN=Skew, etc.
Hypothesis Tests (L7)
# x, y: numeric samples from two independent groups.
# before, after: paired measurements on the same units.

# Independent two-sample t-test
t.test(x, y, var.equal = TRUE)   # pooled variance
t.test(x, y, var.equal = FALSE)  # Welch's (unequal variance)

# Paired t-test
t.test(before, after, paired = TRUE)

# Extract from result: store the object returned by t.test() first,
# then pull out the individual components.
t_out <- t.test(x, y)
t_out$statistic  # observed t statistic
t_out$p.value    # p-value
t_out$conf.int   # confidence interval

# Wilcoxon Rank-Sum (non-parametric, independent)
wilcox.test(x, y)

# Wilcoxon Signed-Rank (non-parametric, paired)
# exact = FALSE: do not compute an exact p-value
wilcox.test(before, after, paired = TRUE, exact = FALSE)

# Normality
shapiro.test(x)
qqnorm(x); qqline(x)
Skewness & Kurtosis (L7, DescTools)
library(DescTools)  # Skew() and Kurt() are used below
# Skewness and kurtosis of viscera within each gender group of abl.
# aggregate() forwards method = 1 to the summary function.
aggregate(viscera ~ gender, data = abl, FUN = Skew, method = 1)
aggregate(viscera ~ gender, data = abl, FUN = Kurt, method = 1)
# method=1 → method-of-moments estimator (matches lectures)
ANOVA (L8)
# Factor levels matter — set them explicitly BEFORE fitting. A model that
# has already been fitted does not pick up later changes to the levels, and
# with the default treatment contrasts the first level is the reference group.
# (`...` is a placeholder: list the remaining level names in the order wanted.)
heifers$type <- factor(heifers$type, levels = c("Control", "Alfacyp", ...))

# Fit model and get ANOVA table
heifers_lm <- lm(org ~ type, data = heifers)
anova(heifers_lm)    # SS breakdown + F-test
summary(heifers_lm)  # coefficient estimates + t-tests

# Residuals: histogram and normal Q-Q plot
r1 <- residuals(heifers_lm)
hist(r1)
qqnorm(r1)
qqline(r1)

# Tukey HSD (all pairwise, valid post-hoc)
TukeyHSD(aov(heifers_lm), ordered = TRUE)

# Kruskal-Wallis (non-parametric ANOVA)
kruskal.test(heifers$org, heifers$type)
# t-quantile for CI construction
qt(0.025, df, lower.tail=FALSE) # returns positive critical value
Linear Regression (L9)
# Each model gets its own name so that the calls further down (summary,
# predict, ...) run against the model they were written for. Reusing one
# name would leave only the last fit, and predict() on a Water-only data
# frame would then fail.
lm_out <- lm(Flow ~ Water, data = concrete)                   # simple
lm_multi <- lm(Flow ~ Water + Slag, data = concrete)          # multiple
lm_ind <- lm(registered ~ casual + workingday, data = bike2)  # indicator var
lm_int <- lm(registered ~ casual * workingday, data = bike2)  # interaction

summary(lm_out)  # coefficients, R², F-test
anova(lm_out)    # SS decomposition
confint(lm_out)  # 95% CI for all β's

# Predictions with CI bands
new_df <- data.frame(Water = seq(160, 240, by = 5))
conf_intervals <- predict(lm_out, newdata = new_df, interval = "confidence")
# Returns matrix with columns: fit, lwr, upr

# Residual diagnostics
r_s <- rstandard(lm_out)  # standardised residuals
fitted(lm_out)            # fitted values
influence.measures(lm_out) # leverage, Cook's distance etc.
Simulation (L10)
set.seed(2137)  # set seed for reproducibility

# Random variable generation (n = number of draws)
rnorm(n, mean = 0, sd = 1)
runif(n, min = 0, max = 1)
rgamma(n, shape = 2, rate = 3)  # or scale = 1 / rate
rpois(n, lambda = 1.3)
rbinom(n, size = 2, prob = 0.3)

# Replicate a function call n times
replicate(2000, generate_one_test())

# Apply with type safety (can be faster than sapply for known output type).
# vapply() stops with an error unless every result matches the template.
# numeric(1) accepts a single double and promotes integer/logical results;
# an integer template (1L) would reject a double such as a p-value.
vapply(seq_len(2000), function(i) generate_one_test(), numeric(1))

# Vectorised if-else
ifelse(X >= 11000, 11000, floor(X) + (11000 - floor(X))*(-0.25))
Bootstrap (L10, boot library)
library(boot)
# Statistic for boot(): 10% trimmed mean of one bootstrap resample.
#   d: the original data vector
#   i: bootstrap indices selecting the resample from d
stat_fn <- function(d, i) {
  resample <- d[i]
  mean(resample, trim = 0.1)
}
# Draw 1999 bootstrap replicates of stat_fn. stype = "i" makes boot() pass
# resampled indices as the statistic's second argument.
boot_out <- boot(data = data, statistic = stat_fn, R = 1999, stype = "i")
# Confidence intervals computed from the bootstrap replicates.
boot.ci(boot_out, type = c("perc", "bca"))
# perc: percentile interval
# bca: bias-corrected and accelerated (more accurate)
Full reference: R Glossary