R Glossary

Quick reference: R

QUICK REFERENCE CARD

Most Used Functions (Memorize!)

# Reading
read.table(), read.csv()
 
# Inspection  
head(), str(), summary(), dim(), names()
 
# Statistics
mean(), median(), sd(), min(), max(), summary()
 
# Subsetting
df[rows, cols], df$column, df[condition, ]
 
# Utilities
length(), nrow(), ncol(), is.na(), sum()
 
# Sequences
1:10, seq(), rep()
 
# Sorting
sort(), order()
 
# Data Structures
c(), matrix(), data.frame(), list()
 
# Tabulation
table(), aggregate()
 
# Plots
plot(), hist(), boxplot()
 
# Tests
chisq.test(), t.test(), lm()

DATA STRUCTURES

Vectors

# Creating vectors
c(1, 2, 3, 4)                    # combine function
2:5                              # sequence: 2, 3, 4, 5
seq(2, 10, by=2)                 # 2, 4, 6, 8, 10
seq(2, 10, length=5)             # 5 evenly spaced numbers
rep(2, 3)                        # repeat: 2, 2, 2
rep(c(1,2), 3)                   # 1, 2, 1, 2, 1, 2
rep(c(1,2), c(3,2))              # 1, 1, 1, 2, 2

Matrices

# Create matrix
matrix(v, nrow=2, ncol=3)              # fills by COLUMN (default)
matrix(v, nrow=2, ncol=3, byrow=TRUE)  # fills by ROW
 
# Matrix operations
rbind(row1, row2)              # stack rows vertically
cbind(col1, col2)              # stack columns horizontally
t(matrix)                      # transpose
solve(matrix)                  # inverse of square matrix
matrixA %*% matrixB            # matrix multiplication

Dataframes

# Create dataframe
df <- data.frame(col1 = c(1,2,3), col2 = c("a","b","c"))
as.data.frame(matrix)          # convert matrix to dataframe
 
# Add columns
df$new_col <- values           # add by name

Lists

# Create list
my_list <- list(A = c(1,3,5), B = "text", C = df)
 
# Access elements
my_list$A                      # by name with $
my_list[[1]]                   # by position
my_list[["A"]]                 # by name with [[]]

READING & WRITING DATA

Reading Files

# Text files
read.table("file.txt")                          # space/tab separated
read.table("file.txt", header=TRUE)             # with header row
read.table("file.txt", header=FALSE, col.names=names_vec)
 
# CSV files
read.csv("file.csv")                            # has header
read.csv("file.csv", header=FALSE)              # no header
 
# JSON files (requires jsonlite package)
library(jsonlite)
data <- fromJSON("file.json")                   # returns list

Important: Always open file in text editor FIRST to check:

Header row presence
Separator type (space, tab, comma, semicolon)
Trailing empty rows
Expected row count

Writing Files

write.csv(df, "output.csv", row.names=FALSE)    # save as CSV
write.table(df, "output.txt")                   # save as text
saveRDS(df, "data.rds")                         # save R object
df <- readRDS("data.rds")                       # load R object

DATA INSPECTION

Basic Info

head(df)                       # first 6 rows
head(df, 10)                   # first 10 rows
tail(df)                       # last 6 rows
str(df)                        # structure of object
summary(df)                    # summary statistics
nrow(df)                       # number of rows (preferred)
NROW(df)                       # also works, more general
ncol(df)                       # number of columns (preferred)
NCOL(df)                       # also works
dim(df)                        # dimensions: c(rows, cols)
names(df)                      # column names

SUBSETTING & INDEXING

Using [] Brackets

# Basic indexing
df[rows, columns]
df[1:3, ]                      # rows 1-3, all columns
df[, 2:4]                      # all rows, columns 2-4
df[1:3, 2:4]                   # rows 1-3, columns 2-4
df[c(1,3,5), ]                 # specific rows: 1, 3, 5
 
# Negative indexing (DROPS elements)
x[-1]                          # drops first element
x[-c(1,3)]                     # drops elements 1 and 3

Using $ (Column Access)

df$column_name                 # returns vector
df$age                         # get 'age' column

Logical Subsetting

# Single condition
df[df$Gender == "M", ]                    # males only
df[df$Age > 25, ]                         # age > 25
df[df$Score >= 80, ]                      # score >= 80
 
# Multiple conditions
df[df$Gender == "M" & df$Age > 25, ]      # AND: male AND over 25
df[df$Gender == "M" | df$Grade == "A", ]  # OR: male OR grade A
 
# Operators
# ==  equal to
# !=  not equal to
# >   greater than
# <   less than
# >=  greater than or equal
# <=  less than or equal
# &   AND (vectorized)
# |   OR (vectorized)
# &&  AND (non-vectorized, only first element)
# ||  OR (non-vectorized, only first element)

SUMMARY STATISTICS

Single Variable

mean(x)                        # average
median(x)                      # median
sd(x)                          # standard deviation
var(x)                         # variance
min(x)                         # minimum
max(x)                         # maximum
range(x)                       # c(min, max)
sum(x)                         # sum
length(x)                      # number of elements
summary(x)                     # five-number summary + mean
quantile(x, probs)             # quantiles (default: 0, 0.25, 0.5, 0.75, 1)
IQR(x)                         # interquartile range

Grouped Statistics

# aggregate() - THE WORKHORSE for grouped summaries
aggregate(x ~ y, data=df, FUN=summary)    # x grouped by y
aggregate(score ~ gender, data=df, FUN=mean)
aggregate(cbind(x, y) ~ group, data=df, FUN=mean)  # multiple variables
 
# Formula syntax: outcome ~ predictor
# Can use custom functions
# FUN must return same-length output for each group

Apply Family

Function	Input	Output	Use when
`apply`	matrix/array	vector or matrix	row/col summaries of a matrix
`lapply`	list/vector	list (always)	transform each element, keep as list
`sapply`	list/vector	vector or matrix	same as lapply, simplified result
`vapply`	list/vector	typed vector	same as sapply, type-safe & faster
`tapply`	vector + factor	named array	group-wise summaries

# apply() — over rows (MARGIN=1) or columns (MARGIN=2) of a matrix
apply(m, MARGIN=1, FUN=mean)              # row means
apply(m, MARGIN=2, FUN=sd)               # column std deviations
apply(m, MARGIN=1, FUN=function(x) mean(x, trim=0.1))
 
# lapply() — returns a list, always
lapply(1:5, function(i) i^2)             # list(1, 4, 9, 16, 25)
lapply(df_list, summary)                 # summary of each data frame
 
# sapply() — like lapply but simplifies to vector/matrix if possible
sapply(1:5, function(i) i^2)             # c(1, 4, 9, 16, 25)
sapply(groups, mean)                     # named vector of group means
 
# vapply() — like sapply but type-safe; specify expected output type
vapply(1:5, function(i) i^2, numeric(1)) # must return a numeric scalar each time
vapply(1:2000, function(x) run_test(), 1L)  # 1L = integer scalar
 
# tapply() — apply FUN to subsets of x defined by a grouping factor
tapply(df$score, df$gender, mean)        # mean score by gender
tapply(df$score, df$gender, sd)          # SD by group
tapply(df$value, df$group, function(x) mean(Winsorize(x)))

Correlation

cor(x, y)                               # correlation (default: Pearson)
cor(x, y, method="pearson")             # Pearson correlation
cor(x, y, method="spearman")            # Spearman rank correlation  
cor(x, y, method="kendall")             # Kendall's tau
cor(df[, c("col1", "col2", "col3")])   # correlation matrix

ROBUST STATISTICS

# Trimmed mean — discard extreme proportions before averaging
mean(x, trim=0.1)                      # 10% trimmed: drops lowest/highest 10%
 
# Median absolute deviation (MAD) — robust spread measure
mad(x)                                 # median(|x - median(x)|) * 1.4826
# The 1.4826 scale factor makes it consistent with SD for normal data
 
# Winsorize — cap extreme values at given quantiles (requires DescTools)
library(DescTools)
Winsorize(x, quantile=c(0.05, 0.95))   # replace values outside [5th, 95th] pctile
# Winsorize returns modified vector; combine with tapply/aggregate for groups
aggregate(score ~ group, data=df,
          FUN=function(x) mean(Winsorize(x, quantile=c(0.05, 0.95))))

MISSING VALUES

is.na(x)                       # TRUE/FALSE for each element
sum(is.na(x))                  # count missing values
na.omit(df)                    # remove rows with ANY missing
complete.cases(df)             # logical vector: TRUE if row complete

SORTING & ORDERING

sort(x)                                  # sort vector ascending
sort(x, decreasing=TRUE)                 # sort descending
order(x)                                 # get indices for sorting
 
# Sort dataframe by column
df[order(df$Age), ]                      # ascending
df[order(df$Age, decreasing=TRUE), ]     # descending
df[order(df$Age, df$Score), ]            # sort by Age, then Score

USEFUL UTILITY FUNCTIONS

Finding & Matching

which(condition)               # returns indices where TRUE
which.max(x)                   # index of maximum (FIRST if ties)
which.min(x)                   # index of minimum (FIRST if ties)
which(x == max(x))             # ALL indices with max value
match(x, table)                # returns positions of first matches
# match(c("A","B","C"), c("X","A","Y","C")) returns c(2, NA, 4)

Tabulation

table(x)                       # frequency table
table(x, y)                    # cross-tabulation (contingency table)
xtabs(~ x + y, data=df)        # alternative cross-tab using formula

Date Handling

as.Date("2020-01-15")                        # parse ISO date string to Date
as.Date("15/01/2020", format="%d/%m/%Y")     # parse with custom format
Sys.Date()                                   # today's date
as.numeric(date2 - date1)                    # difference in days

Data Reshaping

# stack() — reshape wide data to long format (selected columns → values + ind)
stack(df, select=c(col1, col2))    # returns data.frame with $values and $ind
stack(df[, c("before", "after")]) # use with wilcox.test/t.test on stacked data
# $values: all numeric values concatenated
# $ind   : factor indicating which original column each value came from

Cutting (Binning)

cut(x, breaks)                             # bin continuous variable
cut(score, breaks=c(-Inf, 10, 12, 15, 18, 20), 
    labels=c("F","D","C","B","A"))        # create grade categories
cut(x, breaks=5)                           # 5 equal-width bins
cut(x, breaks=c(0,10,20), include.lowest=TRUE)  # close left endpoint

Combinations

combn(n, k)                    # all combinations of k items from n
# combn(5, 2) returns all pairs from 1:5
# combn(c("A","B","C"), 2) returns all pairs from letters

Set Operations

union(x, y)                    # elements in x OR y
intersect(x, y)                # elements in BOTH x and y
setdiff(x, y)                  # elements in x but NOT in y
%in%                           # element-wise membership test
# c(1,2,3) %in% c(2,4,6) returns c(FALSE, TRUE, FALSE)

STATISTICAL FUNCTIONS

Distributions

Pattern: Every distribution has 4 functions: d, p, q, r

# d = density/PMF/PDF
# p = cumulative probability (CDF)
# q = quantile function (inverse CDF)
# r = random generation
 
# Examples with normal distribution:
dnorm(x, mean=0, sd=1)         # density at x
pnorm(q, mean=0, sd=1)         # P(X <= q)
qnorm(p, mean=0, sd=1)         # value with cumulative prob p
rnorm(n, mean=0, sd=1)         # generate n random values
 
# Other distributions: binom, pois, unif, exp, chisq, t, f, etc.
# dhyper(w, m, n, k) - hypergeometric
## w: successes drawn
## m: no. of success states in popn 
## n: no. of failure states in popn 
## k: no. of draws 
 
# dpois(k, lambda) - Poisson

for a table:
(a, b)
(c, d)

dhyper(a, a+c, b+d, a+b)

Hypothesis Tests

# Chi-square test
chisq.test(table)              # returns list with test results
chisq_out <- chisq.test(table)
chisq_out$statistic            # test statistic
chisq_out$p.value              # p-value
chisq_out$expected             # expected frequencies
 
# Fisher's exact test (for 2x2 tables with small counts)
fisher.test(table)
 
# t-test
t.test(x, mu=0)                        # one-sample
t.test(x, y)                           # two-sample (Welch by default)
t.test(x, y, var.equal=TRUE)           # pooled variance (equal var assumed)
t.test(before, after, paired=TRUE)     # paired
t.test(x, y, conf.level=0.99)         # 99% CI instead of default 95%
 
# Extract t-test results
t_out <- t.test(x, y, var.equal=TRUE)
t_out$statistic                        # test statistic
t_out$p.value                          # p-value
t_out$conf.int                         # confidence interval
 
# Normality tests
shapiro.test(x)                        # Shapiro-Wilk, H0: data is Normal
ks.test(x, "pnorm")                    # Kolmogorov-Smirnov
 
# Q-Q plots
qqnorm(x)                              # theoretical normal quantiles vs data
qqline(x)                              # add reference line through quartiles
 
# Non-parametric two-sample tests
wilcox.test(x, y)                          # Wilcoxon Rank-Sum (independent)
wilcox.test(x, y, alternative="greater")  # one-sided: H₁: x > y
wilcox.test(before, after,
            paired=TRUE, exact=FALSE)      # Wilcoxon Signed-Rank (paired)
# exact=FALSE: use normal approximation (required when ties present)
# alternative: "two.sided" (default), "greater", "less"
 
# Linear models
lm(y ~ x, data=df)                # simple linear regression
lm(y ~ x1 + x2, data=df)          # multiple regression
lm(y ~ x1 + x2 + x3, data=df)     # add indicator variable for categorical
lm(y ~ x1 * x2, data=df)          # interaction (includes main effects too)
lm_out <- lm(y ~ x, data=df)
 
# Accessing lm results
coef(lm_out)                       # named vector of β estimates
lm_out$coefficients                # same
summary(lm_out)                    # coefficients + t-tests + R² + F-test
anova(lm_out)                      # SS decomposition table (SSReg, SSRes, F)
confint(lm_out)                    # 95% CI for all parameters
fitted(lm_out)                     # ŷ fitted values
residuals(lm_out)                  # raw residuals r_i = y_i - ŷ_i
rstandard(lm_out)                  # standardised residuals (use for diagnostics)
influence.measures(lm_out)         # leverage, Cook's D, DFFITS etc.
 
# Extracting scalars from summary()
s <- summary(lm_out)
s$r.squared                        # R²
s$adj.r.squared                    # adjusted R²
s$sigma                            # residual standard error σ̂
s$coefficients["x", "Pr(>|t|)"]   # p-value for specific coefficient

ANOVA (L8)

# One-way ANOVA via lm
heifers_lm <- lm(org ~ type, data=heifers)
anova(heifers_lm)                          # F-test table
summary(heifers_lm)                        # coefficient estimates
 
# Ordering factor levels (controls which group is reference)
heifers$type <- factor(heifers$type,
                       levels=c("Control", "Alfacyp", "Enro", ...))
 
# Residual checks
r1 <- residuals(heifers_lm)
hist(r1)
qqnorm(r1); qqline(r1)
 
# Confidence interval for contrast / pairwise comparison (manual)
qt(0.025, df=n-k, lower.tail=FALSE)        # critical t-value (positive)
 
# Tukey HSD — all pairwise CIs, valid post-hoc
TukeyHSD(aov(heifers_lm), ordered=TRUE)
 
# Kruskal-Wallis (non-parametric, requires n_i >= 5 per group)
kruskal.test(heifers$org, heifers$type)

Prediction from Linear Model (L9)

# Generate new data for prediction
new_df <- data.frame(Water = seq(160, 240, by=5))
 
# Confidence interval for mean response E(Y|X)
conf_intervals <- predict(lm_out, new_df, interval="conf")
# Returns matrix: columns = fit, lwr, upr
 
# Prediction interval for individual new observation
pred_intervals <- predict(lm_out, new_df, interval="pred")
 
# Plot fitted line with CI bands
plot(x, y)
abline(lm_out, col="red")
lines(new_df$Water, conf_intervals[, "lwr"], col="red", lty=2)
lines(new_df$Water, conf_intervals[, "upr"], col="red", lty=2)

LOOPS & CONTROL FLOW

For Loops

# Basic structure
for(variable in sequence) {
  # do something
}
 
# Examples
for(i in 1:10) {
  print(i^2)
}
 
for(name in c("Alice", "Bob", "Charlie")) {
  print(paste("Hello", name))
}
 
# Loop over dataframe groups
for(group_name, group_df in groupby_output) {
  # process each group
}

While Loops

# Basic structure
while(condition) {
  # do something
  # MUST update condition!
}
 
# Example
x <- 0
while(x < 10) {
  print(x)
  x <- x + 1
}

If-Else

if(condition) {
  # do something
} else if(another_condition) {
  # do something else
} else {
  # default action
}

USER-DEFINED FUNCTIONS

# Basic structure
function_name <- function(arg1, arg2, default_arg=10) {
  # calculations
  result           # last line is returned automatically
}
 
# Or use explicit return
function_name <- function(arg1, arg2) {
  result <- arg1 + arg2
  return(result)
}
 
# Call function
function_name(5, 3)

PLOTTING (BASE R)

Basic Plots

plot(x, y)                            # scatter plot
plot(x, y, col="red", pch=19, cex=1.5)  # with options
hist(x)                               # histogram
hist(x, breaks=20)                    # more bins
boxplot(x)                            # box plot
boxplot(y ~ group, data=df)           # box plot by group
 
# Plot returns object with stats
box_out <- boxplot(x)
box_out$out                           # outlier values
box_out$stats                         # five-number summary

Plot Customization

# Common arguments
col                             # color (e.g., "red", "#FF0000")
pch                             # plotting character (1-25)
cex                             # character expansion (size multiplier)
lty                             # line type (1=solid, 2=dashed, etc.)
lwd                             # line width
xlab, ylab                      # axis labels
main                            # title
xlim, ylim                      # axis limits c(min, max)
 
# Adding elements
abline(a, b)                    # line with intercept a, slope b
abline(h=5)                     # horizontal line at y=5
abline(v=3)                     # vertical line at x=3
abline(lm_model)                # add regression line
points(x, y)                    # add points
lines(x, y)                     # add lines
text(x, y, labels)              # add text
legend()                        # add legend
 
# Multiple plots in one figure
par(mfrow=c(2, 2))              # 2x2 grid of plots
par(mfrow=c(1, 1))              # reset to single plot
op <- par()                     # save old parameters
par(op)                         # restore old parameters
 
# Colors
colors()                        # 657 named colors
rgb(255, 0, 0, 64, maxColorValue=255)  # custom with transparency
# alpha controls transparency (0=transparent, 255=opaque)

Advanced Plotting Techniques

# Jittering (add random noise to avoid overplotting)
plot(x, y + runif(length(y), -0.2, 0.2))
 
# Transparency for overplotting
red_transparent <- rgb(255, 0, 0, alpha=64, maxColorValue=255)
plot(x, y, col=red_transparent, pch=20, cex=1.6)

PLOTTING (LATTICE)

library(lattice)
 
# Histograms
histogram(~ variable, data=df)
histogram(~ variable | factor, data=df)  # conditioned on factor
 
# Density plots
densityplot(~ variable, data=df)
densityplot(~ variable, data=df, bw=0.5)  # bandwidth control
densityplot(~ variable | factor, data=df)
 
# Box plots
bwplot(~ variable, data=df)
bwplot(variable ~ factor, data=df)        # by group
 
# Scatter plots
xyplot(y ~ x, data=df)
xyplot(y ~ x | factor, data=df)           # panels by factor
xyplot(y ~ x, groups=factor, data=df)     # overlay by group
xyplot(y ~ x, data=df, type="l")          # line plot
 
# Dot plots
dotplot(value ~ group, data=df)           # dot plot by group (good for means/CIs)
 
# Bar charts
barchart(table, horizontal=FALSE)
 
# Common arguments
layout = c(3, 1)                # 3 columns, 1 row
main = "title"                  # plot title
xlab, ylab                      # axis labels
col                             # color
pch                             # plotting character

PLOTTING (GGPLOT2) - If Covered

library(ggplot2)
 
# Basic template
ggplot(data=df, aes(x=var1, y=var2)) +
  geom_point() +
  labs(title="Title", x="X label", y="Y label")
 
# Common geoms
geom_point()                    # scatter plot
geom_line()                     # line plot
geom_histogram()                # histogram
geom_boxplot()                  # box plot
geom_bar()                      # bar chart
geom_density()                  # density plot

CATEGORICAL DATA ANALYSIS PATTERNS

Computing Expected Counts Under Independence

# Given contingency table
tab <- table(df$rater_A, df$rater_B)
n <- sum(tab)                          # total count
 
# Marginal probabilities
row_margins <- rowSums(tab) / n        # row marginals
col_margins <- colSums(tab) / n        # column marginals
 
# Expected counts using outer product
expected <- n * outer(row_margins, col_margins)
 
# Pearson residuals
residuals <- (tab - expected) / sqrt(expected)

Agreement Measures

Cohen’s Kappa:

# Measure of inter-rater agreement
# κ = (observed - expected) / (1 - expected)
 
# Joint probabilities
pi_ij <- tab / n
 
# Observed agreement (diagonal sum)
observed <- sum(diag(pi_ij))
 
# Expected agreement under independence  
expected <- sum(row_margins * col_margins)
 
# Cohen's Kappa
kappa <- (observed - expected) / (1 - expected)

Weighted Kappa (for ordinal categories):

# Create weight matrix: w_ij = 1 - |i-j|/(I-1)
I <- 4  # number of categories
w <- matrix(0, nrow=I, ncol=I)
 
for(i in 1:I) {
  for(j in 1:I) {
    w[i, j] <- 1 - abs(i - j) / (I - 1)
  }
}
 
# Weighted observed agreement
weighted_obs <- sum(w * pi_ij)
 
# Weighted expected agreement
weighted_exp <- sum(w * outer(row_margins, col_margins))
 
# Weighted Kappa
kappa_w <- (weighted_obs - weighted_exp) / (1 - weighted_exp)

Key functions:

outer(x, y) - outer product for expected counts
diag(matrix) - extract diagonal (for agreement)
sum(diag(matrix)) - trace (sum of diagonal)

CATEGORICAL DATA ANALYSIS

Mosaic Plots

# Mosaic plot (base R)
mosaicplot(table)
mosaicplot(table, shade=TRUE)              # color by residuals
mosaicplot(table, shade=TRUE, 
           xlab="X", ylab="Y")             # with labels
 
# Interpretation:
# - Area proportional to cell count
# - Blue = more than expected under independence
# - Red = less than expected
# - Solid = positive residual, dashed = negative

Association Measures (Categorical)

# Requires DescTools package
library(DescTools)
 
# Get all association measures
Desc(table, plotit=FALSE)
assoc_out <- Desc(table, plotit=FALSE)[[1]]$assocs
 
# Specific measures:
# - Kendall's tau-b: for ordinal variables
# - Goodman-Kruskal gamma: for ordinal variables
# - Cramér's V: for nominal variables
# - Odds Ratio: for 2x2 tables

Skewness & Kurtosis (DescTools, L7)

library(DescTools)
 
# Per-group skewness and kurtosis via aggregate
aggregate(viscera ~ gender, data=abl, Skew, method=1)
aggregate(viscera ~ gender, data=abl, Kurt, method=1)
 
# method=1: method-of-moments estimator (matches lecture formulas)
# Skew > 0: right-skewed; < 0: left-skewed
# Kurt > 0: fat tails (leptokurtic); < 0: thin tails (platykurtic)
# Note: Kurt() here is EXCESS kurtosis (Normal = 0)

Odds Ratio & Relative Risk

# Odds ratio for 2x2 table:
# OR = (a*d) / (b*c) where table is:
#      col1  col2
# row1   a     b
# row2   c     d
 
# Relative risk:
# RR = (a/(a+b)) / (c/(c+d))
# RR ≈ OR when probabilities are small

SIMULATION (L10)

Random Number Generation

set.seed(2137)    # set seed — must call BEFORE generating; resets to same stream
 
rnorm(n, mean=0, sd=1)           # Normal
runif(n, min=0, max=1)           # Uniform
rgamma(n, shape=2, rate=3)       # Gamma (rate = 1/scale)
rpois(n, lambda=1.3)             # Poisson
rbinom(n, size=2, prob=0.3)      # Binomial
rexp(n, rate=1)                  # Exponential

Simulation Loop Patterns

# Basic simulation loop
set.seed(2139)
output_vec <- rep(0, length=100)     # pre-allocate
for(i in 1:length(output_vec)) {
  X <- rpois(20, 0.5)
  # ... compute statistic
  if(condition) output_vec[i] <- 1
}
mean(output_vec)                     # estimate of probability
 
# replicate() — cleaner alternative for repeating a function call
replicate(2000, some_function())

ifelse and floor

ifelse(condition, value_if_true, value_if_false)   # vectorised if-else
# e.g. profit function:
hX <- ifelse(X >= 11000, 11000,
             floor(X) + (11000 - floor(X)) * (-0.25))
 
floor(x)     # round DOWN to nearest integer
ceiling(x)   # round UP
round(x, n)  # round to n decimal places

Bootstrap (boot library)

library(MASS)
library(boot)
 
# Define statistic function — must accept (data, indices)
stat_fn <- function(d, i) {
  mean(d[i], trim=0.1)     # i contains bootstrap sample indices
}
 
# Run bootstrap
boot_out <- boot(chem, stat_fn, R=1999, stype="i")
 
# Compute confidence intervals
boot.ci(boot.out=boot_out, type=c("perc", "bca"))
# perc: basic percentile interval
# bca:  bias-corrected and accelerated (more accurate, preferred)

Monte Carlo Integration

# To estimate ∫ h(x) f(x) dx = E[h(X)]:
# 1. Generate X ~ f
# 2. Compute mean(h(X))
 
set.seed(2138)
X <- runif(50000, 0, 1)     # X ~ Uniform(0,1)
hX <- exp(2*X)              # h(X)
mean(hX)                    # Monte Carlo estimate
 
# Confidence interval for the estimate (from CLT)
s <- sd(hX)
q1 <- qnorm(0.95)           # z-quantile
CI <- c(mean(hX) - q1*s/sqrt(n), mean(hX) + q1*s/sqrt(n))

SPECIAL TECHNIQUES

Poisson-ness Plot

# Check if data follows Poisson distribution
# 1. Get counts of counts
Xk <- table(data)
k <- as.integer(names(Xk))
Xk <- as.vector(Xk)
 
# 2. Compute phi
N <- length(data)
phi <- lfactorial(k) + log(Xk/N)
 
# 3. Plot phi vs k (should be linear if Poisson)
plot(k, phi)
 
# 4. Estimate lambda from slope
lm_out <- lm(phi ~ k)
slope <- coef(lm_out)[2]
lambda_hat <- exp(slope)

Theil-Sen Estimator (Robust Slope)

# Compute slope from ALL pairwise combinations
# More robust to outliers than least squares
 
all_pairs <- combn(n, 2)
slopes <- numeric(ncol(all_pairs))
 
for(i in 1:ncol(all_pairs)) {
  idx <- all_pairs[, i]
  slopes[i] <- (y[idx[2]] - y[idx[1]]) / (x[idx[2]] - x[idx[1]])
}
 
robust_slope <- median(slopes)

STRING OPERATIONS

paste("Hello", "World")              # concatenate with space
paste0("Hello", "World")             # concatenate without space
sprintf("%.2f", 3.14159)             # formatted printing
substr("Hello", 1, 3)                # substring: "Hel"
nchar("Hello")                       # string length: 5
toupper("hello")                     # "HELLO"
tolower("HELLO")                     # "hello"
strsplit("a,b,c", ",")               # split string: list("a","b","c")
grep("pattern", vector)              # find pattern (returns indices)
grepl("pattern", vector)             # find pattern (returns T/F)
sub("old", "new", string)            # replace first occurrence
gsub("old", "new", string)           # replace all occurrences

OUTPUT & PRINTING

print(x)                             # print object
cat("text", x, "\n")                 # concatenate and print (no quotes)
cat("Value:", x, "\n", sep="")       # control separator
 
# format() — convert number to string with controlled precision
format(3.14159, digits=3)            # "3.14" (3 significant digits)
format(0.0001234, digits=3)          # "0.000123"
format(x, nsmall=2)                  # ensure at least 2 decimal places
 
# Formatted printing
sprintf("%.2f", 3.14159)             # "3.14" (2 decimals)
sprintf("%d", 42)                    # "42" (integer)
sprintf("%s", "text")                # "text" (string)
 
# Redirect output to file
sink("output.txt")                   # start saving to file
cat("Results:\n")
print(summary(data))
sink()                               # stop, close file

PACKAGES

# Install (once)
install.packages("package_name")
install.packages(c("pkg1", "pkg2"))  # multiple packages
 
# Load (every session)
library(package_name)
require(package_name)                # alternative, returns T/F
 
# Check if installed
"package_name" %in% installed.packages()
 
# Get help
help(package="package_name")

Common Packages for This Course:

MASS - datasets and robust regression
lattice - trellis graphics (conditioned plots, histogram, xyplot, etc.)
jsonlite - JSON read/write
DescTools - Skew(), Kurt(), Desc() for categorical association measures
psych - correlation plots; corPlot(r, cex=0.8, cex.axis=0.8, show.legend=TRUE) where r is a correlation matrix
boot - bootstrap resampling (boot(), boot.ci())

GETTING HELP

?function_name                       # help for function
??search_term                        # search all help
help(function_name)                  # same as ?
help.search("search_term")           # same as ??
example(function_name)               # run examples from help
args(function_name)                  # show function arguments
apropos("text")                      # find functions with "text" in name

Tip: Examples at bottom of help pages are GUARANTEED to work!

IMPORTANT CONCEPTS

Recycling Rule

Shorter vector repeats to match longer vector

c(1, 2, 3, 4) + c(10, 20)
# Returns: 11, 22, 13, 24
# Explanation: c(10,20) recycled to c(10,20,10,20)
 
# Used in matrix operations
matrix / vector  # vector recycles to match each column/row

Indexing Differences from Python

Indexing starts at 1 (not 0)
Negative indexing DROPS elements (not “from end”)

x <- c(10, 20, 30, 40, 50)
x[1]           # 10 (first element)
x[-1]          # 20, 30, 40, 50 (drops first)
x[-c(1,3)]     # 20, 40, 50 (drops 1st and 3rd)

Formula Syntax

Powerful notation used across many functions

# Basic: outcome ~ predictor
aggregate(score ~ gender, data=df, FUN=mean)
lm(y ~ x, data=df)
boxplot(score ~ group, data=df)
 
# Multiple predictors
lm(y ~ x1 + x2, data=df)
 
# Interactions
lm(y ~ x1 * x2, data=df)        # includes main effects + interaction
lm(y ~ x1 : x2, data=df)        # interaction only
 
# Conditioning in lattice
xyplot(y ~ x | factor, data=df)  # separate panels by factor

DEBUGGING TIPS

# Insert breakpoint
browser()                      # stop execution, enter debug mode
 
# Debug mode commands
# n - next line
# c - continue (exit debug mode)
# Q - quit
# help - show debug commands
# where - show call stack
 
# View objects
str(object)                    # structure
class(object)                  # object type
typeof(object)                 # storage mode

General Debugging Strategy:

Print intermediate values to understand structure
Test functions with small, known datasets first
Read error messages carefully - last line tells you where
Check for common mistakes (see below)

COMMON MISTAKES TO AVOID

❌ Forgetting c()

# WRONG
x <- 1, 2, 3
# CORRECT
x <- c(1, 2, 3)

❌ Using = instead of == in logical tests

# WRONG
df[df$Gender = "M", ]
# CORRECT  
df[df$Gender == "M", ]

❌ Forgetting comma in df[rows, cols]

# WRONG (treats as list indexing)
df[1:3]
# CORRECT
df[1:3, ]

❌ Using && instead of & in vectors

# WRONG (only compares first elements)
df[df$Age > 25 && df$Gender == "M", ]
# CORRECT (vectorized)
df[df$Age > 25 & df$Gender == "M", ]

❌ Accessing non-existent columns

# If column has spaces
df$col name      # WRONG
df$"col name"    # CORRECT
df[, "col name"] # ALSO CORRECT

BEST PRACTICES

✅ Use meaningful variable names

student_scores  # good
x               # bad (unless in math context)

✅ Check data after reading

data <- read.csv("file.csv")
head(data)      # check first few rows
str(data)       # check structure
summary(data)   # check for oddities

✅ Handle missing values explicitly

mean(x, na.rm=TRUE)           # remove NA before calculation
sum(is.na(x))                 # count missing

✅ Factor variables when appropriate

df$category <- factor(df$category, 
                      levels=c("low","medium","high"),
                      ordered=TRUE)

✅ Check assumptions before tests

# Chi-square: all expected counts >= 5
chisq_out$expected
 
# Normality: Q-Q plot, Shapiro-Wilk
qqnorm(x)
qqline(x)
shapiro.test(x)

NOTES FROM LECTURES

Instructor Quotes:

“Always open the data file in text editor before reading”
“The roof is not going to fall down if the code doesn’t work”
“If you’re not sure, just experiment with it”
“The examples at bottom of help pages - guaranteed to work”
“Try to use the recycling rule to your advantage”
“Converting column to factor before summary() gives frequency table instead of five-number summary”
“Small group sizes (n=3) make estimates unreliable — always check counts with table()”
“Never automatically drop outliers; investigate WHY; run analysis twice (with/without) and compare”

kienans garden *

Explorer