library(dplyr) # Data manipulation library(magrittr) # Pipes library(ggplot2) # Plotting suite library(boot) # Bootstrapping functions
Statistical Programming with R
library(dplyr) # Data manipulation library(magrittr) # Pipes library(ggplot2) # Plotting suite library(boot) # Bootstrapping functions
set.seed(54321) height <- rnorm(3e6, mean = 180, sd = 7.5) shoesize <- 35 + 0.02*height + runif(3e6, -1, 1) city <- sample(c("Rnhem", "AlkmR", "LeeuwRden"), size = 3e6, prob = c(0.6, 0.3, 0.1), replace = TRUE) sumR <- data.frame(height, shoesize, city)
par(mfrow = c(1,3)) hist(height, breaks = 80, col = "light green", border = "transparent") hist(shoesize, breaks = 80, col = "light blue", border = "transparent") table(city) %>% barplot(col = "pink")
(In a real country we will not know these things)
We randomly sample 100 persons and "measure" their height, shoesize and city:
samp100 <- sample(3e6, size = 100) sdat100 <- sumR[samp100, ] head(sdat100)
## height shoesize city ## 2908895 177.8716 39.28683 Rnhem ## 236907 170.2528 38.97513 Rnhem ## 1230916 195.9043 38.27168 LeeuwRden ## 1937234 177.4607 38.39331 Rnhem ## 2782743 172.3269 38.89885 AlkmR ## 2551759 169.6376 39.02313 Rnhem
Let's use this sample to estimate quantities and infer things about our population!
We want to know the mean height of our population
The sample mean \(\bar{x}\) is an estimator for the population mean \(\mu\)
m <- mean(sdat100$height) m
## [1] 179.7258
This statistic has a sampling distribution: taking a different sample generates a different mean value
Thus we have uncertainty about our estimated mean
se <- sd(sdat100$height)/sqrt(100) se
## [1] 0.6673471
Using the standard error and the assumption of normality, we can create a confidence interval (CI):
(ci <- c(m - 1.96*se, m + 1.96*se))
## [1] 178.4178 181.0338
means <- numeric(10000) for (i in 1:10000) { means[i] <- mean(sumR$height[sample(3e6, size = 100)]) }
In the 50s and 60s, computers enabled another measure of uncertainty
jknf <- numeric(100) for (i in 1:100) jknf[i] <- mean(sdat100$height[-i])
Similar to LOOCV!
The Jackknife can be used to estimate the standard error:
\[ \begin{align} \hat{se}_{j} &= \sqrt{\frac{n-1}{n}\sum_{i=1}^n\left(\hat{\theta}_{(i)}-\overline{\hat{\theta}_{(\cdot)}}\right)^2} \end{align}\]
jkse <- sqrt(99/100*sum((jknf - mean(jknf))^2))
## Jackknife estimate: 0.667 ## True standarderror: 0.750
With this, we can once again create a sampling distribution and a CI based on normal theory.
Introduced by Efron in 1979 as computers became more powerful.
btsp <- numeric(1000) for (i in 1:1000) btsp[i] <- mean(sdat100$height[sample(100, replace = TRUE)])
We sample from our sample, pulling ourselves "up" into the space of population by our bootstraps
The standard error calculation does not need to be known
For some quantities, unreasonable sample sizes are needed before normality (e.g. product of two regression coefficients in mediation analysis)
log(mean(sdat100$shoesize))
## [1] 3.653499
boot
packagemystat <- function(dat, index) { median(dat$height[index])/mean(dat$shoesize[index]) } bootstat <- sdat100 %>% boot(mystat, R = 1000) # SE sd(bootstat$t)
## [1] 0.02899687
# 95% CI bootstat$t %>% quantile(c(0.025, 0.975))
## 2.5% 97.5% ## 4.596886 4.698511
boot
package# Distribution data.frame(mystat = bootstat$t) %>% ggplot(aes(x = mystat)) + geom_density(fill = "light seagreen") + xlim(4.5, 4.8) + theme_minimal()
boot
can be used for bootstrapping