Data manipulation
Basic analysis
Combination
Pipes
Statistical Programming in R
Data manipulation
Basic analysis
Combination
Pipes
library(MASS) # for the cats data library(dplyr) # data manipulation library(haven) # in/exporting data library(magrittr) # pipes
transform()
: changing and adding columnsdplyr::filter()
: row-wise selection (of cases)table()
: frequency tablesclass()
: object classlevels()
: levels of a factororder()
: data entries in increasing orderhaven::read_sav()
: import SPSS datacor()
: bivariate correlationsample()
: drawing a samplet.test()
: t-testhead(cats)
## Sex Bwt Hwt ## 1 F 2.0 7.0 ## 2 F 2.0 7.4 ## 3 F 2.0 9.5 ## 4 F 2.1 7.2 ## 5 F 2.1 7.3 ## 6 F 2.1 7.6
str(cats)
## 'data.frame': 144 obs. of 3 variables: ## $ Sex: Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ... ## $ Bwt: num 2 2 2 2.1 2.1 2.1 2.1 2.1 2.1 2.1 ... ## $ Hwt: num 7 7.4 9.5 7.2 7.3 7.6 8.1 8.2 8.3 8.5 ...
fem.cats <- cats[cats$Sex == "F", ] dim(fem.cats)
## [1] 47 3
head(fem.cats)
## Sex Bwt Hwt ## 1 F 2.0 7.0 ## 2 F 2.0 7.4 ## 3 F 2.0 9.5 ## 4 F 2.1 7.2 ## 5 F 2.1 7.3 ## 6 F 2.1 7.6
heavy.cats <- cats[cats$Bwt > 3, ] dim(heavy.cats)
## [1] 36 3
head(heavy.cats)
## Sex Bwt Hwt ## 109 M 3.1 9.9 ## 110 M 3.1 11.5 ## 111 M 3.1 12.1 ## 112 M 3.1 12.5 ## 113 M 3.1 13.0 ## 114 M 3.1 14.3
heavy.cats <- subset(cats, Bwt > 3) dim(heavy.cats)
## [1] 36 3
head(heavy.cats)
## Sex Bwt Hwt ## 109 M 3.1 9.9 ## 110 M 3.1 11.5 ## 111 M 3.1 12.1 ## 112 M 3.1 12.5 ## 113 M 3.1 13.0 ## 114 M 3.1 14.3
dplyr
filter(cats, Bwt > 2, Bwt < 2.2, Sex == "F")
## Sex Bwt Hwt ## 1 F 2.1 7.2 ## 2 F 2.1 7.3 ## 3 F 2.1 7.6 ## 4 F 2.1 8.1 ## 5 F 2.1 8.2 ## 6 F 2.1 8.3 ## 7 F 2.1 8.5 ## 8 F 2.1 8.7 ## 9 F 2.1 9.8
class(cats$Sex)
## [1] "factor"
levels(cats$Sex)
## [1] "F" "M"
levels(cats$Sex) <- c("Female", "Male") table(cats$Sex)
## ## Female Male ## 47 97
head(cats)
## Sex Bwt Hwt ## 1 Female 2.0 7.0 ## 2 Female 2.0 7.4 ## 3 Female 2.0 9.5 ## 4 Female 2.1 7.2 ## 5 Female 2.1 7.3 ## 6 Female 2.1 7.6
sorted.cats <- cats[order(cats$Bwt), ] head(sorted.cats)
## Sex Bwt Hwt ## 1 Female 2.0 7.0 ## 2 Female 2.0 7.4 ## 3 Female 2.0 9.5 ## 48 Male 2.0 6.5 ## 49 Male 2.0 6.5 ## 4 Female 2.1 7.2
cats.numbers <- cbind(Weight = cats$Bwt, Height = cats$Hwt) head(cats.numbers)
## Weight Height ## [1,] 2.0 7.0 ## [2,] 2.0 7.4 ## [3,] 2.0 9.5 ## [4,] 2.1 7.2 ## [5,] 2.1 7.3 ## [6,] 2.1 7.6
rbind(cats[1:3, ], cats[1:5, ])
## Sex Bwt Hwt ## 1 Female 2.0 7.0 ## 2 Female 2.0 7.4 ## 3 Female 2.0 9.5 ## 4 Female 2.0 7.0 ## 5 Female 2.0 7.4 ## 6 Female 2.0 9.5 ## 7 Female 2.1 7.2 ## 8 Female 2.1 7.3
cor(cats[, -1])
## Bwt Hwt ## Bwt 1.0000000 0.8041274 ## Hwt 0.8041274 1.0000000
With [, -1]
we exclude the first column
cor.test(cats$Bwt, cats$Hwt)
## ## Pearson's product-moment correlation ## ## data: cats$Bwt and cats$Hwt ## t = 16.119, df = 142, p-value < 2.2e-16 ## alternative hypothesis: true correlation is not equal to 0 ## 95 percent confidence interval: ## 0.7375682 0.8552122 ## sample estimates: ## cor ## 0.8041274
What do we conclude?
Makes sense, remember?
plot(cats$Bwt, cats$Hwt)
boys <- read_sav("boys.sav") %>% head()
It effectively replaces head(read_sav("boys.sav"))
.
Let's assume that we want to load data, change a variable, filter cases and select columns. Without a pipe, this would look like
boys <- read_sav("boys.sav") boys2 <- transform(boys, hgt = hgt / 100) boys3 <- filter(boys2, age > 15) boys4 <- subset(boys3, select = c(hgt, wgt, bmi))
With the pipe:
boys <- read_sav("boys.sav") %>% transform(hgt = hgt/100) %>% filter(age > 15) %>% subset(select = c(hgt, wgt, bmi))
Benefit: a single object in memory that is easy to interpret
Your code becomes more readable:
f(x)
becomes x %>% f()
rnorm(10) %>% mean()
## [1] -0.3881695
f(x, y)
becomes x %>% f(y)
boys %>% cor(use = "pairwise.complete.obs")
## hgt wgt bmi ## hgt 1.0000000 0.6100784 0.1758781 ## wgt 0.6100784 1.0000000 0.8841304 ## bmi 0.1758781 0.8841304 1.0000000
h(g(f(x)))
becomes x %>% f %>% g %>% h
boys %>% subset(select = wgt) %>% na.omit() %>% max()
## [1] 117.4
%>%
pipe%$%
pipe%T>%
pipe.
in a pipeset.seed(123) 1:5 %>% mean() %>% rnorm(10) %>% mean()
## [1] 10.25602
VS
set.seed(123) 1:5 %>% mean() %>% rnorm(n = 10, mean = ., ) %>% mean()
## [1] 3.074626
The .
can be used as a placeholder in the pipe.
set.seed(123) 1:26 %>% sample(3) %>% paste(., LETTERS[.])
## [1] "8 H" "20 T" "10 J"
library(MASS) # for cats data cats %$% t.test(Bwt ~ Sex)
## ## Welch Two Sample t-test ## ## data: Bwt by Sex ## t = -8.7095, df = 136.84, p-value = 8.831e-15 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## -0.6631268 -0.4177242 ## sample estimates: ## mean in group Female mean in group Male ## 2.359574 2.900000
is the same as
t.test(Hwt ~ Sex, data = cats)
cats.test <- cats %$% t.test(Bwt ~ Sex) cats.test
## ## Welch Two Sample t-test ## ## data: Bwt by Sex ## t = -8.7095, df = 136.84, p-value = 8.831e-15 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## -0.6631268 -0.4177242 ## sample estimates: ## mean in group Female mean in group Male ## 2.359574 2.900000
set.seed(123) x <- rnorm(5) x
## [1] -0.56047565 -0.23017749 1.55870831 0.07050839 0.12928774
x %<>% abs() x
## [1] 0.56047565 0.23017749 1.55870831 0.07050839 0.12928774
or
x <- x %>% abs() x
## [1] 0.56047565 0.23017749 1.55870831 0.07050839 0.12928774
cats %$% mean(Hwt)
## [1] 10.63056
cats.outl <- cats %>% filter(Hwt < mean(Hwt) + 3*sd(Hwt), Hwt > mean(Hwt) - 3*sd(Hwt)) cats %>% filter(Hwt > mean(Hwt) + 3*sd(Hwt))
## Sex Bwt Hwt ## 1 Male 3.9 20.5