- Tools for data exploration and transformation
- Intuitive to write and easy to read
- Super-fast on data frames
Everything from the tutorial from Kevin Markham
And Hadley Wikham's dplyr resources without which nothing exists
if (packageVersion("devtools") < 1.6) {
install.packages("devtools")
}
devtools::install_github("hadley/lazyeval")
devtools::install_github("hadley/dplyr")
install.packages(c("hflights", "Lahman"))
suppressMessages(library(dplyr))
library(hflights)
hflights: Dataset on commercial domestic flights that departed Houston (IAH and HOU) in 2011.
# explore data
data(hflights)
head(hflights,3)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
Wraps a data frame that prints nicely
flights <- tbl_df(hflights)
head(flights,3) # Can also use print(flights,3) instead
## Source: local data frame [3 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# convert to a normal data frame to see all of the columns
head(data.frame(flights),3)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
filter
: for subsetting variablesselect
: for subsetting rowsarrange
: for re-ordering rowsmutate
: for adding new columnssummarise
or summarize
: for reducing each group to a smaller number of summary statistics # base R: flights[flights$Month==1 & flights$DayofMonth==1, ]
filter(flights, Month==1 & DayofMonth==1)
## Source: local data frame [552 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 1 6 728 840 AA 460
## 3 2011 1 1 6 1631 1736 AA 1121
## 4 2011 1 1 6 1756 2112 AA 1294
## 5 2011 1 1 6 1012 1347 AA 1700
## 6 2011 1 1 6 1211 1325 AA 1820
## 7 2011 1 1 6 557 906 AA 1994
## 8 2011 1 1 6 1824 2106 AS 731
## 9 2011 1 1 6 654 1124 B6 620
## 10 2011 1 1 6 1639 2110 B6 622
## .. ... ... ... ... ... ... ... ...
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# same as filter(flights, Month==1, DayofMonth==1)
head(filter(flights, UniqueCarrier=="AA" | UniqueCarrier=="UA"),2)
## Source: local data frame [2 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
tail(filter(flights, UniqueCarrier=="AA" | UniqueCarrier=="UA"),2)
## Source: local data frame [2 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 12 30 5 1300 1436 UA 971
## 2 2011 12 26 1 607 728 UA 978
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
filter(flights, UniqueCarrier %in% c("AA", "UA"))
## Source: local data frame [5,316 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## 7 2011 1 7 5 1359 1509 AA 428
## 8 2011 1 8 6 1355 1454 AA 428
## 9 2011 1 9 7 1443 1554 AA 428
## 10 2011 1 10 1 1443 1553 AA 428
## .. ... ... ... ... ... ... ... ...
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# base R approach to select DepTime, ArrTime, and FlightNum columns
head(flights[, c("DepTime", "ArrTime", "FlightNum")])
## Source: local data frame [6 x 3]
##
## DepTime ArrTime FlightNum
## 1 1400 1500 428
## 2 1401 1501 428
## 3 1352 1502 428
## 4 1403 1513 428
## 5 1405 1507 428
## 6 1359 1503 428
# dplyr approach
print(select(flights, DepTime, ArrTime, FlightNum),n=6)
## Source: local data frame [227,496 x 3]
##
## DepTime ArrTime FlightNum
## 1 1400 1500 428
## 2 1401 1501 428
## 3 1352 1502 428
## 4 1403 1513 428
## 5 1405 1507 428
## 6 1359 1503 428
## .. ... ... ...
# use colon to select multiple contiguous columns, and use `contains` to match columns by name
# note: `starts_with`, `ends_with`, and `matches` (for regular expressions) can also be used to match columns by name
head(select(flights, Year:DayofMonth, contains("Taxi"), contains("Delay")))
## Source: local data frame [6 x 7]
##
## Year Month DayofMonth TaxiIn TaxiOut ArrDelay DepDelay
## 1 2011 1 1 7 13 -10 0
## 2 2011 1 2 6 9 -9 1
## 3 2011 1 3 5 17 -8 -8
## 4 2011 1 4 9 22 3 3
## 5 2011 1 5 9 9 -3 5
## 6 2011 1 6 6 13 -7 -1
# nesting method to select UniqueCarrier and DepDelay columns and filter for delays over 60 minutes
head(filter(select(flights, UniqueCarrier, DepDelay), DepDelay > 60))
## Source: local data frame [6 x 2]
##
## UniqueCarrier DepDelay
## 1 AA 90
## 2 AA 67
## 3 AA 74
## 4 AA 125
## 5 AA 82
## 6 AA 99
flights %>%
select(UniqueCarrier, DepDelay) %>%
filter(DepDelay > 60) %>%
head()
## Source: local data frame [6 x 2]
##
## UniqueCarrier DepDelay
## 1 AA 90
## 2 AA 67
## 3 AA 74
## 4 AA 125
## 5 AA 82
## 6 AA 99
# create two vectors and calculate Euclidian distance between them
x1 <- 1:5; x2 <- 2:6
# Usual
sqrt(sum((x1-x2)^2))
## [1] 2.236
# chaining method
(x1-x2)^2 %>% sum() %>% sqrt()
## [1] 2.236
# base R approach to select UniqueCarrier and DepDelay columns and sort by DepDelay
head(flights[order(flights$DepDelay), c("UniqueCarrier", "DepDelay")])
## Source: local data frame [6 x 2]
##
## UniqueCarrier DepDelay
## 1 OO -33
## 2 MQ -23
## 3 XE -19
## 4 XE -19
## 5 CO -18
## 6 EV -18
# dplyr approach
flights %>%
select(UniqueCarrier, DepDelay) %>%
arrange(DepDelay) %>% # arrange(desc(DepDelay)) for descending order
head()
## Source: local data frame [6 x 2]
##
## UniqueCarrier DepDelay
## 1 OO -33
## 2 MQ -23
## 3 XE -19
## 4 XE -19
## 5 CO -18
## 6 EV -18
# base R approach to create a new variable Speed (in mph)
flights$Speed <- flights$Distance / flights$AirTime*60
head(flights[, c("Distance", "AirTime", "Speed")])
## Source: local data frame [6 x 3]
##
## Distance AirTime Speed
## 1 224 40 336.0
## 2 224 45 298.7
## 3 224 48 280.0
## 4 224 39 344.6
## 5 224 44 305.5
## 6 224 45 298.7
# dplyr approach (prints the new variable but does not store it)
flights %>%
select(Distance, AirTime) %>%
mutate(Speed = Distance/AirTime*60) %>%
head()
## Source: local data frame [6 x 3]
##
## Distance AirTime Speed
## 1 224 40 336.0
## 2 224 45 298.7
## 3 224 48 280.0
## 4 224 39 344.6
## 5 224 44 305.5
## 6 224 45 298.7
# store the new variable: flights <- flights %>% mutate(Speed = Distance/AirTime*60)
group_by
creates the groups that will be operated onsummarise
uses the provided aggregation function to summarise each group# base R approaches to calculate the average arrival delay to each destination
head(aggregate(ArrDelay ~ Dest, flights, mean))
## Dest ArrDelay
## 1 ABQ 7.226
## 2 AEX 5.839
## 3 AGS 4.000
## 4 AMA 6.840
## 5 ANC 26.081
## 6 ASE 6.795
# or head(with(flights, tapply(ArrDelay, Dest, mean, na.rm=TRUE)))
# dplyr approach: create a table grouped by Dest, and then summarise each group by taking the mean of ArrDelay
flights %>%
group_by(Dest) %>%
summarise(avg_delay = mean(ArrDelay, na.rm=TRUE)) %>%
head()
## Source: local data frame [6 x 2]
##
## Dest avg_delay
## 1 ABQ 7.226
## 2 AEX 5.839
## 3 AGS 4.000
## 4 AMA 6.840
## 5 ANC 26.081
## 6 ASE 6.795
# for each carrier, calculate the percentage of flights cancelled or diverted
flights %>%
group_by(UniqueCarrier) %>%
summarise_each(funs(mean), Cancelled, Diverted) %>%
head()
## Source: local data frame [6 x 3]
##
## UniqueCarrier Cancelled Diverted
## 1 AA 0.018496 0.001850
## 2 AS 0.000000 0.002740
## 3 B6 0.025899 0.005755
## 4 CO 0.006783 0.002627
## 5 DL 0.015903 0.003029
## 6 EV 0.034483 0.003176
# for each carrier, calculate the minimum and maximum arrival and departure delays
flights %>%
group_by(UniqueCarrier) %>%
summarise_each(funs(min(., na.rm=TRUE), max(., na.rm=TRUE)), matches("Delay")) %>%
head()
## Source: local data frame [6 x 5]
##
## UniqueCarrier ArrDelay_min DepDelay_min ArrDelay_max DepDelay_max
## 1 AA -39 -15 978 970
## 2 AS -43 -15 183 172
## 3 B6 -44 -14 335 310
## 4 CO -55 -18 957 981
## 5 DL -32 -17 701 730
## 6 EV -40 -18 469 479
flights %>%
select(matches("Delay")) %>%
head(3)
## Source: local data frame [3 x 2]
##
## ArrDelay DepDelay
## 1 -10 0
## 2 -9 1
## 3 -8 -8
flights %>%
select(matches("Delay")) %>%
mutate_each(funs(half=./2)) %>%
head(3)
## Source: local data frame [3 x 2]
##
## ArrDelay DepDelay
## 1 -5.0 0.0
## 2 -4.5 0.5
## 3 -4.0 -4.0
# for each day of the year, count the total number of flights and sort in descending order
flights %>%
group_by(Month, DayofMonth) %>%
summarise(flight_count = n()) %>%
arrange(desc(flight_count)) %>%
head()
## Source: local data frame [6 x 3]
## Groups: Month
##
## Month DayofMonth flight_count
## 1 1 3 702
## 2 1 2 678
## 3 1 20 663
## 4 1 27 663
## 5 1 13 662
## 6 1 7 661
# rewrite more simply with the `tally` function
flights %>%
group_by(Month, DayofMonth) %>%
tally(sort = TRUE)
# for each destination, count the total number of flights and the number of distinct planes that flew there
flights %>%
group_by(Dest) %>%
summarise(flight_count = n(), plane_count = n_distinct(TailNum)) %>%
head()
## Source: local data frame [6 x 3]
##
## Dest flight_count plane_count
## 1 ABQ 2812 716
## 2 AEX 724 215
## 3 AGS 1 1
## 4 AMA 1297 158
## 5 ANC 125 38
## 6 ASE 125 60
# for each destination, show the number of cancelled and not cancelled flights
flights %>%
group_by(Dest) %>%
select(Cancelled) %>%
table() %>%
head()
## Cancelled
## Dest 0 1
## ABQ 2787 25
## AEX 712 12
## AGS 1 0
## AMA 1265 32
## ANC 125 0
## ASE 120 5
mean
) takes n inputs and returns 1 valuemin_rank
), offset functions (lead
and lag
), and cumulative aggregates (like cummean
).# for each carrier, calculate which two days of the year they had their longest departure delays
# note: smallest (not largest) value is ranked as 1, so you have to use `desc` to rank by largest value
flights %>%
group_by(UniqueCarrier) %>%
select(Month, DayofMonth, DepDelay) %>%
filter(min_rank(desc(DepDelay)) <= 2) %>%
arrange(UniqueCarrier, desc(DepDelay)) %>%
head()
## Source: local data frame [6 x 4]
## Groups: UniqueCarrier
##
## UniqueCarrier Month DayofMonth DepDelay
## 1 AA 12 12 970
## 2 AA 11 19 677
## 3 AS 2 28 172
## 4 AS 7 6 138
## 5 B6 10 29 310
## 6 B6 8 19 283
# for each carrier, calculate which two days of the year they had their longest departure delays --- rewrite previous with the `top_n` function
flights %>%
group_by(UniqueCarrier) %>%
select(Month, DayofMonth, DepDelay) %>%
top_n(2) %>%
arrange(UniqueCarrier, desc(DepDelay))
# for each month, calculate the number of flights and the change from the previous month
flights %>%
group_by(Month) %>%
summarise(flight_count = n()) %>%
mutate(change = flight_count - lag(flight_count))
# rewrite previous with the `tally` function
flights %>%
group_by(Month) %>%
tally() %>%
mutate(change = n - lag(n))
# randomly sample a fixed number of rows, without replacement
flights %>% sample_n(5)
# randomly sample a fraction of rows, with replacement
flights %>% sample_frac(0.25, replace=TRUE)
# base R approach to view the structure of an object
str(flights)
# dplyr approach: better formatting, and adapts to your screen width
glimpse(flights)
mtcars %>% group_by(cyl) %>% do(head(.,2))
## Source: local data frame [6 x 11]
## Groups: cyl
##
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 2 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 3 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 4 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 6 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
# Another example
models <- mtcars %>% group_by(cyl) %>% do(lm = lm(mpg ~ wt, data = .))
models %>% summarise(rsq = summary(lm)$r.squared)
4 joins from SQL
inner_join(x, y)
: matching x + yleft_join(x, y)
: all x + matching ysemi_join(x, y)
: all x with match in yanti_join(x, y)
: all x without match in y