- Tools for data exploration and transformation
- Intuitive to write and easy to read
- Super-fast on data frames
September 28, 2016
if (packageVersion("devtools") < 1.6) { install.packages("devtools") } install.packages(c("dplyr","nycflights13"))
library(nycflights13) suppressMessages(library(dplyr))
Package nycflights13
has data about all flights that departed NYC in 2013 - 5 datasets
flights
: Flights dataairlines
: Airline namesairports
: Airport metadataplanes
: Plane metadataweather
: Hourly weather dataflights
data# explore data data(flights) dim(flights)
## [1] 336776 19
head(flights,3)
## # A tibble: 3 × 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 517 515 2 830 ## 2 2013 1 1 533 529 4 850 ## 3 2013 1 1 542 540 2 923 ## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, ## # time_hour <dttm>
336,776 rows… because of its size, this particular dataset has been wrapped, by default, using tbl_df
(what's that?)
Prints nicely and prevents an accidental display of the whole dataset
tblflights <- tbl_df(flights) head(tblflights,3) # Can also use print(flights,3) instead
## # A tibble: 3 × 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 517 515 2 830 ## 2 2013 1 1 533 529 4 850 ## 3 2013 1 1 542 540 2 923 ## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, ## # time_hour <dttm>
# convert to a normal data frame to see all of the columns head(data.frame(tblflights),3)
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## 1 2013 1 1 517 515 2 830 819 ## 2 2013 1 1 533 529 4 850 830 ## 3 2013 1 1 542 540 2 923 850 ## arr_delay carrier flight tailnum origin dest air_time distance hour ## 1 11 UA 1545 N14228 EWR IAH 227 1400 5 ## 2 20 UA 1714 N24211 LGA IAH 227 1416 5 ## 3 33 AA 1141 N619AA JFK MIA 160 1089 5 ## minute time_hour ## 1 15 2013-01-01 05:00:00 ## 2 29 2013-01-01 05:00:00 ## 3 40 2013-01-01 05:00:00
filter
: for subsetting variablesselect
: for subsetting rowsarrange
: for re-ordering rowsmutate
: for adding new columnssummarise
or summarize
: for reducing each group to a smaller number of summary statistics# base R: tblflights[tblflights$carrier=="AA" & tblflights$origin=="LGA", ] filter(tblflights, carrier=="AA" & origin=="LGA")
## # A tibble: 15,459 × 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 558 600 -2 753 ## 2 2013 1 1 559 600 -1 941 ## 3 2013 1 1 623 610 13 920 ## 4 2013 1 1 629 630 -1 824 ## 5 2013 1 1 635 635 0 1028 ## 6 2013 1 1 656 700 -4 854 ## 7 2013 1 1 659 700 -1 1008 ## 8 2013 1 1 724 730 -6 1111 ## 9 2013 1 1 739 745 -6 918 ## 10 2013 1 1 753 755 -2 1056 ## # ... with 15,449 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm>
# same as filter(tblflights, carrier=="AA", origin=="LGA")
filter(tblflights, carrier=="AA" | carrier=="UA")
## # A tibble: 91,394 × 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 517 515 2 830 ## 2 2013 1 1 533 529 4 850 ## 3 2013 1 1 542 540 2 923 ## 4 2013 1 1 554 558 -4 740 ## 5 2013 1 1 558 600 -2 753 ## 6 2013 1 1 558 600 -2 924 ## 7 2013 1 1 558 600 -2 923 ## 8 2013 1 1 559 600 -1 941 ## 9 2013 1 1 559 600 -1 854 ## 10 2013 1 1 606 610 -4 858 ## # ... with 91,384 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm>
filter(tblflights, carrier %in% c("AA", "UA"))
## # A tibble: 91,394 × 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 517 515 2 830 ## 2 2013 1 1 533 529 4 850 ## 3 2013 1 1 542 540 2 923 ## 4 2013 1 1 554 558 -4 740 ## 5 2013 1 1 558 600 -2 753 ## 6 2013 1 1 558 600 -2 924 ## 7 2013 1 1 558 600 -2 923 ## 8 2013 1 1 559 600 -1 941 ## 9 2013 1 1 559 600 -1 854 ## 10 2013 1 1 606 610 -4 858 ## # ... with 91,384 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm>
# base R approach to select DepTime, ArrTime, and FlightNum columns head(tblflights[, c("dep_time", "arr_time", "flight")])
## # A tibble: 6 × 3 ## dep_time arr_time flight ## <int> <int> <int> ## 1 517 830 1545 ## 2 533 850 1714 ## 3 542 923 1141 ## 4 544 1004 725 ## 5 554 812 461 ## 6 554 740 1696
# dplyr approach print(select(tblflights, dep_time, arr_time, flight),n=6)
## # A tibble: 336,776 × 3 ## dep_time arr_time flight ## <int> <int> <int> ## 1 517 830 1545 ## 2 533 850 1714 ## 3 542 923 1141 ## 4 544 1004 725 ## 5 554 812 461 ## 6 554 740 1696 ## # ... with 3.368e+05 more rows
# use colon to select multiple contiguous columns, and use `contains` to match columns by name # note: `starts_with`, `ends_with`, and `matches` (for regular expressions) can also be used to match columns by name head(select(tblflights, flight:dest, contains("arr"), contains("dep")))
## # A tibble: 6 × 11 ## flight tailnum origin dest arr_time sched_arr_time arr_delay carrier ## <int> <chr> <chr> <chr> <int> <int> <dbl> <chr> ## 1 1545 N14228 EWR IAH 830 819 11 UA ## 2 1714 N24211 LGA IAH 850 830 20 UA ## 3 1141 N619AA JFK MIA 923 850 33 AA ## 4 725 N804JB JFK BQN 1004 1022 -18 B6 ## 5 461 N668DN LGA ATL 812 837 -25 DL ## 6 1696 N39463 EWR ORD 740 728 12 UA ## # ... with 3 more variables: dep_time <int>, sched_dep_time <int>, ## # dep_delay <dbl>
# nesting method to select carrier and dep_delay columns and filter for delays over 60 minutes head(filter(select(tblflights, carrier, dep_delay), dep_delay > 60))
## # A tibble: 6 × 2 ## carrier dep_delay ## <chr> <dbl> ## 1 MQ 101 ## 2 AA 71 ## 3 MQ 853 ## 4 UA 144 ## 5 UA 134 ## 6 EV 96
tblflights %>% select(carrier, dep_delay) %>% filter(dep_delay > 60) %>% head()
## # A tibble: 6 × 2 ## carrier dep_delay ## <chr> <dbl> ## 1 MQ 101 ## 2 AA 71 ## 3 MQ 853 ## 4 UA 144 ## 5 UA 134 ## 6 EV 96
# create two vectors and calculate Euclidian distance between them x1 <- 1:5; x2 <- 2:6 # Usual sqrt(sum((x1-x2)^2))
## [1] 2.236068
# chaining method (x1-x2)^2 %>% sum() %>% sqrt()
## [1] 2.236068
# base R approach to select carrier and dep_delay columns and sort by dep_delay head(tblflights[order(tblflights$dep_delay), c("carrier", "dep_delay")])
## # A tibble: 6 × 2 ## carrier dep_delay ## <chr> <dbl> ## 1 B6 -43 ## 2 DL -33 ## 3 EV -32 ## 4 DL -30 ## 5 F9 -27 ## 6 MQ -26
# dplyr approach tblflights %>% select(carrier, dep_delay) %>% arrange(dep_delay) %>% # arrange(desc(dep_delay)) for descending order head()
## # A tibble: 6 × 2 ## carrier dep_delay ## <chr> <dbl> ## 1 B6 -43 ## 2 DL -33 ## 3 EV -32 ## 4 DL -30 ## 5 F9 -27 ## 6 MQ -26
# base R approach to create a new variable - sum of squares of delays (arr and dep) tblflights$delaysquare <- tblflights$dep_delay^2 + tblflights$arr_delay^2 head(tblflights[, c("dep_delay", "arr_delay", "delaysquare")])
## # A tibble: 6 × 3 ## dep_delay arr_delay delaysquare ## <dbl> <dbl> <dbl> ## 1 2 11 125 ## 2 4 20 416 ## 3 2 33 1093 ## 4 -1 -18 325 ## 5 -6 -25 661 ## 6 -4 12 160
# dplyr approach (prints the new variable but does not store it) tblflights %>% select(dep_delay, arr_delay) %>% mutate(delaysquare = dep_delay^2 + arr_delay^2) %>% head()
## # A tibble: 6 × 3 ## dep_delay arr_delay delaysquare ## <dbl> <dbl> <dbl> ## 1 2 11 125 ## 2 4 20 416 ## 3 2 33 1093 ## 4 -1 -18 325 ## 5 -6 -25 661 ## 6 -4 12 160
# store the new variable: tblflights <- tblflights %>% # mutate(delaysquare = dep_delay^2 + arr_delay^2)
group_by
creates the groups that will be operated onsummarise
uses the provided aggregation function to summarise each group# base R approaches to calculate the mean arrival delays at different airports aggregate(arr_delay ~ origin, tblflights, mean)
## origin arr_delay ## 1 EWR 9.107055 ## 2 JFK 5.551481 ## 3 LGA 5.783488
# or with(tblflights, tapply(arr_delay, origin, mean, na.rm=TRUE))
# dplyr approach: create a table grouped by origin, and then summarise each group by taking the mean of arr_delay tblflights %>% group_by(origin) %>% summarise(avg_delay = mean(arr_delay, na.rm=TRUE))
## # A tibble: 3 × 2 ## origin avg_delay ## <chr> <dbl> ## 1 EWR 9.107055 ## 2 JFK 5.551481 ## 3 LGA 5.783488
# for each carrier, calculate the mean arrival and departure delays at the different origin airports tblflights %>% group_by(origin) %>% summarise_each(funs(mean(.,na.rm=TRUE)), arr_delay, dep_delay) %>% head()
## # A tibble: 3 × 3 ## origin arr_delay dep_delay ## <chr> <dbl> <dbl> ## 1 EWR 9.107055 15.10795 ## 2 JFK 5.551481 12.11216 ## 3 LGA 5.783488 10.34688
# for each carrier, calculate the minimum and maximum of arrival and departure delays tblflights %>% group_by(carrier) %>% summarise_each(funs(min(., na.rm=TRUE), max(., na.rm=TRUE)), matches("_delay")) %>% head()
## # A tibble: 6 × 5 ## carrier dep_delay_min arr_delay_min dep_delay_max arr_delay_max ## <chr> <dbl> <dbl> <dbl> <dbl> ## 1 9E -24 -68 747 744 ## 2 AA -24 -75 1014 1007 ## 3 AS -21 -74 225 198 ## 4 B6 -43 -71 502 497 ## 5 DL -33 -71 960 931 ## 6 EV -32 -62 548 577
tblflights %>% select(matches("_delay")) %>% head(3)
## # A tibble: 3 × 2 ## dep_delay arr_delay ## <dbl> <dbl> ## 1 2 11 ## 2 4 20 ## 3 2 33
tblflights %>% select(matches("_delay")) %>% mutate_each(funs(half=./2)) %>% head(3)
## # A tibble: 3 × 4 ## dep_delay arr_delay dep_delay_half arr_delay_half ## <dbl> <dbl> <dbl> <dbl> ## 1 2 11 1 5.5 ## 2 4 20 2 10.0 ## 3 2 33 1 16.5
# for each month, count the total number of flights each day and sort in order of busiest days tblflights %>% group_by(month,day) %>% summarise(flight_count = n()) %>% arrange(desc(flight_count)) %>% head()
## Source: local data frame [6 x 3] ## Groups: month [3] ## ## month day flight_count ## <int> <int> <int> ## 1 11 27 1014 ## 2 7 11 1006 ## 3 7 8 1004 ## 4 7 10 1004 ## 5 12 2 1004 ## 6 7 18 1003
# rewrite more simply with the `tally` function tblflights %>% group_by(month, day) %>% tally(sort = TRUE)
# for each destination, count the total number of flights and the number of distinct planes that flew there tblflights %>% group_by(dest) %>% summarise(flight_count = n(), plane_count = n_distinct(tailnum)) %>% head()
## # A tibble: 6 × 3 ## dest flight_count plane_count ## <chr> <int> <int> ## 1 ABQ 254 108 ## 2 ACK 265 58 ## 3 ALB 439 172 ## 4 ANC 8 6 ## 5 ATL 17215 1180 ## 6 AUS 2439 993
# for each destination, show the number of flights from the 3 origin airports tblflights %>% group_by(dest) %>% select(origin) %>% table() %>% head()
## Adding missing grouping variables: `dest`
## origin ## dest EWR JFK LGA ## ABQ 0 254 0 ## ACK 0 265 0 ## ALB 439 0 0 ## ANC 8 0 0 ## ATL 5022 1930 10263 ## AUS 968 1471 0
mean
) takes n inputs and returns 1 valuemin_rank
), offset functions (lead
and lag
), and cumulative aggregates (like cummean
).# for each carrier, calculate which two days of the year they had their longest departure delays # note: smallest (not largest) value is ranked as 1, so you have to use `desc` to rank by largest value tblflights %>% group_by(carrier) %>% select(month, day, dep_delay) %>% filter(min_rank(desc(dep_delay)) <= 2 & dep_delay!="NA") %>% arrange(carrier, desc(dep_delay)) %>% head()
## Adding missing grouping variables: `carrier`
## Source: local data frame [6 x 4] ## Groups: carrier [3] ## ## carrier month day dep_delay ## <chr> <int> <int> <dbl> ## 1 9E 2 16 747 ## 2 9E 7 24 430 ## 3 AA 9 20 1014 ## 4 AA 12 5 896 ## 5 AS 5 23 225 ## 6 AS 1 20 222
# for each carrier, calculate which two days of the year they had their longest departure delays rewrite previous with the `top_n` function tblflights %>% group_by(carrier) %>% select(month, day, dep_delay) %>% filter(dep_delay!="NA") %>% top_n(2) %>% arrange(carrier, desc(dep_delay)) %>% head() # for each month, calculate the number of flights and the change from the previous month tblflights %>% group_by(month) %>% summarise(flight_count = n()) %>% mutate(change = flight_count - lag(flight_count)) # rewrite previous with the `tally` function tblflights %>% group_by(month) %>% tally() %>% mutate(change = n - lag(n))
# randomly sample a fixed number of rows, without replacement tblflights %>% sample_n(5) # randomly sample a fraction of rows, with replacement tblflights %>% sample_frac(0.25, replace=TRUE) # base R approach to view the structure of an object str(tblflights) # dplyr approach: better formatting, and adapts to your screen width glimpse(tblflights)
model=tblflights %>% group_by(origin) %>% do(lm=lm(dep_delay~arr_delay+carrier,data=.)) model %>% summarise(rsq=summary(lm)$r.squared)
## # A tibble: 3 × 1 ## rsq ## <dbl> ## 1 0.8635791 ## 2 0.8181881 ## 3 0.8416845
4 joins from SQL
inner_join(x, y)
: matching x + yleft_join(x, y)
: all x + matching ysemi_join(x, y)
: all x with match in yanti_join(x, y)
: all x without match in ytblflights %>% head(1)
## # A tibble: 1 × 20 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 517 515 2 830 ## # ... with 13 more variables: sched_arr_time <int>, arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, ## # time_hour <dttm>, delaysquare <dbl>
airlines %>% head(1)
## # A tibble: 1 × 2 ## carrier name ## <chr> <chr> ## 1 9E Endeavor Air Inc.
airports %>% head(1)
## # A tibble: 1 × 7 ## faa name lat lon alt tz dst ## <chr> <chr> <dbl> <dbl> <int> <dbl> <chr> ## 1 04G Lansdowne Airport 41.13047 -80.61958 1044 -5 A
tblflights %>% inner_join(airlines) %>% head(1)
## Joining, by = "carrier"
## # A tibble: 1 × 21 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 517 515 2 830 ## # ... with 14 more variables: sched_arr_time <int>, arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, ## # time_hour <dttm>, delaysquare <dbl>, name <chr>
tblflights %>% inner_join(airlines) %>% select(distance:name) %>% head(1)
## Joining, by = "carrier"
## # A tibble: 1 × 6 ## distance hour minute time_hour delaysquare ## <dbl> <dbl> <dbl> <dttm> <dbl> ## 1 1400 5 15 2013-01-01 05:00:00 125 ## # ... with 1 more variables: name <chr>
faatblflights=tblflights %>% select(origin) %>% mutate(faa=origin) faatblflights %>% left_join(airports) %>% head(2)
## Joining, by = "faa"
## # A tibble: 2 × 8 ## origin faa name lat lon alt tz dst ## <chr> <chr> <chr> <dbl> <dbl> <int> <dbl> <chr> ## 1 EWR EWR Newark Liberty Intl 40.69250 -74.16867 18 -5 A ## 2 LGA LGA La Guardia 40.77725 -73.87261 22 -5 A
airports %>% left_join(faatblflights) %>% head(2)
## Joining, by = "faa"
## # A tibble: 2 × 8 ## faa name lat lon alt tz dst ## <chr> <chr> <dbl> <dbl> <int> <dbl> <chr> ## 1 04G Lansdowne Airport 41.13047 -80.61958 1044 -5 A ## 2 06A Moton Field Municipal Airport 32.46057 -85.68003 264 -5 A ## # ... with 1 more variables: origin <chr>
airports %>% left_join(faatblflights) %>% head(2)
## Joining, by = "faa"
## # A tibble: 2 × 8 ## faa name lat lon alt tz dst ## <chr> <chr> <dbl> <dbl> <int> <dbl> <chr> ## 1 04G Lansdowne Airport 41.13047 -80.61958 1044 -5 A ## 2 06A Moton Field Municipal Airport 32.46057 -85.68003 264 -5 A ## # ... with 1 more variables: origin <chr>
airports %>% semi_join(faatblflights) %>% head(2)
## Joining, by = "faa"
## # A tibble: 2 × 7 ## faa name lat lon alt tz dst ## <chr> <chr> <dbl> <dbl> <int> <dbl> <chr> ## 1 EWR Newark Liberty Intl 40.69250 -74.16867 18 -5 A ## 2 LGA La Guardia 40.77725 -73.87261 22 -5 A
faatblflights %>% anti_join(airports)
## Joining, by = "faa"
## # A tibble: 0 × 2 ## # ... with 2 variables: origin <chr>, faa <chr>
airports %>% anti_join(faatblflights)
## Joining, by = "faa"
## # A tibble: 1,393 × 7 ## faa name lat lon alt tz ## <chr> <chr> <dbl> <dbl> <int> <dbl> ## 1 04G Lansdowne Airport 41.13047 -80.61958 1044 -5 ## 2 06A Moton Field Municipal Airport 32.46057 -85.68003 264 -5 ## 3 06C Schaumburg Regional 41.98934 -88.10124 801 -6 ## 4 06N Randall Airport 41.43191 -74.39156 523 -5 ## 5 09J Jekyll Island Airport 31.07447 -81.42778 11 -4 ## 6 0A9 Elizabethton Municipal Airport 36.37122 -82.17342 1593 -4 ## 7 0G6 Williams County Airport 41.46731 -84.50678 730 -5 ## 8 0G7 Finger Lakes Regional Airport 42.88356 -76.78123 492 -5 ## 9 0P2 Shoestring Aviation Airfield 39.79482 -76.64719 1000 -5 ## 10 0S9 Jefferson County Intl 48.05381 -122.81064 108 -8 ## # ... with 1,383 more rows, and 1 more variables: dst <chr>