03 - Data Transformation

Psicometria per le Neuroscienze Cognitive
GPT summaries of R4DS Books

Filippo Gambarota, PhD

Introduction

library(dplyr)
library(nycflights13)
flights <- nycflights13::flights

Selecting Columns

flights %>% select(year, month, day, dep_delay, arr_delay)
# A tibble: 336,776 × 5
    year month   day dep_delay arr_delay
   <int> <int> <int>     <dbl>     <dbl>
 1  2013     1     1         2        11
 2  2013     1     1         4        20
 3  2013     1     1         2        33
 4  2013     1     1        -1       -18
 5  2013     1     1        -6       -25
 6  2013     1     1        -4        12
 7  2013     1     1        -5        19
 8  2013     1     1        -3       -14
 9  2013     1     1        -3        -8
10  2013     1     1        -2         8
# ℹ 336,766 more rows

Filtering Rows

flights %>% filter(month == 1, day == 1)
# A tibble: 842 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ℹ 832 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

Arranging Rows

flights %>% arrange(desc(dep_delay))
# A tibble: 336,776 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     9      641            900      1301     1242           1530
 2  2013     6    15     1432           1935      1137     1607           2120
 3  2013     1    10     1121           1635      1126     1239           1810
 4  2013     9    20     1139           1845      1014     1457           2210
 5  2013     7    22      845           1600      1005     1044           1815
 6  2013     4    10     1100           1900       960     1342           2211
 7  2013     3    17     2321            810       911      135           1020
 8  2013     6    27      959           1900       899     1236           2226
 9  2013     7    22     2257            759       898      121           1026
10  2013    12     5      756           1700       896     1058           2020
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

Creating New Variables

flights %>% mutate(speed = distance / air_time * 60)
# A tibble: 336,776 × 20
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ℹ 336,766 more rows
# ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>, speed <dbl>

Renaming Columns

flights %>% rename(departure_delay = dep_delay, arrival_delay = arr_delay)
# A tibble: 336,776 × 19
    year month   day dep_time sched_dep_time departure_delay arr_time
   <int> <int> <int>    <int>          <int>           <dbl>    <int>
 1  2013     1     1      517            515               2      830
 2  2013     1     1      533            529               4      850
 3  2013     1     1      542            540               2      923
 4  2013     1     1      544            545              -1     1004
 5  2013     1     1      554            600              -6      812
 6  2013     1     1      554            558              -4      740
 7  2013     1     1      555            600              -5      913
 8  2013     1     1      557            600              -3      709
 9  2013     1     1      557            600              -3      838
10  2013     1     1      558            600              -2      753
# ℹ 336,766 more rows
# ℹ 12 more variables: sched_arr_time <int>, arrival_delay <dbl>,
#   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Relocating Columns

flights %>% relocate(dep_delay, arr_delay, .before = distance)
# A tibble: 336,776 × 19
    year month   day dep_time sched_dep_time arr_time sched_arr_time carrier
   <int> <int> <int>    <int>          <int>    <int>          <int> <chr>  
 1  2013     1     1      517            515      830            819 UA     
 2  2013     1     1      533            529      850            830 UA     
 3  2013     1     1      542            540      923            850 AA     
 4  2013     1     1      544            545     1004           1022 B6     
 5  2013     1     1      554            600      812            837 DL     
 6  2013     1     1      554            558      740            728 UA     
 7  2013     1     1      555            600      913            854 B6     
 8  2013     1     1      557            600      709            723 EV     
 9  2013     1     1      557            600      838            846 B6     
10  2013     1     1      558            600      753            745 AA     
# ℹ 336,766 more rows
# ℹ 11 more variables: flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#   air_time <dbl>, dep_delay <dbl>, arr_delay <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

Summarizing Data

flights %>% group_by(carrier) %>% summarize(avg_delay = mean(dep_delay, na.rm = TRUE))
# A tibble: 16 × 2
   carrier avg_delay
   <chr>       <dbl>
 1 9E          16.7 
 2 AA           8.59
 3 AS           5.80
 4 B6          13.0 
 5 DL           9.26
 6 EV          20.0 
 7 F9          20.2 
 8 FL          18.7 
 9 HA           4.90
10 MQ          10.6 
11 OO          12.6 
12 UA          12.1 
13 US           3.78
14 VX          12.9 
15 WN          17.7 
16 YV          19.0 

Counting Observations

flights %>% count(carrier)
# A tibble: 16 × 2
   carrier     n
   <chr>   <int>
 1 9E      18460
 2 AA      32729
 3 AS        714
 4 B6      54635
 5 DL      48110
 6 EV      54173
 7 F9        685
 8 FL       3260
 9 HA        342
10 MQ      26397
11 OO         32
12 UA      58665
13 US      20536
14 VX       5162
15 WN      12275
16 YV        601

Using Pipes

flights %>% 
  filter(month == 1, day == 1) %>% 
  select(year, month, day, carrier, flight)
# A tibble: 842 × 5
    year month   day carrier flight
   <int> <int> <int> <chr>    <int>
 1  2013     1     1 UA        1545
 2  2013     1     1 UA        1714
 3  2013     1     1 AA        1141
 4  2013     1     1 B6         725
 5  2013     1     1 DL         461
 6  2013     1     1 UA        1696
 7  2013     1     1 B6         507
 8  2013     1     1 EV        5708
 9  2013     1     1 B6          79
10  2013     1     1 AA         301
# ℹ 832 more rows

Combining Multiple Transformations

flights %>% 
  group_by(carrier) %>% 
  summarize(avg_delay = mean(dep_delay, na.rm = TRUE)) %>% 
  arrange(desc(avg_delay))
# A tibble: 16 × 2
   carrier avg_delay
   <chr>       <dbl>
 1 F9          20.2 
 2 EV          20.0 
 3 YV          19.0 
 4 FL          18.7 
 5 WN          17.7 
 6 9E          16.7 
 7 B6          13.0 
 8 VX          12.9 
 9 OO          12.6 
10 UA          12.1 
11 MQ          10.6 
12 DL           9.26
13 AA           8.59
14 AS           5.80
15 HA           4.90
16 US           3.78

Conclusion

summary(flights)
      year          month             day           dep_time    sched_dep_time
 Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106  
 1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906  
 Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359  
 Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344  
 3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
 Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
                                                 NA's   :8255                 
   dep_delay          arr_time    sched_arr_time   arr_delay       
 Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
 1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124   1st Qu.: -17.000  
 Median :  -2.00   Median :1535   Median :1556   Median :  -5.000  
 Mean   :  12.64   Mean   :1502   Mean   :1536   Mean   :   6.895  
 3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945   3rd Qu.:  14.000  
 Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
 NA's   :8255      NA's   :8713                  NA's   :9430      
   carrier              flight       tailnum             origin         
 Length:336776      Min.   :   1   Length:336776      Length:336776     
 Class :character   1st Qu.: 553   Class :character   Class :character  
 Mode  :character   Median :1496   Mode  :character   Mode  :character  
                    Mean   :1972                                        
                    3rd Qu.:3465                                        
                    Max.   :8500                                        
                                                                        
     dest              air_time        distance         hour      
 Length:336776      Min.   : 20.0   Min.   :  17   Min.   : 1.00  
 Class :character   1st Qu.: 82.0   1st Qu.: 502   1st Qu.: 9.00  
 Mode  :character   Median :129.0   Median : 872   Median :13.00  
                    Mean   :150.7   Mean   :1040   Mean   :13.18  
                    3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
                    Max.   :695.0   Max.   :4983   Max.   :23.00  
                    NA's   :9430                                  
     minute        time_hour                     
 Min.   : 0.00   Min.   :2013-01-01 05:00:00.00  
 1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00.00  
 Median :29.00   Median :2013-07-03 10:00:00.00  
 Mean   :26.23   Mean   :2013-07-03 05:22:54.64  
 3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00.00  
 Max.   :59.00   Max.   :2013-12-31 23:00:00.00