In our meeting on 24 February 2021 we discussed profiling and comparing code that operates on data.table versus data.frame objects. Code and timings for the examples are below.

Config                                              myDFlm time (user; seconds)   myDTlm time (user; seconds)
--------------------------------------------------  ----------------------------  ----------------------------
teton: module load r/4.0.2-intel; single threaded   2.464                         2.305
teton: module load r/4.0.2-py27; single threaded    1.794                         1.692
iMac from 2017; standard R build for MacOS          1.990                         1.541
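The myDFlm/myDTlm code itself is not reproduced in these notes; the sketch below shows the kind of comparison that presumably produced the numbers above, assuming the same 10^7-row x/y data used later in these notes and that the two functions simply fit the same model against the data.frame and the data.table (the function definitions are an assumption, not the code that was actually timed):

library(data.table)

set.seed(10101)
x <- rnorm(10^7)
y <- x * 5 + 2 + rnorm(n = length(x), sd = 2)

myDF <- data.frame(x = x, y = y)
myDT <- data.table(x = x, y = y)

#Assumed definitions: identical model, only the container differs
myDFlm <- function() lm(y ~ x, data = myDF)
myDTlm <- function() lm(y ~ x, data = myDT)

system.time(myDFlm())   # "myDFlm time" column above
system.time(myDTlm())   # "myDTlm time" column above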

## compare base subset() on the data.frame (myDF) vs the data.table (myDT)
> system.time(foo <- subset(myDF, x < 10))
   user  system elapsed 
  3.035   0.846   4.260 

> system.time(foo <- subset(myDT, x < 10))
   user  system elapsed 
  0.486   0.374   0.842

## compare the data.table way of subsetting
> system.time(foo <- myDT[x < 10])
   user  system elapsed 
  0.126   0.059   0.185  
> system.time(foo2 <- myDT[x < 10])
   user  system elapsed 
  0.087   0.031   0.119
  
## here is an example with grouping and keys
> grp <- sample(LETTERS[1:10], 10^7, replace = TRUE)
> myDT[, grp := grp]
> system.time(myDT[, as.list(coef(lm(y~x))), by = grp])
   user  system elapsed 
  1.349   0.176   1.390 
> setkey(myDT, grp)
> system.time(myDT[, as.list(coef(lm(y~x))), by = grp])
   user  system elapsed 
  1.206   0.156   1.349 
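
Setting the key changes the timing only slightly here, most likely because the time is dominated by the lm() fits themselves rather than by the grouping. For contrast, a lightweight grouped summary (a sketch, not run at the meeting) exercises the by= machinery more directly:

## Sketch: a cheap grouped aggregation on the same keyed table
system.time(myDT[, .(mean_y = mean(y), n = .N), by = grp])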

library(bench)
library(data.table)
library(dtplyr)
library(tidyverse)

set.seed(10101)
x <- rnorm(10^4) 
y <- x * 5 + 2 + rnorm(n = length(x), sd = 2)

my_df <- data.frame(x = x, y = y)
my_dt <- data.table(x = x, y = y)

mark(
  #Using dplyr syntax on a data.table (note: without dtplyr::lazy_dt() this
  # likely uses dplyr's ordinary data.frame methods rather than dtplyr)
  data.table_tidy = filter(my_dt, x < 0),
  #Using data.table syntax
  data.table_dt = my_dt[x < 0],
  #Using base subset
  data.frame_subset = subset(my_df, x < 0),
  #Using base with square brackets
  data.frame_square = my_df[x < 0,],
  iterations = 1000,
  check = FALSE) %>%
  select(-result:-gc)
  
  # A tibble: 4 x 9
  expression             min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
  <bch:expr>        <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
1 data.table_tidy     1.25ms   1.36ms      675.    5.12KB     8.19   988    12      1.46s
2 data.table_dt      317.1us  404.1us     2028.  192.15KB     2.03   999     1   492.61ms
3 data.frame_subset  393.5us  640.4us     1277.  493.93KB     1.28   999     1   782.53ms
4 data.frame_square  468.4us  490.5us     1768.   376.6KB     0     1000     0   565.74ms
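
For reference, dtplyr's data.table translation is normally requested explicitly via lazy_dt(); a minimal sketch (not part of the benchmark above):

#Sketch: dtplyr translates dplyr verbs into data.table code when you start from
# lazy_dt() and then materialize the result.
library(dtplyr)
library(dplyr)
library(data.table)

my_dt %>%
  lazy_dt() %>%
  filter(x < 0) %>%
  as.data.table()   # or as_tibble()/collect() to pull the result back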

# Use setkey() to create primary key(s) for a data.table. Having a key can greatly speed up
# searching/querying of large data.tables because it sorts the data by the key column(s),
# which then allows rapid indexed (binary) searching. If your data.table isn't overly large,
# this is probably overkill. What counts as large? Some experimentation may be necessary.
# Type 'example(setkey)' to run these at the prompt; the lines below are taken directly from
# the data.table setkey() help documentation.

DT = data.table(A=5:1,B=letters[5:1])
DT # before
setkey(DT,B)          # re-orders table and marks it sorted.
DT # after
tables()              # KEY column reports the key'd columns
key(DT)
keycols = c("A","B")
setkeyv(DT,keycols)
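
To get a feel for the indexed searching described above, here is a rough sketch on a larger made-up table (the size and column names are illustrative only, not from the meeting):

library(data.table)

big <- data.table(id = sample(1e6, 1e7, replace = TRUE), val = rnorm(1e7))

system.time(big[id == 12345L])   # vector scan (data.table may build an auto-index on first use)
setkey(big, id)                  # sort by id and mark it as the key
system.time(big[.(12345L)])      # keyed lookup via binary search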
Comparing rbindlist() with do.call("rbind", ...) for stacking a list of tables:

library(data.table)
library(microbenchmark)

set.seed(10101)
x <- rnorm(10^7)
y <- x * 5 + 2 + rnorm(n = length(x), sd = 2)
myDF <- data.frame(x = x, y = y)
myDT <- data.table(x = x, y = y)

ldf <- list(myDF, myDF)
ldt <- list(myDT, myDT)
microbenchmark("dt" = rbindlist(ldt),
               "df" = do.call("rbind", ldf))

Speed differences using different pipes:

library(bench)
library(data.table)
library(tidyverse)

df_df <- data.frame(x = rnorm(10^5))

df_tbl <- as_tibble(df_df)

df_dt <- as.data.table(df_df)

r_pipe <- function(df) {
  df ->.;              # right-assign into the dot, mimicking a pipe
  .[.$x < 0, ] ->.;    # base `[` does not see columns by name, so use .$x
  `+`(., 100)          # add 100 to the filtered result
}

dplyr_pipe <- function(df) {
  df %>%
    filter(x < 0) %>%
    mutate(x = x + 100)
}

dt_pipe <- function(df) {
  df[x < 0][, .(x = x + 100)]   # filter, then return x + 100 as a one-column data.table
}

mark(base = r_pipe(df_df),
  dplyr = dplyr_pipe(df_tbl),
  data.table = dt_pipe(df_dt),
  iterations = 1000,
  check = FALSE) %>%
  arrange(median) %>%
  select(-result:-gc)
  
  # A tibble: 3 x 9
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
1 base        595.2us  651.1us     1498. 1013.08KB    0      1000     0   667.45ms
2 dplyr        1.36ms   1.66ms      562.    2.32MB    0.562   999     1      1.78s
3 data.table   4.32ms   5.59ms      163.    2.53MB    1.81    989    11      6.08s

Goofing around a bit more with R's native "pipe" (the `->.;` right-assignment idiom). Note that this is just for fun: using this syntax is strongly discouraged, as it is difficult to understand. That said, a true base R pipe, akin to the {magrittr} pipe, is coming.
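
For comparison, that forthcoming native pipe (|>, introduced in R 4.1 and therefore not yet available at the time of this meeting) would express the dplyr step from the benchmark above roughly like this:

#Requires R >= 4.1: the native |> pipe passes the left-hand side as the first
# argument of the call on its right.
df_tbl |>
  dplyr::filter(x < 0) |>
  dplyr::mutate(x = x + 100)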

#Using the `->.;` right-assignment "pipe" trick

#Inspiration:
#https://win-vector.com/2017/07/07/in-praise-of-syntactic-sugar/
#The "pipe": `->.;`

#Basic example, piping left to right via right assignment (`->`).
# Note that the steps are wrapped in `{}` so that the whole chain is a single
# expression whose value can be assigned to result_v1, and that the dot (`.`)
# is used explicitly to carry each intermediate result.
result_v1 <- {
  0 ->.; #The native "pipe"
  cos(.)
}
result_v1
#> [1] 1

#Another example without the wrapping braces; this version needs the final
# assignment at the bottom of the chain.
0 ->.;
  cos(.) -> result_v2
result_v2
#> [1] 1

#This example expands the use of the pipe to include an infix operator as a
# function, making operators like `+`, `-`, `*` pipe friendly.
result_3a <- {
  0 ->.;
  cos(.) ->.;
  `+`(., 2 * .) #Infix operator
}
result_3a
#> [1] 3

#Here we have "pipes" inside of "pipes", with the dot rebound at each step.
# The braced block is just a sub-chain: the outer `.` handed to `+` has already
# been evaluated before the block runs, so reassigning the dot inside the block
# does not disturb it. This kind of nesting can be useful when piped operations
# are complex, or when lambda/anonymous functions recycle variable names.
result_3b <- {
  0 ->.;
  cos(.) ->.;
  `+`(., { #Braced sub-chain nested inside the outer expression
    . ->.;
    `*`(., 2)
  }
  )
}
result_3b
#> [1] 3

#Same as above, just proving to myself that the result is behaving as expected.
result_4 <- {
  0 ->.; #The native "pipe"
  cos(.) ->.;
  `+`(., {
    . ->.;
    `*`(., 3) #Infix operator
  }
  )
}
result_4
#> [1] 4