2024

Introduction to data.table

The data.table package in R is designed for efficient data manipulation, aggregation, and analysis. It extends the data.frame with syntax that allows concise and fast operations.

Installing and loading data.table

Installation

install.packages("data.table")

Loading the package

library(data.table)
## Warning: package 'data.table' was built under R version 4.3.3

Creating a data.table manually

You can create a data.table directly using the data.table() function:

dt <- data.table(ID = 1:5, 
                 Name = c("Alice", "Bob", "Charlie", "David", "Eva"))
print(dt)
##       ID    Name
##    <int>  <char>
## 1:     1   Alice
## 2:     2     Bob
## 3:     3 Charlie
## 4:     4   David
## 5:     5     Eva

Creating from a file using fread

fname <- "data.csv"
dt_from_file <- fread(fname)
print(dt_from_file)
  • For this example to work you’ll need to have a file called data.csv in your working directory. The file should contain the following:
column_1,column_2,column_3
1,2,3
4,5,6
7,8,9

Creating from a string using fread

data <- "ID,Name,Score
         1,Alice,95
         2,Bob,82
         3,Charlie,88
         4,David,74
         5,Eva,91"

dt_from_file <- fread(data)
print(dt_from_file)
##       ID    Name Score
##    <int>  <char> <int>
## 1:     1   Alice    95
## 2:     2     Bob    82
## 3:     3 Charlie    88
## 4:     4   David    74
## 5:     5     Eva    91

Convert a data.frame to a data.table with as.data.table

df <- data.frame(ID = 1:5, 
                 Score = c(95, 82, 88, 74, 91))
print(str(df))
## 'data.frame':    5 obs. of  2 variables:
##  $ ID   : int  1 2 3 4 5
##  $ Score: num  95 82 88 74 91
## NULL
df <- as.data.table(df)
print(str(df))
## Classes 'data.table' and 'data.frame':   5 obs. of  2 variables:
##  $ ID   : int  1 2 3 4 5
##  $ Score: num  95 82 88 74 91
##  - attr(*, ".internal.selfref")=<externalptr> 
## NULL

Convert to a data.tablein place with setDT

df <- data.frame(ID = 1:5, 
                 Score = c(95, 82, 88, 74, 91))

# operates in place so no assignment needed
setDT(df)

print(str(df))
## Classes 'data.table' and 'data.frame':   5 obs. of  2 variables:
##  $ ID   : int  1 2 3 4 5
##  $ Score: num  95 82 88 74 91
##  - attr(*, ".internal.selfref")=<externalptr> 
## NULL

Introduction to dt[i, j, by] syntax

  • dt: The data.table object
  • i: Row operations (filter rows)
  • j: Column operations (select, compute, modify columns)
  • by: Group by operations

Select rows by index

dt <- data.table(ID = 1:5, 
                 Name = c("Alice", "Bob", "Charlie", "David", "Eva"))

# select the first 3 rows
print(dt[1:3])
##       ID    Name
##    <int>  <char>
## 1:     1   Alice
## 2:     2     Bob
## 3:     3 Charlie

Select rows by logic

dt <- data.table(ID = 1:5, 
                 Name = c("Alice", "Bob", "Charlie", "David", "Eva"))

# select rows where ID is less than 4
print(dt[ID < 4])
##       ID    Name
##    <int>  <char>
## 1:     1   Alice
## 2:     2     Bob
## 3:     3 Charlie

Select rows by set

dt <- data.table(ID = 1:5, 
                 Name = c("Alice", "Bob", "Charlie", "David", "Eva"))

# select rows where ID is 1, 3, or 5
print(dt[ID %in% c(1, 3, 5)])
##       ID    Name
##    <int>  <char>
## 1:     1   Alice
## 2:     3 Charlie
## 3:     5     Eva

Select or compute on columns

dt <- data.table(ID = 1:5, 
                 Name = c("Alice", "Bob", "Charlie", "David", "Eva"))

# Select the 'Name' column as vector
print(dt[, Name])
## [1] "Alice"   "Bob"     "Charlie" "David"   "Eva"

Select or compute on columns

dt <- data.table(ID = 1:5, 
                 Name = c("Alice", "Bob", "Charlie", "David", "Eva"))

# Select the 'Name' column as data.table
print(dt[, .(Name)])
##       Name
##     <char>
## 1:   Alice
## 2:     Bob
## 3: Charlie
## 4:   David
## 5:     Eva

Select or compute on columns

df <- data.table(ID = 1:5, 
                 Group = c("A", "B", "A", "B", "A"), 
                 Score = c(95, 82, 88, 74, 91))

# Compute the mean Score and return as vector
df[, mean(Score)]
## [1] 86

Select or compute on columns

df <- data.table(ID = 1:5, 
                 Group = c("A", "B", "A", "B", "A"), 
                 Score = c(95, 82, 88, 74, 91))

# Compute the mean Score and return as data.table
df[, .(mean(Score))]
##       V1
##    <num>
## 1:    86

Select or compute on columns

df <- data.table(ID = 1:5, 
                 Group = c("A", "B", "A", "B", "A"), 
                 Score = c(95, 82, 88, 74, 91))

# Compute the mean Score and return as data.table with a new
# column name `Mean_Score`
df[, .(Mean_Score = mean(Score))]
##    Mean_Score
##         <num>
## 1:         86

Combine i, j, and by

df <- data.table(ID = 1:5, 
                 Group = c("A", "B", "A", "B", "A"), 
                 Score = c(95, 82, 88, 74, 91))

# Calculate mean Score by Group for IDs less than 5
print(df[ID < 5, .(Mean_Score = mean(Score)), by = Group])
##     Group Mean_Score
##    <char>      <num>
## 1:      A       91.5
## 2:      B       78.0