2024

Introduction to data.table

The data.table package in R is designed for efficient data manipulation, aggregation, and analysis. It extends the data.frame with syntax that allows concise and fast operations.

Installing and loading data.table

Installation

install.packages("data.table")

Loading the package

library(data.table)

Creating a data.table manually

You can create a data.table directly using the data.table() function:

dt <- data.table(ID = 1:5, 
                 Name = c("Alice", "Bob", "Charlie", "David", "Eva"))
print(dt)
##       ID    Name
##    <int>  <char>
## 1:     1   Alice
## 2:     2     Bob
## 3:     3 Charlie
## 4:     4   David
## 5:     5     Eva

Creating from a file using fread

fname <- "data.csv"
dt_from_file <- fread(fname)
print(dt_from_file)
  • For this example to work you’ll need to have a file called data.csv in your working directory. The file should contain the following:
column_1,column_2,column_3
1,2,3
4,5,6
7,8,9

Creating from a string using fread

data <- "ID,Name,Score
         1,Alice,95
         2,Bob,82
         3,Charlie,88
         4,David,74
         5,Eva,91"

dt_from_file <- fread(data)
print(dt_from_file)
##       ID    Name Score
##    <int>  <char> <int>
## 1:     1   Alice    95
## 2:     2     Bob    82
## 3:     3 Charlie    88
## 4:     4   David    74
## 5:     5     Eva    91

Convert a data.frame to a data.table with as.data.table

df <- data.frame(ID = 1:5, 
                 Score = c(95, 82, 88, 74, 91))
print(str(df))
## 'data.frame':    5 obs. of  2 variables:
##  $ ID   : int  1 2 3 4 5
##  $ Score: num  95 82 88 74 91
## NULL
df <- as.data.table(df)
print(str(df))
## Classes 'data.table' and 'data.frame':   5 obs. of  2 variables:
##  $ ID   : int  1 2 3 4 5
##  $ Score: num  95 82 88 74 91
##  - attr(*, ".internal.selfref")=<externalptr> 
## NULL

Convert to a data.tablein place with setDT

df <- data.frame(ID = 1:5, 
                 Score = c(95, 82, 88, 74, 91))

# operates in place so no assignment needed
setDT(df)

print(str(df))
## Classes 'data.table' and 'data.frame':   5 obs. of  2 variables:
##  $ ID   : int  1 2 3 4 5
##  $ Score: num  95 82 88 74 91
##  - attr(*, ".internal.selfref")=<externalptr> 
## NULL

Introduction to dt[i, j, by] syntax

  • dt: The data.table object
  • i: Row operations (filter rows)
  • j: Column operations (select, compute, modify columns)
  • by: Group by operations

Select rows by index

dt <- data.table(ID = 1:5, 
                 Name = c("Alice", "Bob", "Charlie", "David", "Eva"))

# select the first 3 rows
print(dt[1:3])
##       ID    Name
##    <int>  <char>
## 1:     1   Alice
## 2:     2     Bob
## 3:     3 Charlie

Select rows by logic

dt <- data.table(ID = 1:5, 
                 Name = c("Alice", "Bob", "Charlie", "David", "Eva"))

# select rows where ID is less than 4
print(dt[ID < 4])
##       ID    Name
##    <int>  <char>
## 1:     1   Alice
## 2:     2     Bob
## 3:     3 Charlie

Select rows by set

dt <- data.table(ID = 1:5, 
                 Name = c("Alice", "Bob", "Charlie", "David", "Eva"))

# select rows where ID is 1, 3, or 5
print(dt[ID %in% c(1, 3, 5)])
##       ID    Name
##    <int>  <char>
## 1:     1   Alice
## 2:     3 Charlie
## 3:     5     Eva

Select or compute on columns

dt <- data.table(ID = 1:5, 
                 Name = c("Alice", "Bob", "Charlie", "David", "Eva"))

# Select the 'Name' column as vector
print(dt[, Name])
## [1] "Alice"   "Bob"     "Charlie" "David"   "Eva"

Select or compute on columns

dt <- data.table(ID = 1:5, 
                 Name = c("Alice", "Bob", "Charlie", "David", "Eva"))

# Select the 'Name' column as data.table
print(dt[, .(Name)])
##       Name
##     <char>
## 1:   Alice
## 2:     Bob
## 3: Charlie
## 4:   David
## 5:     Eva

Select or compute on columns

df <- data.table(ID = 1:5, 
                 Group = c("A", "B", "A", "B", "A"), 
                 Score = c(95, 82, 88, 74, 91))

# Compute the mean Score and return as vector
df[, mean(Score)]
## [1] 86

Select or compute on columns

df <- data.table(ID = 1:5, 
                 Group = c("A", "B", "A", "B", "A"), 
                 Score = c(95, 82, 88, 74, 91))

# Compute the mean Score and return as data.table
df[, .(mean(Score))]
##       V1
##    <num>
## 1:    86

Select or compute on columns

df <- data.table(ID = 1:5, 
                 Group = c("A", "B", "A", "B", "A"), 
                 Score = c(95, 82, 88, 74, 91))

# Compute the mean Score and return as data.table with a new
# column name `Mean_Score`
df[, .(Mean_Score = mean(Score))]
##    Mean_Score
##         <num>
## 1:         86

Combine i, j, and by

df <- data.table(ID = 1:5, 
                 Group = c("A", "B", "A", "B", "A"), 
                 Score = c(95, 82, 88, 74, 91))

# Calculate mean Score by Group for IDs less than 5
print(df[ID < 5, .(Mean_Score = mean(Score)), by = Group])
##     Group Mean_Score
##    <char>      <num>
## 1:      A       91.5
## 2:      B       78.0