The data.table package in R is designed for efficient data manipulation, aggregation, and analysis. It extends the data.frame with syntax that allows concise and fast operations.
2024
Introduction to data.table
The data.table package in R is designed for efficient data manipulation, aggregation, and analysis. It extends the data.frame with syntax that allows concise and fast operations.
Installing and loading data.table
install.packages("data.table")
library(data.table)
## Warning: package 'data.table' was built under R version 4.3.3
Creating a data.table manually
You can create a data.table directly using the data.table() function:
dt <- data.table(ID = 1:5,
Name = c("Alice", "Bob", "Charlie", "David", "Eva"))
print(dt)
##       ID    Name
##    <int>  <char>
## 1:     1   Alice
## 2:     2     Bob
## 3:     3 Charlie
## 4:     4   David
## 5:     5     Eva
Reading a file with fread
fname <- "data.csv"
dt_from_file <- fread(fname)
print(dt_from_file)
This example assumes that there is a file named data.csv in your working directory. The file should contain the following:
column_1,column_2,column_3
1,2,3
4,5,6
7,8,9
Reading from a string with fread
data <- "ID,Name,Score
1,Alice,95
2,Bob,82
3,Charlie,88
4,David,74
5,Eva,91"
dt_from_file <- fread(data)
print(dt_from_file)
##       ID    Name Score
##    <int>  <char> <int>
## 1:     1   Alice    95
## 2:     2     Bob    82
## 3:     3 Charlie    88
## 4:     4   David    74
## 5:     5     Eva    91
Converting a data.frame to a data.table with as.data.table
df <- data.frame(ID = 1:5,
Score = c(95, 82, 88, 74, 91))
print(str(df))
## 'data.frame': 5 obs. of 2 variables:
##  $ ID   : int 1 2 3 4 5
##  $ Score: num 95 82 88 74 91
## NULL
df <- as.data.table(df)
print(str(df))
## Classes 'data.table' and 'data.frame': 5 obs. of 2 variables:
##  $ ID   : int 1 2 3 4 5
##  $ Score: num 95 82 88 74 91
##  - attr(*, ".internal.selfref")=<externalptr>
## NULL
Converting a data.frame to a data.table in place with setDT
df <- data.frame(ID = 1:5,
Score = c(95, 82, 88, 74, 91))
# operates in place so no assignment needed
setDT(df)
print(str(df))
## Classes 'data.table' and 'data.frame': 5 obs. of 2 variables:
##  $ ID   : int 1 2 3 4 5
##  $ Score: num 95 82 88 74 91
##  - attr(*, ".internal.selfref")=<externalptr>
## NULL
The dt[i, j, by] syntax
dt: The data.table object
i: Row operations (filter rows)
j: Column operations (select, compute, modify columns)
by: Group by operations
dt <- data.table(ID = 1:5,
Name = c("Alice", "Bob", "Charlie", "David", "Eva"))
# select the first 3 rows
print(dt[1:3])
##       ID    Name
##    <int>  <char>
## 1:     1   Alice
## 2:     2     Bob
## 3:     3 Charlie
dt <- data.table(ID = 1:5,
Name = c("Alice", "Bob", "Charlie", "David", "Eva"))
# select rows where ID is less than 4
print(dt[ID < 4])
##       ID    Name
##    <int>  <char>
## 1:     1   Alice
## 2:     2     Bob
## 3:     3 Charlie
dt <- data.table(ID = 1:5,
Name = c("Alice", "Bob", "Charlie", "David", "Eva"))
# select rows where ID is 1, 3, or 5
print(dt[ID %in% c(1, 3, 5)])
##       ID    Name
##    <int>  <char>
## 1:     1   Alice
## 2:     3 Charlie
## 3:     5     Eva
dt <- data.table(ID = 1:5,
Name = c("Alice", "Bob", "Charlie", "David", "Eva"))
# Select the 'Name' column as vector
print(dt[, Name])
## [1] "Alice" "Bob" "Charlie" "David" "Eva"
dt <- data.table(ID = 1:5,
Name = c("Alice", "Bob", "Charlie", "David", "Eva"))
# Select the 'Name' column as data.table
print(dt[, .(Name)])
##       Name
##     <char>
## 1:   Alice
## 2:     Bob
## 3: Charlie
## 4:   David
## 5:     Eva
df <- data.table(ID = 1:5,
Group = c("A", "B", "A", "B", "A"),
Score = c(95, 82, 88, 74, 91))
# Compute the mean Score and return as vector
df[, mean(Score)]
## [1] 86
df <- data.table(ID = 1:5,
Group = c("A", "B", "A", "B", "A"),
Score = c(95, 82, 88, 74, 91))
# Compute the mean Score and return as data.table
df[, .(mean(Score))]
##       V1
##    <num>
## 1:    86
df <- data.table(ID = 1:5,
Group = c("A", "B", "A", "B", "A"),
Score = c(95, 82, 88, 74, 91))
# Compute the mean Score and return as data.table with a new
# column name `Mean_Score`
df[, .(Mean_Score = mean(Score))]
##    Mean_Score
##         <num>
## 1:         86
Combining i, j, and by
df <- data.table(ID = 1:5,
Group = c("A", "B", "A", "B", "A"),
Score = c(95, 82, 88, 74, 91))
# Calculate mean Score by Group for IDs less than 5
print(df[ID < 5, .(Mean_Score = mean(Score)), by = Group])
##     Group Mean_Score
##    <char>      <num>
## 1:      A       91.5
## 2:      B       78.0