The data.table
package in R is designed for efficient data manipulation, aggregation, and analysis. It extends the data.frame
with syntax that allows concise and fast operations.
2024
data.table
The data.table
package in R is designed for efficient data manipulation, aggregation, and analysis. It extends the data.frame
with syntax that allows concise and fast operations.
data.table
install.packages("data.table")
library(data.table)
data.table
manuallyYou can create a data.table
directly using the data.table()
function:
dt <- data.table(ID = 1:5, Name = c("Alice", "Bob", "Charlie", "David", "Eva")) print(dt)
## ID Name ## <int> <char> ## 1: 1 Alice ## 2: 2 Bob ## 3: 3 Charlie ## 4: 4 David ## 5: 5 Eva
fread
fname <- "data.csv" dt_from_file <- fread(fname) print(dt_from_file)
data.csv
in your working directory. The file should contain the following:column_1,column_2,column_3 1,2,3 4,5,6 7,8,9
fread
data <- "ID,Name,Score 1,Alice,95 2,Bob,82 3,Charlie,88 4,David,74 5,Eva,91" dt_from_file <- fread(data) print(dt_from_file)
## ID Name Score ## <int> <char> <int> ## 1: 1 Alice 95 ## 2: 2 Bob 82 ## 3: 3 Charlie 88 ## 4: 4 David 74 ## 5: 5 Eva 91
data.frame
to a data.table
with as.data.table
df <- data.frame(ID = 1:5, Score = c(95, 82, 88, 74, 91)) print(str(df))
## 'data.frame': 5 obs. of 2 variables: ## $ ID : int 1 2 3 4 5 ## $ Score: num 95 82 88 74 91 ## NULL
df <- as.data.table(df) print(str(df))
## Classes 'data.table' and 'data.frame': 5 obs. of 2 variables: ## $ ID : int 1 2 3 4 5 ## $ Score: num 95 82 88 74 91 ## - attr(*, ".internal.selfref")=<externalptr> ## NULL
data.table
in place with setDT
df <- data.frame(ID = 1:5, Score = c(95, 82, 88, 74, 91)) # operates in place so no assignment needed setDT(df) print(str(df))
## Classes 'data.table' and 'data.frame': 5 obs. of 2 variables: ## $ ID : int 1 2 3 4 5 ## $ Score: num 95 82 88 74 91 ## - attr(*, ".internal.selfref")=<externalptr> ## NULL
dt[i, j, by]
syntaxdt
: The data.table
objecti
: Row operations (filter rows)j
: Column operations (select, compute, modify columns)by
: Group by operationsdt <- data.table(ID = 1:5, Name = c("Alice", "Bob", "Charlie", "David", "Eva")) # select the first 3 rows print(dt[1:3])
## ID Name ## <int> <char> ## 1: 1 Alice ## 2: 2 Bob ## 3: 3 Charlie
dt <- data.table(ID = 1:5, Name = c("Alice", "Bob", "Charlie", "David", "Eva")) # select rows where ID is less than 4 print(dt[ID < 4])
## ID Name ## <int> <char> ## 1: 1 Alice ## 2: 2 Bob ## 3: 3 Charlie
dt <- data.table(ID = 1:5, Name = c("Alice", "Bob", "Charlie", "David", "Eva")) # select rows where ID is 1, 3, or 5 print(dt[ID %in% c(1, 3, 5)])
## ID Name ## <int> <char> ## 1: 1 Alice ## 2: 3 Charlie ## 3: 5 Eva
dt <- data.table(ID = 1:5, Name = c("Alice", "Bob", "Charlie", "David", "Eva")) # Select the 'Name' column as vector print(dt[, Name])
## [1] "Alice" "Bob" "Charlie" "David" "Eva"
dt <- data.table(ID = 1:5, Name = c("Alice", "Bob", "Charlie", "David", "Eva")) # Select the 'Name' column as data.table print(dt[, .(Name)])
## Name ## <char> ## 1: Alice ## 2: Bob ## 3: Charlie ## 4: David ## 5: Eva
df <- data.table(ID = 1:5, Group = c("A", "B", "A", "B", "A"), Score = c(95, 82, 88, 74, 91)) # Compute the mean Score and return as vector df[, mean(Score)]
## [1] 86
df <- data.table(ID = 1:5, Group = c("A", "B", "A", "B", "A"), Score = c(95, 82, 88, 74, 91)) # Compute the mean Score and return as data.table df[, .(mean(Score))]
## V1 ## <num> ## 1: 86
df <- data.table(ID = 1:5, Group = c("A", "B", "A", "B", "A"), Score = c(95, 82, 88, 74, 91)) # Compute the mean Score and return as data.table with a new # column name `Mean_Score` df[, .(Mean_Score = mean(Score))]
## Mean_Score ## <num> ## 1: 86
i
, j
, and by
df <- data.table(ID = 1:5, Group = c("A", "B", "A", "B", "A"), Score = c(95, 82, 88, 74, 91)) # Calculate mean Score by Group for IDs less than 5 print(df[ID < 5, .(Mean_Score = mean(Score)), by = Group])
## Group Mean_Score ## <char> <num> ## 1: A 91.5 ## 2: B 78.0