Get lots of hands-on practice with data.table
.
You should pay very careful attention to learn about: *
data.table
chaining * Adding columns using :=
* Selecting rows with i
, operating on columns with
j
, and grouping with by
using
data.table
syntax [i, j, by]
# load data.table library
library(data.table)
# clean session
rm(list=ls())
# Please include the following line of code as well
set.seed(0)
# create vectors from which to build d
V1 <- rep(month.abb[1:8], 2) #month.abb is a built in vector
V2 <- rep(letters[1:8], 2) #letters is a bult in vector
V3 <- rnorm(length(V1))
# create a data.table named d
d <- data.table(month=V1, event=V2, obs=V3)
A). Use the data.table
keyword .N
to report
how many rows are in d
and store the result in a variable
named ans_1a
.
ans_1a <- d[, .N]
B). Return a data.table
containing only the rows of
d
for which month=="Feb"
and
event=="c"
and store the result in a variable named
ans_1b
.
ans_1b <- d[month == 'Feb' & event == 'c']
C). Using the %in%
operator, return a
data.table
containing only the rows of d
for
which month
is in the vector
c("Jan", "Mar", "May", "Jul")
, and store the result in a
variable named ans_1c
.
ans_1c <- d[month %in% c('Jan', 'Mar', 'May', 'Jul')]
A). Return a vector
containing the month
column of d
and store the result in a variable named
ans_2a
.
ans_2a <- d[, month]
B). Return a data.table
containing only the
month
and event
columns of d
and
store the result in a variable named ans_2b
.
ans_2b <- d[, .(month, event)]
C). Return the mean and standard deviation of the obs
column across all rows in d
and store the result in a
data.table
named ans_2c
. The resulting
data.table
should have 1
observation with
columns named obs_mean
and obs_sd
.
ans_2c <- d[, .(obs_mean = mean(obs), obs_sd = sd(obs))]
A). Return the mean obs
grouped by event
and grouped by month
and return the result in a
data.table
named ans_3a
. Name the column of
ans_3a
that contains the mean of the obs
column obs_mean
.
ans_3a <- d[, .(obs_mean = mean(obs)), .(event, month)]
A). Select the first 8 rows of d
and compute the median
value of the obs
column grouped by month
and
event
and return the result in a data.table
named ans_4a
. Name the column of ans_4a
that
contains the median of the obs
column
obs_median
.
ans_4a <- d[1:8, .(obs_median = median(obs)), .(month, event)]
D <- data.table(EmployeeID = 101:105,
Department = c("HR", "IT", "Finance", "Marketing", "Sales"))
A). Add a new column called Salary with values (50000, 60000, 55000, 52000, 58000)
D[, Salary := c(50000, 60000, 55000, 52000, 58000)]
# the := operator modifies D directly without creating a new copy.
B). Increase salary by 10%
D[, Salary := Salary * 1.10]
C). Create a new column called Bonus such that: If Salary is greater than or equal to 55000, Bonus should be “High” Otherwise, Bonus should be “Low”
D[, Bonus := ifelse(Salary >= 55000, "High", "Low")]
D). create a new column EmployeesPerDept that stores the number of employees in each Department.
D[, EmployeesPerDept := .N, by = Department]
products <- data.table(ProductID = 101:105,
ProductName = c("Laptop", "Phone", "Tablet", "Monitor", "Keyboard"))
sales <- data.table(ProductID = c(102, 103, 105, 106),
UnitsSold = c(50, 30, 20, 40))
A). Perform a left join to merge products with sales by ProductID, ensuring all rows from products are retained. Store the result in ans_6a.
ans_6a <- merge(products, sales, by = "ProductID", all.x = TRUE)
# If a ProductID exists in products but not in sales (e.g., ProductID 101 and 104), UnitsSold will be NA.
B). Perform a right join to merge products with sales, ensuring all rows from sales are retained. Store the result in ans_6b.
ans_6b <- merge(products, sales, by = "ProductID", all.y = TRUE)
# If a ProductID exists in sales but not in products (e.g., ProductID 106), ProductName will be NA.
D <- data.table(Student = c("A", "B", "C"), Math = c(85, 78, 92), Science = c(90, 82, 95), English = c(88, 80, 94))
A). Convert d from wide format to long format, where: Student remains as the identifier column (id.vars). The column names Math, Science, and English should be stored in a column called “Subject”. The values should be stored in a column named “Score”. Store the result in ans_7a.
ans_7a <- melt(D, id.vars = "Student",
variable.name = "Subject",
value.name = "Score")
DT <- data.table(
ID = 1:5,
Name = c("Alice", "Bob", "Charlie", "David", "Eva"),
Score = c(85, 92, 88, 94, 90)
)
A). Sort DT by Score in descending order. Select the top 3 rows. Compute the maximum score in these rows and store it in a new column/datatable “TopScore”. Store the result in ans_8a.
ans_8a <- DT[order(-Score)][1:3][, .(TopScore = max(Score))]
# DT[order(-Score)] sorts rows by Score in descending order.
# [1:3] selects the top 3 rows.
# [, .(TopScore = max(Score))] computes the maximum score from the selected rows.
B). Filter DT to include only rows where Score >= 90. Select only the columns Name and Score. Sort the result in descending order of Score. Store the result in ans_8b
ans_8b <- DT[Score >= 90, .(Name, Score)][order(-Score)]
# DT[Score >= 90] filters only scores 90 and above.
# [, .(Name, Score)] selects only Name and Score columns.
# [order(-Score)] sorts results in descending order.
C). Compute the mean Score of all employees. Create a new column AdjustedScore where each score is increased by 5% of the mean score. Store the result in ans_8c.
ans_8c <- DT[, .(MeanScore = mean(Score))][, AdjustedScore := MeanScore * 1.05]
# [, .(MeanScore = mean(Score))] computes the mean Score across all rows.
# [, AdjustedScore := MeanScore * 1.05] increases the mean score by 5% and stores it in AdjustedScore.
D). Group DT by Name. Compute the average score for each person. Sort the result in descending order of AverageScore. Store the result in ans_8d
ans_8d <- DT[, .(AverageScore = mean(Score)), by = Name][order(-AverageScore)]
DT[, AverageScore := mean(Score), .(Name)][order(-AverageScore)]
# [, .(AverageScore = mean(Score)), by = Name] computes average score per Name (though redundant here with unique names).
# [order(-AverageScore)] sorts in descending order.