# Load libraries ----
library(data.table)
library(ggplot2)

# NOTE(review): removed `rm(list = ls())` here — wiping the global
# environment inside a script is an anti-pattern (it hides stale-state
# bugs and deletes the user's objects). Restart R for a clean session.

# Colour scheme ----
COL <- c("#2271B2", "#E69F00", "#D55E00")
names(COL) <- c("blue", "orange", "red")

# Global ggplot2 theme and geom defaults ----
theme_set(
  theme_minimal(base_size = 13) +
    theme(
      panel.grid.minor = element_blank(),
      strip.text = element_text(face = "bold"),
      legend.position = "bottom"
    )
)
update_geom_defaults("point", list(size = 2))
update_geom_defaults("line", list(linewidth = 0.8))
Two of the most widely used tools for examining relationships between continuous variables are correlation and simple linear regression.
Correlation tells you how strongly two variables are linearly associated. Simple linear regression goes further by fitting a line that predicts one variable from another.
In this tutorial, you will work with a human Serial Reaction Time
(SRT) dataset. The key idea is that you must first create one
independent summary row per participant before running
inferential analyses such as cor.test() and
lm().
Thirty adults completed a 15-day SRT task. On each day, they completed blocks from a predictable condition and a random condition. Sequence learning should lead to a growing response-time advantage for the predictable condition.
# Read the block-level SRT data: one row per block per participant.
d <- fread("data/human_srt_summary.csv")
# Inspect structure — 1800 rows (30 participants x 15 days x 4 blocks
# per day, per the printed output below).
str(d)
## Classes 'data.table' and 'data.frame': 1800 obs. of 7 variables:
## $ sub_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ group : chr "young" "young" "young" "young" ...
## $ day : int 1 1 1 1 2 2 2 2 3 3 ...
## $ block : int 1 2 3 4 5 6 7 8 9 10 ...
## $ sequence_type: chr "predictable" "random" "predictable" "random" ...
## $ mean_rt : num 409 413 391 381 384 ...
## $ prop_correct : num 0.954 0.881 0.899 0.973 0.998 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Preview the first few block-level rows.
head(d)
## sub_id group day block sequence_type mean_rt prop_correct
## <int> <char> <int> <int> <char> <num> <num>
## 1: 1 young 1 1 predictable 409.17 0.9539
## 2: 1 young 1 2 random 412.53 0.8807
## 3: 1 young 1 3 predictable 390.69 0.8987
## 4: 1 young 1 4 random 381.13 0.9730
## 5: 1 young 2 5 predictable 384.26 0.9983
## 6: 1 young 2 6 random 403.97 0.9310
Each row here is a block, not a participant. That means the raw dataset is not yet at the right level for correlation or regression.
First, aggregate to one value per participant per day and condition.
# Average the block-level RTs so there is exactly one mean RT per
# participant x day x condition cell.
d_day <- d[
  ,
  .(mean_rt = mean(mean_rt)),
  by = .(sub_id, group, day, sequence_type)
]
head(d_day)
## sub_id group day sequence_type mean_rt
## <int> <char> <int> <char> <num>
## 1: 1 young 1 predictable 399.930
## 2: 1 young 1 random 396.830
## 3: 1 young 2 predictable 372.545
## 4: 1 young 2 random 399.940
## 5: 1 young 3 predictable 365.050
## 6: 1 young 3 random 387.900
Next, reshape to wide format so each row gives predictable and random mean RT for a participant on a given day.
# One row per participant x day, with the predictable and random
# condition RTs side by side.
d_wide <- dcast(d_day,
                sub_id + group + day ~ sequence_type,
                value.var = "mean_rt")
# Sequence-learning advantage: positive = faster on predictable blocks.
d_wide[, rt_advantage := random - predictable]
head(d_wide)
## Key: <sub_id, group, day>
## sub_id group day predictable random rt_advantage
## <int> <char> <int> <num> <num> <num>
## 1: 1 young 1 399.930 396.830 -3.100
## 2: 1 young 2 372.545 399.940 27.395
## 3: 1 young 3 365.050 387.900 22.850
## 4: 1 young 4 344.490 401.780 57.290
## 5: 1 young 5 325.080 381.830 56.750
## 6: 1 young 6 328.155 402.425 74.270
Now create one participant-level summary row. For each participant, compute:
- baseline_predictable - predictable RT on Day 1
- final_predictable - predictable RT on Day 15
- improvement_rt - how much faster the participant became from Day 1 to Day 15
- mean_advantage - the participant’s average random minus predictable RT

d_sub <- d_wide[, .(
baseline_predictable = predictable[day == 1],
final_predictable = predictable[day == 15],
improvement_rt = predictable[day == 1] - predictable[day == 15],
mean_advantage = mean(rt_advantage)
), by = .(sub_id, group)]
d_sub
## Key: <sub_id, group>
## sub_id group baseline_predictable final_predictable improvement_rt mean_advantage
## <int> <char> <num> <num> <num> <num>
## 1: 1 young 399.930 303.965 95.965 71.07133
## 2: 2 young 443.505 371.300 72.205 43.01133
## 3: 3 young 454.285 332.665 121.620 88.77167
## 4: 4 young 390.200 271.890 118.310 80.26633
## 5: 5 young 426.510 279.485 147.025 85.37767
## 6: 6 young 449.960 357.060 92.900 50.51700
## 7: 7 young 392.775 333.730 59.045 49.93167
## 8: 8 young 403.910 263.760 140.150 92.74000
## 9: 9 young 381.245 318.760 62.485 62.46733
## 10: 10 young 417.945 346.925 71.020 54.44633
## 11: 11 young 485.445 348.550 136.895 80.81833
## 12: 12 young 394.975 285.245 109.730 90.46767
## 13: 13 young 440.365 307.020 133.345 95.65267
## 14: 14 young 473.235 331.785 141.450 95.13600
## 15: 15 young 412.685 307.970 104.715 67.33367
## 16: 16 older 522.980 458.275 64.705 76.25833
## 17: 17 older 617.525 537.320 80.205 64.38700
## 18: 18 older 610.785 536.530 74.255 42.23167
## 19: 19 older 558.700 456.315 102.385 56.84800
## 20: 20 older 559.900 455.775 104.125 70.65967
## 21: 21 older 553.085 470.135 82.950 47.35467
## 22: 22 older 558.700 469.035 89.665 65.65267
## 23: 23 older 624.075 539.745 84.330 46.80967
## 24: 24 older 625.835 549.225 76.610 49.92067
## 25: 25 older 483.150 430.585 52.565 47.92867
## 26: 26 older 487.795 434.380 53.415 61.19467
## 27: 27 older 554.960 531.410 23.550 22.36400
## 28: 28 older 612.415 509.825 102.590 68.59067
## 29: 29 older 680.000 625.330 54.670 38.79767
## 30: 30 older 517.485 455.660 61.825 78.02733
## sub_id group baseline_predictable final_predictable improvement_rt mean_advantage
At this point, each row is one participant. That is the correct unit of analysis for the correlation and regression below.
Before computing statistics, make a scatter plot.
# Participant-level scatter: improvement across training plotted
# against mean sequence advantage, coloured by age group.
ggplot(d_sub, aes(x = mean_advantage, y = improvement_rt, colour = group)) +
  geom_point(size = 2, alpha = 0.8) +
  scale_colour_manual(values = COL) +
  labs(
    title = "Do participants with larger sequence advantages improve more?",
    x = "Mean RT advantage (random - predictable, ms)",
    y = "Improvement from Day 1 to Day 15 (ms)",
    colour = "Group"
  )
Participant-level relationship between sequence advantage and improvement across training.
If the relationship is positive, participants with larger overall sequence advantages also tend to show larger improvements across training.
Pearson’s correlation coefficient r measures the
strength and direction of the linear association between two continuous
variables.
# Pearson correlation across all 30 independent participants
# (one summary row each).
cor.test(d_sub$mean_advantage, d_sub$improvement_rt)
##
## Pearson's product-moment correlation
##
## data: d_sub$mean_advantage and d_sub$improvement_rt
## t = 7.0434, df = 28, p-value = 1.162e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6169511 0.9004244
## sample estimates:
## cor
## 0.7995136
How to read the output:
r tells you the direction and strength of the linear association.

You can also look at the relationship separately within each group.
# Within-group correlation for the young group (descriptive; n = 15).
d_sub[group == "young", cor(mean_advantage, improvement_rt)]
## [1] 0.870716
# Within-group correlation for the older group (descriptive; n = 15).
d_sub[group == "older", cor(mean_advantage, improvement_rt)]
## [1] 0.5105794
These grouped values are useful descriptively, but keep your main interpretation focused on the participant-level analysis using all 30 independent participants.
Correlation tells you whether two variables are related. Regression tells you how much the outcome changes for a one-unit increase in the predictor.
Here, fit a model predicting improvement_rt from
mean_advantage.
# Simple linear regression: predict training improvement from the
# participant's mean sequence advantage.
fit <- lm(improvement_rt ~ mean_advantage, data = d_sub)
summary(fit)
##
## Call:
## lm(formula = improvement_rt ~ mean_advantage, data = d_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.046 -10.665 4.011 12.432 29.471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.075 12.618 0.402 0.691
## mean_advantage 1.317 0.187 7.043 1.16e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.09 on 28 degrees of freedom
## Multiple R-squared: 0.6392, Adjusted R-squared: 0.6263
## F-statistic: 49.61 on 1 and 28 DF, p-value: 1.162e-07
Interpret the output like this:
- the intercept is the predicted improvement when mean_advantage = 0
- the slope is the predicted change in improvement for each 1 ms increase in mean_advantage

Add the fitted line to the scatter plot.
# Scatter plot with the fitted regression line; geom_smooth(method =
# "lm") refits the same simple regression and draws its 95% band.
ggplot(d_sub, aes(x = mean_advantage, y = improvement_rt)) +
  geom_point(alpha = 0.7, colour = COL[1]) +
  geom_smooth(method = "lm", se = TRUE, colour = COL[2]) +
  labs(
    title = "Regression of improvement on sequence advantage",
    x = "Mean RT advantage (random - predictable, ms)",
    y = "Improvement from Day 1 to Day 15 (ms)"
  )
Participant-level regression line with 95% confidence band.
Before interpreting a regression, inspect the residuals.
# Collect fitted values and residuals from the model for diagnostics.
d_diag <- data.table(fitted = fitted(fit), resid = resid(fit))
# Residuals vs fitted: an unstructured cloud around zero supports the
# linearity and constant-variance assumptions.
ggplot(d_diag, aes(x = fitted, y = resid)) +
  geom_point(colour = COL[1], alpha = 0.5) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    x = "Fitted values",
    y = "Residuals",
    title = "Residuals vs fitted"
  )
# Q-Q plot of residuals: points near the reference line support the
# normality assumption.
ggplot(d_diag, aes(sample = resid)) +
  stat_qq(colour = COL[1]) +
  stat_qq_line(colour = COL[2]) +
  labs(
    x = "Theoretical quantiles",
    y = "Sample quantiles",
    title = "Q-Q plot of residuals"
  )
Look for:

- residuals scattered evenly around zero, with no curve or funnel shape
- Q-Q points lying close to the reference line
You can also compare the two age groups on
mean_advantage.
# Descriptive summary of the sequence advantage for each age group.
d_sub[, .(
  n = .N,
  mean_advantage = mean(mean_advantage),
  sd_advantage = sd(mean_advantage)
), by = group]
## group n mean_advantage sd_advantage
## <char> <int> <num> <num>
## 1: young 15 73.86727 18.18650
## 2: older 15 55.80169 15.44076
# Boxplots by group with individual participants overlaid as jittered
# points; boxplot outliers are suppressed so no point is drawn twice.
ggplot(d_sub, aes(x = group, y = mean_advantage, fill = group)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.12, alpha = 0.5) +
  scale_fill_manual(values = COL) +
  theme(legend.position = "none") +
  labs(
    title = "Sequence advantage by age group",
    x = "Group",
    y = "Mean RT advantage (ms)",
    fill = "Group"
  )
Mean sequence advantage by age group.
This section is descriptive. The main worked examples for correlation and regression are the participant-level analyses above.
Your assigned dataset is determined by your student ID number. Take
the last digit and compute last_digit %% 3 in R.
- 0: https://github.com/crossley/cogs2020/tree/main/final_project_data/cat_learn_switch
- 1: https://github.com/crossley/cogs2020/tree/main/final_project_data/cat_learn_auto
- 2: https://github.com/crossley/cogs2020/tree/main/final_project_data/cat_learn_unlearn

Load your assigned dataset and run a correlation and a simple regression. Specifically:

- aggregate to one independent summary row per participant
- test the association between two participant-level variables with cor.test()
- fit a simple linear regression between the same two variables with lm()