(from Richard McElreath, Statistical Rethinking, 2nd edition)
Purpose of simulation: show that "controlling" for marriage status creates a correlation between age and happiness, even though we know there is no correlation.
Pseudocode:
for t in 1:N:
for p in 1:20:
u ~ uniform(0, 1)
add person (yob=t, id=p, happyness=u,
marriageYear=-1, active=1) to data
end
for (y, i, h, m, a) in data:
if a == 1 and t-y >= 65:
a <- 0
end
if a == 1 and t-y >= 18 and m == -1:
x ~ uniform(0, 1)
if x < h
m = t
end
end
end
end
options(repr.plot.width=5, repr.plot.height=5)
N = 1000
d <- c()
for (t in 1:N) {
for (p in 1:20) {
u <- runif(1)
d <- rbind(d, c(t, p, u, -1, 1))
}
for (k in 1:nrow(d)) {
if (d[k, 5] == 1 & t-d[k, 1] >= 65) {
d[k, 5] <- 0
}
if (d[k, 5] ==1 & t-d[k, 1] >= 18 & d[k, 4] == -1) {
x <- runif(1)
if (x < d[k, 3]) {
d[k, 4] <- t
}
}
}
}
d <- data.frame(d)
colnames(d) = c('yearOfBirth', 'id', 'happyness', 'yearOfMarriage', 'active')
d
d$ageOfMarriage <- d$yearOfMarriage - d$yearOfBirth
d$ageOfMarriage[d$yearOfMarriage == -1] <- -1
hist(d[d$yearOfMarriage > 0,]$ageOfMarriage, freq=F, nclass=30,
xlab='age of marriage', main='Distribution of age of marriage')
d
contains all relevant information. Let us draw some random data points from this log.
drawSample <- function() {
yob <- sample(N, 1) ### uniformly distributed random integer from 1..N
p <- sample(20, 1) ### uniformly distributed random integer from 1..20
s <- d[(d$yearOfBirth == yob) & (d$id == p),]
age <- sample(0:65, 1) ### uniformly distributed random integer from 0..65
married <- 0
if ((yob+age > as.numeric(s$yearOfMarriage)) & (as.numeric(s$yearOfMarriage) > -1)) {
married <- 1
}
return(c(age, s$happyness, c('single', 'married')[1+married]))
}
df <- data.frame(t(replicate(10000, drawSample())))
colnames(df) <- c('age', 'happyness', 'married')
df$age <- as.numeric(as.character(df$age))
df$happyness <- as.numeric(as.character(df$happyness))
plot(happyness~married, df)
plot(married ~ happyness, df)
plot(age~married, df)
plot(married~age, df)
plot(happyness~age, df, cex=.3)
summary(lm(happyness ~ age, df))
summary(lm(happyness ~ age + married, df))
library(ggplot2)
ggplot(df, aes(x=age, y=happyness, col=married)) +
geom_point(size=.1) +
geom_smooth(method='lm') +
geom_smooth(col='black', method='lm')