library(mosaic)
library(ggplot2)
library(dplyr)
library(psych)
We are going to be using the Big Five Inventory dataset, bfi, to demonstrate ANOVA. This dataset is contained in the psych package.
data(bfi)
Before we get into the ANOVA, we should first create all of our scale scores. We’ll check the item intercorrelations to identify negatively keyed items, reverse-score those items, check each scale’s reliability, and then compute the scale scores as item means.
corr.test(select(bfi, A1, A2, A3, A4, A5))
corr.test(select(bfi, C1, C2, C3, C4, C5))
corr.test(select(bfi, E1, E2, E3, E4, E5))
corr.test(select(bfi, N1, N2, N3, N4, N5))
corr.test(select(bfi, O1, O2, O3, O4, O5))
#Reverse-score the negatively keyed items: (min + max) - x flips the response scale
bfi <- bfi %>%
mutate(A1.r = (min(A1, na.rm = TRUE) + max(A1, na.rm = TRUE)) - A1,
C4.r = (min(C4, na.rm = TRUE) + max(C4, na.rm = TRUE)) - C4,
C5.r = (min(C5, na.rm = TRUE) + max(C5, na.rm = TRUE)) - C5,
E1.r = (min(E1, na.rm = TRUE) + max(E1, na.rm = TRUE)) - E1,
E2.r = (min(E2, na.rm = TRUE) + max(E2, na.rm = TRUE)) - E2,
O2.r = (min(O2, na.rm = TRUE) + max(O2, na.rm = TRUE)) - O2,
O5.r = (min(O5, na.rm = TRUE) + max(O5, na.rm = TRUE)) - O5)
alpha(select(bfi, A1.r, A2, A3, A4, A5))
alpha(select(bfi, C1, C2, C3, C4.r, C5.r))
alpha(select(bfi, E1.r, E2.r, E3, E4, E5))
alpha(select(bfi, N1, N2, N3, N4, N5))
alpha(select(bfi, O1, O2.r, O3, O4, O5.r))
bfi <- bfi %>%
mutate(agreeable = rowMeans(select(bfi, A1.r, A2, A3, A4, A5), na.rm = TRUE),
conscient = rowMeans(select(bfi, C1, C2, C3, C4.r, C5.r), na.rm = TRUE),
extrov = rowMeans(select(bfi, E1.r, E2.r, E3, E4, E5), na.rm = TRUE),
neurot = rowMeans(select(bfi, N1, N2, N3, N4, N5), na.rm = TRUE),
openness = rowMeans(select(bfi, O1, O2.r, O3, O4, O5.r), na.rm = TRUE)) %>%
filter(!is.na(education))
Then take a quick look at the data with glimpse().
glimpse(bfi)
## Observations: 2,577
## Variables: 40
## $ A1 <int> 6, 4, 4, 4, 4, 1, 2, 4, 1, 2, 2, 4, 2, 1, 4, 5, 1, 1...
## $ A2 <int> 6, 3, 3, 4, 5, 5, 6, 5, 6, 4, 5, 5, 5, 5, 4, 3, 6, 4...
## $ A3 <int> 5, 1, 6, 5, 2, 6, 5, 5, 6, 4, 1, 6, 6, 6, 4, 5, 4, 4...
## $ A4 <int> 6, 5, 3, 6, 2, 5, 6, 6, 1, 4, 3, 5, 6, 5, 4, 4, 6, 2...
## $ A5 <int> 5, 1, 3, 5, 1, 6, 5, 5, 6, 3, 5, 5, 6, 4, 4, 2, 4, 3...
## $ C1 <int> 6, 3, 6, 4, 5, 4, 3, 5, 5, 6, 5, 5, 5, 1, 4, 2, 5, 6...
## $ C2 <int> 6, 2, 6, 3, 5, 3, 5, 5, 2, 5, 4, 5, 5, 5, 3, 2, 6, 5...
## $ C3 <int> 6, 4, 3, 5, 5, 2, 6, 4, 5, 6, 5, 3, 5, 6, 3, 4, 3, 6...
## $ C4 <int> 1, 2, 4, 3, 2, 4, 3, 1, 1, 1, 2, 5, 2, 4, 3, 3, 1, 3...
## $ C5 <int> 3, 4, 5, 2, 2, 5, 6, 1, 1, 1, 5, 4, 4, 6, 4, 4, 5, 4...
## $ E1 <int> 2, 3, 5, 1, 3, 2, 2, 3, 1, 2, 1, 1, 1, 6, 2, 3, 6, 3...
## $ E2 <int> 1, 6, 3, 3, 4, 1, 2, 2, 1, 4, 2, 2, 2, 6, 3, 4, 6, 4...
## $ E3 <int> 6, 4, NA, 2, 3, 2, 4, 5, 6, 4, 6, 6, 4, 2, 4, 3, 3, ...
## $ E4 <int> 5, 2, 4, 5, 6, 5, 6, 5, 6, 2, 5, 5, 5, 1, 2, 2, 2, 3...
## $ E5 <int> 6, 1, 3, 4, 5, 2, 6, 6, 6, 6, 4, 5, 5, 1, 3, 3, 2, 5...
## $ N1 <int> 3, 6, 5, 3, 2, 2, 4, 2, 2, 3, 1, 5, 3, 1, NA, 5, 2, ...
## $ N2 <int> 5, 3, 5, 3, 4, 2, 4, 3, 3, 3, 4, 4, 2, 2, 2, 3, 2, 6...
## $ N3 <int> 2, 2, 2, 4, 2, 2, 4, 3, 1, 5, 2, 4, 4, 1, 1, 4, 2, 5...
## $ N4 <int> 2, 6, 3, 2, 2, 2, 6, 1, 2, 3, 2, 3, 1, 3, 2, 4, 4, 5...
## $ N5 <int> 3, 4, 3, 3, 3, 2, 6, 1, 1, 2, 5, 1, 2, 6, 2, 3, 1, 4...
## $ O1 <int> 4, 3, 6, 5, 5, 6, 6, 6, 6, 5, 2, 4, 5, 6, 4, 4, 5, 5...
## $ O2 <int> 3, 2, 6, 3, 2, 1, 1, 2, 4, 2, 4, 4, 2, 6, 3, 5, 5, 5...
## $ O3 <int> 5, 4, 6, 5, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 5, 4, 4, 4...
## $ O4 <int> 6, 5, 6, 6, 5, 5, 6, 6, 5, 6, 4, 5, 5, 6, 5, 4, 5, 5...
## $ O5 <int> 1, 3, 1, 3, 5, 2, 1, 2, 3, 1, 1, 1, 2, 1, 3, 3, 3, 2...
## $ gender <int> 2, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1...
## $ education <int> 3, 2, 1, 1, 1, 5, 2, 1, 3, 5, 3, 3, 3, 5, 3, 3, 3, 3...
## $ age <int> 21, 19, 19, 21, 17, 68, 27, 18, 20, 51, 33, 18, 41, ...
## $ A1.r <dbl> 1, 3, 3, 3, 3, 6, 5, 3, 6, 5, 5, 3, 5, 6, 3, 2, 6, 6...
## $ C4.r <dbl> 6, 5, 3, 4, 5, 3, 4, 6, 6, 6, 5, 2, 5, 3, 4, 4, 6, 4...
## $ C5.r <dbl> 4, 3, 2, 5, 5, 2, 1, 6, 6, 6, 2, 3, 3, 1, 3, 3, 2, 3...
## $ E1.r <dbl> 5, 4, 2, 6, 4, 5, 5, 4, 6, 5, 6, 6, 6, 1, 5, 4, 1, 4...
## $ E2.r <dbl> 6, 1, 4, 4, 3, 6, 5, 5, 6, 3, 5, 5, 5, 1, 4, 3, 1, 3...
## $ O2.r <dbl> 4, 5, 1, 4, 5, 6, 6, 5, 3, 5, 3, 3, 5, 1, 4, 2, 2, 2...
## $ O5.r <dbl> 6, 4, 6, 4, 2, 5, 6, 5, 4, 6, 6, 6, 5, 6, 4, 4, 4, 5...
## $ agreeable <dbl> 4.6, 2.6, 3.6, 4.6, 2.6, 5.6, 5.4, 4.8, 5.0, 4.0, 3....
## $ conscient <dbl> 5.6, 3.4, 4.0, 4.2, 5.0, 2.8, 3.8, 5.2, 4.8, 5.8, 4....
## $ extrov <dbl> 5.60, 2.40, 3.25, 4.20, 4.20, 4.00, 5.20, 5.00, 6.00...
## $ neurot <dbl> 3.00, 4.20, 3.60, 3.00, 2.60, 2.00, 4.80, 2.00, 1.80...
## $ openness <dbl> 5.0, 4.2, 5.0, 4.8, 4.4, 5.4, 5.8, 5.4, 4.6, 5.6, 4....
Let’s also take a look at the distributions of our new variables.
ggplot(bfi, aes(x = agreeable)) + geom_density()
ggplot(bfi, aes(x = conscient)) + geom_density()
ggplot(bfi, aes(x = extrov)) + geom_density()
ggplot(bfi, aes(x = neurot)) + geom_density()
ggplot(bfi, aes(x = openness)) + geom_density()
Yesterday we used t.test() to test for differences in conscientiousness between those who graduated college and those who did not. Note that I did not create the grouping variable again; I used a logical statement directly in the t.test() function.
t.test(conscient ~ (education > 3), data = bfi)
##
## Welch Two Sample t-test
##
## data: conscient by education > 3
## t = 1.8433, df = 1530.6, p-value = 0.06548
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.004768244 0.153480246
## sample estimates:
## mean in group FALSE mean in group TRUE
## 4.327394 4.253038
A better approach would be to check for differences with a one-way ANOVA.
Perform a one-way ANOVA for education level on conscientiousness. Let’s first look at the distributions of conscientiousness by education level. Note that first we’re changing the education variable’s type from integer to factor with the as.factor() function and giving nice labels to the factor levels.
bfi <- bfi %>%
mutate(education = as.factor(education),
education = factor(education, labels=c("HS",
"finished HS",
"some college",
"college graduate",
"graduate degree")))
Let’s take a look at the distributions.
ggplot(bfi, aes(x = education, y = conscient, fill = education)) +
geom_boxplot(alpha = .5)
Also, let’s get descriptives by education level.
#descriptives
bfi %>%
group_by(education) %>%
summarize(mean = mean(conscient, na.rm = TRUE),
sd = sd(conscient, na.rm = TRUE),
min = min(conscient, na.rm = TRUE),
max = max(conscient, na.rm = TRUE))
## # A tibble: 5 x 5
## education mean sd min max
## <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 HS 4.120685 0.9410469 1.4 6
## 2 finished HS 4.229110 0.9854484 1.2 6
## 3 some college 4.387443 0.9083686 1.0 6
## 4 college graduate 4.220305 0.9405653 1.4 6
## 5 graduate degree 4.283892 0.9795707 1.0 6
We can perform Levene’s test of the homogeneity of variance assumption with the leveneTest() function from the car package. Note also the use of the %$% operator from the magrittr package. It’s like the pipe, except that it “explodes” a dataset into a function, allowing naked variable names.
#install.packages("car")
library(car)
library(magrittr)
bfi %$%
leveneTest(conscient, education)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 4 1.3069 0.265
## 2572
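For comparison, an equivalent way to write the same test without %$% is to pass the two vectors to leveneTest() directly:
#Same Levene's test without the %$% operator
leveneTest(bfi$conscient, bfi$education)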
Now that we’ve checked our assumptions, we can finally run the one-way ANOVA.
mod1 <- aov(conscient ~ education, data = bfi)
summary(mod1)
## Df Sum Sq Mean Sq F value Pr(>F)
## education 4 20.8 5.198 5.921 9.67e-05 ***
## Residuals 2572 2257.6 0.878
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There are statistically significant differences in conscientiousness between people with different education levels, F(4, 2572) = 5.92, p < .001. We might want a bar graph for publication!
#A small companion dataset for making error bars
plotdata <- bfi %>%
group_by(education) %>%
summarise(mean = mean(conscient, na.rm = TRUE),
stdv = sd(conscient, na.rm = TRUE),
n = n()) %>%
mutate(se = stdv/sqrt(n))
#Making the Bar Graph
ggplot(plotdata, aes(x = education,
y = mean,
fill = education)) +
geom_bar(stat = "identity") +
geom_errorbar(aes(ymax = mean + se, ymin = mean - se), width = 0.25) +
labs(x = "Education Level", y = "Conscientiousness") +
ylim(0, 5) + #the scale is really from 1 to 6
ggtitle("Concientiousness by Education Level") +
scale_fill_discrete(guide = FALSE) +
theme_minimal()
Two-way ANOVA for gender by education level on conscientiousness. We can get favstats() split by another categorical variable with the | symbol (it’s above your return key). As the two calls below show, passing the second grouping variable as a separate argument gives the same result.
favstats(conscient ~ education, gender, data = bfi)
## gender min Q1 median Q3 max mean sd n
## 1 HS.1 2.4 3.60 4.200 5.0 6 4.229391 0.8581184 93
## 2 finished HS.1 1.2 3.40 4.000 4.8 6 4.004369 1.0175930 103
## 3 some college.1 1.0 3.60 4.225 5.0 6 4.256601 0.9418097 356
## 4 college graduate.1 2.0 3.40 4.000 4.8 6 4.114552 0.9602205 134
## 5 graduate degree.1 1.2 3.55 4.200 4.8 6 4.123026 1.0032708 152
## 6 HS.2 1.4 3.40 4.200 4.7 6 4.043511 0.9917234 131
## 7 finished HS.2 1.6 3.80 4.400 5.0 6 4.351587 0.9479388 189
## 8 some college.2 1.0 3.80 4.600 5.2 6 4.439604 0.8898754 893
## 9 college graduate.2 1.4 3.60 4.400 5.0 6 4.274808 0.9274295 260
## 10 graduate degree.2 1.0 3.80 4.400 5.0 6 4.375815 0.9555508 266
## 11 1 1.0 3.40 4.200 5.0 6 4.175636 0.9587777 838
## 12 2 1.0 3.80 4.400 5.0 6 4.365804 0.9254656 1739
## missing
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
## 8 0
## 9 0
## 10 0
## 11 0
## 12 0
favstats(conscient ~ education|gender, data = bfi)
## gender min Q1 median Q3 max mean sd n
## 1 HS.1 2.4 3.60 4.200 5.0 6 4.229391 0.8581184 93
## 2 finished HS.1 1.2 3.40 4.000 4.8 6 4.004369 1.0175930 103
## 3 some college.1 1.0 3.60 4.225 5.0 6 4.256601 0.9418097 356
## 4 college graduate.1 2.0 3.40 4.000 4.8 6 4.114552 0.9602205 134
## 5 graduate degree.1 1.2 3.55 4.200 4.8 6 4.123026 1.0032708 152
## 6 HS.2 1.4 3.40 4.200 4.7 6 4.043511 0.9917234 131
## 7 finished HS.2 1.6 3.80 4.400 5.0 6 4.351587 0.9479388 189
## 8 some college.2 1.0 3.80 4.600 5.2 6 4.439604 0.8898754 893
## 9 college graduate.2 1.4 3.60 4.400 5.0 6 4.274808 0.9274295 260
## 10 graduate degree.2 1.0 3.80 4.400 5.0 6 4.375815 0.9555508 266
## 11 1 1.0 3.40 4.200 5.0 6 4.175636 0.9587777 838
## 12 2 1.0 3.80 4.400 5.0 6 4.365804 0.9254656 1739
## missing
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
## 8 0
## 9 0
## 10 0
## 11 0
## 12 0
Alternatively, we can use dplyr:
bfi %>%
group_by(gender, education) %>%
summarise(M = mean(conscient, na.rm = TRUE),
Md = median(conscient, na.rm = TRUE),
SD = sd(conscient, na.rm = TRUE))
## # A tibble: 10 x 5
## # Groups: gender [?]
## gender education M Md SD
## <int> <fctr> <dbl> <dbl> <dbl>
## 1 1 HS 4.229391 4.200 0.8581184
## 2 1 finished HS 4.004369 4.000 1.0175930
## 3 1 some college 4.256601 4.225 0.9418097
## 4 1 college graduate 4.114552 4.000 0.9602205
## 5 1 graduate degree 4.123026 4.200 1.0032708
## 6 2 HS 4.043511 4.200 0.9917234
## 7 2 finished HS 4.351587 4.400 0.9479388
## 8 2 some college 4.439604 4.600 0.8898754
## 9 2 college graduate 4.274808 4.400 0.9274295
## 10 2 graduate degree 4.375815 4.400 0.9555508
Let’s check our assumption of homogeneity of variance. First we need to do a little data wrangling: for Levene’s test we will create the combined gender-by-education groups with the unite() function.
#install.packages("tidyr")
library(tidyr)
bfi %>%
unite(gen_edu, gender, education, remove = FALSE) %$%
leveneTest(conscient, gen_edu)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 9 0.8484 0.5714
## 2567
Because Levene’s test is non-significant, we have evidence of homogeneity of variance (really, no evidence of heterogeneity of variance). Now we perform the two-way ANOVA.
mod2 <- aov(conscient ~ education*gender, data = bfi)
summary(mod2)
## Df Sum Sq Mean Sq F value Pr(>F)
## education 4 20.8 5.198 5.981 8.67e-05 ***
## gender 1 17.2 17.212 19.806 8.94e-06 ***
## education:gender 4 9.7 2.420 2.785 0.0252 *
## Residuals 2567 2230.8 0.869
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There is a statistically significant main effect of education, F(4, 2567) = 5.98, p < .001, and a statistically significant main effect of gender, F(1, 2567) = 19.81, p < .001, such that women (M = 4.37, SD = 0.93) are more conscientious than men (M = 4.18, SD = 0.96). These main effects are qualified by a statistically significant two-way interaction of education and gender, F(4, 2567) = 2.79, p = .025. Let’s make a bar graph!
plotdata2 <- bfi %>%
group_by(gender, education) %>%
summarise(mean = mean(conscient, na.rm = TRUE),
stdv = sd(conscient, na.rm = TRUE),
n = n()) %>%
mutate(se = stdv/sqrt(n))
ggplot(plotdata2, aes(x = education, y = mean, fill = as.factor(gender))) +
geom_bar(stat = "identity", position = position_dodge(0.9)) +
geom_errorbar(aes(ymax = mean + se, ymin = mean - se), position = position_dodge(0.9), width = 0.25) +
labs(x = "Education", y = "Conscientiousness") +
ggtitle("Conscientiousness by Gender and Education Level") +
scale_fill_discrete(name = "Gender")
As we see in the graph (and as you saw yesterday from the t-tests), women are higher than men in conscientiousness at every level of education except for those participants who did not finish high school.
Try fiddling with the graph. Change the theme, change the labels. See if you can find on the internet how to change the colors.
plotdata2 <- bfi %>%
group_by(gender, education) %>%
summarise(mean = mean(conscient, na.rm = TRUE),
stdv = sd(conscient, na.rm = TRUE),
n = n()) %>%
mutate(se = stdv/sqrt(n))
ggplot(plotdata2, aes(x = education, y = mean, fill = as.factor(gender))) +
geom_bar(stat = "identity", position = position_dodge(0.9)) +
geom_errorbar(aes(ymax = mean + se, ymin = mean - se), position = position_dodge(0.9), width = 0.25) +
labs(x = "Education", y = "Conscientiousness") +
ggtitle("Conscientiousness by Gender and Education Level") +
scale_fill_manual(name = "Gender", values = c("gold", "dodgerblue")) +
theme_minimal()
We will see another example of a two-way ANOVA in the reproducible APA style document.
To demonstrate the mixed effects ANOVA we’ll use the sat.act dataset. Recall that the sat.act dataset has information for 700 people on their SAT verbal, SAT quantitative, and ACT scores.
data(sat.act)
Recall from yesterday that boys scored higher on the SAT quantitative than the SAT verbal (on average), and girls showed the opposite pattern.
sat.act_men <- sat.act %>%
filter(gender == 1)
t.test(~(SATV-SATQ), data = sat.act_men)
##
## One Sample t-test
##
## data: sat.act_men$(SATV - SATQ)
## t = -3.4934, df = 244, p-value = 0.0005661
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -32.08114 -8.94743
## sample estimates:
## mean of x
## -20.51429
sat.act %>%
filter(gender == 2) %>%
t.test(~(SATV-SATQ), data = .)
##
## One Sample t-test
##
## data: .$(SATV - SATQ)
## t = 3.1813, df = 441, p-value = 0.00157
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 5.604249 23.721543
## sample estimates:
## mean of x
## 14.6629
We can test whether these two patterns are statistically different from each other with a gender (between-subjects) by topic (within-subjects) two-way mixed effects ANOVA, where topic has two levels: (1) verbal and (2) quantitative. We have to restructure our data first.
sat.act_long <- sat.act %>%
mutate(id = as.factor(seq_along(SATV))) %>%
gather(key = topic, value = score, SATV, SATQ) %>%
mutate(gender = as.factor(factor(gender, labels=c('Boy', 'Girl'))),
topic = as.factor(topic)) %>%
arrange(id)
ggplot(sat.act_long, aes(x = gender, y = score, color = topic)) +
geom_boxplot()
After you’ve taken a good look at the data, it’s time for the ANOVA. Note the Error(id) term in the model formula: it tells aov() that topic is measured within each participant (id).
mod3 <- aov(score ~ gender*topic + Error(id),
data = sat.act_long)
summary(mod3)
##
## Error: id
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 1 154094 154094 7.283 0.00713 **
## topic 1 0 0 0.000 0.99774
## gender:topic 1 3910 3910 0.185 0.66741
## Residuals 696 14726767 21159
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Error: Within
## Df Sum Sq Mean Sq F value Pr(>F)
## topic 1 1541 1541 0.34 0.56
## gender:topic 1 97527 97527 21.54 4.15e-06 ***
## Residuals 685 3101217 4527
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
As we may have guessed from our preliminary analyses, there is a main effect of gender, F(1, 696) = 7.28, p = .007, no main effect of topic, F(1, 685) = 0.34, p = .56, but a significant interaction of gender and topic, F(1, 685) = 21.54, p < .001.
plotdata <- sat.act_long %>%
group_by(gender, topic) %>%
summarise(mean = mean(score, na.rm = TRUE),
stdv = sd(score, na.rm = TRUE),
n = n()) %>%
mutate(se = stdv/sqrt(n))
ggplot(plotdata, aes(x = topic, y = mean, fill = gender)) +
geom_bar(stat = "identity", position = position_dodge(0.9)) +
geom_errorbar(aes(ymax = mean + se, ymin = mean - se), position = position_dodge(0.9), width = 0.25) +
labs(x = "Topic", y = "Score") +
ggtitle("Score by Gender and Topic") +
scale_fill_discrete(name = "Gender")
It looks as though there are gender differences on the SATQ but not the SATV. Follow this up with two t-tests of gender, one for each topic. Hint: use the sat.act data and the filter() function.
#two independent samples t-tests here.
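One possible solution (a sketch, using the long-format sat.act_long data created above; the wide sat.act data would work just as well with, e.g., SATQ ~ gender):
#Independent-samples t-tests of gender within each topic
t.test(score ~ gender, data = filter(sat.act_long, topic == "SATQ"))
t.test(score ~ gender, data = filter(sat.act_long, topic == "SATV"))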
Back to the bfi dataset. What if we wanted to treat education as an interval-level variable instead of an ordinal one? We could then regress conscientiousness on education in a simple linear regression model. First, let’s re-load the data (overwriting our modified copy) and re-create the conscientiousness score.
data(bfi)
bfi <- bfi %>%
mutate(C4.r = (min(C4, na.rm = TRUE) + max(C4, na.rm = TRUE)) - C4,
C5.r = (min(C5, na.rm = TRUE) + max(C5, na.rm = TRUE)) - C5,
conscient = (C1 + C2 + C3 + C4.r + C5.r)/5)
It’s good practice to make a scatter plot before running a regression. Do this using ggplot2. You might want to try using geom_jitter(). Also, add a linear regression line using geom_smooth(method = "lm").
#make a scatter plot here.
qplot(x = education, y = conscient, data = bfi)
ggplot(bfi, aes(x = education, y = conscient)) +
geom_jitter(height = 0, width = 0.05, alpha = .05) +
geom_smooth(method = "lm", se = 0)
Now let’s run our model using the lm() function.
mod4 <- lm(conscient ~ education, data = bfi)
mod4
##
## Call:
## lm(formula = conscient ~ education, data = bfi)
##
## Coefficients:
## (Intercept) education
## 4.2482 0.0169
summary(mod4)
##
## Call:
## lm(formula = conscient ~ education, data = bfi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3327 -0.6989 0.1011 0.7011 1.7349
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.24816 0.05758 73.778 <2e-16 ***
## education 0.01690 0.01702 0.993 0.321
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9428 on 2488 degrees of freedom
## (310 observations deleted due to missingness)
## Multiple R-squared: 0.0003961, Adjusted R-squared: -5.714e-06
## F-statistic: 0.9858 on 1 and 2488 DF, p-value: 0.3209
confint(mod4)
## 2.5 % 97.5 %
## (Intercept) 4.13525467 4.36107438
## education -0.01647737 0.05027662
There is no statistically significant effect of education on conscientiousness in this linear regression model. Let’s check our regression diagnostics. This is one of the benefits of R: we can use the plot() function.
#plot(mod4)
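If you do run it, here is a minimal sketch that shows all four base-R diagnostic plots on one page:
#Arrange the four diagnostic plots (residuals vs. fitted, Q-Q, scale-location, leverage) in a 2 x 2 grid
par(mfrow = c(2, 2))
plot(mod4)
par(mfrow = c(1, 1)) #reset the plotting layout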
Let’s add gender to make it a multiple regression model: first with education and gender both treated as factors (with their interaction), and then with education treated as numeric and gender effect-coded. Note that we can also get the confidence intervals.
mod5.0 <- lm(conscient ~ as.factor(education)*as.factor(gender), data = bfi)
summary(mod5.0)
##
## Call:
## lm(formula = conscient ~ as.factor(education) * as.factor(gender),
## data = bfi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4372 -0.6372 0.0722 0.7254 1.9980
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 4.23864 0.09967 42.525
## as.factor(education)2 -0.23666 0.13635 -1.736
## as.factor(education)3 0.02119 0.11180 0.190
## as.factor(education)4 -0.11094 0.12907 -0.860
## as.factor(education)5 -0.11064 0.12555 -0.881
## as.factor(gender)2 -0.19020 0.12948 -1.469
## as.factor(education)2:as.factor(gender)2 0.52266 0.17401 3.004
## as.factor(education)3:as.factor(gender)2 0.36754 0.14263 2.577
## as.factor(education)4:as.factor(gender)2 0.33711 0.16420 2.053
## as.factor(education)5:as.factor(gender)2 0.42822 0.16115 2.657
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## as.factor(education)2 0.08275 .
## as.factor(education)3 0.84970
## as.factor(education)4 0.39013
## as.factor(education)5 0.37830
## as.factor(gender)2 0.14198
## as.factor(education)2:as.factor(gender)2 0.00269 **
## as.factor(education)3:as.factor(gender)2 0.01003 *
## as.factor(education)4:as.factor(gender)2 0.04017 *
## as.factor(education)5:as.factor(gender)2 0.00793 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.935 on 2480 degrees of freedom
## (310 observations deleted due to missingness)
## Multiple R-squared: 0.01986, Adjusted R-squared: 0.01631
## F-statistic: 5.585 on 9 and 2480 DF, p-value: 1.16e-07
#Effect-code gender (male stays 1, female becomes -1) so it can enter the model as a numeric predictor
bfi <- bfi %>%
mutate(genderE = ifelse(gender == 2, -1, gender))
mod5 <- lm(conscient ~ education*genderE, data = bfi)
summary(mod5)
##
## Call:
## lm(formula = conscient ~ education * genderE, data = bfi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4229 -0.6229 0.0456 0.6456 1.8433
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.23599 0.05936 71.358 <2e-16 ***
## education 0.01076 0.01750 0.615 0.539
## genderE -0.01561 0.05936 -0.263 0.793
## education:genderE -0.02350 0.01750 -1.343 0.179
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.939 on 2486 degrees of freedom
## (310 observations deleted due to missingness)
## Multiple R-squared: 0.009233, Adjusted R-squared: 0.008037
## F-statistic: 7.722 on 3 and 2486 DF, p-value: 3.917e-05
confint(mod5)
## 2.5 % 97.5 %
## (Intercept) 4.11958030 4.35239148
## education -0.02354989 0.04507208
## genderE -0.13201474 0.10079644
## education:genderE -0.05781121 0.01081076
Check the residuals for mod5.
#use the plot() function here
plot(mod5)
Try making a figure that splits the slope of education on conscientiousness by gender. Copy and paste from yesterday’s code if you’d like.
#regression lines split by gender
ggplot(bfi, aes(x = education, y = conscient, color = as.factor(gender))) +
geom_smooth(method = "lm", se = 0) +
ylim(1, 6)
Because personality is relatively stable, we may instead ask: are people who are more conscientious more likely to graduate from college? This calls for a logistic regression model. We can use the code we wrote yesterday to create the dichotomous variable, where TRUE (treated as 1) means the person graduated from college and FALSE (treated as 0) means they did not.
bfi <- bfi %>%
mutate(coll_grad = (education > 3))
tally(~coll_grad, data = bfi)
## coll_grad
## TRUE FALSE <NA>
## 812 1765 223
ggplot(filter(bfi, !is.na(coll_grad)), aes(x = coll_grad, y = conscient)) +
geom_boxplot()
Then we can run our model by using the glm() function and adding family = binomial.
mod6 <- glm(coll_grad ~ conscient, data = bfi, family = binomial)
summary(mod6)
##
## Call:
## glm(formula = coll_grad ~ conscient, family = binomial, data = bfi)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9750 -0.8829 -0.8535 1.4818 1.5773
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.41527 0.19925 -2.084 0.0371 *
## conscient -0.08142 0.04549 -1.790 0.0735 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3113.0 on 2489 degrees of freedom
## Residual deviance: 3109.8 on 2488 degrees of freedom
## (310 observations deleted due to missingness)
## AIC: 3113.8
##
## Number of Fisher Scoring iterations: 4
confint(mod6)
## 2.5 % 97.5 %
## (Intercept) -0.8074043 -0.026019218
## conscient -0.1705590 0.007841277
It’s handy to look at the exponentiated estimates (odds ratios) using the exp() function.
exp(coef(mod6))
## (Intercept) conscient
## 0.6601588 0.9218058
The odds of graduating from college are multiplied by 0.92 for every 1-unit increase in conscientiousness, but this effect is only marginally significant, p = .073.
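If you also want the confidence interval on the odds-ratio scale, one option is to exponentiate the limits from confint() as well:
#Confidence interval for the odds ratio
exp(confint(mod6))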