library(mosaic)
library(ggplot2)
library(dplyr)
library(psych)
We are going to be using the Big Five Inventory dataset, bfi, to demonstrate ANOVA. This dataset is contained in the psych package.
data(bfi)
Before we get into the ANOVA, we should first create all of our scale scores. We’ll check the item intercorrelations to identify negatively keyed items, reverse-score those items, check each scale’s reliability, and then compute the scale scores as item means.
corr.test(select(bfi, A1, A2, A3, A4, A5))
corr.test(select(bfi, C1, C2, C3, C4, C5))
corr.test(select(bfi, E1, E2, E3, E4, E5))
corr.test(select(bfi, N1, N2, N3, N4, N5))
corr.test(select(bfi, O1, O2, O3, O4, O5))
#Reverse-score the negatively keyed items: (min + max) - x flips the response scale
bfi <- bfi %>%
mutate(A1.r = (min(A1, na.rm = TRUE) + max(A1, na.rm = TRUE)) - A1,
C4.r = (min(C4, na.rm = TRUE) + max(C4, na.rm = TRUE)) - C4,
C5.r = (min(C5, na.rm = TRUE) + max(C5, na.rm = TRUE)) - C5,
E1.r = (min(E1, na.rm = TRUE) + max(E1, na.rm = TRUE)) - E1,
E2.r = (min(E2, na.rm = TRUE) + max(E2, na.rm = TRUE)) - E2,
O2.r = (min(O2, na.rm = TRUE) + max(O2, na.rm = TRUE)) - O2,
O5.r = (min(O5, na.rm = TRUE) + max(O5, na.rm = TRUE)) - O5)
alpha(select(bfi, A1.r, A2, A3, A4, A5))
alpha(select(bfi, C1, C2, C3, C4.r, C5.r))
alpha(select(bfi, E1.r, E2.r, E3, E4, E5))
alpha(select(bfi, N1, N2, N3, N4, N5))
alpha(select(bfi, O1, O2.r, O3, O4, O5.r))
bfi <- bfi %>%
mutate(agreeable = rowMeans(select(bfi, A1.r, A2, A3, A4, A5), na.rm = TRUE),
conscient = rowMeans(select(bfi, C1, C2, C3, C4.r, C5.r), na.rm = TRUE),
extrov = rowMeans(select(bfi, E1.r, E2.r, E3, E4, E5), na.rm = TRUE),
neurot = rowMeans(select(bfi, N1, N2, N3, N4, N5), na.rm = TRUE),
openness = rowMeans(select(bfi, O1, O2.r, O3, O4, O5.r), na.rm = TRUE)) %>%
filter(!is.na(education))
Then take a quick look at the data with glimpse().
glimpse(bfi)
## Observations: 2,577
## Variables: 40
## $ A1 <int> 6, 4, 4, 4, 4, 1, 2, 4, 1, 2, 2, 4, 2, 1, 4, 5, 1, 1...
## $ A2 <int> 6, 3, 3, 4, 5, 5, 6, 5, 6, 4, 5, 5, 5, 5, 4, 3, 6, 4...
## $ A3 <int> 5, 1, 6, 5, 2, 6, 5, 5, 6, 4, 1, 6, 6, 6, 4, 5, 4, 4...
## $ A4 <int> 6, 5, 3, 6, 2, 5, 6, 6, 1, 4, 3, 5, 6, 5, 4, 4, 6, 2...
## $ A5 <int> 5, 1, 3, 5, 1, 6, 5, 5, 6, 3, 5, 5, 6, 4, 4, 2, 4, 3...
## $ C1 <int> 6, 3, 6, 4, 5, 4, 3, 5, 5, 6, 5, 5, 5, 1, 4, 2, 5, 6...
## $ C2 <int> 6, 2, 6, 3, 5, 3, 5, 5, 2, 5, 4, 5, 5, 5, 3, 2, 6, 5...
## $ C3 <int> 6, 4, 3, 5, 5, 2, 6, 4, 5, 6, 5, 3, 5, 6, 3, 4, 3, 6...
## $ C4 <int> 1, 2, 4, 3, 2, 4, 3, 1, 1, 1, 2, 5, 2, 4, 3, 3, 1, 3...
## $ C5 <int> 3, 4, 5, 2, 2, 5, 6, 1, 1, 1, 5, 4, 4, 6, 4, 4, 5, 4...
## $ E1 <int> 2, 3, 5, 1, 3, 2, 2, 3, 1, 2, 1, 1, 1, 6, 2, 3, 6, 3...
## $ E2 <int> 1, 6, 3, 3, 4, 1, 2, 2, 1, 4, 2, 2, 2, 6, 3, 4, 6, 4...
## $ E3 <int> 6, 4, NA, 2, 3, 2, 4, 5, 6, 4, 6, 6, 4, 2, 4, 3, 3, ...
## $ E4 <int> 5, 2, 4, 5, 6, 5, 6, 5, 6, 2, 5, 5, 5, 1, 2, 2, 2, 3...
## $ E5 <int> 6, 1, 3, 4, 5, 2, 6, 6, 6, 6, 4, 5, 5, 1, 3, 3, 2, 5...
## $ N1 <int> 3, 6, 5, 3, 2, 2, 4, 2, 2, 3, 1, 5, 3, 1, NA, 5, 2, ...
## $ N2 <int> 5, 3, 5, 3, 4, 2, 4, 3, 3, 3, 4, 4, 2, 2, 2, 3, 2, 6...
## $ N3 <int> 2, 2, 2, 4, 2, 2, 4, 3, 1, 5, 2, 4, 4, 1, 1, 4, 2, 5...
## $ N4 <int> 2, 6, 3, 2, 2, 2, 6, 1, 2, 3, 2, 3, 1, 3, 2, 4, 4, 5...
## $ N5 <int> 3, 4, 3, 3, 3, 2, 6, 1, 1, 2, 5, 1, 2, 6, 2, 3, 1, 4...
## $ O1 <int> 4, 3, 6, 5, 5, 6, 6, 6, 6, 5, 2, 4, 5, 6, 4, 4, 5, 5...
## $ O2 <int> 3, 2, 6, 3, 2, 1, 1, 2, 4, 2, 4, 4, 2, 6, 3, 5, 5, 5...
## $ O3 <int> 5, 4, 6, 5, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 5, 4, 4, 4...
## $ O4 <int> 6, 5, 6, 6, 5, 5, 6, 6, 5, 6, 4, 5, 5, 6, 5, 4, 5, 5...
## $ O5 <int> 1, 3, 1, 3, 5, 2, 1, 2, 3, 1, 1, 1, 2, 1, 3, 3, 3, 2...
## $ gender <int> 2, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1...
## $ education <int> 3, 2, 1, 1, 1, 5, 2, 1, 3, 5, 3, 3, 3, 5, 3, 3, 3, 3...
## $ age <int> 21, 19, 19, 21, 17, 68, 27, 18, 20, 51, 33, 18, 41, ...
## $ A1.r <dbl> 1, 3, 3, 3, 3, 6, 5, 3, 6, 5, 5, 3, 5, 6, 3, 2, 6, 6...
## $ C4.r <dbl> 6, 5, 3, 4, 5, 3, 4, 6, 6, 6, 5, 2, 5, 3, 4, 4, 6, 4...
## $ C5.r <dbl> 4, 3, 2, 5, 5, 2, 1, 6, 6, 6, 2, 3, 3, 1, 3, 3, 2, 3...
## $ E1.r <dbl> 5, 4, 2, 6, 4, 5, 5, 4, 6, 5, 6, 6, 6, 1, 5, 4, 1, 4...
## $ E2.r <dbl> 6, 1, 4, 4, 3, 6, 5, 5, 6, 3, 5, 5, 5, 1, 4, 3, 1, 3...
## $ O2.r <dbl> 4, 5, 1, 4, 5, 6, 6, 5, 3, 5, 3, 3, 5, 1, 4, 2, 2, 2...
## $ O5.r <dbl> 6, 4, 6, 4, 2, 5, 6, 5, 4, 6, 6, 6, 5, 6, 4, 4, 4, 5...
## $ agreeable <dbl> 4.6, 2.6, 3.6, 4.6, 2.6, 5.6, 5.4, 4.8, 5.0, 4.0, 3....
## $ conscient <dbl> 5.6, 3.4, 4.0, 4.2, 5.0, 2.8, 3.8, 5.2, 4.8, 5.8, 4....
## $ extrov <dbl> 5.60, 2.40, 3.25, 4.20, 4.20, 4.00, 5.20, 5.00, 6.00...
## $ neurot <dbl> 3.00, 4.20, 3.60, 3.00, 2.60, 2.00, 4.80, 2.00, 1.80...
## $ openness <dbl> 5.0, 4.2, 5.0, 4.8, 4.4, 5.4, 5.8, 5.4, 4.6, 5.6, 4....
Let’s also take a look at the distributions of our new variables.
ggplot(bfi, aes(x = agreeable)) + geom_density()
ggplot(bfi, aes(x = conscient)) + geom_density()
ggplot(bfi, aes(x = extrov)) + geom_density()
ggplot(bfi, aes(x = neurot)) + geom_density()
ggplot(bfi, aes(x = openness)) + geom_density()
Yesterday we used t.test() to test for differences in conscientiousness between those who graduated college and those who did not. Note that I did not create the grouping variable again; I used a logical statement directly in the t.test() function.
t.test(conscient ~ (education > 3), data = bfi)
##
## Welch Two Sample t-test
##
## data: conscient by education > 3
## t = 1.8433, df = 1530.6, p-value = 0.06548
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.004768244 0.153480246
## sample estimates:
## mean in group FALSE mean in group TRUE
## 4.327394 4.253038
A better approach would be to check for differences with a one-way ANOVA.
Perform a one-way ANOVA for education level on conscientiousness. Let’s first look at the distributions of conscientiousness by education level. Note that first we’re changing the education variable’s type from integer to factor with the as.factor() function and giving nice labels to the factor levels.
bfi <- bfi %>%
mutate(education = as.factor(education),
education = factor(education, labels=c("HS",
"finished HS",
"some college",
"college graduate",
"graduate degree")))
Let’s take a look at the distributions.
ggplot(bfi, aes(x = education, y = conscient, fill = education)) +
geom_boxplot(alpha = .5)
Also, let’s get descriptives by education level.
#descriptives
bfi %>%
group_by(education) %>%
summarize(mean = mean(conscient, na.rm = TRUE),
sd = sd(conscient, na.rm = TRUE),
min = min(conscient, na.rm = TRUE),
max = max(conscient, na.rm = TRUE))
## # A tibble: 5 x 5
## education mean sd min max
## <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 HS 4.120685 0.9410469 1.4 6
## 2 finished HS 4.229110 0.9854484 1.2 6
## 3 some college 4.387443 0.9083686 1.0 6
## 4 college graduate 4.220305 0.9405653 1.4 6
## 5 graduate degree 4.283892 0.9795707 1.0 6
We can perform Levene’s test of the homogeneity of variance assumption with the leveneTest() function from the car package. Note also the use of the %$% operator from the magrittr package. It’s like the pipe, except that it “explodes” a dataset into a function, allowing naked variable names.
#install.packages("car")
library(car)
library(magrittr)
bfi %$%
leveneTest(conscient, education)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 4 1.3069 0.265
## 2572
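For comparison, an equivalent way to write the same test without %$% is to pass the two vectors to leveneTest() directly:
#Same Levene's test without the %$% operator
leveneTest(bfi$conscient, bfi$education)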
Now that we’ve checked our assumptions, we can finally run the one-way ANOVA.
mod1 <- aov(conscient ~ education, data = bfi)
summary(mod1)
## Df Sum Sq Mean Sq F value Pr(>F)
## education 4 20.8 5.198 5.921 9.67e-05 ***
## Residuals 2572 2257.6 0.878
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There are statistically significant differences in conscientiousness between people with different education levels, F(4, 2572) = 5.92, p < .001. We might want a bar graph for publication!
#A small companion dataset for making error bars
plotdata <- bfi %>%
group_by(education) %>%
summarise(mean = mean(conscient, na.rm = TRUE),
stdv = sd(conscient, na.rm = TRUE),
n = n()) %>%
mutate(se = stdv/sqrt(n))
#Making the Bar Graph
ggplot(plotdata, aes(x = education,
y = mean,
fill = education)) +
geom_bar(stat = "identity") +
geom_errorbar(aes(ymax = mean + se, ymin = mean - se), width = 0.25) +
labs(x = "Education Level", y = "Conscientiousness") +
ylim(0, 5) + #the scale is really from 1 to 6
ggtitle("Concientiousness by Education Level") +
scale_fill_discrete(guide = FALSE) +
theme_minimal()
Two-way ANOVA for gender by education level on conscientiousness. We can get favstats() split by another categorical variable with the | symbol (it’s above your return key). As the two calls below show, passing the second grouping variable as a separate argument gives the same result.
favstats(conscient ~ education, gender, data = bfi)
## gender min Q1 median Q3 max mean sd n
## 1 HS.1 2.4 3.60 4.200 5.0 6 4.229391 0.8581184 93
## 2 finished HS.1 1.2 3.40 4.000 4.8 6 4.004369 1.0175930 103
## 3 some college.1 1.0 3.60 4.225 5.0 6 4.256601 0.9418097 356
## 4 college graduate.1 2.0 3.40 4.000 4.8 6 4.114552 0.9602205 134
## 5 graduate degree.1 1.2 3.55 4.200 4.8 6 4.123026 1.0032708 152
## 6 HS.2 1.4 3.40 4.200 4.7 6 4.043511 0.9917234 131
## 7 finished HS.2 1.6 3.80 4.400 5.0 6 4.351587 0.9479388 189
## 8 some college.2 1.0 3.80 4.600 5.2 6 4.439604 0.8898754 893
## 9 college graduate.2 1.4 3.60 4.400 5.0 6 4.274808 0.9274295 260
## 10 graduate degree.2 1.0 3.80 4.400 5.0 6 4.375815 0.9555508 266
## 11 1 1.0 3.40 4.200 5.0 6 4.175636 0.9587777 838
## 12 2 1.0 3.80 4.400 5.0 6 4.365804 0.9254656 1739
## missing
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
## 8 0
## 9 0
## 10 0
## 11 0
## 12 0
favstats(conscient ~ education|gender, data = bfi)
## gender min Q1 median Q3 max mean sd n
## 1 HS.1 2.4 3.60 4.200 5.0 6 4.229391 0.8581184 93
## 2 finished HS.1 1.2 3.40 4.000 4.8 6 4.004369 1.0175930 103
## 3 some college.1 1.0 3.60 4.225 5.0 6 4.256601 0.9418097 356
## 4 college graduate.1 2.0 3.40 4.000 4.8 6 4.114552 0.9602205 134
## 5 graduate degree.1 1.2 3.55 4.200 4.8 6 4.123026 1.0032708 152
## 6 HS.2 1.4 3.40 4.200 4.7 6 4.043511 0.9917234 131
## 7 finished HS.2 1.6 3.80 4.400 5.0 6 4.351587 0.9479388 189
## 8 some college.2 1.0 3.80 4.600 5.2 6 4.439604 0.8898754 893
## 9 college graduate.2 1.4 3.60 4.400 5.0 6 4.274808 0.9274295 260
## 10 graduate degree.2 1.0 3.80 4.400 5.0 6 4.375815 0.9555508 266
## 11 1 1.0 3.40 4.200 5.0 6 4.175636 0.9587777 838
## 12 2 1.0 3.80 4.400 5.0 6 4.365804 0.9254656 1739
## missing
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
## 8 0
## 9 0
## 10 0
## 11 0
## 12 0
Alternatively, we can use dplyr:
bfi %>%
group_by(gender, education) %>%
summarise(M = mean(conscient, na.rm = TRUE),
Md = median(conscient, na.rm = TRUE),
SD = sd(conscient, na.rm = TRUE))
## # A tibble: 10 x 5
## # Groups: gender [?]
## gender education M Md SD
## <int> <fctr> <dbl> <dbl> <dbl>
## 1 1 HS 4.229391 4.200 0.8581184
## 2 1 finished HS 4.004369 4.000 1.0175930
## 3 1 some college 4.256601 4.225 0.9418097
## 4 1 college graduate 4.114552 4.000 0.9602205
## 5 1 graduate degree 4.123026 4.200 1.0032708
## 6 2 HS 4.043511 4.200 0.9917234
## 7 2 finished HS 4.351587 4.400 0.9479388
## 8 2 some college 4.439604 4.600 0.8898754
## 9 2 college graduate 4.274808 4.400 0.9274295
## 10 2 graduate degree 4.375815 4.400 0.9555508
Let’s check our assumption of homogeneity of variance. First we need to do a little data wrangling: for Levene’s test we will create the combined gender-by-education groups with the unite() function.
#install.packages("tidyr")
library(tidyr)
bfi %>%
unite(gen_edu, gender, education, remove = FALSE) %$%
leveneTest(conscient, gen_edu)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 9 0.8484 0.5714
## 2567
Because Levene’s test is non-significant, we have evidence of homogeneity of variance (really, no evidence of heterogeneity of variance). Now we perform the two-way ANOVA.
mod2 <- aov(conscient ~ education*gender, data = bfi)
summary(mod2)
## Df Sum Sq Mean Sq F value Pr(>F)
## education 4 20.8 5.198 5.981 8.67e-05 ***
## gender 1 17.2 17.212 19.806 8.94e-06 ***
## education:gender 4 9.7 2.420 2.785 0.0252 *
## Residuals 2567 2230.8 0.869
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There is a statistically significant main effect of education, F(4, 2567) = 5.98, p < .001, and a statistically significant main effect of gender, F(1, 2567) = 19.81, p < .001, such that women (M = 4.37, SD = 0.93) are more conscientious than men (M = 4.18, SD = 0.96). These main effects are qualified by a statistically significant two-way interaction of education and gender, F(4, 2567) = 2.79, p = .025. Let’s make a bar graph!
plotdata2 <- bfi %>%
group_by(gender, education) %>%
summarise(mean = mean(conscient, na.rm = TRUE),
stdv = sd(conscient, na.rm = TRUE),
n = n()) %>%
mutate(se = stdv/sqrt(n))
ggplot(plotdata2, aes(x = education, y = mean, fill = as.factor(gender))) +
geom_bar(stat = "identity", position = position_dodge(0.9)) +
geom_errorbar(aes(ymax = mean + se, ymin = mean - se), position = position_dodge(0.9), width = 0.25) +
labs(x = "Education", y = "Conscientiousness") +
ggtitle("Conscientiousness by Gender and Education Level") +
scale_fill_discrete(name = "Gender")
As we see in the graph (and as you saw yesterday from the t-tests), women are higher than men in conscientiousness at every level of education except for those participants who did not finish high school.
Try fiddling with the graph. Change the theme, change the labels. See if you can find on the internet how to change the colors.
plotdata2 <- bfi %>%
group_by(gender, education) %>%
summarise(mean = mean(conscient, na.rm = TRUE),
stdv = sd(conscient, na.rm = TRUE),
n = n()) %>%
mutate(se = stdv/sqrt(n))
ggplot(plotdata2, aes(x = education, y = mean, fill = as.factor(gender))) +
geom_bar(stat = "identity", position = position_dodge(0.9)) +
geom_errorbar(aes(ymax = mean + se, ymin = mean - se), position = position_dodge(0.9), width = 0.25) +
labs(x = "Education", y = "Conscientiousness") +
ggtitle("Conscientiousness by Gender and Education Level") +
scale_fill_manual(name = "Gender", values = c("gold", "dodgerblue")) +
theme_minimal()
We will see another example of a two-way ANOVA in the reproducible APA style document.
To demonstrate the mixed effects ANOVA we’ll use the sat.act dataset. Recall that the sat.act dataset has information for 700 people on their SAT verbal, SAT quantitative, and ACT scores.
data(sat.act)
Recall from yesterday that boys scored higher on the SAT quantitative than the SAT verbal (on average), and girls showed the opposite pattern.
sat.act_men <- sat.act %>%
filter(gender == 1)
t.test(~(SATV-SATQ), data = sat.act_men)
##
## One Sample t-test
##
## data: sat.act_men$(SATV - SATQ)
## t = -3.4934, df = 244, p-value = 0.0005661
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -32.08114 -8.94743
## sample estimates:
## mean of x
## -20.51429
sat.act %>%
filter(gender == 2) %>%
t.test(~(SATV-SATQ), data = .)
##
## One Sample t-test
##
## data: .$(SATV - SATQ)
## t = 3.1813, df = 441, p-value = 0.00157
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 5.604249 23.721543
## sample estimates:
## mean of x
## 14.6629
We can test whether these two patterns are statistically different from each other with a gender (between-subjects) by topic (within-subjects) two-way mixed effects ANOVA, where topic has two levels: (1) verbal and (2) quantitative. We have to restructure our data first.
sat.act_long <- sat.act %>%
mutate(id = as.factor(seq_along(SATV))) %>%
gather(key = topic, value = score, SATV, SATQ) %>%
mutate(gender = as.factor(factor(gender, labels=c('Boy', 'Girl'))),
topic = as.factor(topic)) %>%
arrange(id)
ggplot(sat.act_long, aes(x = gender, y = score, color = topic)) +
geom_boxplot()
After you’ve taken a good look at the data, it’s time for the ANOVA. Note the Error(id) term in the model formula: it tells aov() that topic is measured within each participant (id).
mod3 <- aov(score ~ gender*topic + Error(id),
data = sat.act_long)
summary(mod3)
##
## Error: id
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 1 154094 154094 7.283 0.00713 **
## topic 1 0 0 0.000 0.99774
## gender:topic 1 3910 3910 0.185 0.66741
## Residuals 696 14726767 21159
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Error: Within
## Df Sum Sq Mean Sq F value Pr(>F)
## topic 1 1541 1541 0.34 0.56
## gender:topic 1 97527 97527 21.54 4.15e-06 ***
## Residuals 685 3101217 4527
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
As we may have guessed from our preliminary analyses, there is a main effect of gender, F(1, 696) = 7.28, p = .007, no main effect of topic, F(1, 685) = 0.34, p = .56, but a significant interaction of gender and topic, F(1, 685) = 21.54, p < .001.
plotdata <- sat.act_long %>%
group_by(gender, topic) %>%
summarise(mean = mean(score, na.rm = TRUE),
stdv = sd(score, na.rm = TRUE),
n = n()) %>%
mutate(se = stdv/sqrt(n))
ggplot(plotdata, aes(x = topic, y = mean, fill = gender)) +
geom_bar(stat = "identity", position = position_dodge(0.9)) +
geom_errorbar(aes(ymax = mean + se, ymin = mean - se), position = position_dodge(0.9), width = 0.25) +
labs(x = "Topic", y = "Score") +
ggtitle("Score by Gender and Topic") +
scale_fill_discrete(name = "Gender")
It looks as though there are gender differences on the SATQ but not the SATV. Follow this up with two t-tests of gender, one for each topic. Hint: use the sat.act data and the filter() function.
#two independent samples t-tests here.
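One possible solution (a sketch, using the long-format sat.act_long data created above; the wide sat.act data would work just as well with, e.g., SATQ ~ gender):
#Independent-samples t-tests of gender within each topic
t.test(score ~ gender, data = filter(sat.act_long, topic == "SATQ"))
t.test(score ~ gender, data = filter(sat.act_long, topic == "SATV"))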
Back to the bfi dataset. What if we wanted to treat education as an interval-level variable instead of an ordinal one? We could then regress conscientiousness on education in a simple linear regression model. First, let’s re-load the data (overwriting our modified copy) and re-create the conscientiousness score.
data(bfi)
bfi <- bfi %>%
mutate(C4.r = (min(C4, na.rm = TRUE) + max(C4, na.rm = TRUE)) - C4,
C5.r = (min(C5, na.rm = TRUE) + max(C5, na.rm = TRUE)) - C5,
conscient = (C1 + C2 + C3 + C4.r + C5.r)/5)
It’s good practice to make a scatter plot before running a regression. Do this using ggplot2. You might want to try using geom_jitter(). Also, add a linear regression line using geom_smooth(method = "lm").
#make a scatter plot here.
qplot(x = education, y = conscient, data = bfi)
ggplot(bfi, aes(x = education, y = conscient)) +
geom_jitter(height = 0, width = 0.05, alpha = .05) +
geom_smooth(method = "lm", se = 0)
Now let’s run our model using the lm() function.
mod4 <- lm(conscient ~ education, data = bfi)
mod4
##
## Call:
## lm(formula = conscient ~ education, data = bfi)
##
## Coefficients:
## (Intercept) education
## 4.2482 0.0169
summary(mod4)
##
## Call:
## lm(formula = conscient ~ education, data = bfi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3327 -0.6989 0.1011 0.7011 1.7349
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.24816 0.05758 73.778 <2e-16 ***
## education 0.01690 0.01702 0.993 0.321
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9428 on 2488 degrees of freedom
## (310 observations deleted due to missingness)
## Multiple R-squared: 0.0003961, Adjusted R-squared: -5.714e-06
## F-statistic: 0.9858 on 1 and 2488 DF, p-value: 0.3209
confint(mod4)
## 2.5 % 97.5 %
## (Intercept) 4.13525467 4.36107438
## education -0.01647737 0.05027662
There is no statistically significant effect of education on conscientiousness in this linear regression model. Let’s check our regression diagnostics. This is one of the benefits of R: we can use the plot() function.
#plot(mod4)
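If you do run it, here is a minimal sketch that shows all four base-R diagnostic plots on one page:
#Arrange the four diagnostic plots (residuals vs. fitted, Q-Q, scale-location, leverage) in a 2 x 2 grid
par(mfrow = c(2, 2))
plot(mod4)
par(mfrow = c(1, 1)) #reset the plotting layout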
Let’s add gender to make it a multiple regression model: first with education and gender both treated as factors (with their interaction), and then with education treated as numeric and gender effect-coded. Note that we can also get the confidence intervals.
mod5.0 <- lm(conscient ~ as.factor(education)*as.factor(gender), data = bfi)
summary(mod5.0)
##
## Call:
## lm(formula = conscient ~ as.factor(education) * as.factor(gender),
## data = bfi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4372 -0.6372 0.0722 0.7254 1.9980
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 4.23864 0.09967 42.525
## as.factor(education)2 -0.23666 0.13635 -1.736
## as.factor(education)3 0.02119 0.11180 0.190
## as.factor(education)4 -0.11094 0.12907 -0.860
## as.factor(education)5 -0.11064 0.12555 -0.881
## as.factor(gender)2 -0.19020 0.12948 -1.469
## as.factor(education)2:as.factor(gender)2 0.52266 0.17401 3.004
## as.factor(education)3:as.factor(gender)2 0.36754 0.14263 2.577
## as.factor(education)4:as.factor(gender)2 0.33711 0.16420 2.053
## as.factor(education)5:as.factor(gender)2 0.42822 0.16115 2.657
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## as.factor(education)2 0.08275 .
## as.factor(education)3 0.84970
## as.factor(education)4 0.39013
## as.factor(education)5 0.37830
## as.factor(gender)2 0.14198
## as.factor(education)2:as.factor(gender)2 0.00269 **
## as.factor(education)3:as.factor(gender)2 0.01003 *
## as.factor(education)4:as.factor(gender)2 0.04017 *
## as.factor(education)5:as.factor(gender)2 0.00793 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.935 on 2480 degrees of freedom
## (310 observations deleted due to missingness)
## Multiple R-squared: 0.01986, Adjusted R-squared: 0.01631
## F-statistic: 5.585 on 9 and 2480 DF, p-value: 1.16e-07
#Effect-code gender (male stays 1, female becomes -1) so it can enter the model as a numeric predictor
bfi <- bfi %>%
mutate(genderE = ifelse(gender == 2, -1, gender))
mod5 <- lm(conscient ~ education*genderE, data = bfi)
summary(mod5)
##
## Call:
## lm(formula = conscient ~ education * genderE, data = bfi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4229 -0.6229 0.0456 0.6456 1.8433
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.23599 0.05936 71.358 <2e-16 ***
## education 0.01076 0.01750 0.615 0.539
## genderE -0.01561 0.05936 -0.263 0.793
## education:genderE -0.02350 0.01750 -1.343 0.179
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.939 on 2486 degrees of freedom
## (310 observations deleted due to missingness)
## Multiple R-squared: 0.009233, Adjusted R-squared: 0.008037
## F-statistic: 7.722 on 3 and 2486 DF, p-value: 3.917e-05
confint(mod5)
## 2.5 % 97.5 %
## (Intercept) 4.11958030 4.35239148
## education -0.02354989 0.04507208
## genderE -0.13201474 0.10079644
## education:genderE -0.05781121 0.01081076
Check the residuals for mod5.
#use the plot() function here
plot(mod5)
Try making a figure that splits the slope of education on conscientiousness by gender. Copy and paste from yesterday’s code if you’d like.
#regression lines split by gender
ggplot(bfi, aes(x = education, y = conscient, color = as.factor(gender))) +
geom_smooth(method = "lm", se = 0) +
ylim(1, 6)
Because personality is relatively stable, we may instead ask: are people who are more conscientious more likely to graduate from college? This calls for a logistic regression model. We can use the code we wrote yesterday to create the dichotomous variable, where TRUE (treated as 1) means the person graduated from college and FALSE (treated as 0) means they did not.
bfi <- bfi %>%
mutate(coll_grad = (education > 3))
tally(~coll_grad, data = bfi)
## coll_grad
## TRUE FALSE <NA>
## 812 1765 223
ggplot(filter(bfi, !is.na(coll_grad)), aes(x = coll_grad, y = conscient)) +
geom_boxplot()
Then we can run our model by using the glm() function and adding family = binomial.
mod6 <- glm(coll_grad ~ conscient, data = bfi, family = binomial)
summary(mod6)
##
## Call:
## glm(formula = coll_grad ~ conscient, family = binomial, data = bfi)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9750 -0.8829 -0.8535 1.4818 1.5773
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.41527 0.19925 -2.084 0.0371 *
## conscient -0.08142 0.04549 -1.790 0.0735 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3113.0 on 2489 degrees of freedom
## Residual deviance: 3109.8 on 2488 degrees of freedom
## (310 observations deleted due to missingness)
## AIC: 3113.8
##
## Number of Fisher Scoring iterations: 4
confint(mod6)
## 2.5 % 97.5 %
## (Intercept) -0.8074043 -0.026019218
## conscient -0.1705590 0.007841277
It’s handy to look at the exponentiated estimates (odds ratios) using the exp() function.
exp(coef(mod6))
## (Intercept) conscient
## 0.6601588 0.9218058
The odds of graduating from college are multiplied by 0.92 for every 1-unit increase in conscientiousness, but this effect is only marginally significant, p = .073.
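If you also want the confidence interval on the odds-ratio scale, one option is to exponentiate the limits from confint() as well:
#Confidence interval for the odds ratio
exp(confint(mod6))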