21 GLMM, Binary Outcome: Thailand Education (Hox ch 6)

Article: https://www.nature.com/articles/s41598-023-34478-0

https://stats.oarc.ucla.edu/other/examples/ma-hox/

chrome-extension://efaidnbmnnnibpcajpcglclefindmkaj/https://www.lesahoffman.com/CLDP945/CLDP945_Lecture07_Generalized.pdf

https://stats.oarc.ucla.edu/r/dae/mixed-effects-logistic-regression/

21.1 PREPARATION

21.1.1 Load Packages

library(tidyverse)  
library(haven)
library(psych) 
library(lme4) 
library(optimx)
library(emmeans)     
library(performance)
library(interactions) 

21.1.2 Background

NESTING * schoolid identify school each student attends

OUCOME * re student repeated a grade during primary school: Yes or No

PREDICTORS * sex sex: Girl or Boy * pped attended preschool: Yes or No * msesc school’s mean SES, missing for some schools, grand mean centered

21.1.3 Read in the data

df_raw <- haven::read_sav("data/thaieduc.sav") %>% 
  haven::as_factor() %>% 
  haven::zap_label() %>% 
  haven::zap_formats() %>% 
  haven::zap_widths()
tibble::glimpse(df_raw)
Rows: 8,582
Columns: 5
$ SCHOOLID <dbl> 10101, 10101, 10101, 10101, 10101, 10101, 10101, 10101, 10101…
$ SEX      <fct> girl, girl, girl, girl, girl, girl, girl, girl, girl, girl, g…
$ PPED     <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, y…
$ REPEAT   <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, n…
$ MSESC    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…

21.1.4 Long Format

One line per child

df_long <- df_raw %>%   
  dplyr::rename_all(tolower) %>% 
  dplyr::rename(re = `repeat`) %>% 
  dplyr::mutate_if(is.factor, 
                   ~forcats::fct_relabel(.x, stringr::str_to_title)) %>% 
  dplyr::mutate(re01 = case_when(re == "No" ~ 0,
                                 re == "Yes" ~ 1))
tibble::glimpse(df_long)
Rows: 8,582
Columns: 6
$ schoolid <dbl> 10101, 10101, 10101, 10101, 10101, 10101, 10101, 10101, 10101…
$ sex      <fct> Girl, Girl, Girl, Girl, Girl, Girl, Girl, Girl, Girl, Girl, G…
$ pped     <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Y…
$ re       <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, No, N…
$ msesc    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ re01     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
df_long %>% 
  psych::headTail(top = 10) %>% 
  flextable::flextable() %>% 
  apaSupp::theme_apa(caption = "Thailand Education Data on Repeating a Grade, Long Format")
Table 21.1
Thailand Education Data on Repeating a Grade, Long Format

schoolid

sex

pped

re

msesc

re01

10101

Girl

Yes

No

0

10101

Girl

Yes

No

0

10101

Girl

Yes

No

0

10101

Girl

Yes

No

0

10101

Girl

Yes

No

0

10101

Girl

Yes

No

0

10101

Girl

Yes

No

0

10101

Girl

Yes

No

0

10101

Girl

Yes

No

0

10101

Girl

Yes

No

0

...

...

...

180665

Boy

No

Yes

0.98

1

180665

Boy

No

Yes

0.98

1

180665

Boy

No

Yes

0.98

1

180665

Boy

No

Yes

0.98

1

21.1.5 Wide Format

One Line Per School

df_wide <- df_long %>% 
  dplyr::group_by(schoolid, msesc) %>% 
  dplyr::summarise(n_std = n(),
                   per_girl = 100*mean(sex == "Girl"),
                   per_pped = 100*mean(pped == "Yes"),
                   per_re   = 100*mean(re == "Yes")) %>% 
  dplyr::ungroup()
tibble::glimpse(df_wide)
Rows: 411
Columns: 6
$ schoolid <dbl> 10101, 10102, 10103, 10104, 10105, 10106, 10107, 10108, 10109…
$ msesc    <dbl> NA, NA, 0.88, 0.20, -0.07, 0.47, NA, 0.76, 1.06, 0.54, 0.50, …
$ n_std    <int> 19, 24, 17, 29, 18, 5, 15, 19, 21, 25, 12, 14, 17, 22, 15, 26…
$ per_girl <dbl> 78.95, 45.83, 35.29, 51.72, 44.44, 40.00, 60.00, 63.16, 52.38…
$ per_pped <dbl> 100.00, 95.83, 88.24, 55.17, 66.67, 100.00, 80.00, 100.00, 85…
$ per_re   <dbl> 0.00, 0.00, 5.88, 0.00, 27.78, 0.00, 13.33, 15.79, 42.86, 4.0…
df_wide %>% 
  psych::headTail(top = 10) %>% 
  flextable::flextable() %>% 
  apaSupp::theme_apa(caption = "Thailand Education Data on Repeating a Grade, Wide Format")
Table 21.2
Thailand Education Data on Repeating a Grade, Wide Format

schoolid

msesc

n_std

per_girl

per_pped

per_re

10101

19

78.95

100

0

10102

24

45.83

95.83

0

10103

0.88

17

35.29

88.24

5.88

10104

0.2

29

51.72

55.17

0

10105

-0.07

18

44.44

66.67

27.78

10106

0.47

5

40

100

0

10107

15

60

80

13.33

10108

0.76

19

63.16

100

15.79

10109

1.06

21

52.38

85.71

42.86

10211

0.54

25

40

68

4

...

...

...

...

...

...

180662

0.4

24

79.17

33.33

0

180663

0.57

14

42.86

0

28.57

180664

0.57

16

56.25

25

37.5

180665

0.98

30

53.33

3.33

16.67

21.1.6 Modeling Subset

df_model <- df_long %>% 
  dplyr::filter(complete.cases(msesc, sex, pped, re))

21.2 EXPLORATORY DATA ANALYSIS

21.2.1 Sample Size

n = 7516 students

nrow(df_model)
[1] 7516

n = 411 - 55 = 356 schools

nrow(df_wide)
[1] 411
df_wide %>% 
  dplyr::filter(complete.cases(msesc)) %>% 
  nrow()
[1] 356

21.2.2 Missing Data

df_wide %>% 
  VIM::aggr(numbers = TRUE,   # shows the number to the far right
            prop = FALSE)     # shows counts instead of proportions

df_wide %>% 
  dplyr::mutate(miss = ifelse(!is.na(msesc), "Include", "Exclude") %>% 
                  factor) %>% 
  dplyr::select(miss,
                "# Students" = n_std,
                "% Girls" = per_girl,
                "% Preschool" = per_pped,
                "% Repeat Grade" = per_re) %>% 
  apaSupp::tab1(split = "miss",
                caption = "Compare School Excluded due to Missing SES")
Table 21.3
Compare School Excluded due to Missing SES

Total
N = 411

Exclude
n = 55 (13.4%)

Include
n = 356 (86.6%)

p-value

# Students

20.88 (6.89)

19.38 (6.60)

21.11 (6.92)

.076

% Girls

49.95 (14.89)

51.72 (17.06)

49.68 (14.54)

.403

% Preschool

48.10 (37.85)

53.27 (35.07)

47.30 (38.24)

.250

% Repeat Grade

15.74 (16.51)

18.10 (15.58)

15.38 (16.64)

.235

Note. Continuous variables are summarized with means (SD) and significant group differences assessed via independent t-tests. Categorical variables are summarized with counts (%) and significant group differences assessed via Chi-squared tests for independence.

* p < .05. ** p < .01. *** p < .001.

21.2.3 Summarize

21.2.3.1 Demographics and Baseline

df_wide %>% 
  dplyr::filter(complete.cases(msesc)) %>% 
  dplyr::select("# Students" = n_std,
                "% Girls" = per_girl,
                "% Preschool" = per_pped,
                "% Repeat Grade" = per_re) %>% 
  apaSupp::tab_desc(caption = "Description of Schools")
Table 21.4
Description of Schools

NA

M

SD

min

Q1

Mdn

Q3

max

# Students

0

21.11

6.92

2.00

16.00

21.00

26.00

41.00

% Girls

0

49.68

14.54

0.00

41.67

50.00

57.69

100.00

% Preschool

0

47.30

38.24

0.00

4.35

48.00

85.95

100.00

% Repeat Grade

0

15.38

16.64

0.00

0.00

11.11

24.14

83.33

Note. N = 356. NA = not available or missing; Mdn = median; Q1 = 25th percentile; Q3 = 75th percentile.

df_wide %>% 
  dplyr::filter(complete.cases(msesc)) %>% 
  ggplot(aes(x = per_pped,
             y = per_re)) +
  geom_point() +
  theme_bw() 

df_model %>% 
  dplyr::mutate(re = forcats::fct_recode(re,
                                         "Never Repeated" = "No",
                                         "Repeated a Grade" = "Yes")) %>% 
  dplyr::select(re,
                "Sex" = sex,
                "Attended Preschool" = pped,
                "SES, school mean" = msesc) %>% 
  apaSupp::tab1(split = "re",
                caption = "Compare Students ",
                type = list("Attended Preschool" = "categorical"))
Table 21.5
Compare Students

Total
N = 7516

Never Repeated
n = 6449 (85.8%)

Repeated a Grade
n = 1067 (14.2%)

p-value

Sex

< .001***

Girl

3,700 (49.2%)

3,272 (50.7%)

428 (40.1%)

Boy

3,816 (50.8%)

3,177 (49.3%)

639 (59.9%)

Attended Preschool

< .001***

No

3,772 (50.2%)

3,099 (48.1%)

673 (63.1%)

Yes

3,744 (49.8%)

3,350 (51.9%)

394 (36.9%)

SES, school mean

0.01 (0.38)

0.02 (0.37)

-0.04 (0.40)

< .001***

Note. Continuous variables are summarized with means (SD) and significant group differences assessed via independent t-tests. Categorical variables are summarized with counts (%) and significant group differences assessed via Chi-squared tests for independence.

* p < .05. ** p < .01. *** p < .001.

21.3 MULILEVEL ANALYSIS

21.3.1 Fit Model

Great articles:
https://bbolker.github.io/mixedmodels-misc/glmmFAQ.html#what-methods-are-available-to-fit-estimate-glmms https://www.ellessenne.xyz/blog/2022/06/18/ * https://bbolker.github.io/goettingen_2019/notes/glmm_details.html slides: https://people.math.aau.dk/~rw/Undervisning/TostaII/Slides/2.pdf

21.3.1.1 Default is the Laplace ML

nAGQ = 1 (Laplace Approximation): This is the default method for glmer(). It approximates the integrand (the likelihood function) by a normal distribution using a second-order Taylor expansion around the mode. It is more accurate than PQL but slower.

fit_glmer_1 <- lme4::glmer(re01 ~ sex + pped + msesc + (1|schoolid),
                           data = df_model,
                           family = binomial)
summary(fit_glmer_1)
Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: re01 ~ sex + pped + msesc + (1 | schoolid)
   Data: df_model

      AIC       BIC    logLik -2*log(L)  df.resid 
     5457      5492     -2724      5447      7511 

Scaled residuals: 
   Min     1Q Median     3Q    Max 
-2.180 -0.403 -0.247 -0.168  6.092 

Random effects:
 Groups   Name        Variance Std.Dev.
 schoolid (Intercept) 1.62     1.27    
Number of obs: 7516, groups:  schoolid, 356

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -2.2393     0.1057  -21.18  < 2e-16 ***
sexBoy        0.5340     0.0759    7.04  2.0e-12 ***
ppedYes      -0.6265     0.0999   -6.27  3.6e-10 ***
msesc        -0.2969     0.2140   -1.39     0.17    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
        (Intr) sexBoy ppedYs
sexBoy  -0.433              
ppedYes -0.384 -0.003       
msesc    0.058  0.005 -0.108

21.3.1.2 Penalized Iteratively Reweighted Least Squares

nAGQ = 0 (Penalized Iteratively Reweighted Least Squares - PIRLS): This is a faster but less exact method that avoids explicit integration by optimizing the fixed and random effects simultaneously in a penalized fashion. It is often referred to as penalized quasi-likelihood (PQL), and provides a good starting point for more accurate methods.

fit_glmer_0 <- lme4::glmer(re01 ~ sex + pped + msesc + (1|schoolid),
                           data = df_model,
                           family = binomial,
                           nAGQ = 0)
summary(fit_glmer_0)
Generalized linear mixed model fit by maximum likelihood (Adaptive
  Gauss-Hermite Quadrature, nAGQ = 0) [glmerMod]
 Family: binomial  ( logit )
Formula: re01 ~ sex + pped + msesc + (1 | schoolid)
   Data: df_model

      AIC       BIC    logLik -2*log(L)  df.resid 
     5461      5496     -2726      5451      7511 

Scaled residuals: 
   Min     1Q Median     3Q    Max 
-2.174 -0.412 -0.254 -0.176  5.840 

Random effects:
 Groups   Name        Variance Std.Dev.
 schoolid (Intercept) 1.61     1.27    
Number of obs: 7516, groups:  schoolid, 356

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -2.0786     0.0995  -20.88  < 2e-16 ***
sexBoy        0.5156     0.0745    6.92  4.4e-12 ***
ppedYes      -0.5998     0.0980   -6.12  9.4e-10 ***
msesc        -0.2599     0.2102   -1.24     0.22    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
        (Intr) sexBoy ppedYs
sexBoy  -0.437              
ppedYes -0.413  0.000       
msesc    0.049  0.006 -0.110

21.3.1.3 Adaptive Gauss-Hermite Quadrature

nAGQ = k (Adaptive Gauss-Hermite Quadrature - AGHQ): For k > 1, glmer() uses adaptive Gauss-Hermite quadrature, which is the most reliable and accurate method among the available options. This method approximates the integral as a weighted sum over a specific number of points (nodes).Larger values of k (e.g., up to 25) increase accuracy but also significantly increase computation time. Limitation: AGHQ in glmer() is currently implemented only for models with a single scalar random effect (i.e., a single random effects term with a single variance component).

fit_glmer_50 <- lme4::glmer(re01 ~ sex + pped + msesc + (1|schoolid),
                            data = df_model,
                            family = binomial,
                            nAGQ = 50)
summary(fit_glmer_50)
Generalized linear mixed model fit by maximum likelihood (Adaptive
  Gauss-Hermite Quadrature, nAGQ = 50) [glmerMod]
 Family: binomial  ( logit )
Formula: re01 ~ sex + pped + msesc + (1 | schoolid)
   Data: df_model

      AIC       BIC    logLik -2*log(L)  df.resid 
     5452      5486     -2721      5442      7511 

Scaled residuals: 
   Min     1Q Median     3Q    Max 
-2.200 -0.404 -0.246 -0.167  6.125 

Random effects:
 Groups   Name        Variance Std.Dev.
 schoolid (Intercept) 1.69     1.3     
Number of obs: 7516, groups:  schoolid, 356

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)   -2.242      0.107  -21.02  < 2e-16 ***
sexBoy         0.535      0.076    7.04  1.9e-12 ***
ppedYes       -0.627      0.100   -6.25  4.0e-10 ***
msesc         -0.296      0.217   -1.36     0.17    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
        (Intr) sexBoy ppedYs
sexBoy  -0.430              
ppedYes -0.382 -0.003       
msesc    0.056  0.005 -0.107

21.3.2 Parameter Estiamtes Table

apaSupp::tab_glmers(list("PQL"     = fit_glmer_0,
                         "Laplase" = fit_glmer_1,
                         "Numeric" = fit_glmer_50),
                    narrow = TRUE,
                    var_labels = c("sex" = "Sex",
                                   "pped" = "Attended Preschool",
                                   "msesc" = "School SES"),
                    general_note = "School SES is grand mean centered.")
Table 21.6
Compare Generalized Mixed-Effects Models

PQL

Laplase

Numeric

Variable

OR

[95% CI]

OR

[95% CI]

OR

[95% CI]

Sex

Girl

Boy

1.67

[1.45, 1.94]***

1.71

[1.47, 1.98]***

1.71

[1.47, 1.98]***

Attended Preschool

No

Yes

0.55

[0.45, 0.67]***

0.53

[0.44, 0.65]***

0.53

[0.44, 0.65]***

School SES

0.77

[0.51, 1.16]

0.74

[0.49, 1.13]

0.74

[0.49, 1.14]

schoolid.var__(Intercept)

1.61

1.62

1.69

AIC

5,461.00

5,456.93

5,451.66

BIC

5,495.62

5,491.56

5,486.28

Log-likelihood

-2,725.50

-2,723.47

-2,720.83

Note. N = 7516. CI = confidence interval.School SES is grand mean centered.

* p < .05. ** p < .01. *** p < .001.

apaSupp::tab_glmer(fit_glmer_50,
                   var_labels = c("sex" = "Sex",
                                   "pped" = "Attended Preschool",
                                   "msesc" = "School SES",
                                  "schoolid.var__(Intercept)" = "Random Intercept Variance"),
                    general_note = "School SES is grand mean centered.")
Table 21.7
Parameter Estimates for Generalized Mixed Effects Regression

Odds Ratio

Logit Scale

Variable

Est

[95% CI]

b

(SE)

p

Sex

Girl

Boy

1.71

[1.47, 1.98]

0.54

(0.08)

< .001***

Attended Preschool

No

Yes

0.53

[0.44, 0.65]

-0.63

(0.10)

< .001***

School SES

0.74

[0.49, 1.14]

-0.30

(0.22)

.173

Random Intercept Variance

1.69

Note. p = significance from Wald t-test for parameter estimate using Satterthwaite's method for degrees of freedom. School SES is grand mean centered.

* p < .05. ** p < .01. *** p < .001.

21.3.3 Predicted Probabilites

21.3.3.1 Likert Scale

fit_glmer_1 %>% 
  emmeans::emmeans(~ pped + sex)
 pped sex  emmean    SE  df asymp.LCL asymp.UCL
 No   Girl  -2.24 0.106 Inf     -2.45     -2.04
 Yes  Girl  -2.87 0.114 Inf     -3.09     -2.64
 No   Boy   -1.71 0.100 Inf     -1.90     -1.51
 Yes  Boy   -2.33 0.109 Inf     -2.55     -2.12

Results are given on the logit (not the response) scale. 
Confidence level used: 0.95 

21.3.3.2 Back-Transform

fit_glmer_1 %>% 
  emmeans::emmeans(~ pped + sex,
                   type = "response")
 pped sex    prob      SE  df asymp.LCL asymp.UCL
 No   Girl 0.0960 0.00919 Inf    0.0795    0.1156
 Yes  Girl 0.0537 0.00580 Inf    0.0434    0.0663
 No   Boy  0.1534 0.01300 Inf    0.1296    0.1806
 Yes  Boy  0.0883 0.00874 Inf    0.0726    0.1070

Confidence level used: 0.95 
Intervals are back-transformed from the logit scale 
fit_glmer_1 %>% 
  emmeans::emmeans(~ pped + sex,
                   type = "response") %>% 
  data.frame() %>% 
  dplyr::mutate(across(c(prob, asymp.LCL, asymp.UCL),
                       ~apaSupp::p_num(.x, stars = FALSE))) %>% 
  dplyr::mutate(cell = glue::glue("{prob} [{asymp.LCL}, {asymp.UCL}]")) %>% 
  dplyr::select(Sex = sex,
                "Attend Preschool" = pped, 
                "Est [95% CI]" = cell) %>% 
  flextable::as_grouped_data(groups = "Sex") %>% 
  flextable::flextable() %>% 
  apaSupp::theme_apa(caption = "Predicted Probability of Repeating a Primary Grade, by Sex and Preschool Attendance",
                     general_note = "Predications are for grand-mean school SES.")
Table 21.8
Predicted Probability of Repeating a Primary Grade, by Sex and Preschool Attendance

Sex

Attend Preschool

Est [95% CI]

Girl

No

.096 [ .079 , .116 ]

Yes

.054 [ .043 , .066 ]

Boy

No

.153 [ .130 , .181 ]

Yes

.088 [ .073 , .107 ]

Note. Predications are for grand-mean school SES.

21.3.4 Plot

21.3.4.1 Quick

interactions::cat_plot(model = fit_glmer_1,
                       pred = pped,
                       modx = sex,
                       geom = "line")

21.3.4.2 Publishable (95% CI)

fit_glmer_1 %>% 
  emmeans::emmeans(~ pped + sex,
                   type = "response") %>% 
  data.frame() %>% 
  dplyr::mutate(sex = forcats::fct_rev(sex)) %>% 
  ggplot(aes(x = pped,
             y = prob,
             group = sex,
             color = sex)) +
  geom_errorbar(aes(ymin = asymp.LCL,
                    ymax = asymp.UCL),
                width = .25) +
  geom_line(aes(linetype = sex),
  ) +
  geom_point(aes(shape = sex),
             size = 3) +
  theme_bw() +
  scale_color_manual(values = c("dodgerblue", "red")) +
  scale_linetype_manual(values = c("longdash", "solid")) +
  labs(x = "Attended Preschool",
       y = "Predicted Probabiliy of\nRepeated a Primary Grade",
       color = NULL,
       linetype = NULL,
       shape = NULL) +
  theme(legend.position = "inside",
        legend.position.inside = c(1.0, 1.0),
        legend.justification = c(1.1, 1.1),
        legend.background = element_rect(color = "black"),
        legend.key.width = unit(1.5, "cm"))