11 MLM, Longitudinal: Autism

Download/install a new GitHub package

install.packages("devtools")
devtools::install_github("sarbearschwartz/apaSupp") # updated 9/17/2025

Load/activate these packages

library(HLMdiag)      # package with the dataset
library(tidyverse)    
library(flextable)
library(apaSupp)      # Not on CRAN, on GitHub (see above)
library(psych)
library(VIM)         
library(naniar) 
library(lme4)        
library(lmerTest)
library(optimx)
library(performance)
library(interactions)
library(emmeans)
library(ggResidpanel)
library(HLMdiag)
library(sjPlot)

11.1 Background

The source: http://www-personal.umich.edu/~kwelch/

This data was collected by researchers at the University of Michigan (Anderson et al. 2009) as part of a prospective longitudinal study of 214 children. The children were divided into three diagnostic groups (bestest2) when they were 2 years old: Autism (autism), Pervasive Developmental Disorder (pdd), and non-spectrum children (none in this sample). The study was designed to collect information on each child at approximately 2, 3, 5, 9, and 13 years of age, although not all children were measured for each age. One of the study objectives was to assess the relative influence of the initial diagnostic category, language proficiency at age 2, and other covariates on the developmental trajectories of the socialization (vsae) of these children.

Study participants were children who had had consecutive referrals to one of two autism clinics before the age of 3 years. Social development was assessed at each age using the Vineland Adaptive Behavior Interview survey form, a parent-reported measure of socialization. VSAE (Vineland Socialization Age Equivalent), was a combined score that included assessments of interpersonal relationships, play/leisure time activities, and coping skills. Initial language development was assessed using the Sequenced Inventory of Communication Development (SICD) scale; children were placed into one of three groups (sicdegp) based on their initial SICD scores on the expressive language subscale at age 2.

childid Child’s identification number for this study
sicdegp Sequenced Inventory of Communication Development group (an assessment of expressive language development) - a factor. Levels are low, med, and high
age2 Age (in years) centered around age 2 (age at diagnosis)
vsae Vineland Socialization Age Equivalent, Parent-reported measure of socialization, including measures of:
interpersonal relationships
play/leisure time activities
coping skills
gender Child’s gender - a factor. Levels are male and female
race Child’s race - a factor. Levels are white and nonwhite
bestest2 Diagnosis at age 2 - a factor. Levels are autism and pdd (pervasive developmental disorder)

data(autism, package = "HLMdiag")   # make the dataset 'active' from this package

tibble::glimpse(autism)             # first look at the dataset and its varaibles

Rows: 604
Columns: 7
$ childid  <int> 1, 1, 1, 1, 1, 10, 10, 10, 10, 100, 100, 100, 100, 101, 101, …
$ sicdegp  <fct> high, high, high, high, high, low, low, low, low, high, high,…
$ age2     <dbl> 0, 1, 3, 7, 11, 0, 1, 7, 11, 0, 1, 3, 7, 0, 1, 7, 11, 0, 1, 3…
$ vsae     <int> 6, 7, 18, 25, 27, 9, 11, 18, 39, 15, 24, 37, 135, 8, 24, 75, …
$ gender   <fct> male, male, male, male, male, male, male, male, male, male, m…
$ race     <fct> white, white, white, white, white, white, white, white, white…
$ bestest2 <fct> pdd, pdd, pdd, pdd, pdd, autism, autism, autism, autism, pdd,…

11.1.1 Long Format

data_long <- autism %>%                                    # save the dataset as a new name
  dplyr::mutate(childid = childid %>% factor) %>%          # declare grouping var a factor
  dplyr::mutate(age = 2 + age2) %>%                        # create the original age variable (unequally spaced)
  dplyr::mutate(obs = age %>% factor %>% as.numeric) %>%   # Observation Number = 1, 2, 3, 4, 5 (equally spaced)
  dplyr::select(childid,                                   # choose variables and order to keep
                gender, race, bestest2, sicdegp, 
                obs, age, age2, vsae) %>% 
  dplyr::arrange(childid, age2)                            # sort observations

data_long %>% 
  psych::headTail(top = 11, bottom = 6) %>% 
  flextable::flextable() %>% 
  apaSupp::theme_apa(caption = "Part of the Dataset")

**Table 11.1**
Part of the Dataset
childid	gender	race	bestest2	sicdegp	obs	age	age2	vsae
1	male	white	pdd	high	1	2	0	6
1	male	white	pdd	high	2	3	1	7
1	male	white	pdd	high	3	5	3	18
1	male	white	pdd	high	4	9	7	25
1	male	white	pdd	high	5	13	11	27
2	male	white	autism	low	1	2	0	6
2	male	white	autism	low	2	3	1	7
2	male	white	autism	low	3	5	3	7
2	male	white	autism	low	4	9	7	8
2	male	white	autism	low	5	13	11	14
3	male	nonwhite	pdd	high	1	2	0	17
					...	...	...	...
211	male	nonwhite	autism	high	1	2	0	15
212	male	white	autism	med	1	2	0	7
212	male	white	autism	med	2	3	1	21
212	male	white	autism	med	3	5	3	29
212	male	white	autism	med	4	9	7	72
212	male	white	autism	med	5	13	11	147

11.1.2 Wide Format

data_wide <- data_long %>%                            # save the dataset as a new name
  dplyr::select(-age2, -obs) %>%                      # delete (by deselecting) this variable 
  tidyr::pivot_wider(names_from  = age,               # repeated indicator
                     values_from = vsae,              # variable repeated
                     names_prefix = "vsae_") %>%      # prefix in from of the 
  dplyr::arrange(childid)                             # sort observations

tibble::glimpse(data_wide)

Rows: 155
Columns: 10
$ childid  <fct> 1, 2, 3, 4, 6, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 21, …
$ gender   <fct> male, male, male, male, male, male, male, male, male, male, m…
$ race     <fct> white, white, nonwhite, nonwhite, white, nonwhite, white, whi…
$ bestest2 <fct> pdd, autism, pdd, autism, autism, autism, pdd, autism, autism…
$ sicdegp  <fct> high, low, high, high, low, low, med, low, med, low, med, med…
$ vsae_2   <int> 6, 6, 17, 12, 6, 9, 12, 9, 7, 6, 12, 13, 7, 5, 10, 17, 11, 11…
$ vsae_3   <int> 7, 7, 18, 14, 12, 12, 21, 11, 10, 10, 19, 8, 6, 10, 6, 27, 21…
$ vsae_5   <int> 18, 7, 12, 38, NA, 14, NA, NA, 8, 12, 14, 29, NA, 29, NA, NA,…
$ vsae_9   <int> 25, 8, 18, 114, 12, NA, 66, 18, NA, NA, 28, 24, 39, 32, NA, 7…
$ vsae_13  <int> 27, 14, 24, NA, 45, NA, 68, 39, NA, NA, 68, 44, 24, 67, NA, 1…

Notice the missing values, displayed as NA.

data_wide %>% 
  psych::headTail()

  childid gender     race bestest2 sicdegp vsae_2 vsae_3 vsae_5 vsae_9 vsae_13
1       1   male    white      pdd    high      6      7     18     25      27
2       2   male    white   autism     low      6      7      7      8      14
3       3   male nonwhite      pdd    high     17     18     12     18      24
4       4   male nonwhite   autism    high     12     14     38    114    <NA>
5    <NA>   <NA>     <NA>     <NA>    <NA>    ...    ...    ...    ...     ...
6     209   male    white   autism     med      2      4   <NA>     12      32
7     210   male    white   autism     low      4     25   <NA>    130    <NA>
8     211   male nonwhite   autism    high     15   <NA>   <NA>   <NA>    <NA>
9     212   male    white   autism     med      7     21     29     72     147

11.2 Exploratory Data Analysis

11.2.1 Demographic Summary

11.2.1.1 Using the WIDE formatted dataset

Each person’s data is only stored on a single line

data_wide %>% 
  dplyr::select(sicdegp,
                "Diagnosis" = bestest2, 
                "Gender"    = gender, 
                "Race"      = race) %>% 
  apaSupp::tab1(split = "sicdegp",
                caption = "Demographic Summary by Initial Lanuguage Development")

**Table 11.2**
Demographic Summary by Initial Lanuguage Development
	Total N = 155	low n = 49 (31.6%)	med n = 65 (41.9%)	high n = 41 (26.5%)	p-value
Diagnosis					.018*
autism	100 (64.5%)	38 (77.6%)	42 (64.6%)	20 (48.8%)
pdd	55 (35.5%)	11 (22.4%)	23 (35.4%)	21 (51.2%)
Gender					.706
male	134 (86.5%)	44 (89.8%)	55 (84.6%)	35 (85.4%)
female	21 (13.5%)	5 (10.2%)	10 (15.4%)	6 (14.6%)
Race					.944
white	101 (65.2%)	31 (63.3%)	43 (66.2%)	27 (65.9%)
nonwhite	54 (34.8%)	18 (36.7%)	22 (33.8%)	14 (34.1%)
Note. . .
* p < .05. p < .01. * p < .001.

11.2.1.2 Using the LONG formatted dataset

Each person’s data is stored on multiple lines, one for each time point. To ensure the summary table is correct, you must choose a single time point per person.

# Note: One person is missing Age 2
data_long %>% 
  dplyr::filter(age == 2) %>% # restrict to one-line per person
  dplyr::select(sicdegp,
                "Diagnosis" = bestest2, 
                "Gender"    = gender, 
                "Race"      = race) %>% 
  apaSupp::tab1(split = "sicdegp",
                caption = "Demographic Summary by Initial Lanuguage Development")

**Table 11.3**
Demographic Summary by Initial Lanuguage Development
	Total N = 154	low n = 49 (31.8%)	med n = 65 (42.2%)	high n = 40 (26.0%)	p-value
Diagnosis					.025*
autism	100 (64.9%)	38 (77.6%)	42 (64.6%)	20 (50.0%)
pdd	54 (35.1%)	11 (22.4%)	23 (35.4%)	20 (50.0%)
Gender					.714
male	134 (87.0%)	44 (89.8%)	55 (84.6%)	35 (87.5%)
female	20 (13.0%)	5 (10.2%)	10 (15.4%)	5 (12.5%)
Race					.950
white	100 (64.9%)	31 (63.3%)	43 (66.2%)	26 (65.0%)
nonwhite	54 (35.1%)	18 (36.7%)	22 (33.8%)	14 (35.0%)
Note. . .
* p < .05. p < .01. * p < .001.

11.2.2 Baseline Summary

11.2.2.1 Using the WIDE formatted dataset

Each person’s data is only stored on a single line

data_wide %>%
  dplyr::select(sicdegp, vsae_2) %>% 
  apaSupp::tab1(split = "sicdegp")

**Table 11.4**
Summary of Variables
	Total N = 155	low n = 49 (31.6%)	med n = 65 (41.9%)	high n = 41 (26.5%)	p-value
vsae_2	9.16 (3.84)	7.06 (2.73)	8.74 (3.51)	12.40 (3.43)	< .001***
Unknown	1	0	0	1
Note. Continuous variables are summarized with means (SD) and significant group differences assessed via independent one-way analysis of vaiance (ANOVA). Categorical variables are summarized with counts (%) and significant group differences assessed via Chi-squared tests for independence.
* p < .05. p < .01. * p < .001.

11.2.2.2 Using the LONG formatted dataset

Each person’s data is stored on multiple lines, one for each time point. To ensure the summary table is correct, you must choose a single time point per person.

data_long %>% 
  dplyr::filter(age == 2) %>% 
  dplyr::select(sicdegp, vsae) %>% 
  apaSupp::tab1(split = "sicdegp")

**Table 11.5**
Summary of Variables
	Total N = 154	low n = 49 (31.8%)	med n = 65 (42.2%)	high n = 40 (26.0%)	p-value
vsae	9.16 (3.84)	7.06 (2.73)	8.74 (3.51)	12.40 (3.43)	< .001***
Note. Continuous variables are summarized with means (SD) and significant group differences assessed via independent one-way analysis of vaiance (ANOVA). Categorical variables are summarized with counts (%) and significant group differences assessed via Chi-squared tests for independence.
* p < .05. p < .01. * p < .001.

11.2.3 Missing Data & Attrition

11.2.3.1 `VIM` package

Plot the amount of missing vlaues and the amount of each patter of missing values.

data_wide %>% 
  VIM::aggr(numbers = TRUE,   # shows the number to the far right
            prop    = FALSE)  # shows counts instead of proportions

11.2.3.2 `naniar` package

data_wide %>% 
  naniar::vis_miss()

data_wide %>% 
  naniar::gg_miss_var()

data_wide %>% 
  naniar::gg_miss_var(show_pct = TRUE,     # x-axis is PERCENT, not count
                      facet = sicdegp) +   # create seperate panels
  theme_bw()                               # add ggplot layers as normal

data_wide %>% 
  naniar::gg_miss_upset()

data_wide %>% 
  naniar::gg_miss_upset(sets = c("vsae_13_NA",
                                 "vsae_9_NA",
                                 "vsae_5_NA",
                                 "vsae_3_NA",
                                 "vsae_2_NA"),
                        keep.order = TRUE)

11.2.4 Means Across Time

11.2.4.1 Using the WIDE formatted dataset

data_wide %>%
  dplyr::select(sicdegp,
                "Age 2" = vsae_2,
                "Age 3" = vsae_3,
                "Age 5" = vsae_5,
                "Age 9" = vsae_9,
                "Age 13" = vsae_13) %>% 
  apaSupp::tab1(split = "sicdegp",
                caption = "Summary of Vineland Socialization at each Observation by Initial Lanuguage Development",
                general_note = "The Vineland Socialization is an Age Equivalent, Parent-reported measure of socialization combined score that included assessments of interpersonal relationships, play/leisure time activities, and coping skills.")

**Table 11.6**
Summary of Vineland Socialization at each Observation by Initial Lanuguage Development
	Total N = 155	low n = 49 (31.6%)	med n = 65 (41.9%)	high n = 41 (26.5%)	p-value
Age 2	9.16 (3.84)	7.06 (2.73)	8.74 (3.51)	12.40 (3.43)	< .001***
Unknown	1	0	0	1
Age 3	15.12 (7.81)	12.02 (6.33)	13.68 (5.42)	21.24 (9.38)	< .001***
Unknown	9	3	3	3
Age 5	21.48 (13.32)	15.03 (7.92)	17.69 (8.00)	33.92 (15.78)	< .001***
Unknown	64	20	29	15
Age 9	39.55 (32.62)	25.56 (28.42)	32.13 (23.40)	64.14 (34.59)	< .001***
Unknown	36	13	17	6
Age 13	59.49 (47.96)	37.11 (35.54)	56.18 (47.91)	88.69 (46.34)	< .001***
Unknown	61	21	25	15
Note. Continuous variables are summarized with means (SD) and significant group differences assessed via independent one-way analysis of vaiance (ANOVA). Categorical variables are summarized with counts (%) and significant group differences assessed via Chi-squared tests for independence. The Vineland Socialization is an Age Equivalent, Parent-reported measure of socialization combined score that included assessments of interpersonal relationships, play/leisure time activities, and coping skills.
* p < .05. p < .01. * p < .001.

11.2.4.2 Using the LONG formatted dataset

Each person’s data is stored on multiple lines, one for each time point.

FOR ALL DATA!

data_long %>% 
  dplyr::group_by(sicdegp, age2) %>%                       # specify the groups
  dplyr::summarise(vsae_n    = n(),                        # count of valid scores
                   vsae_mean = mean(vsae),                 # mean score
                   vsae_sd   = sd(vsae),                   # standard deviation of scores
                   vsae_sem  = vsae_sd / sqrt(vsae_n)) %>% # stadard error for the mean of scores
  flextable::as_grouped_data(groups = "sicdegp") %>% 
  flextable::flextable() %>% 
  apaSupp::theme_apa()

**Table 11.7**
Table Caption
sicdegp	age2	vsae_n	vsae_mean	vsae_sd	vsae_sem
low
	0.00	49	7.06	2.73	0.39
	1.00	46	12.02	6.33	0.93
	3.00	29	15.03	7.92	1.47
	7.00	36	25.56	28.42	4.74
	11.00	28	37.11	35.54	6.72
med
	0.00	65	8.74	3.51	0.44
	1.00	62	13.68	5.42	0.69
	3.00	36	17.69	8.00	1.33
	7.00	48	32.12	23.40	3.38
	11.00	40	56.17	47.91	7.58
high
	0.00	40	12.40	3.43	0.54
	1.00	38	21.24	9.38	1.52
	3.00	26	33.92	15.78	3.09
	7.00	35	64.14	34.59	5.85
	11.00	26	88.69	46.34	9.09

FOR COMPLETE CASES ONLY!!!

data_long %>% 
  dplyr::group_by(childid) %>%                    # group-by child
  dplyr::mutate(child_vsae_n = n()) %>%           # count the number of valid VSAE scores
  dplyr::filter(child_vsae_n == 5) %>%            # restrict to only thoes children with 5 valid scores
  dplyr::group_by(sicdegp, age2) %>%              # specify the groups
  dplyr::summarise(vsae_n    = n(),                      # count of valid scores
                   vsae_mean = mean(vsae),               # mean score
                   vsae_sd   = sd(vsae),                 # standard deviation of scores
                   vsae_sem  = vsae_sd / sqrt(vsae_n)) %>%    # stadard error for the mean of scores
  flextable::as_grouped_data(groups = "sicdegp") %>% 
  flextable::flextable() %>% 
  apaSupp::theme_apa()

**Table 11.8**
Table Caption
sicdegp	age2	vsae_n	vsae_mean	vsae_sd	vsae_sem
low
	0.00	10	6.60	2.88	0.91
	1.00	10	8.00	2.11	0.67
	3.00	10	12.50	5.54	1.75
	7.00	10	12.30	8.07	2.55
	11.00	10	22.40	24.45	7.73
med
	0.00	17	9.47	4.12	1.00
	1.00	17	14.29	6.08	1.47
	3.00	17	20.18	8.57	2.08
	7.00	17	34.41	22.54	5.47
	11.00	17	57.82	50.33	12.21
high
	0.00	14	12.50	4.09	1.09
	1.00	14	19.50	6.15	1.64
	3.00	14	33.93	18.50	4.94
	7.00	14	56.86	23.54	6.29
	11.00	14	80.93	44.38	11.86

11.2.5 Person Profile Plots

Use the data in LONG format

11.2.5.1 Unequally Spaced

data_long %>% 
  dplyr::mutate(sicdegp = fct_recode(sicdegp,
                                     "Low Communication" = "low",
                                     "Medium Communication" = "med",
                                     "High Communication" = "high")) %>% 
  ggplot(aes(x = age,
             y = vsae)) +
  geom_point(size = 0.75) +
  geom_line(aes(group = childid),
            alpha = .5,
            size = 1) +
  facet_grid(. ~ sicdegp)  +
  theme_bw() +
  scale_x_continuous(breaks = c(2, 3, 5, 9, 13)) +
  labs(x = "Age of Child, Years",
       y = "Vineland Socialization Age Equivalent",
       color = "Sequenced Inventory of Communication Development") +
  geom_smooth(aes(color = "Flexible"),
              method = "loess",
              se = FALSE,) +
  geom_smooth(aes(color = "Linear"),
              method = "lm",
              se = FALSE) +
  scale_color_manual(name = "Smoother Type: ",
                     values = c("Flexible" = "blue", 
                                "Linear"    = "red")) +
  theme(legend.position = "bottom",
        legend.key.width = unit(2, "cm"))

11.2.5.2 Equally Spaced

data_long %>% 
  dplyr::mutate(sicdegp = fct_recode(sicdegp,
                                     "Low Communication" = "low",
                                     "Medium Communication" = "med",
                                     "High Communication" = "high")) %>% 
  ggplot(aes(x = obs,
             y = vsae)) +
  geom_point(size = 0.75) +
  geom_line(aes(group = childid),
            alpha = .5,
            size = 1) +
  facet_grid(. ~ sicdegp)  +
  theme_bw() +
  labs(x = "Observation Number",
       y = "Vineland Socialization Age Equivalent",
       color = "Sequenced Inventory of Communication Development") +
  geom_smooth(aes(color = "Flexible"),
              method = "loess",
              se = FALSE,) +
  geom_smooth(aes(color = "Linear"),
              method = "lm",
              se = FALSE) +
  scale_color_manual(name = "Smoother Type: ",
                     values = c("Flexible" = "blue", 
                                "Linear"    = "red")) +
  theme(legend.position = "bottom",
        legend.key.width = unit(2, "cm"))

11.2.6 Side-by-side Boxplots

data_long %>% 
  ggplot(aes(x = sicdegp,
             y = vsae,
             fill = sicdegp)) +
  geom_boxplot() +
  theme_bw() +
  facet_grid(. ~ age, 
             labeller = "label_both") +
  theme(legend.position = "none") +
  labs(x = "Initial Language Development\nSequenced Inventory of Communication Development (SICD) at Age 2",
       y = "Parent-Reported Measure of Socialization\nVineland Socialization Age Equivalent")

11.2.7 Means Plots

11.2.7.1 Default `stat_summary`

It is nice that the stat_summary() layer computes the standard error for the mean for you using the data in LONG format

data_long %>% 
  ggplot(aes(x = age,
             y = vsae,
             color = sicdegp)) +
  stat_summary() +                       # default: points at MEAN and extend vertically 1 standard error for the mean
  stat_summary(fun = "mean",             # plot the means
               geom = "line") +          # ...and connect with lines
  theme_bw() +
  scale_x_continuous(breaks = c(2, 3, 5, 9, 13))

data_long %>% 
  ggplot(aes(x = obs,
             y = vsae,
             color = sicdegp)) +
  stat_summary() +                       # default: points at MEAN and extend vertically 1 standard error for the mean
  stat_summary(fun = "mean",           # plot the means
               geom = "line") +          # ...and connect with lines
  theme_bw()

11.2.7.2 Manually Summarized

data_long %>% 
  dplyr::group_by(sicdegp, age2) %>%                     # specify the groups
  dplyr::summarise(vsae_n    = n(),                      # count of valid scores
                   vsae_mean = mean(vsae),               # mean score
                   vsae_sd   = sd(vsae),                 # standard deviation of scores
                   vsae_sem  = vsae_sd / sqrt(vsae_n)) %>% 
  dplyr::mutate(age = age2 + 2) %>% 
  ggplot() +
  aes(x = age,
      y = vsae_mean,
      color = sicdegp) +
  geom_errorbar(aes(ymin = vsae_mean - vsae_sem,   # mean +/- one SE for the mean
                    ymax = vsae_mean + vsae_sem),
                width = .25) +
  geom_point(aes(shape = sicdegp),
             size = 3) +
  geom_line(aes(group = sicdegp)) +
  theme_bw() +
  scale_x_continuous(breaks = c(2, 3, 5, 9, 13)) +
  labs(x = "Age of Child, Years",
       y = "Vineland Socialization Age Equivalent",
       color = "Sequenced Inventory of Communication Development:",
       shape = "Sequenced Inventory of Communication Development:",
       linetype = "Sequenced Inventory of Communication Development:") +
  theme(legend.position = "bottom",
        legend.key.width = unit(2, "cm"))

11.3 Model 1: Full model with ‘loaded’ mean structure

Take top-down approach: Quadratic regression model, describing vsae as a function of age2

Each child has a unique parabolic trajectory over time, with coefficients that vary randomly around fixed-effects defining a mean growth curve for each SICD group. Since there is no age = 0 in our data, we will use the age2 variables, which is age -2, so that intercept has meaning (mean at baseline age).

I() denotes an internal calculated variable

Fixed-effects
I(age-2) age
I((age-2)^2) quadratic age or age-squared,
sicdegp SICD group (reference group = low)
SICD group x age/age-squared interactions
Random-effects
intercept
age and age-squared

11.3.1 Fit the Model

fit_lmer_1_re <- lmerTest::lmer(vsae ~ I(age-2)*sicdegp + I((age-2)^2)*sicdegp + 
                                  (I(age-2) + I((age-2)^2) | childid), 
                                data = data_long,
                                REML = TRUE,
                                control = lmerControl(optimizer = "optimx",    # get it to converge
                                                      calc.derivs = FALSE,
                                                      optCtrl = list(method = "nlminb",
                                                                     starttests = FALSE,
                                                                     kkt = FALSE)))

11.3.2 Table of Prameter Estimates

apaSupp::tab_lmer(fit_lmer_1_re,
                  caption = "MLM: Full Model")

**Table 11.9**
MLM: Full Model
	Est	(SE)	p
(Intercept)	8.40	(0.75)	< .001***
I(age - 2)	2.28	(0.74)	.002**
sicdegp
low	—	—
med	1.27	(0.99)	.201
high	5.39	(1.11)	< .001***
I((age - 2)^2)	0.07	(0.08)	.403
I(age - 2) * sicdegp
I(age - 2) * med	0.43	(0.98)	.665
I(age - 2) * high	3.31	(1.08)	.002**
sicdegp * I((age - 2)^2)
med * I((age - 2)^2)	0.00	(0.11)	.999
high * I((age - 2)^2)	0.14	(0.12)	.253
childid.var__(Intercept)	1.26
childid.cov__(Intercept).I(age - 2)	-0.17
childid.cov__(Intercept).I((age - 2)^2)	0.41
childid.var__I(age - 2)	14.03
childid.cov__I(age - 2).I((age - 2)^2)	-0.61
childid.var__I((age - 2)^2)	0.16
Residual.var__Observation	37.21
Note. Est = estimated regresssion coefficient for fixed effects and varaince/covariance estimates for random effects. p = significance from Wald t-test for parameter estimate using Satterthwaite's method for degrees of freedom.
* p < .05. p < .01. * p < .001.

11.3.3 Plot of the Estimated Marginal Means

Note: the \(x-axis\) is the age variable back on its original scale

interactions::interact_plot(model = fit_lmer_1_re, # model name
                            pred = age,            # x-axis variable (must be continuous)
                            modx = sicdegp,        # seperate lines
                            interval = TRUE) +     # shaded bands
  scale_x_continuous(breaks = c(2, 3, 5, 9, 13)) +
  theme_bw()

Figure 11.1
Model 1: Loaded Means Structure

11.4 Model 2A: Drop Random Intercepts

Note: There seems to be relatively little variation in baseline measurements of VSAE across individuals in the same SICD group, so the variation at age 2 can be attributed to random error, rather than between-subject variation.

This indicates we may want to try removing the random intercepts, while retaining the same fixed- and other random-effects.

This new model implies that children have common initial VSAE value at age 2, given their SICD group.

11.4.1 Fit the Model

fit_lmer_2a_re <- lmerTest::lmer(vsae ~ I(age-2)*sicdegp + I((age-2)^2)*sicdegp + 
                                   (0 + I(age-2) + I((age-2)^2) | childid), 
                                 data = data_long,
                                 REML = TRUE)

11.4.2 Assess the Signifcance

anova(fit_lmer_1_re, fit_lmer_2a_re, refit = FALSE)

Data: data_long
Models:
fit_lmer_2a_re: vsae ~ I(age - 2) * sicdegp + I((age - 2)^2) * sicdegp + (0 + I(age - 2) + I((age - 2)^2) | childid)
fit_lmer_1_re: vsae ~ I(age - 2) * sicdegp + I((age - 2)^2) * sicdegp + (I(age - 2) + I((age - 2)^2) | childid)
               npar    AIC    BIC  logLik -2*log(L)  Chisq Df Pr(>Chisq)  
fit_lmer_2a_re   13 4587.0 4644.2 -2280.5    4561.0                       
fit_lmer_1_re    16 4586.5 4657.0 -2277.2    4554.5 6.4635  3    0.09111 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The more complicated model (including random intercepts) does NOT fit better, thus the random intercepts may be removed from the model. Model 2a is better than Model 1

11.5 Model 2B: Drop Random Quadratic Slope

We should formally test the necessity of quadratic age random-effect.

Comparison of nested models with REML-based LRT using a 50:50 mixture χ2-distribution with 1 and 2 df Difference of 2 covariance parameters

11.5.1 Fit the Model

fit_lmer_2b_re <- lmerTest::lmer(vsae ~ I(age-2)*sicdegp + I((age-2)^2)*sicdegp + 
                                   (0 + I(age-2)  | childid), 
                                 REML = TRUE, 
                                 data = data_long)

11.5.2 Assess the Signifcance

anova(fit_lmer_2a_re, fit_lmer_2b_re, refit = FALSE)

Data: data_long
Models:
fit_lmer_2b_re: vsae ~ I(age - 2) * sicdegp + I((age - 2)^2) * sicdegp + (0 + I(age - 2) | childid)
fit_lmer_2a_re: vsae ~ I(age - 2) * sicdegp + I((age - 2)^2) * sicdegp + (0 + I(age - 2) + I((age - 2)^2) | childid)
               npar    AIC    BIC  logLik -2*log(L) Chisq Df Pr(>Chisq)    
fit_lmer_2b_re   11 4669.3 4717.7 -2323.7    4647.3                        
fit_lmer_2a_re   13 4587.0 4644.2 -2280.5    4561.0 86.34  2  < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The more complicated model (including random intercepts) DOES fit better, thus the random slopes for both the linear AND the quadratic effect of age should be retained in the model. Model 2a is better than model 2b

11.6 Model 3: Drop Quadratic Time Fixed Effect

Fit the previous ‘best’ model via ML, not REML to compare nested model that differe in terms of fixed effects only

11.6.1 Fit the Models

fit_lmer_2a_ml <- lmerTest::lmer(vsae ~ I(age-2)*sicdegp + I((age-2)^2)*sicdegp + 
                                   (0 + I(age-2) + I((age-2)^2) | childid),  
                                 data = data_long, 
                                 REML = FALSE)

fit_lmer_3_ml <- lmerTest::lmer(vsae ~ I(age-2)*sicdegp  + 
                                  (0 + I(age-2) + I((age-2)^2) | childid), 
                                data = data_long, 
                                REML = FALSE)

11.6.2 Assess the Signifcance

anova(fit_lmer_2a_ml, fit_lmer_3_ml)

Data: data_long
Models:
fit_lmer_3_ml: vsae ~ I(age - 2) * sicdegp + (0 + I(age - 2) + I((age - 2)^2) | childid)
fit_lmer_2a_ml: vsae ~ I(age - 2) * sicdegp + I((age - 2)^2) * sicdegp + (0 + I(age - 2) + I((age - 2)^2) | childid)
               npar    AIC    BIC  logLik -2*log(L)  Chisq Df Pr(>Chisq)  
fit_lmer_3_ml    10 4584.4 4628.5 -2282.2    4564.4                       
fit_lmer_2a_ml   13 4582.1 4639.3 -2278.0    4556.1 8.3704  3    0.03895 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The more complicated model (including fixed interaction between quadratic time and SICD group) DOES fit better, thus the higher level interaction should be retained in the model. Model 2a is better than model 3.

11.7 Final Model

11.7.1 Table of Parameter Esitmates

apaSupp::tab_lmer(fit_lmer_2a_re,
                  caption            = "MLM: Final Model")

**Table 11.10**
MLM: Final Model
	Est	(SE)	p
(Intercept)	8.41	(0.74)	< .001***
I(age - 2)	2.27	(0.74)	.002**
sicdegp
low	—	—
med	1.26	(0.97)	.195
high	5.36	(1.09)	< .001***
I((age - 2)^2)	0.07	(0.08)	.363
I(age - 2) * sicdegp
I(age - 2) * med	0.43	(0.98)	.662
I(age - 2) * high	3.33	(1.08)	.002**
sicdegp * I((age - 2)^2)
med * I((age - 2)^2)	0.00	(0.10)	.995
high * I((age - 2)^2)	0.13	(0.11)	.243
childid.var__I(age - 2)	13.99
childid.cov__I(age - 2).I((age - 2)^2)	-0.44
childid.var__I((age - 2)^2)	0.13
Residual.var__Observation	37.99
Note. Est = estimated regresssion coefficient for fixed effects and varaince/covariance estimates for random effects. p = significance from Wald t-test for parameter estimate using Satterthwaite's method for degrees of freedom.
* p < .05. p < .01. * p < .001.

11.7.2 Interpretation of Fixed Effects

11.7.2.1 Reference Group: `low` SICD group

\(\gamma_{0}\) = 8.408 is the estimated marginal mean VSAE score for children in the low SICD, at 2 years of age
\(\gamma_{a}\) = 2.269 and \(\gamma_{a^2}\) = 0.072 are the fixed effects for age and age-squared on VSAE for children in the low SICD group (change over time)

Thus the equation for the estimated marginal mean VASE trajectory for the low SICD group is:

\[ \begin{align*} VASE =& \gamma_{0} + \gamma_{a} (AGE - 2) + \gamma_{a^2} (AGE - 2)^2 \\ =& 8.408 + 2.269 (AGE - 2) + 0.072 (AGE - 2)^2 \\ \end{align*} \]

11.7.2.2 First Comparison Group: `medium` SICD group

\(\gamma_{med}\) = 1.264 is the DIFFERENCE in the estimated marginal mean VSAE score for children in the medium vs. the low SICD, at 2 years of age
\(\gamma_{med:\;a}\) = 0.429 and \(\gamma_{med:\;a^2}\) = 0.001 are the DIFFERENCE in the fixed effects for age and age-squared on VSAE for children in the medium vs. the low SICD group

Thus the equation for the estimated marginal mean VASE trajectory for the medium SICD group is:

\[ \begin{align*} VASE =& (\gamma_{0} + \gamma_{med}) + (\gamma_{a} + \gamma_{med:\;a}) (AGE - 2) + (\gamma_{a^2} + \gamma_{med:\;a^2})(AGE - 2)^2 \\ =& (8.408 + 1.264) + (2.269 + 0.429) (AGE - 2) + (0.072 + 0.001)(AGE - 2)^2 \\ =& 9.673 + 2.698 (AGE - 2) + 0.073 (AGE - 2)^2 \\ \end{align*} \]

11.7.2.3 Second Comparison Group: `high` SICD group

\(\gamma_{hi}\) = 5.365 is the DIFFERENCE in the estimated marginal mean VSAE score for children in the high vs. the low SICD, at 2 years of age
\(\gamma_{hi:\;a}\) = 3.326 and \(\gamma_{hi:\;a^2}\) = 0.133 are the DIFFERENCE in the fixed effects for age and age-squared on VSAE for children in the high vs. the low SICD group

Thus the equation for the estimated marginal mean VASE trajectory for the high SICD group is:

\[ \begin{align*} VASE =& (\gamma_{0} + \gamma_{hi}) + (\gamma_{a} + \gamma_{hi:\;a}) (AGE - 2) + (\gamma_{a^2} + \gamma_{hi:\;a^2})(AGE - 2)^2 \\ =& (8.408 + 5.365) + (2.269 + 3.326) (AGE - 2) + (0.072 + 0.133)(AGE - 2)^2 \\ =& 13.773 + 5.595 (AGE - 2) + 0.206 (AGE - 2)^2 \\ \end{align*} \]

11.7.3 Interpretation of Random Effects

lme4::VarCorr(fit_lmer_2a_re)%>% 
  print(comp = c("Variance", "Std.Dev"),
        digits = 3)

 Groups   Name           Variance Std.Dev. Cov  
 childid  I(age - 2)     13.991   3.741         
          I((age - 2)^2)  0.134   0.366    -0.44
 Residual                37.987   6.163

Here a group of observations = a CHILD

11.7.3.1 Residual Varaince

Within-child-variance

\(e_{ti}\) the residuals associated with observation at time \(t\) on child \(i\)

\[ \sigma^2 = \sigma^2_e = VAR[e_{ti}] = 37.987 \]

11.7.3.2 2 Variance Components

Between-children slope variances

Random LINEAR effect of age variance

\(u_{1i}\) the DIFFERENCE between child \(i\)’s specific linear component for age and the fixed linear component for age, given their SICD group

\[ \tau_{11} = \sigma^2_{u1} = VAR[u_{1i}] = 13.99 \]

Random QUADRATIC effect of age variance

\(u_{2i}\) the DIFFERENCE between child \(i\)’s specific quadratic component for age and the fixed quadratic component for age, given their SICD group

\[ \tau_{22} = \sigma^2_{u2} = VAR[u_{2i}] = 0.13 \]

11.7.3.3 1 Covariance (or correlation) Components

Slope-slope covariance

Random LINEAR and Quadratic effect of age covariance:

\[ \tau_{12} = \sigma^2_{u12} = COV[u_{1i}, u_{2i}] = -0.44 \] \[ Correlation(u_{1i}, u_{2i}) = -0.324 \]

11.7.4 Assumption Checking

The residuals are:

Assumed to be normally, independently, and identically distributed (conditional on other random-effects)
Assumed independent of random-effects

\[ e_{ti} \sim N(0, \sigma^2) \]

11.7.4.1 The `ggResidpanel` package

ggResidpanel::resid_panel(fit_lmer_2a_re)  # default = pearson residuals

11.7.4.2 Manually with `HLMdiag` and `ggplot2`

fit_lmer_2a_re %>% 
  HLMdiag::hlm_augment(level = 1,
                       include.ls = FALSE,
                       standardized = TRUE) %>%  
  ggplot(aes(x = .resid)) +
  geom_histogram(bins = 40,
                 color = "gray20",
                 alpha = .2) +
  theme_bw() +
  labs(main = "Histogram")

fit_lmer_2a_re %>% 
  ranef() %>% 
  data.frame() %>% 
  ggplot(aes(sample = condval)) +
  geom_qq() +
  geom_qq_line() +
  facet_wrap(~ term, scale = "free_y")+
  theme_bw() +
  labs(main = "Random Slopes")

fit_lmer_2a_re %>% 
  HLMdiag::hlm_augment(level = 1,
                       include.ls = FALSE,
                       standardized = TRUE) %>% 
  ggplot(aes(x = .fitted,
             y = .resid)) +
  geom_hline(yintercept = 0, color = "red") +
  geom_point() +
  geom_smooth() +
  theme_bw() +
  labs(main = "Residual Plot")

fit_lmer_2a_re %>% 
  HLMdiag::hlm_augment(level = 1,
                       include.ls = FALSE,
                       standardized = TRUE) %>% 
  ggplot(aes(sample = .resid)) +
  geom_qq()+
  stat_qq_line() +
  theme_bw()

11.7.4.3 The `sjPlot` package

sjPlot::plot_model(fit_lmer_2a_re, 
                   type = "diag")

[[1]]


[[2]]
[[2]]$childid



[[3]]


[[4]]

11.7.5 Plot of the Estimated Marginal Means

11.7.5.1 Quick and Default

Note: the \(x-axis\) is the age variable, back on its original scale

interactions::interact_plot(model = fit_lmer_2a_re, # model name
                            pred = age,            # x-axis variable (must be continuous)
                            modx = sicdegp,        # seperate lines
                            interval = TRUE) +     # shaded bands
  scale_x_continuous(breaks = c(2, 3, 5, 9, 13))

Figure 11.2
Final Model (2a)

interactions::interact_plot(model = fit_lmer_2a_re, # model name
                            pred = age,             # x-axis variable (must be continuous)
                            modx = sicdegp,         # separate lines
                            interval = TRUE,
                            x.label = "Age of Child, in years",
                            y.label = "Estimated Marginal Mean\nVineland Socialization Age Equivalent",
                            legend.main = "Initial Communication (SICD)",
                            modx.labels = c("Low", "Medium", "High"),
                            colors = c("black", "black", "black")) + 
  scale_x_continuous(breaks = c(2, 3, 5, 9, 13)) +
  scale_y_continuous(breaks = seq(from = 0, to = 120, by = 20)) +
  theme_bw() +
  theme(legend.position = c(0, 1),
        legend.justification = c(-0.1, 1.1),
        legend.background = element_rect(color = "black"),
        legend.key.width = unit(2, "cm"))

11.7.5.2 More Customized - Color

This version would look better on a poster or in a slide presentation.

effects::Effect(focal.predictors = c("age","sicdegp"),  # variables involved in interactions
                mod = fit_lmer_2a_re,
                xlevels = list(age2 = seq(from = 2, to = 13, by = .1))) %>%   # add more values to smooth out the prediction lines and ribbons
  data.frame() %>% 
  dplyr::mutate(sicdegp = factor(sicdegp,
                                 levels = c("high", "med", "low"),
                                 labels = c("High", "Medium", "Low"))) %>% 
  ggplot(aes(x = age,
             y = fit,
             group = sicdegp)) +
  geom_ribbon(aes(ymin = lower,        # 95% Confidence Intervals
                  ymax = upper,
                  fill = sicdegp),
              alpha = .3) +
  geom_line(aes(linetype = sicdegp,
                color = sicdegp),
            size = 1) +
  scale_x_continuous(breaks = c(2, 3, 5, 9, 13)) +                  # mark values that were actually measured
  scale_y_continuous(breaks = seq(from = 0, to = 120, by = 20)) +
  scale_linetype_manual(values = c("solid", "longdash", "dotted")) +
  theme_bw() +
  theme(legend.position = c(0, 1),
        legend.justification = c(-0.1, 1.1),
        legend.background = element_rect(color = "black"),
        legend.key.width = unit(2.5, "cm")) +
  labs(x = "Age of Child, in years",
       y = "Estimated Marginal Mean\nVineland Socialization Age Equivalent",
       linetype = "Initial Communication (SICD)",
       fill     = "Initial Communication (SICD)",
       color    = "Initial Communication (SICD)")

11.7.5.3 More Customized - Black and White

This version would be better for a publication.

effects::Effect(focal.predictors = c("age","sicdegp"),
                mod = fit_lmer_2a_re,
                xlevels = list(age2 = seq(from = 2, to = 13, by = .05))) %>% 
  data.frame() %>% 
  dplyr::mutate(sicdegp = factor(sicdegp,
                                 levels = c("high", "med", "low"),
                                 labels = c("High", "Medium", "Low"))) %>% 
  ggplot(aes(x = age,
             y = fit,
             group = sicdegp)) +
  geom_ribbon(aes(ymin = lower,
                  ymax = upper,
                  fill = sicdegp),
              alpha = .4) +
  geom_line(aes(linetype = sicdegp),
            size = .7) +
  scale_x_continuous(breaks = c(2, 3, 5, 9, 13)) +
  scale_y_continuous(breaks = seq(from = 0, to = 120, by = 20)) +
  scale_linetype_manual(values = c("solid", "longdash", "dotted")) +
  scale_fill_manual(values = c("gray10", "gray40", "gray60")) +
  theme_bw() +
  theme(legend.position = c(0, 1),
        legend.justification = c(-0.1, 1.1),
        legend.background = element_rect(color = "black"),
        legend.key.width = unit(2, "cm")) +
  labs(x = "Age of Child, in years",
       y = "Estimated Marginal Mean\nVineland Socialization Age Equivalent",
       linetype = "Initial Communication (SICD)",
       fill     = "Initial Communication (SICD)",
       color    = "Initial Communication (SICD)")

11.7.6 Post Hoc Compairisons

fit_lmer_2a_re %>% 
  emmeans::emmeans(pairwise ~ sicdegp,
                   at = list(age = 13))

$emmeans
 sicdegp emmean   SE  df lower.CL upper.CL
 low       42.1 7.97 135     26.3     57.9
 med       48.2 6.90 131     34.5     61.8
 high     100.2 8.45 131     83.5    116.9

Degrees-of-freedom method: kenward-roger 
Confidence level used: 0.95 

$contrasts
 contrast   estimate   SE  df t.ratio p.value
 low - med     -6.07 10.5 133  -0.575  0.8335
 low - high   -58.10 11.6 133  -5.002  <.0001
 med - high   -52.03 10.9 131  -4.772  <.0001

Degrees-of-freedom method: kenward-roger 
P value adjustment: tukey method for comparing a family of 3 estimates

totalSD <- apaSupp::lmer_sd(fit_lmer_2a_re)

totalSD

[1] 7.188084

fit_lmer_2a_re %>% 
  emmeans::emmeans(~ sicdegp,
                   at = list(age = 13))

 sicdegp emmean   SE  df lower.CL upper.CL
 low       42.1 7.97 135     26.3     57.9
 med       48.2 6.90 131     34.5     61.8
 high     100.2 8.45 131     83.5    116.9

Degrees-of-freedom method: kenward-roger 
Confidence level used: 0.95

fit_lmer_2a_re %>% 
  emmeans::emmeans(~ sicdegp,
                   at = list(age = 13)) %>% 
  pairs()

 contrast   estimate   SE  df t.ratio p.value
 low - med     -6.07 10.5 133  -0.575  0.8335
 low - high   -58.10 11.6 133  -5.002  <.0001
 med - high   -52.03 10.9 131  -4.772  <.0001

Degrees-of-freedom method: kenward-roger 
P value adjustment: tukey method for comparing a family of 3 estimates

fit_lmer_2a_re %>% 
  emmeans::emmeans(~ sicdegp,
                   at = list(age = 13)) %>% 
  pairs() %>% 
  data.frame() %>% 
  dplyr::mutate(SE = glue::glue("({apaSupp::apan(SE, 2)})")) %>% 
  dplyr::mutate(SMD = estimate/apaSupp::lmer_sd(fit_lmer_2a_re)) %>% 
  dplyr::mutate(p_Unadj = apaSupp::p_num(p.value)) %>% 
  dplyr::mutate(p_Adj = p.adjust(p.value, method = "fdr")%>% 
                  apaSupp::p_num()) %>% 
  dplyr::select("Compare" = contrast,
                "Difference in EMM_Est" = estimate,
                "Difference in EMM_(SE)" = SE,
                SMD, p_Unadj, p_Adj) %>% 
  flextable::flextable() %>% 
  flextable::separate_header() %>% 
  apaSupp::theme_apa(caption = "Pairwise Comparisons of Estimated Marginal Means",
                     general_note = "SMD = Standardized mean difference, a Cohen's d like effect size. Significance is provided as unadjusted (Unadj) and adjusted (Adj) using the false discovery rate method.") %>% 
  flextable::hline(part = "header", i = 1)

**Table 11.11**
Pairwise Comparisons of Estimated Marginal Means
Compare	Difference in EMM		SMD	p
Compare	Est	(SE)	SMD	Unadj	Adj
low - med	-6.07	(10.54)	-0.84	.833	.833
low - high	-58.10	(11.62)	-8.08	< .001***	< .001***
med - high	-52.03	(10.90)	-7.24	< .001***	< .001***
Note. SMD = Standardized mean difference, a Cohen's d like effect size. Significance is provided as unadjusted (Unadj) and adjusted (Adj) using the false discovery rate method.

11.7.7 Blups vs. Fixed Effects

BLUP = Best Linear Unbiased Predictor

A BLUP is the specific prediction for an individual supject, showin by black lines below. This includes the fixed effects as well as the specific random effects for a given individual.

Comparatively, the blue lines below display the predictions for fixed effects only.

data_long %>% 
  dplyr::mutate(sicdegp = fct_recode(sicdegp,
                                     "Low Communication" = "low",
                                     "Medium Communication" = "med",
                                     "High Communication" = "high")) %>% 
  dplyr::mutate(pred_fixed = predict(fit_lmer_2a_re, re.form = NA)) %>% # fixed effects only
  dplyr::mutate(pred_wrand = predict(fit_lmer_2a_re)) %>%               # fixed and random effects together
  ggplot(aes(x = age,
             y = vsae)) +
  geom_line(aes(y = pred_wrand,           # BLUP = fixed and random effects together
                group = childid,
                color = "BLUP",
                size  = "BLUP")) +
  geom_line(aes(y = pred_fixed,           # fixed effects only
                group = sicdegp,
                color = "Fixed",
                size  = "Fixed")) +
  scale_color_manual(name = "Model: ",
                     values = c("BLUP"  = "black",
                                "Fixed" = "blue")) +
  scale_size_manual(name = "Model: ",
                    values = c("BLUP"  = .5,
                               "Fixed" = 1.5)) +
  facet_grid(. ~ sicdegp) +
  theme_bw() +
  scale_x_continuous(breaks = c(2, 3, 5, 9, 13)) +
  labs(x = "Age, in years",
       y = "Estimated Marginal Mean\nVineland Socialization Age Equivalent") +
  theme(legend.position = "bottom",
        legend.key.width = unit(1.5, "cm"))

data_long %>% 
  dplyr::mutate(pred_fixed = predict(fit_lmer_2a_re, re.form = NA)) %>% 
  dplyr::mutate(pred_wrand = predict(fit_lmer_2a_re)) %>% 
  dplyr::filter(childid %in% sample(levels(data_long$childid), 25)) %>%  # 25 randomly sampled children
  ggplot(aes(x = age,
             y = vsae)) +
  geom_point(aes(color = sicdegp),
             size = 3) +
  geom_line(aes(y = pred_wrand,
                linetype = "BLUP",
                size     = "BLUP"),
            color = "black") +
  geom_line(aes(y = pred_fixed,
                color = sicdegp,
                linetype = "Fixed",
                size     = "Fixed")) +
  scale_linetype_manual(name = "Model: ",
                        values = c("BLUP"  = "longdash",
                                   "Fixed" = "solid")) +
  scale_size_manual(name = "Model: ",
                    values = c("BLUP"  = .5,
                               "Fixed" = 1)) +
  facet_wrap(. ~ childid, labeller = "label_both") +
  theme_bw() +
  scale_x_continuous(breaks = c(2, 3, 5, 9, 13)) +
  theme(legend.position = "bottom",
        legend.key.width = unit(1.5, "cm")) +
  labs(x = "Age, in years",
       y = "Estimated Marginal Mean\nVineland Socialization Age Equivalent",
       color = "Communication:")