10 Ex: Cancer Experiment

# install.packages("remotes")
# remotes::install_github("sarbearschwartz/apaSupp")
# remotes::install_github("ddsjoberg/gtsummary")

library(magrittr)       
library(tidyverse)   
library(broom)     
library(naniar)
library(corrplot)   
library(GGally)
library(gtsummary)
library(apaSupp)
library(performance)
library(interactions)
library(effects)
library(emmeans)
library(car)
library(ggResidpanel)
library(modelsummary)
library(ppcor)
library(jtools)
library(olsrr)
library(DescTools)
library(effectsize)
library(ggpubr)
emmeans::emm_options(opt.digits = FALSE)  # revert to optimal digits

10.1 PURPOSE

10.1.1 Research Question

For this example page: Does weight vary with age?

10.1.2 Data Description

The Cancer data set contains 4 repeated measure of oral condition for 25 cancer patients to see if an aloe treatment helps with oral condition (ulcers).

cancer_raw <- haven::read_spss("https://raw.githubusercontent.com/CEHS-research/eBook_ANOVA/master/data/Cancer.sav")
tibble::glimpse(cancer_raw)
Rows: 25
Columns: 9
$ ID       <dbl> 1, 5, 6, 9, 11, 15, 21, 26, 31, 35, 39, 41, 45, 2, 12, 14, 16…
$ TRT      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1…
$ AGE      <dbl> 52, 77, 60, 61, 59, 69, 67, 56, 61, 51, 46, 65, 67, 46, 56, 4…
$ WEIGHIN  <dbl> 124.0, 160.0, 136.5, 179.6, 175.8, 167.6, 186.0, 158.0, 212.8…
$ STAGE    <dbl> 2, 1, 4, 1, 2, 1, 1, 3, 1, 1, 4, 1, 1, 2, 4, 1, 2, 1, 4, 2, 1…
$ TOTALCIN <dbl> 6, 9, 7, 6, 6, 6, 6, 6, 6, 6, 7, 6, 8, 7, 6, 4, 6, 6, 12, 5, …
$ TOTALCW2 <dbl> 6, 6, 9, 7, 7, 6, 11, 11, 9, 4, 8, 6, 8, 16, 10, 6, 11, 7, 11…
$ TOTALCW4 <dbl> 6, 10, 17, 9, 16, 6, 11, 15, 6, 8, 11, 9, 9, 9, 11, 8, 11, 6,…
$ TOTALCW6 <dbl> 7, 9, 19, 3, 13, 11, 10, 15, 8, 7, 11, 6, 10, 10, 9, 7, 14, 6…
cancer_clean <- cancer_raw %>% 
  dplyr::rename_all(tolower) %>% 
  dplyr::mutate(id = factor(id)) %>% 
  dplyr::mutate(trt = factor(trt,
                             labels = c("Placebo", 
                                        "Aloe Juice"))) %>% 
  dplyr::mutate(stage = factor(stage))

tibble::glimpse(cancer_clean)
Rows: 25
Columns: 9
$ id       <fct> 1, 5, 6, 9, 11, 15, 21, 26, 31, 35, 39, 41, 45, 2, 12, 14, 16…
$ trt      <fct> Placebo, Placebo, Placebo, Placebo, Placebo, Placebo, Placebo…
$ age      <dbl> 52, 77, 60, 61, 59, 69, 67, 56, 61, 51, 46, 65, 67, 46, 56, 4…
$ weighin  <dbl> 124.0, 160.0, 136.5, 179.6, 175.8, 167.6, 186.0, 158.0, 212.8…
$ stage    <fct> 2, 1, 4, 1, 2, 1, 1, 3, 1, 1, 4, 1, 1, 2, 4, 1, 2, 1, 4, 2, 1…
$ totalcin <dbl> 6, 9, 7, 6, 6, 6, 6, 6, 6, 6, 7, 6, 8, 7, 6, 4, 6, 6, 12, 5, …
$ totalcw2 <dbl> 6, 6, 9, 7, 7, 6, 11, 11, 9, 4, 8, 6, 8, 16, 10, 6, 11, 7, 11…
$ totalcw4 <dbl> 6, 10, 17, 9, 16, 6, 11, 15, 6, 8, 11, 9, 9, 9, 11, 8, 11, 6,…
$ totalcw6 <dbl> 7, 9, 19, 3, 13, 11, 10, 15, 8, 7, 11, 6, 10, 10, 9, 7, 14, 6…
psych::headTail(cancer_clean) %>% 
  flextable::flextable() %>% 
  apaSupp::theme_apa(caption = "Sample of Cancer Dataset")
Table 10.1
Sample of Cancer Dataset

id

trt

age

weighin

stage

totalcin

totalcw2

totalcw4

totalcw6

1

Placebo

52

124

2

6

6

6

7

5

Placebo

77

160

1

9

6

10

9

6

Placebo

60

136.5

4

7

9

17

19

9

Placebo

61

179.6

1

6

7

9

3

...

...

...

...

...

...

42

Aloe Juice

73

181.5

0

8

11

16

44

Aloe Juice

67

187

1

5

7

7

7

50

Aloe Juice

60

164

2

6

8

16

58

Aloe Juice

54

172.8

4

7

8

10

8

10.2 EXPLORATORY DATA ANALYSIS

Before embarking on any inferencial anlaysis or modeling, always get familiar with your variables one at a time (univariate), as well as pairwise (bivariate).

10.2.1 Summary Statistics

10.2.1.1 Univariate

Center: mean and median Spread: standard deviation, range (max - min), interquartile range (Q3 - Q1)

cancer_clean %>% 
  dplyr::select("Age" = age,
                "Weight" = weighin) %>% 
  apaSupp::tab_desc(caption = "Description of Participants")
Table 10.2
Description of Participants

Variable

NA

M

SD

min

Q1

Mdn

Q3

max

Age

0

59.64

12.93

27.00

52.00

60.00

67.00

86.00

Weight

0

178.28

31.98

124.00

160.00

172.80

187.00

261.40

Note. NA = not available or missing. Mdn = median. Q1 = 25th percentile, Q3 = 75th percentile. N = 25.

10.2.1.2 Bivariate

cancer_clean %>% 
  cor.test(~ age + weighin,
           data = .)

    Pearson's product-moment correlation

data:  age and weighin
t = -1.4401, df = 23, p-value = 0.1633
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.6130537  0.1213316
sample estimates:
       cor 
-0.2875868 

10.2.2 Visualize Distributions

10.2.2.1 Univariate

cancer_clean %>% 
  apaSupp::spicy_histos(var = age)

cancer_clean %>% 
  apaSupp::spicy_histos(var = weighin)

10.2.2.2 Bivariate

cancer_clean %>% 
  ggplot(aes(x = age,
             y = weighin)) +
  geom_point() +
  geom_smooth(aes(color = "Linear"), 
              method = "lm", 
              formula = y ~ x,
              se = TRUE) +  
  geom_smooth(aes(color = "Loess"),  
              method = "loess", 
              se = FALSE) +     
  theme_bw() +
  labs(x = "Age, years",
       y = "Weight, pounds",
       color = "Smoother") +
  theme(legend.position = c(1, 1),
        legend.justification = c(1.1, 1.1),
        legend.background = element_rect(color = "black"))

10.3 REGRESSION ANALYSIS

The lm() function must be supplied with at least two options:

  • a formula: Y ~ X
  • a dataset: data = XXXXXXX

When a model is fit and directly saved as a named object via the assignment opperator (<-), no output is produced.

  • The dependent variable (DV) is weight at intake (\(Y\))
  • The independent variable (IV) is age at intake (\(X\))

10.3.1 Fit Model

fit_lm <- lm(weighin ~ age,
             data = cancer_clean)

10.3.2 Parameter Estimates

summary(fit_lm)

Call:
lm(formula = weighin ~ age, data = cancer_clean)

Residuals:
    Min      1Q  Median      3Q     Max 
-59.713 -19.535  -2.935  12.954  71.998 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 220.6899    30.1076    7.33 1.86e-07 ***
age          -0.7111     0.4938   -1.44    0.163    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 31.28 on 23 degrees of freedom
Multiple R-squared:  0.08271,   Adjusted R-squared:  0.04282 
F-statistic: 2.074 on 1 and 23 DF,  p-value: 0.1633

10.3.3 Prediction Equation

coef(fit_lm)
(Intercept)         age 
220.6899336  -0.7110988 

\[ \hat{weight} = 220.69 - 0.71 \times (age) \]

10.3.4 Table

apaSupp::tab_lm(fit_lm)
Table 10.3
Regression Model

Variable

b

(SE)

p

b*

𝜂²

𝜂ₚ²

(Intercept)

220.69

(30.11)

< .001 ***

age

-0.71

(0.49)

.163

-0.29

.083

.083

0.08

Adjusted R²

0.04

Note. b* = standardized estimate. 𝜂² = semi-partial correlation. 𝜂ₚ² = partial correlation.

* p < .05. ** p < .01. *** p < .001.

10.3.5 Estimated Marginal Means

fit_lm %>% 
  emmeans::emmeans(~ age)
   age emmean       SE df lower.CL upper.CL
 59.64 178.28 6.256869 23 165.3367 191.2233

Confidence level used: 0.95 
fit_lm %>% 
  emmeans::emmeans(~ age,
                   at = list(age = c(30, 60, 90))) 
 age  emmean        SE df lower.CL upper.CL
  30 199.357 15.917537 23 166.4290 232.2849
  60 178.024  6.259394 23 165.0755 190.9726
  90 156.691 16.245055 23 123.0856 190.2965

Confidence level used: 0.95 
fit_lm %>% 
  emmeans::emmeans(~ age,
                   at = list(age = seq(from = 30, to = 90, by = 10))) %>% 
  data.frame() %>% 
  dplyr::mutate(ci = glue::glue("[{apaSupp::apa2(lower.CL)}, {apaSupp::apa2(upper.CL)}]")) %>% 
  dplyr::select("Age" = age,
                "Est. Marginal Mean" = emmean,
                SE,
                "95% CI" = ci) %>% 
  flextable::flextable() %>% 
  apaSupp::theme_apa(caption = "Estimated Marginal Means for Weight Regression on Age") %>% 
  flextable::colformat_double(j = 1, digits = 0) %>% 
  flextable::align(align = "center")
Table 10.4
Estimated Marginal Means for Weight Regression on Age

Age

Est. Marginal Mean

SE

95% CI

30

199.36

15.92

[166.43, 232.28]

40

192.25

11.54

[168.37, 216.12]

50

185.13

7.86

[168.87, 201.40]

60

178.02

6.26

[165.08, 190.97]

70

170.91

8.08

[154.19, 187.63]

80

163.80

11.84

[139.31, 188.30]

90

156.69

16.25

[123.09, 190.30]

fit_lm %>% 
  effects::Effect(focal.predictors = c("age"))

 age effect
age
     30      40      60      70      90 
199.357 192.246 178.024 170.913 156.691 
fit_lm %>% 
  effects::Effect(focal.predictors = c("age"),
                  xlevels = list(age = seq(from = 30, to = 90, by = 1))) %>% 
  data.frame() 
# A tibble: 61 × 5
     age   fit    se lower upper
   <dbl> <dbl> <dbl> <dbl> <dbl>
 1    30  199.  15.9  166.  232.
 2    31  199.  15.5  167.  231.
 3    32  198.  15.0  167.  229.
 4    33  197.  14.6  167.  227.
 5    34  197.  14.1  167.  226.
 6    35  196.  13.7  167.  224.
 7    36  195.  13.2  168.  222.
 8    37  194.  12.8  168.  221.
 9    38  194.  12.4  168.  219.
10    39  193.  12.0  168.  218.
# ℹ 51 more rows
fit_lm %>% 
  effects::Effect(focal.predictors = c("age"),
                  xlevels = list(age = seq(from = 30, to = 90, by = 1))) %>% 
  data.frame() %>% 
  ggplot(aes(x = age,
             y = fit)) +
  geom_ribbon(aes(ymin = fit - se,
                  ymax = fit + se),
              alpha = .2) +
  geom_line() +
  theme_bw() +
  labs(x = "Age, years",
       y = "Estimated Marginal Mean\nWeight, pounds")

Figure 10.1
Weight Regressed on Age

Weight Regressed on Age