4 D&H Ch3b - Multiple Regression: “Sport Skill”

Darlington & Hayes, Chapter 3’s second example

# install.packages("remotes")
# remotes::install_github("sarbearschwartz/apaSupp")
# remotes::install_github("ddsjoberg/gtsummary")

library(magrittr)       
library(tidyverse)   
library(broom)     
library(naniar)
library(corrplot)   
library(GGally)
library(gtsummary)
library(apaSupp)
library(performance)
library(interactions)
library(effects)
library(emmeans)
library(car)
library(ggResidpanel)
library(modelsummary)
library(ppcor)
library(jtools)
library(olsrr)
library(DescTools)
library(effectsize)
library(ggpubr)

4.1 PURPOSE

4.1.1 Research Question

Are sports skills related to preference?

4.1.2 Data Description

4.1.2.1 Variables

Independent Variables (IV, \(X\)) * soft skill at softball * basket skill at basketball

Dependent Variable (DV, \(Y\)) * pref On a scale from 1 to 9, which sport do you prefer?” + 1 = much prefer softball + 5 = no preference + 9 = much prefer basketball

df_sport <- tibble::tribble(~id, ~soft, ~basket, ~pref,
                             1,   4, 17, 1,
                             2,  56, 60, 3,
                             3,  25,  3, 8,
                             4,  50, 52, 4,
                             5,   5, 16, 2,
                             6,  72, 84, 2,
                             7, 100, 95, 5,
                             8,  39, 20, 8,
                             9,  81, 75, 5,
                            10,  61, 47, 7)
df_sport %>% 
  dplyr::select("ID" = id,
                "Softball Skill" = soft,
                "Basketball Skill" = basket,
                "Preference Score" = pref) %>% 
  flextable::flextable() %>% 
  apaSupp::theme_apa(caption = "D&H TAble 3.5 (pg 78) Skill at Softball, Basketball, and Preference") %>% 
  flextable::colformat_double(digits = 0)
``` ```{=html}
(\#tab:unnamed-chunk-154)D&H TAble 3.5 (pg 78) Skill at Softball, Basketball, and Preference

ID

Softball Skill

Basketball Skill

Preference Score

1

4

17

1

2

56

60

3

3

25

3

8

4

50

52

4

5

5

16

2

6

72

84

2

7

100

95

5

8

39

20

8

9

81

75

5

10

61

47

7

4.2 EXPLORATORY DATA ANALYSIS

4.2.1 Univariate Statistics

Center: mean and median Spread: standard deviation, range (max - min), interquartile range (Q3 - Q1)

df_sport %>% 
  dplyr::select("Softball Skill" = soft,
                "Basketball Skill" = basket,
                "Preference Score" = pref) %>%
  apaSupp::tab_desc(caption = "Summary Statistics")
``` ```{=html}
(\#tab:unnamed-chunk-155)Summary Statistics

Measure

NA

M

SD

min

Q1

Mdn

Q3

max

Softball Skill

0

49.30

31.59

4.00

28.50

53.00

69.25

100.00

Basketball Skill

0

46.90

31.93

3.00

17.75

49.50

71.25

95.00

Preference Score

0

4.50

2.55

1.00

2.25

4.50

6.50

8.00

Note. NA = not available or missing. Mdn = median. Q1 = 25th percentile, Q3 = 75th percentile. N = 10.

4.2.2 Univariate Visualizations

df_sport %>% 
  apaSupp::spicy_histos(var = soft)

df_sport %>% 
  apaSupp::spicy_histos(var = basket)

df_sport %>% 
  apaSupp::spicy_histos(var = pref)

4.2.3 Bivariate Statistics

df_sport %>% 
  dplyr::select("Softball Skill" = soft,
                "Basketball Skill" = basket,
                "Preference Score" = pref) %>%
  apaSupp::tab_cor(caption = "Unadjusted, Pairwise Correlations")
``` ```{=html}
(\#tab:unnamed-chunk-159)Unadjusted, Pairwise Correlations

Variables

r

p

Softball Skill

Basketball Skill

0.920

< .001

***

Softball Skill

Preference Score

0.210

.562

Basketball Skill

Preference Score

-0.190

.590

Note. r = Pearson's Product-Moment correlation coefficient. N = 10.

* p < .05. ** p < .01. *** p < .001.

4.2.4 Bivariate Visualization

df_sport %>% 
  dplyr::select("Softball Skill" = soft,
                "Basketball Skill" = basket,
                "Preference Score" = pref) %>%
  data.frame %>% 
  GGally::ggscatmat() +
  theme_bw()

df_sport %>% 
  ggplot(aes(x = soft,        
           y = pref)) +   
  geom_point() +                    
  geom_smooth(method = "lm",
              formula = y ~ x) +
  ggpubr::stat_regline_equation(label.x = 75,
                                label.y = 1,
                                size = 6) +
  theme_bw() +
  labs(x = "Softball Skill",
       y = "Preference")

df_sport %>% 
  ggplot(aes(x = basket,        
             y = pref)) +   
  geom_point() +                    
  geom_smooth(method = "lm",
              formula = y ~ x) +
  ggpubr::stat_regline_equation(label.x = 75,
                                label.y = 1,
                                size = 6) +
  theme_bw() +
  labs(x = "Basketball Skill",
       y = "Preference")

4.3 REGRESSION ANALYSIS

  • The dependent variable (DV) is preference (\(Y\))
  • The independent variable (IVs) are skill levels (\(X\))

4.3.1 Fit the Models

fit_lm_soft <- lm(pref ~ soft,
                   data = df_sport)  

fit_lm_basket <- lm(pref ~ basket,
                   data = df_sport)

fit_lm_both <- lm(pref ~ soft + basket,
                   data = df_sport)
tab_lm3 <- apaSupp::tab_lms(list(fit_lm_soft, fit_lm_basket, fit_lm_both),
                            mod_names = c("Softball", "Basketball", "Both"),
                            var_labels = c("soft" = "Softball Skill",
                                           "basket" = "Basketball Skill"),
                            caption = "Parameter Estimates for Sport Preference Regression on Skill Level in Softball and Basketball",
                            general_note = "Dependent variable is preference rating on a scale of 1 (much prefer softball) to  9 (much prefer basketball).  Both softball and Baksetball skill levels are on a scale of 0 to 100.",
                            d = 3) 

tab_lm3
``` ```{=html}
(\#tab:unnamed-chunk-164)Parameter Estimates for Sport Preference Regression on Skill Level in Softball and Basketball

Softball

Basketball

Both

Variable

b

(SE)

p

b

(SE)

p

b

(SE)

p

(Intercept)

3.669

(1.610)

.052

5.228

(1.546)

.010 **

3.899

(0.200)

< .001 ***

Softball Skill

0.017

(0.028)

.562

0.198

(0.009)

< .001 ***

Basketball Skill

-0.016

(0.028)

.590

-0.195

(0.009)

< .001 ***

0.044

0.038

0.987

Adjusted R²

-0.076

-0.082

0.983

Note. Dependent variable is preference rating on a scale of 1 (much prefer softball) to 9 (much prefer basketball). Both softball and Baksetball skill levels are on a scale of 0 to 100.

* p < .05. ** p < .01. *** p < .001.

We call a set of regressors complementary if \(R^2\) for the set exceeds the sum of the individual values of \(r^2_{YX}\) . Thus, complementarity and collinearity are opposites, though either can occur only when regressors in a set are intercorrelated.

4.3.2 Visualize

interactions::interact_plot(model = fit_lm_both,
                            pred = soft,
                            modx = basket,
                            modx.values = c(25, 50, 75),
                            legend.main = "Basketball Skill\n(0-100)",
                            interval = TRUE) +
  theme_bw() +
  labs(x = "Softball Skill, (0-100)",
       y = "Estimated Marginal Mean Preference\n1 = much prefer softball\n9 = much prefer basketball") +
  coord_cartesian(ylim = c(1, 9)) +
  scale_y_continuous(breaks = 1:9) +
  theme(legend.position = c(0, 1),
        legend.justification = c(-.1, 1.1),
        legend.background = element_rect(color = "black"),
        legend.key.width = unit(1.5, "cm"))

4.3.3 Semipartial Correlation

effectsize::r2_semipartial(fit_lm_both)
# A tibble: 2 × 5
  Term   r2_semipartial    CI CI_low CI_high
  <chr>           <dbl> <dbl>  <dbl>   <dbl>
1 soft            0.949  0.95  0.756       1
2 basket          0.943  0.95  0.737       1

4.3.4 Venn Diagram - Variances

https://freetools.touchpoint.com/venn-diagram-template-generator

ABCD <- var(df_sport$pref)
ABCD
[1] 6.5
AB <- var(fit_lm_soft$residuals)
AB
[1] 6.216137
BC <- var(fit_lm_basket$residuals)
BC
[1] 6.254138
D <- var(fit_lm_both$residuals)
D
[1] 0.08349447
ABC <- ABCD - D
ABC
[1] 6.416506
A <- ABC - BC
A
[1] 0.1623674
C <- ABC - AB
C
[1] 0.2003681
B <- AB + BC - ABC
B
[1] 6.05377

4.3.5 Venn Diagram - Proportions

Total Variance in Preference Explained by Both Skills

R2 <- ABC/ABCD
R2
[1] 0.9871547

Variance in Preference Uniquely Explained by Softball skills, across all Basketball skill levels

sr2_soft <- A/ABCD
sr2_soft
[1] 0.0249796

Variance in Preference Uniquely Explained by Basketball skills, across all Softball skill levels

sr2_basket <- C/ABCD
sr2_basket
[1] 0.03082587

Variance in Preference Explained by Softball skills, when holding Basketball skill constant

pr2_soft <- A/(A + D)
pr2_soft
[1] 0.6604009
pr2_basket <- C/(C + D)
pr2_basket
[1] 0.7058631

4.3.6 Residual Diagnostics

performance::check_residuals(fit_lm_both)
OK: Simulated residuals appear as uniformly distributed (p = 0.676).
ggResidpanel::resid_panel(fit_lm_both)