Note:
- The example used in this tutorial, data about teacher discipline, is different from the example presented in Henry & Muthén (2010). This data is not currently publicly available.
- All models are estimated in
Mplus
via the R packageMplusAutomation
.
References
SOURCE CITATION: Henry, K. L., & Muthén, B. (2010). Multilevel latent class analysis: An application of adolescent smoking typologies with individual and contextual predictors. Structural Equation Modeling, 17(2), 193-215.
Hallquist, M. N., & Wiley, J. F. (2018). MplusAutomation: An R Package for Facilitating Large-Scale Latent Variable Analyses in Mplus. Structural equation modeling: a multidisciplinary journal, 25(4), 621-638.
Muthén, L.K. and Muthén, B.O. (1998-2017). Mplus User’s Guide. Eighth Edition. Los Angeles, CA: Muthén & Muthén
Prepare & explore data
loading packages…
library(tidyverse)
library(haven)
library(MplusAutomation)
library(rhdf5)
library(here)
library(glue)
library(stargazer)
library(gt)
library(janitor)
library(semPlot)
library(reshape2)
library(cowplot)
read spss >> write csv >> read csv
data_spss <- read_spss(here("data", "teacher_discip_strat_data.sav")) %>%
clean_names()
# write a CSV datafile (to remove labels)
write_csv(data_spss, here("data", "teach_discip_data.csv"))
read the unlabeled data back into R
view labeled data (create a codebook)
prepare data for MplusAutomation
shorten names to be < 8
characters
names(mlca_mplus) <- str_remove(names(mlca_mplus), pattern = "itive")
mlca_mplus <- mlca_mplus %>%
rename(pop = population, # Bullying is a big problem in this school
distcode = districtcode,
schlcode = schoolcode,
postcode = positioncode)
View descriptive statistics for LCA measurement indicators
lca_summary <- mlca_mplus %>%
select(53:67)
stargazer(as.data.frame(lca_summary), header = FALSE, digits=1)
Make table grouped by school (schlcode
)
# how many school clusters are there?
# length(unique(mlca_mplus$schlcode)) # 130 schools
school_summary <- mlca_mplus %>%
group_by(schlcode) %>%
summarize(
mean_lvl = mean(level, na.rm = TRUE),
mean_pun_1 = mean(pun_1, na.rm = TRUE),
mean_pos_1 = mean(pos_1, na.rm = TRUE),
mean_sel_1 = mean(sel_1, na.rm = TRUE),
sample_n = n())
school_summary[1:10,] %>%
gt()
schlcode | mean_lvl | mean_pun_1 | mean_pos_1 | mean_sel_1 | sample_n |
---|---|---|---|---|---|
10 | 1 | 1.704545 | 3.545455 | 3.477273 | 44 |
11 | 1 | 1.482759 | 3.482759 | 3.413793 | 29 |
14 | 1 | 1.451613 | 3.483871 | 3.548387 | 31 |
16 | 2 | 1.794872 | 3.102564 | 3.000000 | 39 |
17 | 2 | 1.797101 | 3.260870 | 3.043478 | 69 |
18 | 3 | 1.871795 | 2.948718 | 2.833333 | 78 |
20 | 1 | 1.435897 | 3.615385 | 3.615385 | 39 |
21 | 1 | 1.659574 | 3.617021 | 3.319149 | 47 |
23 | 2 | 1.838710 | 3.096774 | 3.032258 | 31 |
24 | 3 | 1.825000 | 2.825000 | 2.375000 | 40 |
Note: In order to reduce estimation time for this example 7 indicators were chosen and dichotomized. For the same reason the 4-class solution was used in all MLCA models for purposes of demonstration.
Recode data
7 indicators used in LCA demonstration
- pos_1 = Students are praised often.
- pos_3 = Teachers often let students know when they are being good.
- pos_2 = Students are often given rewards for being good.
- pos_4 = Classes get rewards for good 1 behavior.
- sel_5 = Students are taught they should care about how others feel.
- sel_2 = Students are taught to understand how others think and feel.
- sel_1 = Students are taught to feel responsible for how they act.
convert indicators to be dichotomous
mlca_mplus <- mlca_mplus %>%
mutate(
pos_1b = case_when(
pos_1 < 3 ~ 0, # disagree ~ responses 1 & 2
pos_1 >= 3 ~ 1)) %>% # agree ~ responses 3 & 4
mutate(
pos_3b = case_when(
pos_3 < 3 ~ 0,
pos_3 >= 3 ~ 1)) %>%
mutate(
pos_2b = case_when(
pos_2 < 3 ~ 0,
pos_2 >= 3 ~ 1)) %>%
mutate(
pos_4b = case_when(
pos_4 < 3 ~ 0,
pos_4 >= 3 ~ 1)) %>%
mutate(
sel_5b = case_when(
sel_5 < 3 ~ 0,
sel_5 >= 3 ~ 1)) %>%
mutate(
sel_2b = case_when(
sel_2 < 3 ~ 0,
sel_2 >= 3 ~ 1)) %>%
mutate(
sel_1b = case_when(
sel_1 < 3 ~ 0,
sel_1 >= 3 ~ 1))
table(mlca_mplus$sel_1)
##
## 1 2 3 4
## 81 659 3190 1157
##
## 0 1
## 740 4347
model 00: LCA enumeration (fixed effect model)
lca_k1_6 <- lapply(1:6, function(k) {
lca_enum <- mplusObject(
TITLE = glue("C{k}_mlca_enum_demo"),
VARIABLE =
glue(
"categorical = pos_1b-sel_1b;
usevar = pos_1b-sel_1b;
classes = c({k});"),
ANALYSIS =
"estimator = mlr;
type = mixture;
starts = 500 100;",
MODEL = "",
OUTPUT = "",
PLOT =
"type = plot3;
series = pos_1b-sel_1b(*);",
usevariables = colnames(mlca_mplus),
rdata = mlca_mplus)
lca_enum_fit <- mplusModeler(lca_enum,
dataout=glue(here("enum_mplus", "c_{k}_mlca_enum.dat")),
modelout=glue(here("enum_mplus", "c_{k}_mlca_enum.inp")) ,
check=TRUE, run = TRUE, hashfilename = FALSE)
})
Read models & plot LCA (\(K=4\))
output_enum <- readModels(here("enum_mplus"), quiet = TRUE)
enum_summary <- LatexSummaryTable(output_enum,
keepCols=c("Title",
"LL",
"BIC",
"aBIC"),
sortBy = "Title")
enum_summary %>%
gt()
Title | LL | BIC | aBIC |
---|---|---|---|
C1_mlca_enum_demo | -13867.62 | 27794.98 | 27772.73 |
C2_mlca_enum_demo | -11348.33 | 22824.67 | 22777.01 |
C3_mlca_enum_demo | -10971.08 | 22138.45 | 22065.36 |
C4_mlca_enum_demo | -10671.22 | 21607.00 | 21508.50 |
C5_mlca_enum_demo | -10628.89 | 21590.62 | 21466.69 |
C6_mlca_enum_demo | -10600.19 | 21601.50 | 21452.15 |
plot 4-class LCA probability plot
# extract posterior probabilities
pp1 <- as.data.frame(output_enum[["c_4_mlca_enum.out"]]
[["gh5"]]
[["means_and_variances_data"]]
[["estimated_probs"]]
[["values"]]
[seq(2, 14, 2),]) #seq("from","to","by")
# extract model estimated class sizes
c_size <- as.data.frame(output_enum[["c_4_mlca_enum.out"]]
[["class_counts"]]
[["modelEstimated"]]
[["proportion"]])
colnames(c_size) <- paste0("cs")
c_size <- c_size %>% mutate(cs = round(cs*100, 1))
colnames(pp1) <- paste0("C", 1:4, glue(" ({c_size[1:4,]}%)"))
pp1 <- cbind(Var = paste0("U", 1:7), pp1)
# choose the order of indicators & label
pp1$Var <- factor(pp1$Var,
levels = c("U1","U2","U3","U4","U5", "U6", "U7"),
labels = c("Verbal2", "Verbal1", "Reward1", "Reward2",
"Empathy1","Empathy2", "Responsible"))
pd_long <- melt(pp1, id.vars = "Var")
# plot data
ggplot(pd_long, aes(as.integer(Var), value, shape = variable,
colour = variable, lty = variable)) +
geom_point(size = 4) + geom_line() +
scale_x_continuous("", breaks = 1:7, labels = pp1$Var) +
scale_y_continuous("Probability") +
scale_colour_grey() +
theme_cowplot() +
theme(text=element_text(family="Times New Roman", size=12),
legend.key.width = unit(.5, "line"),
legend.text = element_text(family="Times New Roman", size=12),
legend.title = element_blank(),
legend.position = "top")
model00: Compute intra-class correlations (type = basic; w/ analysis = TWOLEVEL;)
# Note: In this example the ICC's are zero because items are dichotomous
mlca_00 <- mplusObject(
TITLE = "model00_basic__ICC_mlca",
VARIABLE =
"usevar = pos_1b-sel_1b;
cluster = schlcode;
within = pos_1b-sel_1b;",
ANALYSIS =
"estimator = mlr;
type = basic twolevel; ! ask for ICC curves
processors = 10;",
MODEL = "",
OUTPUT = "sampstat;",
PLOT = "",
usevariables = colnames(mlca_mplus),
rdata = mlca_mplus)
mlca_00_fit <- mplusModeler(mlca_00,
dataout=here("mlca_mplus", "model00_basic.dat"),
modelout=here("mlca_mplus", "model00_basic.inp"),
check=TRUE, run = TRUE, hashfilename = FALSE)
Compare Multi-level parametric & non-parametric models described in Henry & Muthen (2010)
model01: parametric random effects model (4-class)
Figure 1. Picture adapted from, Henry & Muthen 2010
# warning, run-time is very slow
mlca_01 <- mplusObject(
TITLE = "model01_parametric_mlca",
VARIABLE =
"usevar = pos_1b-sel_1b;
categorical = pos_1b-sel_1b;
classes = c(4);
cluster = schlcode; ! level 2 units are schools
within = pos_1b-sel_1b;",
ANALYSIS =
"estimator = mlr;
type = mixture twolevel;
integration=montecarlo(1000);
starts = 100 50;
processors = 10;",
MODEL =
"%WITHIN%
%OVERALL%
%BETWEEN%
%OVERALL%
C#1;
C#2;
C#3;
C#1 WITH C#2;
C#3 WITH C#1 C#2; ",
OUTPUT = "TECH8;",
PLOT =
"type = plot3;
series = pos_1b-sel_1b(*);",
usevariables = colnames(mlca_mplus),
rdata = mlca_mplus)
mlca_01_fit <- mplusModeler(mlca_01,
dataout=here("mlca_mplus", "model01_parametric.dat"),
modelout=here("mlca_mplus", "model01_parametric.inp"),
check=TRUE, run = FALSE, hashfilename = FALSE)
model02: parametric model with 2nd level factor
Figure 2. Picture adapted from, Henry & Muthen 2010
mlca_02 <- mplusObject(
TITLE = "model02_parametric_mlca",
VARIABLE =
"usevar = pos_1b-sel_1b;
categorical = pos_1b-sel_1b;
classes = c(4);
cluster = schlcode; ! level 2 units are schools
within = pos_1b-sel_1b;",
ANALYSIS =
"estimator = mlr;
type = mixture twolevel;
starts = 20 10;
processors = 10;",
MODEL =
"%WITHIN%
%OVERALL%
%BETWEEN%
%OVERALL%
FC by C#1 C#2 C#3;",
OUTPUT = "TECH8;",
PLOT =
"type = plot3;
series = pos_1b-sel_1b(*);",
usevariables = colnames(mlca_mplus),
rdata = mlca_mplus)
mlca_02_fit <- mplusModeler(mlca_02,
dataout=here("mlca_mplus", "model02_parametric.dat"),
modelout=here("mlca_mplus", "model02_parametric.inp"),
check=TRUE, run = FALSE, hashfilename = FALSE)
model03: non-parametric model
Figure 3. Picture adapted from, Henry & Muthen 2010
mlca_03 <- mplusObject(
TITLE = "model03_non_parametric_mlca",
VARIABLE =
"usevar = pos_1b-sel_1b;
categorical = pos_1b-sel_1b;
classes = CB(3) c(4);
cluster = schlcode; ! level 2 units are schools
within = pos_1b-sel_1b;
between = CB;",
ANALYSIS =
"estimator = mlr;
type = mixture twolevel;
starts = 20 10;
processors = 10;",
MODEL =
"%WITHIN%
%OVERALL%
%BETWEEN%
%OVERALL%
C on CB;
MODEL C:
%WITHIN%
%C#1%
[pos_1b$1-sel_1b$1];
%C#2%
[pos_1b$1-sel_1b$1];
%C#3%
[pos_1b$1-sel_1b$1];
%C#4%
[pos_1b$1-sel_1b$1]; ",
OUTPUT = "TECH8;",
PLOT =
"type = plot3;
series = pos_1b-sel_1b(*);",
usevariables = colnames(mlca_mplus),
rdata = mlca_mplus)
mlca_03_fit <- mplusModeler(mlca_03,
dataout=here("mlca_mplus", "model03_non_parametric.dat"),
modelout=here("mlca_mplus", "model03_non_parametric.inp"),
check=TRUE, run = FALSE, hashfilename = FALSE)
model04: parametric model with 2nd level factor on random latent class indicators
Figure 4. Picture adapted from, Henry & Muthen 2010
mlca_04 <- mplusObject(
TITLE = "model04_parametric_mlca",
VARIABLE =
"usevar = pos_1b-sel_1b;
categorical = pos_1b-sel_1b;
classes = c(4);
cluster = schlcode; ",
ANALYSIS =
"estimator = mlr;
type = mixture twolevel;
starts = 20 10;
processors = 10;",
MODEL =
"%WITHIN%
%OVERALL%
%BETWEEN%
%OVERALL%
FU by pos_1b-sel_1b;
[FU@0];
FU WITH C#1 C#2 C#3;
C#1;
C#2;
C#3;
C#1 WITH C#2;
C#3 WITH C#1 C#2;
%C#1%
[pos_1b$1-sel_1b$1];
%C#2%
[pos_1b$1-sel_1b$1];
%C#3%
[pos_1b$1-sel_1b$1];
%C#4%
[pos_1b$1-sel_1b$1]; ",
OUTPUT = "TECH8;",
PLOT =
"type = plot3;
series = pos_1b-sel_1b(*);",
usevariables = colnames(mlca_mplus),
rdata = mlca_mplus)
mlca_04_fit <- mplusModeler(mlca_04,
dataout=here("mlca_mplus", "model04_parametric.dat"),
modelout=here("mlca_mplus", "model04_parametric.inp"),
check=TRUE, run = F, hashfilename = FALSE)
model05: parametric model with 2nd level factor on random latent class intercepts & 2nd level factor on random latent class indicators
Figure 5. Picture adapted from, Henry & Muthen 2010
mlca_05 <- mplusObject(
TITLE = "model05_parametric_mlca",
VARIABLE =
"usevar = pos_1b-sel_1b;
categorical = pos_1b-sel_1b;
classes = c(4);
cluster = schlcode; ",
ANALYSIS =
"estimator = mlr;
type = mixture twolevel;
starts = 20 10;
processors = 10;",
MODEL =
"%WITHIN%
%OVERALL%
%BETWEEN%
%OVERALL%
FU by pos_1b-sel_1b;
[FU@0];
FC BY C#1 C#2 C#3;
FC WITH FU;
%C#1%
[pos_1b$1-sel_1b$1];
%C#2%
[pos_1b$1-sel_1b$1];
%C#3%
[pos_1b$1-sel_1b$1];
%C#4%
[pos_1b$1-sel_1b$1]; ",
OUTPUT = "TECH8;",
PLOT =
"type = plot3;
series = pos_1b-sel_1b(*);",
usevariables = colnames(mlca_mplus),
rdata = mlca_mplus)
mlca_05_fit <- mplusModeler(mlca_05,
dataout=here("mlca_mplus", "model05_parametric.dat"),
modelout=here("mlca_mplus", "model05_parametric.inp"),
check=TRUE, run = F, hashfilename = FALSE)
model06: non-parametric model with level-2 factor on latent class indicators
Figure 6. Picture adapted from, Henry & Muthen 2010
mlca_06 <- mplusObject(
TITLE = "model06_non_parametric_mlca",
VARIABLE =
"usevar = pos_1b-sel_1b;
categorical = pos_1b-sel_1b;
classes = CB(2) c(4);
cluster = schlcode; ! level 2 units are schools
between = CB;",
ANALYSIS =
"estimator = mlr;
type = mixture twolevel;
starts = 20 10;
processors = 10;",
MODEL =
"%WITHIN%
%OVERALL%
%BETWEEN%
%OVERALL%
FU BY pos_1b-sel_1b;
[FU@0];
C on CB;
MODEL CB:
%BETWEEN%
%CB#1%
[FU@0];
%CB#2%
[FU];
MODEL C:
%BETWEEN%
%C#1%
[pos_1b$1-sel_1b$1];
%C#2%
[pos_1b$1-sel_1b$1];
%C#3%
[pos_1b$1-sel_1b$1];
%C#4%
[pos_1b$1-sel_1b$1]; ",
OUTPUT = "TECH8;",
PLOT =
"type = plot3;
series = pos_1b-sel_1b(*);",
usevariables = colnames(mlca_mplus),
rdata = mlca_mplus)
mlca_06_fit <- mplusModeler(mlca_06,
dataout=here("mlca_mplus", "model06_non_parametric.dat"),
modelout=here("mlca_mplus", "model06_non_parametric.inp"),
check=TRUE, run = FALSE, hashfilename = FALSE)
model07: parametric model with 2nd level factor on random latent class intercepts & 2nd level factor on random latent class indicators
Auxiliaries: one individual-level covariate & two school-level covariates
Figure 7. Picture adapted from, Henry & Muthen 2010
school-level covariates
# table(mlca_mplus$level) # 2836 elementary school students
# table(mlca_mplus$d_middle) # 1174 middle school students
# table(mlca_mplus$d_high) # 1084 high-school students
mlca_07 <- mplusObject(
TITLE = "model07_parametric_mlca",
VARIABLE =
"usevar = pos_1b-sel_1b d_middle d_high d_female;
categorical = pos_1b-sel_1b;
classes = c(4);
cluster = schlcode;
between = d_middle d_high;
within = d_female;
",
ANALYSIS =
"estimator = mlr;
algorithm = integration;
type = mixture twolevel;
starts = 20 10;
processors = 10;",
MODEL =
"%WITHIN%
%OVERALL%
C#1-C#3 on d_female;
%BETWEEN%
%OVERALL%
FU BY pos_1b@1;
FU BY pos_3b (F_pos_3b);
FU BY pos_2b (F_pos_2b);
FU BY pos_4b (F_pos_4b);
FU BY sel_5b (F_sel_5b);
FU BY sel_2b (F_sel_2b);
FU BY sel_1b (F_sel_1b);
[FU@0];
FC BY C#1 (FC_C1);
FC BY C#2 (FC_C2);
FC BY C#3 (FC_C3);
FU WITH FC;
C#2 ON d_middle (C2_mid);
C#2 ON d_high (C2_hs);
FC ON d_middle (FC_mid);
FC ON d_high (FC_hs);
FU ON d_middle (FU_mid);
FU ON d_high (FU_hs);
%C#1%
[pos_1b$1-sel_1b$1];
%C#2%
[pos_1b$1-sel_1b$1];
%C#3%
[pos_1b$1-sel_1b$1];
%C#4%
[pos_1b$1-sel_1b$1]; ",
MODELCONSTRAINT =
"NEW(MID_EV MID_pos3 MID_pos2 MID_pos4 MID_sel5 MID_sel2 MID_sel1
HS_EV HS_pos3 HS_pos2 HS_pos4 HS_sel5 HS_sel2 HS_sel1
C2_MIDSC C2_HIGHS);
MID_EV = FU_mid;
MID_pos3 = FU_mid*F_pos_3b;
MID_pos2 = FU_mid*F_pos_2b;
MID_pos4 = FU_mid*F_pos_4b;
MID_sel5 = FU_mid*F_sel_5b;
MID_sel2 = FU_mid*F_sel_2b;
MID_sel1 = FU_mid*F_sel_1b;
HS_EV = FU_hs;
HS_pos3 = FU_hs*F_pos_3b;
HS_pos2 = FU_hs*F_pos_2b;
HS_pos4 = FU_hs*F_pos_4b;
HS_sel5 = FU_hs*F_sel_5b;
HS_sel2 = FU_hs*F_sel_2b;
HS_sel1 = FU_hs*F_sel_1b;
C2_MIDSC = (FC_mid*FC_C2)+C2_mid;
C2_HIGHS = (FC_hs*FC_C2)+C2_hs;",
OUTPUT = "TECH8;",
PLOT =
"type = plot3;
series = pos_1b-sel_1b(*);",
usevariables = colnames(mlca_mplus),
rdata = mlca_mplus)
mlca_07_fit <- mplusModeler(mlca_07,
dataout=here("mlca_mplus", "model07_parametric.dat"),
modelout=here("mlca_mplus", "model07_parametric.inp"),
check=TRUE, run = F, hashfilename = FALSE)
Table of model fit
output_mlca <- readModels(here("mlca_out"), quiet = TRUE)
mlca_summary <- LatexSummaryTable(output_mlca,
keepCols=c("Title",
"Parameters",
"LL",
"BIC",
"aBIC"),
sortBy = "Title")
mlca_summary %>%
gt()
Title | Parameters | LL | BIC | aBIC |
---|---|---|---|---|
model01_parametric_mlca | 37 | -9928.028 | 20171.83 | 20054.26 |
model02_parametric_mlca | 34 | -10025.484 | 20341.14 | 20233.10 |
model03_non_parametric_mlca | 39 | -10018.788 | 20370.42 | 20246.49 |
model05_parametric_mlca | 42 | -9832.084 | 20022.62 | 19889.15 |
model06_non_parametric_mlca | 43 | -9847.624 | 20062.23 | 19925.59 |
model07_parametric_mlca | 51 | -9768.308 | 19971.87 | 19809.81 |
create a path diagram of the final model
# Read in the model to R
final_model <- readModels(here("mlca_out", "model07_par_covs.out"), quiet = TRUE)
# Plot model:
semPaths(final_model,
intercepts=FALSE,
fixedStyle = c(1))