Next two PCA examples included are adapted from Allison Horst’s ESM-206 course at UCSB (Horst, 2020):
DATA SOURCE: This lab exercise utilizes the NCES public-use dataset: Education Longitudinal Study of 2002 (Lauff & Ingels, 2014) \(\color{blue}{\text{See website: nces.ed.gov}}\)
library(FactoMineR)
library(factoextra)
library(skimr)
library(naniar)
library(ggfortify)
library(janitor)
library(tidyverse)
library(here)
lab_data <- read_csv(here("data", "els_sub4.csv"))
lab_tidy <- lab_data %>%
clean_names()
# remove variables that don't make sense in a PCA
lab_sub1 <- lab_tidy %>%
select(-stu_id, # these are random numbers
-sch_id,
-byrace, # nominal (non-ordered variable)
-byparace, # nominal (non-ordered variable)
-byparlng, # nominal (non-ordered variable)
-byfcomp, # nominal (non-ordered variable)
-bypared, -bymothed, -byfathed,
-bysctrl, -byurban, -byregion)
# select columns and rename variables to have descriptive names
lab_sub2 <- lab_sub1 %>%
select(1:9,
bys20a, bys20h, bys20j, bys20k, bys20m, bys20n,
bys21b, bys21d, bys22a, bys22b, bys22c, bys22d,
bys22e, bys22g, bys22h, bys24a, bys24b) %>%
rename("stu_exp" = "bystexp",
"par_asp" = "byparasp",
"mth_read" = "bytxcstd",
"mth_test" = "bytxmstd",
"rd_test" = "bytxrstd",
"freelnch" = "by10flp",
"stu_tch" = "bys20a",
"putdownt" = "bys20h",
"safe" = "bys20j",
"disrupt" = "bys20k",
"gangs" = "bys20m",
"rac_fght" = "bys20n",
"fair" = "bys21b",
"strict" = "bys21d",
"stolen" = "bys22a",
"drugs" = "bys22b",
"t_hurt" = "bys22c",
"p_fight" = "bys22d",
"hit" = "bys22e",
"damaged" = "bys22g",
"bullied" = "bys22h",
"late" = "bys24a",
"skipped" = "bys24b")
# write a CSV datafile of the new subset with renamed columns (will use for lab 8)
write_csv(lab_sub2, here("data", "lab7-8_els2002_data_subset.csv"))
naniar
} & make data summary with {skimr
}# Plot number of missings by variable
gg_miss_var(lab_sub2)
# Look at summary of data using skimr::skim()
skim(lab_sub2)
pca1 <- lab_sub2 %>%
drop_na()
prcomp()
(function does not permit NA values)pca_out1 <- prcomp(pca1, scale = TRUE)
plot(pca_out1)
#summary(pca_out1)
jpeg(here("figures", "biplot_pca1.jpg"), res = 100) # to save the biplot
my_biplot <- autoplot(pca_out1,
colour = NA,
loadings.label = TRUE,
loadings.label.size = 3,
loadings.label.colour = "black",
loadings.label.repel = TRUE) +
theme_minimal()
my_biplot
dev.off()
my_biplot
PCA(pca1, scale.unit = TRUE, ncp = 20, graph = TRUE)
Data:
ca_pb <- read_csv(here("data", "ca_pollution_burden.csv"))
ca_dem <- read_csv(here("data", "ca_census_demographics_2010.csv"))
ca_pb_nopct <- ca_pb %>%
clean_names() %>%
select(-contains("pctl")) %>%
select(-contains("perc")) %>%
select(-latitude, - longitude)
ca_dem_clean <- ca_dem %>%
clean_names()
First, starting with ca_pb_nopct:
Note: The pollution burden and population characteristic variables are aggregates (averages) of existing variables in the data frame, so we won’t include those. That means we’ll include columns:
ozone:solid_waste
, andasthma:housing_burden
First, just selecting those:
ca_pb_subset <- ca_pb_nopct %>%
select(ozone:solid_waste, asthma:housing_burden)
To run pca we will use the prcomp
function:
pb_pca <- prcomp(ca_pb_subset, scale = TRUE) # hmmm an error
A little aside: the naniar
package for exploring missingness! See: https://naniar.njtierney.com/
Use naniar::gg_miss_var()
to plot the number of missings by variable:
# Plot number of missings by variable
gg_miss_var(ca_pb_subset)
Let’s say our conclusion is that there are missings, but not many (compared to the actual scope of the data). We’ll only keep our complete cases (census tracts without any missings).
Use tidyr::drop_na()
with no variables specified to keep ONLY complete cases across all variables:
ca_pb_nona <- ca_pb_subset %>%
drop_na()
# Now check for NAs:
summary(ca_pb_nona)
# Or use `skimr::skim()`!
skim(ca_pb_nona)
Cool. No NAs, NOW let’s try PCA again:
my_ca_pca <- prcomp(ca_pb_nona, scale = TRUE)
plot(my_ca_pca)
jpeg(here("figures", "biplot_pca2.jpg"), res = 100) # to save the biplot
# Hmmm let's try something else (this requires ggfortify):
my_biplot <- autoplot(my_ca_pca,
colour = NA,
loadings.label = TRUE,
loadings.label.size = 3,
loadings.label.colour = "black",
loadings.label.repel = TRUE) +
theme_minimal()
my_biplot
dev.off()
ca_df <- ca_dem_clean %>%
inner_join(ca_pb_nopct, by = c("census_tract_number" = "census_tract"))
Look at the dataframe first & then use drop_na() to get complete cases only:
ca_df_nona <- ca_df %>%
drop_na()
Like (you can choose a different set):
Make our subset:
my_sub <- ca_df_nona %>%
select(white_percent,
elderly_65_percent,
pm2_5,
pesticides,
traffic,
asthma,
cardiovascular_disease,
poverty)
Then run PCA:
my_dem_pca <- prcomp(my_sub, scale = TRUE)
Check it out a bit:
# Proportion of variance (& cumulative variance) explained by each PC
summary(my_dem_pca)
# Rotations (linear combinations for each PC):
my_dem_pca
Make a sweet biplot:
jpeg(here("figures", "biplot_pca3.jpg"), res = 100) # to save the biplot
my_dem_biplot <- autoplot(my_dem_pca,
colour = NA,
loadings.label = TRUE,
loadings.label.size = 3,
loadings.label.colour = "black",
loadings.label.repel = TRUE) +
theme_minimal() +
scale_y_continuous(limits = c(-0.05, 0.05))
my_dem_biplot
dev.off()
Hallquist, M. N., & Wiley, J. F. (2018). MplusAutomation: An R Package for Facilitating Large-Scale Latent Variable Analyses in Mplus. Structural equation modeling: a multidisciplinary journal, 25(4), 621-638.
Horst, A. (2020). Course & Workshop Materials. GitHub Repositories, https://https://allisonhorst.github.io/
Muthén, L.K. and Muthén, B.O. (1998-2017). Mplus User’s Guide. Eighth Edition. Los Angeles, CA: Muthén & Muthén
R Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL http://www.R-project.org/
Wickham et al., (2019). Welcome to the tidyverse. Journal of Open Source Software, 4(43), 1686, https://doi.org/10.21105/joss.01686