Factor analysis is used to reduce the dimensionality of the data. Each new independent variable in the analysis adds a new dimension. So if, by luck, you have many relevant independent variables, you may end up in a dilemma about which variables to keep and which to drop, because using all of them may lead to multicollinearity.
Factor analysis proposes a smaller set of variables that may have the same explanatory power as the full set of variables. Because the set is smaller, it has lower dimension and complexity. This comes in handy if you have a large data set with many variables, where a model using all of them might not be estimable with the computing power available.
Loading libraries
# Packages used throughout this script.
library(readxl) # read Excel files (read_excel)
library(tidyverse) # data manipulation (drop_na)
library(corrplot) # correlation plot
library(qgraph) # correlation and loading graphs
library(psych) # for rotated factor analysis (principal)
# NOTE(review): plyr is attached after tidyverse (which already attaches
# dplyr), so plyr's same-named functions presumably mask dplyr's for the rest
# of the session -- confirm this load order is intended.
library(plyr) # managing data files
library(dplyr) # data management
library(pastecs) # for descriptive statistics (stat.desc)
Import Functions
# Load the helper scripts from the vqv/ggbiplot GitHub repository; the
# ggscreeplot() and ggbiplot() calls further down rely on them.
# Requires an internet connection. String literals must use straight quotes:
# typographic quotes are not valid string delimiters in R.
source("https://raw.githubusercontent.com/vqv/ggbiplot/master/R/ggbiplot.r")
source("https://raw.githubusercontent.com/vqv/ggbiplot/master/R/ggscreeplot.r")
The following libraries fail to install in some R versions, but the code is provided for you:
# Install the ggbiplot package from GitHub and attach it.
library(usethis)
library(devtools)
# Straight quotes are required around the repository name.
# force = TRUE reinstalls even when the package is already installed.
install_github("vqv/ggbiplot", force = TRUE)
library(ggbiplot)
Importing data
# Read the car sales workbook, skipping the first row of the sheet.
# The path must be delimited by straight quotes to be a valid R string.
# NOTE(review): the dash in "MPhil – MS courses" is an en dash; confirm it
# matches the real folder name and was not converted from a plain hyphen.
sales <- read_excel(
  "D:/UMT notes/MPhil – MS courses/Applied Econometrics/lectures applied econometrics/lecture 6/data reduction using factor analysis/car_sales.xlsx",
  skip = 1
)
summarising data
# Descriptive statistics of the raw data.
summary(sales)
stat.desc(sales)

# Drop every row that contains a missing value before computing correlations.
sales1 <- drop_na(sales)

# Correlation matrix of the indicator columns (6 to 14), computed once and
# reused by all three plots below.
indicator_cor <- cor(sales1[, 6:14])

corrplot(indicator_cor, method = "color")
qgraph(indicator_cor, title = "Q Graph of Indicators", theme = "gray")
qgraph(
  indicator_cor,
  title = "Q Graph of Indicators",
  layout = "spring",
  shape = "rectangle",
  theme = "gray"
)
Principal component matrix
# Principal component analysis of the indicator columns (6 to 14).
# princomp()'s argument is `cor`, not `corr`: the misspelled name is not
# matched to `cor`, so the analysis silently ran on the covariance matrix.
# With cor = TRUE the components are extracted from the correlation matrix,
# which is what the "eigenvalue = 1" reference line used below presupposes.
pca <- princomp(sales1[6:14], cor = TRUE)

# Component scores for every observation (printed, then kept in df2).
pca$scores
df2 <- pca$scores

# Importance of components and the variable loadings.
summary(pca)
loadings(pca)
identification of factors
# Component variances with a reference line at 1.
plot(pca)
abline(h = 1, col = "red", lty = 5)

# Scree plot of all nine components with the same reference line.
screeplot(pca, type = "lines", npcs = 9, main = "Screeplot of the 9 PCs")
abline(h = 1, col = "red", lty = 5)
legend(
  "topright",
  legend = c("Eigen value = 1"),
  col = c("red"), lty = 5, cex = 0.6
)

ggscreeplot(pca) + ggtitle("PCA Screeplot")

# Cumulative proportion of variance explained by the components.
cumpro <- cumsum(pca$sdev^2 / sum(pca$sdev^2))

# Plot every component. The previous cumpro[0:15] used index 0 (dropped by R)
# and indices beyond the available components (NA padding).
plot(
  cumpro,
  xlab = "PC #",
  ylab = "Amount of explained variance",
  main = "Cumulative variance plot"
)

# Mark the chosen cut-off component. The horizontal line is read from the
# data instead of a hard-coded value, so it always crosses the curve at the
# cut-off.
cutoff_pc <- 6
abline(v = cutoff_pc, col = "blue", lty = 5)
abline(h = cumpro[cutoff_pc], col = "blue", lty = 5)
legend(
  "topleft",
  legend = paste0("Cut-off @ PC", cutoff_pc),
  col = c("blue"), lty = 5, cex = 0.6
)

# Biplots of observations and variables on the first two components.
biplot(pca, scale = 0, cex = 0.7)
ggbiplot(pca, labels = rownames(sales1)) + ggtitle("PCA Biplot")
loading plot without rotation
# Loadings of the unrotated PCA; the first three components are drawn as a
# loading graph (straight quotes are required for the string arguments).
sales.pca.loadings <- loadings(pca)
qgraph.loadings(
  sales.pca.loadings[, 1:3],
  posCol = "darkgreen",
  title = "PCA Loadings",
  layout = "circle",
  negCol = "darkmagenta",
  edge.width = 1
)
rotation of factors
# Varimax-rotated principal components; three factors are kept because three
# had values above 1 in the analysis above.
rpca <- principal(sales1[6:14], nfactors = 3, rotate = "varimax")
rpca
summary(rpca)

# Biplot of the rotated solution (straight quotes are required for the
# colour names).
biplot.psych(
  rpca,
  col = c("black", "red"),
  cex = c(0.7, 0.8),
  xlim.s = c(-3, 3),
  ylim.s = c(-2, 4)
)
loading plot with rotation
# Loadings of the rotated solution, drawn once with a circular layout and
# once with a spring layout (straight quotes are required for the strings).
rpca.loadings <- loadings(rpca)
qgraph.loadings(
  rpca.loadings[, 1:3],
  posCol = "darkgreen",
  layout = "circle",
  negCol = "darkmagenta",
  edge.width = 1,
  title = "PCA Rotated Loading"
)
qgraph.loadings(
  rpca.loadings[, 1:3],
  posCol = "darkgreen",
  layout = "spring",
  negCol = "darkmagenta",
  edge.width = 1,
  title = "PCA Rotated Loading"
)