<iframe width="100%" height="100%" src="https://www.youtube.com/embed/MEZ0eB-1wLc" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
When there is high multicollinearity confirmed between the independent variables and you are forced to include all of them in the model, it will lead to over estimated regression coefficients. Since they are changed, now they are biased. This is because high collinearity makes inverse of the matrix bigger.
This tutorial provides guidance to estimate advanced models which can shrink the coefficients to make them unbiased. These models are Ridge regression, Lasso regression and Elastic net regression. They models have good applicability in machine learning.
The codes for the tutorial are available below.
title: “R Notebook”
output: html_notebook
importing libraries
library(tidyverse) # for data management
library(readxl) # to read excel file
library(mctest) # for multicollinearity tests
library(olsrr) # post regression tests
library(GGally) # scatter plot
library(caret) # splits training and testing data
library(glmnet) # elastic net regression
library(corrplot) # correlation plots
library(qgraph) # Correlation distance plots
df <- read_excel(“C:/Users/Noman Arshed/OneDrive/Research projects/mubashir munir papers/paper 4 (logistics)/logisticsdata.xlsx”)
df <- df %>% mutate(tb = (nettrade/dgdp)*100)
df <- df %>% mutate (ldgdp = log(dgdp))
df <- df %>% mutate (lwgdp = log(wgdp-dgdp))
df1 <- df %>% drop_na()
df1 <- df1 %>% select(tb,l11, l21, l31, l41, l51, l71, lwgdp, er)
ggscatmat(df1, columns = 1: ncol(df1))
df.cor = cor(df1)
par(“mar”)
par(“mai”)
corrplot(df.cor)
qgraph(cor(df1), title = “Q Graph of Indicators”, theme = “gray”)
qgraph(cor(df1), title = “Q Graph of Indicators”, layout=”spring”, shape=”rectangle”, theme = “gray”)
model <- lm(tb ~ l11 + l21 + l31 + l41 + l51 + l71 + lwgdp + er, data = df1)
summary(model)
ols_correlations(model)
ols using GMNNET
set.seed(123)
training.samples <- df1$tb %>% createDataPartition(p = 0.8, list = FALSE)
train.data <- df1[training.samples, ]
test.data <- df1[-training.samples, ]
x <- model.matrix(tb~., train.data)[,-1]
y <- train.data$tb
model.ols <- glmnet(x, y)
plot(model.ols)
plot(model.ols, xvar = “lambda”, label = TRUE)
plot(model.ols, xvar = “dev”, label = TRUE)
cvols = cv.glmnet(x, y)
plot(cvols)
print(model.ols)
ridge reg
set.seed(123)
training.samples <- df1$tb %>% createDataPartition(p = 0.8, list = FALSE)
train.data <- df1[training.samples, ]
test.data <- df1[-training.samples, ]
Predictor variables
x <- model.matrix(tb~., train.data)[,-1]
Outcome variable
y <- train.data$tb
Find the best lambda using cross-validation
set.seed(123)
cv <- cv.glmnet(x, y, alpha = 0)
Display the best lambda value
cvms <- cv$cvm
lambdas <- cv$lambda
plot(cv)
cv$lambda.min
Fit the final model on the training data
model.ridge <- glmnet(x, y, alpha = 0, lambda = cv$lambda.min)
Display regression coefficients
coef(model.ridge)
summary(model.ridge)
W <- as.matrix (coef(model.ridge))
W
keep_x <- rownames(W)[W!=0]
keep_x <- keep_x[!keep_x == “(Intercept)”]
X <- x [,keep_x]
fit <- lm(formula = train.data$tb ~ X)
summary(fit)
Make predictions on the test data
x.test <- model.matrix(tb ~., test.data)[,-1] predictions <- model.ridge %>% predict(x.test) %>% as.vector()
Model performance metrics
data.frame(
RMSE = RMSE(predictions, test.data$tb),
Rsquare = R2(predictions, test.data$tb)
)
lasso reg
Find the best lambda using cross-validation
set.seed(123)
cv <- cv.glmnet(x, y, alpha = 1)
Display the best lambda value
cv$lambda.min
Fit the final model on the training data
model.lasso <- glmnet(x, y, alpha = 1, lambda = cv$lambda.min)
Dsiplay regression coefficients
coef(model.lasso)
W <- as.matrix (coef(model.lasso))
W
keep_x <- rownames(W)[W!=0]
keep_x <- keep_x[!keep_x == “(Intercept)”]
X <- x [,keep_x]
fit <- lm(formula = train.data$tb ~ X)
summary(fit)
Make predictions on the test data
x.test <- model.matrix(tb ~., test.data)[,-1] predictions <- model.lasso %>% predict(x.test) %>% as.vector()
Model performance metrics
data.frame(
RMSE = RMSE(predictions, test.data$tb),
Rsquare = R2(predictions, test.data$tb)
)
elastic net
set.seed(123)
training.samples <- df1$tb %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- df1[training.samples, ]
test.data <- df1[-training.samples, ]
Build the model using the training set
set.seed(123)
model.elastic <- train(
tb ~., data = train.data, method = “glmnet”,
trControl = trainControl(“cv”, number = 10),
tuneLength = 10
)
Best tuning parameter
model.elastic$bestTune
Coefficient of the final model. You need
to specify the best lambda
coef(model.elastic$finalModel, model.elastic$bestTune$lambda)
Make predictions on the test data
x.test <- model.matrix(tb ~., test.data)[,-1] predictions <- model.elastic %>% predict(x.test)
Model performance metrics
data.frame(
RMSE = RMSE(predictions, test.data$tb),
Rsquare = R2(predictions, test.data$tb)
)
generating plots
Generate data
set.seed(123)
Predictor variables
x <- model.matrix(tb~., train.data)[,-1]
Outcome variable
y <- train.data$tb
Fit models:
fit.lasso <- glmnet(x, y, family=”gaussian”, alpha=1)
fit.ridge <- glmnet(x, y, family=”gaussian”, alpha=0)
fit.elnet <- glmnet(x, y, family=”gaussian”, alpha=.5)
10-fold Cross validation for each alpha = 0, 0.1, … , 0.9, 1.0
fit.lasso.cv <- cv.glmnet(x, y, type.measure=”mse”, alpha=1,
family=”gaussian”)
fit.ridge.cv <- cv.glmnet(x, y, type.measure=”mse”, alpha=0,
family=”gaussian”)
fit.elnet.cv <- cv.glmnet(x, y, type.measure=”mse”, alpha=.5,
family=”gaussian”)
for (i in 0:10) {
assign(paste(“fit”, i, sep=””), cv.glmnet(x, y, type.measure=”mse”,
alpha=i/10,family=”gaussian”))
}
Plot solution paths:
par(mfrow=c(3,2))
For plotting options, type ‘?plot.glmnet’ in R console
plot(fit.lasso, xvar=”lambda”)
plot(fit10, main=”LASSO”)
plot(fit.ridge, xvar=”lambda”)
plot(fit0, main=”Ridge”)
plot(fit.elnet, xvar=”lambda”)
plot(fit5, main=”Elastic Net”)