<iframe width="100%" height="100%" src="https://www.youtube.com/embed/MEZ0eB-1wLc" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>

When there is high multicollinearity confirmed between the independent variables and you are forced to include all of them in the model, it will lead to over estimated regression coefficients. Since they are changed, now they are biased. This is because high collinearity makes inverse of the matrix bigger.

This tutorial provides guidance to estimate advanced models which can shrink the coefficients to make them unbiased. These models are Ridge regression, Lasso regression and Elastic net regression. They models have good applicability in machine learning.

The codes for the tutorial are available below.

title: “R Notebook”

output: html_notebook

importing libraries

library(tidyverse) # for data management
library(readxl) # to read excel file
library(mctest) # for multicollinearity tests
library(olsrr) # post regression tests
library(GGally) # scatter plot
library(caret) # splits training and testing data
library(glmnet) # elastic net regression
library(corrplot) # correlation plots
library(qgraph) # Correlation distance plots

df <- read_excel(“C:/Users/Noman Arshed/OneDrive/Research projects/mubashir munir papers/paper 4 (logistics)/logisticsdata.xlsx”)

df <- df %>% mutate(tb = (nettrade/dgdp)*100)
df <- df %>% mutate (ldgdp = log(dgdp))
df <- df %>% mutate (lwgdp = log(wgdp-dgdp))

df1 <- df %>% drop_na()
df1 <- df1 %>% select(tb,l11, l21, l31, l41, l51, l71, lwgdp, er)

ggscatmat(df1, columns = 1: ncol(df1))
df.cor = cor(df1)
par(“mar”)
par(“mai”)
corrplot(df.cor)

qgraph(cor(df1), title = “Q Graph of Indicators”, theme = “gray”)
qgraph(cor(df1), title = “Q Graph of Indicators”, layout=”spring”, shape=”rectangle”, theme = “gray”)

model <- lm(tb ~ l11 + l21 + l31 + l41 + l51 + l71 + lwgdp + er, data = df1)
summary(model)
ols_correlations(model)

ols using GMNNET

set.seed(123)
training.samples <- df1$tb %>% createDataPartition(p = 0.8, list = FALSE)
train.data <- df1[training.samples, ]
test.data <- df1[-training.samples, ]
x <- model.matrix(tb~., train.data)[,-1]
y <- train.data$tb
model.ols <- glmnet(x, y)
plot(model.ols)
plot(model.ols, xvar = “lambda”, label = TRUE)
plot(model.ols, xvar = “dev”, label = TRUE)

cvols = cv.glmnet(x, y)
plot(cvols)
print(model.ols)

ridge reg

set.seed(123)
training.samples <- df1$tb %>% createDataPartition(p = 0.8, list = FALSE)
train.data <- df1[training.samples, ]
test.data <- df1[-training.samples, ]

Predictor variables

x <- model.matrix(tb~., train.data)[,-1]

Outcome variable

y <- train.data$tb

Find the best lambda using cross-validation

set.seed(123)
cv <- cv.glmnet(x, y, alpha = 0)

Display the best lambda value

cvms <- cv$cvm
lambdas <- cv$lambda
plot(cv)
cv$lambda.min

Fit the final model on the training data

model.ridge <- glmnet(x, y, alpha = 0, lambda = cv$lambda.min)

Display regression coefficients

coef(model.ridge)
summary(model.ridge)
W <- as.matrix (coef(model.ridge))
W
keep_x <- rownames(W)[W!=0]
keep_x <- keep_x[!keep_x == “(Intercept)”]
X <- x [,keep_x]
fit <- lm(formula = train.data$tb ~ X)
summary(fit)

Make predictions on the test data

x.test <- model.matrix(tb ~., test.data)[,-1] predictions <- model.ridge %>% predict(x.test) %>% as.vector()

Model performance metrics

data.frame(
RMSE = RMSE(predictions, test.data$tb),
Rsquare = R2(predictions, test.data$tb)
)

lasso reg

Find the best lambda using cross-validation

set.seed(123)
cv <- cv.glmnet(x, y, alpha = 1)

Display the best lambda value

cv$lambda.min

Fit the final model on the training data

model.lasso <- glmnet(x, y, alpha = 1, lambda = cv$lambda.min)

Dsiplay regression coefficients

coef(model.lasso)
W <- as.matrix (coef(model.lasso))
W
keep_x <- rownames(W)[W!=0]
keep_x <- keep_x[!keep_x == “(Intercept)”]
X <- x [,keep_x]
fit <- lm(formula = train.data$tb ~ X)
summary(fit)

Make predictions on the test data

x.test <- model.matrix(tb ~., test.data)[,-1] predictions <- model.lasso %>% predict(x.test) %>% as.vector()

Model performance metrics

data.frame(
RMSE = RMSE(predictions, test.data$tb),
Rsquare = R2(predictions, test.data$tb)
)

elastic net

set.seed(123)
training.samples <- df1$tb %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- df1[training.samples, ]
test.data <- df1[-training.samples, ]

Build the model using the training set

set.seed(123)
model.elastic <- train(
tb ~., data = train.data, method = “glmnet”,
trControl = trainControl(“cv”, number = 10),
tuneLength = 10
)

Best tuning parameter

model.elastic$bestTune

Coefficient of the final model. You need

to specify the best lambda

coef(model.elastic$finalModel, model.elastic$bestTune$lambda)

Make predictions on the test data

x.test <- model.matrix(tb ~., test.data)[,-1] predictions <- model.elastic %>% predict(x.test)

Model performance metrics

data.frame(
RMSE = RMSE(predictions, test.data$tb),
Rsquare = R2(predictions, test.data$tb)
)

generating plots

Generate data

set.seed(123)

Predictor variables

x <- model.matrix(tb~., train.data)[,-1]

Outcome variable

y <- train.data$tb

Fit models:

fit.lasso <- glmnet(x, y, family=”gaussian”, alpha=1)
fit.ridge <- glmnet(x, y, family=”gaussian”, alpha=0)
fit.elnet <- glmnet(x, y, family=”gaussian”, alpha=.5)

10-fold Cross validation for each alpha = 0, 0.1, … , 0.9, 1.0

fit.lasso.cv <- cv.glmnet(x, y, type.measure=”mse”, alpha=1,
family=”gaussian”)
fit.ridge.cv <- cv.glmnet(x, y, type.measure=”mse”, alpha=0,
family=”gaussian”)
fit.elnet.cv <- cv.glmnet(x, y, type.measure=”mse”, alpha=.5,
family=”gaussian”)

for (i in 0:10) {
assign(paste(“fit”, i, sep=””), cv.glmnet(x, y, type.measure=”mse”,
alpha=i/10,family=”gaussian”))
}

Plot solution paths:

par(mfrow=c(3,2))

For plotting options, type ‘?plot.glmnet’ in R console

plot(fit.lasso, xvar=”lambda”)
plot(fit10, main=”LASSO”)

plot(fit.ridge, xvar=”lambda”)
plot(fit0, main=”Ridge”)

plot(fit.elnet, xvar=”lambda”)
plot(fit5, main=”Elastic Net”)

Chapter 2 – Estimating Advanced Multicollinearity Solution Models using R

output: html_notebook

importing libraries

ols using GMNNET

ridge reg

Predictor variables

Outcome variable

Find the best lambda using cross-validation

Display the best lambda value

Fit the final model on the training data

Display regression coefficients

Make predictions on the test data

Model performance metrics

lasso reg

Find the best lambda using cross-validation

Display the best lambda value

Fit the final model on the training data

Dsiplay regression coefficients

Make predictions on the test data

Model performance metrics

elastic net

Build the model using the training set

Best tuning parameter

Coefficient of the final model. You need

to specify the best lambda

Make predictions on the test data

Model performance metrics

generating plots

Generate data

Predictor variables

Outcome variable

Fit models:

10-fold Cross validation for each alpha = 0, 0.1, … , 0.9, 1.0

Plot solution paths:

For plotting options, type ‘?plot.glmnet’ in R console

Leave a Comment Cancel Reply