Chapter 1 – Python Jupyter – Estimating Simple Regression

The Python Jupyter Notebook platform can be used to estimate a simple regression on a data set. Python is one of the leading programming languages, with applications in data science.

import pandas as pd
import seaborn as sns
from statsmodels.graphics.gofplots import qqplot
import matplotlib.pyplot as plt
from scipy.stats import shapiro
from scipy.stats import normaltest
%matplotlib inline

# Load the study data from a local Excel workbook. The path is specific to the
# original author's machine -- point it at your own copy of finaldata.xls.
# skiprows=[0] skips the first row of the sheet before parsing.
df = pd.read_excel(r'C:\Users\Noman Arshed\OneDrive\Research projects\Submitted\dynamic capabilities paper\finaldata.xls', skiprows=[0])

# Preview the first rows (shown when run as its own notebook cell).
df.head()

# Data type of each column.
df.dtypes

# Descriptive statistics of the columns.
df.describe()

# Show where values are missing: df.isnull() yields a boolean frame, which is
# drawn as a heatmap (colour bar suppressed) on a 10x10 figure.
plt.figure(figsize=(10, 10))
plt.title('Missing values in Vulnerability Data')
sns_plot = sns.heatmap(df.isnull(), cbar=False)
# Keep a handle on the figure that owns the heatmap axes.
fig = sns_plot.get_figure()

# Work only with complete rows from here on.
df1 = df.dropna()

# Distribution plot of the 'sens' column.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in favour
# of histplot/displot -- confirm the installed version before switching.
sns.distplot(df1['sens'])

# Q-Q plot of 'sens'; line='s' adds the standardized reference line.
qqplot(df1['sens'], line='s')
plt.show()

# Shapiro-Wilk normality test on 'sens': print the test statistic and p-value.
stat, p = shapiro(df1['sens'])
print('Statistics=%.3f, p=%.3f' % (stat, p))
# scipy.stats.normaltest on the same column, reported in the same format.
stat, p = normaltest(df1['sens'])
print('Statistics=%.3f, p=%.3f' % (stat, p))

# Scatter plot of 'seiz' against 'sens' with marginal distributions.
sns_plot = sns.jointplot(x='sens', y='seiz', data=df1, kind='scatter')

import statsmodels.formula.api as smf
import statsmodels.api as sm
import numpy as np
from statsmodels.nonparametric.smoothers_lowess import lowess
from matplotlib import rcParams
import scipy.stats as stats

# Specify an OLS model with the formula interface: 'inno' is regressed on
# 'sens' and 'seiz' using the complete-case data.
reg = smf.ols('inno ~ sens + seiz', data=df1)

# Estimate the model and print the regression summary table.
res = reg.fit()
print(res.summary())

# Influence plot for the fitted model.
sm.graphics.influence_plot(res)

# --- Residuals vs. fitted values diagnostic plot ---
residuals = res.resid
fitted = res.fittedvalues
# LOWESS smooth of residuals on fitted values; the result is read below as
# column 0 = x values and column 1 = smoothed y values.
smoothed = lowess(residuals, fitted)
# The three observations with the largest absolute residuals (labelled below).
top3 = abs(residuals).sort_values(ascending=False)[:3]

plt.rcParams.update({'font.size': 16})
plt.rcParams["figure.figsize"] = (8, 7)
fig, ax = plt.subplots()
ax.scatter(fitted, residuals, edgecolors='k', facecolors='none')
ax.plot(smoothed[:, 0], smoothed[:, 1], color='r')
ax.set_ylabel('Residuals')
ax.set_xlabel('Fitted Values')
ax.set_title('Residuals vs. Fitted')
# Faint dotted horizontal reference line at zero across the fitted range.
ax.plot([min(fitted), max(fitted)], [0, 0], color='k', linestyle=':', alpha=.3)

# Annotate the three largest residuals with their index label. .loc makes the
# lookup label-based regardless of the index type.
for i in top3.index:
    ax.annotate(i, xy=(fitted.loc[i], residuals.loc[i]))

plt.show()

# --- Normal Q-Q plot of the internally studentized residuals ---
# Take the studentized residuals from the influence object and give them the
# same index as the model residuals so points can be labelled by observation.
sorted_student_residuals = pd.Series(res.get_influence().resid_studentized_internal)
sorted_student_residuals.index = res.resid.index
sorted_student_residuals = sorted_student_residuals.sort_values(ascending=True)
df3 = pd.DataFrame(sorted_student_residuals)
df3.columns = ['sorted_student_residuals']
# With fit=False, probplot returns (theoretical quantiles, ordered values);
# element [0] is the theoretical normal quantiles.
df3['theoretical_quantiles'] = stats.probplot(df3['sorted_student_residuals'], dist='norm', fit=False)[0]
# The three residuals largest in absolute value (labelled below).
rankings = abs(df3['sorted_student_residuals']).sort_values(ascending=False)
top3 = rankings[:3]
fig, ax = plt.subplots()
x = df3['theoretical_quantiles']
y = df3['sorted_student_residuals']
ax.scatter(x, y, edgecolor='k', facecolor='none')
ax.set_title('Normal Q-Q')
ax.set_ylabel('Standardized Residuals')
ax.set_xlabel('Theoretical Quantiles')
# Dashed red 45-degree reference line spanning the joint range of both axes.
lower = np.min([x, y])
upper = np.max([x, y])
ax.plot([lower, upper], [lower, upper], color='r', ls='--')
# Label the three most extreme points with their index label.
for val in top3.index:
    ax.annotate(val, xy=(df3['theoretical_quantiles'].loc[val], df3['sorted_student_residuals'].loc[val]))
plt.show()

Leave a Comment Cancel Reply