The Python Jupyter Notebook platform can be used to estimate a simple regression on a data set. Python is one of the leading programming languages, with wide application in data science.
import pandas as pd
import seaborn as sns
from statsmodels.graphics.gofplots import qqplot
import matplotlib.pyplot as plt
from scipy.stats import shapiro
from scipy.stats import normaltest
%matplotlib inline
# Load the workbook; skiprows=[0] drops the first spreadsheet row before the
# header and data are parsed.
df = pd.read_excel(r'C:\Users\Noman Arshed\OneDrive\Research projects\Submitted\dynamic capabilities paper\finaldata.xls', skiprows=[0])

# Quick inspection: first rows, column dtypes and summary statistics.
# Printed explicitly because a bare expression is only echoed when it is the
# last line of a notebook cell.
print(df.head())
print(df.dtypes)
print(df.describe())

# Heatmap of the null mask, one cell per (row, column) of the raw data.
plt.figure(figsize=(10,10))
plt.title('Missing values in Vulnerability Data')
sns_plot = sns.heatmap(df.isnull(), cbar=False)
fig = sns_plot.get_figure()

# Keep only rows without any missing value for the rest of the analysis.
df1 = df.dropna()

# Distribution of 'sens'. A fresh figure is opened so the plot is not drawn
# on top of the heatmap axes when these lines run in one go.
# NOTE: distplot is deprecated since seaborn 0.11; histplot(..., kde=True)
# is the documented replacement once the installed seaborn supports it.
plt.figure()
sns.distplot(df1['sens'])

# Normal Q-Q plot of 'sens' with a standardized reference line.
qqplot(df1['sens'], line='s')
plt.show()

# Shapiro-Wilk normality test on 'sens'.
stat, p = shapiro(df1['sens'])
print('Statistics=%.3f, p=%.3f' % (stat, p))

# D'Agostino-Pearson normality test on 'sens' (scipy.stats.normaltest).
stat, p = normaltest(df1['sens'])
print('Statistics=%.3f, p=%.3f' % (stat, p))

# Scatter plot of 'sens' against 'seiz' with marginal distributions.
sns_plot = sns.jointplot(x='sens', y='seiz', data=df1, kind='scatter')
import statsmodels.formula.api as smf
import statsmodels.api as sm
import numpy as np
from statsmodels.nonparametric.smoothers_lowess import lowess
from matplotlib import rcParams
import scipy.stats as stats
# Ordinary least squares via the formula API: 'inno' regressed on 'sens' and
# 'seiz', using the complete-case frame df1.
reg = smf.ols('inno ~ sens + seiz', data=df1)
res = reg.fit()
print(res.summary())

# Influence plot for the fitted model, used to spot influential observations.
sm.graphics.influence_plot(res)
# --- Residuals vs. fitted values diagnostic plot ---
residuals = res.resid
fitted = res.fittedvalues

# LOWESS trend of residuals over fitted values; column 0 is plotted as x and
# column 1 as the smoothed y below.
smoothed = lowess(residuals, fitted)

# The three observations with the largest absolute residuals get labelled.
top3 = residuals.abs().sort_values(ascending=False).iloc[:3]

plt.rcParams.update({'font.size': 16})
plt.rcParams["figure.figsize"] = (8,7)

fig, ax = plt.subplots()
ax.scatter(fitted, residuals, edgecolors='k', facecolors='none')
ax.plot(smoothed[:,0], smoothed[:,1], color='r')
ax.set_ylabel('Residuals')
ax.set_xlabel('Fitted Values')
ax.set_title('Residuals vs. Fitted')

# Faint dotted zero line spanning the range of the fitted values.
ax.plot([fitted.min(), fitted.max()], [0,0], color='k', linestyle=':', alpha=.3)

# Annotate each flagged point with its index label; .loc makes the
# label-based lookup explicit.
for i in top3.index:
    ax.annotate(str(i), xy=(fitted.loc[i], residuals.loc[i]))

plt.show()
# --- Normal Q-Q plot of internally studentized residuals ---
# Wrap the studentized residuals in a Series that carries the same index as
# the model residuals, then sort ascending so they can be paired with the
# ascending theoretical quantiles below.
sorted_student_residuals = pd.Series(res.get_influence().resid_studentized_internal)
sorted_student_residuals.index = res.resid.index
sorted_student_residuals = sorted_student_residuals.sort_values(ascending=True)

df3 = pd.DataFrame(sorted_student_residuals)
df3.columns = ['sorted_student_residuals']

# With fit=False, probplot returns (theoretical quantiles, ordered values);
# element [0] is the theoretical normal quantiles.
df3['theoretical_quantiles'] = stats.probplot(df3['sorted_student_residuals'], dist='norm', fit=False)[0]

# The three residuals largest in absolute value get labelled on the plot.
rankings = abs(df3['sorted_student_residuals']).sort_values(ascending=False)
top3 = rankings[:3]

fig, ax = plt.subplots()
x = df3['theoretical_quantiles']
y = df3['sorted_student_residuals']
ax.scatter(x, y, edgecolor='k', facecolor='none')
ax.set_title('Normal Q-Q')
ax.set_ylabel('Standardized Residuals')
ax.set_xlabel('Theoretical Quantiles')

# 45-degree dashed reference line across the joint range of both axes; the
# limits are computed once instead of four times.
lower = np.min([x, y])
upper = np.max([x, y])
ax.plot([lower, upper], [lower, upper], color='r', ls='--')

# Annotate each flagged point with its index label.
for val in top3.index:
    ax.annotate(str(val), xy=(df3['theoretical_quantiles'].loc[val], df3['sorted_student_residuals'].loc[val]))

plt.show()