import pandas as pd
import seaborn as sns
from statsmodels.graphics.gofplots import qqplot
import matplotlib.pyplot as plt
from scipy.stats import shapiro
from scipy.stats import normaltest
%matplotlib inline
# Load the data set from the Excel workbook. skiprows=[0] skips the first row
# of the sheet, so the header is taken from the row that follows it.
# NOTE(review): this is an absolute, machine-specific Windows path; the raw
# string prefix keeps the backslashes literal. Adjust it before running
# elsewhere.
df = pd.read_excel(
    r'C:\Users\Noman Arshed\OneDrive\Research projects\Submitted\dynamic capabilities paper\finaldata.xls',
    skiprows=[0],
)

# Notebook-style inspection of the loaded frame: first rows, column dtypes and
# summary statistics. As bare expressions these only render in a notebook when
# they are the last statement of a cell; they print nothing in a plain script.
df.head()
df.dtypes
df.describe()
# Heatmap of the boolean missing-value mask: one cell per (row, column) of df,
# drawn without a colour bar, so gaps in the data are visible at a glance.
plt.figure(figsize=(10, 10))
plt.title('Missing values in Vulnerability Data')
sns_plot = sns.heatmap(df.isnull(), cbar=False)
fig = sns_plot.get_figure()  # handle to the figure that holds the heatmap

# Complete-case data: drop every row that contains at least one missing value.
# All later steps work on df1, which keeps the original row labels of df.
df1 = df.dropna()
# --- Normality checks for the 'sens' column ---------------------------------

# Start a fresh figure so the distribution plot does not draw onto whatever
# axes happen to be current (e.g. an earlier figure that was never shown).
plt.figure()
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the script still runs on the seaborn version it was written for.
sns.distplot(df1['sens'])

# Q-Q plot against the normal distribution; line='s' adds the standardized
# reference line. qqplot creates its own figure.
qqplot(df1['sens'], line='s')
plt.show()

# Shapiro-Wilk test of normality.
stat, p = shapiro(df1['sens'])
print('Statistics=%.3f, p=%.3f' % (stat, p))

# D'Agostino-Pearson omnibus test of normality (skewness + kurtosis).
stat, p = normaltest(df1['sens'])
print('Statistics=%.3f, p=%.3f' % (stat, p))

# Joint scatter of 'sens' against 'seiz' with marginal distributions.
sns_plot = sns.jointplot(x='sens', y='seiz', data=df1, kind='scatter')
import statsmodels.formula.api as smf
import statsmodels.api as sm
import numpy as np
from statsmodels.nonparametric.smoothers_lowess import lowess
from matplotlib import rcParams
import scipy.stats as stats
# Ordinary least squares regression of 'inno' on 'sens' and 'seiz', specified
# with a formula; an intercept is included by the formula interface by default.
reg = smf.ols('inno ~ sens + seiz', data=df1)
res = reg.fit()
print(res.summary())

# Influence diagnostics: studentized residuals against leverage, with point
# size reflecting each observation's influence.
sm.graphics.influence_plot(res)
# --- Diagnostic plot: residuals vs. fitted values ----------------------------

residuals = res.resid
fitted = res.fittedvalues

# LOWESS smooth of residuals over fitted values; the result is an array whose
# first column is the (sorted) fitted value and second column the smoothed
# residual.
smoothed = lowess(residuals, fitted)

# The three observations with the largest absolute residuals, to be labelled.
top3 = residuals.abs().sort_values(ascending=False).head(3)

# These rcParams changes are global: they affect every figure created later.
plt.rcParams.update({'font.size': 16})
plt.rcParams["figure.figsize"] = (8, 7)

fig, ax = plt.subplots()
ax.scatter(fitted, residuals, edgecolors='k', facecolors='none')
ax.plot(smoothed[:, 0], smoothed[:, 1], color='r')
ax.set_ylabel('Residuals')
ax.set_xlabel('Fitted Values')
ax.set_title('Residuals vs. Fitted')
# Faint dotted zero line spanning the range of the fitted values.
ax.plot([min(fitted), max(fitted)], [0, 0], color='k', linestyle=':', alpha=.3)

# Label each flagged point with its row label. .loc makes the lookup explicitly
# label-based: after dropna() the integer labels are not positions.
for i in top3.index:
    ax.annotate(i, xy=(fitted.loc[i], residuals.loc[i]))

plt.show()
# --- Diagnostic plot: normal Q-Q of studentized residuals --------------------

# Internally studentized residuals, labelled with the same row index as the
# model residuals, then sorted ascending so they pair up with the ordered
# theoretical quantiles below.
sorted_student_residuals = pd.Series(
    res.get_influence().resid_studentized_internal,
    index=res.resid.index,
)
sorted_student_residuals = sorted_student_residuals.sort_values(ascending=True)

df3 = pd.DataFrame(sorted_student_residuals)
df3.columns = ['sorted_student_residuals']
# With fit=False probplot returns (theoretical quantiles, ordered values);
# element [0] is the theoretical normal quantiles.
df3['theoretical_quantiles'] = stats.probplot(
    df3['sorted_student_residuals'], dist='norm', fit=False
)[0]

# The three observations with the largest absolute studentized residuals.
rankings = df3['sorted_student_residuals'].abs().sort_values(ascending=False)
top3 = rankings[:3]

fig, ax = plt.subplots()
x = df3['theoretical_quantiles']
y = df3['sorted_student_residuals']
ax.scatter(x, y, edgecolor='k', facecolor='none')
ax.set_title('Normal Q-Q')
ax.set_ylabel('Standardized Residuals')
ax.set_xlabel('Theoretical Quantiles')

# Dashed red 45-degree reference line across the combined range of both axes.
lower = np.min([x, y])
upper = np.max([x, y])
ax.plot([lower, upper], [lower, upper], color='r', ls='--')

# Label the flagged points with their row labels.
for val in top3.index:
    ax.annotate(
        val,
        xy=(
            df3['theoretical_quantiles'].loc[val],
            df3['sorted_student_residuals'].loc[val],
        ),
    )

plt.show()
# No Comments