Effect size and why it matters.

In this notebook I analyze diabetes data downloaded from NCD RisC. The dataset contains age-standardized diabetes prevalence estimates for every country in the world and spans the years 1980 to 2014.

Here I demonstrate that statistical hypothesis testing has limitations which, unless taken into consideration, can lead to erroneous conclusions even at a compellingly low p-value.

The notebook and part of the analysis were inspired by the work of Gail M. Sullivan and Richard Feinn published in the Journal of Graduate Medical Education in September 2012 under the title "Using Effect Size - or Why the P Value Is Not Enough" (and references therein).

Some feature names are a bit long and cumbersome to work with, so let's rename them to something shorter. Also, note that the prevalence is reported as a floating-point number ranging from 0 to 1; I will convert those values to percentages.
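
A minimal sketch of this step. The file name and the raw NCD-RisC column names below are assumptions for illustration; adjust them to the actual headers of the downloaded CSV.

```python
import pandas as pd

# Hypothetical file and column names; replace with the actual ones from NCD RisC.
df = pd.read_csv("NCD_RisC_diabetes.csv")

df = df.rename(columns={
    "Age-standardised diabetes prevalence": "prevalence",
    "Lower 95% uncertainty interval": "ci_low",
    "Upper 95% uncertainty interval": "ci_high",
})

# Prevalence is reported as a fraction of 1; convert to percent.
for col in ["prevalence", "ci_low", "ci_high"]:
    df[col] = 100 * df[col]
```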

It might be useful at some point to have the entire confidence interval at hand. Therefore, let's define its range, i.e. the difference between its upper and lower bounds.
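
With the renamed columns from the sketch above, the range is a one-liner:

```python
# Width of the 95% uncertainty interval (upper bound minus lower bound).
df["ci_range"] = df["ci_high"] - df["ci_low"]
```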

The data should be clean. But just in case, let's check.
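A few quick sanity checks, assuming the dataframe from the sketches above:

```python
# Missing values, duplicate rows, and the overall range of the prevalence values.
print(df.isna().sum())
print("duplicated rows:", df.duplicated().sum())
print(df["prevalence"].describe())
```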

We don't have any indication yet about the trend of the data over time, so a reasonable first step is to look at the distribution of prevalence values. We do so for men and women separately and compare.
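
A sketch of the comparison, assuming a Sex column with the values "Men" and "Women" and using matplotlib for the histograms:

```python
import matplotlib.pyplot as plt

men = df.loc[df["Sex"] == "Men", "prevalence"]
women = df.loc[df["Sex"] == "Women", "prevalence"]

plt.hist(men, bins=50, density=True, alpha=0.5, label="Men")
plt.hist(women, bins=50, density=True, alpha=0.5, label="Women")
plt.xlabel("Age-standardized diabetes prevalence (%)")
plt.ylabel("Density")
plt.legend()
plt.show()
```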

The distributions are reminiscent of Gaussians with a slight right skew, but we'd like to confirm this with rigorous statistical testing. The following function performs four such normality tests and summarizes the results in an output table. We apply the function to the data for men and for women.
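
A sketch of such a function; the exact choice of tests is an assumption, here Shapiro-Wilk, D'Agostino-Pearson, Jarque-Bera, and a Kolmogorov-Smirnov test against a standard normal:

```python
import pandas as pd
from scipy import stats

def normality_tests(sample, alpha=0.05):
    """Run four normality tests and summarize them in a small table."""
    sample = pd.Series(sample).dropna()
    z = (sample - sample.mean()) / sample.std()  # standardize for the KS test
    tests = {
        "Shapiro-Wilk": stats.shapiro(sample),
        "D'Agostino-Pearson": stats.normaltest(sample),
        "Jarque-Bera": stats.jarque_bera(sample),
        "Kolmogorov-Smirnov": stats.kstest(z, "norm"),
    }
    rows = [(name, stat, p, p < alpha) for name, (stat, p) in tests.items()]
    return pd.DataFrame(rows, columns=["test", "statistic", "p-value", "reject normality"])

normality_tests(men)
normality_tests(women)
```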

All tests consistently reject the null hypothesis that the distribution is Gaussian. One final check is QQ plots, which we produce with the probplot function from the scipy library.
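
A sketch of the QQ plots against a normal distribution:

```python
import matplotlib.pyplot as plt
from scipy import stats

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
stats.probplot(men, dist="norm", plot=axes[0])
axes[0].set_title("QQ plot - men")
stats.probplot(women, dist="norm", plot=axes[1])
axes[1].set_title("QQ plot - women")
plt.tight_layout()
plt.show()
```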

The conclusion is that these are not Gaussian distributions.

The extensive overlap of the two distributions suggests that, given a single sample, it would be almost impossible to tell whether it came from a man or a woman. Nevertheless, we go ahead and run a hypothesis test. We will use a non-parametric test suitable for independent samples, the Mann-Whitney U test, which does not depend on the shape of the distributions (i.e. whether they are normal or not). The null hypothesis is that the medians of the two distributions are equal and the alternative is that one is higher than the other.
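
A minimal sketch of the test with scipy:

```python
from scipy import stats

# Two-sided Mann-Whitney U test on the men's and women's prevalence values.
u_stat, p_value = stats.mannwhitneyu(men, women, alternative="two-sided")
print(f"U = {u_stat:.0f}, p = {p_value:.3g}")
```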

Note that the two samples are quite large with a length of 7000 each!

The hypothesis test rejects H0! Our intuition, based on the distribution plots, was telling us the exact opposite!

Note that since the samples are quite large we could also run a two-sample t-test for independent samples. That would only result in an even lower p-value.
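
For reference, a sketch of that test (here Welch's variant, which does not assume equal variances):

```python
from scipy import stats

t_stat, p_t = stats.ttest_ind(men, women, equal_var=False)
print(f"t = {t_stat:.2f}, p = {p_t:.3g}")
```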

This is where effect size comes into play. Effect size is a statistical measure of the magnitude of the difference between the central tendencies of two groups' distributions.

To begin, we combine the two arrays into a new dataframe named comparison_df and then calculate means and standard deviations for men (M) and women (W).
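
A sketch of that step, reusing the men and women Series from above:

```python
import pandas as pd

comparison_df = pd.DataFrame({"M": men.reset_index(drop=True),
                              "W": women.reset_index(drop=True)})
print(comparison_df.agg(["mean", "std", "count"]))
```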

Effect Size Calculations

We are now ready to calculate effect sizes. We concentrate on the standardized mean difference between two groups and use two metrics, Cohen's d and Hedges' g, defined as follows.

$\text{Cohen's } d = \frac{M_{1}-M_{2}}{SD_{pooled}}$, where $SD_{pooled} = \sqrt{\frac{SD_{1}^{2} + SD_{2}^{2}}{2}}$

$\text{Hedges' } g = \frac{M_{1}-M_{2}}{SD_{pooled}^{*}}$, where $SD_{pooled}^{*} = \sqrt{\frac{(n_{1}-1)SD_{1}^{2} + (n_{2}-1)SD_{2}^{2}}{n_{1} + n_{2} - 2}}$

$M$ is the mean, $SD$ is the standard deviation, and $n$ is the sample size of each group.
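
A direct translation of the two formulas into code (a sketch, applied to comparison_df from above):

```python
import numpy as np

def cohens_d(x, y):
    """Cohen's d with the simple pooled SD, sqrt((SD1^2 + SD2^2) / 2)."""
    sd_pooled = np.sqrt((np.var(x, ddof=1) + np.var(y, ddof=1)) / 2)
    return (np.mean(x) - np.mean(y)) / sd_pooled

def hedges_g(x, y):
    """Hedges' g with the sample-size weighted pooled SD."""
    n1, n2 = len(x), len(y)
    sd_pooled = np.sqrt(((n1 - 1) * np.var(x, ddof=1) + (n2 - 1) * np.var(y, ddof=1))
                        / (n1 + n2 - 2))
    return (np.mean(x) - np.mean(y)) / sd_pooled

print("Cohen's d:", cohens_d(comparison_df["M"], comparison_df["W"]))
print("Hedges' g:", hedges_g(comparison_df["M"], comparison_df["W"]))
```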

We will also use Pearson's r coefficient as a measure of association between the two groups (a sketch of one way to compute it follows the table below). For all metrics, the following table is a guide for interpreting the result.

| Value | Effect size |
|-------|-------------|
| 0.2   | low         |
| 0.5   | medium      |
| 0.8   | strong      |
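
One way to obtain Pearson's r here is the correlation between a binary group indicator and the prevalence values, i.e. the point-biserial correlation; the exact approach used is an assumption:

```python
import numpy as np
from scipy import stats

# 0 = men, 1 = women; with a binary variable this Pearson r is the point-biserial r.
values = np.concatenate([comparison_df["M"].values, comparison_df["W"].values])
groups = np.concatenate([np.zeros(len(comparison_df)), np.ones(len(comparison_df))])
r, p_r = stats.pointbiserialr(groups, values)
print(f"r = {r:.3f}")
```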

The conclusion is that the effect size analysis reveals negligible substantive (practical) significance in the difference between the distributions of the two groups. This was expected from the histogram plots but was not supported by the hypothesis testing.

Therefore, there are two key takeaways:

  1. Statistical plots, such as histograms, are a very useful way of identifying differences in distributions.
  2. Although a P value can be misleading, the effect size is not.

Map prevalence in time

Since the difference between men and women has negligible practical significance, I define a new mean of the age-standardized prevalence (averaging the values for men and women) and use plotly's choropleth function to visualize how that mean evolves over time. The visual is saved as a separate html file.
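
A sketch of the map, assuming columns named Country, ISO (three-letter country codes) and Year; the actual column names in the dataset may differ:

```python
import plotly.express as px

# Average the men's and women's prevalence per country and year.
mean_df = (df.groupby(["Country", "ISO", "Year"], as_index=False)["prevalence"]
             .mean())

fig = px.choropleth(mean_df,
                    locations="ISO",
                    color="prevalence",
                    hover_name="Country",
                    animation_frame="Year",
                    color_continuous_scale="Reds",
                    labels={"prevalence": "Prevalence (%)"})
fig.write_html("diabetes_prevalence_map.html")
```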

From the time-dependent map we see that diabetes prevalence increased markedly in the North African countries and the Arabian Peninsula.