# usual imports 
import pandas as pd
import numpy as np
import scipy.stats as sts

# ml libs
from fbprophet import Prophet
import xgboost as xgb

# metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

# plotting libs
import matplotlib.pyplot as plt
import seaborn as sns

from fbprophet.plot import plot_plotly


data1 = pd.read_csv('data-01', sep='\t', header = None)
data1.columns = ['date', 'time', 'code', 'value']


data1.head()


data1.code.unique()

array([58, 33, 34, 62, 48, 65, 60])


desired_codes = [48, 58, 60, 62]


#create a df with desired_codes only
bg_df= data1[data1['code'].isin(desired_codes)].copy()


# examine the data types
bg_df.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 369 entries, 0 to 940
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    369 non-null    object
 1   time    369 non-null    object
 2   code    369 non-null    int64 
 3   value   369 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 14.4+ KB


type(bg_df['date'].iloc[0]), type(bg_df['time'].iloc[0])

(str, str)


bg_df.loc[:,'ds'] = bg_df.loc[:,('date')].str.cat(bg_df.loc[:,('time')], sep = ' ')
bg_df['ds'] = pd.to_datetime(bg_df['ds'])
bg_df.rename(columns = {'value':'y'}, inplace = True)
bg_df = bg_df.loc[:, ('ds', 'y')].copy()
bg_df = bg_df.set_index(keys = 'ds', drop = True).copy()


bg_df.head()


print(f"Data collection started on {bg_df.index.min().strftime('%A')}, {bg_df.index.min().date()} at {bg_df.index.min().time()}")

Data collection started on Sunday, 1991-04-21 at 09:09:00


print(f"Data collection ended on {bg_df.index.max().strftime('%A')}, {bg_df.index.max().date()} at {bg_df.index.max().time()}")

Data collection ended on Tuesday, 1991-09-03 at 07:20:00


ts = (bg_df.index.max() - bg_df.index.min()).total_seconds()
t_min = np.round(ts/60., 2)
t_hours = np.round(t_min/60., 2)
t_days = np.round(t_hours/24., 2)
print(f"Total collection time: {t_hours} hours or {t_days} days")

Total collection time: 3238.18 hours or 134.92 days


# NPH insulin
nph_df = data1[data1['code'].isin([34])].copy()


nph_df.loc[:,'ds'] = nph_df.loc[:,'date'].str.cat(nph_df.loc[:,'time'], sep=' ')


nph_df['ds'] = pd.to_datetime(nph_df['ds'])


nph_df.drop(columns = ['date', 'time'], inplace=True)


nph_df = nph_df.set_index(keys = 'ds', drop = True).copy()
nph_df.head()


plt.figure(figsize = (16, 7))
plt.plot(bg_df.index, bg_df['y'], c = 'b', label = 'Blood glucose measurement')
plt.vlines(nph_df.index, ymin = 0, ymax = 350, colors = 'orange', 
           ls = '-', label = 'NPH insulin', alpha = 0.5)
plt.legend();


# regular insulin dose 
insulin_df = data1[data1['code'].isin([33])].copy()
insulin_df.loc[:,'ds'] = insulin_df.loc[:,'date'].str.cat(insulin_df.loc[:,'time'], sep=' ')
insulin_df['ds'] = pd.to_datetime(insulin_df['ds'])
insulin_df.drop(columns = ['date', 'time'], inplace=True)
insulin_df = insulin_df.set_index(keys = 'ds', drop = True).copy()
insulin_df.head()


plt.figure(figsize = (16, 7))
plt.plot(bg_df.index, bg_df['y'], c = 'b', label = 'Blood glucose measurement')
plt.vlines(insulin_df.index, ymin = 0, ymax = 350, colors = 'green', 
           ls = '--', alpha = 0.4, label = 'Regular insulin')
plt.legend();


plt.figure(figsize = (16, 7))
plt.plot(bg_df['y'], c = 'b', alpha = 0.2, label = 'Blood glucose measurement')
plt.plot(bg_df.resample('D').mean(), 'r--', label = 'Mean daily value')
plt.plot(bg_df.resample('W').mean(), 'g:', linewidth = 5, label = 'Mean weekly value')
plt.plot(bg_df.resample('D').std(), 'k:', linewidth = 3, label = '$\sigma$ of daily value')
plt.ylabel('Blood glucose', fontsize =12)
plt.legend();


class UniForecast():
    """
    Class to use in time-series analysis of univariable forecasts.
    """
    
    def create_features(self, s, n = 3):
        """
        Creates n sliding window features of a univariate variable stored in s
        n: int
        s: pandas series holding the parameter

        Returns
        df: expanded df with n features 
        """
        #make sure the user passed a pandas series object
        assert isinstance(s, pd.Series) == True, 'I need a series to work on...'
        assert s.isna().sum() == 0, 'Series contains nan values ...'
        #store the shifted series
        store_shifts = [] 
        #create the n shifted series 
        for i in range(0, n+1):
            store_shifts.append(s.shift(i).copy())
        #reverse the stored series to reflect a supervised series
        store_shifts.reverse()
        df = pd.concat(store_shifts, ignore_index = True, axis =1)
        #create appropriate feature names
        feat_names = []
        for i in range(0, n):
            feat_names.append("x"+str(i))
        #add the target's name
        feat_names.append("y")
        df.columns = feat_names

        return df.iloc[n:, :].reset_index(drop=True).copy()

    def walk_fwd(self, df, model, test_samples = 20): 
        """
        Performs a walk forward validation on the last test_samples number of data in df
        df: expanded dataframe from create_features
        test_samples: number of samples to test the algorithm on

        Returns
        -------
        preds, real: list
        Lists of predictions and reported values
        """
        #create the initial slice on which the algorithm will be trained on
        train_data = df.iloc[:-test_samples, :].copy()
        Xtrain = train_data.iloc[:-test_samples,:-1].copy()
        ytrain = train_data.iloc[:-test_samples,-1].copy()
        #fit the on the initial slice
        model.fit(Xtrain, ytrain)
        #create two lists to hold the recorded (real) and predicted values
        preds, real = [], []
        for i in range(0, test_samples):
            #append the real value to be predicted from the test set
            real.append(df.iloc[len(df)-test_samples + i, -1])
            #create a dataframe of the incoming vector that predicts the next value
            x = df.iloc[len(df) - test_samples + i, :-1].to_frame().T
            preds.append(model.predict(x))
            #update the training data by adding the first out the test_samples at the bottom
            update_train_data = df.iloc[: -test_samples + i, :].copy()
            update_Xtrain = df.iloc[:-test_samples+i, :-1].copy()
            update_ytrain = df.iloc[:-test_samples+i, -1].copy()
        #retrain model with the added point
            model.fit(update_Xtrain, update_ytrain)
        return real, preds


#instantiate the class
uf = UniForecast()


# use the create features method to create a df with 3 features i.e. the algorithm learns the next value by examining the previous 3
df1 = uf.create_features(bg_df.iloc[:, 0].reset_index(drop=True), n=3)
# make the predictions
real, preds = uf.walk_fwd(df = df1,
                      model = xgb.XGBRegressor(n_estimators = 500, max_depth = 2, objective = 'reg:squarederror'))


# predictions of model are arrays - conver to list.
preds = [p[0] for p in preds]


fig = plt.figure(figsize = (10, 6))
ax = fig.add_subplot(1, 1, 1)
sns.histplot(real, ax = ax)
sns.histplot(preds, color = 'orange', ax = ax);


statistic, p_val = sts.mannwhitneyu(real, preds)
print(f"The p value of the test is {np.round(p_val, 3)}.")
if p_val < 0.05:
    print("Evidence suggest to reject Ho.")
else:
    print("Evidence suggest to accept Ho.")

The p value of the test is 0.357.
Evidence suggest to accept Ho.


plt.figure(figsize = (12, 6))
plt.plot(real, 'b-', label = 'Measured blood glucose')
plt.plot(preds, 'r--', label = 'Predicted blood glucose')
plt.ylabel("Blood glucose", fontsize = 12)
plt.xticks(ticks = range(0, len(preds)+1, 5))
plt.legend();


preds_df = pd.DataFrame(np.array(preds),columns = ['yhat'], index = range(len(bg_df)-len(preds), len(bg_df)))


plt.figure(figsize = (12, 6))
plt.plot(bg_df['y'].reset_index(drop=True), 'b-', alpha = 0.3, label = 'Real')
# plt.plot(real, 'b--', label = 'Real')
plt.plot(preds_df['yhat'], 'r-', label = 'Predicted')
plt.legend();


mae = mean_absolute_error(real, preds)
np.round(mae, 3)

70.675


# prophet requires a df with 2 cols with names: ds, y
# let's create it
bg_df = bg_df.reset_index().copy()


bg_df.shape

(369, 2)


train_df = bg_df.iloc[:301, :].copy()


#instantiate the Prophet class with yearly seasonality off
prophet = Prophet(growth = 'linear', 
                  weekly_seasonality = True,
                 daily_seasonality = True,
                 yearly_seasonality = False)


# run the fit method to train the model
prophet.fit(train_df)

<fbprophet.forecaster.Prophet at 0x7fe620d9c940>


forecast = prophet.predict(bg_df)
forecast.head()


plt.figure(figsize = (12, 7))
plt.plot(bg_df['ds'], bg_df['y'], label = 'Data')
plt.plot(forecast['ds'], forecast['yhat'], label = 'Prediction')
plt.legend();


fig1 = prophet.plot(forecast)


fig2 = prophet.plot_components(forecast)

	y
ds
1991-04-21 09:09:00	100
1991-04-21 17:08:00	119
1991-04-21 22:51:00	123
1991-04-22 07:35:00	216
1991-04-22 16:56:00	211

	ds	trend	yhat_lower	yhat_upper	trend_lower	trend_upper	additive_terms	additive_terms_lower	additive_terms_upper	daily	daily_lower	daily_upper	weekly	weekly_lower	weekly_upper	yhat
0	1991-04-21 09:09:00	136.126888	82.488539	257.249492	136.126888	136.126888	31.165188	31.165188	31.165188	50.376030	50.376030	50.376030	-19.210842	-19.210842	-19.210842	167.292076
1	1991-04-21 17:08:00	136.132570	66.902627	232.498700	136.132570	136.132570	15.799656	15.799656	15.799656	26.155872	26.155872	26.155872	-10.356216	-10.356216	-10.356216	151.932225
2	1991-04-21 22:51:00	136.136639	66.884307	234.566205	136.136639	136.136639	11.235103	11.235103	11.235103	13.124258	13.124258	13.124258	-1.889155	-1.889155	-1.889155	147.371741
3	1991-04-22 07:35:00	136.142854	80.809322	245.764480	136.142854	136.142854	29.673194	29.673194	29.673194	23.314128	23.314128	23.314128	6.359067	6.359067	6.359067	165.816048
4	1991-04-22 16:56:00	136.149509	80.263175	249.058773	136.149509	136.149509	30.590593	30.590593	30.590593	27.567815	27.567815	27.567815	3.022778	3.022778	3.022778	166.740102

Diabetes patient record: time series analysis and forecasting.¶

Data import and preprocessing¶

Insulin¶

Forecasting with gradient boosted trees.¶

Facebook's prophet¶

	date	time	code	value
0	04-21-1991	9:09	58	100
1	04-21-1991	9:09	33	9
2	04-21-1991	9:09	34	13
3	04-21-1991	17:08	62	119
4	04-21-1991	17:08	33	7

	code	value
ds
1991-04-21 09:09:00	34	13
1991-04-22 07:35:00	34	13
1991-04-23 07:25:00	34	13
1991-04-24 07:52:00	34	14
1991-04-25 07:29:00	34	14