A complete description of the dataset can be found here. The objective of this notebook is to show how a pipeline can be used to screen the predictive power of out-of-the-box machine learning algorithms.
- Read and examine the data
- Preprocessing
- Set up the pipeline of regressors, train, and predict
- Explanation of weights for best performing model
- Cross validate the model
- Conclusion
#data crunching imports
import pandas as pd
import numpy as np
#vis imports
import matplotlib.pyplot as plt
import seaborn as sns
# import plotly.express as px
#splitting of data
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
#metrics and stats
from sklearn.metrics import r2_score, explained_variance_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import norm
#modeling regressors: trees and linear with Ridge
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge
import lightgbm as lgb
# ML model interpretation with ELI5 package
import eli5 as eli
from eli5.sklearn import PermutationImportance
# suppress warnings - added to keep the output crisp. Please check the versions of the libraries you use for consistency.
import warnings
warnings.filterwarnings('ignore')
#read in data
concrete = pd.read_csv('concrete.csv')
concrete.columns #read columns
Index(['Cement (component 1)(kg in a m^3 mixture)', 'Blast Furnace Slag (component 2)(kg in a m^3 mixture)', 'Fly Ash (component 3)(kg in a m^3 mixture)', 'Water (component 4)(kg in a m^3 mixture)', 'Superplasticizer (component 5)(kg in a m^3 mixture)', 'Coarse Aggregate (component 6)(kg in a m^3 mixture)', 'Fine Aggregate (component 7)(kg in a m^3 mixture)', 'Age (day)', 'Concrete compressive strength(MPa. megapascals)'], dtype='object')
Create a copy of the data and rename columns in a more readable format.
data = concrete.copy() #create a copy and rename cols
data.columns = ['Cement', 'BFS','FlyAsh','Water','Superplasticizer','CoarseAggr', 'FineAggr', 'Age','CompressiveStrength']
data
|      | Cement | BFS   | FlyAsh | Water | Superplasticizer | CoarseAggr | FineAggr | Age   | CompressiveStrength |
|------|--------|-------|--------|-------|------------------|------------|----------|-------|---------------------|
| 0    | 540.0  | 0.0   | 0.0    | 162.0 | 2.5              | 1040.0     | 676.0    | 28.0  | 79.99               |
| 1    | 540.0  | 0.0   | 0.0    | 162.0 | 2.5              | 1055.0     | 676.0    | 28.0  | 61.89               |
| 2    | 332.5  | 142.5 | 0.0    | 228.0 | 0.0              | 932.0      | 594.0    | 270.0 | 40.27               |
| 3    | 332.5  | 142.5 | 0.0    | 228.0 | 0.0              | 932.0      | 594.0    | 365.0 | 41.05               |
| 4    | 198.6  | 132.4 | 0.0    | 192.0 | 0.0              | 978.4      | 825.5    | 360.0 | 44.30               |
| ...  | ...    | ...   | ...    | ...   | ...              | ...        | ...      | ...   | ...                 |
| 1025 | 276.4  | 116.0 | 90.3   | 179.6 | 8.9              | 870.1      | 768.3    | 28.0  | 44.28               |
| 1026 | 322.2  | 0.0   | 115.6  | 196.0 | 10.4             | 817.9      | 813.4    | 28.0  | 31.18               |
| 1027 | 148.5  | 139.4 | 108.6  | 192.7 | 6.1              | 892.4      | 780.0    | 28.0  | 23.70               |
| 1028 | 159.1  | 186.7 | 0.0    | 175.6 | 11.3             | 989.6      | 788.9    | 28.0  | 32.77               |
| 1029 | 260.9  | 100.5 | 78.3   | 200.6 | 8.6              | 864.5      | 761.5    | 28.0  | 32.40               |
1030 rows × 9 columns
Examine the feature vs feature relationships visually on a scatter matrix.
pd.plotting.scatter_matrix(data, figsize=(20,12), color='r', diagonal='kde');
The only clear trend visible is between Cement and CompressiveStrength. Let's examine the correlation coefficients explicitly. We are looking at a highly non-linear relationship between age, the ingredients, and compressive strength (more details here). A review of the kernel density estimates in the scatter matrix above suggests that outliers are present in the distributions of the measured parameters. Furthermore, non-monotonic relationships cannot be excluded. Therefore, we will use Kendall's $\tau$ correlation method instead of Pearson's.
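As a quick illustration of how the choice of method matters here, the correlations of each feature with the target can be computed with both methods side by side. This is a minimal sketch (not part of the original analysis) that reuses the `data` frame defined above:

```python
# compare Pearson and Kendall correlations of each feature with the target
# (illustrative side-by-side check; the analysis below relies on Kendall's tau)
pearson_corr = data.corr(method='pearson')['CompressiveStrength'].drop('CompressiveStrength')
kendall_corr = data.corr(method='kendall')['CompressiveStrength'].drop('CompressiveStrength')
print(pd.DataFrame({'pearson': pearson_corr, 'kendall': kendall_corr}).round(3))
```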
cor_coefs = data.corr(method='kendall')
The resulting table and heatmap are shown below. The correlations observed are weak or, at best, of medium strength.
cor_coefs.round(3)
|                     | Cement | BFS    | FlyAsh | Water  | Superplasticizer | CoarseAggr | FineAggr | Age    | CompressiveStrength |
|---------------------|--------|--------|--------|--------|------------------|------------|----------|--------|---------------------|
| Cement              | 1.000  | -0.168 | -0.329 | -0.065 | 0.028            | -0.103     | -0.119   | 0.004  | 0.327               |
| BFS                 | -0.168 | 1.000  | -0.203 | 0.036  | 0.078            | -0.247     | -0.220   | -0.015 | 0.119               |
| FlyAsh              | -0.329 | -0.203 | 1.000  | -0.210 | 0.350            | 0.056      | 0.044    | 0.002  | -0.060              |
| Water               | -0.065 | 0.036  | -0.210 | 1.000  | -0.529           | -0.150     | -0.245   | 0.063  | -0.206              |
| Superplasticizer    | 0.028  | 0.078  | 0.350  | -0.529 | 1.000            | -0.139     | 0.121    | -0.006 | 0.250               |
| CoarseAggr          | -0.103 | -0.247 | 0.056  | -0.150 | -0.139           | 1.000      | -0.054   | -0.031 | -0.124              |
| FineAggr            | -0.119 | -0.220 | 0.044  | -0.245 | 0.121            | -0.054     | 1.000    | -0.042 | -0.122              |
| Age                 | 0.004  | -0.015 | 0.002  | 0.063  | -0.006           | -0.031     | -0.042   | 1.000  | 0.449               |
| CompressiveStrength | 0.327  | 0.119  | -0.060 | -0.206 | 0.250            | -0.124     | -0.122   | 0.449  | 1.000               |
plt.figure(figsize=(12,8))
sns.heatmap(cor_coefs.round(3), cmap='coolwarm', annot=True);
# max correlation value and corresponding feature
# correlations of each feature with the target (drop the target's self-correlation)
corr_with_target = cor_coefs['CompressiveStrength'].drop(index=['CompressiveStrength'])
feat_max_corr = corr_with_target.idxmax()  # feature with the strongest correlation
val_max_corr = corr_with_target.max()      # its correlation value
print(f"Max correlation observed for feature {feat_max_corr} with value {np.round(val_max_corr, 3)}")
Max correlation observed for feature Age with value 0.449
data.isna().sum()
Cement 0 BFS 0 FlyAsh 0 Water 0 Superplasticizer 0 CoarseAggr 0 FineAggr 0 Age 0 CompressiveStrength 0 dtype: int64
There are no missing values; therefore, preprocessing is simple: split the response from the predictors (y and X, respectively) and create train and test sets.
y = data['CompressiveStrength'] #response variable (target)
X = data[data.columns[:-1]] #predictors
When splitting into train and test sets we fix the random seed (random_state) so that performance on the train and test sets can be evaluated and compared consistently across runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.2,
shuffle = True,
random_state = 2021)
Instead of performing a single regression, we regress the data with multiple tree regressors and a simple linear one: XGBoost, LightGBM, random forest, and extra trees, compared against an optimized Ridge with $\alpha = 5$. All tree regressors are used out-of-the-box (i.e. with default parameter values). The objective here is not to optimize the hyperparameters of the regressors but rather to construct a limited auto-ml type analysis that pinpoints the best one.
regressors = [xgb.XGBRegressor(),
lgb.LGBMRegressor(),
RandomForestRegressor(),
ExtraTreesRegressor(),
Ridge(alpha=5.)]
models = ['xgboost', 'lgbm', 'random_forest', 'x_trees', 'ridge'] # model names
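For context, a value such as $\alpha = 5$ for Ridge could be chosen with a small grid search. The sketch below is illustrative only; the grid, scoring, and fold count are assumptions, and the selected value may well differ from 5:

```python
from sklearn.model_selection import GridSearchCV

# hypothetical grid search for Ridge's regularization strength
ridge_search = GridSearchCV(Ridge(),
                            param_grid={'alpha': [0.1, 1., 5., 10., 50.]},
                            scoring='neg_mean_absolute_error',
                            cv=5)
ridge_search.fit(Xtrain, ytrain)
print(ridge_search.best_params_)
```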
Create a list to hold the predictions, train each model/regressor in a loop, and append its predictions to the list.
predictions = [] #list to hold predictions of each model
for regressor in regressors:
modeling = Pipeline(steps = [('regressor', regressor)])
modeling.fit(Xtrain, ytrain)
predictions.append(modeling.predict(Xtest))
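The pipeline above contains a single step, so it mainly provides a uniform fit/predict interface for the screening loop. If preprocessing were needed, extra steps could be prepended; a minimal sketch, assuming feature scaling is wanted in front of the Ridge regressor (the tree models do not need it):

```python
from sklearn.preprocessing import StandardScaler

# hypothetical two-step pipeline: standardize the features, then fit Ridge
scaled_ridge = Pipeline(steps=[('scaler', StandardScaler()),
                               ('regressor', Ridge(alpha=5.))])
scaled_ridge.fit(Xtrain, ytrain)
print(f"Scaled Ridge R2 on the test set: {scaled_ridge.score(Xtest, ytest):.3f}")
```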
Seaborn conveniently gives us the regplot method, which we use to examine measured vs. predicted outcomes. The spread of the points is indicative of the predictive strength of each model.
fig, ax = plt.subplots(1, len(models), figsize = (12,7))
fig.tight_layout()
for i in range(0, len(models)):
sns.regplot(x = ytest, y = predictions[i], ax = ax[i])
ax[i].set_title(models[i])
ax[i].set_xlabel('Measured Comp. Strength [MPa]')
ax[i].set_ylabel('Predicted Comp. Strength [MPa]');
Tree regression (unoptimized) clearly outperforms linear regression. However, the plots alone do not tell us which tree regressor is better, so we turn to statistical metrics.
Let's examine the residuals of each fit and the metrics associated with them. For a good statistical fit we expect the residuals to be normally distributed about a mean of 0. Although we could create Q-Q plots or perform rigorous hypothesis testing for normality, a simple normal fit overlaid on the histogram of observed residuals suffices here. To identify outliers we also create a box plot.
def examine_residuals(ytest, yhat, model_name):
"""
provides a visual check of the residual values:
normality and outliers.
"""
residuals = ytest - yhat
mu, std = norm.fit(residuals)
x_vals = np.linspace(residuals.min(), residuals.max(), 100)
p = norm.pdf(x_vals, mu, std)
fig, ax = plt.subplots(1, 2, figsize = (12, 7))
fig.suptitle("Residuals of modeling with " + model_name, fontsize = 14)
ax[0].hist(residuals, density = True, color = 'b')
ax[0].plot(x_vals, p, linewidth = 2, color = 'orange')
sns.boxplot(x = residuals, ax = ax[1]);
def extract_regression_metrics(ytest, predictions, models):
"""
create a dataframe that holds regression metrics
for each of the regressors
"""
metrics_df = pd.DataFrame()
for i in range(0, len(models)):
metrics_dict = {'r2':r2_score(ytest, predictions[i]).round(3),
'Explained_Var': explained_variance_score(ytest, predictions[i]).round(3),
'MAE':mean_absolute_error(ytest, predictions[i]).round(3),
'MSE':mean_squared_error(ytest, predictions[i]).round(3)}
metrics_dfi = pd.DataFrame(metrics_dict, index = [models[i]])
metrics_df = pd.concat([metrics_dfi,metrics_df])
return metrics_df
for i in range(0, len(models)):
examine_residuals(ytest, predictions[i], model_name=models[i])
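For reference, the more rigorous checks mentioned earlier (a Q-Q plot and a formal normality test) could look like the sketch below. It is applied to the first model's residuals only and is not part of the original analysis:

```python
from scipy import stats

# residuals of the first model in the list (xgboost) as an example
residuals_check = ytest - predictions[0]

# Q-Q plot of the residuals against a normal distribution
fig, ax = plt.subplots(figsize=(6, 6))
stats.probplot(residuals_check, dist='norm', plot=ax)

# Shapiro-Wilk test: a small p-value suggests a departure from normality
stat, p_value = stats.shapiro(residuals_check)
print(f"Shapiro-Wilk statistic = {stat:.3f}, p-value = {p_value:.3f}")
```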
metrics = extract_regression_metrics(ytest, predictions, models)
metrics
|               | r2    | Explained_Var | MAE   | MSE     |
|---------------|-------|---------------|-------|---------|
| ridge         | 0.548 | 0.551         | 8.789 | 124.787 |
| x_trees       | 0.908 | 0.908         | 3.221 | 25.563  |
| random_forest | 0.902 | 0.902         | 3.625 | 27.124  |
| lgbm          | 0.927 | 0.927         | 3.069 | 20.253  |
| xgboost       | 0.927 | 0.927         | 2.886 | 20.311  |
Out of all the tree regressors, xgboost seems to perform slightly better based on the mean absolute error (MAE). Let's examine plots of the metrics.
fig, ax = plt.subplots(1, 4, figsize = (16,5))
fig.tight_layout()
for i in range(0, len(metrics.columns)):
ax[i].plot(metrics.index, metrics[metrics.columns[i]], marker = 'o')
ax[i].set_ylabel('')
ax[i].set_xticklabels(list(metrics.index), rotation = 45)
ax[i].set_title(metrics.columns[i]);
Since xgboost seems to be the "winner", we focus on it. First we explain the model's weights, i.e. the contribution of each predictor to the final response.
weights = eli.explain_weights(regressors[0].fit(Xtrain,ytrain))
weights
Weight | Feature |
---|---|
0.2510 | Age |
0.2394 | Cement |
0.1538 | Superplasticizer |
0.1379 | BFS |
0.1007 | Water |
0.0464 | FineAggr |
0.0463 | FlyAsh |
0.0246 | CoarseAggr |
weights_df = eli.format_as_dataframe(weights)
plt.figure(figsize = (12,7))
sns.barplot(x = 'feature', y = 'weight', data = weights_df, palette = 'summer')
plt.grid(axis='y')
perm = PermutationImportance(regressors[0], scoring = 'r2').fit(Xtest, ytest)
eli.show_weights(perm, feature_names = list(Xtest.columns))
Weight | Feature |
---|---|
0.8474 ± 0.1388 | Age |
0.6476 ± 0.1019 | Cement |
0.2088 ± 0.0391 | BFS |
0.1826 ± 0.0292 | Water |
0.1080 ± 0.0245 | Superplasticizer |
0.0421 ± 0.0150 | FineAggr |
0.0298 ± 0.0100 | CoarseAggr |
0.0029 ± 0.0071 | FlyAsh |
Both the model's weights and the permutation feature importance agree that Age, followed by Cement, are the most important features. Age was also the feature most correlated with CompressiveStrength.
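As a cross-check (not part of the original notebook), the same permutation-based importance can be computed with scikit-learn's permutation_importance; the repeat count below is an assumption:

```python
from sklearn.inspection import permutation_importance

# permutation importance of the fitted xgboost model on the test set
perm_skl = permutation_importance(regressors[0], Xtest, ytest,
                                  scoring='r2', n_repeats=10, random_state=2021)
perm_df = pd.DataFrame({'feature': Xtest.columns,
                        'importance_mean': perm_skl.importances_mean.round(4),
                        'importance_std': perm_skl.importances_std.round(4)})
print(perm_df.sort_values('importance_mean', ascending=False))
```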
Due to the stochastic nature of tree models it is always good to cross validate the performance on different parts of the data. Therefore, we split the data into a new pair of train-test subsets, this time increasing the test size to 40% of the total, and cross validate the xgboost regressor over 30 folds on the same metrics as above.
This will show whether we simply got "lucky" with the xgboost performance recorded previously. If not, we have a solid model on which to run predictions.
Note: In reality, LightGBM performs equally well. A rigorous cross validation process would focus on both the XGBoost and LightGBM models; hypothesis testing between the distributions of their cross validated results would then indicate whether there is a true (out-of-the-box) winner, as sketched below.
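A minimal sketch of that comparison is shown below. It cross validates both models on identical folds and applies a paired Wilcoxon signed-rank test to the resulting $R^{2}$ scores; the fold count and the choice of test are assumptions, not part of the original analysis:

```python
from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import wilcoxon

# score both models on the same folds so the comparison is paired
folds = KFold(n_splits=10, shuffle=True, random_state=2021)
xgb_scores = cross_val_score(xgb.XGBRegressor(), X, y, scoring='r2', cv=folds)
lgb_scores = cross_val_score(lgb.LGBMRegressor(), X, y, scoring='r2', cv=folds)

# paired non-parametric test: a small p-value would suggest a real difference
stat, p_value = wilcoxon(xgb_scores, lgb_scores)
print(f"median R2: xgboost={np.median(xgb_scores):.3f}, lgbm={np.median(lgb_scores):.3f}, p={p_value:.3f}")
```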
from sklearn.model_selection import cross_validate
Xtrain_, Xtest_, ytrain_, ytest_ = train_test_split(X, y, test_size = 0.4,
shuffle = True,
random_state = 42)
scores_=cross_validate(regressors[0], Xtest_, ytest_,
scoring=['r2', 'explained_variance',
'neg_mean_absolute_error','neg_mean_squared_error'],
cv=30)
The scores_ dictionary holds all the values of the regression metrics produced after cross validating the model 30 times. We turn this into a dataframe for easier manipulation of the values it holds.
cv_scores_df = pd.DataFrame(scores_)
cv_scores_df.head()
|   | fit_time | score_time | test_r2  | test_explained_variance | test_neg_mean_absolute_error | test_neg_mean_squared_error |
|---|----------|------------|----------|-------------------------|------------------------------|-----------------------------|
| 0 | 0.114666 | 0.002334   | 0.908671 | 0.909659                | -3.396428                    | -26.555192                  |
| 1 | 0.053154 | 0.002363   | 0.849104 | 0.852225                | -3.352535                    | -19.849173                  |
| 2 | 0.054944 | 0.002347   | 0.929630 | 0.933802                | -3.977411                    | -23.962970                  |
| 3 | 0.057264 | 0.002356   | 0.846868 | 0.848218                | -5.781331                    | -56.167634                  |
| 4 | 0.054327 | 0.002251   | 0.876412 | 0.912978                | -2.690353                    | -12.663648                  |
The time-related columns are not of interest, so we drop them. We also flip the sign of the mean errors back (scikit-learn reports them as negative values so they can be treated as scores to maximize).
cv_scores_df = cv_scores_df.loc[:,'test_r2':'test_neg_mean_squared_error']
cv_scores_df['test_MAE'] = -1*cv_scores_df['test_neg_mean_absolute_error']
cv_scores_df['test_MSE'] = -1*cv_scores_df['test_neg_mean_squared_error']
cv_scores_df.drop(columns=['test_neg_mean_absolute_error', 'test_neg_mean_squared_error'], inplace=True)
Finally, we plot the distributions of the metrics computed during the cross validation process.
fig, ax = plt.subplots(1, 4, figsize=(16, 6))
for j in range(0, 4):
# print(cv_scores_df.columns[j])
sns.histplot(cv_scores_df[cv_scores_df.columns[j]],ax = ax[j])
cv_scores_df.describe().drop(index=['count']).round(3) #statistics of 30 cross validations
|      | test_r2 | test_explained_variance | test_MAE | test_MSE |
|------|---------|-------------------------|----------|----------|
| mean | 0.860   | 0.875                   | 3.976    | 35.575   |
| std  | 0.094   | 0.081                   | 1.246    | 26.838   |
| min  | 0.555   | 0.641                   | 2.054    | 6.851    |
| 25%  | 0.847   | 0.857                   | 2.973    | 16.149   |
| 50%  | 0.872   | 0.889                   | 3.856    | 28.254   |
| 75%  | 0.922   | 0.928                   | 4.756    | 45.550   |
| max  | 0.973   | 0.982                   | 7.089    | 125.086  |
In conclusion, boosted regression trees perform best on this dataset. The expected coefficient of determination of the fit is $\bar{R}^{2} = 0.860$ with a standard deviation $\sigma = 0.094$.
Optimization of the hyperparameters is likely to improve these performance metrics further.
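As a pointer for that next step, here is a minimal sketch of a randomized hyperparameter search for the XGBoost regressor; the search space, iteration count, and scoring choice are assumptions rather than part of this analysis:

```python
from sklearn.model_selection import RandomizedSearchCV

# hypothetical randomized search over a small XGBoost parameter space
param_distributions = {'n_estimators': [100, 300, 500],
                       'max_depth': [3, 5, 7],
                       'learning_rate': [0.01, 0.05, 0.1, 0.3],
                       'subsample': [0.7, 0.85, 1.0]}
search = RandomizedSearchCV(xgb.XGBRegressor(),
                            param_distributions=param_distributions,
                            n_iter=20,
                            scoring='neg_mean_absolute_error',
                            cv=5,
                            random_state=2021)
search.fit(Xtrain, ytrain)
print(search.best_params_, -search.best_score_)
```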