How to predict with missing values in Python?
Aude Sportisse
Missing values occur in many applications of supervised learning. Methods for dealing with missing values in supervised learning are very different from methods used in an inferential framework. For instance, mean imputation, which is one of the worst things that can be done when the aim is to estimate parameters, can be consistent when the aim is to predict as well as possible, as shown in this recent paper.
In this notebook, we will cover how to accurately predict the response y given X when X contains missing values (missing values in both the training and the test data).
In this case, there are essentially two approaches:

- Two-step strategy: impute the missing data and apply classical methods on the completed data sets to predict;
- One-step strategy: predict with methods adapted to the missing data, without necessarily imputing them (a preview sketch of both strategies is given below).

We first describe the methods on synthetic data and then apply them to real datasets.
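As a preview, here is a minimal self-contained sketch of the two strategies with scikit-learn (the toy data and the names X_toy, y_toy, two_step, one_step are only for illustration; HistGradientBoostingRegressor is used as an example of a learner that natively handles NaN):

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
# for scikit-learn < 1.0, also run: from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.RandomState(0)
X_toy = rng.randn(100, 3)
y_toy = X_toy.sum(axis=1) + 0.1 * rng.randn(100)
X_toy[rng.rand(100, 3) < 0.2] = np.nan  # introduce about 20% of missing values

# Two-step strategy: impute, then predict with a classical learner.
two_step = make_pipeline(SimpleImputer(strategy='mean'), LinearRegression()).fit(X_toy, y_toy)

# One-step strategy: a learner that directly accounts for missing values.
one_step = HistGradientBoostingRegressor().fit(X_toy, y_toy)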
!pip install wget
import numpy as np
import pandas as pd
import wget
import matplotlib.pyplot as plt
%matplotlib inline
wget.download('https://raw.githubusercontent.com/R-miss-tastic/website/master/static/how-to/python/produceNA.py')
wget.download('https://raw.githubusercontent.com/R-miss-tastic/website/master/static/how-to/python/tools.py')
from produceNA import *
from tools import color_imputedvalues_orange
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.datasets import fetch_california_housing
import random
We generate the covariates $X$ with 3 variables from a Gaussian distribution with a positive correlation structure, the Gaussian noise $\epsilon$ from a centered Gaussian distribution with standard deviation $0.25$, and the fixed regression parameter $\beta$ from a uniform distribution. The outcome variable is obtained with the following linear model: $$Y=X\beta+\epsilon.$$
np.random.seed(0) # fix the seed
n = 1000
d = 3
mean = np.repeat(0, d)
cov = 0.5 * (np.ones((d,d)) + np.eye(d))
x_comp = np.random.multivariate_normal(mean, cov, size = n)
b = np.random.uniform(low = 0, high = 3, size = d)
sigma_eps = 0.25
eps = np.random.normal(scale = sigma_eps, size = n)
y = x_comp.dot(b) + eps
We introduce some missing values (here MCAR) in the data matrix using the function produce_NA given in the Python notebook How to generate missing values in Python?.
p = 0.3 #30% of Missing Values
XproduceNA = produce_NA(x_comp, p_miss=p, mecha="MCAR") # dictionary containing the incomplete data and the mask of missing entries (True for missing values)
X_miss = XproduceNA['X_incomp']
x_miss = X_miss.numpy()
print('Ratio nan in X:', np.round(100 * np.sum(np.isnan(x_miss)) / x_miss.size,1), '%')
Ratio nan in X: 30.3 %
pd.DataFrame(x_miss).head().style.highlight_null(null_color='orange')
|   | 0 | 1 | 2 |
|---|---|---|---|
| 0 | -0.875268 | -1.922959 | -1.522801 |
| 1 | -2.393913 | -2.481345 | -0.613787 |
| 2 | -0.835337 | -0.670269 | -0.821626 |
| 3 | 0.504373 | -0.827087 | -0.683043 |
| 4 | -0.365120 | -0.810354 | nan |
plt.scatter(x_miss[:400,0], x_miss[:400,1]); # data points with a missing coordinate are not plotted
We will consider two imputation methods (a small standalone comparison is sketched below):

- Mean imputation: replace missing values by the mean of the feature (the column);
- Iterative imputation: each feature is regressed on the others, so that the imputation can take advantage of the other features (this is the implementation proposed in sklearn.impute.IterativeImputer).

More details on these methods can be found in How to impute missing values in Python.
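To see the difference between the two imputers on a tiny standalone example (a sketch; the small matrix below is made up for illustration, the second column being exactly twice the first one):

import numpy as np
from sklearn.experimental import enable_iterative_imputer  # required to use IterativeImputer
from sklearn.impute import SimpleImputer, IterativeImputer

X_toy = np.array([[1.0, 2.0],
                  [2.0, 4.0],
                  [3.0, np.nan],
                  [4.0, 8.0]])
# Mean imputation ignores the other column: the nan becomes mean([2, 4, 8]), about 4.67.
print(SimpleImputer(strategy='mean').fit_transform(X_toy))
# Iterative imputation regresses column 1 on column 0: the nan is imputed close to 6.
print(IterativeImputer().fit_transform(X_toy))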
Note that Josse et al. study the classical tools for missing values in the context of supervised learning. In particular, one of their take-home messages is that it can be useful to concatenate the mask (the indicator of missingness) to the covariates X, in particular when missingness can be related to the prediction target (MNAR or MAR values).
## Mean imputation
x_mean = SimpleImputer().fit_transform(x_miss)
pd.DataFrame(x_mean).head().style.applymap(color_imputedvalues_orange, x_miss=x_miss)
|   | 0 | 1 | 2 |
|---|---|---|---|
| 0 | -0.875268 | -1.922959 | -1.522801 |
| 1 | -2.393913 | -2.481345 | -0.613787 |
| 2 | -0.835337 | -0.670269 | -0.821626 |
| 3 | 0.504373 | -0.827087 | -0.683043 |
| 4 | -0.365120 | -0.810354 | 0.029433 |
plt.scatter(x_mean[:400,0], x_mean[:400,1])
missing_mask = np.where([any(x) for x in np.isnan(x_miss[:400][:,[0,1]])])
plt.scatter(x_mean[missing_mask,0], x_mean[missing_mask,1])
To concatenate the missing indicator to X, we can use the argument add_indicator=True of SimpleImputer. Note that this concatenation is done after imputation and is only used for prediction.
## Mean imputation, return with indicator
x_mean_with_indicator = SimpleImputer(add_indicator=True).fit_transform(x_miss)
pd.DataFrame(x_mean_with_indicator).head().style.applymap(color_imputedvalues_orange, x_miss=np.concatenate((x_miss,np.isnan(x_miss)*1),axis=1))
|   | 0 | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|---|
| 0 | -0.875268 | -1.922959 | -1.522801 | 0.000000 | 0.000000 | 0.000000 |
| 1 | -2.393913 | -2.481345 | -0.613787 | 0.000000 | 0.000000 | 0.000000 |
| 2 | -0.835337 | -0.670269 | -0.821626 | 0.000000 | 0.000000 | 0.000000 |
| 3 | 0.504373 | -0.827087 | -0.683043 | 0.000000 | 0.000000 | 0.000000 |
| 4 | -0.365120 | -0.810354 | 0.029433 | 0.000000 | 0.000000 | 1.000000 |
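The same concatenation can be done by hand with the mask of missing entries; here is a minimal equivalent of what add_indicator=True produced above (a sketch, reusing x_miss and x_mean from the previous cells):

mask = np.isnan(x_miss).astype(float)              # 1.0 where the value was missing
x_mean_with_mask = np.concatenate([x_mean, mask], axis=1)
print(x_mean_with_mask.shape)                      # (1000, 6), as in the table above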
# Iterative imputation
x_ice = IterativeImputer().fit_transform(x_miss)
pd.DataFrame(x_ice).head().style.applymap(color_imputedvalues_orange, x_miss=x_miss)
|   | 0 | 1 | 2 |
|---|---|---|---|
| 0 | -0.875268 | -1.922959 | -1.522801 |
| 1 | -2.393913 | -2.481345 | -0.613787 |
| 2 | -0.835337 | -0.670269 | -0.821626 |
| 3 | 0.504373 | -0.827087 | -0.683043 |
| 4 | -0.365120 | -0.810354 | -0.462637 |
plt.scatter(x_ice[:400,0], x_ice[:400,1])
missing_mask = np.where([any(x) for x in np.isnan(x_miss[:400][:,[0,1]])])
plt.scatter(x_ice[missing_mask,0], x_ice[missing_mask,1])
## Iterative imputation, return with indicator
x_mice_with_indicator = IterativeImputer(add_indicator=True).fit_transform(x_miss)
pd.DataFrame(x_mice_with_indicator).head().style.applymap(color_imputedvalues_orange, x_miss=np.concatenate((x_miss,np.isnan(x_miss)*1),axis=1))
|   | 0 | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|---|
| 0 | -0.875268 | -1.922959 | -1.522801 | 0.000000 | 0.000000 | 0.000000 |
| 1 | -2.393913 | -2.481345 | -0.613787 | 0.000000 | 0.000000 | 0.000000 |
| 2 | -0.835337 | -0.670269 | -0.821626 | 0.000000 | 0.000000 | 0.000000 |
| 3 | 0.504373 | -0.827087 | -0.683043 | 0.000000 | 0.000000 | 0.000000 |
| 4 | -0.365120 | -0.810354 | -0.462637 | 0.000000 | 0.000000 | 1.000000 |
We compare these imputation methods to a learning algorithm which can perform predictions by directly accounting for missing values:

- Missing Incorporated in Attribute (MIA) (Twala et al.): this method is dedicated to handling missing values in tree-based methods, such as random forests, XGBoost, etc. It is implemented in different packages (partykit and grf in R, and it is the default way of handling missing values in the function HistGradientBoostingRegressor from the module sklearn.ensemble of scikit-learn; a quick check of this is sketched after the table below). If you want to implement it yourself, you can duplicate each feature and replace its missing values once by $+\infty$ and once by $-\infty$ (or by extreme out-of-range values). Here, we choose to replace them once by the maximum of the variable plus 1 and once by the minimum of the variable minus 1. More information on this method can be found in Josse et al.

This method does not pretend to impute missing data: here, the step of duplicating features is internal to the tree-based learning algorithm.
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
class MinMaxImputer(BaseEstimator, TransformerMixin):
    # MIA-style transformer: replace the nan of each column by two extreme
    # out-of-range values, by duplicating each column (min side and max side)
    def fit(self, X, y=None):
        mm = MinMaxScaler().fit(X)
        self.data_min_ = mm.data_min_
        self.data_max_ = mm.data_max_
        return self

    def transform(self, X):
        X_min, X_max = X.copy(), X.copy()
        for feature_idx in range(X.shape[1]):
            nan_mask = np.isnan(X[:, feature_idx])
            X_min[nan_mask, feature_idx] = self.data_min_[feature_idx] - 1
            X_max[nan_mask, feature_idx] = self.data_max_[feature_idx] + 1
        return np.concatenate([X_min, X_max], axis=1)
x_MIA = MinMaxImputer().fit_transform(x_miss)
pd.DataFrame(x_MIA).head().style.applymap(color_imputedvalues_orange, x_miss=np.concatenate((x_miss,np.isnan(x_miss)*1),axis=1))
|   | 0 | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|---|
| 0 | -0.875268 | -1.922959 | -1.522801 | -0.875268 | -1.922959 | -1.522801 |
| 1 | -2.393913 | -2.481345 | -0.613787 | -2.393913 | -2.481345 | -0.613787 |
| 2 | -0.835337 | -0.670269 | -0.821626 | -0.835337 | -0.670269 | -0.821626 |
| 3 | 0.504373 | -0.827087 | -0.683043 | 0.504373 | -0.827087 | -0.683043 |
| 4 | -0.365120 | -0.810354 | -4.032794 | -0.365120 | -0.810354 | 4.013304 |
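As noted above, HistGradientBoostingRegressor handles missing values natively (with MIA-like splits), so it can be fit directly on the incomplete matrix without any imputation; a minimal check on the x_miss and y generated earlier (a sketch, not part of the benchmark below):

hgb = HistGradientBoostingRegressor()
hgb.fit(x_miss, y)                                  # no imputation step needed
print('Training R2 score:', round(hgb.score(x_miss, y), 3))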
Let's evaluate the different strategies by combining our different imputers with different machine learning algorithms. The pipeline consists of an imputation step followed by a learning (prediction) step. Here we decompose each step of the pipeline for clarity (a more compact Pipeline version is sketched further below).
First, we split the data into train and test sets.
x_train, x_test, y_train, y_test = train_test_split(x_miss, y)
print('train shape: X.shape={}, y.shape={}'.format(x_train.shape, y_train.shape))
print('test shape: X.shape={}, y.shape={}'.format(x_test.shape, y_test.shape))
train shape: X.shape=(750, 3), y.shape=(750,) test shape: X.shape=(250, 3), y.shape=(250,)
We can then choose a learning algorithm, for example random forests. We can use the class RandomForestRegressor from the module sklearn.ensemble of scikit-learn. Note that we cannot directly apply the learner, since it cannot deal with missing values.
learner = RandomForestRegressor()
""" Line below would raise an ValueError:
'Input contains NaN, infinity or a value too large for dtype('float64').'
Because LinearRegressor couldn't deal with NaN values.""";
# learner.fit(X_train, y_train)
We fit the imputer for the missing values on the train set and then transform both the train and test sets with the same fitted imputer.
imputer = SimpleImputer(strategy='mean')
imputer.fit(x_train)
x_train_imp = imputer.transform(x_train)
x_test_imp = imputer.transform(x_test)
Finally, we can fit the learner.
learner.fit(x_train_imp, y_train)
print('Score on train:', learner.score(x_train_imp, y_train))
print('Score on test:', learner.score(x_test_imp, y_test))
Score on train: 0.9559328366636736 Score on test: 0.8036095091168469
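The two steps can also be chained in a single scikit-learn Pipeline, which guarantees that the imputer is always fitted on the training data only; a sketch equivalent to the decomposed version above:

pipe = Pipeline([('impute', SimpleImputer(strategy='mean')),
                 ('forest', RandomForestRegressor())])
pipe.fit(x_train, y_train)
print('Score on test:', pipe.score(x_test, y_test))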
The function score_pred compares the strategies above on synthetic data in terms of prediction performance, for a chosen learning algorithm.
More precisely, the function takes as input a complete data matrix (X) and an output variable (y). Then, missing values are introduced in the complete data matrix with a given percentage of missing values (p) and a given missing-data mechanism (mecha). Each method is performed with a learning algorithm (learner). The methods are the following:

- mean imputation + learning algorithm, with and without concatenating the mask;
- iterative imputation + learning algorithm, with and without concatenating the mask;
- the learning algorithm applied directly to the incomplete data if it natively handles missing values (opt_learner="Learner_MIA"), or MIA (feature duplication with MinMaxImputer) + learning algorithm otherwise.

The introduction of the missing values is done several times (nbsim), which gives the stochasticity in the results (and the boxplots).
The arguments are the following:

- X: complete data matrix (covariates);
- y: output variable;
- learner: learner used for comparing the strategies (e.g. random forests, gradient boosting, linear regression);
- p: percentage of missing values to introduce;
- nbsim: number of simulations performed;
- opt_learner: indicates whether the learning algorithm natively accounts for missing values, MIA-style (opt_learner="Learner_MIA");
- mecha: missing-data mechanism to use for introducing the missing values.

It returns the scores for each strategy.
def score_pred(X, y, learner, p, nbsim, opt_learner = "Learner_MIA", mecha = "MCAR"):
    """
    Compare imputation methods (mean imputation and iterative imputation, with or without concatenating the missing-value indicator)
    with the Missing Incorporated in Attribute method in terms of prediction performance for a chosen learning algorithm.
    Parameters
    ----------
    X : complete data matrix (covariates).
    y : output variable.
    learner : learner to be used for comparing the strategies (e.g. random forests, linear regression).
    p : percentage of missing values to introduce.
    nbsim : number of simulations performed.
    opt_learner : "Learner_MIA" if the learner natively handles missing values, anything else otherwise.
    mecha : missing-data mechanism to use for introducing the missing values.
    Returns
    -------
    Scores for each strategy.
    """
    score_results = pd.DataFrame()
    Methods = ['Mean','Iterative','Mean + Mask','Iterative + Mask','MIA']
    for meth in Methods:
        score_results[meth] = []
    for sim in range(0, nbsim):
        if mecha == 'MCAR':
            XproduceNA = produce_NA(X, p_miss=p, mecha=mecha)
            x_miss = XproduceNA['X_incomp'].numpy()
        else:
            XproduceNA = produce_NA(X, p_miss=p, mecha=mecha, opt="logistic", p_obs=0.5)
            x_miss = XproduceNA['X_incomp'].numpy()
        x_train, x_test, y_train, y_test = train_test_split(x_miss, y)
        #### Mean imputation without mask + learner
        mean_imputer = SimpleImputer(add_indicator=False)
        mean_imputer.fit(x_train)
        x_train_imp = mean_imputer.transform(x_train)
        x_test_imp = mean_imputer.transform(x_test)
        learner.fit(x_train_imp, y_train)
        score_mean = learner.score(x_test_imp, y_test)
        #### Iterative imputation without mask + learner
        it_imputer = IterativeImputer(add_indicator=False)
        it_imputer.fit(x_train)
        x_train_imp = it_imputer.transform(x_train)
        x_test_imp = it_imputer.transform(x_test)
        learner.fit(x_train_imp, y_train)
        score_it = learner.score(x_test_imp, y_test)
        #### Mean imputation with mask + learner
        mean_wh_imputer = SimpleImputer(add_indicator=True)
        mean_wh_imputer.fit(x_train)
        x_train_imp = mean_wh_imputer.transform(x_train)
        x_test_imp = mean_wh_imputer.transform(x_test)
        learner.fit(x_train_imp, y_train)
        score_mean_wh = learner.score(x_test_imp, y_test)
        #### Iterative imputation with mask + learner
        it_wh_imputer = IterativeImputer(add_indicator=True)
        it_wh_imputer.fit(x_train)
        x_train_imp = it_wh_imputer.transform(x_train)
        x_test_imp = it_wh_imputer.transform(x_test)
        learner.fit(x_train_imp, y_train)
        score_it_wh = learner.score(x_test_imp, y_test)
        #### MIA
        if opt_learner == 'Learner_MIA':
            # the learner natively handles missing values (e.g. HistGradientBoostingRegressor)
            learner.fit(x_train, y_train)
            score_MIA = learner.score(x_test, y_test)
        else:
            MIA_imputer = MinMaxImputer()
            x_train_imp = MIA_imputer.fit_transform(x_train)
            x_test_imp = MIA_imputer.transform(x_test)
            learner.fit(x_train_imp, y_train)
            score_MIA = learner.score(x_test_imp, y_test)
        new_score = {'Mean': score_mean, 'Iterative': score_it, 'Mean + Mask': score_mean_wh, 'Iterative + Mask': score_it_wh, 'MIA': score_MIA}
        score_results = score_results.append(new_score, ignore_index=True)
    return(score_results)
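For instance, a single call with a linear model looks as follows (a small sketch; the returned DataFrame has one row per simulation and one column per strategy):

res_lin = score_pred(x_comp, y, LinearRegression(), p=0.2, nbsim=2,
                     opt_learner="Learner_MIA_not", mecha="MCAR")
print(res_lin.mean())  # average R2 score of each strategy over the simulations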
We apply this function by introducing MCAR or MNAR values in X. MCAR means that the probability that an observation is missing is independent of the data. MNAR means that the missingness depends on the missing values themselves and potentially also on the observed values. To introduce the missing values, we use the functions of How to generate missing values.
learners = {'LinReg': LinearRegression(),
'RandomForest': RandomForestRegressor(),
'HGBoost': HistGradientBoostingRegressor()}
p = 0.2
nbsim = 10
fig, ax = plt.subplots(3, 2, figsize=(20,10), sharex=True)
for ind_l, (learner_name, learner) in enumerate(learners.items()):
    for ind_m, mecha in enumerate(['MCAR', 'MNAR']):
        if learner_name == 'HGBoost':
            opt_learner = "Learner_MIA"
        else:
            opt_learner = "Learner_MIA_not"
        res = score_pred(x_comp, y, learner, p, nbsim=nbsim, opt_learner=opt_learner, mecha=mecha)
        labels = res.columns
        ax[ind_l, ind_m].set_title(learner_name + " " + mecha)
        ax[ind_l, ind_m].boxplot(res, labels=labels, vert=False, patch_artist=True)
fig.tight_layout()
fig.tight_layout()
The function plot_score_realdatasets can be used for real datasets containing missing values. The arguments are the following:

- X: data matrix containing missing values (covariates);
- y: output variable (containing no missing values);
- learner: learner to be used for comparing the strategies.

It returns boxplots of the scores for each method (mean imputation, iterative imputation, with or without the mask, and MIA); the stochasticity comes from the way the dataset is split into train and test sets, which is repeated several times (cross-validation).
def plot_score_realdatasets(X, y, learner, learner_name='learner', X_complete=None, cv=5):
    """
    Compare imputation methods (mean imputation and iterative imputation, with or without concatenating the missing-value indicator)
    with the Missing Incorporated in Attribute method in terms of prediction performance for a chosen learning algorithm.
    Parameters
    ----------
    X : data matrix containing missing values.
    y : output variable (containing no missing values).
    learner : learner to be used for comparing the strategies.
    learner_name : name of the learner (used as plot title).
    X_complete : complete data matrix (None if the dataset contains real missing values).
    cv : number of cross-validation folds.
    Returns
    -------
    Boxplots of the scores for each strategy.
    """
    imputers_wo_indicator = {'Mean': SimpleImputer(add_indicator=False),
                             'Iterative': IterativeImputer(add_indicator=False)}
    imputers_with_indicator = {'Mean': SimpleImputer(add_indicator=True),
                               'Iterative': IterativeImputer(add_indicator=True),
                               'MinMax': MinMaxImputer()}
    l_score = []
    for imputers in [imputers_wo_indicator, imputers_with_indicator]:
        for imputer_name, imputer in imputers.items():
            pipeline = Pipeline([(imputer_name, imputer), (learner_name, learner)])
            score = cross_val_score(pipeline, X, y, n_jobs=-1, cv=cv)
            l_score.append(score)
    labels = ['Mean', 'Iterative', 'Mean+Mask', 'Iterative+Mask', 'MIA']
    if X_complete is not None:
        # scores on the complete matrix, used as a reference
        score_ground_truth = cross_val_score(learner, X_complete, y, n_jobs=-1, cv=cv)
        l_score.append(score_ground_truth)
        labels = labels + ['X_complete']
    plt.title(learner_name)
    bplot = plt.boxplot(l_score[::-1], labels=labels[::-1],
                        vert=False, patch_artist=True)
    # Add colors to boxplots
    #for patch, color in zip(bplot['boxes'],
    #                        ['C0','C1','C2','C3', 'C4', 'C5']):
    #    patch.set_facecolor(color)
Here, we study a real dataset (the California housing data) which does not contain any missing values, so we add some missing values (MCAR or MNAR) ourselves before applying the function plot_score_realdatasets. In this case, we can also compute the scores for the complete matrix, which are represented in the boxplots.
x_comp, y = fetch_california_housing(return_X_y = True)
p=0.2
## MCAR values
XproduceNA_MCAR = produce_NA(x_comp, p_miss=p, mecha="MCAR")
x_MCAR = XproduceNA_MCAR['X_incomp'].numpy()
print('Ratio nan in X:', np.round(100 * np.sum(np.isnan(x_MCAR)) / x_MCAR.size,1), '%')
# MNAR values
XproduceNA_MNAR = produce_NA(x_comp, p_miss=p, mecha="MNAR", opt="logistic", p_obs=0.5)
x_MNAR = XproduceNA_MNAR['X_incomp'].numpy()
print('Ratio nan in X:', np.round(100 * np.sum(np.isnan(x_MNAR)) / x_MNAR.size,1), '%')
Ratio nan in X: 20.0 % Ratio nan in X: 19.9 %
learners = {'LinReg': LinearRegression(),
'RandomForest': RandomForestRegressor(),
"HGBoost": HistGradientBoostingRegressor()}
for learner_name, learner in learners.items():
    plt.figure(figsize=(20,10))
    for ii, (X, X_name) in enumerate(zip([x_MCAR, x_MNAR], ['MCAR', 'MNAR'])):
        plt.subplot(3, 2, ii+1)
        plot_score_realdatasets(X, y, learner, learner_name + ' ' + X_name, x_comp)