!pip install wget 
!pip install geomloss
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge


## Download additional code
import wget
wget.download('https://raw.githubusercontent.com/BorisMuzellec/MissingDataOT/master/softimpute.py')
wget.download('https://raw.githubusercontent.com/BorisMuzellec/MissingDataOT/master/imputers.py')
wget.download('https://raw.githubusercontent.com/BorisMuzellec/MissingDataOT/master/data_loaders.py')
wget.download('https://raw.githubusercontent.com/BorisMuzellec/MissingDataOT/master/utils.py')
wget.download('https://raw.githubusercontent.com/R-miss-tastic/website/master/static/how-to/python/produceNA.py')
wget.download('https://raw.githubusercontent.com/R-miss-tastic/website/master/static/how-to/python/tools.py')
wget.download('https://raw.githubusercontent.com/R-miss-tastic/website/master/static/how-to/python/MIWAE_functions.py')
from softimpute import softimpute, cv_softimpute
from imputers import OTimputer
from data_loaders import * 

import numpy as np

import pandas as pd

from utils import *

import torch
import torchvision
import torch.nn as nn
import torch.distributions as td
from torch import nn, optim
from torch.nn import functional as F
from torchvision import datasets, transforms
from torchvision.utils import save_image

import scipy.stats
import scipy.io
import scipy.sparse
from scipy.io import loadmat

import matplotlib.pyplot as plt

from produceNA import *

from tools import color_imputedvalues_orange

from itertools import product 

from sklearn.preprocessing import scale

import os

from MIWAE_functions import *

def warn(*args, **kwargs):
    """Accept any positional/keyword arguments and do nothing."""
    return None


import warnings

# Swap the module attribute for the no-op above, so calls made through
# ``warnings.warn(...)`` after this point produce no output.
warnings.warn = warn

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... done
  Created wheel for wget: filename=wget-3.2-cp37-none-any.whl size=9681 sha256=0c7f841db83f69b14122d9a967e0c53232b43cf8806a5b6b11a5fa798254eab7
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Collecting geomloss
  Downloading https://files.pythonhosted.org/packages/bb/96/97ff3dff46de2c09c7289ef02da574c2b35812a7165edbe1942e2d617bf5/geomloss-0.2.4-py3-none-any.whl
Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from geomloss) (1.19.5)
Installing collected packages: geomloss
Successfully installed geomloss-0.2.4


#### Simulation of the data matrix ####

# Seed NumPy's global generator so the simulated matrix is reproducible.
np.random.seed(0)

n, p = 1000, 10  # n rows are drawn, each with p variables

# Zero mean vector; covariance with 1 on the diagonal and 0.5 everywhere else.
mean = np.zeros(p, dtype=int)
cov = np.full((p, p), 0.5) + 0.5 * np.eye(p)
x_comp = np.random.multivariate_normal(mean=mean, cov=cov, size=n)


# Notebook display: first rows of the complete matrix.
pd.DataFrame(x_comp).head()


#### Introduction of missing values ####

perc_miss = 0.3 # 30% NA

# produce_NA returns a dict; 'X_incomp' is the incomplete data (a tensor, hence
# the .numpy() call below) and 'mask' flags the entries that were removed.
XproduceNA =  produce_NA(x_comp, p_miss=perc_miss, mecha="MCAR")
X_miss = XproduceNA['X_incomp']
x_miss = X_miss.numpy()
Mask = XproduceNA['mask'] # True for missing values, False for others


# The colour is passed positionally on purpose: the keyword was named
# ``null_color`` before pandas 1.5 and ``color`` afterwards (``null_color`` was
# removed in pandas 2.0), while the first positional argument is the colour in
# every version.
pd.DataFrame(x_miss).head().style.highlight_null('orange')


# --- Imputation with scikit-learn's SimpleImputer (default settings) ---
x_mean = SimpleImputer().fit_transform(x_miss)


# Notebook display: first rows, with color_imputedvalues_orange applied cell by
# cell (it receives x_miss, presumably to locate the imputed entries).
pd.DataFrame(x_mean).head().style.applymap(color_imputedvalues_orange, x_miss=x_miss)


# --- softimpute ---
# cv_softimpute returns an error per candidate lambda and the grid itself
# (15 values here); keep the lambda with the smallest cross-validation error.
cv_error, grid_lambda = cv_softimpute(x_miss, grid_len=15)
lbda = grid_lambda[np.argmin(cv_error)]
# The second element of softimpute's return value is used as the imputed matrix.
x_soft = softimpute((x_miss), lbda)[1]


pd.DataFrame(x_soft).head().style.applymap(color_imputedvalues_orange, x_miss=x_miss)


# --- Iterative chained equations (IterativeImputer with its default estimator) ---
x_ice = IterativeImputer(random_state=0, max_iter=50).fit_transform(x_miss)


pd.DataFrame(x_ice).head().style.applymap(color_imputedvalues_orange, x_miss=x_miss)


# --- IterativeImputer driven by an ExtraTreesRegressor with 10 trees ---
estimator_rf = ExtraTreesRegressor(n_estimators=10, random_state=0)
x_rf = IterativeImputer(estimator=estimator_rf, random_state=0, max_iter=50).fit_transform(x_miss)


pd.DataFrame(x_rf).head().style.applymap(color_imputedvalues_orange, x_miss=x_miss)


# --- Sinkhorn imputation (OTimputer) ---
# The complete data is converted to a double tensor and handed to the imputer
# as X_true (presumably only for monitoring the error -- verify in imputers.py).
X_true = torch.from_numpy(x_comp).double()

# eps is chosen from the incomplete data by the pick_epsilon helper.
eps = pick_epsilon(X_miss)

sk_imputer = OTimputer(eps=eps, batchsize=128, lr=0.01, niter=15)
# fit_transform returns three values; only the first (the imputed tensor) is kept.
sk_imp, _, _ = sk_imputer.fit_transform(X_miss, X_true=X_true)


# Detach from the autograd graph before converting to NumPy for display.
sk_imp_np = sk_imp.detach().numpy()
pd.DataFrame(sk_imp_np).head().style.applymap(color_imputedvalues_orange, x_miss=x_miss)


# --- MIWAE (helper from MIWAE_functions, called with its default settings) ---
x_miwae = MIWAE(X_miss)


pd.DataFrame(x_miwae).head().style.applymap(color_imputedvalues_orange, x_miss=x_miss)


def how_to_impute(X, perc_list, mecha_list, nbsim):
    """
    Compare several imputation methods over a grid of missing-data settings.

    Every (mechanism, percentage) pair from ``mecha_list`` x ``perc_list`` is
    passed to ``ComparMethods`` together with ``X`` and ``nbsim``.

    Parameters
    ----------
    X : the complete data set where the missing values will be introduced (numpy array).
    perc_list : list containing the different percentages of missing values.
    mecha_list : list containing the different missing-data mechanisms ("MCAR", "MAR" or "MNAR").
    nbsim : number of simulations performed for each setting.

    Returns
    -------
    results : dataframe with one row per setting, labelled "<mechanism> <percentage>",
        holding what ``ComparMethods`` returns for that setting (the mean error
        of each method over the simulations).
    """
    def _row_label(setting):
        # e.g. "MCAR 0.1"
        return setting[0] + " " + str(setting[1])

    # One row per setting: column 0 is the mechanism, column 1 the percentage.
    settings = pd.DataFrame(list(product(mecha_list, perc_list)))

    results = settings.apply(ComparMethods, axis=1, X=X, nbsim=nbsim)
    results.index = settings.apply(_row_label, axis=1)

    return results

def ComparMethods(mecha_perc, X, nbsim):
    """
    Compare several imputation methods for a given percentage of missing values
    and a given missing-data mechanism.

    For each of the ``nbsim`` simulations, missing values are introduced in ``X``
    with ``produce_NA``, the incomplete matrix is imputed with every method and
    each imputation is scored with ``RMSE`` against the complete data ``X``.

    Parameters
    ----------
    mecha_perc : pair indexable with 0 and 1: the missing-data mechanism
        ("MCAR", "MAR" or "MNAR") and the percentage of missing values.
    X : the complete data set where the missing values will be introduced
        (numpy array; it is passed to ``torch.from_numpy``).
    nbsim : number of simulations performed.

    Returns
    -------
    pandas Series indexed by method name ('mean', 'softimpute', 'ice', 'rf',
    'sk', 'miwae') containing, for each method, the mean over the simulations
    of the value returned by ``RMSE``. All entries are NaN when ``nbsim`` is 0.
    """
    mecha = mecha_perc[0]
    perc = mecha_perc[1]

    Methods = ['mean', 'softimpute', 'ice', 'rf', 'sk', 'miwae']

    # One dict per simulation; the frame is built once at the end because
    # DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
    rows = []

    for _ in range(nbsim):
        ## Introduction NA
        if mecha == "MAR":
            XproduceNA = produce_NA(X, perc, mecha, p_obs=0.5)
        elif mecha == "MNAR":
            XproduceNA = produce_NA(X, perc, mecha, p_obs=0.5, opt="logistic")
        else:
            XproduceNA = produce_NA(X, perc, mecha)
        mask = XproduceNA['mask'].numpy()
        x_miss = XproduceNA['X_incomp'].numpy()

        ## Mean
        x_mean = SimpleImputer().fit_transform(x_miss)
        rmse_mean = RMSE(x_mean, X, mask)

        ## SoftImpute: keep the lambda with the smallest cross-validation error
        cv_error, grid_lambda = cv_softimpute(x_miss, grid_len=15)
        lbda = grid_lambda[np.argmin(cv_error)]
        x_soft = softimpute((x_miss), lbda)[1]
        rmse_soft = RMSE(x_soft, X, mask)

        ## Ice
        x_ice = IterativeImputer(random_state=0, max_iter=50).fit_transform(x_miss)
        rmse_ice = RMSE(x_ice, X, mask)

        ## Random Forests
        estimator_rf = ExtraTreesRegressor(n_estimators=10, random_state=0)
        x_rf = IterativeImputer(estimator=estimator_rf, random_state=0, max_iter=50).fit_transform(x_miss)
        rmse_rf = RMSE(x_rf, X, mask)

        ## Sinkhorn imputation (works on the tensor version of the incomplete data)
        X_true = torch.from_numpy(X).double()
        X_miss = XproduceNA['X_incomp']
        batchsize = 128
        lr = 1e-2
        epsilon = pick_epsilon(X_miss)
        sk_imputer = OTimputer(eps=epsilon, batchsize=batchsize, lr=lr, niter=2000)
        sk_imp, _, _ = sk_imputer.fit_transform(X_miss, verbose=True, report_interval=500, X_true=X_true)
        rmse_sk_imp = RMSE(sk_imp.detach().numpy(), X, mask)

        ## MIWAE
        x_miwae = MIWAE(X_miss)
        rmse_miwae = RMSE(x_miwae, X, mask)

        rows.append({'mean': rmse_mean, 'softimpute': rmse_soft, 'ice': rmse_ice,
                     'rf': rmse_rf, 'sk': rmse_sk_imp, 'miwae': rmse_miwae})

    # Explicit columns and dtype keep the method order and make the empty case
    # (nbsim == 0) average to NaN instead of depending on object-dtype columns.
    RMSE_results = pd.DataFrame(rows, columns=Methods, dtype=float)

    return RMSE_results.mean()


# Settings for the synthetic experiment: every mechanism is combined with every
# percentage of missing values by how_to_impute.
perc_list = [0.1, 0.3, 0.5]
mecha_list = ["MCAR", "MAR", "MNAR"]

# Two simulations per (mechanism, percentage) setting on the simulated matrix.
results_how_to_impute = how_to_impute(x_comp, perc_list , mecha_list , nbsim=2)


# Notebook display of the result table.
results_how_to_impute


# Bar chart of the table, with the legend anchored at the axes' (1, 1) corner.
ax = results_how_to_impute.plot(kind="bar",rot=30)
ax.get_legend().set_bbox_to_anchor((1, 1))


# Create the 'datasets' directory if it is missing (presumably where
# dataset_loader stores its files -- verify in data_loaders.py). exist_ok=True
# replaces the separate isdir() check and removes the window between that
# check and the directory creation.
os.makedirs('datasets', exist_ok=True)
wine_red = dataset_loader('wine_quality_red')
wine_white = dataset_loader('wine_quality_white')
slump = dataset_loader('concrete_slump')


# Standardise every dataset with sklearn.preprocessing.scale when sc is True.
sc = True
if sc:
    wine_white = scale(wine_white)
    wine_red = scale(wine_red)
    slump = scale(slump)


# Settings for the real-data experiment; names_dataset lists the keys of
# datasets_list in the order they are processed.
datasets_list = dict(wine_white=wine_white, wine_red=wine_red, slump=slump)
names_dataset = ['wine_white','wine_red','slump']
perc = [0.1]
mecha = ["MCAR"]
nbsim = 2


def how_to_impute_real(datasets_list, perc, mecha, nbsim, names_dataset):
    """
    Compare several imputation methods on different complete datasets in which
    missing values are introduced.

    ``how_to_impute`` is run on every dataset named in ``names_dataset`` and the
    resulting tables are stacked.

    Parameters
    ----------
    datasets_list : dictionary of complete datasets, keyed by dataset name.
    perc : list of percentages of missing values (passed to ``how_to_impute``).
    mecha : list of missing-data mechanisms ("MCAR", "MAR" or "MNAR")
        (passed to ``how_to_impute``).
    nbsim : number of simulations performed.
    names_dataset : names (keys of ``datasets_list``) of the datasets to
        process, in the order in which they should appear in the result.

    Returns
    -------
    res : dataframe stacking the tables returned by ``how_to_impute``.
        With a single (mechanism, percentage) setting there is one row per
        dataset, labelled with the dataset name. With several settings each row
        is labelled "<dataset> <mechanism> <percentage>". An empty dataframe is
        returned when ``names_dataset`` is empty.

    Raises
    ------
    KeyError : if a name in ``names_dataset`` is not a key of ``datasets_list``.
    """
    tables = []
    # Iterate over the names themselves so that the lookup key and the row
    # label can never get out of step.
    for name in names_dataset:
        df = how_to_impute(datasets_list[name], perc, mecha, nbsim)
        if len(df) == 1:
            # Single setting: the dataset name alone identifies the row.
            df.index = [name]
        elif len(df) > 1:
            # Several settings: keep the setting label so rows stay distinct.
            df.index = [f"{name} {label}" for label in df.index]
        tables.append(df)

    if not tables:
        return pd.DataFrame()
    return pd.concat(tables)


# Run the comparison on the datasets and settings defined above.
results_how_to_impute_real = how_to_impute_real(datasets_list, perc, mecha, nbsim, names_dataset)


# Notebook display of the result table.
results_how_to_impute_real


# Bar chart of the table, with the legend anchored at the axes' (1, 1) corner.
ax = results_how_to_impute_real.plot(kind="bar",rot=30)
ax.get_legend().set_bbox_to_anchor((1, 1))

Class (or function)	Data Types	Underlying Method	Imputation	Comments
SimpleImputer with strategy='mean' (default), sklearn.impute	quantitative	imputation by the mean	single	Easiest method
softImpute function (mimics R into Python)	quantitative	low-rank matrix completion	single	Strong theoretical guarantees, regularization parameter to tune
IterativeImputer with BayesianRidge (default), sklearn.impute	mixed	imputation by chained equations	single	Very flexible to data types, no parameter to tune
IterativeImputer with ExtraTreesRegressor, sklearn.impute	mixed	random forests	single	Requires large sample sizes, no parameter to tune
Sinkhorn imputation	quantitative	optimal transport	single

	0	1	2	3	4	5	6	7	8	9
0	-1.039823	-1.406895	-0.701245	-0.243445	-0.512836	-2.146725	-2.522237	-0.761441	-2.379464	-1.368450
1	0.868731	-0.006509	0.654021	-0.705815	-0.288841	-0.423724	-0.259798	0.275939	-1.031320	-0.150938
2	2.331809	2.866483	2.478842	2.833329	2.348079	2.431913	0.914379	0.759329	1.057632	0.911685
3	0.138767	-0.401317	0.912531	-0.471823	-1.372814	0.072473	0.632320	-0.727102	0.330743	-0.262900
4	-0.174949	-0.192462	0.099167	1.580241	1.062209	-0.191656	0.591569	1.045559	2.122082	1.834517

	0	1	2	3	4	5	6	7	8	9
0	-1.039823	-1.406895	-0.701245	-0.243445	nan	-2.146725	-2.522237	nan	-2.379464	-1.368450
1	0.868731	nan	0.654021	nan	-0.288841	-0.423724	nan	0.275939	nan	-0.150938
2	nan	2.866483	2.478842	2.833329	nan	nan	0.914379	0.759329	1.057632	0.911685
3	0.138767	-0.401317	0.912531	-0.471823	-1.372814	0.072473	0.632320	nan	0.330743	-0.262900
4	nan	-0.192462	0.099167	nan	1.062209	-0.191656	0.591569	nan	2.122082	1.834517

	0	1	2	3	4	5	6	7	8	9
0	-1.039823	-1.406895	-0.701245	0.017925	-0.512836	-2.146725	-2.522237	-0.041027	-2.379464	-1.368450
1	0.868731	-0.024509	0.654021	-0.705815	-0.288841	-0.423724	-0.259798	-0.041027	-1.031320	-0.150938
2	2.331809	2.866483	2.478842	2.833329	2.348079	0.004780	0.914379	0.759329	1.057632	0.911685
3	0.138767	-0.401317	0.912531	-0.471823	-1.372814	0.072473	0.632320	-0.727102	0.330743	-0.262900
4	-0.174949	-0.192462	0.006289	1.580241	1.062209	0.004780	0.591569	1.045559	2.122082	1.834517

	0	1	2	3	4	5	6	7	8	9
0	-1.002617	-1.406895	-0.701245	-0.243445	-0.856365	-2.146725	-2.522237	-0.761441	-2.379464	-1.368450
1	0.868731	0.067122	0.026269	0.047901	0.060389	-0.423724	0.042934	-0.009069	0.058313	-0.150938
2	2.331809	1.342976	2.478842	2.833329	2.348079	2.431913	0.914379	1.266218	1.057632	1.373486
3	-0.026684	-0.401317	0.912531	-0.471823	-0.020879	0.072473	0.632320	-0.727102	-0.027941	-0.262900
4	-0.174949	-0.192462	0.099167	0.267834	1.062209	0.273338	0.591569	1.045559	0.224839	0.287975

Description of imputation methods on synthetic data¶

Imputation by the mean¶

softimpute¶

Iterative chained equations¶

Sinkhorn imputation¶

MIWAE¶

Numerical experiments to compare the different methods¶

Synthetic data¶

Real datasets¶

	mean	softimpute	ice	rf	sk	miwae
MCAR 0.1	1.026818	0.774073	0.756121	0.816552	0.798953	0.817892
MCAR 0.3	0.993447	0.764741	0.759083	0.818374	0.801228	0.819268
MCAR 0.5	0.999771	0.795559	0.869922	0.868574	0.826878	0.836447
MAR 0.1	1.056386	0.745423	0.725796	0.778805	0.772490	0.793880
MAR 0.3	1.047601	0.776821	0.756006	0.820924	0.793902	0.821282
MAR 0.5	1.045310	0.772097	0.871754	0.823757	0.775699	0.814068
MNAR 0.1	1.001638	0.734826	0.715029	0.784016	0.771394	0.789443
MNAR 0.3	1.031509	0.775543	0.768620	0.823002	0.806572	0.817011
MNAR 0.5	1.023310	0.786646	0.912613	0.871177	0.805759	0.834723

	mean	softimpute	ice	rf	sk	miwae
wine_white	1.006575	0.806511	0.747366	0.608184	0.789439	0.987656
wine_red	0.993276	0.829521	0.804788	0.661857	0.760766	0.944917
slump	1.032272	0.772971	0.721526	0.860345	0.755637	1.062962