Example by Jerry Reiter for a short course at the Odum Institute, March 2018. This short example shows R commands for handling missing data using the mice package.
If you have not already done so, install the mice package with the following command (remove the #):
#install.packages("mice")
Once you have mice installed, load it into memory using the command:
library(mice)
## Loading required package: lattice
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
We illustrate the package with a simple data set from the mice package:
data(nhanes2)
dim(nhanes2)
## [1] 25 4
summary(nhanes2)
## age bmi hyp chl
## 20-39:12 Min. :20.40 no :13 Min. :113.0
## 40-59: 7 1st Qu.:22.65 yes : 4 1st Qu.:185.0
## 60-99: 6 Median :26.75 NA's: 8 Median :187.0
## Mean :26.56 Mean :191.4
## 3rd Qu.:28.93 3rd Qu.:212.0
## Max. :35.30 Max. :284.0
## NA's :9 NA's :10
Note that the age and hyp variables are already defined as factor variables in this dataset. If your dataset has categorical variables that are not yet factors, you should convert them before running mice. For example, to make a variable called var1 into a factor in a dataset called thedata, type thedata$var1 = as.factor(thedata$var1).
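As a minimal sketch (thedata and var1 are illustrative names, not part of nhanes2):

```r
# Illustrative only: a small made-up data frame with a categorical
# variable stored as character strings
thedata = data.frame(var1 = c("low", "high", "low", NA, "high"),
                     y = c(1.2, 3.4, 2.2, 0.9, 4.1))

# Convert var1 to a factor so mice treats it as categorical
thedata$var1 = as.factor(thedata$var1)

str(thedata$var1)
```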
The NA values represent missing data.
Let’s look at the missing data pattern:
md.pattern(nhanes2)
## age hyp bmi chl
## 13 1 1 1 1 0
## 3 1 1 1 0 1
## 1 1 1 0 1 1
## 1 1 0 0 1 2
## 7 1 0 0 0 3
## 0 8 9 10 27
Each row of this table is a missingness pattern: a 1 means the variable is observed and a 0 means it is missing, with the count of cases on the left and the number of missing variables on the right. So there are 13 complete cases, 3 cases missing only chl, 1 case missing only bmi, 1 case missing bmi and hyp, and 7 cases missing bmi, hyp, and chl. The bottom row gives the number of missing values per variable, 27 in total.
Let’s look at some exploratory data analyses based on the complete cases.
First extract the complete cases with the cc function:
ccnhanes2data = cc(nhanes2)
ccnhanes2data
## age bmi hyp chl
## 2 40-59 22.7 no 187
## 5 20-39 20.4 no 113
## 7 20-39 22.5 no 118
## 8 20-39 30.1 no 187
## 9 40-59 22.0 no 238
## 13 60-99 21.7 no 206
## 14 40-59 28.7 yes 204
## 17 60-99 27.2 yes 284
## 18 40-59 26.3 yes 199
## 19 20-39 35.3 no 218
## 22 20-39 33.2 no 229
## 23 20-39 27.5 no 131
## 25 40-59 27.4 no 186
Note that there are only 3 complete cases with hypertension = yes. Relationships will be hard to model accurately due to the small sample size. We will use these data to illustrate the MI procedures, but really we would need more data to say anything meaningful about relationships among these variables.
plot(ccnhanes2data$chl, ccnhanes2data$bmi, xlab = "Cholesterol", ylab = "BMI", main = "Complete cases: bmi versus chl")
boxplot(ccnhanes2data$bmi ~ ccnhanes2data$age, xlab = "Age", ylab = "BMI", main = "Complete cases: bmi versus age")
boxplot(ccnhanes2data$bmi ~ ccnhanes2data$hyp, xlab = "Hypertensive", ylab = "BMI", main = "Complete cases: bmi versus hyp")
Multiple imputation with mice
Let’s create 5 multiple imputations for the missing values. We will use normal linear regressions for continuous variables, logistic regressions for binary variables, multinomial regressions for unordered categorical variables, and proportional odds models for ordered categorical variables. Note: We have not discussed the last two models in this class, but they are popular models for these kinds of data.
nhanes2MI5 = mice(nhanes2, m=5, defaultMethod = c("norm", "logreg", "polyreg", "polr"))
##
## iter imp variable
## 1 1 bmi hyp chl
## 1 2 bmi hyp chl
## 1 3 bmi hyp chl
## 1 4 bmi hyp chl
## 1 5 bmi hyp chl
## 2 1 bmi hyp chl
## 2 2 bmi hyp chl
## 2 3 bmi hyp chl
## 2 4 bmi hyp chl
## 2 5 bmi hyp chl
## 3 1 bmi hyp chl
## 3 2 bmi hyp chl
## 3 3 bmi hyp chl
## 3 4 bmi hyp chl
## 3 5 bmi hyp chl
## 4 1 bmi hyp chl
## 4 2 bmi hyp chl
## 4 3 bmi hyp chl
## 4 4 bmi hyp chl
## 4 5 bmi hyp chl
## 5 1 bmi hyp chl
## 5 2 bmi hyp chl
## 5 3 bmi hyp chl
## 5 4 bmi hyp chl
## 5 5 bmi hyp chl
Look at the first couple of completed datasets:
d1 = complete(nhanes2MI5, 1)
d1
## age bmi hyp chl
## 1 20-39 27.10510 no 178.72730
## 2 40-59 22.70000 no 187.00000
## 3 20-39 27.85599 no 187.00000
## 4 60-99 31.65444 yes 321.80262
## 5 20-39 20.40000 no 113.00000
## 6 60-99 18.55172 no 184.00000
## 7 20-39 22.50000 no 118.00000
## 8 20-39 30.10000 no 187.00000
## 9 40-59 22.00000 no 238.00000
## 10 40-59 23.79742 no 150.99083
## 11 20-39 18.36895 no 81.90879
## 12 40-59 19.17256 no 159.52815
## 13 60-99 21.70000 no 206.00000
## 14 40-59 28.70000 yes 204.00000
## 15 20-39 29.60000 no 187.43072
## 16 20-39 29.57671 no 166.30587
## 17 60-99 27.20000 yes 284.00000
## 18 40-59 26.30000 yes 199.00000
## 19 20-39 35.30000 no 218.00000
## 20 60-99 25.50000 yes 271.85660
## 21 20-39 26.56711 no 154.82231
## 22 20-39 33.20000 no 229.00000
## 23 20-39 27.50000 no 131.00000
## 24 60-99 24.90000 no 231.30378
## 25 40-59 27.40000 no 186.00000
d2 = complete(nhanes2MI5, 2)
d2
## age bmi hyp chl
## 1 20-39 32.93049 no 183.58290
## 2 40-59 22.70000 no 187.00000
## 3 20-39 31.09278 no 187.00000
## 4 60-99 18.87489 no 195.42848
## 5 20-39 20.40000 no 113.00000
## 6 60-99 15.20205 yes 184.00000
## 7 20-39 22.50000 no 118.00000
## 8 20-39 30.10000 no 187.00000
## 9 40-59 22.00000 no 238.00000
## 10 40-59 28.12083 no 227.16156
## 11 20-39 26.44805 no 185.25915
## 12 40-59 29.50729 no 219.47739
## 13 60-99 21.70000 no 206.00000
## 14 40-59 28.70000 yes 204.00000
## 15 20-39 29.60000 no 185.81944
## 16 20-39 20.44892 no 84.21949
## 17 60-99 27.20000 yes 284.00000
## 18 40-59 26.30000 yes 199.00000
## 19 20-39 35.30000 no 218.00000
## 20 60-99 25.50000 yes 250.17108
## 21 20-39 23.73500 no 118.28273
## 22 20-39 33.20000 no 229.00000
## 23 20-39 27.50000 no 131.00000
## 24 60-99 24.90000 no 237.60771
## 25 40-59 27.40000 no 186.00000
Plot the imputed and observed values of the continuous variables:
stripplot(nhanes2MI5, col=c("grey",mdc(2)),pch=c(1,20))
Grey dots are observed values and red dots are imputed values. There is no evidence of problematic imputations for these variables.
If you just want to see one continuous variable, say bmi, use:
stripplot(nhanes2MI5, bmi~.imp, col=c("grey",mdc(2)),pch=c(1,20))
You can also make the plots by values of a categorical variable, say bmi grouped by age:
stripplot(nhanes2MI5, bmi~.imp|age, col=c("grey",mdc(2)),pch=c(1,20))
You can also plot all the imputations in one plot to see relationships, for example scatter plots of bmi versus chl by age:
stripplot(nhanes2MI5, bmi~chl|age, col=c("grey",mdc(2)),pch=c(1,20))
Dot plots of bmi by hyp, separately by age:
stripplot(nhanes2MI5, bmi~hyp|age, col=c("grey",mdc(2)),pch=c(1,20))
There are no obvious problems with the imputations from these plots.
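Another quick diagnostic (not shown in the original output) is mice’s densityplot, which overlays the density of the imputed values on that of the observed values for each incomplete variable:

```r
# Compare densities of observed and imputed values; this assumes
# nhanes2MI5 was created by the mice() call above
densityplot(nhanes2MI5)
```

If the imputed densities look wildly different from the observed ones, that is a signal to revisit the imputation models (though under nonignorable missingness some difference is expected).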
You can also try posterior predictive checks. Let’s append the data and make replicates:
nhanes2ppcheck = rbind(nhanes2, nhanes2)
#check to make sure we've done what we intended
nhanes2ppcheck
## age bmi hyp chl
## 1 20-39 NA <NA> NA
## 2 40-59 22.7 no 187
## 3 20-39 NA no 187
## 4 60-99 NA <NA> NA
## 5 20-39 20.4 no 113
## 6 60-99 NA <NA> 184
## 7 20-39 22.5 no 118
## 8 20-39 30.1 no 187
## 9 40-59 22.0 no 238
## 10 40-59 NA <NA> NA
## 11 20-39 NA <NA> NA
## 12 40-59 NA <NA> NA
## 13 60-99 21.7 no 206
## 14 40-59 28.7 yes 204
## 15 20-39 29.6 no NA
## 16 20-39 NA <NA> NA
## 17 60-99 27.2 yes 284
## 18 40-59 26.3 yes 199
## 19 20-39 35.3 no 218
## 20 60-99 25.5 yes NA
## 21 20-39 NA <NA> NA
## 22 20-39 33.2 no 229
## 23 20-39 27.5 no 131
## 24 60-99 24.9 no NA
## 25 40-59 27.4 no 186
## 110 20-39 NA <NA> NA
## 26 40-59 22.7 no 187
## 31 20-39 NA no 187
## 41 60-99 NA <NA> NA
## 51 20-39 20.4 no 113
## 61 60-99 NA <NA> 184
## 71 20-39 22.5 no 118
## 81 20-39 30.1 no 187
## 91 40-59 22.0 no 238
## 101 40-59 NA <NA> NA
## 111 20-39 NA <NA> NA
## 121 40-59 NA <NA> NA
## 131 60-99 21.7 no 206
## 141 40-59 28.7 yes 204
## 151 20-39 29.6 no NA
## 161 20-39 NA <NA> NA
## 171 60-99 27.2 yes 284
## 181 40-59 26.3 yes 199
## 191 20-39 35.3 no 218
## 201 60-99 25.5 yes NA
## 211 20-39 NA <NA> NA
## 221 20-39 33.2 no 229
## 231 20-39 27.5 no 131
## 241 60-99 24.9 no NA
## 251 40-59 27.4 no 186
Now blank out the values of the three variables with missing data in all the replicated rows:
nhanes2ppcheck[26:50, 2:4] = NA
nhanes2ppcheck
## age bmi hyp chl
## 1 20-39 NA <NA> NA
## 2 40-59 22.7 no 187
## 3 20-39 NA no 187
## 4 60-99 NA <NA> NA
## 5 20-39 20.4 no 113
## 6 60-99 NA <NA> 184
## 7 20-39 22.5 no 118
## 8 20-39 30.1 no 187
## 9 40-59 22.0 no 238
## 10 40-59 NA <NA> NA
## 11 20-39 NA <NA> NA
## 12 40-59 NA <NA> NA
## 13 60-99 21.7 no 206
## 14 40-59 28.7 yes 204
## 15 20-39 29.6 no NA
## 16 20-39 NA <NA> NA
## 17 60-99 27.2 yes 284
## 18 40-59 26.3 yes 199
## 19 20-39 35.3 no 218
## 20 60-99 25.5 yes NA
## 21 20-39 NA <NA> NA
## 22 20-39 33.2 no 229
## 23 20-39 27.5 no 131
## 24 60-99 24.9 no NA
## 25 40-59 27.4 no 186
## 110 20-39 NA <NA> NA
## 26 40-59 NA <NA> NA
## 31 20-39 NA <NA> NA
## 41 60-99 NA <NA> NA
## 51 20-39 NA <NA> NA
## 61 60-99 NA <NA> NA
## 71 20-39 NA <NA> NA
## 81 20-39 NA <NA> NA
## 91 40-59 NA <NA> NA
## 101 40-59 NA <NA> NA
## 111 20-39 NA <NA> NA
## 121 40-59 NA <NA> NA
## 131 60-99 NA <NA> NA
## 141 40-59 NA <NA> NA
## 151 20-39 NA <NA> NA
## 161 20-39 NA <NA> NA
## 171 60-99 NA <NA> NA
## 181 40-59 NA <NA> NA
## 191 20-39 NA <NA> NA
## 201 60-99 NA <NA> NA
## 211 20-39 NA <NA> NA
## 221 20-39 NA <NA> NA
## 231 20-39 NA <NA> NA
## 241 60-99 NA <NA> NA
## 251 40-59 NA <NA> NA
Run the MI software on the appended data:
nhanes2MI5ppcheck = mice(nhanes2ppcheck, m=5, defaultMethod = c("norm", "logreg", "polyreg", "polr"))
##
## iter imp variable
## 1 1 bmi hyp chl
## 1 2 bmi hyp chl
## 1 3 bmi hyp chl
## 1 4 bmi hyp chl
## 1 5 bmi hyp chl
## 2 1 bmi hyp chl
## 2 2 bmi hyp chl
## 2 3 bmi hyp chl
## 2 4 bmi hyp chl
## 2 5 bmi hyp chl
## 3 1 bmi hyp chl
## 3 2 bmi hyp chl
## 3 3 bmi hyp chl
## 3 4 bmi hyp chl
## 3 5 bmi hyp chl
## 4 1 bmi hyp chl
## 4 2 bmi hyp chl
## 4 3 bmi hyp chl
## 4 4 bmi hyp chl
## 4 5 bmi hyp chl
## 5 1 bmi hyp chl
## 5 2 bmi hyp chl
## 5 3 bmi hyp chl
## 5 4 bmi hyp chl
## 5 5 bmi hyp chl
Get the completed datasets – in the interest of time, look at just the first two:
d1ppcheck = complete(nhanes2MI5ppcheck, 1)
d2ppcheck = complete(nhanes2MI5ppcheck, 2)
Let’s graph histograms of bmi for each of the datasets:
par(mfcol=c(2,1))
hist(d1ppcheck$bmi[1:25], xlab = "BMI", main = "BMI completed data")
hist(d1ppcheck$bmi[26:50], xlab = "BMI", main = "BMI replicated data")
hist(d2ppcheck$bmi[1:25], xlab = "BMI", main = "BMI completed data")
hist(d2ppcheck$bmi[26:50], xlab = "BMI", main = "BMI replicated data")
You can also use scatter plots to check relationships between variables:
plot(d2ppcheck$bmi[1:25]~d2ppcheck$chl[1:25], ylab = "BMI", xlab = "Cholesterol", main = "BMI vs Chl completed data")
plot(d2ppcheck$bmi[26:50]~d2ppcheck$chl[26:50], ylab = "BMI", xlab = "Cholesterol", main = "BMI vs Chl replicated data")
They look pretty similar! There is no evidence that the imputation models are poorly specified for what we want to do.
To do model specification, i.e., choosing transformations, either look at the complete cases or use one of the completed datasets. For example, to use the first completed dataset d1 in a regression of bmi on age, hyp, and chl, use:
bmiregd1 = lm(bmi~age+hyp+chl, data = d1)
To check residuals, you can examine the fit of the model in one or more completed datasets. Any transformations will have to apply to all the datasets, so don’t be too dataset-specific in your checks.
plot(d1$chl, residuals(bmiregd1), xlab = "Cholesterol", ylab = "Residual")
abline(0,0)
boxplot(bmiregd1$residual ~ d1$age, xlab = "Age", ylab = "Residual")
boxplot(bmiregd1$residual ~ d1$hyp, xlab = "Hypertension", ylab = "Residual")
These are pretty reasonable residual plots. It is a good idea to repeat this for more than one completed dataset. If you decide transformations are needed, you should reconsider the imputation models too and refit them with the transformed values.
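A minimal sketch of repeating the residual check across all five completed datasets (using nhanes2MI5 and the same model formula as above):

```r
# Loop over the m = 5 completed datasets and draw one residual
# plot per dataset
par(mfrow = c(2, 3))
for (i in 1:5) {
  di = complete(nhanes2MI5, i)
  fit = lm(bmi ~ age + hyp + chl, data = di)
  plot(di$chl, residuals(fit), xlab = "Cholesterol",
       ylab = "Residual", main = paste("Dataset", i))
  abline(0, 0)
}
```

A transformation that seems necessary in every completed dataset is worth acting on; one that shows up in only a single dataset is probably an artifact of that particular set of imputed values.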
If you want to do multiple imputation inferences on all m = 5 datasets, use the with command. For example, to fit a linear regression of bmi on age + hyp + chl:
bmiregMI5 = with(data=nhanes2MI5, lm(bmi~age+hyp+chl))
summary(bmiregMI5)
## # A tibble: 25 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 13.7 2.37 5.77 0.0000121
## 2 age40-59 -5.58 1.27 -4.41 0.000269
## 3 age60-99 -10.3 1.67 -6.15 0.00000527
## 4 hypyes 1.09 1.63 0.670 0.511
## 5 chl 0.0839 0.0139 6.04 0.00000663
## 6 (Intercept) 12.3 2.44 5.06 0.0000601
## 7 age40-59 -5.97 1.46 -4.08 0.000581
## 8 age60-99 -11.9 1.74 -6.82 0.00000124
## 9 hypyes 0.350 1.53 0.229 0.822
## 10 chl 0.0955 0.0143 6.66 0.00000176
## # ... with 15 more rows
To get the multiple imputation inferences based on the Rubin (1987) combining rules – see the slides – use the pool command:
bmireg = pool(bmiregMI5)
summary(bmireg)
## estimate std.error statistic df p.value
## (Intercept) 14.1840605 3.96739391 3.5751581 5.194110 0.007687387
## age40-59 -6.6797335 1.87964387 -3.5537229 7.713681 0.007925676
## age60-99 -12.1435283 2.63248828 -4.6129468 5.433678 0.001898904
## hypyes 2.0346388 2.13133658 0.9546305 7.110528 0.368718755
## chl 0.0854122 0.02096924 4.0732134 6.743999 0.003852516
If you want to do a nested F test (well, technically a test that is asymptotically equivalent to a nested F test), use pool.compare. (Note: in more recent versions of mice, pool.compare has been replaced by the D1 function.) Suppose we want to see whether age is a useful predictor. We redo the regression so that age is the last predictor:
bmiregMI5 = with(data=nhanes2MI5, lm(bmi~hyp+chl+age))
bmiregMI5noage = with(data=nhanes2MI5, lm(bmi~hyp+chl))
pool.compare(bmiregMI5, bmiregMI5noage)
## $call
## pool.compare(fit1 = bmiregMI5, fit0 = bmiregMI5noage)
##
## $call11
## with.mids(data = nhanes2MI5, expr = lm(bmi ~ hyp + chl + age))
##
## $call12
## mice(data = nhanes2, m = 5, defaultMethod = c("norm", "logreg",
## "polyreg", "polr"))
##
## $call01
## with.mids(data = nhanes2MI5, expr = lm(bmi ~ hyp + chl))
##
## $call02
## mice(data = nhanes2, m = 5, defaultMethod = c("norm", "logreg",
## "polyreg", "polr"))
##
## $method
## [1] "wald"
##
## $nmis
## age bmi hyp chl
## 0 9 8 10
##
## $m
## [1] 5
##
## $qbar1
## (Intercept) hypyes chl age40-59 age60-99
## 14.1840605 2.0346388 0.0854122 -6.6797335 -12.1435283
##
## $qbar0
## (Intercept) hypyes chl
## 20.48011951 -0.51088587 0.03075059
##
## $ubar1
## [1] 7.2233187839 2.6090856965 0.0002439248 2.1363015527 3.2939027614
##
## $ubar0
## [1] 1.832516e+01 7.286680e+00 5.266938e-04
##
## $deviances
## NULL
##
## $Dm
## [,1]
## [1,] 17.47214
##
## $rm
## [1] 0.8788535
##
## $df1
## [1] 2
##
## $df2
## [1] 17.74014
##
## $pvalue
## [,1]
## [1,] 6.410541e-05
You can also fit logistic regressions. For example, to predict hypertension from all the other variables, use:
hyplogregMI5 = with(data=nhanes2MI5, glm(hyp~bmi+chl+age, family = binomial))
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
hyplogreg = pool(hyplogregMI5)
summary(hyplogreg)
## estimate std.error statistic df p.value
## (Intercept) -3759.574521 824933.72 -4.557426e-03 18.25904 0.9964131
## bmi 65.363442 14284.78 4.575741e-03 18.25904 0.9963987
## chl 6.585068 1471.53 4.474981e-03 18.25904 0.9964780
## age40-59 736.660721 162896.64 4.522259e-03 18.25904 0.9964408
## age60-99 411.603822 12252744.44 3.359279e-05 18.25904 0.9999736
This turns out to be problematic because some of the logistic regressions have perfect predictions – note the huge estimates and standard errors. We do not have enough data to do a meaningful logistic regression here, unless we drop age as a predictor. But the command structure is fine.