Missing covariate imputation

In this example, we use the nhanes2 data set from the mice R package to illustrate how to do missing covariate imputation in INLA by using a measurement error model. As noted in the paper, the nhanes2 data set is very small: it has only 25 observations, and the covariate bmi is missing for 9 of them, so it is not an ideal application on its own. However, this data set is commonly used in other missing data applications with INLA, so using it here makes it easier to compare the implementations.

Loading packages

library(mice)       # Just used for the nhanes2 data set
library(INLA)       # INLA modelling
library(dplyr)      # Data wrangling of the results
library(gt)         # Tables
library(tidyverse)  # Data wrangling and plotting
library(showtext)   # Font
library(colorspace) # Color adjustments
inla.setOption(num.threads = "1:1")

Loading and preparing the data

# Using the nhanes data set found in mice:
data(nhanes2)

nhanes2
     age  bmi  hyp chl
1  20-39   NA <NA>  NA
2  40-59 22.7   no 187
3  20-39   NA   no 187
4  60-99   NA <NA>  NA
5  20-39 20.4   no 113
6  60-99   NA <NA> 184
7  20-39 22.5   no 118
8  20-39 30.1   no 187
9  40-59 22.0   no 238
10 40-59   NA <NA>  NA
11 20-39   NA <NA>  NA
12 40-59   NA <NA>  NA
13 60-99 21.7   no 206
14 40-59 28.7  yes 204
15 20-39 29.6   no  NA
16 20-39   NA <NA>  NA
17 60-99 27.2  yes 284
18 40-59 26.3  yes 199
19 20-39 35.3   no 218
20 60-99 25.5  yes  NA
21 20-39   NA <NA>  NA
22 20-39 33.2   no 229
23 20-39 27.5   no 131
24 60-99 24.9   no  NA
25 40-59 27.4   no 186
n <- nrow(nhanes2)

# Manually dummy-code age:
age2 <- ifelse(nhanes2$age == "40-59", 1, 0)
age3 <- ifelse(nhanes2$age == "60-99", 1, 0)

# Center the response and continuous covariates
chl <- scale(nhanes2$chl, scale = FALSE)[,1]
bmi <- scale(nhanes2$bmi, scale = FALSE)[,1]
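
As a quick sanity check, we can count how many of the 25 bmi values are missing (this matches the printout above):

sum(is.na(nhanes2$bmi))  # 9 of the 25 bmi values are missing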

Aim

We want to fit the model

\[ chl = \beta_0 + \beta_{age2}\, age_2 + \beta_{age3}\, age_3 + \beta_{bmi}\, bmi + \varepsilon, \] but bmi has missing values, so we will consider two different imputation models for it.
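
Concretely, both versions correspond to a joint model with three likelihoods (a sketch matching the setup used in the code below), where \(x_i\) denotes the complete, partly unobserved, bmi value:

\[ \begin{aligned} chl_i &= \beta_0 + \beta_{age2}\, age_{2,i} + \beta_{age3}\, age_{3,i} + \beta_{bmi}\, x_i + \varepsilon_i, \\ bmi_i &= x_i + u_i, \\ 0 &= -x_i + \alpha_0 + \alpha_{age2}\, age_{2,i} + \alpha_{age3}\, age_{3,i} + \varepsilon_{x,i}. \end{aligned} \]

Here \(u_i\) is given a very high precision since no measurement error is assumed, the simple imputation model replaces the \(\alpha\)-part of the last line by a fixed offset, and the full imputation model estimates \(\alpha_0\), \(\alpha_{age2}\) and \(\alpha_{age3}\).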

Simple imputation model

We first fit a model that is identical to the one in Gómez-Rubio and Rue (2018).

Specifying priors

# Priors for model of interest coefficients
prior.beta = c(0, 0.001) # Gaussian, c(mean, precision)

# Priors for exposure model coefficient
#prior.alpha <- c(26.56, 1/71.07) # Gaussian, c(mean, precision)

# Priors for y, measurement error
prior.prec.y <- c(1, 0.00005) # Gamma
prior.prec.u_c <- c(0.5, 0.5) # Gamma
#prior.prec.x <- c(2.5-1,(2.5)*4.2^2) # Gamma

# Initial values
prec.y <- 1
prec.u_c <- 1
#prec.x <- 1/71.07
prec.x <- 1/71.07

Setting up the matrices for the joint model

Y <- matrix(NA, 3*n, 3)

Y[1:n, 1] <- chl             # Regression model of interest response
Y[n+(1:n), 2] <- bmi         # Error model response
Y[2*n+(1:n), 3] <- rep(0, n) # Imputation model response

beta.0 <- c(rep(1, n), rep(NA, n), rep(NA, n))
beta.bmi <- c(1:n, rep(NA, n), rep(NA, n))
beta.age2 <- c(age2, rep(NA, n), rep(NA, n))
beta.age3 <- c(age3, rep(NA, n), rep(NA, n))

id.x <- c(rep(NA, n), 1:n, 1:n) 
weight.x <- c(rep(1, n), rep(1, n), rep(-1, n))

# Fixed mean (26.56) for the imputation model; zero for the other likelihoods
offset.imp <- c(rep(0, n), rep(0, n), rep(26.56, n))

dd <- data.frame(Y = Y, 
                 beta.0 = beta.0,
                 beta.bmi = beta.bmi,
                 beta.age2 = beta.age2,
                 beta.age3 = beta.age3,
                 id.x = id.x,
                 weight.x = weight.x,
                 offset.imp = offset.imp)

INLA formula

formula = Y ~ - 1 + beta.0 + beta.age2 + beta.age3 + 
  # Copy of the latent bmi values into the model of interest, scaled by the
  # regression coefficient beta.bmi (estimated since fixed = FALSE):
  f(beta.bmi, copy="id.x", 
    hyper = list(beta = list(param = prior.beta, fixed=FALSE))) +
  # The latent bmi values; fixing the precision at exp(-15), i.e. essentially
  # zero, gives them a vague prior:
  f(id.x, weight.x, model="iid", values = 1:n, 
    hyper = list(prec = list(initial = -15, fixed=TRUE)))

Scaling of ME precision

Since we are not assuming any measurement error here, we need to “turn off” the error model by scaling the error precision to be very large (it makes no difference whether we scale the precision only for the observed values or for both the observed and the missing values).

Scale <- c(rep(1, n), rep(10^12, n), rep(1, n))

Fitting the model

set.seed(1)
model_missing1 <- inla(formula, data = dd, scale = Scale, offset = offset.imp,
                     family = c("gaussian", "gaussian", "gaussian"),
                     control.family = list(
                       list(hyper = list(prec = list(initial = log(prec.y), 
                                                     param = prior.prec.y, 
                                                     fixed = FALSE))),
                       list(hyper = list(prec = list(initial = log(prec.u_c), 
                                                     param = prior.prec.u_c, 
                                                     fixed = FALSE))),
                       list(hyper = list(prec = list(initial = log(prec.x),
                                                     fixed = TRUE)))
                     ),
                     control.fixed = list(
                       mean = list(beta.0 = prior.beta[1], 
                                   beta.age2 = prior.beta[1], 
                                   beta.age3 = prior.beta[1]), 
                       prec = list(beta.0 = prior.beta[2], 
                                   beta.age2 = prior.beta[2], 
                                   beta.age3 = prior.beta[2])),
                     verbose=F)
model_missing1$summary.fixed
               mean       sd 0.025quant  0.5quant 0.975quant mode          kld
beta.0    -17.60626 10.93451 -37.971728 -18.03594   5.183261   NA 9.910360e-08
beta.age2  29.24290 16.09734  -4.203562  29.84339  59.318094   NA 9.648499e-08
beta.age3  51.16447 20.73070   7.996870  52.09611  89.369885   NA 1.611619e-07
model_missing1$summary.hyperpar
                                                  mean           sd
Precision for the Gaussian observations    0.001036804 0.0004234011
Precision for the Gaussian observations[2] 0.916159737 0.5484311863
Beta for beta.bmi                          5.035216083 0.4842624861
                                             0.025quant     0.5quant
Precision for the Gaussian observations    0.0003789667 0.0009755623
Precision for the Gaussian observations[2] 0.2023001516 0.8039241305
Beta for beta.bmi                          4.1750752965 5.0067526677
                                            0.975quant mode
Precision for the Gaussian observations    0.002025766   NA
Precision for the Gaussian observations[2] 2.276555004   NA
Beta for beta.bmi                          6.082937710   NA

Full imputation model

This model includes age as a covariate in the imputation model.

Specifying priors

For the intercepts and slopes of the model of interest and imputation model we set very wide priors centered at zero.

# Priors for model of interest coefficients
prior.beta = c(0, 1e-6) # Gaussian, c(mean, precision)

# Priors for exposure model coefficients
prior.alpha <- c(0, 1e-6) # Gaussian, c(mean, precision)

For the precision of \(y\) and \(x\) we set more informative priors. The precisions are given gamma priors, and in INLA this is parameterized by the shape and rate parameters. Let’s first look at the precision for \(y\).

We base our priors for \(x\) and \(y\) on the values that we want as the modes of the prior distributions. For \(y\), we look at the regression chl~bmi+age2+age3, whose residual standard error is estimated to be \(29.1\). We decide that we want \(1/29.1^2\) to be the mode of our prior. Since the gamma distribution with shape \(\alpha\) and rate \(\beta\) has mode \(\frac{\alpha-1}{\beta}\), we can choose \(\alpha = s+1\) and \(\beta = s\cdot29.1^2\) in order to get a distribution with mode equal to \(1/29.1^2\). We then choose \(s\) to control the spread: the prior becomes more concentrated around the mode as \(s\) increases. For \(x\) we similarly construct the prior to have its mode at \(1/4.1^2\).
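
Plugging these choices into the mode formula confirms this:

\[ \text{mode} = \frac{\alpha - 1}{\beta} = \frac{(s+1) - 1}{s \cdot 29.1^2} = \frac{1}{29.1^2}. \]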

An alternative approach could be to construct an upper limit for the variance by taking the variance of the covariate itself, together with some lower limit, and then use the inverses of these limits as the 2.5% and 97.5% quantiles of the gamma prior. In our case, the variance of chl is \(2044\), so we would choose \(1/2044\) as the lower limit for \(\tau_y\). For the upper limit, we settle on roughly a tenth of that variation, so the upper limit is \(1/204.4\). By specifying these points as the 2.5% and 97.5% quantiles, respectively, we can find the parameters of a gamma distribution with these quantiles through numerical optimization, which gives \(\tau_y \sim G(3.4, 1588)\). Similarly, for \(x\) (bmi), the sample variance of bmi is \(17.8\), so we could set the lower limit of \(\tau_x\) to \(1/17.8\). Again choosing roughly a tenth of that variation gives an upper limit for the precision of \(1/1.78\), and the gamma distribution \(\tau_x \sim G(3.4, 13.9)\).
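
The quantile-matched parameters are not used further below, but as a minimal sketch of how they could be found numerically (the helper function, its name and its starting values are ad hoc choices for illustration, not part of the original code):

# Find gamma (shape, rate) whose 2.5% and 97.5% quantiles match the given limits
gamma_from_quantiles <- function(lower, upper) {
  loss <- function(logpar) {
    shape <- exp(logpar[1]); rate <- exp(logpar[2])
    (qgamma(0.025, shape, rate) / lower - 1)^2 +
      (qgamma(0.975, shape, rate) / upper - 1)^2
  }
  exp(optim(c(log(2), log(1 / lower)), loss)$par)
}
gamma_from_quantiles(1/2044, 1/204.4) # roughly shape 3.4, rate 1588 for tau_y
gamma_from_quantiles(1/17.8, 1/1.78)  # roughly shape 3.4, rate 13.9 for tau_x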

# Priors for y, measurement error and true x-value precision
# Start by getting a reasonable prior guess for the standard error of the regression and exp. models
summary(lm(chl~bmi+age2+age3))$sigma
[1] 29.10126
summary(lm(bmi~age2+age3))$sigma
[1] 4.160342
# Use those values to create reasonable priors:
s <- 0.5
prior.prec.y <- c(s+1, s*29.1^2) # Gamma
prior.prec.x <- c(s+1, s*4.1^2) # Gamma

# We can visualize these priors:
curve(dgamma(x, s+1, s*29.1^2), 0, 0.02) 
abline(v=1/(29.1^2))

curve(dgamma(x, s+1, s*4.1^2), 0, 1)
abline(v=1/(4.1^2))

# Initial values
prec.y <- 1/29.1^2
prec.u_c <- 1
prec.x <- 1/4.2^2

Setting up the matrices for the joint model

Y <- matrix(NA, 3*n, 3)


Y[1:n, 1] <- chl             # Regression model of interest response
Y[n+(1:n), 2] <- bmi         # Error model response
Y[2*n+(1:n), 3] <- rep(0, n) # Imputation model response

beta.0 <- c(rep(1, n), rep(NA, n), rep(NA, n))
beta.bmi <- c(1:n, rep(NA, n), rep(NA, n))
beta.age2 <- c(age2, rep(NA, n), rep(NA, n))
beta.age3 <- c(age3, rep(NA, n), rep(NA, n))

id.x <- c(rep(NA, n), 1:n, 1:n) 
weight.x <- c(rep(1, n), rep(1, n), rep(-1, n))

alpha.0 <- c(rep(NA, n), rep(NA, n), rep(1, n))
alpha.age2 <- c(rep(NA, n), rep(NA, n), age2)
alpha.age3 <- c(rep(NA, n), rep(NA, n), age3)

dd <- data.frame(Y = Y, 
                 beta.0 = beta.0,
                 beta.bmi = beta.bmi,
                 beta.age2 = beta.age2,
                 beta.age3 = beta.age3,
                 id.x = id.x,
                 weight.x = weight.x,
                 alpha.0 = alpha.0,
                 alpha.age2 = alpha.age2,
                 alpha.age3 = alpha.age3)

INLA formula

formula = Y ~ - 1 + beta.0 + beta.age2 + beta.age3 + 
  f(beta.bmi, copy="id.x", 
    hyper = list(beta = list(param = prior.beta, fixed=FALSE))) +
  f(id.x, weight.x, model="iid", values = 1:n, 
    hyper = list(prec = list(initial = -15, fixed=TRUE))) +
  alpha.0 + alpha.age2 + alpha.age3

Scaling of ME precision

Since we are (again) not assuming any measurement error here, we need to “turn off” the error model by scaling the error precision to be very large (it makes no difference whether we scale the precision only for the observed values or for both the observed and the missing values).

Scale <- c(rep(1, n), rep(10^12, n), rep(1, n))

Fitting the model

set.seed(1)
model_missing2 <- inla(formula, data = dd, scale = Scale,
                     family = c("gaussian", "gaussian", "gaussian"),
                     control.family = list(
                       list(hyper = list(prec = list(initial = log(prec.y), 
                                                     param = prior.prec.y, 
                                                     fixed = FALSE))),
                       list(hyper = list(prec = list(initial = log(prec.u_c),
                                                     fixed = TRUE))),
                       list(hyper = list(prec = list(initial = log(prec.x), 
                                                     param = prior.prec.x, 
                                                     fixed = FALSE)))
                     ),
                     control.fixed = list(
                       mean = list(beta.0 = prior.beta[1], 
                                   beta.age2 = prior.beta[1], 
                                   beta.age3 = prior.beta[1],  
                                   alpha.0 = prior.alpha[1], 
                                   alpha.age2 = prior.alpha[1],
                                   alpha.age3 = prior.alpha[1]), 
                       prec = list(beta.0 = prior.beta[2], 
                                   beta.age2 = prior.beta[2], 
                                   beta.age3 = prior.beta[2],  
                                   alpha.0 = prior.alpha[2], 
                                   alpha.age2 = prior.alpha[2],
                                   alpha.age3 = prior.alpha[2])),
                     verbose=F)
# Save results:
saveRDS(model_missing2, file = "results/model_missing2.rds")

Fitting a complete case model

# Where is bmi missing? 
missing_bmi <- is.na(bmi)

dd_naive <- data.frame(Y = chl, 
                       beta.0 = rep(1, length(bmi)),
                       beta.bmi = bmi, 
                       beta.age2 = age2, 
                       beta.age3 = age3)[!missing_bmi, ]


# Formula
formula <- Y ~ - 1 + beta.0 + beta.age2 + beta.age3 + beta.bmi

# Fit model
set.seed(1)
model_naive <- inla(formula,
              data = dd_naive,
              family = c("gaussian"),
              control.family = list(
                list(hyper = list(prec = list(initial = log(prec.y), 
                                              param = prior.prec.y, 
                                              fixed = FALSE)))),
              control.fixed = list(
                       mean = list(beta.0 = prior.beta[1], 
                                   beta.age2 = prior.beta[1], 
                                   beta.age3 = prior.beta[1],
                                   beta.bmi = prior.beta[1]), 
                       prec = list(beta.0 = prior.beta[2], 
                                   beta.age2 = prior.beta[2], 
                                   beta.age3 = prior.beta[2],  
                                   beta.bmi = prior.beta[2]))
)

Results

The posterior means and 95% credible intervals are presented in the table below. Note that the data set is quite small (25 observations, of which 9 have missing bmi), so the differences between the approaches should not be over-interpreted.

                       me_adjusted                  naive                   inla_mcmc
                    mean   lower   upper     mean   lower   upper     mean   lower   upper
Model of interest
beta.0           -32.246 -53.825 -10.209  -36.471 -60.971 -11.955   43.469 -79.233 166.171
beta.age2         49.780  16.172  82.716   55.767  19.014  92.496   29.501  -5.526  64.528
beta.age3         83.479  40.509 124.367  104.643  55.072 154.173   49.449   3.963  94.935
Beta for beta.bmi  4.935   3.541   6.181    6.919   3.026  10.811    4.864   0.540   9.188
Imputation model
alpha.0            1.983  -0.979   4.978       NA      NA      NA       NA      NA      NA
alpha.age2        -3.126  -7.829   1.544       NA      NA      NA       NA      NA      NA
alpha.age3        -4.525  -9.547   0.320       NA      NA      NA       NA      NA      NA
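
The “me_adjusted” column can be read off directly from the fitted full imputation model (a minimal sketch; the “naive” column comes from the complete case model above, and “inla_mcmc” from an MCMC-based fit that is not part of this document):

# Posterior means and 95% credible intervals for the ME-adjusted imputation model
model_missing2$summary.fixed[, c("mean", "0.025quant", "0.975quant")]
model_missing2$summary.hyperpar["Beta for beta.bmi",
                                c("mean", "0.025quant", "0.975quant")]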

We can also look at the imputed values themselves; their posterior marginals can be found in marginals.random$id.x inside the model object (note that since bmi was centered before fitting, these are on the centered scale).

bmi_imputed <- model_missing2$marginals.random$id.x[missing_bmi]
bmi_imp_df <- data.table::rbindlist(lapply(bmi_imputed, as.data.frame), idcol = TRUE)

ggplot(bmi_imp_df, aes(x = x, y = y)) +
  geom_line() +
  facet_wrap(~ .id, ncol = 3) +
  theme_minimal()

A summary can also be seen from summary.random$id.x:

model_missing2$summary.random$id.x[,1:6]
   ID       mean           sd  0.025quant   0.5quant 0.975quant
1   1  1.9833916 4.3417083305  -6.6211007  1.9775924 10.6216277
2   2 -3.8624998 0.0009999933  -3.8644610 -3.8624998 -3.8605387
3   3  3.2045709 3.3767919876  -3.4545358  3.2001626  9.8975173
4   4 -2.5416313 4.5282288945 -11.6122240 -2.5139190  6.3676961
5   5 -6.1624995 0.0009999933  -6.1644607 -6.1624995 -6.1605384
6   6 -5.7573424 3.8336379811 -13.3935391 -5.7357398  1.7336245
7   7 -4.0624997 0.0009999933  -4.0644609 -4.0624997 -4.0605386
8   8  3.5375000 0.0009999933   3.5355388  3.5375000  3.5394611
9   9 -4.5624993 0.0009999933  -4.5644605 -4.5624993 -4.5605382
10 10 -1.1424814 4.4628547510 -10.0052047 -1.1424825  7.7202482
11 11  1.9833916 4.3417083305  -6.6211007  1.9775924 10.6216277
12 12 -1.1424814 4.4628547510 -10.0052047 -1.1424825  7.7202482
13 13 -4.8624999 0.0009999933  -4.8644611 -4.8624999 -4.8605388
14 14  2.1374996 0.0009999933   2.1355385  2.1374996  2.1394608
15 15  3.0374999 0.0009999933   3.0355388  3.0374999  3.0394611
16 16  1.9833916 4.3417083305  -6.6211007  1.9775924 10.6216277
17 17  0.6375001 0.0009999933   0.6355389  0.6375001  0.6394612
18 18 -0.2625001 0.0009999933  -0.2644613 -0.2625001 -0.2605390
19 19  8.7374996 0.0009999933   8.7355385  8.7374996  8.7394608
20 20 -1.0625001 0.0009999933  -1.0644612 -1.0625001 -1.0605390
21 21  1.9833916 4.3417083305  -6.6211007  1.9775924 10.6216277
22 22  6.6374999 0.0009999933   6.6355388  6.6374999  6.6394611
23 23  0.9374998 0.0009999933   0.9355387  0.9374998  0.9394610
24 24 -1.6625001 0.0009999933  -1.6644612 -1.6625001 -1.6605389
25 25  0.8374996 0.0009999933   0.8355385  0.8374996  0.8394608

Model summary

summary(model_missing2)

Call:
   c("inla.core(formula = formula, family = family, contrasts = contrasts, 
   ", " data = data, quantiles = quantiles, E = E, offset = offset, ", " 
   scale = scale, weights = weights, Ntrials = Ntrials, strata = strata, 
   ", " lp.scale = lp.scale, link.covariates = link.covariates, verbose = 
   verbose, ", " lincomb = lincomb, selection = selection, control.compute 
   = control.compute, ", " control.predictor = control.predictor, 
   control.family = control.family, ", " control.inla = control.inla, 
   control.fixed = control.fixed, ", " control.mode = control.mode, 
   control.expert = control.expert, ", " control.hazard = control.hazard, 
   control.lincomb = control.lincomb, ", " control.update = 
   control.update, control.lp.scale = control.lp.scale, ", " 
   control.pardiso = control.pardiso, only.hyperparam = only.hyperparam, 
   ", " inla.call = inla.call, inla.arg = inla.arg, num.threads = 
   num.threads, ", " blas.num.threads = blas.num.threads, keep = keep, 
   working.directory = working.directory, ", " silent = silent, inla.mode 
   = inla.mode, safe = FALSE, debug = debug, ", " .parent.frame = 
   .parent.frame)") 
Time used:
    Pre = 2.96, Running = 0.273, Post = 0.0205, Total = 3.25 
Fixed effects:
              mean     sd 0.025quant 0.5quant 0.975quant mode kld
beta.0     -32.246 10.999    -53.825  -32.333    -10.209   NA   0
beta.age2   49.780 16.772     16.172   49.906     82.716   NA   0
beta.age3   83.479 21.180     40.509   83.873    124.367   NA   0
alpha.0      1.983  1.501     -0.979    1.978      4.978   NA   0
alpha.age2  -3.126  2.361     -7.829   -3.120      1.544   NA   0
alpha.age3  -4.525  2.487     -9.547   -4.494      0.320   NA   0

Random effects:
  Name    Model
    id.x IID model
   beta.bmi Copy

Model hyperparameters:
                                            mean    sd 0.025quant 0.5quant
Precision for the Gaussian observations    0.001 0.001      0.001    0.001
Precision for the Gaussian observations[3] 0.063 0.016      0.036    0.062
Beta for beta.bmi                          4.935 0.657      3.541    4.953
                                           0.975quant mode
Precision for the Gaussian observations         0.003   NA
Precision for the Gaussian observations[3]      0.098   NA
Beta for beta.bmi                               6.181   NA

Marginal log-Likelihood:  -366.84 
 is computed 
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')

References

Gómez-Rubio, V., & Rue, H. (2018). Markov chain Monte Carlo with the integrated nested Laplace approximation. Statistics and Computing, 28, 1033–1051. doi: 10.1007/s11222-017-9778-y