1 Data Generation

Load the necessary libraries:

library(HTLR)
library(bayesplot)
#> This is bayesplot version 1.9.0
#> - Online documentation and vignettes at mc-stan.org/bayesplot
#> - bayesplot theme set to bayesplot::theme_default()
#>    * Does _not_ affect other ggplot2 plots
#>    * See ?bayesplot_theme_set for details on theme setting

The description of the dataset generating scheme is found from Li and Yao (2018).

There are 4 groups of features:

SEED <- 1234

n <- 510
p <- 2000

means <- rbind(
  c(0, 1, 0),
  c(0, 0, 0),
  c(0, 0, 1),
  c(0, 0, 1),
  c(0, 0, 1),
  c(0, 0, 1),
  c(0, 0, 1),
  c(0, 0, 1),
  c(0, 0, 1),
  c(0, 0, 1)
) * 2

means <- rbind(means, matrix(0, p - 10, 3))

A <- diag(1, p)

A[1:10, 1:3] <-
  rbind(
    c(1, 0, 0),
    c(2, 1, 0),
    c(0, 0, 1),
    c(0, 0, 1),
    c(0, 0, 1),
    c(0, 0, 1),
    c(0, 0, 1),
    c(0, 0, 1),
    c(0, 0, 1),
    c(0, 0, 1)
  )

set.seed(SEED)
dat <- gendata_FAM(n, means, A, sd_g = 0.5, stdx = TRUE)
str(dat)
#> List of 4
#>  $ X  : num [1:510, 1:2000] -1.423 -0.358 -1.204 -0.556 0.83 ...
#>   ..- attr(*, "dimnames")=List of 2
#>   .. ..$ : NULL
#>   .. ..$ : chr [1:2000] "V1" "V2" "V3" "V4" ...
#>  $ muj: num [1:2000, 1:3] -0.456 0 -0.456 -0.376 -0.376 ...
#>  $ SGM: num [1:2000, 1:2000] 0.584 0.597 0 0 0 ...
#>  $ y  : int [1:510] 1 2 3 1 2 3 1 2 3 1 ...

Look at the correlation between features:

# require(corrplot)
cor(dat$X[ , 1:11]) %>% corrplot::corrplot(tl.pos = "n")

Split the data into training and testing sets:

set.seed(SEED)
dat <- split_data(dat$X, dat$y, n.train = 500)
str(dat)
#> List of 4
#>  $ x.tr: num [1:500, 1:2000] 0.889 -0.329 1.58 0.213 0.214 ...
#>   ..- attr(*, "dimnames")=List of 2
#>   .. ..$ : NULL
#>   .. ..$ : chr [1:2000] "V1" "V2" "V3" "V4" ...
#>  $ y.tr: int [1:500] 2 3 2 1 2 3 3 3 1 2 ...
#>  $ x.te: num [1:10, 1:2000] 0.83 -0.555 1.041 -1.267 1.15 ...
#>   ..- attr(*, "dimnames")=List of 2
#>   .. ..$ : NULL
#>   .. ..$ : chr [1:2000] "V1" "V2" "V3" "V4" ...
#>  $ y.te: int [1:10] 2 3 2 1 2 2 2 1 2 3

2 Model Fitting

Fit a HTLR model with all default settings:

set.seed(SEED)
system.time(
  fit.t <- htlr(dat$x.tr, dat$y.tr)
)
#>    user  system elapsed 
#> 150.604   0.093  19.445
print(fit.t)
#> Fitted HTLR model 
#> 
#>  Data:
#> 
#>   response:  3-class
#>   observations:  500
#>   predictors:    2001 (w/ intercept)
#>   standardised:  TRUE 
#> 
#>  Model:
#> 
#>   prior dist:    t (df = 1, log(w) = -10.0)
#>   init state:    lasso 
#>   burn-in:   1000
#>   sample:    1000 (posterior sample size) 
#> 
#>  Estimates:
#> 
#>   model size:    4 (w/ intercept)
#>   coefficients: see help('summary.htlr.fit')

With another configuration:

set.seed(SEED)
system.time(
  fit.t2 <- htlr(X = dat$x.tr, y = dat$y.tr, 
                 prior = htlr_prior("t", df = 1, logw = -20, sigmab0 = 1500), 
                 iter = 4000, init = "bcbc", keep.warmup.hist = T)
)
#>    user  system elapsed 
#> 248.045   1.075  33.179
print(fit.t2)
#> Fitted HTLR model 
#> 
#>  Data:
#> 
#>   response:  3-class
#>   observations:  500
#>   predictors:    2001 (w/ intercept)
#>   standardised:  TRUE 
#> 
#>  Model:
#> 
#>   prior dist:    t (df = 1, log(w) = -20.0)
#>   init state:    bcbc 
#>   burn-in:   2000
#>   sample:    2000 (posterior sample size) 
#> 
#>  Estimates:
#> 
#>   model size:    4 (w/ intercept)
#>   coefficients: see help('summary.htlr.fit')

3 Model Inspection

Look at the point summaries of posterior of selected parameters:

summary(fit.t2, features = c(1:10, 100, 200, 1000, 2000), method = median)
#>                 class 2       class 3
#> Intercept -3.5440246641 -0.9141670025
#> V1        11.0882079079 -0.0686738999
#> V2        -6.9590455998 -0.0132882097
#> V3        -0.0184985178  3.6755150379
#> V4        -0.0040381365  0.0056580000
#> V5        -0.0094181202 -0.0132027600
#> V6        -0.0030070886 -0.0077282568
#> V7         0.0043427966  0.0193755283
#> V8        -0.0049296905 -0.0042783070
#> V9         0.0058325713  0.0052551571
#> V10       -0.0007789674  0.0102389732
#> V100      -0.0009377969 -0.0009939219
#> V200       0.0008059268  0.0048683453
#> V1000     -0.0001098662  0.0038746255
#> V2000      0.0067078764 -0.0063507908
#> attr(,"stats")
#> [1] "median"

Plot interval estimates from posterior draws using bayesplot:

post.t <- as.matrix(fit.t2, k = 2)
## signal parameters
mcmc_intervals(post.t, pars = c("Intercept", "V1", "V2", "V3", "V1000"))

Trace plot of MCMC draws:

as.matrix(fit.t2, k = 2, include.warmup = T) %>%
  mcmc_trace(c("V1", "V1000"), facet_args = list("nrow" = 2), n_warmup = 2000)