Community typing with Dirichlet Multinomial Mixtures

Dirichlet Multinomial Mixtures (DMM) (Quince et al. 2012) is a probabilistic method for community typing (or clustering) of microbial community profiling data. It is an infinite mixture model, which means that the method can infer the optimal number of community types. Note that the number of community types is likely to grow with data size.

Let us load example data.

library(microbiome)
library(DirichletMultinomial)
library(reshape2)
library(magrittr)
library(dplyr)
# Load example data
data(dietswap)
pseq <- dietswap

# To speed up, only consider the core taxa
# that are prevalent at 0.1% relative abundance in 50% of the samples
# (note that this is not strictly correct as information is
# being discarded; one alternative would be to aggregate rare taxa)
pseq.comp <- microbiome::transform(pseq, "compositional")
taxa <- core_members(pseq.comp, detection = 0.1/100, prevalence = 50/100)
pseq <- prune_taxa(taxa, pseq)

# Pick the OTU count matrix
# and convert it into samples x taxa format
dat <- abundances(pseq)
count <- as.matrix(t(dat))

Fit the DMM model. Let us set the maximum allowed number of community types to 3 to speed up the example.

fit <- lapply(1:3, dmn, count = count, verbose=TRUE)
##   Soft kmeans
##   Expectation Maximization setup
##   Expectation Maximization
##   Hessian
##   Soft kmeans
##     iteration 10 change 0.000020
##   Expectation Maximization setup
##   Expectation Maximization
##   Hessian
##   Soft kmeans
##     iteration 10 change 0.005320
##     iteration 20 change 0.000003
##   Expectation Maximization setup
##   Expectation Maximization
##     iteration 10 change 0.000062
##   Hessian

Check model fit with different number of mixture components using standard information criteria

lplc <- base::sapply(fit, DirichletMultinomial::laplace) # AIC / BIC / Laplace
aic  <- base::sapply(fit, DirichletMultinomial::AIC) # AIC / BIC / Laplace
bic  <- base::sapply(fit, DirichletMultinomial::BIC) # AIC / BIC / Laplace
#plot(lplc, type="b", xlab="Number of Dirichlet Components", ylab="Model Fit")
#lines(aic, type="b", lty = 2)
#lines(bic, type="b", lty = 3)

Pick the optimal model

best <- fit[[which.min(unlist(lplc))]]

Mixture parameters pi and theta

mixturewt(best)
##          pi     theta
## 1 0.3738026 159.10479
## 2 0.3188925  81.91144
## 3 0.3073049  64.24718

Sample-component assignments

ass <- apply(mixture(best), 1, which.max)

Contribution of each taxonomic group to each component

for (k in seq(ncol(fitted(best)))) {
  d <- melt(fitted(best))
  colnames(d) <- c("OTU", "cluster", "value")
  d <- subset(d, cluster == k) %>%
     # Arrange OTUs by assignment strength
     arrange(value) %>%
     mutate(OTU = factor(OTU, levels = unique(OTU))) %>%
     # Only show the most important drivers
     filter(abs(value) > quantile(abs(value), 0.8))     

  p <- ggplot(d, aes(x = OTU, y = value)) +
       geom_bar(stat = "identity") +
       coord_flip() +
       labs(title = paste("Top drivers: community type", k))
  print(p)
}