Dirichlet Multinomial Mixtures (DMM) (Quince et al. 2012) is a probabilistic method for community typing (or clustering) of microbial community profiling data. It is an infinite mixture model, which means that the method can infer the optimal number of community types. Note that the number of community types is likely to grow with data size.
Let us load example data.
library(microbiome)
library(DirichletMultinomial)
library(reshape2)
library(magrittr)
library(dplyr)
# Load example data
data(dietswap)
pseq <- dietswap
# To speed up, only consider the core taxa
# that are prevalent at 0.1% relative abundance in 50% of the samples
# (note that this is not strictly correct as information is
# being discarded; one alternative would be to aggregate rare taxa)
pseq.comp <- microbiome::transform(pseq, "compositional")
taxa <- core_members(pseq.comp, detection = 0.1/100, prevalence = 50/100)
pseq <- prune_taxa(taxa, pseq)
# Pick the OTU count matrix
# and convert it into samples x taxa format
dat <- abundances(pseq)
count <- as.matrix(t(dat))
Fit the DMM model. Let us set the maximum allowed number of community types to 3 to speed up the example.
fit <- lapply(1:3, dmn, count = count, verbose=TRUE)
## Soft kmeans
## Expectation Maximization setup
## Expectation Maximization
## Hessian
## Soft kmeans
## iteration 10 change 0.000020
## Expectation Maximization setup
## Expectation Maximization
## Hessian
## Soft kmeans
## iteration 10 change 0.005320
## iteration 20 change 0.000003
## Expectation Maximization setup
## Expectation Maximization
## iteration 10 change 0.000062
## Hessian
Check model fit with different number of mixture components using standard information criteria
lplc <- base::sapply(fit, DirichletMultinomial::laplace) # AIC / BIC / Laplace
aic <- base::sapply(fit, DirichletMultinomial::AIC) # AIC / BIC / Laplace
bic <- base::sapply(fit, DirichletMultinomial::BIC) # AIC / BIC / Laplace
#plot(lplc, type="b", xlab="Number of Dirichlet Components", ylab="Model Fit")
#lines(aic, type="b", lty = 2)
#lines(bic, type="b", lty = 3)
Pick the optimal model
best <- fit[[which.min(unlist(lplc))]]
Mixture parameters pi and theta
mixturewt(best)
## pi theta
## 1 0.3738026 159.10479
## 2 0.3188925 81.91144
## 3 0.3073049 64.24718
Sample-component assignments
ass <- apply(mixture(best), 1, which.max)
Contribution of each taxonomic group to each component
for (k in seq(ncol(fitted(best)))) {
d <- melt(fitted(best))
colnames(d) <- c("OTU", "cluster", "value")
d <- subset(d, cluster == k) %>%
# Arrange OTUs by assignment strength
arrange(value) %>%
mutate(OTU = factor(OTU, levels = unique(OTU))) %>%
# Only show the most important drivers
filter(abs(value) > quantile(abs(value), 0.8))
p <- ggplot(d, aes(x = OTU, y = value)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = paste("Top drivers: community type", k))
print(p)
}