See also related functions for the analysis of rare and variable taxa (rare_members; rare_abundance; rare_members; rare_abundance; low_abundance).

library("devtools")
#install_github("microbiome/microbiome")

HITChip Data

Load example data:

# Load data
library(microbiome)
data(peerj32)

# Rename the data
pseq <- peerj32$phyloseq

# Calculate compositional version of the data
# (relative abundances)
pseq.rel <- microbiome::transform(pseq, "compositional")

Prevalence of taxonomic groups

Relative population frequencies; at 1% compositional abundance threshold:

head(prevalence(pseq.rel, detection = 1/100, sort = TRUE))
## Roseburia intestinalis et rel.     Eubacterium hallii et rel.     Clostridium nexile et rel.     Ruminococcus obeum et rel. 
##                      1.0000000                      1.0000000                      1.0000000                      0.9772727 
##   Coprococcus eutactus et rel.  Ruminococcus lactaris et rel. 
##                      0.9772727                      0.9545455

Absolute population frequencies (sample count):

head(prevalence(pseq.rel, detection = 1/100, sort = TRUE, count = TRUE))
## Roseburia intestinalis et rel.     Eubacterium hallii et rel.     Clostridium nexile et rel.     Ruminococcus obeum et rel. 
##                             44                             44                             44                             43 
##   Coprococcus eutactus et rel.  Ruminococcus lactaris et rel. 
##                             43                             42

Core microbiota analysis

If you only need the names of the core taxa, do as follows. This returns the taxa that exceed the given prevalence and detection thresholds.

core.taxa.standard <- core_members(pseq.rel, detection = 0, prevalence = 50/100)

A full phyloseq object of the core microbiota is obtained as follows:

pseq.core <- core(pseq.rel, detection = 0, prevalence = .5)

We can also collapse the rare taxa into an “Other” category

pseq.core2 <- aggregate_rare(pseq.rel, "Genus", detection = 0, prevalence = .5)

Retrieving the core taxa names from the phyloseq object:

core.taxa <- taxa(pseq.core)

Core abundance and diversity

Total core abundance in each sample (sum of abundances of the core members):

core.abundance <- sample_sums(core(pseq.rel, detection = .01, prevalence = .95))

Core visualization

Core line plots

Determine core microbiota across various abundance/prevalence thresholds with the blanket analysis (Salonen et al. CMI, 2012) based on various signal and prevalences.

# With compositional (relative) abundances
det <- c(0, 0.1, 0.5, 2, 5, 20)/100
prevalences <- seq(.05, 1, .05)
 #ggplot(d) + geom_point(aes(x, y)) + scale_x_continuous(trans="log10", limits=c(NA,1))


plot_core(pseq.rel, 
          prevalences = prevalences, 
          detections = det, 
          plot.type = "lineplot") + 
  xlab("Relative Abundance (%)")

Core heatmaps

This visualization method has been used for instance in Intestinal microbiome landscaping: Insight in community assemblage and implications for microbial modulation strategies. Shetty et al. FEMS Microbiology Reviews fuw045, 2017.

Note that you can order the taxa on the heatmap with the taxa.order argument.

# Core with compositionals:
library(RColorBrewer)
library(reshape)

prevalences <- seq(.05, 1, .05)

detections <- round(10^seq(log10(0.01), log10(.2), length = 9), 3)

# Also define gray color palette
gray <- gray(seq(0,1,length=5))

#Added pseq.rel, I thin... must be checked if it was in the the rednred version,; where it is initialized
#pseq.rel<- microbiome::transform(pseq, 'compositional')
#min-prevalence gets the 100th highest prevalence
p <- plot_core(pseq.rel,
               plot.type = "heatmap", 
               colours = gray,
               prevalences = prevalences, 
               detections = detections, 
               min.prevalence = prevalence(pseq.rel, sort = TRUE)[100]) +
  labs(x = "Detection Threshold\n(Relative Abundance (%))") +
    
  #Adjusts axis text size and legend bar height
  theme(axis.text.y= element_text(size=8, face="italic"),
        axis.text.x.bottom=element_text(size=8),
        axis.title = element_text(size=10),
        legend.text = element_text(size=8),
        legend.title = element_text(size=10))

print(p)

# Core with absolute counts and horizontal view:
# and minimum population prevalence (given as percentage)
detections <- seq(from = 50, to = round(max(abundances(pseq))/10, -1), by = 100)

p <- plot_core(pseq, plot.type = "heatmap",
               prevalences = prevalences,
               detections = detections,
               colours = rev(brewer.pal(5, "Spectral")),
               min.prevalence = .2, horizontal = TRUE) +
  theme(axis.text.x= element_text(size=8, face="italic", hjust=1),
        axis.text.y= element_text(size=8),
        axis.title = element_text(size=10),
        legend.text = element_text(size=8),
        legend.title = element_text(size=10))

print(p)

Core Microbiota using Amplicon data

Make phyloseq object

This tutorial is useful for analysis of output files from (Mothur), (QIIME or QIIME2) or any tool that gives a biom file as output. There is also a simple way to read comma seperated (*.csv) files.

Simple comma seperated files:

library(microbiome)


otu.file <-
    system.file("extdata/qiita1629_otu_table.csv",
        package='microbiome')

tax.file <- system.file("extdata/qiita1629_taxonomy_table.csv",
        package='microbiome')

meta.file <- system.file("extdata/qiita1629_mapping_subset.csv",
        package='microbiome')

pseq.csv <- read_phyloseq(
          otu.file=otu.file, 
          taxonomy.file=tax.file, 
          metadata.file=meta.file, type = "simple")

Biom file:

# Read the biom file
biom.file <- 
  system.file("extdata/qiita1629.biom", 
              package = "microbiome")

# Read the mapping/metadata file
 meta.file <- 
  system.file("extdata/qiita1629_mapping.csv", 
              package = "microbiome")
# Make phyloseq object
pseq.biom <- read_phyloseq(otu.file = biom.file, 
                         metadata.file = meta.file, 
                         taxonomy.file = NULL, type = "biom")

Mothur shared OTUs and Consensus Taxonomy:

otu.file <- system.file(
 "extdata/Baxter_FITs_Microbiome_2016_fit.final.tx.1.subsample.shared",
    package='microbiome')

tax.file <- system.file(
 "extdata/Baxter_FITs_Microbiome_2016_fit.final.tx.1.cons.taxonomy",
    package='microbiome')

meta.file <- system.file(
 "extdata/Baxter_FITs_Microbiome_2016_mapping.csv",
    package='microbiome')
 
pseq.mothur <- read_phyloseq(otu.file=otu.file,
        taxonomy.file =tax.file,
        metadata.file=meta.file, type = "mothur")
print(pseq.mothur)

Now, we proceed to core microbiota analysis.

Core microbiota analysis

Here the data from Caporaso, J. Gregory, et al. “Moving pictures of the human microbiome.” Genome biology 12.5 (2011): R50. will be used which is stored as example in jeevanuDB

# install
# install.packages("devtools")
# devtools::install_github("microsud/jeevanuDB")

# check the data 
library(jeevanuDB)
ps <- moving_pictures
table(meta(ps)$sample_type, meta(ps)$host_subject_id)
##         
##           F4  M3
##   saliva 135 373
##   skin   268 723
##   stool  131 336
# Filter the data to include only gut samples from M3 subject
ps.m3 <- subset_samples(ps, sample_type == "stool" & host_subject_id == "M3") 
print(ps.m3)
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3644 taxa and 336 samples ]
## sample_data() Sample Data:       [ 336 samples by 89 sample variables ]
## tax_table()   Taxonomy Table:    [ 3644 taxa by 7 taxonomic ranks ]
# keep only taxa with positive sums
ps.m3 <- prune_taxa(taxa_sums(ps.m3) > 0, ps.m3)
print(ps.m3)
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 611 taxa and 336 samples ]
## sample_data() Sample Data:       [ 336 samples by 89 sample variables ]
## tax_table()   Taxonomy Table:    [ 611 taxa by 7 taxonomic ranks ]
# Calculate compositional version of the data
# (relative abundances)
ps.m3.rel <- microbiome::transform(ps.m3, "compositional")

Output of deblur/dada2 will most likely have seqs as rownames instead of OTU ids or taxa names

taxa_names(ps.m3.rel)[1:2]
## [1] "TACGGAGGATTCAAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTTTGATAAGTTAGAGGTGAAATACCGGTGCTTAACACCGGAACTGCCTCTAATACTGTTGAACTAGAGAGTAGTTGCGGTAGGCGGAATGTATGG"
## [2] "TACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTTTTGTAAGTCAGATGTGAAATCCCCGAGCTCAACTTGGGAACTGCGTTTGAAACTACAAGACTAGAATATGTCAGAGGGGGGTAGAATTCCACG"

We can change it to ASVIDs

ps.m3.rel <- microbiome::add_refseq(ps.m3.rel)
# Check if ref_seq slot is added to phyloseq object
print(ps.m3.rel)
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 611 taxa and 336 samples ]
## sample_data() Sample Data:       [ 336 samples by 89 sample variables ]
## tax_table()   Taxonomy Table:    [ 611 taxa by 7 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 611 reference sequences ]
# now check taxa names are ASVids
taxa_names(ps.m3.rel)[1:3]
## [1] "ASV1" "ASV2" "ASV3"

Core microbiota analysis

If you only need the names of the core taxa, do as follows. This returns the taxa that exceed the given prevalence and detection thresholds.

core.taxa.standard <- core_members(ps.m3.rel, detection = 0.0001, prevalence = 50/100)

core.taxa.standard
##  [1] "ASV20"  "ASV34"  "ASV35"  "ASV55"  "ASV66"  "ASV88"  "ASV107" "ASV119" "ASV141" "ASV148" "ASV167" "ASV174" "ASV197" "ASV225"
## [15] "ASV226" "ASV233" "ASV260" "ASV279" "ASV293" "ASV303" "ASV317" "ASV326" "ASV329" "ASV330" "ASV341" "ASV352" "ASV358" "ASV361"
## [29] "ASV380" "ASV387" "ASV395" "ASV399" "ASV400" "ASV402" "ASV428" "ASV439" "ASV476" "ASV486" "ASV494" "ASV568" "ASV579" "ASV580"
## [43] "ASV588" "ASV605"

We notice that ASV ids by themselves are not informative in this case. In this phyloseq object, the unclassified taxonomic values have a pattern like `k__``to represent kingdom level and so on. We need to change these to NAs. This can be variable depending on your data.

# first combine genus and species names. 
tax_table(ps.m3.rel)[tax_table(ps.m3.rel) == "k__"] <- NA
tax_table(ps.m3.rel)[tax_table(ps.m3.rel) == "p__"] <- NA
tax_table(ps.m3.rel)[tax_table(ps.m3.rel) == "c__"] <- NA
tax_table(ps.m3.rel)[tax_table(ps.m3.rel) == "o__"] <- NA
tax_table(ps.m3.rel)[tax_table(ps.m3.rel) == "f__"] <- NA
tax_table(ps.m3.rel)[tax_table(ps.m3.rel) == "g__"] <- NA
tax_table(ps.m3.rel)[tax_table(ps.m3.rel) == "s__"] <- NA

tax_table(ps.m3.rel)[, colnames(tax_table(ps.m3.rel))] <- gsub(tax_table(ps.m3.rel)[, colnames(tax_table(ps.m3.rel))],  pattern = "[a-z]__", replacement = "")

# Use the microbiome function add_besthit to get taxonomic identities of ASVs.
ps.m3.rel.f <- microbiome::add_besthit(ps.m3.rel)

# Check 
taxa_names(ps.m3.rel.f)[1:10]
##  [1] "ASV1:Rikenellaceae"         "ASV2:Methylotenera.mobilis" "ASV3:Blautia"               "ASV4:Peptoniphilus"        
##  [5] "ASV5:Lachnospiraceae"       "ASV6:Clostridium"           "ASV7:Lachnospiraceae"       "ASV8:Clostridiales"        
##  [9] "ASV9:Lachnospiraceae"       "ASV10:Streptococcus"

Now we add the best taxonomic classification available.

core.taxa.standard <- core_members(ps.m3.rel.f, detection = 0.0001, prevalence = 50/100)

core.taxa.standard
##  [1] "ASV20:Faecalibacterium.prausnitzii"  "ASV34:Oscillospira"                  "ASV35:Odoribacter"                  
##  [4] "ASV55:Lachnospiraceae"               "ASV66:Lachnospiraceae"               "ASV88:Rikenellaceae"                
##  [7] "ASV107:Roseburia"                    "ASV119:Faecalibacterium.prausnitzii" "ASV141:Oscillospira"                
## [10] "ASV148:Faecalibacterium"             "ASV167:Faecalibacterium"             "ASV174:Lachnospiraceae"             
## [13] "ASV197:Phascolarctobacterium"        "ASV225:Oscillospira"                 "ASV226:Oscillospira"                
## [16] "ASV233:[Ruminococcus].torques"       "ASV260:Oscillospira"                 "ASV279:Blautia"                     
## [19] "ASV293:Holdemania"                   "ASV303:Bacteroides"                  "ASV317:Rikenellaceae"               
## [22] "ASV326:Oscillospira"                 "ASV329:Ruminococcus"                 "ASV330:Faecalibacterium.prausnitzii"
## [25] "ASV341:Oscillospira"                 "ASV352:Erysipelotrichaceae"          "ASV358:Lachnospira"                 
## [28] "ASV361:Oscillospira"                 "ASV380:Bacteroides.ovatus"           "ASV387:Clostridiales"               
## [31] "ASV395:Ruminococcaceae"              "ASV399:Oscillospira"                 "ASV400:Parabacteroides"             
## [34] "ASV402:Clostridium.colinum"          "ASV428:Blautia.obeum"                "ASV439:Coprococcus"                 
## [37] "ASV476:Parabacteroides"              "ASV486:Ruminococcus"                 "ASV494:Roseburia"                   
## [40] "ASV568:Bacteroides.caccae"           "ASV579:Oscillospira"                 "ASV580:Sutterella"                  
## [43] "ASV588:Lachnospiraceae"              "ASV605:Coprococcus"

A full phyloseq object of the core microbiota is obtained as follows:

pseq.core <- core(ps.m3.rel.f, detection = 0.0001, prevalence = .5)

Retrieving the associated taxonomy from the phyloseq object:

core.taxa <- taxa(pseq.core)
class(core.taxa)
## [1] "character"
# get the taxonomy data
tax.mat <- tax_table(pseq.core)
tax.df <- as.data.frame(tax.mat)

# add the OTus to last column
tax.df$OTU <- rownames(tax.df)

# select taxonomy of only 
# those OTUs that are core memebers based on the thresholds that were used.
core.taxa.class <- dplyr::filter(tax.df, rownames(tax.df) %in% core.taxa)
knitr::kable(head(core.taxa.class))
Domain Phylum Class Order Family Genus Species OTU
ASV20:Faecalibacterium.prausnitzii Bacteria Firmicutes Clostridia Clostridiales Ruminococcaceae Faecalibacterium prausnitzii ASV20:Faecalibacterium.prausnitzii
ASV34:Oscillospira Bacteria Firmicutes Clostridia Clostridiales Ruminococcaceae Oscillospira NA ASV34:Oscillospira
ASV35:Odoribacter Bacteria Bacteroidetes Bacteroidia Bacteroidales [Odoribacteraceae] Odoribacter NA ASV35:Odoribacter
ASV55:Lachnospiraceae Bacteria Firmicutes Clostridia Clostridiales Lachnospiraceae NA NA ASV55:Lachnospiraceae
ASV66:Lachnospiraceae Bacteria Firmicutes Clostridia Clostridiales Lachnospiraceae NA NA ASV66:Lachnospiraceae
ASV88:Rikenellaceae Bacteria Bacteroidetes Bacteroidia Bacteroidales Rikenellaceae NA NA ASV88:Rikenellaceae

Core visualization

Core line plots

Determine core microbiota across various abundance/prevalence thresholds with the blanket analysis (Salonen et al. CMI, 2012) based on various signal and prevalences.

# With compositional (relative) abundances
det <- c(0, 0.1, 0.5, 2, 5, 20)/100
prevalences <- seq(.05, 1, .05)

plot_core(ps.m3.rel.f, prevalences = prevalences, 
          detections = det, plot.type = "lineplot") + 
  xlab("Relative Abundance (%)") + 
  theme_bw()

Core heatmaps

This visualization method has been used for instance in Intestinal microbiome landscaping: Insight in community assemblage and implications for microbial modulation strategies. Shetty et al. FEMS Microbiology Reviews fuw045, 2017.

Note that you can order the taxa on the heatmap with the order.taxa argument.

# Core with compositionals:
prevalences <- seq(.05, 1, .05)
detections <- round(10^seq(log10(1e-2), log10(.2), length = 10), 3)

#Deletes "ASV" from taxa_names, e.g. ASV1 --> 1
#taxa_names(ps.m3.rel) = taxa_names(ps.m3.rel) %>% str_replace("ASV", "")
# Also define gray color palette
gray <- gray(seq(0,1,length=5))

p1 <- plot_core(ps.m3.rel.f,
  plot.type = "heatmap",
  colours = gray,
  prevalences = prevalences,
  detections = detections, min.prevalence = .5) +
  xlab("Detection Threshold (Relative Abundance (%))")

p1 <- p1 + theme_bw() + ylab("ASVs")
p1

Using viridis color palette

library(viridis)
print(p1 + scale_fill_viridis())
## Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.

Genus level

ps.m3.rel.gen <- aggregate_taxa(ps.m3.rel, "Genus")

# Check if any taxa with no genus classification. aggregate_taxa will merge all unclassified to Unknown
any(taxa_names(ps.m3.rel.gen) == "Unknown")
## [1] TRUE
# Remove Unknown
ps.m3.rel.gen <- subset_taxa(ps.m3.rel.gen, Genus!="Unknown")
library(RColorBrewer)
prevalences <- seq(.05, 1, .05)
detections <- round(10^seq(log10(1e-5), log10(.2), length = 10), 3)

p1 <- plot_core(ps.m3.rel.gen, 
                plot.type = "heatmap", 
                colours = rev(brewer.pal(5, "RdBu")),
                prevalences = prevalences, 
                detections = detections, min.prevalence = .5) +
    xlab("Detection Threshold (Relative Abundance (%))")
p1 <- p1 + theme_bw() + ylab("ASVs")
p1

Some taxa name are long. Shorten them as follows and plot.

taxa_names(ps.m3.rel.gen) <- gsub("Bacteria_Firmicutes_Clostridia_Clostridiales_",
                                  "", taxa_names(ps.m3.rel.gen))
p1 <- plot_core(ps.m3.rel.gen, 
                plot.type = "heatmap", 
                colours = rev(brewer.pal(5, "RdBu")),
                prevalences = prevalences, 
                detections = detections, min.prevalence = .5) +
    xlab("Detection Threshold (Relative Abundance (%))") + 
  theme_bw() +
  theme(axis.text.x = element_text(angle=90),
        axis.text.y = element_text(face = "italic"))

p1