4 Bar Plots

Bar plots are commonly used to visualize the relative abundance of taxa or other features within different groups or conditions in microbiome data. In this section, we will demonstrate different methods for plotting bar plots.

4.1 Explore categorical variables

The freq() function is in the funModeling package provides a concise summary of the frequency distribution of a categorical variable in a dataset. It displays the count and percentage of each category, allowing users to quickly understand the distribution and prevalence of different categories within the variable. This function is useful for exploratory data analysis and understanding the composition of categorical variables in the dataset.

library(phyloseq)
library(dplyr)
library(microbiome)
library(funModeling)

df <-ps_df %>% 
  select(-sample_id) %>% 
  rename_all(tolower)

freq(df, 
     input = c("sex", "nationality", "grp", "bmi"), 
     plot = TRUE,
     na.rm = FALSE, 
     # path_out="figures"
     )
     sex frequency percentage cumulative_perc
1   male     49604      54.09           54.09
2 female     42096      45.91          100.00
  nationality frequency percentage cumulative_perc
1         AAM     50468      55.04           55.04
2         AFR     41232      44.96          100.00
  grp frequency percentage cumulative_perc
1  ED     31080      33.89           33.89
2  HE     31068      33.88           67.77
3  DI     29552      32.23          100.00
         bmi frequency percentage cumulative_perc
1      obese     37080      40.44           40.44
2 overweight     31320      34.15           74.59
3       lean     23300      25.41          100.00
[1] "Variables processed: sex, nationality, grp, bmi"

4.2 Comparative results from Mothur and QIIME2

# scripts/create_barplots.R
library(readr)
library(dplyr)
library(ggplot2)
library(svglite)

read_csv("../imap-data-preparation/data/mothur/mothur_composite.csv", show_col_types = FALSE) %>% 
  mutate(pipeline="MOTHUR", .before=2) %>% 
  select(-OTU, -3) %>% 
  ggplot(aes(x=Genus, y=rel_abund, fill=pipeline)) +
  facet_grid(~pipeline) +
  geom_col() +
  coord_flip() +
  labs(y="Relative Abundance (%)") +
  theme(legend.position="none")


read_csv("../imap-data-preparation/data/qiime2/qiime2_composite.csv", show_col_types = FALSE) %>% 
  mutate(pipeline="QIIME2", .before=2) %>% 
  select(-feature, -c(3:14)) %>% 
  ggplot(aes(x=Genus, y=rel_abund, fill=pipeline)) +
  facet_grid(~pipeline) +
  geom_col() +
  coord_flip() +
  labs(y="Relative Abundance (%)") + 
  theme(legend.position="none")



ggsave(file="figures/taxon_barplot.png", width=10, height=10)
ggsave(file="figures/taxon_barplot.svg", width=10, height=10)

4.3 Compute sequence count per sample

Here we only show the head and tail count data

seqcount_per_sample <- ps_df %>%
  group_by(sample_id) %>% 
  summarise(nseqs = sum(count), .groups = "drop") %>% 
  arrange(-nseqs)

head_tail <- rbind(head(seqcount_per_sample, 5), tail(seqcount_per_sample, 5))

max_y <- max(head_tail$nseqs)

head_tail %>% 
  mutate(sample_id = factor(sample_id),
         sample_id = fct_reorder(sample_id, nseqs, .desc = F),
         sample_id = fct_shift(sample_id, n = 0)) %>%
  ggplot(aes(x = reorder(sample_id, nseqs), y = nseqs)) +
  geom_col(position = position_stack(), fill = "steelblue") +
  labs(x = "sample ID", y = "Number of sequences", subtitle = "Phylum: Top and bottom sequence count \nwith no data labels") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))

ggsave("figures/basic_barplot.png", width=5, height=4)


#---------------------------------------


head_tail %>% 
  mutate(sample_id = factor(sample_id),
         sample_id = fct_reorder(sample_id, nseqs, .desc = F),
         sample_id = fct_shift(sample_id, n = 0)) %>%
  ggplot(aes(x = reorder(sample_id, nseqs), y = nseqs)) +
  geom_col(position = position_stack(), fill = "steelblue") +
  labs(x = "sample ID", y = "Number of sequences", subtitle = "Phylum: Top and bottom sequence count \nwith data labels") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8)) +
  coord_cartesian(ylim = c(0, max_y)) +
  geom_text(aes(label = nseqs), vjust = -0.3, color = "#AAAAAA")

ggsave("figures/barplot_w_labels.png", width=5, height=4)

4.4 Relative abundance of taxa

The comp_barplot function in the microViz R package is designed to create comparative bar plots for visualizing microbiome data. This function allows users to compare the relative abundance of taxa or other features across different groups or conditions. It supports customization options such as color palette selection and plot title specification to enhance the appearance of the bar plot.

library(microViz)

psextra_raw %>%
  comp_barplot(
    tax_level = "Genus", n_taxa = 15, other_name = "Other",
    taxon_renamer = function(x) stringr::str_remove(x, " [ae]t rel."),
    palette = distinct_palette(n = 15, add = "grey90"),
    merge_other = FALSE, bar_outline_colour = "darkgrey"
  ) +
  coord_flip() +
  facet_wrap("nationality", nrow = 1, scales = "free") +
  labs(x = NULL, y = NULL) +
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())

4.5 Bar plot on ps_caporaso dataset

library(microViz)

ps_caporaso %>%
  tax_fix() %>% 
  comp_barplot(
    tax_level = "Genus", n_taxa = 15, other_name = "Other",
    taxon_renamer = function(x) stringr::str_remove(x, " [ae]t rel."),
    palette = distinct_palette(n = 15, add = "grey90"),
    merge_other = FALSE, bar_outline_colour = "darkgrey"
  ) +
  coord_flip() +
  # facet_wrap("SampleType", nrow = 1, scales = "free") +
  labs(x = NULL, y = NULL) +
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())

4.6 Bar plots for Taxa relative abundane

set.seed(1234)
library(tidyverse)
library(readxl)
library(ggtext)
library(RColorBrewer)



otu_rel_abund <- ps_df %>%
  mutate(nationality = factor(nationality, 
                      levels = c("AAM", "AFR"),
                      labels = c("African American", "African")),
         bmi = factor(bmi,
                      levels = c("lean", "overweight", "obese"),
                      labels = c("Lean", "Overweight", "Obese")),
         sex = factor(sex,
                      levels =c("female", "male"),
                      labels = c("Female", "Male")))

save(otu_rel_abund, file = "data/otu_rel_abund.rda")

phylum_rel_abund <- otu_rel_abund %>%
  filter(level=="phylum",
         !grepl(".*unassigned.*|.*nclassified.*|.*ncultured.*",taxon)) %>%
  group_by(bmi, sample_id, taxon) %>%
  summarise(rel_abund = sum(rel_abund), .groups="drop") %>%
  group_by(bmi, taxon) %>%
  summarise(mean_rel_abund = 100*mean(rel_abund), .groups="drop") %>%
  mutate(taxon = str_replace(taxon, "(.*)_unclassified", "Unclassified<br>*\\1*"),
         taxon = str_replace(taxon, "^$", "*\\1*"),
         taxon = str_replace(taxon, "_", " "))

## Pool low abundance to "Others"
taxon_pool <- phylum_rel_abund %>%
  group_by(taxon) %>%
  summarise(pool = max(mean_rel_abund) < 1,
            mean = mean(mean_rel_abund), .groups="drop")


inner_join(phylum_rel_abund, taxon_pool, by="taxon") %>%
  mutate(taxon = if_else(pool, "Other", taxon)) %>%
  group_by(bmi, taxon) %>%
  summarise(mean_rel_abund = sum(mean_rel_abund),
            mean = min(mean), .groups="drop") %>%
  mutate(taxon = factor(taxon),
         taxon = fct_reorder(taxon, mean, .desc=TRUE),
         taxon = fct_shift(taxon, n=1)) %>%
  ggplot(aes(x=bmi, y=mean_rel_abund, fill=taxon)) +
  geom_col(position = position_stack()) +
  scale_fill_discrete(name=NULL) +
  scale_x_discrete(breaks = c("lean", "overweight", "obese"), labels = c("Lean", "Overweight", "Obese")) +
  scale_fill_manual(name=NULL,
                    breaks=c("*Actinobacteria*","*Bacteroidetes*", "*Cyanobacteria*", "*Firmicutes*","*Proteobacteria*", "Other"),
                    labels = c("Actinobacteria","Bacteroidetes", "Cyanobacteria", "Firmicutes", "Proteobacteria", "Other"),
                    values = c(brewer.pal(5, "Dark2"), "gray")) +
  labs(x = NULL, y = "Mean Relative Abundance", subtitle = "Phyla stacked bars filled by taxon") +
  theme_classic() +
  theme(axis.text.x = element_markdown(),
        legend.text = element_markdown(),
        legend.key.size = unit(10, "pt"))

ggsave("figures/stacked_phyla.png", width=5, height=4)
#---------------------------------------

taxon_rel_abund <- otu_rel_abund %>%
  filter(level=="genus",
         !grepl(".*unassigned.*|.*nclassified.*|.*ncultured.*",taxon)) %>%
  group_by(bmi, sample_id, taxon) %>%
  summarise(rel_abund = sum(rel_abund), .groups="drop") %>%
  group_by(bmi, taxon) %>%
  summarise(mean_rel_abund = 100*mean(rel_abund), .groups="drop") %>%
  mutate(taxon = str_replace(taxon, "(.*)_unclassified", "Unclassified<br>*\\1*"),
         taxon = str_replace(taxon, "^$", "*\\1*"),
         taxon = str_replace(taxon, "_", " "))

#---------------------------------------

## Pool low abundance to "Others"

taxon_pool <- taxon_rel_abund %>%
  group_by(taxon) %>%
  summarise(pool = max(mean_rel_abund) < 2,
            mean = mean(mean_rel_abund), .groups="drop")


inner_join(taxon_rel_abund, taxon_pool, by="taxon") %>%
  mutate(taxon = if_else(pool, "Other", taxon)) %>%
  group_by(bmi, taxon) %>%
  summarise(mean_rel_abund = sum(mean_rel_abund),
            mean = min(mean), .groups="drop") %>%
  mutate(taxon = factor(taxon),
         taxon = fct_reorder(taxon, mean, .desc=TRUE),
         taxon = fct_shift(taxon, n=1)) %>%
  ggplot(aes(x=bmi, y=mean_rel_abund, fill=taxon)) +
  geom_col(position = position_stack()) +
  scale_fill_discrete(name=NULL) +
  scale_x_discrete(breaks = c("lean", "overweight", "obese"), labels = c("Lean", "Overweight", "Obese")) +
  labs(x = NULL, y = "Mean Relative Abundance", subtitle = "Taxa stacked bars filled by taxon") +
  theme_classic() +
  theme(axis.text.x = element_markdown(),
        legend.text = element_markdown(),
        legend.key.size = unit(10, "pt"))
ggsave("figures/stacked_taxa.png", width=5, height=4)

#---------------------------------------

phylum_rel_abund <- otu_rel_abund %>%
  filter(level=="phylum",
         !grepl(".*unassigned.*|.*nclassified.*|.*ncultured.*",taxon)) %>%
  group_by(bmi, sample_id, taxon) %>%
  summarise(rel_abund = sum(rel_abund), .groups="drop") %>%
  group_by(bmi, taxon) %>%
  summarise(mean_rel_abund = 100*mean(rel_abund), .groups="drop") %>%
  mutate(taxon = str_replace(taxon, "(.*)_unclassified", "Unclassified<br>*\\1*"),
         taxon = str_replace(taxon, "^$", "*\\1*"),
         taxon = str_replace(taxon, "_", " "))

## Pool low abundance to "Others"
taxon_pool <- phylum_rel_abund %>%
  group_by(taxon) %>%
  summarise(pool = max(mean_rel_abund) < 1,
            mean = mean(mean_rel_abund), .groups="drop")


inner_join(phylum_rel_abund, taxon_pool, by="taxon") %>%
  mutate(taxon = if_else(pool, "Other", taxon)) %>%
  group_by(bmi, taxon) %>%
  summarise(mean_rel_abund = sum(mean_rel_abund),
            mean = min(mean), .groups="drop") %>%
  mutate(taxon = factor(taxon),
         taxon = fct_reorder(taxon, mean, .desc=TRUE),
         taxon = fct_shift(taxon, n=1)) %>%
  ggplot(aes(y=taxon, x = mean_rel_abund, fill= bmi)) +
  geom_col(width=0.8, position = position_dodge()) +
  labs(y = NULL, x = "Mean Relative Abundance", subtitle = "Phyla grouped by BMI category", fill = NULL) +
  theme_classic() +
  theme(axis.text.x = element_markdown(angle = 0, hjust = 1, vjust = 1),
        axis.text.y = element_markdown(),
        legend.text = element_markdown(),
        legend.key.size = unit(12, "pt"),
        panel.background = element_blank(),
        # panel.grid.major.x =  element_line(colour = "lightgray", size = 0.1),
        panel.border = element_blank()) +
  guides(fill = guide_legend(ncol=1)) +
  scale_x_continuous(expand = c(0, 0))


ggsave("figures/grouped_phyla.png", width=5, height=4)

#---------------------------------------

taxon_rel_abund <- otu_rel_abund %>%
  filter(level=="genus",
         !grepl(".*unassigned.*|.*nclassified.*|.*ncultured.*",taxon)) %>%
  group_by(bmi, sample_id, taxon) %>%
  summarise(rel_abund = sum(rel_abund), .groups="drop") %>%
  group_by(bmi, taxon) %>%
  summarise(mean_rel_abund = 100*mean(rel_abund), .groups="drop") %>%
  mutate(taxon = str_replace(taxon, "(.*)_unclassified", "Unclassified<br>*\\1*"),
         taxon = str_replace(taxon, "^$", "*\\1*"),
         taxon = str_replace(taxon, "_", " "))

## Pool low abundance to "Others"

taxon_pool <- taxon_rel_abund %>%
  group_by(taxon) %>%
  summarise(pool = max(mean_rel_abund) < 2,
            mean = mean(mean_rel_abund), .groups="drop")

## Plotting
inner_join(taxon_rel_abund, taxon_pool, by="taxon") %>%
  mutate(taxon = if_else(pool, "Other", taxon)) %>%
  group_by(bmi, taxon) %>%
  summarise(mean_rel_abund = sum(mean_rel_abund),
            mean = min(mean), .groups="drop") %>%
  mutate(taxon = factor(taxon),
         taxon = fct_reorder(taxon, mean, .desc=TRUE),
         taxon = fct_shift(taxon, n=1)) %>%
  ggplot(aes(y=taxon, x = mean_rel_abund, fill= bmi)) +
  geom_col(width=0.8, position = position_dodge()) +
  labs(y = NULL, x = "Mean Relative Abundance", subtitle = "Taxa grouped by BMI category", fill = NULL) +
  theme_classic() +
  theme(axis.text.x = element_markdown(angle = 0, hjust = 1, vjust = 1),
        axis.text.y = element_markdown(),
        legend.text = element_markdown(),
        legend.key.size = unit(12, "pt"),
        panel.background = element_blank(),
        # panel.grid.major.x =  element_line(colour = "lightgray", size = 0.1),
        panel.border = element_blank()) +
  guides(fill = guide_legend(ncol=1)) +
  scale_x_continuous(expand = c(0, 0))


ggsave("figures/grouped_taxa.png", width=5, height=4)
#---------------------------------------

phylum <- ps_df %>%
  filter(level == "phylum") 

phylum_pool <- phylum %>%
  group_by(taxon) %>%
  summarise(pool = max(rel_abund) < 0.02,
            mean = mean(rel_abund), .groups="drop")  
  
inner_join(phylum, phylum_pool, by="taxon") %>%
  mutate(taxon = if_else(pool, "Other", taxon)) %>%
  ggplot(aes(x = sample_id, y = 100*rel_abund, fill = taxon)) +
  theme_bw() + 
  geom_bar(stat = "identity", position = position_stack()) +
  labs(x="Sample", y="Relative Abundance", subtitle = "Taxa Relative Abundance faceted by group", fill = NULL) +
  facet_grid(~ nationality, space = "free", scales = "free") +
  theme(axis.text.x = element_blank(),
        legend.text = element_markdown(),
        strip.background = element_rect(colour = "lightblue", fill = "lightblue")) +
  scale_fill_manual(name=NULL,
                    breaks=c("*Actinobacteria*","*Bacteroidetes*", "*Firmicutes*","*Proteobacteria*", "*Verrucomicrobia*", "Other"),
                    labels=c("*Actinobacteria*","*Bacteroidetes*", "*Firmicutes*","*Proteobacteria*", "*Verrucomicrobia*", "Other"),
                    values = c(brewer.pal(5, "Paired"), "gray"))

 
ggsave("figures/faceted_phyla_bar.png", width=5, height=4)
#---------------------------------------

genus <-  ps_df %>%
  filter(level == "genus") 

genus_pool <- genus %>%
  group_by(taxon) %>%
  summarise(pool = max(rel_abund) < 0.15,
            mean = mean(rel_abund), .groups="drop")  
  
inner_join(genus, genus_pool, by="taxon") %>%
  mutate(taxon = if_else(pool, "Other", taxon)) %>%
  ggplot(aes(x = sample_id, y = 100*rel_abund, fill = taxon)) +
  theme_bw() + 
  geom_bar(stat = "identity", position = position_stack()) +
  labs(x="Sample", y="Relative Abundance", subtitle = "Taxa Relative Abundance faceted by group", fill = NULL) +
  facet_grid(~ nationality, space = "free", scales = "free") +
  theme(axis.text.x = element_blank(),
        legend.text = element_markdown(),
        strip.background = element_rect(colour = "lightblue", fill = "lightblue"))


ggsave("figures/faceted_taxa_bar.png", width=5, height=4)
#---------------------------------------

ps_rel %>% 
microbial::plotbar( level="Phylum", group = "nationality", top = 10) +
  theme(axis.text.x = element_text(angle = 0)) + 
  labs(x="Sample", 
       y="Relative Abundance", 
       subtitle = "Phyla Relative Abundance by plotbar()\nin microbial package", 
       fill = NULL)

ggsave("figures/plotbar_phyla_bar.png", width=5, height=4)

#---------------------------------------

ps_rel %>% 
microbial::plotbar(level="Genus", group = "nationality", top = 10) +
  theme(axis.text.x = element_text(angle = 0)) + 
  labs(x="Sample", 
       y="Relative Abundance", 
       subtitle = "Taxa Relative Abundance by plotbar()\nin microbial package", 
       fill = NULL)

ggsave("figures/plotbar_taxa_bar.png", width=5, height=4)
#---------------------------------------
# Using ggpubr package
#---------------------------------------

library(ggpubr)
library(RColorBrewer)
library(viridis)

psmelt(ps_rel) %>% 
  select(Sample, nationality, Phylum, Abundance) %>% 
  group_by(Sample, nationality, Phylum) %>% 
  summarise(Abundance = sum(Abundance), .groups = "drop") %>% 
  filter(Abundance > 0.01) %>% 
  mutate(Phylum = fct_reorder(Phylum, -Abundance)) %>% 
  ggbarplot(x = "Phylum", y = "Abundance", 
            fill = "nationality",
            color = "nationality",
            palette = "jco",
            sort.by.groups = FALSE,     
            rotate = TRUE,
            position = position_dodge()) +
  theme_classic() +
  theme(axis.text.x = element_markdown(angle = 0, hjust = 1, vjust = 1),
        axis.text.y = element_markdown(),
        legend.text = element_markdown(),
        legend.key.size = unit(12, "pt"))  +
  labs(x=NULL, 
       y="Relative Abundance (%)",
       subtitle = "Phyla Relative Abundance by ggbarplot()\nin ggpubr package grouped by nationality", 
       fill = NULL) +
  guides(color = "none")

ggsave("figures/ggpubr_phyla_bar.png", width=5, height=4)
#------------------------------
# Plot top n
#------------------------------
library(phyloseq)
library(tidyverse)
library(ggpubr)

df <- psmelt(ps_rel) %>% 
  select(Sample, nationality, Genus, Abundance) %>% 
  group_by(Sample, nationality, Genus) %>% 
  summarise(Abundance = sum(Abundance), .groups = "drop")

# Function
top_n_unique <- function(df, n, group_var, value_var) {
  df %>%
    group_by({{ group_var }}) %>%
    mutate(rank = dense_rank(desc({{ value_var }}))) %>%
    filter(rank <= n) %>%
    ungroup() %>%
    select(-rank)
}

# Plot
n <- 10  # Number of top unique taxa to select
top_n_unique(df, n, Genus, Abundance) %>% 
  filter(Abundance > 0.2) %>% 
  mutate(Genus = fct_reorder(Genus, -Abundance)) %>% 
  ggbarplot(x = "Genus", y = "Abundance", 
            fill = "nationality",
            color = "nationality",
            palette = "jco",
            sort.by.groups = FALSE,     
            rotate = TRUE,
            position = position_dodge()) +
  theme_classic() +
  theme(axis.text.x = element_markdown(angle = 0, hjust = 1, vjust = 1),
        axis.text.y = element_markdown(),
        legend.text = element_markdown(),
        legend.key.size = unit(12, "pt"))  +
  labs(x=NULL, 
       y="Relative Abundance (%)",
       subtitle = "Genera Relative Abundance by ggbarplot()\nin ggpubr package grouped by nationality", 
       fill = NULL) +
  guides(color = "none")

ggsave("figures/ggpubr_top_n_bar.png", width=5, height=4)