10 Distance Calculation in Microbiome Data Analysis

Calculating distance matrices between samples using appropriate metrics is a fundamental step in data preprocessing. This practice efficiently captures the similarity or dissimilarity between samples, laying the groundwork for subsequent analysis and dimensionality reduction techniques such as PCA, MDS, or t-SNE.

10.1 Import processed and transformed data objects

# Fix the random-number seed so any stochastic step in this chapter is
# reproducible between runs.
set.seed(110912)

# Packages attached for this chapter.
library(phyloseq)
library(microbiome)
library(vegan)

# Restore the objects stored in this file into the workspace;
# verbose = TRUE prints the name of each object as it is loaded.
load("data/phyloseq_objects.rda", verbose = TRUE)
Loading objects:
  ps_GlobalPatterns
  ps_dietswap
  ps_caporaso
  ps_kostic_crc
# Restore the objects stored in this file; the ps_* objects it provides are
# the inputs to the distance calculations later in this chapter.
load("data/ps_transformed.rda", verbose = TRUE)
Loading objects:
  ps_asin
  ps_identity
  ps_compositional
  ps_z_otu
  ps_z_sample
  ps_log10
  ps_log10p
  ps_clr
  ps_shift
  ps_scale
# This file is written by the save() call in section 10.2 of this chapter,
# so it does not exist yet on a first run. Load it only when it is present
# instead of stopping with a "cannot open file" error.
if (file.exists("data/bray_distances.rda")) {
  load("data/bray_distances.rda", verbose = TRUE)
}
Loading objects:
  ps_asin_bray_dist
  ps_compositional_bray_dist
  ps_z_otu_bray_dist
  ps_z_sample_bray_dist
  ps_log10_bray_dist
  ps_log10p_bray_dist
  ps_clr_bray_dist
  ps_shift_bray_dist
  ps_scale_bray_dist
# Restore the objects stored in this file into the workspace;
# verbose = TRUE prints the name of each object as it is loaded.
load("data/phyloseq_raw_rel_psextra_df_objects.rda", verbose = TRUE)
Loading objects:
  ps_raw
  ps_rel
  psextra_raw
  psextra_rel
  ps_df

10.2 Bray-Curtis Dissimilarity

The Bray-Curtis dissimilarity quantifies the dissimilarity between two samples based on the abundances of their taxa. It considers both presence and abundance information, so it is sensitive to differences in abundance and is most strongly influenced by the abundant taxa.

library(vegan)
library(dplyr)

# Bray-Curtis dissimilarities for each transformed object.
# Each OTU table is transposed before being handed to vegdist(), which
# computes dissimilarities between rows; the resulting "dist" object is then
# expanded into a full square matrix.
# NOTE(review): Bray-Curtis is intended for non-negative data; confirm whether
# the z-score / clr objects contain negative values.
ps_asin_bray_dist <- as.matrix(
  vegdist(t(otu_table(ps_asin)), method = "bray")
)
ps_identity_bray_dist <- as.matrix(
  vegdist(t(otu_table(ps_identity)), method = "bray")
)
ps_compositional_bray_dist <- as.matrix(
  vegdist(t(otu_table(ps_compositional)), method = "bray")
)
ps_z_otu_bray_dist <- as.matrix(
  vegdist(t(otu_table(ps_z_otu)), method = "bray")
)
ps_z_sample_bray_dist <- as.matrix(
  vegdist(t(otu_table(ps_z_sample)), method = "bray")
)
ps_log10_bray_dist <- as.matrix(
  vegdist(t(otu_table(ps_log10)), method = "bray")
)
ps_log10p_bray_dist <- as.matrix(
  vegdist(t(otu_table(ps_log10p)), method = "bray")
)
ps_clr_bray_dist <- as.matrix(
  vegdist(t(otu_table(ps_clr)), method = "bray")
)
ps_shift_bray_dist <- as.matrix(
  vegdist(t(otu_table(ps_shift)), method = "bray")
)
ps_scale_bray_dist <- as.matrix(
  vegdist(t(otu_table(ps_scale)), method = "bray")
)

# Persist the matrices for later chapters. ps_identity_bray_dist is computed
# above but is not among the objects written to the file.
save(
  ps_asin_bray_dist,
  ps_compositional_bray_dist,
  ps_z_otu_bray_dist,
  ps_z_sample_bray_dist,
  ps_log10_bray_dist,
  ps_log10p_bray_dist,
  ps_clr_bray_dist,
  ps_shift_bray_dist,
  ps_scale_bray_dist,
  file = "data/bray_distances.rda"
)

10.3 Jaccard Distance

The Jaccard distance measures the dissimilarity between two samples based on the presence or absence of taxa, ignoring their abundance information. It is calculated as the ratio of the number of taxa found in only one of the two samples to the total number of taxa found in either sample.

library(vegan)
library(dplyr)

# Presence/absence Jaccard distances for each transformed object.
# vegdist() defaults to binary = FALSE, which yields the quantitative
# (abundance-weighted) Jaccard index. binary = TRUE converts the data to
# presence/absence first, which is the abundance-ignoring distance this
# section describes.
# Each OTU table is transposed before being handed to vegdist(), which
# computes dissimilarities between rows; the resulting "dist" object is then
# expanded into a full square matrix.
# NOTE(review): presence is defined as a value > 0, so objects that differ
# only by a transformation preserving zeros will give identical matrices, and
# objects with negative values (z-score / clr, if any) should be checked.
ps_asin_jaccard_dist <- as.matrix(
  vegdist(t(otu_table(ps_asin)), method = "jaccard", binary = TRUE)
)
ps_identity_jaccard_dist <- as.matrix(
  vegdist(t(otu_table(ps_identity)), method = "jaccard", binary = TRUE)
)
ps_compositional_jaccard_dist <- as.matrix(
  vegdist(t(otu_table(ps_compositional)), method = "jaccard", binary = TRUE)
)
ps_z_otu_jaccard_dist <- as.matrix(
  vegdist(t(otu_table(ps_z_otu)), method = "jaccard", binary = TRUE)
)
ps_z_sample_jaccard_dist <- as.matrix(
  vegdist(t(otu_table(ps_z_sample)), method = "jaccard", binary = TRUE)
)
ps_log10_jaccard_dist <- as.matrix(
  vegdist(t(otu_table(ps_log10)), method = "jaccard", binary = TRUE)
)
ps_log10p_jaccard_dist <- as.matrix(
  vegdist(t(otu_table(ps_log10p)), method = "jaccard", binary = TRUE)
)
ps_clr_jaccard_dist <- as.matrix(
  vegdist(t(otu_table(ps_clr)), method = "jaccard", binary = TRUE)
)
ps_shift_jaccard_dist <- as.matrix(
  vegdist(t(otu_table(ps_shift)), method = "jaccard", binary = TRUE)
)
ps_scale_jaccard_dist <- as.matrix(
  vegdist(t(otu_table(ps_scale)), method = "jaccard", binary = TRUE)
)

# Persist the matrices for later chapters. ps_identity_jaccard_dist is
# computed above but is not among the objects written to the file.
save(
  ps_asin_jaccard_dist,
  ps_compositional_jaccard_dist,
  ps_z_otu_jaccard_dist,
  ps_z_sample_jaccard_dist,
  ps_log10_jaccard_dist,
  ps_log10p_jaccard_dist,
  ps_clr_jaccard_dist,
  ps_shift_jaccard_dist,
  ps_scale_jaccard_dist,
  file = "data/jaccard_distances.rda"
)

Confirm computed Jaccard distance objects

# Reload the file just written to confirm which objects it contains.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
load("data/jaccard_distances.rda", verbose = TRUE)
Loading objects:
  ps_asin_jaccard_dist
  ps_compositional_jaccard_dist
  ps_z_otu_jaccard_dist
  ps_z_sample_jaccard_dist
  ps_log10_jaccard_dist
  ps_log10p_jaccard_dist
  ps_clr_jaccard_dist
  ps_shift_jaccard_dist
  ps_scale_jaccard_dist

10.4 Euclidean Distance

The Euclidean distance calculates the straight-line distance between two samples in a multidimensional space based on their abundance profiles. It considers differences in both presence and abundance of taxa.

library(vegan)
library(dplyr)

# Euclidean distances for each transformed object.
# Each OTU table is transposed before being handed to vegdist(), which
# computes distances between rows; the resulting "dist" object is then
# expanded into a full square matrix.
ps_asin_euclidean_dist <- as.matrix(
  vegdist(t(otu_table(ps_asin)), method = "euclidean")
)
ps_identity_euclidean_dist <- as.matrix(
  vegdist(t(otu_table(ps_identity)), method = "euclidean")
)
ps_compositional_euclidean_dist <- as.matrix(
  vegdist(t(otu_table(ps_compositional)), method = "euclidean")
)
ps_z_otu_euclidean_dist <- as.matrix(
  vegdist(t(otu_table(ps_z_otu)), method = "euclidean")
)
ps_z_sample_euclidean_dist <- as.matrix(
  vegdist(t(otu_table(ps_z_sample)), method = "euclidean")
)
ps_log10_euclidean_dist <- as.matrix(
  vegdist(t(otu_table(ps_log10)), method = "euclidean")
)
ps_log10p_euclidean_dist <- as.matrix(
  vegdist(t(otu_table(ps_log10p)), method = "euclidean")
)
ps_clr_euclidean_dist <- as.matrix(
  vegdist(t(otu_table(ps_clr)), method = "euclidean")
)
ps_shift_euclidean_dist <- as.matrix(
  vegdist(t(otu_table(ps_shift)), method = "euclidean")
)
ps_scale_euclidean_dist <- as.matrix(
  vegdist(t(otu_table(ps_scale)), method = "euclidean")
)

# Persist the matrices for later chapters. ps_identity_euclidean_dist is
# computed above but is not among the objects written to the file.
save(
  ps_asin_euclidean_dist,
  ps_compositional_euclidean_dist,
  ps_z_otu_euclidean_dist,
  ps_z_sample_euclidean_dist,
  ps_log10_euclidean_dist,
  ps_log10p_euclidean_dist,
  ps_clr_euclidean_dist,
  ps_shift_euclidean_dist,
  ps_scale_euclidean_dist,
  file = "data/euclidean_distances.rda"
)

Confirm computed Euclidean distance objects

# Reload the file just written to confirm which objects it contains.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
load("data/euclidean_distances.rda", verbose = TRUE)
Loading objects:
  ps_asin_euclidean_dist
  ps_compositional_euclidean_dist
  ps_z_otu_euclidean_dist
  ps_z_sample_euclidean_dist
  ps_log10_euclidean_dist
  ps_log10p_euclidean_dist
  ps_clr_euclidean_dist
  ps_shift_euclidean_dist
  ps_scale_euclidean_dist

10.5 Aitchison Distance

The Aitchison distance, also known as the compositional distance or log-ratio distance, is specifically designed for compositional data such as microbiome abundance data. It calculates the dissimilarity between two samples based on the log-ratio transformations of their compositional vectors, providing meaningful comparisons between samples while addressing the compositional constraints inherent in relative abundance data.

library(microViz)
library(dplyr)

# Aitchison distances for the three example datasets, each computed by
# passing the phyloseq object to dist_calc() with the "aitchison" method.
# A fourth variant was left disabled in the original script:
#   ps_identity_aitchison_dists <- dist_calc(ps_asin, "aitchison")
ps_GlobalPatterns_aitchison_dists <- dist_calc(
  ps_GlobalPatterns, "aitchison"
)
ps_dietswap_aitchison_dists <- dist_calc(ps_dietswap, "aitchison")
ps_kostic_crc_aitchison_dists <- dist_calc(ps_kostic_crc, "aitchison")

# Persist the three results for later chapters.
save(
  ps_GlobalPatterns_aitchison_dists,
  ps_dietswap_aitchison_dists,
  ps_kostic_crc_aitchison_dists,
  file = "data/aitchison_distances.rda"
)

Confirm computed Aitchison distance objects

# Reload the file just written to confirm which objects it contains.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
load("data/aitchison_distances.rda", verbose = TRUE)
Loading objects:
  ps_GlobalPatterns_aitchison_dists
  ps_dietswap_aitchison_dists
  ps_kostic_crc_aitchison_dists