6 Data Splitting
The dataset is divided into separate subsets for training and testing. This partitioning allows for an unbiased evaluation of the model’s performance on unseen data, helping to assess its ability to generalize to new observations.
6.2 Split using caret::createDataPartition()
# Split data into training and testing sets
set.seed(123) # for reproducibility
train_index <- caret::createDataPartition(data$target, p = 0.8, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]
write_csv(test_data, "models/test_data.csv")
save(train_data, test_data, file = "data/train_test_data.rda")
6.3 Review train and test datasets
cat("\nDimension of the train data using caret package\n is", base::dim(train_data)[1], "rows and", base::dim(train_data)[2], "columns.\n")
Dimension of the train data using caret package
is 179 rows and 144 columns.
cat("\nDimension of the test data using caret package\n is", base::dim(test_data)[1], "rows and", base::dim(test_data)[2], "columns.\n")
Dimension of the test data using caret package
is 43 rows and 144 columns.
cat("\nThe intersection between test and train dataset is", nrow(test_data %>% intersect(train_data)))
The intersection between test and train dataset is 0
6.4 Spliting using dplyr::sample_n() and dplyr::setdiff() functions**
set.seed(123)
library(dplyr)
test_df = train_data %>% dplyr::sample_n(0.2*nrow(train_data))
train_df = train_data %>% dplyr::setdiff(test_df)
cat("\nDimension of the test data using dplyr package\n is", base::dim(test_df)[1], "rows and", base::dim(test_df)[2], "columns.\n")
Dimension of the test data using dplyr package
is 35 rows and 144 columns.
cat("\nDimension of the train data using dplyr package\n is", base::dim(train_df)[1], "rows and", base::dim(train_df)[2], "columns.\n")
Dimension of the train data using dplyr package
is 144 rows and 144 columns.
cat("\nThe intersection between test and train dataset is", nrow(test_df %>% intersect(train_df)))
The intersection between test and train dataset is 0
6.5 Relative abundance in training and testing datasets
library(dplyr)
library(ggplot2)
library(ggpubr)
cols <- c("0" = "red","1" = "green4")
p1 <- train_data %>% ggplot(aes(x = `Bacteroides vulgatus`, y = `Prevotella melaninogenica`, color = factor(target))) + geom_point(size = 2, shape = 16, alpha = 0.6) +
labs(title = "Relative Abundance in Train Dataset using \ncaret::createDataPartition() function") +
scale_colour_manual(values = cols, labels = c("African American", "Africa"), name="Level") +
theme_bw() +
theme(text = element_text(size = 8))
p2 <- test_data %>% ggplot(aes(x = `Bacteroides vulgatus`, y = `Prevotella melaninogenica`, color = factor(target))) + geom_point(size = 2, shape = 16, alpha = 0.6) +
labs(title = "Relative Abundance in Test Dataset using \ncaret::createDataPartition() function") +
scale_colour_manual(values = cols, labels = c("African American", "Africa"), name="Level") +
theme_bw() +
theme(text = element_text(size = 8))
p3 <- train_df %>% ggplot(aes(x = `Bacteroides vulgatus`, y = `Prevotella melaninogenica`, color = factor(target))) + geom_point(size = 2, shape = 16, alpha = 0.6) +
labs(title = "Relative Abundance in Train Dataset using \ndplyr::sample_n() and dplyr::setdiff() functions") +
scale_colour_manual(values = cols, labels = c("African American", "African"), name="Level") +
theme_bw() +
theme(text = element_text(size = 8))
p4 <- test_df %>% ggplot(aes(x = `Bacteroides vulgatus`, y = `Prevotella melaninogenica`, color = factor(target))) + geom_point(size = 2, shape = 16, alpha = 0.6) +
labs(title = "Relative Abundance in Test Dataset using \ndplyr::sample_n() and dplyr::setdiff() functions") +
scale_colour_manual(values = cols, labels = c("African American", "African"), name="Level") +
theme_bw() +
theme(text = element_text(size = 8))
# Arrange plots using ggpubr
ggarrange(p1, p2, p3, p4, nrow = 2, ncol = 2, common.legend = TRUE, legend = "right", heights = c(1, 1))