# .Rhistory

suppressMessages(library(tidyverse))
suppressMessages(library(tidyverse))
suppressMessages(library(parallel))
suppressMessages(library(data.table))
suppressMessages(library(glue))
suppressMessages(library(dada2))
suppressMessages(library(DECIPHER))
suppressMessages(library(decontam))
suppressMessages(library(phyloseq))
suppressMessages(library(DESeq2))
suppressMessages(library(vegan))
# Elapsed time of every chunk, keyed by chunk label; filled in by the
# `time_it` hook registered below.
all_times <- list()

# Register a knitr chunk hook named `time_it`. The hook is called with
# `before = TRUE` and `before = FALSE`; the start time is kept in the
# enclosing `local()` environment between the two calls, and the elapsed
# seconds are written into the global `all_times` list.
knitr::knit_hooks$set(time_it = local({
  chunk_started <- NULL
  function(before, options) {
    if (before) {
      chunk_started <<- Sys.time()
    } else {
      elapsed <- difftime(Sys.time(), chunk_started, units = "secs")
      all_times[[options$label]] <<- elapsed
    }
  }
}))

# Default options applied to every chunk.
knitr::opts_chunk$set(
  # What is shown in the rendered output.
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  error = TRUE,
  # Code formatting.
  tidy = TRUE,
  tidy.opts = list(width.cutoff = 95),
  # Enable the timing hook defined above.
  time_it = TRUE,
  # bash chunks are run with the `-l` flag.
  engine.opts = list(bash = "-l")
)
suppressMessages(library(tidyverse))
suppressMessages(library(data.table))
suppressMessages(library(glue))
suppressMessages(library(phyloseq))
suppressMessages(library(vegan))
suppressMessages(library(MBECS))
suppressMessages(library(ggpubr))
suppressMessages(library(HMP))
suppressMessages(library(ALDEx2))
suppressMessages(library(DESeq2))
suppressMessages(library(rstatix))
suppressMessages(library(ggrepel))
suppressMessages(library(patchwork))
suppressMessages(library(ANCOMBC))
#' Percentage of missing values in each column.
#'
#' @param df A data frame (or any list of vectors).
#' @return A numeric vector, named after the columns of `df`, holding the
#'   percentage (0-100) of `NA` entries in each column. A zero-length column
#'   yields `NaN` (0 / 0).
na_proportion <- function(df) {
  # vapply() pins the result to a numeric vector; sapply() would return an
  # empty list when `df` has no columns.
  vapply(df, function(col) {
    sum(is.na(col)) / length(col) * 100
  }, numeric(1))
}
#' Build every pairwise comparison between the supplied elements.
#'
#' @param elements Either a list (typically named) or an atomic vector.
#' @return For a list: a list with one entry per pair, named
#'   "<name_i>-vs-<name_j>", each entry being `list(elements[[i]],
#'   elements[[j]])`. Missing or empty names fall back to the element
#'   position. For an atomic vector: an unnamed list of length-2 vectors
#'   `c(elements[i], elements[j])`. Pairs are ordered (1,2), (1,3), ...,
#'   (2,3), ... Inputs with fewer than two elements give an empty list.
generateComparisons <- function(elements) {
  n_elements <- length(elements)
  # With fewer than two elements there is nothing to compare; the guard also
  # avoids the descending `1:0` sequence a naive loop would iterate over.
  if (n_elements < 2) {
    return(list())
  }

  # All index pairs i < j, in the same order a nested i/j loop would visit.
  index_pairs <- utils::combn(n_elements, 2, simplify = FALSE)

  # is.list() is base R; the previous `type()` call only resolved when a
  # package exporting such a function happened to be attached.
  if (is.list(elements)) {
    labels <- names(elements)
    if (is.null(labels)) {
      labels <- rep("", n_elements)
    }
    unnamed <- is.na(labels) | labels == ""
    labels[unnamed] <- as.character(which(unnamed))

    comparisons <- lapply(index_pairs, function(pair) {
      list(elements[[pair[1]]], elements[[pair[2]]])
    })
    names(comparisons) <- vapply(index_pairs, function(pair) {
      paste0(labels[pair[1]], "-vs-", labels[pair[2]])
    }, character(1))
  } else {
    comparisons <- lapply(index_pairs, function(pair) {
      c(elements[pair[1]], elements[pair[2]])
    })
  }

  comparisons
}
# Read the ASV count table: first column as row names, column names kept
# exactly as they appear in the file (check.names = FALSE).
count_tab <- read.table("~/Data/Metagenomics/Dolores_Mesa_Metagenomics/test/ASVs_counts.tsv", header = TRUE, row.names = 1, check.names = FALSE, sep = "\t")
# Remove control samples and samples without an assigned group.
# A logical mask is used rather than `-grep()`: when no column matches,
# `-grep()` produces an empty index and would silently drop every column.
# drop = FALSE keeps a data frame even if a single column remains.
count_tab <- count_tab[, !grepl("Control", colnames(count_tab)), drop = FALSE]
count_tab <- count_tab[, !grepl("Unk", colnames(count_tab)), drop = FALSE]
# Strip the read-file suffix; fixed = TRUE matches it literally so the dots
# are not treated as regex wildcards.
colnames(count_tab) <- gsub("_R1.fastq.gz", "", colnames(count_tab), fixed = TRUE)
# Read the taxonomy assignments as a character matrix (first column as row
# names).
tax_tab <- as.matrix(read.table("~/Data/Metagenomics/Dolores_Mesa_Metagenomics/test/ASVs_taxonomy.tsv", header = TRUE, row.names = 1, check.names = FALSE, sep = "\t"))
# Derive the sample metadata from the raw-read file names.
# NOTE(review): the suffix stripped here ("_R1_001.fastq.gz") differs from the
# one stripped from the count-table column names ("_R1.fastq.gz") - confirm
# both end up producing the same sample names.
filenames <- gsub(
  "_R1_001.fastq.gz", "",
  list.files("~/Data/Metagenomics/Dolores_Mesa_Metagenomics/test/rawReads", pattern = "R1_001.fastq.gz", full.names = FALSE)
)
# One metadata row per file. The name is split on "_" (field 1 = sample name,
# field 4 = batch); the sample name is split again on "-" into sample, sample
# number and time, with the last "-" field taken as the group.
sample_tab <- as.data.frame(rbindlist(lapply(filenames, function(col.name) {
  name_fields <- strsplit(col.name, "_")[[1]]
  id_fields <- strsplit(name_fields[1], "-")[[1]]
  data.frame(
    sample_name = name_fields[1],
    sample = id_fields[1],
    sample_number = id_fields[2],
    time = id_fields[3],
    group = id_fields[length(id_fields)],
    time_group = paste0(id_fields[3], "_", id_fields[length(id_fields)]),
    batch = name_fields[4]
  )
})))
# Index the metadata by sample name, then align its rows with the columns of
# the count table.
rownames(sample_tab) <- sample_tab$sample_name
sample_tab <- sample_tab[colnames(count_tab), ]
# Assemble the phyloseq object from the count table (ASVs in rows), the
# taxonomy matrix and the per-sample metadata.
ASV_physeq <- phyloseq(
  otu_table(count_tab, taxa_are_rows = TRUE),
  tax_table(tax_tab),
  sample_data(sample_tab)
)
# Reads per sample, sorted ascending, with the 50,000-read cut-off drawn as a
# red horizontal line.
barplot(
  sort(sample_sums(ASV_physeq)),
  names.arg = "",
  xlab = "Samples",
  ylab = "Number of reads"
)
abline(h = 50000, col = "red")
# Keep only samples with at least 50,000 reads, then drop the ASVs whose total
# count is zero in the remaining samples.
ASV_physeq_filter <- subset_samples(ASV_physeq, sample_sums(ASV_physeq) >= 50000)
ASV_physeq_filter <- prune_taxa(taxa_sums(ASV_physeq_filter) > 0, ASV_physeq_filter)
# Percentage of ASVs without an assignment, per column of the taxonomy table.
na_proportion(as.data.frame(tax_table(ASV_physeq_filter)))
# Taxonomic rank used for the summaries that follow.
taxa <- "Family"
# NOTE(review): the total below is taken from `count_tab`, whereas the number
# of assigned ASVs comes from the filtered phyloseq object - confirm this is
# the intended comparison.
print(paste0("Numero de ASVs totales: ", nrow(count_tab)))
# table() ignores NA, so summing it counts the ASVs assigned at this rank.
print(paste0("Numero de ASVs asignados: ", sum(table(tax_table(ASV_physeq_filter)[, taxa]))))
# Number of ASVs per taxon at the chosen rank.
table(tax_table(ASV_physeq_filter)[, taxa])
# Use phyloseq to build a count table in which all ASVs belonging to the same
# taxon (at the rank stored in `taxa`) are summed.
count_tab <- as.data.frame(otu_table(ASV_physeq_filter))
# Agglomerate once and reuse the result for both the counts and the row
# names, instead of running tax_glom() twice.
taxa_glom_physeq <- tax_glom(ASV_physeq_filter, taxrank = taxa)
taxa_counts_tab <- as.data.frame(otu_table(taxa_glom_physeq))
# Use the taxon names at the chosen rank as row names.
rownames(taxa_counts_tab) <- as.vector(tax_table(taxa_glom_physeq)[, taxa])
rm(taxa_glom_physeq)
# Sequences without an assignment at the chosen rank are 'NA' in the taxonomy
# table, but their counts are still in the count table. Their per-sample total
# is the difference between the column sums of the full count table (all
# representative sequences) and the column sums of the agglomerated table
# (everything that was assigned a taxon).
unclassified_tax_counts <- colSums(count_tab) - colSums(taxa_counts_tab)
# Add those counts as an extra "Unclassified" row.
taxa_counts_tab <- rbind(taxa_counts_tab, "Unclassified" = unclassified_tax_counts)
# If "TRUE", nothing fell through the cracks.
identical(colSums(taxa_counts_tab), colSums(count_tab))
# Per-sample proportions (in %) of every taxon.
taxa_proportions_tab <- apply(taxa_counts_tab, 2, function(x) x / sum(x) * 100)
# The table can have many rows, which is too busy for a summary figure, and
# many taxa make up only a very small percentage. As an arbitrary choice made
# solely to ease visualisation and interpretation, keep only the taxa that
# exceed 5% in at least one sample.
# drop = FALSE keeps the matrix shape when a single taxon passes the filter.
# check.names = FALSE keeps the sample names exactly as they are, so no
# after-the-fact repair of mangled names (which would also strip any genuine
# "X" or "." from a sample name) is needed.
temp_filt_major_taxa_proportions_tab <- data.frame(
  taxa_proportions_tab[apply(taxa_proportions_tab, 1, max) > 5, , drop = FALSE],
  check.names = FALSE
)
# Each filtered-out taxon is below 5% on its own, but together they may add
# up, so they are tracked in an "Other" row (which also keeps every column
# total at 100%).
filtered_proportions <- colSums(taxa_proportions_tab) - colSums(temp_filt_major_taxa_proportions_tab)
filt_major_taxa_proportions_tab <- rbind(temp_filt_major_taxa_proportions_tab, "Other" = filtered_proportions)
# Build the matching filtered counts table, with an "Other" row summarising
# the minor taxa.
major_taxa <- rownames(filt_major_taxa_proportions_tab)
major_taxa <- major_taxa[major_taxa != "Other"]
all_taxa <- rownames(taxa_counts_tab)
filtered_taxa <- all_taxa[!(all_taxa %in% major_taxa)]
other_taxa_counts_tab <- colSums(taxa_counts_tab[filtered_taxa, , drop = FALSE])
filt_major_taxa_counts_tab <- data.frame(
  rbind(taxa_counts_tab[major_taxa, , drop = FALSE], "Other" = other_taxa_counts_tab),
  check.names = FALSE
)
# If "TRUE", the filtered counts table still accounts for every read.
identical(colSums(filt_major_taxa_counts_tab), colSums(count_tab))
# Remove the intermediate objects created while building the major-taxa
# tables. Each object is listed once and the call is made once: a repeated
# call, a repeated name, or a name that was never created (`taxa_tax_tab`)
# only produces "object not found" warnings.
rm(taxa, taxa_counts_tab, unclassified_tax_counts, temp_filt_major_taxa_proportions_tab, taxa_proportions_tab, filtered_proportions, major_taxa, all_taxa, filtered_taxa, other_taxa_counts_tab)
# Work on a copy so the wide proportions table itself stays untouched.
filt_major_taxa_proportions_tab_for_plot <- filt_major_taxa_proportions_tab
# Carry the taxon names as a regular column rather than only as row names,
# which is what ggplot needs.
filt_major_taxa_proportions_tab_for_plot[["major_taxa"]] <- row.names(filt_major_taxa_proportions_tab_for_plot)
# Reshape to long format: one row per (taxon, sample) pair with its
# proportion.
filt_major_taxa_proportions_tab_for_plot <- data.frame(
  pivot_longer(
    filt_major_taxa_proportions_tab_for_plot,
    !major_taxa,
    names_to = "sample",
    values_to = "proportion"
  )
)
# Per-sample characteristics pulled from the sample information table, to be
# attached to every row of the plotting table.
sample_info_for_merge <- data.frame(
  "sample" = row.names(sample_tab),
  "group" = sample_tab$group,
  "time" = sample_tab$time,
  "time_group" = sample_tab$time_group,
  stringsAsFactors = FALSE
)
# merge() joins on the column the two tables have in common ("sample").
filt_major_taxa_proportions_tab_for_plot <- merge(
  filt_major_taxa_proportions_tab_for_plot,
  sample_info_for_merge
)
# The lookup table is no longer needed once merged.
rm(sample_info_for_merge)
# Stacked bar chart: one bar per sample, split by major taxon, with one panel
# per time/group combination.
ggplot(
  filt_major_taxa_proportions_tab_for_plot,
  aes(x = sample, y = proportion, fill = major_taxa, color = major_taxa)
) +
  geom_bar(stat = "identity", position = "stack") +
  facet_wrap(~ time + group, scales = "free") +
  labs(y = "% of ASVs", title = "Relative abundance per sample") +
  theme_classic() +
  # Sample labels on the x axis are hidden, along with their ticks.
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    panel.background = element_blank(),
    legend.title = element_blank()
  )
# Split the filtered counts table by time/group: columns whose name contains
# the tag are selected and transposed, so samples become rows.
t1_G1 <- t(filt_major_taxa_counts_tab[, grepl("t1-G1", colnames(filt_major_taxa_counts_tab))])
t1_G2 <- t(filt_major_taxa_counts_tab[, grepl("t1-G2", colnames(filt_major_taxa_counts_tab))])
t3_G1 <- t(filt_major_taxa_counts_tab[, grepl("t3-G1", colnames(filt_major_taxa_counts_tab))])
t3_G2 <- t(filt_major_taxa_counts_tab[, grepl("t3-G2", colnames(filt_major_taxa_counts_tab))])
# Named list of the four subsets, then every pairwise combination of them.
elements <- c("t1_G1", "t1_G2", "t3_G1", "t3_G2")
data <- setNames(list(t1_G1, t1_G2, t3_G1, t3_G2), elements)
comparisons <- generateComparisons(data)
# Apply Xdc.sevsample (HMP package; described by the original author as an
# LRT test) to each pair, then print the results to inspect the p-values.
xdc <- lapply(comparisons, Xdc.sevsample)
xdc
# Per-taxon distribution of relative abundance: jittered points (coloured by
# group, shaped by time) under transparent box plots, one panel per
# time/group combination, annotated with a Kruskal-Wallis test.
ggplot(
  filt_major_taxa_proportions_tab_for_plot,
  aes(x = major_taxa, y = proportion)
) +
  # height = 0 so the points are only spread horizontally.
  geom_jitter(aes(color = group, shape = time), size = 2, width = 0.15, height = 0) +
  # Outliers are hidden on the boxes because every point is already drawn.
  geom_boxplot(fill = NA, outlier.color = NA) +
  ylim(0, 100) +
  theme_classic() +
  facet_wrap(~ time + group) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.title = element_blank()
  ) +
  labs(
    x = "Major Taxa",
    y = "% of ASVs",
    title = "Relative abundance per major taxa"
  ) +
  stat_compare_means(method = "kruskal.test", label.x = 2.5)
# Within every time point and taxon: t-test of proportion between groups,
# BH-adjusted p-values, significance labels and plot coordinates.
stat.by.time <- filt_major_taxa_proportions_tab_for_plot %>%
  group_by(time, major_taxa) %>%
  t_test(proportion ~ group) %>%
  adjust_pvalue(method = "BH") %>%
  add_significance() %>%
  add_xy_position(x = "major_taxa")

# Within every group and taxon: t-test of proportion between time points,
# processed the same way.
stat.by.group <- filt_major_taxa_proportions_tab_for_plot %>%
  group_by(group, major_taxa) %>%
  t_test(proportion ~ time) %>%
  adjust_pvalue(method = "BH") %>%
  add_significance() %>%
  add_xy_position(x = "major_taxa")
# Relative abundance per major taxon, one panel per group, with the time
# points shown as dodged boxes inside each panel.
ggplot(filt_major_taxa_proportions_tab_for_plot, aes(x = major_taxa, y = proportion, fill = time, shape = time)) +
  # Outliers are hidden on the boxes because every point is drawn below.
  geom_boxplot(outlier.shape = NA) +
  scale_fill_brewer(palette = "Pastel1") +
  geom_point(position = position_jitterdodge(), alpha = 0.3) +
  ylim(0, 100) +
  theme_classic() +
  facet_wrap(~ group) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.title = element_blank()) +
  labs(x = "Major Taxa", y = "% of ASVs", title = "Relative abundance per major taxa and group") +
  # The boxes in each panel compare time points within one group, so the
  # matching test is `stat.by.group` (grouped by group, proportion ~ time).
  # It also carries the `group` column the facets are built on, which
  # `stat.by.time` does not have.
  # NOTE(review): the unnamed aes() argument is kept as in the original call;
  # it does not appear to be a mapping argument of stat_pvalue_manual() -
  # confirm whether it can be removed.
  stat_pvalue_manual(aes(group = group), data = stat.by.group, x = "major_taxa", label = "p.adj.signif", y.position = 100, size = 3)
# Relative abundance per major taxon, one panel per time point, with the
# groups shown as dodged boxes inside each panel.
ggplot(filt_major_taxa_proportions_tab_for_plot, aes(x = major_taxa, y = proportion, fill = group, shape = group)) +
  # Outliers are hidden on the boxes because every point is drawn below.
  geom_boxplot(outlier.shape = NA) +
  scale_fill_brewer(palette = "Pastel2") +
  geom_point(position = position_jitterdodge(), alpha = 0.3) +
  ylim(0, 100) +
  theme_classic() +
  facet_wrap(~ time) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.title = element_blank()) +
  labs(x = "Major Taxa", y = "% of ASVs", title = "Relative abundance per major taxa and time") +
  # The boxes in each panel compare groups within one time point, so the
  # matching test is `stat.by.time` (grouped by time, proportion ~ group).
  # It also carries the `time` column the facets are built on, which
  # `stat.by.group` does not have.
  # NOTE(review): the unnamed aes() argument is kept as in the original call;
  # it does not appear to be a mapping argument of stat_pvalue_manual() -
  # confirm whether it can be removed.
  stat_pvalue_manual(aes(group = group), data = stat.by.time, x = "major_taxa", label = "p.adj.signif", y.position = 100, size = 3)
# Open the taxonomy table in the data viewer. Guarded so the call is skipped
# when the code runs in a non-interactive session.
if (interactive()) {
  View(tax_tab)
}