From 135150015669610ba8585b039ed456e289ae84a0 Mon Sep 17 00:00:00 2001
From: silask <silas.kieser@gmail.com>
Date: Mon, 23 Oct 2023 17:07:09 +0200
Subject: [PATCH] added Deseq2

---
 R/Analyze_genecatalog.Rmd | 132 ++++++++++++++++++++++++++++----------
 1 file changed, 98 insertions(+), 34 deletions(-)

diff --git a/R/Analyze_genecatalog.Rmd b/R/Analyze_genecatalog.Rmd
index c55710a..e452a26 100644
--- a/R/Analyze_genecatalog.Rmd
+++ b/R/Analyze_genecatalog.Rmd
@@ -14,10 +14,7 @@ knitr::opts_chunk$set(
 
 
 
-```
-BiocManager::install("tidybulk")
 
-```
 
 
 ```{r, libraries,include=FALSE}
@@ -42,7 +39,6 @@ library(tibble)
 
 library(arrow)
 library(yaml)
-library(SummarizedExperiment)
 ```
 
 Atlas output files are stored in the `DiarrheaExample` folder.
@@ -78,21 +74,6 @@ abundance_file <- genecatalog_files$coverage
 
 
 
-Load the metadata
-
-
-```{r metadata}
-
-
-metadata <- read.csv(file.path(data_dir, "metadata.csv")) %>%
-  column_to_rownames("sample_accession")
-
-head(metadata)
-
-group_variable <- "group"
-```
-
-
 
 
 
@@ -153,6 +134,7 @@ head(sample_stats)
 # Plot the two numeric columns side by side using facet_wrap
 df_melt <- sample_stats %>%
   select(all_of(c("Sum_coverage", "Genes_nz_coverage"))) %>%
+  print() %>%
   rename(c(
     Sum_coverage = "Total coverage",
     Genes_nz_coverage = "N detected genes"
@@ -433,21 +415,19 @@ gene_nrs_with_annotations <- annotations %>%
   unique()
 
 
-data <- load_subset_of_genes(abundance_file, gene_nrs_with_annotations)
+annotated_genes <- load_subset_of_genes(abundance_file, gene_nrs_with_annotations)
 
 
-data[1:10, 1:5]
+annotated_genes[1:10, 1:5]
 ```
 
 
 ```{r }
 # Normalize
 
-annotated_genes_gcpm <- data %*% diag(1 / total_covarage[colnames(data)]) * 1e6
-colnames(annotated_genes_gcpm) <- colnames(data)
-
+annotated_genes_gcpm <- annotated_genes %*% diag(1 / total_covarage[colnames(annotated_genes)]) * 1e6
+colnames(annotated_genes_gcpm) <- colnames(annotated_genes)
 
-rm(data)
 annotated_genes_gcpm[1:5, 1:5]
 ```
 In theory, genes can have multiple annotations we need to create a matrix with genes as columns and annotations as rows
@@ -471,7 +451,7 @@ rownames(annotation_matrix) <- ko_ids
 
 annotation_matrix[1:5, 1:5]
 
-cat(" Genes have max ", max(colSums(annotation_matrix)), " annotations.\n")
+#cat(" Genes have max ", max(colSums(annotation_matrix)), " annotations.\n")
 ```
 
 
@@ -518,6 +498,94 @@ heatmap(log_annotation_cpm,
 )
 ```
 
+# Differencial abundance with Deseq2
+
+
+Load the metadata
+
+
+```{r metadata}
+
+
+metadata <- read.csv(file.path(data_dir, "metadata.csv")) %>%
+  column_to_rownames("sample_accession")
+
+head(metadata)
+
+group_variable <- "group"
+```
+
+Deseq2 requires the unnormalized annotations counts. let's calculate them.
+```{r }
+
+
+assertthat::assert_that(all(colnames(annotation_matrix) == rownames(annotated_genes)))
+
+# multiply gene coverage with annotation matrix,
+annotation_genecounts <- annotation_matrix %*% annotated_genes
+
+annotation_genecounts <- as.matrix(annotation_genecounts)
+
+
+# Save the normalized data as tsv.gz file
+write_delim(as.data.frame(annotation_genecounts),
+  file.path(data_dir, "Genecatalog", "Kegg_counts.tsv.gz"),
+  delim = "\t"
+)
+
+
+annotation_genecounts[1:5, 1:5]
+```
+
+
+To perform a differential abundance testing in R with abundance data.
+Deseq2 required the unnormalized data.
+You will need the DESeq2 package for performing differential abundance analysis.
+If you haven't already, install and load the package:
+
+```{r}
+# BiocManager::install("DESeq2")
+library(DESeq2)
+
+```
+
+
+
+Prepare Data:
+
+```{r }
+   # Assuming annotation_cpm is your count data (CPM)
+   dds <- DESeqDataSetFromMatrix(countData = annotation_genecounts,
+                      colData = metadata[colnames(annotation_genecounts),],
+                      design = ~ group)
+```
+
+
+
+Normalize the data using the DESeq2 normalization methods, and estimate differencial abundance.
+
+```{r}
+   dds <- DESeq(dds)
+```
+
+```{r}
+   res <- results(dds) %>% as.data.frame()
+
+head(res %>% arrange(padj),n=10)
+```
+
+```{r }
+ 
+
+ ma_plot <- ggplot(res, aes(x = baseMean, y = log2FoldChange)) +
+            geom_point(aes(color = padj < 0.05)) +
+            scale_color_manual(values = c("grey", "red")) +
+            labs(title = "MA Plot", x = "Mean of normalized counts", y = "log2(Fold Change)")
+
+ # Display the MA plot
+ print(ma_plot)
+ 
+```
 
 
 ## Alternative ways to normalize
@@ -528,21 +596,17 @@ Example the one from [MuSiCC](https://github.com/borenstein-lab/MUSiCC/blob/mast
 MUSiCC: A marker genes based framework for metagenomic normalization and accurate profiling of gene abundances in the microbiome. Ohad Manor and Elhanan Borenstein. Genome Biology
 
 
-
 ## Integrate them in a SummarizedExperiment object
 
 ```{r }
 
-
-rownames(annotations) <- NULL
-
 SE <- SummarizedExperiment(
-  assays = list(gcpm = data),
-  colData = metadata[colnames(data), ],
-  rowData = annotations
+  assays = list(gcpm = annoation_cpm),
+  colData = metadata[colnames(annoation_cpm), ],
+  #rowData = annotations
 )
-```
 
+```
 
 ```{}
 library(tidybulk)