microbiome · ChouaibB · Aug 4, 2023 · Jul 27, 2023 · Jul 27, 2023 · Jul 27, 2023
diff --git a/R/makeTreeSummarizedExperimentFromBiom.R b/R/makeTreeSummarizedExperimentFromBiom.R
@@ -13,6 +13,12 @@
 #' taxonomic ranks on feature table, should they be scraped from prefixes?
 #' (default \code{rankFromPrefix = FALSE})
 #' 
+#' @param clean.taxa.names \code{NULL}, \code{character} or \code{regex}:
+#' Taxonomic naming artifacts (characters or a regular expression) to be
+#' removed from the taxonomy data. When \code{clean.taxa.names = "auto"},
+#' artifacts are detected and removed automatically. When
+#' \code{clean.taxa.names = NULL}, no cleaning is performed.
+#' (default \code{clean.taxa.names = "auto"})
+#' 
 #' @param ... optional arguments (not used).
 #' 
 #' @return An object of class
@@ -31,14 +37,25 @@
 #'   # load from file
 #'   rich_dense_file  = system.file("extdata", "rich_dense_otu_table.biom",
 #'                                  package = "biomformat")
-#'   se <- loadFromBiom(rich_dense_file, removeTaxaPrefixes = TRUE, rankFromPrefix = TRUE)
-#'
+#'   se <- loadFromBiom(rich_dense_file, removeTaxaPrefixes = TRUE,
+#'                     rankFromPrefix = TRUE)
+#'                     
 #'   # load from object
 #'   x1 <- biomformat::read_biom(rich_dense_file)
 #'   se <- makeTreeSEFromBiom(x1)
 #'   # Convert SE to TreeSE
 #'   tse <- as(se, "TreeSummarizedExperiment")
 #'   tse
+#'   
+#'   # Cleaning artifacts from Taxonomy data
+#'   f <- system.file("extdata/testdata/Aggregated_humanization2.biom",
+#'                   package="mia")
+#'   biom_object <- biomformat::read_biom(f)
+#'   tse <- makeTreeSEFromBiom(biom_object,
+#'                             removeTaxaPrefixes=TRUE,
+#'                             rankFromPrefix=TRUE,
+#'                             clean.taxa.names = "auto")
+#'   tse
 #' }
 NULL
 
@@ -56,9 +73,11 @@ loadFromBiom <- function(file, ...) {
 #' @param obj object of type \code{\link[biomformat:read_biom]{biom}}
 #'
 #' @export
-#' @importFrom S4Vectors make_zero_col_DFrame
+#' @importFrom S4Vectors make_zero_col_DFrame DataFrame
+#' @importFrom dplyr %>%
 makeTreeSEFromBiom <- function(
-        obj, removeTaxaPrefixes = FALSE, rankFromPrefix = FALSE, ...){
+        obj, removeTaxaPrefixes = FALSE, rankFromPrefix = FALSE,
+        clean.taxa.names="auto", ...){
     # input check
     .require_package("biomformat")
     if(!is(obj,"biom")){
@@ -70,6 +89,9 @@ makeTreeSEFromBiom <- function(
     if( !.is_a_bool(rankFromPrefix) ){
         stop("'rankFromPrefix' must be TRUE or FALSE.", call. = FALSE)
     }
+    if( !is.null(clean.taxa.names) && !.is_non_empty_character(clean.taxa.names) ){
+        stop("'clean.taxa.names' must be a character, NULL or 'auto'.", call. = FALSE)
+    }
     #
     counts <- as(biomformat::biom_data(obj), "matrix")
     sample_data <- biomformat::sample_metadata(obj)
@@ -103,13 +125,22 @@ makeTreeSEFromBiom <- function(
         rownames(feature_data) <- rownames(counts)
     # Otherwise convert it into correct format if it is a list
     } else if( is(feature_data, "list") ){
+        # Clean feature_data from possible character artifacts
+        feature_data <- .detect_taxa_artifacts_and_clean(feature_data,
+                                                         clean.taxa.names)
+        # Taxonomy rank names
+        if (is.null(names(feature_data))) {
+            # Assign temporary ones if they do not exist 
+            colnames <- paste0("taxonomy", seq_along(feature_data))
+        } else {
+            # Get them if they exist
+            colnames <- names(feature_data)
+        }
+
         # Feature data is a list of taxa info
         # Get the maximum length of list
         max_length <- max( lengths(feature_data) )
-        # Get the column names from the taxa info that has all the levels that occurs
-        # in the data
-        colnames <- names( head( feature_data[ lengths(feature_data) == 
-                                                   max_length ], 1)[[1]] )
+
         # Convert the list so that all individual taxa info have the max length
         # of the list objects. All vectors are appended with NAs, if they do not
         # have all the levels. E.g., if only Kingdom level is found, all lower
@@ -120,8 +151,15 @@ makeTreeSEFromBiom <- function(
         })
         # Create a data.frame from the list
         feature_data <- do.call(rbind, feature_data)
+        # Transpose feature_data and convert it into a DFrame object
+        feature_data <- DataFrame(t(feature_data))
         # Add correct colnames
         colnames(feature_data) <- colnames
+    # Otherwise, if it is already a data.frame, clean it from artifacts
+    } else if (is(feature_data, "data.frame")) {
+        # Clean feature_data from possible character artifacts
+        feature_data <- DataFrame(.detect_taxa_artifacts_and_clean(feature_data,
+                                                         clean.taxa.names))
     }
 
     # Replace taxonomy ranks with ranks found based on prefixes
@@ -132,7 +170,7 @@ makeTreeSEFromBiom <- function(
         ranks <- lapply(colnames(feature_data),
                         .replace_colnames_based_on_prefix, x=feature_data)
         # Replace old ranks with found ranks
-        colnames(feature_data) <- ranks
+        colnames(feature_data) <- unlist(ranks)
     }
 
     # Remove prefixes if specified and rowData includes info
@@ -204,3 +242,68 @@ makeTreeSummarizedExperimentFromBiom <- function(obj, ...){
     }
     return(colname)    
 }
+
+# Detect and clean unwanted characters from taxonomy data if needed.
+#
+# x: taxonomy data, a list or a data.frame.
+# patterns: NULL (no cleaning), "auto" (remove every character that falls
+#     outside the general taxonomy naming pattern defined below), or a
+#     regular expression describing the artifacts to remove.
+#
+# Returns 'x', cleaned when artifacts were found. The removal itself is
+# delegated to .clean_from_artifacts(), which is always handed a regular
+# expression.
+.detect_taxa_artifacts_and_clean <- function(x, patterns) {
+
+    # No cleaning if NULL
+    if (is.null(patterns)) {
+        return(x)
+    }
+
+    # Automatic cleaning. identical() always yields a single TRUE/FALSE,
+    # whereas `patterns == "auto"` fails inside if() when 'patterns' is not
+    # of length one.
+    if (identical(patterns, "auto")) {
+        # General regex pattern that corresponds to taxonomy namings
+        PATTERN <- "[[:alnum:]]|-|_|\\[|\\]|,|;\\||[[:space:]]"
+        detected <- .detect_taxa_artifacts(x, PATTERN, invert=TRUE)
+        # The helper returns all artifacts pasted into one string. Split it
+        # back into single characters and keep only those that really fall
+        # outside PATTERN, so that valid characters are never removed.
+        artifacts <- unique(strsplit(detected, "", fixed = TRUE)[[1]])
+        artifacts <- artifacts[nzchar(artifacts) & !grepl(PATTERN, artifacts)]
+        # Clean from artifacts if found
+        if (length(artifacts) > 0) {
+            # Escape regex metacharacters and join the characters as
+            # alternatives. Otherwise a detected "." would match every
+            # character, and several artifacts would only match when they
+            # occur next to each other in exactly that order.
+            artifacts <- gsub("([.\\\\|()\\[\\]{}^$*+?])", "\\\\\\1",
+                              artifacts, perl = TRUE)
+            x <- .clean_from_artifacts(x, paste(artifacts, collapse = "|"))
+        }
+    # Clean with the character or regex provided
+    } else {
+        # Search the complete values (not single characters), so that
+        # patterns longer than one character can be found as well.
+        values <- unlist(lapply(x, as.character), use.names = FALSE)
+        if (!any(grepl(patterns, values))) {
+            # patterns provided not found
+            warning("The '", patterns, "' provided at 'clean.taxa.names' were ",
+                    "not found in rowData.", call. = FALSE)
+        } else {
+            # patterns found and cleaned
+            x <- .clean_from_artifacts(x, patterns)
+        }
+    }
+
+    return(x)
+}
+
+# Helper function for detecting taxa artifacts.
+#
+# x: taxonomy data, a list or a data.frame.
+# patterns: regular expression that is tested against every single character
+#     of the inspected values.
+# invert: if TRUE, the characters that do NOT match 'patterns' are collected.
+#
+# Returns one string made of the unique characters that were found ("" if
+# there are none). For any other type of 'x', 'patterns' is returned
+# unchanged.
+.detect_taxa_artifacts <- function(
+        x,
+        patterns,
+        invert=FALSE) {
+    # Unique single characters of 'values' that match (or, when inverted, do
+    # not match) the pattern.
+    .matching_chars <- function(values) {
+        values <- as.character(values)
+        # Drop missing values: grep(invert = TRUE) would otherwise return
+        # them, and paste0() would turn them into the literal text "NA".
+        values <- unique(values[!is.na(values)])
+        chars <- as.character(unlist(strsplit(values, "", fixed = TRUE),
+                                     use.names = FALSE))
+        unique(grep(patterns, chars, invert = invert, value = TRUE))
+    }
+
+    # A data.frame is tested first because it also passes is(x, "list"); the
+    # list branch would otherwise shadow this one.
+    if (is(x, "data.frame")) {
+        # All values of every column are inspected
+        found <- lapply(x, .matching_chars)
+    } else if (is(x, "list")) {
+        # Only the first entry of each element is inspected.
+        # NOTE(review): artifacts that occur only in later entries are not
+        # detected here -- confirm that this is intended.
+        found <- lapply(x, function(x_sub) {
+            # Nothing to inspect in an empty element
+            if (length(x_sub) == 0) {
+                return(character(0))
+            }
+            .matching_chars(x_sub[[1]])
+        })
+    } else {
+        return(patterns)
+    }
+
+    return(paste0(unique(unlist(found, use.names = FALSE)), collapse = ""))
+}
+
+# Helper function for removing taxa artifacts.
+#
+# x: taxonomy data, a list or a data.frame.
+# patterns: regular expression; every match is replaced with "".
+#
+# Returns 'x' with the matches removed (any other type of 'x' is returned as
+# is). A warning reporting 'patterns' is always raised.
+.clean_from_artifacts <- function(x, patterns) {
+    # A data.frame is tested first because it also passes is(x, "list"); the
+    # list branch would otherwise shadow this one and return a plain list.
+    if (is(x, "data.frame")) {
+        if (ncol(x) > 0) {
+            # Assigning into x[] keeps the data.frame together with its
+            # dimensions and row names (apply() would return a matrix, or a
+            # plain vector for a single row).
+            x[] <- lapply(x, gsub, pattern = patterns, replacement = "")
+        }
+    } else if (is(x, "list")) {
+        x <- lapply(x, gsub, pattern = patterns, replacement = "")
+    }
+    # warn what was cleaned
+    warning("The following artifacts: '", patterns, "' were cleaned from ",
+            "rowData.", call. = FALSE)
+    return(x)
+}
diff --git a/inst/extdata/testdata/Aggregated_humanization2.biom b/inst/extdata/testdata/Aggregated_humanization2.biom
diff --git a/inst/extdata/testdata/Data_humanization_phylo_aggregation.tre b/inst/extdata/testdata/Data_humanization_phylo_aggregation.tre
@@ -0,0 +1 @@
+(172647198:0.15816,((((1726478:0.00015,1726479:0.0072)0.973:0.1587,((172647201:0.10901,(172647222:1.06672,17264798:0.09727)0.030:0.00015)0.880:0.08808,((172647195:0.13208,(((172647181:0.00015,1726472:0.0145)0.999:0.27321,(172647171:0.13092,172647267:0.13968)0.211:0.04419)0.228:0.08057,(((((17264748:0.00015,(17264747:0.00015,17264731:0.00723)0.785:0.00714)0.926:0.04875,(1726473:0.01419,(172647179:0.04406,(((172647412:0.14505,(172647215:0.02662,17264737:0.07957)0.693:0.01044)0.843:0.02306,(172647132:0.00274,((172647211:0.00014,(17264736:0.01472,17264728:0.00016)0.843:0.00719)0.988:0.09556,(((((172647217:0.00721,17264724:0.00014)0.623:0.02253,(((172647157:0.00726,17264727:0.00014)0.772:0.00723,172647214:0.00016)0.881:0.00016,(17264732:0.03045,17264726:0.00016)0.880:0.00724)0.863:0.02324)1.000:0.12923,(((172647230:0.00719,(17264717:0.00406,17264714:0.01359)0.779:0.00406)0.743:0.00016,1726470:0.00014)1.000:0.00015,172647133:0.08307)0.819:0.01463)0.232:0.00015,1726474:0.00015)0.771:0.01332,(((1726477:0.01474,(17264720:0.00721,1726471:0.00014)0.507:0.00016)0.896:0.01485,17264725:0.00729)0.768:0.00721,172647213:0.00014)0.839:0.01661)0.903:0.04798)0.824:0.03685)0.909:0.04266)0.918:0.08553,((17264770:0.00895,17264752:0.00596)0.990:0.18336,(17264756:0.00016,(172647176:0.00014,(17264739:0.00719,17264715:0.00014)1.000:0.04686)0.840:0.00721)0.616:0.00732)0.456:0.07395)0.792:0.07389)0.691:0.01597)0.791:0.0197)0.165:0.01315,17264751:0.07808)0.950:0.06783,((172647126:0.06492,(172647113:0.00016,(172647192:0.00016,17264769:0.02212)0.829:0.00722)0.951:0.00014)0.644:0.00509,(17264721:0.07167,(1726475:0.04058,17264735:0.10514)0.240:0.05167)0.949:0.1203)0.839:0.0303)0.995:0.2628,(172647407:0.05751,(172647137:0.00016,((172647138:0.00014,((172647140:0.0072,17264744:0.00016)0.834:0.00863,17264749:0.01962)0.832:0.00863)0.964:0.02243,17264729:0.00015)0.842:0.00719)0.966:0.23098)0.405:0.03446)0.963:0.20131)0.118:0.01575)0.600:0.02558,17264768:0.08645)0.899:0.09942)0.685:0.039)0.844:0.07595,(((
((((((17264777:0.01271,((172647177:0.00015,17264753:0.02241)1.000:0.12894,17264779:0.01788)0.345:0.01636)0.999:0.09059,17264781:0.00016)0.993:0.06393,((172647166:0.01987,(172647168:0.0072,172647167:0.00014)0.787:0.01285)0.779:0.01205,(((172647136:0.00015,(17264718:0.07572,17264760:0.01946)0.529:0.01946)0.984:0.03913,((((172647108:0.0072,17264778:0.00015)0.387:0.02309,(172647172:0.00772,((172647170:0.00014,((172647100:0.00724,172647266:0.00723)0.381:0.00014,172647173:0.00721)0.938:0.00014)0.903:0.00015,(17264740:0.00726,(17264733:0.00016,(172647114:0.00016,172647169:0.00726)0.883:0.00726)0.126:0.00726)0.273:0.0001)0.785:0.007)0.827:0.01653)0.804:0.02316,(17264723:0.01851,((17264746:0.00378,17264738:0.00371)0.960:0.05454,(172647289:0.00015,17264761:0.03041)0.719:0.00926)0.856:0.03507)0.193:0.02952)0.782:0.03675,17264730:0.03795)0.506:0.00012)0.919:0.02245,(172647228:0.01472,172647220:0.00016)0.963:0.0312)0.075:0.00014)0.663:0.03046)0.715:0.00931,17264716:0.00016)0.987:0.06379,1726476:0.00877)0.736:0.00979,(17264712:0.00879,(17264775:0.13083,172647116:0.04075)0.240:0.00569)0.981:0.06338)0.893:0.03404,(17264711:0.03166,(172647175:0.02265,((172647180:0.00016,172647186:0.00722)0.997:0.11695,(172647111:0.01457,172647243:0.00014)0.859:0.03672)0.717:0.01514)0.861:0.03157)0.787:0.02254)0.874:0.0512,(172647208:0.06791,17264757:0.0705)0.718:0.01513)0.830:0.04872,((17264754:0.03062,17264755:0.01483)0.988:0.13383,((17264710:0.01276,17264774:0.00015)0.930:0.12501,172647146:0.22327)0.783:0.04156)0.516:0.02918)0.260:0.03229)0.810:0.03014,(17264794:0.08338,(172647223:0.01795,(172647117:0.04677,(172647120:0.03212,172647206:0.03938)0.850:0.02593)0.741:0.01266)0.913:0.07451)0.801:0.03514)0.789:0.02732,(((172647139:0.05418,17264782:0.07072)0.277:0.02918,(((((17264766:0.00727,(17264799:0.02464,172647190:0.08275)0.832:0.02355)0.841:0.02683,(17264762:0.00014,17264734:0.00723)0.768:0.01307)0.953:0.07505,(172647204:0.00015,17264767:0.00722)0.860:0.04149)0.832:0.04404,(172647142:0.11167,(17264
7303:0.00016,((172647216:0.00015,17264722:0.00725)0.982:0.0302,(172647189:0.00016,172647156:0.00726)0.010:0.00014)0.837:0.00723)0.754:0.02462)0.720:0.04728)0.933:0.07682,172647283:0.05564)0.708:0.00016)0.872:0.05547,(((17264743:0.03088,(172647135:0.00725,(172647128:0.00724,17264741:0.00014)0.536:0.00016)0.767:0.02396)0.980:0.07753,(17264792:0.24516,17264719:0.0123)0.802:0.02538)0.820:0.02383,(17264742:0.30539,((172647145:0.00014,172647147:0.0072)1.000:0.12319,((((17264788:0.00015,(17264750:0.01465,17264786:0.00016)0.300:0.00725)0.963:0.02233,172647219:0.00016)0.834:0.00724,17264745:0.00016)0.992:0.00014,(17264780:0.00015,(17264771:0.00728,(17264784:0.00016,(17264759:0.00723,17264772:0.00014)0.863:0.00728)0.782:0.0074)0.911:0.01504)0.972:0.03009)0.986:0.09962)0.425:0.00016)0.841:0.05325)0.771:0.03701)0.781:0.02693);
diff --git a/inst/extdata/testdata/Mapping_file_ADHD_aggregated.csv b/inst/extdata/testdata/Mapping_file_ADHD_aggregated.csv
@@ -0,0 +1,28 @@
+#SampleID,Treatment,Cohort,TreatmentxCohort,Description
+A110,ADHD,Cohort_1,ADHD_Cohort_1,A110
+A12,ADHD,Cohort_1,ADHD_Cohort_1,A12
+A15,ADHD,Cohort_1,ADHD_Cohort_1,A15
+A19,ADHD,Cohort_1,ADHD_Cohort_1,A19
+A21,ADHD,Cohort_2,ADHD_Cohort_2,A21
+A23,ADHD,Cohort_2,ADHD_Cohort_2,A23
+A25,ADHD,Cohort_2,ADHD_Cohort_2,A25
+A28,ADHD,Cohort_2,ADHD_Cohort_2,A28
+A29,ADHD,Cohort_2,ADHD_Cohort_2,A29
+A34,ADHD,Cohort_3,ADHD_Cohort_3,A34
+A36,ADHD,Cohort_3,ADHD_Cohort_3,A36
+A37,ADHD,Cohort_3,ADHD_Cohort_3,A37
+A39,ADHD,Cohort_3,ADHD_Cohort_3,A39
+A111,Control,Cohort_1,Control_Cohort_1,A111
+A13,Control,Cohort_1,Control_Cohort_1,A13
+A14,Control,Cohort_1,Control_Cohort_1,A14
+A16,Control,Cohort_1,Control_Cohort_1,A16
+A17,Control,Cohort_1,Control_Cohort_1,A17
+A18,Control,Cohort_1,Control_Cohort_1,A18
+A210,Control,Cohort_2,Control_Cohort_2,A210
+A22,Control,Cohort_2,Control_Cohort_2,A22
+A24,Control,Cohort_2,Control_Cohort_2,A24
+A26,Control,Cohort_2,Control_Cohort_2,A26
+A27,Control,Cohort_2,Control_Cohort_2,A27
+A33,Control,Cohort_3,Control_Cohort_3,A33
+A35,Control,Cohort_3,Control_Cohort_3,A35
+A38,Control,Cohort_3,Control_Cohort_3,A38
diff --git a/man/makeTreeSEFromBiom.Rd b/man/makeTreeSEFromBiom.Rd
diff --git a/tests/testthat/test-0diversity.R b/tests/testthat/test-0diversity.R
@@ -24,12 +24,12 @@ test_that("diversity estimates", {
     ginisimpson <- 1 - lambda
     invsimpson <- 1 / lambda
 
-    expect_equal(lambda, .simpson_lambda(assays(tse_idx)$relabundance))
-    expect_equal(ginisimpson, colData(tse_idx)$gini_simpson)
-    expect_equal(ginisimpson, .calc_gini_simpson(assays(tse_idx)$relabundance))
+    expect_equal(lambda, unname(.simpson_lambda(assays(tse_idx)$relabundance)))
+    expect_equal(ginisimpson, unname(colData(tse_idx)$gini_simpson))
+    expect_equal(ginisimpson, unname(.calc_gini_simpson(assays(tse_idx)$relabundance)))
 
-    expect_equal(invsimpson, colData(tse_idx)$inverse_simpson)
-    expect_equal(invsimpson, .calc_inverse_simpson(assays(tse_idx)$relabundance))
+    expect_equal(invsimpson, unname(colData(tse_idx)$inverse_simpson))
+    expect_equal(invsimpson, unname(.calc_inverse_simpson(assays(tse_idx)$relabundance)))
 
     cd <- colData(tse_idx)
     expect_equal(unname(round(cd$shannon, 5)), c(2.24937, 2.76239, 2.03249))
@@ -63,8 +63,8 @@ test_that("diversity estimates", {
         table(cut(x, cutpoints))
         }))
 
-    expect_equal(test1, test2)
-    expect_equal(round(test1, 6), c(7.256706, 6.098354, 7.278894))
+    expect_equal(unname(test1), unname(test2))
+    expect_equal(unname(round(test1, 6)), c(7.256706, 6.098354, 7.278894))
 
     # Tests faith index with esophagus data
     for( i in c(1:(length(colnames(tse_idx)))) ){