Merge branch 'master' into issue_958

biocore · May 7, 2024 · 0fe0ad6 · 0fe0ad6
2 parents b8ed659 + 0cb7fcc
commit 0fe0ad6
Show file tree

Hide file tree

Showing 5 changed files with 61 additions and 15 deletions.
diff --git a/ChangeLog.md b/ChangeLog.md
@@ -4,10 +4,17 @@ BIOM-Format ChangeLog
 biom 2.1.15-dev
 ---------------
 
-General Maintenance:
+New features:
 
+* NumPy 2.0 support, see issue [#956](https://github.com/biocore/biom-format/issues/956)
 * The optimized subsample without replacement method is now exposed as `biom.subsample`. Note that this method operates inplace on SciPy `csr_matrix` and `csc_matrix` objects. See issue [#958](https://github.com/biocore/biom-format/issues/958)
 
+Bug Fixes:
+
+* Fixed an edge case in `align_tree` when a feature was empty, see issue [#948](https://github.com/biocore/biom-format/issues/948)
+* In `subsample(..., with_replacement=True)`, it was possible to trigger a numerical instability when summing, see issue [#952](https://github.com/biocore/biom-format/issues/952)
+* `update_ids(..., strict=False)` could yield truncated IDs, see issue [#957](https://github.com/biocore/biom-format/issues/957)
+
 Performance improvements:
 
 * Add Windows support. PR[#951](https://github.com/biocore/biom-format/pull/951) revises codebase to be Windows compatible and adds this support to the CI testing matrix.
@@ -26,8 +33,7 @@ Bug fixes:
 * Allow `Table.to_json` to properly handle numpy types in metadata, see issue [#886](https://github.com/biocore/biom-format/issues/886)
 * Do not modify IDs in place in the presence of duplicate relabels, see issue [#892](https://github.com/biocore/biom-format/issues/892)
 * Catch an edge case where a failed ID update in place would actually change IDs, see issue [#892](https://github.com/biocore/biom-format/issues/892)
-* Fixed an edge case on in `align_tree` when a feature was empty, see issue [#948](https://github.com/biocore/biom-format/issues/948)
-
+
 New features:
 
 * `biom.parse.save_table` makes saving less tedious, see issue [#897](https://github.com/biocore/biom-format/issues/897)

diff --git a/biom/_subsample.pyx b/biom/_subsample.pyx
@@ -41,13 +41,18 @@ cdef _subsample_with_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
         cnp.int32_t start,end,length
         Py_ssize_t i
         cnp.ndarray[cnp.float64_t, ndim=1] pvals
-
+        cnp.ndarray[cnp.float64_t, ndim=1] data_ceil 
+
+    data_ceil = np.ceil(data)
     for i in range(indptr.shape[0] - 1):
         start, end = indptr[i], indptr[i+1]
         length = end - start
-        counts_sum = data[start:end].sum()
-
-        pvals = data[start:end] / counts_sum
+
+        # base p-values on integer data to avoid small numerical issues with 
+        # float on sum
+        counts_sum = data_ceil[start:end].sum()
+        pvals = data_ceil[start:end] / counts_sum
+
         data[start:end] = rng.multinomial(n, pvals)
 
 

diff --git a/biom/table.py b/biom/table.py
@@ -1422,7 +1422,12 @@ def update_ids(self, id_map, axis='sample', strict=True, inplace=True):
         >>> print(updated_table.ids(axis='sample'))
         ['s1.1' 's2.2' 's3.3']
         """
-        str_dtype = 'U%d' % max([len(v) for v in id_map.values()])
+        max_str_len = max([len(v) for v in id_map.values()])
+        if not strict:
+            ids = self.ids(axis=axis)
+            max_str_len = max(max_str_len, max([len(i) for i in ids]))
+
+        str_dtype = 'U%d' % max_str_len
         updated_ids = zeros(self.ids(axis=axis).size, dtype=str_dtype)
         for idx, old_id in enumerate(self.ids(axis=axis)):
             if strict and old_id not in id_map:
@@ -2914,7 +2919,8 @@ def subsample(self, n, axis='sample', by_id=False, with_replacement=False,
         with_replacement : boolean, optional
             If `False` (default), subsample without replacement. If `True`,
             resample with replacement via the multinomial distribution.
-            Should not be `True` if `by_id` is `True`.
+            Should not be `True` if `by_id` is `True`. Important: If `True`,
+            samples with a sum below `n` are retained.
         seed : int, optional
             If provided, set the numpy random seed with this value
 
@@ -2931,14 +2937,16 @@ def subsample(self, n, axis='sample', by_id=False, with_replacement=False,
 
         Notes
         -----
-        Subsampling is performed without replacement. If `n` is greater than
-        the sum of a given vector, that vector is omitted from the result.
-
-        Adapted from `skbio.math.subsample`, see biom-format/licenses for more
-        information about scikit-bio.
+        If subsampling is performed without replacement, vectors with a sum
+        less than `n` are omitted from the result. This does not apply when
+        subsampling with replacement: such vectors are retained.
 
         This code assumes absolute abundance if `by_id` is False.
 
+        If subsampling with replacement, `np.ceil` is applied prior to
+        calculating p-values to ensure that low-abundance features have a
+        chance to be sampled.
+
         Examples
         --------
         >>> import numpy as np
@@ -4863,7 +4871,7 @@ def to_json(self, generated_by, direct_io=None, creation_date=None):
             for col_index, val in enumerate(obs[0]):
                 if float(val) != 0.0:
                     built_row.append(
-                        "[%d,%d,%r]" % (obs_index, col_index, val)
+                        "[%d,%d,%f]" % (obs_index, col_index, val)
                     )
             if built_row:
                 # if we have written a row already, its safe to add a comma

diff --git a/biom/tests/test_data/edgecase_issue_952.biom b/biom/tests/test_data/edgecase_issue_952.biom
diff --git a/biom/tests/test_table.py b/biom/tests/test_table.py
@@ -2412,6 +2412,16 @@ def test_transpose(self):
                          self.st_rich.data('2', 'observation'))
         self.assertEqual(obs.transpose(), self.st_rich)
 
+    def test_update_ids_strict_dtype_bug_issue_957(self):
+        t = Table(np.arange(6).reshape(2, 3),
+                  ['O1', 'O2'],
+                  ['ab', 'cdef', 'ghijkl'])
+        exp = Table(np.arange(6).reshape(2, 3),
+                    ['O1', 'O2'],
+                    ['AB', 'cdef', 'ghijkl'])
+        obs = t.update_ids({'ab': 'AB'}, strict=False, inplace=False)
+        self.assertEqual(obs, exp)
+
     def test_update_ids_inplace_bug_892(self):
         t = example_table.copy()
         exp = t.ids().copy()
@@ -3203,6 +3213,23 @@ def f(vals, id_, md):
         with errstate(empty='raise'), self.assertRaises(TableException):
             self.st_rich.filter(f, 'observation')
 
+    def test_subsample_edgecase_issue_952(self):
+        # this file triggers an exception on Linux on subsample
+        # with replacement where the pvals computed sum to > 1. It is a
+        # subset of the data reported in issue 952, specifically constrained
+        # to the first 10 features with any empty samples removed.
+        path = 'test_data/edgecase_issue_952.biom'
+
+        # ...existing logic for test_data, not ideal, but consistent
+        cwd = os.getcwd()
+        if '/' in __file__:
+            os.chdir(__file__.rsplit('/', 1)[0])
+        table = Table.from_hdf5(h5py.File(path, 'r'))
+        os.chdir(cwd)
+
+        obs = table.subsample(10, with_replacement=True)
+        self.assertEqual(set(obs.sum('sample')), {10.0, })
+
     def test_subsample_same_seed_without_replacement(self):
         table = Table(np.array([[3, 1, 2], [0, 3, 4]]), ['O1', 'O2'],
                       ['S1', 'S2', 'S3'])