Skip to content

Commit

Permalink
Merge branch 'master' into issue_958
Browse files Browse the repository at this point in the history
  • Loading branch information
wasade authored May 7, 2024
2 parents b8ed659 + 0cb7fcc commit 0fe0ad6
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 15 deletions.
12 changes: 9 additions & 3 deletions ChangeLog.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,17 @@ BIOM-Format ChangeLog
biom 2.1.15-dev
---------------

General Maintenance:
New features:

* NumPy 2.0 support, see issue [#956](https://github.com/biocore/biom-format/issues/956)
* The optimized subsample without replacement method is now exposed as `biom.subsample`. Note that this method operates inplace on SciPy `csr_matrix` and `csc_matrix` objects. See issue [#958](https://github.com/biocore/biom-format/issues/958)

Bug Fixes:

* Fixed an edge case in `align_tree` when a feature was empty, see issue [#948](https://github.com/biocore/biom-format/issues/948)
* In `subsample(..., with_replacement=True)`, it was possible to trigger a numerical stability issue on sum, see issue [#952](https://github.com/biocore/biom-format/issues/952)
* `update_ids(..., strict=False)` could yield truncated IDs, see issue [#957](https://github.com/biocore/biom-format/issues/957)

Performance improvements:

* Add Windows support. PR[#951](https://github.com/biocore/biom-format/pull/951) revises codebase to be Windows compatible and adds this support to the CI testing matrix.
Expand All @@ -26,8 +33,7 @@ Bug fixes:
* Allow `Table.to_json` to properly handle numpy types in metadata, see issue [#886](https://github.com/biocore/biom-format/issues/886)
* Do not modify IDs in place in the presence of duplicate relabels, see issue [#892](https://github.com/biocore/biom-format/issues/892)
* Catch an edge case where a failed ID update in place would actually change IDs, see issue [#892](https://github.com/biocore/biom-format/issues/892)
* Fixed an edge case in `align_tree` when a feature was empty, see issue [#948](https://github.com/biocore/biom-format/issues/948)


New features:

* `biom.parse.save_table` makes saving less tedious, see issue [#897](https://github.com/biocore/biom-format/issues/897)
Expand Down
13 changes: 9 additions & 4 deletions biom/_subsample.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,18 @@ cdef _subsample_with_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
cnp.int32_t start,end,length
Py_ssize_t i
cnp.ndarray[cnp.float64_t, ndim=1] pvals

cnp.ndarray[cnp.float64_t, ndim=1] data_ceil

data_ceil = np.ceil(data)
for i in range(indptr.shape[0] - 1):
start, end = indptr[i], indptr[i+1]
length = end - start
counts_sum = data[start:end].sum()

pvals = data[start:end] / counts_sum

# base p-values on integer data to avoid small numerical issues with
# float on sum
counts_sum = data_ceil[start:end].sum()
pvals = data_ceil[start:end] / counts_sum

data[start:end] = rng.multinomial(n, pvals)


Expand Down
24 changes: 16 additions & 8 deletions biom/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -1422,7 +1422,12 @@ def update_ids(self, id_map, axis='sample', strict=True, inplace=True):
>>> print(updated_table.ids(axis='sample'))
['s1.1' 's2.2' 's3.3']
"""
str_dtype = 'U%d' % max([len(v) for v in id_map.values()])
max_str_len = max([len(v) for v in id_map.values()])
if not strict:
ids = self.ids(axis=axis)
max_str_len = max(max_str_len, max([len(i) for i in ids]))

str_dtype = 'U%d' % max_str_len
updated_ids = zeros(self.ids(axis=axis).size, dtype=str_dtype)
for idx, old_id in enumerate(self.ids(axis=axis)):
if strict and old_id not in id_map:
Expand Down Expand Up @@ -2914,7 +2919,8 @@ def subsample(self, n, axis='sample', by_id=False, with_replacement=False,
with_replacement : boolean, optional
If `False` (default), subsample without replacement. If `True`,
resample with replacement via the multinomial distribution.
Should not be `True` if `by_id` is `True`.
Should not be `True` if `by_id` is `True`. Important: If `True`,
samples with a sum below `n` are retained.
seed : int, optional
If provided, set the numpy random seed with this value
Expand All @@ -2931,14 +2937,16 @@ def subsample(self, n, axis='sample', by_id=False, with_replacement=False,
Notes
-----
Subsampling is performed without replacement. If `n` is greater than
the sum of a given vector, that vector is omitted from the result.
Adapted from `skbio.math.subsample`, see biom-format/licenses for more
information about scikit-bio.
If subsampling is performed without replacement, vectors with a sum
less than `n` are omitted from the result. This condition is not held
when operating with replacement.
This code assumes absolute abundance if `by_id` is False.
If subsampling with replacement, `np.ceil` is applied prior to
calculating p-values to ensure that low-abundance features have a
chance to be sampled.
Examples
--------
>>> import numpy as np
Expand Down Expand Up @@ -4863,7 +4871,7 @@ def to_json(self, generated_by, direct_io=None, creation_date=None):
for col_index, val in enumerate(obs[0]):
if float(val) != 0.0:
built_row.append(
"[%d,%d,%r]" % (obs_index, col_index, val)
"[%d,%d,%f]" % (obs_index, col_index, val)
)
if built_row:
# if we have written a row already, its safe to add a comma
Expand Down
Binary file added biom/tests/test_data/edgecase_issue_952.biom
Binary file not shown.
27 changes: 27 additions & 0 deletions biom/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2412,6 +2412,16 @@ def test_transpose(self):
self.st_rich.data('2', 'observation'))
self.assertEqual(obs.transpose(), self.st_rich)

def test_update_ids_strict_dtype_bug_issue_957(self):
    """Regression test for issue 957.

    With ``strict=False`` only some IDs are remapped; the IDs that are
    left untouched may be longer than any replacement value and must come
    back intact rather than truncated.
    """
    def build(sample_ids):
        # fresh data and observation IDs for every table
        return Table(np.arange(6).reshape(2, 3), ['O1', 'O2'], sample_ids)

    original = build(['ab', 'cdef', 'ghijkl'])
    expected = build(['AB', 'cdef', 'ghijkl'])

    # only the shortest ID is remapped; the longer ones are left alone
    observed = original.update_ids({'ab': 'AB'},
                                   strict=False,
                                   inplace=False)
    self.assertEqual(observed, expected)

def test_update_ids_inplace_bug_892(self):
t = example_table.copy()
exp = t.ids().copy()
Expand Down Expand Up @@ -3203,6 +3213,23 @@ def f(vals, id_, md):
with errstate(empty='raise'), self.assertRaises(TableException):
self.st_rich.filter(f, 'observation')

def test_subsample_edgecase_issue_952(self):
    """Regression test for issue 952.

    Subsampling with replacement must succeed on the edge-case table and
    leave every sample with a total of exactly ``n`` (here 10).
    """
    # this file triggers an exception on Linux on subsample
    # with replacement where the pvals computed sum to > 1. It is a
    # subset of the data reported in issue 952, specifically constrained
    # to the first 10 features with any empty samples removed.
    path = 'test_data/edgecase_issue_952.biom'

    # ...existing logic for test_data, not ideal, but consistent
    cwd = os.getcwd()
    if '/' in __file__:
        os.chdir(__file__.rsplit('/', 1)[0])
    try:
        # close the HDF5 handle deterministically once the table is read
        with h5py.File(path, 'r') as fh:
            table = Table.from_hdf5(fh)
    finally:
        # always restore the working directory, even if loading fails,
        # so later tests relying on relative paths are unaffected
        os.chdir(cwd)

    obs = table.subsample(10, with_replacement=True)
    self.assertEqual(set(obs.sum('sample')), {10.0, })

def test_subsample_same_seed_without_replacement(self):
table = Table(np.array([[3, 1, 2], [0, 3, 4]]), ['O1', 'O2'],
['S1', 'S2', 'S3'])
Expand Down

0 comments on commit 0fe0ad6

Please sign in to comment.