Skip to content

Commit

Permalink
Merge pull request #164 from nvictus/feat-mark-compress-runs
Browse files Browse the repository at this point in the history
Add mark_runs and merge_runs
  • Loading branch information
nvictus authored Oct 18, 2024
2 parents 19470b4 + 942b46a commit 6515f1c
Show file tree
Hide file tree
Showing 5 changed files with 305 additions and 2 deletions.
2 changes: 0 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,4 @@ jobs:
- run: |
python -m pip install --upgrade pip hatch
pip install -e .[dev]
# Stop the build if there are Python syntax errors or undefined names
ruff . --select=E9,F63,F7,F82 --show-source
hatch run test
4 changes: 4 additions & 0 deletions bioframe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
"expand",
"merge",
"overlap",
"mark_runs",
"merge_runs",
"select",
"select_indices",
"select_labels",
Expand Down Expand Up @@ -107,6 +109,8 @@
frac_gene_coverage,
frac_mapped,
make_chromarms,
mark_runs,
merge_runs,
pair_by_distance,
seq_gc,
)
Expand Down
200 changes: 200 additions & 0 deletions bioframe/extras.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

import numpy as np
import pandas as pd

Expand All @@ -13,6 +15,8 @@
"seq_gc",
"frac_gene_coverage",
"pair_by_distance",
"mark_runs",
"merge_runs"
]


Expand Down Expand Up @@ -538,3 +542,199 @@ def pair_by_distance(
out_df.reset_index(drop=True, inplace=True)

return out_df


def mark_runs(
    df: pd.DataFrame,
    col: str,
    *,
    allow_overlaps: bool = False,
    reset_counter: bool = True,
    run_col: str = 'run',
    cols: tuple[str, str, str] | None = None,
) -> pd.DataFrame:
    """
    Mark runs of spatially consecutive intervals sharing the same value of
    ``col``.

    Parameters
    ----------
    df : DataFrame
        A bioframe dataframe.
    col : str
        The column to mark runs of values for.
    allow_overlaps : bool, optional [default: False]
        If True, allow intervals in ``df`` to overlap. This may cause
        unexpected results.
    reset_counter : bool, optional [default: True]
        If True, reset the run counter for each chromosome.
    run_col : str, optional [default: 'run']
        The name of the column to store the run numbers in.
    cols : (str, str, str), optional
        The names of the columns containing the chromosome, start, and end
        of the genomic intervals. Defaults are 'chrom', 'start', 'end'.

    Returns
    -------
    pandas.DataFrame
        A reordered copy of the input dataframe with an additional column
        (named by ``run_col``) marking runs of values in the input column.

    Raises
    ------
    ValueError
        If ``allow_overlaps`` is False and overlapping intervals are found.

    Notes
    -----
    This is similar to :func:`cluster`, but only clusters intervals sharing
    the same value of ``col``.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     'chrom': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1'],
    ...     'start': [0, 100, 200, 300, 400, 500],
    ...     'end': [100, 200, 300, 400, 500, 600],
    ...     'value': [1, 1, 1, 2, 2, 2],
    ... })
    >>> mark_runs(df, 'value')
      chrom  start  end  value  run
    0  chr1      0  100      1    0
    1  chr1    100  200      1    0
    2  chr1    200  300      1    0
    3  chr1    300  400      2    1
    4  chr1    400  500      2    1
    5  chr1    500  600      2    1

    See Also
    --------
    merge_runs
    cluster
    merge
    """
    ck, sk, ek = _get_default_colnames() if cols is None else cols

    if not allow_overlaps and len(ops.overlap(df, df)) > len(df):
        raise ValueError("Not a proper bedGraph: found overlapping intervals.")

    # Empty input: return an empty copy with the run column attached,
    # rather than letting ``pd.concat([])`` raise a ValueError below.
    if len(df) == 0:
        out = df.copy()
        out[run_col] = np.empty(0, dtype=np.int64)
        return out

    result = []
    n_runs = 0

    for _, group in df.groupby(ck, sort=False):
        group = group.sort_values([sk, ek])
        starts = group[sk].to_numpy()
        ends = group[ek].to_numpy()

        # Extend ends by running max so that intervals fully contained in an
        # earlier one do not spuriously split a cluster.
        ends = np.maximum.accumulate(ends)

        # Find borders of interval clusters: a new cluster starts wherever an
        # interval's start lies strictly beyond the running-max end so far.
        is_cluster_border = np.r_[True, starts[1:] > ends[:-1], False]

        # Find borders of consecutive equal values; floats are compared with
        # isclose (equal_nan=True) so runs of NaN stay together.
        values = group[col].to_numpy()
        if values.dtype.kind == 'f':
            is_value_border = np.r_[
                True,
                ~np.isclose(values[1:], values[:-1], equal_nan=True),
                False
            ]
        else:
            is_value_border = np.r_[True, values[1:] != values[:-1], False]

        # A run breaks at either kind of border; cumulative sum of borders
        # yields 1-based run labels, shifted to 0-based ids per group.
        is_border = is_cluster_border | is_value_border
        sum_borders = np.cumsum(is_border)
        run_ids = sum_borders[:-1] - 1

        # Assign run numbers to intervals, optionally continuing the counter
        # across chromosomes.
        if reset_counter:
            n_runs = 0
        group[run_col] = n_runs + run_ids
        n_runs += sum_borders[-1]

        result.append(group)

    return pd.concat(result)


def merge_runs(
    df: pd.DataFrame,
    col: str,
    *,
    allow_overlaps: bool = False,
    agg: dict | None = None,
    cols: tuple[str, str, str] | None = None,
) -> pd.DataFrame:
    """
    Merge runs of spatially consecutive intervals sharing the same value of
    ``col``.

    Parameters
    ----------
    df : DataFrame
        A bioframe dataframe.
    col : str
        The column to compress runs of values for.
    allow_overlaps : bool, optional [default: False]
        If True, allow intervals in ``df`` to overlap. This may cause
        unexpected results.
    agg : dict, optional [default: None]
        A dictionary of additional column names and aggregation functions to
        apply to each run. Takes the format:
        {'agg_name': ('column_name', 'agg_func')}
    cols : (str, str, str), optional
        The names of the columns containing the chromosome, start, and end
        of the genomic intervals. Defaults are 'chrom', 'start', 'end'.

    Returns
    -------
    pandas.DataFrame
        Dataframe with consecutive intervals in the same run merged.

    Raises
    ------
    ValueError
        If ``allow_overlaps`` is False and overlapping intervals are found.

    Notes
    -----
    This is similar to :func:`merge`, but only merges intervals sharing
    the same value of ``col``.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     'chrom': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1'],
    ...     'start': [0, 100, 200, 300, 400, 500],
    ...     'end': [100, 200, 300, 400, 500, 600],
    ...     'value': [1, 1, 1, 2, 2, 2],
    ... })
    >>> merge_runs(df, 'value')
      chrom  start  end  value
    0  chr1      0  300      1
    1  chr1    300  600      2

    >>> merge_runs(df, 'value', agg={'sum': ('value', 'sum')})
      chrom  start  end  value  sum
    0  chr1      0  300      1    3
    1  chr1    300  600      2    6

    See Also
    --------
    mark_runs
    cluster
    merge
    """
    ck, sk, ek = _get_default_colnames() if cols is None else cols

    if agg is None:
        agg = {}

    # Empty input: return an empty frame with the expected output columns
    # (coordinates, value column, plus any requested aggregations) instead of
    # failing inside ``mark_runs``.
    if len(df) == 0:
        out = df[[ck, sk, ek, col]].copy()
        for agg_name in agg:
            out[agg_name] = []
        return out.reset_index(drop=True)

    # Label runs with a globally increasing counter so that the '_run' ids
    # are unique across chromosomes and can serve as groupby keys.
    df_runs = mark_runs(
        df,
        col,
        allow_overlaps=allow_overlaps,
        reset_counter=False,
        run_col='_run',
    )
    # Collapse each run: first chrom/value, min start, max end, plus any
    # user-supplied named aggregations.
    df_merged = (
        df_runs
        .groupby('_run')
        .agg(**{
            ck: (ck, 'first'),
            sk: (sk, 'min'),
            ek: (ek, 'max'),
            col: (col, 'first'),
            **agg
        })
    )
    return df_merged.reset_index(drop=True)
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ classifiers = [
"Programming Language :: Python :: 3.11",
]
readme = "README.md"
requires-python = ">=3.8"
dependencies = [
"matplotlib",
"numpy>=1.10, <2",
Expand All @@ -49,6 +50,7 @@ dependencies = [
[project.optional-dependencies]
dev = [
"biopython",
"pre-commit",
"pysam",
"pybbi",
"pytest",
Expand Down
99 changes: 99 additions & 0 deletions tests/test_extras.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,3 +304,102 @@ def test_pair_by_distance():
bioframe.pair_by_distance(
df, min_sep=0, max_sep=9, min_intervening=10, max_intervening=9
)


def test_mark_merge_runs():
    """mark_runs/merge_runs on non-overlapping intervals across chromosomes."""
    df1 = pd.DataFrame([
        # chr1
        # consecutive run of "c"
        ["chr1", 85563, 129897, "c", 0.2],
        ["chr1", 129897, 508340, "c", 0.8],
        ["chr1", 508340, 620903, "c", 0.5],

        # singleton run of "c" separated by 1bp from previous run
        ["chr1", 620904, 688020, "c", 0.7],

        # consecutive with previous interval but different value of "name"
        ["chr1", 688020, 858415, "b", 0.8],

        # chr2
        ["chr2", 548402, 639680, "a", 0.6],
        ["chr2", 639680, 1026586, "b", 0.8],

        # chr3
        ["chr3", 260538, 272930, "c", 0.5],
        ["chr3", 272930, 470969, "c", 0.5],
        ["chr3", 470969, 502336, "c", 0.5],
    ], columns=["chrom", "start", "end", "name", "score"])

    # Default behavior: the run counter restarts at 0 on each chromosome.
    runs = bioframe.mark_runs(df1, "name")
    assert (
        runs["name"].to_numpy()
        == np.array(["c", "c", "c", "c", "b", "a", "b", "c", "c", "c"])
    ).all()
    assert (
        runs["run"].to_numpy()
        == np.array([0, 0, 0, 1, 2, 0, 1, 0, 0, 0])
    ).all()

    # reset_counter=False: run ids keep increasing across chromosomes.
    runs = bioframe.mark_runs(df1, "name", reset_counter=False)
    assert (
        runs["run"].to_numpy()
        == np.array([0, 0, 0, 1, 2, 3, 4, 5, 5, 5])
    ).all()

    # run_col renames the output column that stores the run ids.
    runs = bioframe.mark_runs(df1, "name", run_col="foo", reset_counter=False)
    assert (
        runs["foo"].to_numpy()
        == np.array([0, 0, 0, 1, 2, 3, 4, 5, 5, 5])
    ).all()

    # merge_runs collapses each run to a single interval; extra per-run
    # aggregations are requested in named-aggregation format.
    merged = bioframe.merge_runs(
        df1, "name", agg={"score_mean": ("score", "mean")}
    )
    assert (
        merged["name"].to_numpy()
        == np.array(["c", "c", "b", "a", "b", "c"])
    ).all()
    assert np.allclose(
        merged["score_mean"].to_numpy(),
        np.array([0.5, 0.7, 0.8, 0.6, 0.8, 0.5]),
    )


def test_mark_merge_runs__with_overlaps():
    """mark_runs raises on overlapping intervals unless allow_overlaps=True."""
    df1 = pd.DataFrame([
        # chr1
        # consecutive run of "c"
        ["chr1", 85563, 129897, "c", 0.2],
        ["chr1", 129897, 508340, "c", 0.8],
        ["chr1", 508340, 620903, "c", 0.5],

        # singleton run of "c" separated by 1bp from previous run
        ["chr1", 620904, 688020, "c", 0.7],

        # consecutive with previous interval but different value of "name"
        ["chr1", 688020, 858415, "b", 0.8],
        # overlapping with previous interval
        ["chr1", 700000, 900000, "b", 0.8],

        # chr2
        ["chr2", 548402, 639680, "a", 0.6],
        ["chr2", 639680, 1026586, "b", 0.8],

        # chr3
        ["chr3", 260538, 272930, "c", 0.5],
        ["chr3", 272930, 470969, "c", 0.5],
        ["chr3", 470969, 502336, "c", 0.5],
    ], columns=["chrom", "start", "end", "name", "score"])

    # Overlaps are rejected by default ("not a proper bedGraph").
    with pytest.raises(ValueError):
        bioframe.mark_runs(df1, "name")

    # With allow_overlaps=True, overlapping same-value intervals join a run.
    runs = bioframe.mark_runs(df1, "name", allow_overlaps=True)
    assert (
        runs["name"].to_numpy()
        == np.array(["c", "c", "c", "c", "b", "b", "a", "b", "c", "c", "c"])
    ).all()
    assert (
        runs["run"].to_numpy()
        == np.array([0, 0, 0, 1, 2, 2, 0, 1, 0, 0, 0])
    ).all()

0 comments on commit 6515f1c

Please sign in to comment.