Skip to content

Commit

Permalink
Merge pull request #164 from nvictus/feat-mark-compress-runs
Browse files Browse the repository at this point in the history
Add mark_runs and merge_runs
  • Loading branch information
nvictus authored Oct 18, 2024
2 parents 19470b4 + 942b46a commit 6515f1c
Show file tree
Hide file tree
Showing 5 changed files with 305 additions and 2 deletions.
2 changes: 0 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,4 @@ jobs:
- run: |
python -m pip install --upgrade pip hatch
pip install -e .[dev]
# Stop the build if there are Python syntax errors or undefined names
ruff . --select=E9,F63,F7,F82 --show-source
hatch run test
4 changes: 4 additions & 0 deletions bioframe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
"expand",
"merge",
"overlap",
"mark_runs",
"merge_runs",
"select",
"select_indices",
"select_labels",
Expand Down Expand Up @@ -107,6 +109,8 @@
frac_gene_coverage,
frac_mapped,
make_chromarms,
mark_runs,
merge_runs,
pair_by_distance,
seq_gc,
)
Expand Down
200 changes: 200 additions & 0 deletions bioframe/extras.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

import numpy as np
import pandas as pd

Expand All @@ -13,6 +15,8 @@
"seq_gc",
"frac_gene_coverage",
"pair_by_distance",
"mark_runs",
"merge_runs"
]


Expand Down Expand Up @@ -538,3 +542,199 @@ def pair_by_distance(
out_df.reset_index(drop=True, inplace=True)

return out_df


def mark_runs(
    df: pd.DataFrame,
    col: str,
    *,
    allow_overlaps: bool = False,
    reset_counter: bool = True,
    run_col: str = 'run',
    cols: tuple[str, str, str] | None = None,
) -> pd.DataFrame:
    """
    Mark runs of spatially consecutive intervals sharing the same value of
    ``col``.

    Parameters
    ----------
    df : DataFrame
        A bioframe dataframe.
    col : str
        The column to mark runs of values for.
    allow_overlaps : bool, optional [default: False]
        If True, allow intervals in ``df`` to overlap. This may cause
        unexpected results.
    reset_counter : bool, optional [default: True]
        If True, reset the run counter for each chromosome.
    run_col : str, optional [default: 'run']
        The name of the column to store the run numbers in.
    cols : (str, str, str), optional
        The names of the columns containing the chromosome, start, and end
        of the genomic intervals. Defaults are 'chrom', 'start', 'end'.

    Returns
    -------
    pandas.DataFrame
        A reordered copy of the input dataframe with an additional column
        (named by ``run_col``) marking runs of values in the input column.

    Raises
    ------
    ValueError
        If ``allow_overlaps`` is False and overlapping intervals are found.

    Notes
    -----
    This is similar to :func:`cluster`, but only clusters intervals sharing
    the same value of ``col``.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     'chrom': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1'],
    ...     'start': [0, 100, 200, 300, 400, 500],
    ...     'end': [100, 200, 300, 400, 500, 600],
    ...     'value': [1, 1, 1, 2, 2, 2],
    ... })
    >>> mark_runs(df, 'value')
      chrom  start  end  value  run
    0  chr1      0  100      1    0
    1  chr1    100  200      1    0
    2  chr1    200  300      1    0
    3  chr1    300  400      2    1
    4  chr1    400  500      2    1
    5  chr1    500  600      2    1

    See Also
    --------
    merge_runs
    cluster
    merge
    """
    ck, sk, ek = _get_default_colnames() if cols is None else cols

    if not allow_overlaps and len(ops.overlap(df, df)) > len(df):
        raise ValueError("Not a proper bedGraph: found overlapping intervals.")

    # Empty input: return an empty copy with the run column attached,
    # rather than letting ``pd.concat([])`` raise a ValueError below.
    if len(df) == 0:
        out = df.copy()
        out[run_col] = np.empty(0, dtype=np.int64)
        return out

    result = []
    n_runs = 0

    for _, group in df.groupby(ck, sort=False):
        group = group.sort_values([sk, ek])
        starts = group[sk].to_numpy()
        ends = group[ek].to_numpy()

        # Extend ends by running max so that intervals fully contained in an
        # earlier one do not spuriously split a cluster.
        ends = np.maximum.accumulate(ends)

        # Find borders of interval clusters: a new cluster starts wherever an
        # interval's start lies strictly beyond the running-max end so far.
        is_cluster_border = np.r_[True, starts[1:] > ends[:-1], False]

        # Find borders of consecutive equal values; floats are compared with
        # isclose (equal_nan=True) so runs of NaN stay together.
        values = group[col].to_numpy()
        if values.dtype.kind == 'f':
            is_value_border = np.r_[
                True,
                ~np.isclose(values[1:], values[:-1], equal_nan=True),
                False
            ]
        else:
            is_value_border = np.r_[True, values[1:] != values[:-1], False]

        # A run breaks at either kind of border; cumulative sum of borders
        # yields 1-based run labels, shifted to 0-based ids per group.
        is_border = is_cluster_border | is_value_border
        sum_borders = np.cumsum(is_border)
        run_ids = sum_borders[:-1] - 1

        # Assign run numbers to intervals, optionally continuing the counter
        # across chromosomes.
        if reset_counter:
            n_runs = 0
        group[run_col] = n_runs + run_ids
        n_runs += sum_borders[-1]

        result.append(group)

    return pd.concat(result)


def merge_runs(
    df: pd.DataFrame,
    col: str,
    *,
    allow_overlaps: bool = False,
    agg: dict | None = None,
    cols: tuple[str, str, str] | None = None,
) -> pd.DataFrame:
    """
    Merge runs of spatially consecutive intervals sharing the same value of
    ``col``.

    Parameters
    ----------
    df : DataFrame
        A bioframe dataframe.
    col : str
        The column to compress runs of values for.
    allow_overlaps : bool, optional [default: False]
        If True, allow intervals in ``df`` to overlap. This may cause
        unexpected results.
    agg : dict, optional [default: None]
        A dictionary of additional column names and aggregation functions to
        apply to each run. Takes the format:
        {'agg_name': ('column_name', 'agg_func')}
    cols : (str, str, str), optional
        The names of the columns containing the chromosome, start, and end
        of the genomic intervals. Defaults are 'chrom', 'start', 'end'.

    Returns
    -------
    pandas.DataFrame
        Dataframe with consecutive intervals in the same run merged.

    Raises
    ------
    ValueError
        If ``allow_overlaps`` is False and overlapping intervals are found.

    Notes
    -----
    This is similar to :func:`merge`, but only merges intervals sharing
    the same value of ``col``.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     'chrom': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1'],
    ...     'start': [0, 100, 200, 300, 400, 500],
    ...     'end': [100, 200, 300, 400, 500, 600],
    ...     'value': [1, 1, 1, 2, 2, 2],
    ... })
    >>> merge_runs(df, 'value')
      chrom  start  end  value
    0  chr1      0  300      1
    1  chr1    300  600      2

    >>> merge_runs(df, 'value', agg={'sum': ('value', 'sum')})
      chrom  start  end  value  sum
    0  chr1      0  300      1    3
    1  chr1    300  600      2    6

    See Also
    --------
    mark_runs
    cluster
    merge
    """
    ck, sk, ek = _get_default_colnames() if cols is None else cols

    if agg is None:
        agg = {}

    # Empty input: return an empty frame with the expected output columns
    # (coordinates, value column, plus any requested aggregations) instead of
    # failing inside ``mark_runs``.
    if len(df) == 0:
        out = df[[ck, sk, ek, col]].copy()
        for agg_name in agg:
            out[agg_name] = []
        return out.reset_index(drop=True)

    # Label runs with a globally increasing counter so that the '_run' ids
    # are unique across chromosomes and can serve as groupby keys.
    df_runs = mark_runs(
        df,
        col,
        allow_overlaps=allow_overlaps,
        reset_counter=False,
        run_col='_run',
    )
    # Collapse each run: first chrom/value, min start, max end, plus any
    # user-supplied named aggregations.
    df_merged = (
        df_runs
        .groupby('_run')
        .agg(**{
            ck: (ck, 'first'),
            sk: (sk, 'min'),
            ek: (ek, 'max'),
            col: (col, 'first'),
            **agg
        })
    )
    return df_merged.reset_index(drop=True)
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ classifiers = [
"Programming Language :: Python :: 3.11",
]
readme = "README.md"
requires-python = ">=3.8"
dependencies = [
"matplotlib",
"numpy>=1.10, <2",
Expand All @@ -49,6 +50,7 @@ dependencies = [
[project.optional-dependencies]
dev = [
"biopython",
"pre-commit",
"pysam",
"pybbi",
"pytest",
Expand Down
99 changes: 99 additions & 0 deletions tests/test_extras.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,3 +304,102 @@ def test_pair_by_distance():
bioframe.pair_by_distance(
df, min_sep=0, max_sep=9, min_intervening=10, max_intervening=9
)


def test_mark_merge_runs():
    """mark_runs/merge_runs on non-overlapping intervals across chromosomes."""
    df1 = pd.DataFrame([
        # chr1
        # consecutive run of "c"
        ["chr1", 85563, 129897, "c", 0.2],
        ["chr1", 129897, 508340, "c", 0.8],
        ["chr1", 508340, 620903, "c", 0.5],

        # singleton run of "c" separated by 1bp from previous run
        ["chr1", 620904, 688020, "c", 0.7],

        # consecutive with previous interval but different value of "name"
        ["chr1", 688020, 858415, "b", 0.8],

        # chr2
        ["chr2", 548402, 639680, "a", 0.6],
        ["chr2", 639680, 1026586, "b", 0.8],

        # chr3
        ["chr3", 260538, 272930, "c", 0.5],
        ["chr3", 272930, 470969, "c", 0.5],
        ["chr3", 470969, 502336, "c", 0.5],
    ], columns=["chrom", "start", "end", "name", "score"])

    # Default behavior: the run counter restarts at 0 on each chromosome.
    runs = bioframe.mark_runs(df1, "name")
    assert (
        runs["name"].to_numpy()
        == np.array(["c", "c", "c", "c", "b", "a", "b", "c", "c", "c"])
    ).all()
    assert (
        runs["run"].to_numpy()
        == np.array([0, 0, 0, 1, 2, 0, 1, 0, 0, 0])
    ).all()

    # reset_counter=False: run ids keep increasing across chromosomes.
    runs = bioframe.mark_runs(df1, "name", reset_counter=False)
    assert (
        runs["run"].to_numpy()
        == np.array([0, 0, 0, 1, 2, 3, 4, 5, 5, 5])
    ).all()

    # run_col renames the output column that stores the run ids.
    runs = bioframe.mark_runs(df1, "name", run_col="foo", reset_counter=False)
    assert (
        runs["foo"].to_numpy()
        == np.array([0, 0, 0, 1, 2, 3, 4, 5, 5, 5])
    ).all()

    # merge_runs collapses each run to a single interval; extra per-run
    # aggregations are requested in named-aggregation format.
    merged = bioframe.merge_runs(
        df1, "name", agg={"score_mean": ("score", "mean")}
    )
    assert (
        merged["name"].to_numpy()
        == np.array(["c", "c", "b", "a", "b", "c"])
    ).all()
    assert np.allclose(
        merged["score_mean"].to_numpy(),
        np.array([0.5, 0.7, 0.8, 0.6, 0.8, 0.5]),
    )


def test_mark_merge_runs__with_overlaps():
    """mark_runs raises on overlapping intervals unless allow_overlaps=True."""
    df1 = pd.DataFrame([
        # chr1
        # consecutive run of "c"
        ["chr1", 85563, 129897, "c", 0.2],
        ["chr1", 129897, 508340, "c", 0.8],
        ["chr1", 508340, 620903, "c", 0.5],

        # singleton run of "c" separated by 1bp from previous run
        ["chr1", 620904, 688020, "c", 0.7],

        # consecutive with previous interval but different value of "name"
        ["chr1", 688020, 858415, "b", 0.8],
        # overlapping with previous interval
        ["chr1", 700000, 900000, "b", 0.8],

        # chr2
        ["chr2", 548402, 639680, "a", 0.6],
        ["chr2", 639680, 1026586, "b", 0.8],

        # chr3
        ["chr3", 260538, 272930, "c", 0.5],
        ["chr3", 272930, 470969, "c", 0.5],
        ["chr3", 470969, 502336, "c", 0.5],
    ], columns=["chrom", "start", "end", "name", "score"])

    # Overlaps are rejected by default ("not a proper bedGraph").
    with pytest.raises(ValueError):
        bioframe.mark_runs(df1, "name")

    # With allow_overlaps=True, overlapping same-value intervals join a run.
    runs = bioframe.mark_runs(df1, "name", allow_overlaps=True)
    assert (
        runs["name"].to_numpy()
        == np.array(["c", "c", "c", "c", "b", "b", "a", "b", "c", "c", "c"])
    ).all()
    assert (
        runs["run"].to_numpy()
        == np.array([0, 0, 0, 1, 2, 2, 0, 1, 0, 0, 0])
    ).all()

0 comments on commit 6515f1c

Please sign in to comment.