Skip to content

Commit

Permalink
✨ Allow passing vcf as 2nd arg to macros (#45)
Browse files Browse the repository at this point in the history
  • Loading branch information
pwwang committed Sep 27, 2023
1 parent 7a46551 commit 30ddbca
Show file tree
Hide file tree
Showing 24 changed files with 755 additions and 684 deletions.
13 changes: 13 additions & 0 deletions docs/macros.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,19 @@ def GTTYPEs(variant)

To get the genotype of sample 1 in a formula, use `GTTYPEs{0}`. You can also use the sample name: `GTTYPEs{some sample}`

A macro can also accept `vcf` (the instance of `cyvcf2.VCF`) as its second argument. For example:

```python
from vcfstats.macros import cont

@cont
def MIXED_INFO(variant, vcf):
...
```

Check the [API documentation](https://brentp.github.io/cyvcf2/docstrings.html) of `cyvcf2` to see what information we can get from `vcf`.


## Macros with filters

`aggregation`s have a different syntax for filters. Here we discuss `continuous` and `categorical` macros.
Expand Down
Binary file modified examples/allele-frequency-on-each-chromosome.violin.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/depths-between-sample-1-and-2.scatter.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/gq-vs-depth-sample-1.scatter.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/mutant-genotypes-on-each-chromosome-sample-1.col.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/number-of-substitutions-of-snps-passed.col.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/number-of-substitutions-of-snps.col.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/number-of-variants-on-each-chromosome-first-5.col.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/number-of-variants-on-each-chromosome-modified.col.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/number-of-variants-on-each-chromosome.col.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/overall-allele-frequency-distribution.histogram.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/types-of-variants-on-each-chromosome.col.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/types-of-variants-on-whole-genome.pie.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1,285 changes: 656 additions & 629 deletions poetry.lock

Large diffs are not rendered by default.

74 changes: 37 additions & 37 deletions tests/test_formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,28 +109,28 @@ def test_term_init():

def test_term_run(variants):
term = Term("FILTER", "PASS")
assert term.run(variants[0], passed=True) == False
assert term.run(variants[5], passed=True) == ["PASS"]
assert term.run(variants[0], None, passed=True) == False
assert term.run(variants[5], None, passed=True) == ["PASS"]

term = Term("FILTER2")
assert term.run(variants[5], passed=True) == False
assert term.run(variants[5], None, passed=True) == False

term = Term("GTTYPEs", None, ["0"])
term.set_samples(variants[-1])
assert term.run(variants[0], passed=False) == ["HOM_REF"]
assert term.run(variants[0], None, passed=False) == ["HOM_REF"]

term = Term("AAF", [0.126, None])
# .125
assert term.run(variants[0], passed=False) == False
assert term.run(variants[2], passed=False) == [0.25]
assert term.run(variants[0], None, passed=False) == False
assert term.run(variants[2], None, passed=False) == [0.25]
term = Term("AAF", [None, 0.24])
# .25
assert term.run(variants[0], passed=False) == [0.125]
assert term.run(variants[2], passed=False) == False
assert term.run(variants[0], None, passed=False) == [0.125]
assert term.run(variants[2], None, passed=False) == False

term = Term("FILTER", "PASS")
assert term.run(variants[0], passed=False) == False
assert term.run(variants[5], passed=False) == ["PASS"]
assert term.run(variants[0], None, passed=False) == False
assert term.run(variants[5], None, passed=False) == ["PASS"]


def test_aggr_init():
Expand Down Expand Up @@ -194,54 +194,54 @@ def test_aggr_run(variants):
aggr = Aggr(
"COUNT", One(), filter=Term("FILTER", ["PASS"]), group=Term("VARTYPE")
)
aggr.run(variants[0], passed=True)
aggr.run(variants[0], None, passed=True)
assert len(aggr.cache) == 0

aggr2 = Aggr("COUNT", One(), filter=Term("FILTER", ["PASS"]))
with pytest.raises(RuntimeError):
aggr2.run(variants[5], passed=True)
aggr2.run(variants[5], None, passed=True)

aggr3 = Aggr(
"COUNT", One(), filter=Term("FILTER", ["PASS"]), group=Term("FILTER2")
)
aggr3.run(variants[5], passed=False)
aggr3.run(variants[5], None, passed=False)
assert len(aggr3.cache) == 0

aggr4 = Aggr("COUNT", One(), Term("GTTYPEs"))
with pytest.raises(ValueError):
aggr4.run(variants[0], passed=False)
aggr4.run(variants[0], None, passed=False)

aggr5 = Aggr("COUNT", One(), Term("VARTYPE"))
aggr5.run(variants[0], passed=False)
aggr5.run(variants[0], None, passed=False)
assert aggr5.cache == {"snp": [1]}
aggr5.run(variants[1], passed=False)
aggr5.run(variants[1], None, passed=False)
assert aggr5.cache == {"snp": [1, 1]}
aggr5.run(variants[3], passed=False)
aggr5.run(variants[3], None, passed=False)
assert aggr5.cache == {"snp": [1, 1], "indel": [1]}

assert aggr5.dump() == {"snp": 2, "indel": 1}

aggr5.cache.clear()
aggr5.setxgroup(Term("FILTER", None))
aggr5.run(variants[0], passed=False)
aggr5.run(variants[0], None, passed=False)
assert aggr5.cache == {"MinMQ": {"snp": [1]}}

aggr5.cache.clear()
aggr5.setxgroup(Term("FILTER2", None))
aggr5.run(variants[0], passed=False)
aggr5.run(variants[0], None, passed=False)
assert aggr5.cache == {"MinMQ": {"snp": [1]}}
aggr5.run(variants[5], passed=False)
aggr5.run(variants[5], None, passed=False)
assert aggr5.cache == {"MinMQ": {"snp": [1]}}
aggr5.run(variants[1], passed=False)
aggr5.run(variants[1], None, passed=False)
assert aggr5.cache == {"MinMQ": {"snp": [1, 1]}}
assert aggr5.dump() == {"MinMQ": [(2, "snp")]}

aggr5.setxgroup(Term("GTTYPEs", ["HOM_REF", "HET"]))
with pytest.raises(ValueError):
aggr5.run(variants[0], passed=False)
aggr5.run(variants[0], None, passed=False)

aggr6 = Aggr("MEAN", Term("AAF", [".2", None]), Term("CHROM"))
aggr6.run(variants[0], passed=False) # .125
aggr6.run(variants[0], None, passed=False) # .125
assert len(aggr6.cache) == 0


Expand Down Expand Up @@ -272,14 +272,14 @@ def test_formula_run(variants):
data = []
fmula = Formula("AFs{0,1} ~ GTTYPEs{0-2}", variants[-1], False, "title")
with pytest.raises(RuntimeError):
fmula.run(variants[0], data.append, data.extend)
fmula.run(variants[0], None, data.append, data.extend)

fmula = Formula("FILTER2 ~ CHROM", variants[-1], False, "title")
fmula.run(variants[5], data.append, data.extend)
fmula.run(variants[5], None, data.append, data.extend)
assert data == []

fmula = Formula("GTTYPEs ~ CHROM", variants[-1], False, "title")
fmula.run(variants[0], data.append, data.extend)
fmula.run(variants[0], None, data.append, data.extend)
assert data == [
("HOM_REF", "1"),
("HOM_REF", "1"),
Expand All @@ -289,7 +289,7 @@ def test_formula_run(variants):

data = []
fmula = Formula("CHROM ~ GTTYPEs", variants[-1], False, "title")
fmula.run(variants[0], data.append, data.extend)
fmula.run(variants[0], None, data.append, data.extend)
assert data == [
("1", "HOM_REF"),
("1", "HOM_REF"),
Expand All @@ -304,8 +304,8 @@ def test_formula_run(variants):
False,
"title",
)
fmula.run(variants[0], data.append, data.extend)
fmula.run(variants[1], data.append, data.extend)
fmula.run(variants[0], None, data.append, data.extend)
fmula.run(variants[1], None, data.append, data.extend)
assert data == []
fmula.done(data.append, data.extend)
assert data == [(2, 2, "1")]
Expand All @@ -317,7 +317,7 @@ def test_formula_run(variants):
False,
"title",
)
fmula.run(variants[0], data.append, data.extend)
fmula.run(variants[0], None, data.append, data.extend)
assert data == []

with pytest.raises(ValueError):
Expand All @@ -329,24 +329,24 @@ def test_formula_run(variants):
)

fmula = Formula("COUNT(1) ~ CHROM", variants[-1], False, "title")
fmula.run(variants[0], data.append, data.extend)
fmula.run(variants[1], data.append, data.extend)
fmula.run(variants[0], None, data.append, data.extend)
fmula.run(variants[1], None, data.append, data.extend)
assert data == []
assert fmula.Y.cache == {"1": [1, 1]}
fmula.done(data.append, data.extend)
assert data == [(2, "1")]

fmula = Formula("CHROM ~ COUNT(1)", variants[-1], False, "title")
with pytest.raises(TypeError):
fmula.run(variants[0], data.append, data.extend)
fmula.run(variants[0], None, data.append, data.extend)

data = []
fmula = Formula(
"COUNT(1, group = VARTYPE) ~ CHROM", variants[-1], False, "title"
)
fmula.run(variants[0], data.append, data.extend)
fmula.run(variants[1], data.append, data.extend)
fmula.run(variants[2], data.append, data.extend)
fmula.run(variants[3], data.append, data.extend)
fmula.run(variants[0], None, data.append, data.extend)
fmula.run(variants[1], None, data.append, data.extend)
fmula.run(variants[2], None, data.append, data.extend)
fmula.run(variants[3], None, data.append, data.extend)
fmula.done(data.append, data.extend)
assert data == [(3, "1", "snp"), (1, "1", "indel")]
2 changes: 1 addition & 1 deletion tests/test_instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def test_one_iterate(tmp_path):
False,
)
with pytest.raises(AttributeError):
instance.iterate(None)
instance.iterate(None, None)


# def test_summarize(instance):
Expand Down
20 changes: 20 additions & 0 deletions tests/test_macros.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from cyvcf2 import VCF

from vcfstats.macros import (
cat,
_ONE,
TITV,
VARTYPE,
Expand All @@ -26,6 +27,12 @@
HERE = Path(__file__).parent.resolve()


@cat
def mixedinfo(variant, vcf):
"""Join the first 6 characters of the VCF raw header with the variant's chromosome"""
# Test-only macro declared with two parameters, so it reads from both `variant` and `vcf`.
return vcf.raw_header[:6] + variant.CHROM


@pytest.fixture(scope="module")
def variants():
vcf = VCF(
Expand All @@ -35,6 +42,19 @@ def variants():
return list(vcf)


@pytest.fixture(scope="module")
def variants_vcf():
"""Provide the variants of examples/sample.vcf together with the VCF object they came from"""
vcf = VCF(
str(HERE.parent.joinpath("examples", "sample.vcf")),
gts012=True,
)
# Return a (variants, vcf) pair: list(vcf) reads every variant up front, and the
# vcf object itself is handed back so tests can pass it as a macro's second argument.
return list(vcf), vcf


def test_variant_vcf(variants_vcf):
"""A macro declared as (variant, vcf) can read from both arguments"""
# variants_vcf is (list of variants, vcf); mixedinfo returns vcf.raw_header[:6] + variant.CHROM
assert mixedinfo(variants_vcf[0][0], variants_vcf[1]) == "##file1"


def test_vartype(variants):
assert VARTYPE(variants[0]) == "snp"
assert VARTYPE(variants[1]) == "snp"
Expand Down
2 changes: 1 addition & 1 deletion vcfstats/__main__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Main entrance for `python -m vcfstats`"""

# pragma: no cover
from .cli import main

if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion vcfstats/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def main():
for i, variant in enumerate(vcf):
for instance in ones:
# save entries, cache aggr
instance.iterate(variant)
instance.iterate(variant, vcf)
if i % 10000 == 0: # pragma: no cover
logger.debug("- %s variants read.", i)
logger.info(
Expand Down
29 changes: 16 additions & 13 deletions vcfstats/formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,11 +124,14 @@ def __eq__(self, other):
def __ne__(self, other):
return not self.__eq__(other)

def run(self, variant, passed):
def run(self, variant, vcf, passed):
"""Run the variant"""
if passed and variant.FILTER:
return False
value = self.term["func"](variant)
if self.term["nargs"] == 2:
value = self.term["func"](variant, vcf)
else:
value = self.term["func"](variant)

if value is False or value is None:
return False
Expand Down Expand Up @@ -206,21 +209,21 @@ def setxgroup(self, xvar):
else:
self.xgroup = xvar

def run(self, variant, passed):
def run(self, variant, vcf, passed):
"""Run each variant"""
if self.filter and self.filter.run(variant, passed) is False:
if self.filter and self.filter.run(variant, vcf, passed) is False:
return

if not self.group:
raise RuntimeError(
"No group specified, don't know how to aggregate."
)

group = self.group.run(variant, passed)
group = self.group.run(variant, vcf, passed)
if group is False:
return

value = self.term.run(variant, passed)
value = self.term.run(variant, vcf, passed)
if value is False:
return

Expand All @@ -233,7 +236,7 @@ def run(self, variant, passed):

xgroup = None
if self.xgroup:
xgroup = self.xgroup.run(variant, passed)
xgroup = self.xgroup.run(variant, vcf, passed)
if xgroup is False:
return
if len(xgroup) > 1 and len(value) != len(xgroup):
Expand Down Expand Up @@ -312,12 +315,12 @@ def __init__(self, formula, samples, passed, title):
):
self.passed = False

def run(self, variant, data_append, data_extend):
def run(self, variant, vcf, data_append, data_extend):
"""Run each variant"""
if isinstance(self.Y, Term) and isinstance(self.X, Term):
yvar, xvar = (
self.Y.run(variant, self.passed),
self.X.run(variant, self.passed),
self.Y.run(variant, vcf, self.passed),
self.X.run(variant, vcf, self.passed),
)
if yvar is False or xvar is False:
return
Expand All @@ -336,10 +339,10 @@ def run(self, variant, data_append, data_extend):

data_extend(((yvar[i], rvar) for i, rvar in enumerate(xvar)))
elif isinstance(self.Y, Aggr) and isinstance(self.X, Aggr):
self.Y.run(variant, self.passed)
self.X.run(variant, self.passed)
self.Y.run(variant, vcf, self.passed)
self.X.run(variant, vcf, self.passed)
elif isinstance(self.Y, Aggr) and isinstance(self.X, Term):
self.Y.run(variant, self.passed)
self.Y.run(variant, vcf, self.passed)
else:
raise TypeError(
"Cannot do 'TERM ~ AGGREGATION'. "
Expand Down
4 changes: 2 additions & 2 deletions vcfstats/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,10 +175,10 @@ def __init__(
def __del__(self):
del self.data

def iterate(self, variant):
def iterate(self, variant, vcf):
"""Iterate over each variant"""
# Y
self.formula.run(variant, self.data.append, self.data.extend)
self.formula.run(variant, vcf, self.data.append, self.data.extend)

def summarize(self):
"""Calculate the aggregations"""
Expand Down
8 changes: 8 additions & 0 deletions vcfstats/macros.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
"""Builtin marcros for vcfstats"""
import warnings
from inspect import signature
from functools import partial

from .utils import MACROS


def _nargs(func):
"""Get the number of arguments of a function"""
return len(signature(func).parameters)


def categorical(func=None, alias=None, _name=None):
"""Categorical decorator"""
if alias:
Expand All @@ -14,6 +20,7 @@ def categorical(func=None, alias=None, _name=None):
MACROS[funcname] = {}
MACROS[funcname]["func"] = MACROS[funcname].get("func", func)
MACROS[funcname]["type"] = "categorical"
MACROS[funcname]["nargs"] = _nargs(func)
if _name:
MACROS[_name] = MACROS[funcname]
return MACROS[funcname]["func"]
Expand All @@ -28,6 +35,7 @@ def continuous(func=None, alias=None, _name=None):
MACROS[funcname] = {}
MACROS[funcname]["func"] = MACROS[funcname].get("func", func)
MACROS[funcname]["type"] = "continuous"
MACROS[funcname]["nargs"] = _nargs(func)
if _name:
MACROS[_name] = MACROS[funcname]
return MACROS[funcname]["func"]
Expand Down

0 comments on commit 30ddbca

Please sign in to comment.