-
Notifications
You must be signed in to change notification settings - Fork 45
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adds a method to use a high performance implementation of UniFrac #143
Changes from all commits
d1c6b75
ea52be1
adf0248
dfc78f9
fb0b28b
e4aa0ef
e246e02
5f06c76
8200a7d
d624c59
17758c1
be2a581
80091a3
79e0a29
bf7ffef
0eaba83
df4fbc1
ae91c7d
8b88643
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,13 @@ | |
import skbio.diversity | ||
import skbio.tree | ||
import sklearn.metrics | ||
import unifrac | ||
import psutil | ||
|
||
from q2_types.feature_table import BIOMV210Format | ||
from q2_types.tree import NewickFormat | ||
|
||
from functools import partial | ||
|
||
|
||
# We should consider moving these functions to scikit-bio. They're part of | ||
|
@@ -20,6 +27,13 @@ def phylogenetic_metrics(): | |
return {'unweighted_unifrac', 'weighted_unifrac'} | ||
|
||
|
||
def phylogenetic_metrics_alt_dict(): | ||
return {'unweighted_unifrac': unifrac.unweighted, | ||
'weighted_unifrac': unifrac.weighted_unnormalized, | ||
'weighted_normalized_unifrac': unifrac.weighted_normalized, | ||
'generalized_unifrac': unifrac.generalized} | ||
|
||
|
||
def non_phylogenetic_metrics(): | ||
return {'cityblock', 'euclidean', 'seuclidean', 'sqeuclidean', 'cosine', | ||
'correlation', 'hamming', 'jaccard', 'chebyshev', 'canberra', | ||
|
@@ -63,6 +77,39 @@ def beta_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode, | |
return results | ||
|
||
|
||
def beta_phylogenetic_alt(table: BIOMV210Format, phylogeny: NewickFormat, | ||
metric: str, n_jobs: int=1, | ||
variance_adjusted: bool=False, | ||
alpha=None, | ||
bypass_tips: bool=False) -> skbio.DistanceMatrix: | ||
|
||
metrics = phylogenetic_metrics_alt_dict() | ||
generalized_unifrac = 'generalized_unifrac' | ||
|
||
if metric not in metrics: | ||
raise ValueError("Unknown metric: %s" % metric) | ||
|
||
if alpha is not None and metric != generalized_unifrac: | ||
raise ValueError('The alpha parameter is only allowed when the choice' | ||
' of metric is generalized_unifrac') | ||
|
||
# this behaviour is undefined, so let's avoid a seg fault | ||
cpus = psutil.cpu_count(logical=False) | ||
if n_jobs > cpus: | ||
raise ValueError('The value of n_jobs cannot exceed the number of ' | ||
'processors (%d) available in this system.' % cpus) | ||
|
||
if metric == generalized_unifrac: | ||
alpha = 1.0 if alpha is None else alpha | ||
f = partial(metrics[metric], alpha=alpha) | ||
else: | ||
f = metrics[metric] | ||
|
||
# unifrac processes tables and trees should be filenames | ||
return f(str(table), str(phylogeny), threads=n_jobs, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. when There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
variance_adjusted=variance_adjusted, bypass_tips=bypass_tips) | ||
|
||
|
||
def beta(table: biom.Table, metric: str, n_jobs: int=1)-> skbio.DistanceMatrix: | ||
if metric not in non_phylogenetic_metrics(): | ||
raise ValueError("Unknown metric: %s" % metric) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,7 +7,7 @@ | |
# ---------------------------------------------------------------------------- | ||
|
||
from qiime2.plugin import (Plugin, Str, Properties, MetadataCategory, Choices, | ||
Metadata, Int, Bool, Range) | ||
Metadata, Int, Bool, Range, Float) | ||
|
||
import q2_diversity | ||
from q2_diversity import _alpha as alpha | ||
|
@@ -38,15 +38,27 @@ | |
'and exploring community alpha and beta diversity through ' | ||
'statistics and visualizations in the context of sample ' | ||
'metadata.'), | ||
short_description='Plugin for exploring community diversity.' | ||
short_description='Plugin for exploring community diversity.', | ||
citation_text=('Unweighted UniFrac: ' | ||
'Lozupone and Knight 2005 Appl Environ Microbiol; DOI: ' | ||
'10.1128/AEM.71.12.8228-8235.2005.\n' | ||
'Weighted UniFrac: ' | ||
'Lozupone et al. 2007 Appl Environ Microbiol; DOI: ' | ||
'10.1128/AEM.01996-06.\n' | ||
'Variance adjusted UniFrac: ' | ||
'Chang et al. BMC Bioinformatics 2011 ' | ||
'https://doi.org/10.1186/1471-2105-12-118.\n' | ||
'Generalized UniFrac: ' | ||
'Chen et al. 2012 Bioinformatics; DOI: ' | ||
'10.1093/bioinformatics/bts342') | ||
) | ||
|
||
plugin.methods.register_function( | ||
function=q2_diversity.beta_phylogenetic, | ||
inputs={'table': FeatureTable[Frequency], | ||
'phylogeny': Phylogeny[Rooted]}, | ||
parameters={'metric': Str % Choices(beta.phylogenetic_metrics()), | ||
'n_jobs': Int}, | ||
'n_jobs': Int % Range(1, None)}, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is minor, and ultimately whatever fits better into the surrounding terminology is preferred, but it is more correct to describe this as threads. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Jobs is probably better for now, since it sounds like this is actually CPU threads, and not a process thread. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's process threads (i.e., There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When I wrote this, I knew it was misleading, but carried on since it matches beta_phylogenetic. Would probably be nice to have a standard designation for "workers" all throughout QIIME. |
||
outputs=[('distance_matrix', DistanceMatrix % Properties('phylogenetic'))], | ||
input_descriptions={ | ||
'table': ('The feature table containing the samples over which beta ' | ||
|
@@ -68,6 +80,61 @@ | |
" for all pairs of samples in a feature table.") | ||
) | ||
|
||
|
||
plugin.methods.register_function( | ||
function=q2_diversity.beta_phylogenetic_alt, | ||
inputs={'table': FeatureTable[Frequency], | ||
'phylogeny': Phylogeny[Rooted]}, | ||
parameters={'metric': Str % Choices(beta.phylogenetic_metrics_alt_dict()), | ||
'n_jobs': Int, | ||
'variance_adjusted': Bool, | ||
'alpha': Float % Range(0, 1, inclusive_end=True), | ||
'bypass_tips': Bool}, | ||
outputs=[('distance_matrix', DistanceMatrix % Properties('phylogenetic'))], | ||
input_descriptions={ | ||
'table': ('The feature table containing the samples over which beta ' | ||
'diversity should be computed.'), | ||
'phylogeny': ('Phylogenetic tree containing tip identifiers that ' | ||
'correspond to the feature identifiers in the table. ' | ||
'This tree can contain tip ids that are not present in ' | ||
'the table, but all feature ids in the table must be ' | ||
'present in this tree.') | ||
}, | ||
parameter_descriptions={ | ||
'metric': 'The beta diversity metric to be computed.', | ||
'n_jobs': 'The number of workers to use.', | ||
'variance_adjusted': ('Perform variance adjustment based on Chang et ' | ||
'al. BMC Bioinformatics 2011. Weights distances ' | ||
'based on the proportion of the relative ' | ||
'abundance represented between the samples at a' | ||
' given node under evaluation.'), | ||
'alpha': ('This parameter is only used when the choice of metric is ' | ||
'generalized_unifrac. The value of alpha controls importance' | ||
' of sample proportions. 1.0 is weighted normalized UniFrac.' | ||
' 0.0 is close to unweighted UniFrac, but only if the sample' | ||
' proportions are dichotomized.'), | ||
'bypass_tips': ('In a bifurcating tree, the tips make up about 50% of ' | ||
'the nodes in a tree. By ignoring them, specificity ' | ||
'can be traded for reduced compute time. This has the' | ||
' effect of collapsing the phylogeny, and is analogous' | ||
' (in concept) to moving from 99% to 97% OTUs') | ||
}, | ||
output_descriptions={'distance_matrix': 'The resulting distance matrix.'}, | ||
name='Beta diversity (phylogenetic) - High Performance Computation', | ||
description=("Computes a user-specified phylogenetic beta diversity metric" | ||
" for all pairs of samples in a feature table. This " | ||
"implementation is recommended for large datasets, otherwise " | ||
"the results are identical to beta_phylogenetic.\n\n" | ||
"This method is an implementation of the Strided State " | ||
"UniFrac algorithm. Multiple variants of the UniFrac metric " | ||
"are available, including Generalized UniFrac (Chen et al. " | ||
"2012), Variance Adjusted UniFrac (Chang et al. 2011), " | ||
"as well as Weighted normalized and unnormalized UniFrac " | ||
"(Lozupone et al. 2007) and unweighted UniFrac " | ||
"(Lozupone et al. 2005)") | ||
) | ||
|
||
|
||
plugin.methods.register_function( | ||
function=q2_diversity.beta, | ||
inputs={'table': FeatureTable[Frequency]}, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
((169901:0.39755,(100344:0.37296,((1684221:0.33084,((1136443:0.00014,4374042:0.00204):0.2205,((((263705:0.02657,376397:0.05743):0.02493,263452:0.09328):0.09446,((((229459:0.04726,1571092:0.0371):0.03291,827195:0.03981):0.16617,(((162991:0.01263,318370:0.02928):0.00873,(267689:0.0087,335952:0.01692):0.00962):0.08285,276044:0.25511):0.02402)'c__Clostridia; o__Clostridiales':0.01046,(((186521:0.09255,(((195840:0.05097,181603:0.04737):0.0487,(269378:0.05555,(((215193:0.01971,199307:0.02787):0.03584,344527:0.06642):0.01912,187790:0.06549):0.05223):0.06458):0.01861,(267041:0.06668,441494:0.29606):0.00852):0.00686):0.01105,((274438:0.08266,((268121:0.04956,(194787:0.05527,(337331:0.0305,267411:0.12086):0.00656):0.01187):0.00293,181205:0.07086):0.00228):0.00953,(4418586:0.09245,(((((((4471135:0.04783,195445:0.06353):0.00973,(172705:0.09236,2575787:0.21784):0.01157):0.00173,(269576:0.04917,(((259593:0.02332,1105328:0.012):0.00102,208571:0.01117):0.0008,(182621:0.08672,271449:0.02772):0.00713):0.01471):0.01507):0.00147,181419:0.03032)'c__Clostridia; o__Clostridiales':0.00169,(((((((191958:0.01976,((316842:0.02735,194822:0.01128):0.00721,175432:0.01301):0.01834):0.00247,343906:0.02151):0.00921,3392842:0.00869):0.00315,(351881:0.04144,346098:0.02522):0.00695):0.048,(((((191077:0.05103,351794:0.07438):0.00467,(((4364243:0.04249,261419:0.04021):0.01097,231169:0.01264):0.0132,(403497:0.04479,(261334:0.01174,340189:0.05963):0.01024):0.00256):0.00753):0.00429,(293580:0.02795,3265889:0.05414):0.00656):0.00543,292745:0.04202):0.00014,(322062:0.01797,190273:0.01058):0.04098):0.00564):0.00451,(288931:0.03379,((553395:0.05483,302407:0.0199):0.01312,(833390:0.00627,(275707:0.02319,450047:0.00348):0.01342):0.0492):0.0115):0.01294)'c__Clostridia; o__Clostridiales; f__Lachnospiraceae':0.00448,((132114:0.18434,((274422:0.21213,(314810:0.04013,422727:0.06862):0.08472)'c__Clostridia; o__Clostridiales':0.00016,(((((((((190242:0.01884,(265940:0.00636,170950:0.02803):0.03275):0.05634,187133:0.06732):0.03409,4417539:0.16601):0.01312,(267452:0.01884,231021:0.02579):0.22655)'f__Ruminococcaceae; g__Ruminococcus':0.01162,((550807:0.14219,((((((((4484382:0.03105,(((276404:0.01013,(258725:0.01776,4403349:0.01479):0.01233):0.02161,321484:0.05463):0.00374,263546:0.05758):0.04859):0.00344,196194:0.03084):0.01292,269019:0.02212):0.00207,270385:0.07485):0.02724,((182033:0.00895,263165:0.00701):0.0503,(260753:0.01819,191816:0.05696):0.0122):0.05687):0.00674,270519:0.05105):0.022,((((262677:0.00993,272953:0.00844):0.01976,(187807:0.0179,276985:0.03212):0.00809):0.0029,263106:0.01522):0.06462,176118:0.04609):0.04356)'g__Oscillospira':0.02244,(170555:0.01542,263044:0.02129):0.19443):0.02295):0.02515,(301870:0.15284,(259212:0.12329,185222:0.23345):0.03418)'f__Clostridiaceae':0.02365):0.01303):0.00843,((732128:0.0582,166099:0.08664):0.00226,4480176:0.02225):0.21389):0.0112,((179188:0.04915,194662:0.09909):0.00318,261511:0.06272):0.01032)'c__Clostridia; o__Clostridiales':0.00284,(273084:0.05892,(((((260663:0.02738,(265786:0.02702,260397:0.00524):0.00102):0.00014,275627:0.06381):0.14991,4396877:0.05675):0.01023,828435:0.05156):0.08237,((45363:0.01629,259266:0.05658):0.01623,4407703:0.06932):0.13663)'c__Erysipelotrichi':0.11955):0.00127):0.01284,262409:0.18866):0.00839):0.01056):0.00805,((262766:0.04848,(180206:0.05998,(((262869:0.07152,260655:0.03801):0.00395,(((349142:0.02154,195385:0.00634):0.00159,((176858:0.01629,263362:0.01476):0.0004,233313:0.00963):0.00408):0.00147,(((178031:0.01612,(259263:0.0181,263946:0.02447):0.07026):0.00147,268416:0.00842):0.00532,(97294:0.02181,258969:0.03813):0.00502):0.0039):0.00851):0.01331,(((187703:0.03164,259056:0.01433):0.01444,191772:0.02529):0.01041,276260:0.03673):0.0168):0.02587):0.00758):0.00603,(((184151:0.03171,176039:0.02716):0.05897,310490:0.04338):0.00551,((333053:0.04164,((((((((331965:0.08704,261409:0.01737):0.01822,185777:0.01487):0.04786,(259434:0.02291,276531:0.02123):0.01824):0.00853,303479:0.02542):0.00508,263908:0.04835):0.01457,((258522:0.02949,274021:0.05646):0.00554,181344:0.04727):0.00566):0.00176,(272454:0.03068,176850:0.06171):0.01689):0.01607,275563:0.01872):0.00741):0.00923,318563:0.06893):0.01131):0.00257):0.0081):0.0007):0.0016):0.00336,((((((312476:0.02917,174272:0.04651):0.00785,(((268755:0.01177,268581:0.02385):0.00878,(179719:0.04377,264021:0.05912):0.00417):0.00394,(260058:0.03496,(167204:0.01011,829401:0.01565):0.03983):0.00244):0.00992):0.01245,((((((((178926:0.02199,196825:0.01879):0.00883,(443945:0.02387,310748:0.00905):0.03013):0.01777,260756:0.01673):0.00669,((((233817:0.00475,259910:0.01695):0.0001,197775:0.02372):0.00786,130335:0.03254):0.00866,4365109:0.0344):0.00354):0.00157,((334365:0.03359,271766:0.03304):0.00976,(206494:0.02039,181249:0.04899):0.01657):0.00257):0.00474,(((((265828:0.01611,216403:0.03131):0.00681,180919:0.07082):0.00659,((270662:0.04106,229386:0.01722):0.04132,(187989:0.01675,164308:0.02502):0.01745):0.01998):0.00272,((307595:0.04286,178779:0.01324):0.01155,259888:0.06879):0.0098):0.01198,173417:0.03133):0.0056):0.00441,(186526:0.01221,233411:0.0164):0.06098):0.00533,(301012:0.03892,(266483:0.02079,839215:0.0076):0.02063):0.00659):0.01471):0.00433,(((((((175416:0.06589,(((181141:0.02134,197216:0.04356):0.00062,(273515:0.06915,188536:0.03918):0.0068):0.009,697874:0.0303):0.0203):0.00368,(179181:0.08572,(268923:0.11712,351859:0.05093):0.03893):0.00565):0.00429,((((185743:0.07302,274844:0.06304):0.02997,(190460:0.01175,192971:0.02492):0.0557):0.00889,270396:0.13586):0.00157,336145:0.06697):0.00809):0.00759,(((274597:0.05276,((259228:0.09494,264373:0.04067):0.00411,(((343581:0.00883,266595:0.03526):0.00128,269359:0.02279):0.00928,275150:0.04595):0.01442):0.01243):0.00596,(197790:0.07695,((274106:0.03127,260828:0.02915):0.00728,((180105:0.00572,(196777:0.02608,179063:0.02119):0.02629):0.00667,177802:0.01713):0.01597):0.00679):0.0135):0.01445,((((((275078:0.05198,271378:0.04527):0.00987,276580:0.06941):0.00745,260653:0.01951):0.00577,(259335:0.00857,269992:0.01649):0.02089):0.00607,(261606:0.01776,((260205:0.01675,837473:0.0327):0.01931,259175:0.0661):0.01057):0.03771):0.01365,(272812:0.01332,(275136:0.0504,265641:0.04142):0.01641):0.05327):0.01447):0.01207):0.01462,(180362:0.0488,214471:0.03306):0.02282):0.00489,((276663:0.07249,(1108453:0.12347,267457:0.04345):0.01011):0.00558,(174959:0.05484,185754:0.01372):0.08833):0.00679):0.00361,(181091:0.05605,(319909:0.01099,4462541:0.01663):0.05346):0.0137):0.0035):0.00421,(((((291750:0.01414,178659:0.02705):0.00913,350381:0.03212):0.01138,(((((314963:0.02373,(234121:0.00158,274018:0.01522):0.01383):0.00336,((330296:0.00877,343420:0.00828):0.00199,267388:0.05277):0.00813):0.00348,((175573:0.00729,(((274521:0.02619,191887:0.02143):0.00412,2120775:0.03204):0.0148,436341:0.01208):0.02026):0.04471,(199181:0.05741,266816:0.03119):0.01288):0.0044):0.00294,((180972:0.00911,183211:0.01607):0.01057,227886:0.01216):0.00806):0.04236,(348398:0.00318,270491:0.02772):0.02149):0.016):0.00697,(327236:0.072,182016:0.06278):0.02379):0.00015,(258250:0.03667,(275819:0.02306,275470:0.00947):0.0143):0.01639):0.00479):0.00187,((320490:0.04718,311174:0.01972):0.00144,(193463:0.0092,191398:0.04605):0.03482):0.05332):0.00971):0.00744,((354957:0.05068,183390:0.04344):0.00734,275869:0.07717):0.0236):0.03292):0.00127):0.01695):0.01217,(461524:0.11627,((265106:0.03592,(178735:0.03407,267123:0.02167):0.02808):0.01912,199403:0.02374):0.05723):0.23533):0.03382):0.06256)'p__Firmicutes':0.00964,((115186:0.01033,687185:0.04697):0.18622,(214919:0.07235,(((((4397402:0.02449,187233:0.04148):0.00016,166911:0.01405):0.01203,4338733:0.18171):0.05395,(((452823:0.00238,259372:0.01508):0.00255,239571:0.02438):0.00333,135956:0.03044):0.04831):0.05536,303652:0.15985):0.07926):0.0439):0.05025):0.04842):0.02284):0.01143,(((((((1107945:0.00015,((2137001:0.03611,4468234:0.02727):0.00026,191483:0.02217):0.03914):0.08626,353782:0.02512):0.08438,(((4346374:0.02476,184567:0.04246):0.01612,4414420:0.02265):0.01346,(4372578:0.00847,3621189:0.03839):0.04814):0.0072):0.11232,(4449524:0.14634,173807:0.22514):0.01949):0.08539,(((((((3206355:0.05575,(260387:0.01306,269902:0.00981):0.02831):0.01394,179069:0.04071):0.01129,182995:0.11183):0.00805,174663:0.06637):0.00966,196138:0.06604):0.02537,(262399:0.01744,262166:0.00828):0.09631):0.0058,(((((194978:0.0156,187078:0.01205):0.00763,177205:0.03615):0.01066,259249:0.01811):0.01125,(187644:0.03502,163862:0.01646):0.02417):0.01184,(((((((197318:0.0139,186497:0.02106):0.03022,167078:0.05416):0.00273,(176886:0.02207,(174754:0.014,174791:0.0416):0.0392):0.01009):0.00569,264496:0.0376):0.00496,(259609:0.02492,380534:0.02199):0.01009):0.0004,(((275339:0.01699,264787:0.00088):0.01257,(210950:0.01363,(204144:0.006,195005:0.0172):0.02314):0.01172):0.02039,((261177:0.03474,(174056:0.03568,199698:0.08361):0.01406):0.00627,259012:0.05891):0.02085):0.00235):0.00568,((183106:0.04258,177427:0.02913):0.01459,(270984:0.02958,259859:0.03521):0.02522):0.00961):0.00583):0.00502):0.0886):0.05356,(((((199534:0.02102,4127460:0.03563):0.0077,(169398:0.01634,336214:0.01107):0.02565):0.00367,276172:0.01705):0.00497,270391:0.0711):0.02348,4331760:0.04459):0.11283):0.02804,(((2112006:0.09488,170335:0.05985):0.03176,3117556:0.13918)'f__[Odoribacteraceae]':0.0905,(847228:0.03654,(4329571:0.02548,4442459:0.01181):0.03366):0.14176):0.02217):0.39352):0.02085):0.08322):0.19527); |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
(GG_OTU_1:1,(GG_OTU_2:1,GG_OTU_3:1):1,(GG_OTU_5:1,GG_OTU_4:1):1); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it be possible to politely catch and err if the
table
has < 4 samples? That's an edge case we haven't handled yet behind the scenesThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not sure that this is the reason why we've observed failures, for all the unit tests we compute distances on a 2x2 sample matrix, and those tests pass.