Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Qemistree - Sirius 5 #155

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 2 additions & 8 deletions q2_qemistree/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,14 @@
# ----------------------------------------------------------------------------
from ._version import get_versions

from ._fingerprint import (compute_fragmentation_trees,
rerank_molecular_formulas,
predict_fingerprints)
from ._classyfire import get_classyfire_taxonomy
from ._fingerprint import (compute_fingerprint_classes)
from ._hierarchy import make_hierarchy
from ._prune_hierarchy import prune_hierarchy
from ._semantics import (MassSpectrometryFeatures, MGFDirFmt,
CSIFolder, CSIDirFmt, ZodiacFolder, ZodiacDirFmt,
SiriusFolder, SiriusDirFmt, OutputDirs)

__all__ = ['compute_fragmentation_trees', 'rerank_molecular_formulas',
'predict_fingerprints', 'make_hierarchy', 'get_classyfire_taxonomy',
__all__ = ['compute_fingerprint_classes', 'make_hierarchy',
'prune_hierarchy', 'plot', 'MassSpectrometryFeatures', 'MGFDirFmt',
'CSIFolder', 'CSIDirFmt', 'ZodiacFolder', 'ZodiacDirFmt',
'SiriusFolder', 'SiriusDirFmt', 'OutputDirs']

__version__ = get_versions()['version']
104 changes: 23 additions & 81 deletions q2_qemistree/_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@
import subprocess
import os

from ._semantics import MGFDirFmt, SiriusDirFmt, ZodiacDirFmt, CSIDirFmt
from ._semantics import MGFDirFmt, SiriusDirFmt
from qiime2.plugin import Str, List


def run_command(cmd, output_fp, error_fp, verbose=True):
if verbose:
print("Running external command line application. This may print "
Expand Down Expand Up @@ -50,15 +51,19 @@ def artifactory(sirius_path: str, parameters: list, java_flags: str = None,
return artifact


def compute_fragmentation_trees(sirius_path: str, features: MGFDirFmt,
def compute_fingerprint_classes(sirius_path: str, features: MGFDirFmt,
ppm_max: int, profile: str,
tree_timeout: int = 1600,
maxmz: int = 600, n_jobs: int = 1,
num_candidates: int = 50,
database: List[Str] = ['all'],
ions_considered: List[Str] = ['[M+H]+'],
java_flags: str = None) -> SiriusDirFmt:
'''Compute fragmentation trees for candidate molecular formulas.
java_flags: str = None,
zodiac_threshold: float = 0.98,
fingerid_db: str = 'bio') -> SiriusDirFmt:
'''Compute fragmentation trees for candidate molecular formulas using SIRIUS; Reranks molecular formula
candidates generated using ZODIAC; Predict molecular fingerprints using CSI:FingerID; Predict compound
classes using CANOPUS (ClassyFire and NPClassifier).

Parameters
----------
Expand Down Expand Up @@ -88,6 +93,12 @@ def compute_fragmentation_trees(sirius_path: str, features: MGFDirFmt,
comma separated list of adducts (default: '[M+H]+')
java_flags : str, optional
Setup additional flags for the Java virtual machine.
zodiac_threshold: float, optional
threshold filter for molecular formula re-ranking. Higher value
recommended for less false positives (default: 0.98)
fingerid_db: str, optional
Search structure in given database. You can also provide a
comma-separated list of databases (default: 'bio')

Returns
-------
Expand All @@ -97,7 +108,7 @@ def compute_fragmentation_trees(sirius_path: str, features: MGFDirFmt,

# qiime2 will check that the only possible modes are positive, negative or
# auto

params = ['-i', os.path.join(str(features.path), 'features.mgf'),
'--maxmz', str(maxmz),
'--processors', str(n_jobs),
Expand All @@ -109,81 +120,12 @@ def compute_fragmentation_trees(sirius_path: str, features: MGFDirFmt,
'--ions-considered', str(', '.join(ions_considered)),
'--tree-timeout', str(tree_timeout),
'--ppm-max', str(ppm_max),
]

return artifactory(sirius_path, params, java_flags, SiriusDirFmt)


def rerank_molecular_formulas(sirius_path: str,
fragmentation_trees: SiriusDirFmt,
features: MGFDirFmt,
zodiac_threshold: float = 0.98, n_jobs: int = 1,
java_flags: str = None) -> ZodiacDirFmt:
"""Reranks molecular formula candidates generated by computing
fragmentation trees

Parameters
----------
sirius_path : str
Path to Sirius executable (without including the word sirius).
fragmentation_trees : SiriusDirFmt
Directory with computed fragmentation trees
features : MGFDirFmt
MGF file for Sirius
zodiac_threshold : float
threshold filter for molecular formula re-ranking. Higher value
recommended for less false positives (float)
n_jobs : int, optional
Number of cpu cores to use. If not specified Sirius uses all available
cores
java_flags : str, optional
Setup additional flags for the Java virtual machine.

Returns
-------
ZodiacDirFmt
Directory with reranked molecular formulas
"""

params = ['-i', str(fragmentation_trees.get_path()),
'--processors', str(n_jobs),
'zodiac',
'--thresholdFilter', str(zodiac_threshold)]

return artifactory(sirius_path, params, java_flags, ZodiacDirFmt)

'--thresholdFilter', str(zodiac_threshold),
'fingerprint',
'structure',
'--database', str(fingerid_db),
'canopus',
'write-summaries']

def predict_fingerprints(sirius_path: str, molecular_formulas: ZodiacDirFmt,
ppm_max: int, n_jobs: int = 1,
fingerid_db: str = 'bio',
java_flags: str = None) -> CSIDirFmt:
"""Predict molecular fingerprints

Parameters
----------
sirius_path : str
Path to Sirius executable (without including the word sirius).
molecular_formulas : ZodiacDirFmt
Directory with the re-ranked formulae.
ppm_max : int
Allowed parts per million tolerance for decomposing masses.
n_jobs : int, optional
Number of cpu cores to use. If not specified Sirius uses all available
cores.
fingerid_db : str, optional
Search structure in given database. You can also provide a
comma-separated list of databases (default: 'pubchem' )
java_flags : str, optional
Setup additional flags for the Java virtual machine.

Returns
-------
CSIDirFmt
Directory with predicted fingerprints.
"""

params = ['-i', molecular_formulas.get_path(),
'--processors', str(n_jobs),
'fingerid',
'--db', str(fingerid_db)]
return artifactory(sirius_path, params, java_flags, CSIDirFmt)
return artifactory(sirius_path, params, java_flags, SiriusDirFmt)
12 changes: 6 additions & 6 deletions q2_qemistree/_hierarchy.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from ._process_fingerprint import process_csi_results
from ._match import get_matched_tables
from ._semantics import CSIDirFmt
from ._semantics import SiriusDirFmt


def build_tree(relabeled_fingerprints: pd.DataFrame,
Expand Down Expand Up @@ -57,7 +57,7 @@ def merge_feature_data(fdata: pd.DataFrame) -> pd.DataFrame:
return merged_fdata


def make_hierarchy(csi_results: CSIDirFmt,
def make_hierarchy(csi_results: SiriusDirFmt,
feature_tables: biom.Table,
library_matches: pd.DataFrame = None,
metric: str = 'euclidean') -> (TreeNode, biom.Table,
Expand All @@ -70,9 +70,9 @@ def make_hierarchy(csi_results: CSIDirFmt,

Parameters
----------
csi_results : CSIDirFmt
one or more CSI:FingerID output folder
feature_table : biom.Table
csi_results : SiriusDirFmt
one or more CSI:FingerID results from the sirius-output folder
feature_tables : biom.Table
one or more feature tables with mass-spec feature intensity per sample
library_matches: pd.DataFrame
one or more tables with MS/MS library match for mass-spec features
Expand Down Expand Up @@ -120,7 +120,7 @@ def make_hierarchy(csi_results: CSIDirFmt,
else:
collated_fps, smiles = process_csi_results(csi_result, None, metric)
relabeled_fp, matched_ft, feature_data = get_matched_tables(
collated_fps, smiles, feature_table)
collated_fps, smiles, feature_table, csi_result)
fps.append(relabeled_fp)
fts.append(matched_ft)
fdata.append(feature_data)
Expand Down
57 changes: 51 additions & 6 deletions q2_qemistree/_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,14 @@
import hashlib
import pandas as pd
import warnings
import os

from ._semantics import SiriusDirFmt

def get_matched_tables(collated_fingerprints: pd.DataFrame,
smiles: pd.DataFrame,
feature_table: biom.Table):
feature_table: biom.Table,
csi_result: SiriusDirFmt):
'''
This function filters the feature table to retain only features with
fingerprints. It also relabels features with MD5 hash of its
Expand Down Expand Up @@ -49,29 +52,59 @@ def get_matched_tables(collated_fingerprints: pd.DataFrame,
table that maps MD5 hash of a feature to the original feature ID in
the input feature table
'''
if isinstance(csi_result, SiriusDirFmt):
csi_result = str(csi_result.get_path())
fps = collated_fingerprints.copy()
allfps = list(fps.index)
canopus = pd.read_csv(os.path.join(csi_result, 'canopus_compound_summary.tsv'), dtype=str, sep='\t')

if fps.empty:
raise ValueError("Cannot have empty fingerprint table")
table = feature_table.to_dataframe(dense=True)
# table.index = table.index.map(str) # Caso seja necessário converter essa merda 💩💩
allfeatrs = set(table.index)
overlap = list(set(allfps).intersection(allfeatrs))
#overlap = [str(i) for i in overla] # Caso seja necessário converter essa merda 💩💩
if not set(allfps).issubset(allfeatrs):
extra_tips = set(allfps) - set(overlap)
warnings.warn('The following fingerprints were not '
'found in the feature table; removed from qemistree:\n' +
', '.join([str(i) for i in extra_tips]), UserWarning)
filtered_table = table.reindex(overlap)
filtered_fps = fps.reindex(overlap)
filtered_table = table[table.index.isin(overlap)]
filtered_fps = fps[fps.index.isin(overlap)]

canopus['feature_id'] = [id.split('_')[-1] for id in canopus['id']] # criar a coluna com o id apenas
canopus.set_index('feature_id', inplace=True) # transformar essa coluna em index do DF
filtered_canopus = canopus[canopus.index.isin(overlap)] # reindexar com base em overlap

list_md5 = []
list_fid = []
df_md5 = pd.DataFrame(index=overlap)
for fid in overlap:
md5 = str(hashlib.md5(fps.loc[fid].values.tobytes()).hexdigest())
list_fid.append(fid)
list_md5.append(md5)
filtered_fps['label'] = list_md5
filtered_table['label'] = list_md5
# df_md5['feature_id'] = list_fid
df_md5['label'] = list_md5

filtered_fps = pd.merge(filtered_fps, df_md5, left_index=True, right_index=True)
filtered_table = pd.merge(filtered_table, df_md5, left_index=True, right_index=True)
filtered_canopus = pd.merge(filtered_canopus, df_md5, left_index=True, right_index=True)

# list_md5 = []
# for fid in overlap:
# md5 = str(hashlib.md5(fps.loc[fid].values.tobytes()).hexdigest())
# list_md5.append(md5)
# filtered_fps['label'] = list_md5
# filtered_table['label'] = list_md5
# filtered_canopus['label'] = list_md5
feature_data = pd.DataFrame(columns=['label', '#featureID', 'csi_smiles',
'ms2_smiles', 'ms2_library_match',
'parent_mass', 'retention_time'])
'parent_mass', 'retention_time',
'NPC#pathway', 'NPC#superclass', 'NPC#class',
'ClassyFire#most specific class', 'ClassyFire#level 5',
'ClassyFire#subclass', 'ClassyFire#class', 'ClassyFire#superclass',
'ClassyFire#all classifications'])
feature_data['label'] = list_md5
feature_data['#featureID'] = overlap
feature_data['csi_smiles'] = list(smiles.loc[overlap, 'csi_smiles'])
Expand All @@ -81,6 +114,18 @@ def get_matched_tables(collated_fingerprints: pd.DataFrame,
feature_data['parent_mass'] = list(smiles.loc[overlap, 'parent_mass'])
feature_data['retention_time'] = list(smiles.loc[overlap,
'retention_time'])
feature_data['NPC#pathway'] = list(filtered_canopus.loc[overlap, 'NPC#pathway'])
feature_data['NPC#superclass'] = list(filtered_canopus.loc[overlap, 'NPC#superclass'])
feature_data['NPC#class'] = list(filtered_canopus.loc[overlap, 'NPC#class'])
feature_data['ClassyFire#most specific class'] = list(
filtered_canopus.loc[overlap, 'ClassyFire#most specific class'])
feature_data['ClassyFire#level 5'] = list(filtered_canopus.loc[overlap, 'ClassyFire#level 5'])
feature_data['ClassyFire#subclass'] = list(filtered_canopus.loc[overlap, 'ClassyFire#subclass'])
feature_data['ClassyFire#class'] = list(filtered_canopus.loc[overlap, 'ClassyFire#class'])
feature_data['ClassyFire#superclass'] = list(filtered_canopus.loc[overlap, 'ClassyFire#superclass'])
feature_data['ClassyFire#all classifications'] = list(
filtered_canopus.loc[overlap, 'ClassyFire#all classifications'])

feature_data.set_index('label', inplace=True)
relabel_fps = filtered_fps.groupby('label').first()
matched_table = filtered_table.groupby('label').sum()
Expand Down
Loading