Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

First pass at modular DataPipeline #1

Draft
wants to merge 26 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 129 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ $DOWNLOAD_DIR/ # Total: ~ 2.2 TB (download: 438 GB)
mmcif_files/
# About 180,000 .cif files.
obsolete.dat
small_fbd/ # ~ 17 GB (download: 9.6 GB)
small_bfd/ # ~ 17 GB (download: 9.6 GB)
bfd-first_non_consensus_sequences.fasta
uniclust30/ # ~ 86 GB (download: 24.9 GB)
uniclust30_2018_08/
Expand Down Expand Up @@ -273,6 +273,10 @@ The contents of each output file are as follows:
serve for a visualisation of domain packing confidence within the
structure.

The pLDDT confidence measure is stored in the B-factor field of the output PDB
files (although, unlike a B-factor, a higher pLDDT is better, so care must be
taken when using it for tasks such as molecular replacement).

This code has been tested to match mean top-1 accuracy on a CASP14 test set with
pLDDT ranking over 5 model predictions (some CASP targets were run with earlier
versions of AlphaFold and some had manual interventions; see our forthcoming
Expand Down Expand Up @@ -319,7 +323,7 @@ For genetics:
For templates:

* PDB: (downloaded 2020-05-14)
* PDB70: (downloaded 2020-05-13)
* PDB70: [2020-05-13](http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/old-releases/pdb70_from_mmcif_200513.tar.gz)

An alternative for templates is to use the latest PDB and PDB70, but pass the
flag `--max_template_date=2020-05-14`, which restricts templates only to
Expand Down
2 changes: 1 addition & 1 deletion alphafold/common/protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def ideal_atom_mask(prot: Protein) -> np.ndarray:

`Protein.atom_mask` typically is defined according to the atoms that are
reported in the PDB. This function computes a mask according to heavy atoms
that should be present in the given seqence of amino acids.
that should be present in the given sequence of amino acids.

Args:
prot: `Protein` whose fields are `numpy.ndarray` objects.
Expand Down
134 changes: 134 additions & 0 deletions alphafold/data/pipeline_mod.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
"""Modular version of alphafold.data.pipeline"""

import os
from dataclasses import dataclass
from typing import Mapping, Optional, Sequence
from absl import logging
from alphafold.data import parsers
from alphafold.data import templates
from alphafold.data.tools.cli import *
from alphafold.data.pipeline import make_sequence_features
import numpy as np

# Internal import (7716).

FeatureDict = Mapping[str, np.ndarray]


@dataclass
class ModularDataPipeline:
  """Modular version of alphafold.data.pipeline.DataPipeline.

  Every stage of the pipeline is exposed as its own method so that it can be
  invoked separately; `process` chains all of the stages together.

  The stage methods deliberately share their names with the module-level
  helper functions they wrap. Inside a method body the bare name resolves to
  the module-level function (method names are not in scope there), so the
  calls below are not recursive. Apart from `make_sequence_features`, which
  is imported explicitly from `alphafold.data.pipeline`, those helpers are
  presumably provided by the star import of `alphafold.data.tools.cli` --
  TODO confirm.
  """
  jackhmmer_binary_path: str
  hhblits_binary_path: str
  hhsearch_binary_path: str
  uniref90_database_path: str
  mgnify_database_path: str
  pdb70_database_path: str
  # Selects the BFD search in `process`: True -> `jackhmmer_small_bfd`,
  # False -> `hhblits`.
  use_small_bfd: bool

  # for construction of TemplateHitFeaturizer, replacing
  # template_featurizer: templates.TemplateHitFeaturizer
  mmcif_dir: str
  max_template_date: str
  max_hits: int
  kalign_binary_path: str
  release_dates_path: Optional[str] = None
  obsolete_pdbs_path: Optional[str] = None
  strict_error_check: bool = False

  mgnify_max_hits: int = 501
  uniref_max_hits: int = 10000
  # Read only by `hhblits` (the `use_small_bfd=False` branch of `process`).
  bfd_database_path: Optional[str] = None
  uniclust30_database_path: Optional[str] = None
  # Read only by `jackhmmer_small_bfd` (the `use_small_bfd=True` branch).
  small_bfd_database_path: Optional[str] = None

  # Output directory handed to every search stage. `process` overwrites it
  # with its `msa_output_dir` argument. It is declared as a field (rather
  # than created on the fly inside `process`) so that a stage method can be
  # called on its own without raising AttributeError; set it via the
  # constructor or by assignment before running stages individually.
  # NOTE(review): how the wrapped helpers treat `output_dir=None` is not
  # visible from this file -- confirm before relying on the default.
  msa_output_dir: Optional[str] = None

  def jackhmmer_uniref90(self, input_fasta_path: str):
    """Runs the `jackhmmer` helper against the UniRef90 database.

    Args:
      input_fasta_path: Path to the input FASTA file.

    Returns:
      Whatever `jackhmmer` returns; `process` treats it as the path to the
      UniRef90 hits.
    """
    return jackhmmer(
        input_fasta_path=input_fasta_path,
        jackhmmer_binary_path=self.jackhmmer_binary_path,
        database_path=self.uniref90_database_path,
        fname='uniref90_hits.sto',
        output_dir=self.msa_output_dir
    )

  def jackhmmer_mgnify(self, input_fasta_path: str):
    """Runs the `jackhmmer` helper against the MGnify database.

    Args:
      input_fasta_path: Path to the input FASTA file.

    Returns:
      Whatever `jackhmmer` returns; `process` treats it as the path to the
      MGnify hits.
    """
    return jackhmmer(
        input_fasta_path=input_fasta_path,
        jackhmmer_binary_path=self.jackhmmer_binary_path,
        database_path=self.mgnify_database_path,
        fname='mgnify.sto',
        output_dir=self.msa_output_dir
    )

  def hhsearch_pdb70(self, jackhmmer_uniref90_hits_path):
    """Runs the `hhsearch_pdb70` helper on the UniRef90 hits.

    Args:
      jackhmmer_uniref90_hits_path: Result of `jackhmmer_uniref90`.

    Returns:
      Whatever `hhsearch_pdb70` returns; `process` passes it on to
      `template_featurize` as the HHsearch hits path.
    """
    return hhsearch_pdb70(
        jackhmmer_uniref90_hits_path=jackhmmer_uniref90_hits_path,
        hhsearch_binary_path=self.hhsearch_binary_path,
        pdb70_database_path=self.pdb70_database_path,
        uniref_max_hits=self.uniref_max_hits,
        output_dir=self.msa_output_dir
    )

  def jackhmmer_small_bfd(self, input_fasta_path: str):
    """Runs the `jackhmmer` helper against the small BFD database.

    Args:
      input_fasta_path: Path to the input FASTA file.

    Returns:
      Whatever `jackhmmer` returns; `process` treats it as the path to the
      BFD hits.
    """
    return jackhmmer(
        input_fasta_path=input_fasta_path,
        jackhmmer_binary_path=self.jackhmmer_binary_path,
        database_path=self.small_bfd_database_path,
        fname='small_bfd_hits.sto',
        output_dir=self.msa_output_dir
    )

  def hhblits(self, input_fasta_path: str):
    """Runs the `hhblits` helper with the BFD and Uniclust30 databases.

    Args:
      input_fasta_path: Path to the input FASTA file.

    Returns:
      Whatever `hhblits` returns; `process` treats it as the path to the
      BFD hits.
    """
    return hhblits(
        input_fasta_path=input_fasta_path,
        hhblits_binary_path=self.hhblits_binary_path,
        bfd_database_path=self.bfd_database_path,
        uniclust30_database_path=self.uniclust30_database_path,
        output_dir=self.msa_output_dir
    )

  def template_featurize(self, input_fasta_path: str, hhsearch_hits_path):
    """Builds template features via the `template_featurize` helper.

    The template-related fields of this dataclass are forwarded in place of
    a pre-built `templates.TemplateHitFeaturizer`.

    Args:
      input_fasta_path: Path to the input FASTA file.
      hhsearch_hits_path: Result of `hhsearch_pdb70`.

    Returns:
      Whatever `template_featurize` returns; `process` unpacks it as a
      mapping into the final feature dict.
    """
    return template_featurize(
        input_fasta_path=input_fasta_path,
        hhsearch_hits_path=hhsearch_hits_path,
        mmcif_dir=self.mmcif_dir,
        max_template_date=self.max_template_date,
        max_hits=self.max_hits,
        kalign_binary_path=self.kalign_binary_path,
        release_dates_path=self.release_dates_path,
        obsolete_pdbs_path=self.obsolete_pdbs_path,
        strict_error_check=self.strict_error_check
    )

  def make_msa_features(self, jackhmmer_uniref90_hits_path,
                        jackhmmer_mgnify_hits_path, bfd_hits_path):
    """Builds MSA features via the `make_msa_features` helper.

    Args:
      jackhmmer_uniref90_hits_path: Result of `jackhmmer_uniref90`.
      jackhmmer_mgnify_hits_path: Result of `jackhmmer_mgnify`.
      bfd_hits_path: Result of `jackhmmer_small_bfd` or `hhblits`.

    Returns:
      Whatever `make_msa_features` returns; `process` unpacks it as a
      mapping into the final feature dict.
    """
    return make_msa_features(
        jackhmmer_uniref90_hits_path,
        jackhmmer_mgnify_hits_path,
        bfd_hits_path,
        mgnify_max_hits=self.mgnify_max_hits,
        use_small_bfd=self.use_small_bfd)

  def make_sequence_features(self, input_fasta_path: str):
    """Builds sequence features for the sequence in the input FASTA file.

    Args:
      input_fasta_path: Path to the input FASTA file.

    Returns:
      Whatever `alphafold.data.pipeline.make_sequence_features` returns;
      `process` unpacks it as a mapping into the final feature dict.
    """
    input_sequence, input_description, num_res = parse_fasta_path(
        input_fasta_path)
    return make_sequence_features(
        sequence=input_sequence,
        description=input_description,
        num_res=num_res)

  def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict:
    """Runs alignment tools on the input sequence and creates features.

    Args:
      input_fasta_path: Path to the input FASTA file.
      msa_output_dir: Output directory forwarded to every search stage. It
        is stored on the instance, replacing any previously set value.

    Returns:
      The sequence, MSA and template features merged into a single dict.
      On duplicate keys, template features take precedence over MSA
      features, which take precedence over sequence features.
    """
    # The stage methods read the output directory from the instance.
    self.msa_output_dir = msa_output_dir

    jackhmmer_uniref90_hits_path = self.jackhmmer_uniref90(input_fasta_path)
    hhsearch_hits_path = self.hhsearch_pdb70(jackhmmer_uniref90_hits_path)
    template_features = self.template_featurize(
        input_fasta_path, hhsearch_hits_path)

    if self.use_small_bfd:
      bfd_hits_path = self.jackhmmer_small_bfd(input_fasta_path)
    else:
      bfd_hits_path = self.hhblits(input_fasta_path)

    jackhmmer_mgnify_hits_path = self.jackhmmer_mgnify(input_fasta_path)
    sequence_features = self.make_sequence_features(input_fasta_path)
    msa_features = self.make_msa_features(
        jackhmmer_uniref90_hits_path,
        jackhmmer_mgnify_hits_path,
        bfd_hits_path)
    return {**sequence_features, **msa_features, **template_features}
2 changes: 1 addition & 1 deletion alphafold/data/templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -885,7 +885,7 @@ def get_templates(
errors.append(result.error)

# There could be an error even if there are some results, e.g. thrown by
# other unparseable chains in the same mmCIF file.
# other unparsable chains in the same mmCIF file.
if result.warning:
warnings.append(result.warning)

Expand Down
Loading