TACC · ethho · Aug 2, 2021 · Aug 2, 2021 · Aug 12, 2021 · Aug 13, 2021
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,129 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/README.md b/README.md
@@ -107,7 +107,7 @@ $DOWNLOAD_DIR/                             # Total: ~ 2.2 TB (download: 438 GB)
         mmcif_files/
             # About 180,000 .cif files.
         obsolete.dat
-    small_fbd/                             # ~ 17 GB (download: 9.6 GB)
+    small_bfd/                             # ~ 17 GB (download: 9.6 GB)
         bfd-first_non_consensus_sequences.fasta
     uniclust30/                            # ~ 86 GB (download: 24.9 GB)
         uniclust30_2018_08/
@@ -273,6 +273,10 @@ The contents of each output file are as follows:
         serve for a visualisation of domain packing confidence within the
         structure.
 
+The pLDDT confidence measure is stored in the B-factor field of the output PDB
+files (although unlike a B-factor, higher pLDDT is better, so care must be taken
+when using it for tasks such as molecular replacement).
+
 This code has been tested to match mean top-1 accuracy on a CASP14 test set with
 pLDDT ranking over 5 model predictions (some CASP targets were run with earlier
 versions of AlphaFold and some had manual interventions; see our forthcoming
@@ -319,7 +323,7 @@ For genetics:
 For templates:
 
 *   PDB: (downloaded 2020-05-14)
-*   PDB70: (downloaded 2020-05-13)
+*   PDB70: [2020-05-13](http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/old-releases/pdb70_from_mmcif_200513.tar.gz)
 
 An alternative for templates is to use the latest PDB and PDB70, but pass the
 flag `--max_template_date=2020-05-14`, which restricts templates only to

diff --git a/alphafold/common/protein.py b/alphafold/common/protein.py
@@ -194,7 +194,7 @@ def ideal_atom_mask(prot: Protein) -> np.ndarray:
 
   `Protein.atom_mask` typically is defined according to the atoms that are
   reported in the PDB. This function computes a mask according to heavy atoms
-  that should be present in the given seqence of amino acids.
+  that should be present in the given sequence of amino acids.
 
   Args:
     prot: `Protein` whose fields are `numpy.ndarray` objects.

diff --git a/alphafold/data/pipeline_mod.py b/alphafold/data/pipeline_mod.py
@@ -0,0 +1,134 @@
+"""Modular version of alphafold.data.pipeline"""
+
+import os
+from dataclasses import dataclass
+from typing import Mapping, Optional, Sequence
+from absl import logging
+from alphafold.data import parsers
+from alphafold.data import templates
+from alphafold.data.tools.cli import *
+from alphafold.data.pipeline import make_sequence_features
+import numpy as np
+
+# Internal import (7716).
+
+# Type alias: maps a feature name to the NumPy array holding its value.
+FeatureDict = Mapping[str, np.ndarray]
+
+
+@dataclass
+class ModularDataPipeline:
+  """Modular version of alphafold.data.pipeline.DataPipeline"""
+  jackhmmer_binary_path: str
+  hhblits_binary_path: str
+  hhsearch_binary_path: str
+  uniref90_database_path: str
+  mgnify_database_path: str
+  pdb70_database_path: str
+  use_small_bfd: bool
+
+  # for construction of TemplateHitFeaturizer, replacing
+  # template_featurizer: templates.TemplateHitFeaturizer
+  mmcif_dir: str
+  max_template_date: str
+  max_hits: int
+  kalign_binary_path: str
+  release_dates_path: str = None
+  obsolete_pdbs_path: str = None
+  strict_error_check: bool = False
+
+  mgnify_max_hits: int = 501
+  uniref_max_hits: int = 10000
+  bfd_database_path: str = None
+  uniclust30_database_path: str = None
+  small_bfd_database_path: str = None
+
+  def jackhmmer_uniref90(self, input_fasta_path: str):
+    return jackhmmer(
+      input_fasta_path=input_fasta_path, 
+      jackhmmer_binary_path=self.jackhmmer_binary_path, 
+      database_path=self.uniref90_database_path, 
+      fname='uniref90_hits.sto',
+      output_dir=self.msa_output_dir
+    )
+
+  def jackhmmer_mgnify(self, input_fasta_path: str):
+    return jackhmmer(
+      input_fasta_path=input_fasta_path, 
+      jackhmmer_binary_path=self.jackhmmer_binary_path, 
+      database_path=self.mgnify_database_path, 
+      fname='mgnify.sto',
+      output_dir=self.msa_output_dir
+    )
+
+  def hhsearch_pdb70(self, jackhmmer_uniref90_hits_path):
+    return hhsearch_pdb70(
+      jackhmmer_uniref90_hits_path=jackhmmer_uniref90_hits_path, 
+      hhsearch_binary_path=self.hhsearch_binary_path,
+      pdb70_database_path=self.pdb70_database_path, 
+      uniref_max_hits=self.uniref_max_hits,
+      output_dir=self.msa_output_dir
+    )
+
+  def jackhmmer_small_bfd(self, input_fasta_path):
+    return jackhmmer(
+      input_fasta_path=input_fasta_path, 
+      jackhmmer_binary_path=self.jackhmmer_binary_path, 
+      database_path=self.small_bfd_database_path, 
+      fname='small_bfd_hits.sto',
+      output_dir=self.msa_output_dir
+    )
+
+  def hhblits(self, input_fasta_path):
+    return hhblits(
+      input_fasta_path=input_fasta_path, 
+      hhblits_binary_path=self.hhblits_binary_path,
+      bfd_database_path=self.bfd_database_path,
+      uniclust30_database_path=self.uniclust30_database_path,
+      output_dir=self.msa_output_dir
+    )
+
+  def template_featurize(self, input_fasta_path, hhsearch_hits_path):
+    return template_featurize(
+      input_fasta_path=input_fasta_path, 
+      hhsearch_hits_path=hhsearch_hits_path,
+      mmcif_dir=self.mmcif_dir,
+      max_template_date=self.max_template_date, 
+      max_hits=self.max_hits, 
+      kalign_binary_path=self.kalign_binary_path,
+      release_dates_path=self.release_dates_path, 
+      obsolete_pdbs_path=self.obsolete_pdbs_path,
+      strict_error_check=self.strict_error_check
+    )
+
+  def make_msa_features(self, jackhmmer_uniref90_hits_path, jackhmmer_mgnify_hits_path,
+                        bfd_hits_path):
+    return make_msa_features(jackhmmer_uniref90_hits_path, jackhmmer_mgnify_hits_path,
+                             bfd_hits_path,
+                             mgnify_max_hits=self.mgnify_max_hits,
+                             use_small_bfd=self.use_small_bfd)
+
+  def make_sequence_features(self, input_fasta_path):
+    input_sequence, input_description, num_res = parse_fasta_path(input_fasta_path)
+    return make_sequence_features(sequence=input_sequence, 
+                                  description=input_description, 
+                                  num_res=num_res)
+
+  def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict:
+    """Runs alignment tools on the input sequence and creates features."""
+    self.msa_output_dir = msa_output_dir
+
+    jackhmmer_uniref90_hits_path = self.jackhmmer_uniref90(input_fasta_path)
+    hhsearch_hits_path = self.hhsearch_pdb70(jackhmmer_uniref90_hits_path)
+    template_features = self.template_featurize(input_fasta_path, hhsearch_hits_path)
+
+    if self.use_small_bfd:
+      bfd_hits_path = self.jackhmmer_small_bfd(input_fasta_path)
+    else:
+      bfd_hits_path = self.hhblits(input_fasta_path)
+
+    jackhmmer_mgnify_hits_path = self.jackhmmer_mgnify(input_fasta_path)
+    sequence_features = self.make_sequence_features(input_fasta_path)
+    msa_features = self.make_msa_features(jackhmmer_uniref90_hits_path,
+                                          jackhmmer_mgnify_hits_path,
+                                          bfd_hits_path)
+    return {**sequence_features, **msa_features, **template_features}
diff --git a/alphafold/data/templates.py b/alphafold/data/templates.py
@@ -885,7 +885,7 @@ def get_templates(
         errors.append(result.error)
 
       # There could be an error even if there are some results, e.g. thrown by
-      # other unparseable chains in the same mmCIF file.
+      # other unparsable chains in the same mmCIF file.
       if result.warning:
         warnings.append(result.warning)