Add CPU/thread control to MSA tools and update dependencies.

rpachauri · copybara-github · commit 4b81ad9cfe60 · 2025-09-24T08:30:25.000-07:00
Addresses the following pull request:

* #358

PiperOrigin-RevId: 810894445
Change-Id: Ib424b3f331c5168f4505edd154b860420e89237e
diff --git a/alphafold/data/pipeline.py b/alphafold/data/pipeline.py
@@ -112,6 +112,7 @@ class DataPipeline:
   """Runs the alignment tools and assembles the input features."""
 
   def __init__(self,
+               *,
                jackhmmer_binary_path: str,
                hhblits_binary_path: str,
                uniref90_database_path: str,
@@ -124,23 +125,28 @@ def __init__(self,
                use_small_bfd: bool,
                mgnify_max_hits: int = 501,
                uniref_max_hits: int = 10000,
-               use_precomputed_msas: bool = False):
+               use_precomputed_msas: bool = False,
+               msa_tools_n_cpu: int = 8):
     """Initializes the data pipeline."""
     self._use_small_bfd = use_small_bfd
     self.jackhmmer_uniref90_runner = jackhmmer.Jackhmmer(
         binary_path=jackhmmer_binary_path,
-        database_path=uniref90_database_path)
+        database_path=uniref90_database_path,
+        n_cpu=msa_tools_n_cpu)
     if use_small_bfd:
       self.jackhmmer_small_bfd_runner = jackhmmer.Jackhmmer(
           binary_path=jackhmmer_binary_path,
-          database_path=small_bfd_database_path)
+          database_path=small_bfd_database_path,
+          n_cpu=msa_tools_n_cpu)
     else:
       self.hhblits_bfd_uniref_runner = hhblits.HHBlits(
           binary_path=hhblits_binary_path,
-          databases=[bfd_database_path, uniref30_database_path])
+          databases=[bfd_database_path, uniref30_database_path],
+          n_cpu=msa_tools_n_cpu)
     self.jackhmmer_mgnify_runner = jackhmmer.Jackhmmer(
         binary_path=jackhmmer_binary_path,
-        database_path=mgnify_database_path)
+        database_path=mgnify_database_path,
+        n_cpu=msa_tools_n_cpu)
     self.template_searcher = template_searcher
     self.template_featurizer = template_featurizer
     self.mgnify_max_hits = mgnify_max_hits
diff --git a/alphafold/data/pipeline_multimer.py b/alphafold/data/pipeline_multimer.py
@@ -134,7 +134,7 @@ def add_assembly_features(
   # Group the chains by sequence
   seq_to_entity_id = {}
   grouped_chains = collections.defaultdict(list)
-  for chain_id, chain_features in all_chain_features.items():
+  for _, chain_features in all_chain_features.items():
     seq = str(chain_features['sequence'])
     if seq not in seq_to_entity_id:
       seq_to_entity_id[seq] = len(seq_to_entity_id) + 1
@@ -172,10 +172,12 @@ class DataPipeline:
 
   def __init__(self,
                monomer_data_pipeline: pipeline.DataPipeline,
+               *,
                jackhmmer_binary_path: str,
                uniprot_database_path: str,
                max_uniprot_hits: int = 50000,
-               use_precomputed_msas: bool = False):
+               use_precomputed_msas: bool = False,
+               jackhmmer_n_cpu: int = 8):
     """Initializes the data pipeline.
 
     Args:
@@ -186,11 +188,13 @@ def __init__(self,
         will be searched with jackhmmer and used for MSA pairing.
       max_uniprot_hits: The maximum number of hits to return from uniprot.
       use_precomputed_msas: Whether to use pre-existing MSAs; see run_alphafold.
+      jackhmmer_n_cpu: Number of CPUs to use for Jackhmmer.
     """
     self._monomer_data_pipeline = monomer_data_pipeline
     self._uniprot_msa_runner = jackhmmer.Jackhmmer(
         binary_path=jackhmmer_binary_path,
-        database_path=uniprot_database_path)
+        database_path=uniprot_database_path,
+        n_cpu=jackhmmer_n_cpu)
     self._max_uniprot_hits = max_uniprot_hits
     self.use_precomputed_msas = use_precomputed_msas
 
diff --git a/alphafold/data/tools/hhsearch.py b/alphafold/data/tools/hhsearch.py
@@ -33,7 +33,8 @@ def __init__(self,
                *,
                binary_path: str,
                databases: Sequence[str],
-               maxseq: int = 1_000_000):
+               maxseq: int = 1_000_000,
+               cpu: int = 8):
     """Initializes the Python HHsearch wrapper.
 
     Args:
@@ -43,13 +44,15 @@ def __init__(self,
         _hhm.ffindex etc.)
       maxseq: The maximum number of rows in an input alignment. Note that this
         parameter is only supported in HHBlits version 3.1 and higher.
+      cpu: The number of CPUs to use.
 
     Raises:
       RuntimeError: If HHsearch binary not found within the path.
     """
     self.binary_path = binary_path
     self.databases = databases
     self.maxseq = maxseq
+    self.cpu = cpu
 
     for database_path in self.databases:
       if not glob.glob(database_path + '_*'):
@@ -79,7 +82,8 @@ def query(self, a3m: str) -> str:
       cmd = [self.binary_path,
              '-i', input_path,
              '-o', hhr_path,
-             '-maxseq', str(self.maxseq)
+             '-maxseq', str(self.maxseq),
+             '-cpu', str(self.cpu),
              ] + db_cmd
 
       logging.info('Launching subprocess "%s"', ' '.join(cmd))
diff --git a/alphafold/data/tools/hmmsearch.py b/alphafold/data/tools/hmmsearch.py
@@ -33,7 +33,8 @@ def __init__(self,
                binary_path: str,
                hmmbuild_binary_path: str,
                database_path: str,
-               flags: Optional[Sequence[str]] = None):
+               flags: Optional[Sequence[str]] = None,
+               cpu: int = 8):
     """Initializes the Python hmmsearch wrapper.
 
     Args:
@@ -42,13 +43,15 @@ def __init__(self,
         an hmm from an input a3m.
       database_path: The path to the hmmsearch database (FASTA format).
       flags: List of flags to be used by hmmsearch.
+      cpu: The number of CPUs to use for the hmmsearch query.
 
     Raises:
       RuntimeError: If hmmsearch binary not found within the path.
     """
     self.binary_path = binary_path
     self.hmmbuild_runner = hmmbuild.Hmmbuild(binary_path=hmmbuild_binary_path)
     self.database_path = database_path
+    self.cpu = cpu
     if flags is None:
       # Default hmmsearch run settings.
       flags = ['--F1', '0.1',
@@ -89,7 +92,7 @@ def query_with_hmm(self, hmm: str) -> str:
       cmd = [
           self.binary_path,
           '--noali',  # Don't include the alignment in stdout.
-          '--cpu', '8'
+          '--cpu', str(self.cpu),
       ]
       # If adding flags, we have to do so before the output and input:
       if self.flags:
diff --git a/run_alphafold.py b/run_alphafold.py
@@ -143,6 +143,30 @@ class ModelsToRelax(enum.Enum):
                      'Relax on GPU can be much faster than CPU, so it is '
                      'recommended to enable if possible. GPUs must be available'
                      ' if this setting is enabled.')
+flags.DEFINE_integer(
+    'jackhmmer_n_cpu',
+    # Unfortunately, os.process_cpu_count() is only available in Python 3.13+.
+    min(len(os.sched_getaffinity(0)), 8),
+    'Number of CPUs to use for Jackhmmer. Defaults to min(cpu_count, 8). Going'
+    ' above 8 CPUs provides very little additional speedup.',
+    lower_bound=0,
+)
+flags.DEFINE_integer(
+    'hmmsearch_n_cpu',
+    # Unfortunately, os.process_cpu_count() is only available in Python 3.13+.
+    min(len(os.sched_getaffinity(0)), 8),
+    'Number of CPUs to use for HMMsearch. Defaults to min(cpu_count, 8). Going'
+    ' above 8 CPUs provides very little additional speedup.',
+    lower_bound=0,
+)
+flags.DEFINE_integer(
+    'hhsearch_n_cpu',
+    # Unfortunately, os.process_cpu_count() is only available in Python 3.13+.
+    min(len(os.sched_getaffinity(0)), 8),
+    'Number of CPUs to use for HHsearch. Defaults to min(cpu_count, 8). Going'
+    ' above 8 CPUs provides very little additional speedup.',
+    lower_bound=0,
+)
 
 FLAGS = flags.FLAGS
 
@@ -464,7 +488,8 @@ def main(argv):
     template_searcher = hmmsearch.Hmmsearch(
         binary_path=FLAGS.hmmsearch_binary_path,
         hmmbuild_binary_path=FLAGS.hmmbuild_binary_path,
-        database_path=FLAGS.pdb_seqres_database_path)
+        database_path=FLAGS.pdb_seqres_database_path,
+        cpu=FLAGS.hmmsearch_n_cpu)
     template_featurizer = templates.HmmsearchHitFeaturizer(
         mmcif_dir=FLAGS.template_mmcif_dir,
         max_template_date=FLAGS.max_template_date,
@@ -475,7 +500,8 @@ def main(argv):
   else:
     template_searcher = hhsearch.HHSearch(
         binary_path=FLAGS.hhsearch_binary_path,
-        databases=[FLAGS.pdb70_database_path])
+        databases=[FLAGS.pdb70_database_path],
+        cpu=FLAGS.hhsearch_n_cpu)
     template_featurizer = templates.HhsearchHitFeaturizer(
         mmcif_dir=FLAGS.template_mmcif_dir,
         max_template_date=FLAGS.max_template_date,
@@ -495,15 +521,17 @@ def main(argv):
       template_searcher=template_searcher,
       template_featurizer=template_featurizer,
       use_small_bfd=use_small_bfd,
-      use_precomputed_msas=FLAGS.use_precomputed_msas)
+      use_precomputed_msas=FLAGS.use_precomputed_msas,
+      msa_tools_n_cpu=FLAGS.jackhmmer_n_cpu)
 
   if run_multimer_system:
     num_predictions_per_model = FLAGS.num_multimer_predictions_per_model
     data_pipeline = pipeline_multimer.DataPipeline(
         monomer_data_pipeline=monomer_data_pipeline,
         jackhmmer_binary_path=FLAGS.jackhmmer_binary_path,
         uniprot_database_path=FLAGS.uniprot_database_path,
-        use_precomputed_msas=FLAGS.use_precomputed_msas)
+        use_precomputed_msas=FLAGS.use_precomputed_msas,
+        jackhmmer_n_cpu=FLAGS.jackhmmer_n_cpu)
   else:
     num_predictions_per_model = 1
     data_pipeline = monomer_data_pipeline