Dreycey · Dreycey · Aug 4, 2024 · Aug 4, 2024 · Aug 4, 2024 · Aug 5, 2024
diff --git a/PhageScanner/main/database_adapters.py b/PhageScanner/main/database_adapters.py
@@ -13,7 +13,7 @@
 import re
 from abc import ABC, abstractmethod
 from enum import Enum
-from typing import List
+from typing import Generator, Any
 
 import requests
 from bs4 import BeautifulSoup
@@ -223,7 +223,7 @@ def get_phanns_query(query, extra=""):
         modified_query += extra
         return modified_query
 
-    def esearch(self, query, batch_size=10000) -> List[str]:
+    def esearch(self, query, batch_size=10000) -> Generator[Any, Any, Any]:
         """Return a list of URIs.
 
         Description:

diff --git a/PhageScanner/main/feature_extractors.py b/PhageScanner/main/feature_extractors.py
@@ -16,9 +16,11 @@
 from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Dict, List, Optional
+from concurrent.futures import ProcessPoolExecutor
 
 import numpy as np
 from Bio.SeqUtils.ProtParam import ProteinAnalysis
+from keras.preprocessing.sequence import pad_sequences
 
 from PhageScanner.main.exceptions import (
     IncorrectValueError,
@@ -51,6 +53,7 @@ class FeatureExtractorNames(Enum):
     onehot = "SEQUENTIALONEHOT"
     pcp = "PCP"
     chemfeatures = "CHEMFEATURES"
+    integerencoding = "INTEGERENCODING"
 
     @classmethod
     def get_extractor(cls, name, parameters: Optional[Dict]):
@@ -68,6 +71,7 @@ def get_extractor(cls, name, parameters: Optional[Dict]):
             cls.onehot.value: SequentialOneHot,
             cls.pcp.value: PCPExtractor,
             cls.chemfeatures.value: ChemFeatureExtractor,
+            cls.integerencoding.value: IntegerEncoding
         }
 
         # instantiate the class
@@ -113,7 +117,7 @@ class ProteinFeatureExtraction(ABC):
         "Y": {"C": 9, "H": 9, "N": 1, "O": 2, "S": 0},  # Tyrosine
     }
 
-    canonical_amino_acids = set(sorted(list(amino_acid_atom_counts.keys())))
+    canonical_amino_acids = sorted(amino_acid_atom_counts.keys())
 
     @abstractmethod
     def extract_features(self, protein: str):
@@ -495,7 +499,7 @@ class SequentialOneHot(ProteinFeatureExtraction):
     def __init__(self, parameters: Optional[Dict] = None):
         """Instantiate tokenization extract method."""
         self.aa2index = {aa: ind for ind, aa in enumerate(self.canonical_amino_acids)}
-        self.matrix_length = 1000
+        self.matrix_length = 500
 
     def extract_features(self, protein: str):
         """Obtain an tokenization of the protein sequence."""
@@ -563,6 +567,30 @@ def extract_features(self, protein: str):
 
         return hash_vec
 
+class IntegerEncoding(ProteinFeatureExtraction):
+    """Feature extractor that maps each residue of a protein to an integer."""
+
+    def __init__(self, parameters: Optional[Dict] = None):
+        """Build the residue-to-integer lookup table.
+
+        `parameters` is accepted but not used by this extractor.
+        """
+        alphabet = "ACDEFGHIKLMNPQRSTVWY"
+        # Codes start at 1; extract_features uses 0 for any other symbol.
+        self.aa_dict = {
+            residue: code for code, residue in enumerate(alphabet, start=1)
+        }
+
+    def extract_features(self, protein: str, max_length=500):
+        """Encode `protein` as integers, post-padded/truncated to `max_length`."""
+        encoded = [self.aa_dict.get(residue, 0) for residue in protein]
+        fixed_length = pad_sequences(
+            [encoded],
+            maxlen=max_length,
+            padding="post",
+            truncating="post",
+        )
+        return fixed_length[0]
 
 class CTDExtractor(ProteinFeatureExtraction):
     """Extraction method for obtaining Composition-transition-distribution (CTD)"""
@@ -694,6 +722,36 @@ def extract_features(self, protein):
             count += 1
 
         return [sequential_features]
+
+def extract_feature_vector(proteins, model_features, segment_size: int = 0):
+    """Extract and stack the feature vectors for a collection of proteins.
+
+    `model_features` yields (extractor name, parameters) pairs; a non-zero
+    `segment_size` selects SequentialProteinFeatureAggregator. Fewer than 1000
+    proteins run serially, otherwise in a process pool. Returns np.vstack output.
+    """
+    logging.info("Starting to extract protein features.")
+    feature_list = [
+        FeatureExtractorNames.get_extractor(feature_name, parameters)
+        for feature_name, parameters in model_features
+    ]
+    if segment_size:
+        aggregator = SequentialProteinFeatureAggregator(
+            extractors=feature_list, segment_size=segment_size
+        )
+    else:
+        aggregator = ProteinFeatureAggregator(extractors=feature_list)
+
+    extract = aggregator.extract_features
+    if len(proteins) < 1000:
+        features = [extract(protein) for protein in proteins]
+    else:
+        # Batch tasks: the default chunksize=1 dispatches one protein at a time.
+        with ProcessPoolExecutor() as executor:
+            features = list(executor.map(extract, proteins, chunksize=64))
+
+    logging.info("Finished extracting features.")
+    return np.vstack(np.array(features))
 
 
 if __name__ == "__main__":