MannLabs
diff --git a/alphaquant/benchm/benchmarking.py b/alphaquant/benchm/benchmarking.py
Lines changed: 1 addition & 1 deletion
diff --git a/alphaquant/classify/classify_fragments.py b/alphaquant/classify/classify_fragments.py
Lines changed: 18 additions & 18 deletions
diff --git a/alphaquant/cluster/cluster_ions.py b/alphaquant/cluster/cluster_ions.py
Lines changed: 23 additions & 22 deletions
@@ -352,7 +352,7 @@ def format_condpair_input(samplemap_df, condpair, minrep, input_file):
     LOGGER.info(condpair)
     samples_c1, samples_c2 = aqdiffutils.get_samples_used_from_samplemap_df(samplemap_df, condpair[0], condpair[1])
     input_df_local = aq_condpair.get_unnormed_df_condpair(input_file = input_file, samplemap_df = samplemap_df, condpair = condpair, file_has_alphaquant_format = True)
-    df_c1, df_c2 = aq_condpair.get_per_condition_dataframes(samples_c1, samples_c2, input_df_local, minrep_both=minrep)
+    df_c1, df_c2 = aq_condpair.get_per_condition_dataframes(samples_c1, samples_c2, input_df_local, minrep, "both", None, None)
     return df_c1, df_c2, samples_c1, samples_c2
 
 def get_filtered_protnodes(condpair, results_dir_unfiltered):
 
@@ -32,9 +32,9 @@ def assign_predictability_scores_stacked(protein_nodes, results_dir, name, acqui
     #prepare the input table with all the relevant features for machine learning
 
     protein_nodes = list(sorted(protein_nodes, key  = lambda x : x.name))
-    
+
     fragion_selector = FragionForTrainingSelector(protein_nodes, min_num_fragions = min_num_fragions)
-    
+
     LOGGER.info(f"{fragion_selector.num_fragions_suitable_for_training} of {fragion_selector.num_fragions_total} selected for training")
 
     if fragion_selector.num_fragions_suitable_for_training<100:
@@ -51,7 +51,7 @@ def assign_predictability_scores_stacked(protein_nodes, results_dir, name, acqui
 
     featurenames_str = ', '.join(ml_input_for_training.featurenames)
     LOGGER.info(f"starting RF prediction using features {featurenames_str}")
-    
+
     models = train_random_forest_ensemble(ml_input_for_training.X, ml_input_for_training.y, num_splits = 5, shorten_features_for_speed=shorten_features_for_speed)
 
     y_pred = predict_on_models(models,ml_input_for_training.X)
@@ -73,10 +73,10 @@ def assign_predictability_scores_stacked(protein_nodes, results_dir, name, acqui
         aq_plot_classify.plot_value_histogram(y_pred_total, results_dir_plots)
 
 
-    
+
     ionnames_total = ml_input_for_training.ionnames + ml_input_remaining.ionnames
     all_fragion_basenodes = fragion_selector.fragions_suitable_for_training + fragion_selector.fragions_not_suitable_for_training
-    
+
     #annotate the fragion nodes
     annotate_fragion_basenodes(all_fragion_basenodes, ionnames_total, y_pred_total) #two new variables added to each node:
     #update_fold_change_of_the_fragion_iontype_node(fragion_selector.fragment_iontype_nodes)
@@ -107,7 +107,7 @@ def __init__(self, protein_nodes,  min_num_fragions = 3 ):
         self.num_fragions_not_suitable_for_training = len(self.fragions_not_suitable_for_training)
         self.num_fragions_total = self.num_fragions_suitable_for_training + self.num_fragions_not_suitable_for_training
 
-    
+
     def _define_iontype_nodes(self):
         for protein_node in self._protein_nodes:
             iontype_nodes = anytree.search.findall(protein_node, filter_=lambda node: node.level == "ion_type")
@@ -134,7 +134,7 @@ def __init__(self, fragions, acquisition_info_df, define_y,replace_nans = False,
         self._numeric_threshold = numeric_threshold #fraction of non-nan values in a column, if less, the column is removed
 
         self._merged_df = None
-        
+
         self.X = None # the input for the ML model which has corresponding y values, so it is possible to train with this table
         self.y = None
         self.featurenames = None
@@ -165,18 +165,18 @@ def _collect_node_parameters(self):
 
     def _define_ionnames(self):
         """Store the QUANT_ID column of the merged table as a plain list in ``self.ionnames``."""
         quant_id_column = self._merged_df[aq_conf_vars.QUANT_ID]
         self.ionnames = list(quant_id_column)
-        
+
 
     def _remove_non_numeric_columns_from_merged_df(self):
         columns_to_drop = []
         self._merged_df = self._merged_df.drop(columns=[aq_conf_vars.QUANT_ID])
         self._merged_df = self._merged_df.apply(lambda col: pd.to_numeric(col, errors='coerce')) #'coerce' will turn non-numeric values into NaN
-        
+
         for column in self._merged_df.columns:
             proportion_non_nans = self._merged_df[column].notna().mean()
             if proportion_non_nans < self._numeric_threshold:
                 columns_to_drop.append(column)
-        
+
         self._merged_df = self._merged_df.drop(columns=columns_to_drop)
 
     def _define_featurenames(self):
@@ -190,12 +190,12 @@ def _define_X(self):
             self.X =  X_imputed
         else:
             self.X = X_df.to_numpy()
-    
+
 
     def _define_y(self):
         """Fill ``self.y`` with one value per entry of ``self.ionnames``, taken from ``_get_fragnormed_fc``."""
         fc_by_ion_name = {}
         for fragion in self._fragions:
             fc_by_ion_name[fragion.name] = self._get_fragnormed_fc(fragion)
         # ion names without a matching fragion map to None (dict.get default)
         looked_up_fcs = [fc_by_ion_name.get(ion_name) for ion_name in self.ionnames]
         self.y = np.array(looked_up_fcs)
-    
+
     @staticmethod
     def _get_fragnormed_fc(base_node):
         base_fc = base_node.fc
@@ -239,7 +239,7 @@ def train_random_forest_ensemble(X, y, shorten_features_for_speed, num_splits=5)
                                                        max_features=max_features)  # Reduce the number of features
         model.fit(X_train, y_train)
         models.append(model)
-    
+
     return models
 
 
@@ -255,15 +255,15 @@ def annotate_fragion_basenodes(all_fragions_basenodes, ionnames_total, y_pred_to
     for fragion in all_fragions_basenodes:
         y_pred = ion2pred.get(fragion.name)
         fragion.ml_score_fragion = abs(y_pred)
-    
+
 
 
 def update_fold_change_of_the_fragion_iontype_node(all_fragions_iontype_nodes):
     """Overwrite each ion-type node's ``fc`` with a weighted mean of its children's ``fc``.
     The ion-type nodes are the parents of the base nodes. Every child is weighted by
     ``2 ** -abs(child.ml_score_fragion)``, so a child whose ml score is further from
     zero contributes less. Nodes are modified in place; nothing is returned.
     """
     for iontype_node in all_fragions_iontype_nodes:
         child_fcs = []
         child_weights = []
         for child in iontype_node.children:
             child_fcs.append(child.fc)
             child_weights.append(2 ** -abs(child.ml_score_fragion))
         iontype_node.fc = np.average(child_fcs, weights=child_weights)
-        
+
 
 def update_fold_change_of_the_mod_seq_ch_node(all_fragions_iontype_nodes):
     for fragion in all_fragions_iontype_nodes:
@@ -279,7 +279,7 @@ def propagate_new_fcs_along_the_tree(protein_nodes):
             if nodelevel == "ion_type":
                 continue
             for level_node in level_nodes:
-                aq_cluster_utils.aggregate_node_properties(level_node, only_use_mainclust=True, use_fewpeps_per_protein=True)
+                aq_cluster_utils.aggregate_node_properties(level_node, only_use_mainclust=True, peptide_outlier_filtering=False)
 
 
 # def update_nodes_w_ml_score(protnodes):
@@ -298,7 +298,7 @@ def propagate_new_fcs_along_the_tree(protein_nodes):
 #             had_ml_score = hasattr(child_nodes[0], 'ml_score')
 #             if had_ml_score:
 #                 re_order_clusters_by_ml_score(child_nodes)
-#                 aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, use_fewpeps_per_protein=True)
+#                 aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, peptide_outlier_filtering=True)
 import copy
 
 def re_order_fragion_iontype_nodes_by_score(fragion_iontype_nodes):
@@ -342,6 +342,6 @@ def propagate_new_clusters_along_the_tree(protein_nodes):
             if nodelevel == "base":
                 continue
             for level_node in level_nodes:
-                aq_cluster_utils.aggregate_node_properties(level_node, only_use_mainclust=True, use_fewpeps_per_protein=True)
+                aq_cluster_utils.aggregate_node_properties(level_node, only_use_mainclust=True, peptide_outlier_filtering=True, fraction_highly_significant=0.08)
 
 
@@ -21,19 +21,20 @@
 LEVEL_NAMES = ['ion_type', 'mod_seq_charge', 'mod_seq', 'seq']
 MAPPING_DICT = {'SEQ':'seq', 'MOD':'mod_seq', 'CHARGE':'mod_seq_charge', 'MS1ISOTOPES':'ms1_isotopes','FRGION':'frgion', 'PRECURSOR' : 'precursor'}
 FCDIFF_CUTOFF_CLUSTERMERGE = 0
-LEVEL2PVALTHRESH = {'ion_type':0.01, 'mod_seq_charge':0.01, 'mod_seq':1e-20, 'seq':1e-20} #the pval threshold is only set at the gene level, the rest of the levels are set as specified here. The threshold applies to the children of the node
 
+LEVEL2PVALTHRESH = {'ion_type':0.01, 'mod_seq_charge':0.01, 'mod_seq':1e-20, 'seq':0.2} #the pval threshold is only set at the gene level, the rest of the levels are set as specified here. The threshold applies to the children of the node
 
 
 
 
-def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion,
-                                    fcdiff_cutoff_clustermerge):
+
+def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fcdiff_cutoff_clustermerge):
     #typefilter = TypeFilter('successive')
 
     global FCDIFF_CUTOFF_CLUSTERMERGE
     FCDIFF_CUTOFF_CLUSTERMERGE = fcdiff_cutoff_clustermerge
 
+
     diffions = sorted(diffions, key = lambda x : x.name)
     name2diffion = {x.name : x for x in diffions}
     root_node = create_hierarchical_ion_grouping(gene_name, diffions)
@@ -87,13 +88,13 @@ def add_reduced_names_to_root(node):
         node.name_reduced = node.name.replace(node.parent.name, "")
     else:
         node.name_reduced = node.name
-    
+
 
 import pandas as pd
 def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion):#~60% of overall runtime
     #typefilter object specifies filtering and clustering of the nodes
     aqcluster_utils.assign_properties_to_base_ions(root_node, ionname2diffion, normed_c1, normed_c2)
-    
+
     for level_nodes in  aqcluster_utils.iterate_through_tree_levels_bottom_to_top(root_node):
         nodetypes_at_level = list(set([node.type for node in level_nodes]))
         if nodetypes_at_level == ["base"]:
@@ -105,7 +106,7 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
             for type_node in type_nodes: #this goes through each precursor individually and clusters the children
                 child_nodes = type_node.children
                 grouped_mainclust_leafs = aqcluster_utils.get_grouped_mainclust_leafs(child_nodes) #leafs are excluded if they are not in the main cluster
-                
+
                 if len(grouped_mainclust_leafs)==0: #this means the leafs were previously excluded
                     exclude_node(type_node)
                     continue
@@ -119,12 +120,12 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
                     childnode2clust = find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold) #the clustering is performed on the child nodes
                     childnode2clust = merge_similar_clusters_if_applicable(childnode2clust, type_node, fcdiff_cutoff_clustermerge = FCDIFF_CUTOFF_CLUSTERMERGE)
                     childnode2clust = aq_cluster_sorting.decide_cluster_order(childnode2clust)
-                
+
                 aq_cluster_pfstats.add_proteoform_statistics_to_nodes(childnode2clust, take_median_ion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist)
                 aqcluster_utils.assign_clusterstats_to_type_node(type_node, childnode2clust)
                 aqcluster_utils.annotate_mainclust_leaves(childnode2clust)
                 aqcluster_utils.assign_cluster_number(type_node, childnode2clust)
-                aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, use_fewpeps_per_protein=True)
+                aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, peptide_outlier_filtering=False)
 
     return root_node
 
@@ -153,11 +154,11 @@ def find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2dif
     diffions_idxs = [[x] for x in range(len(diffions))]
     diffions_fcs = aqcluster_utils.get_fcs_ions(diffions)
     #mt_corrected_pval_thresh = pval_threshold_basis/len(diffions)
-    condensed_similarity_matrix = scipy.spatial.distance.pdist(diffions_idxs, lambda idx1, idx2: evaluate_similarity(idx1[0], idx2[0], diffions, diffions_fcs, normed_c1, normed_c2, ion2diffDist,p2z, 
+    condensed_similarity_matrix = scipy.spatial.distance.pdist(diffions_idxs, lambda idx1, idx2: evaluate_similarity(idx1[0], idx2[0], diffions, diffions_fcs, normed_c1, normed_c2, ion2diffDist,p2z,
                                                                                                    deedpair2doublediffdist, fcfc_threshold)) #gives p-values of the pairwise comparisons of the ions
     condensed_similarity_matrix_mt_corrected = get_multiple_testing_corrected_condensed_similarity_matrix(condensed_similarity_matrix)
     condensed_distance_matrix_mt_corrected = 1/condensed_similarity_matrix_mt_corrected
-    
+
     after_clust = scipy.cluster.hierarchy.ward(condensed_distance_matrix_mt_corrected)
     clustered = scipy.cluster.hierarchy.fcluster(after_clust, 1/(pval_threshold_basis), criterion='distance')
     clustered = aqcluster_utils.exchange_cluster_idxs(clustered)
@@ -173,20 +174,20 @@ def get_pval_threshold_basis(type_node, pval_threshold_basis): #the pval thresho
         return pval_threshold_basis
     else:
         return LEVEL2PVALTHRESH.get(type_node.level, 0.2)
-    
+
 def get_multiple_testing_corrected_condensed_similarity_matrix(condensed_distance_matrix: np.array):
     """
     condensed_distance_matrix contains all p-values of the pairwise comparisons of the ions. They are by definition dependent.
-    
+
     Args:
     condensed_distance_matrix (np.array): Condensed distance matrix containing p-values of pairwise comparisons.
-    
+
     Returns:
     np.array: Corrected condensed distance matrix.
     """
     # Apply Benjamini-Yekutieli correction
     _, corrected_pvalues, _, _ = multitest.multipletests(condensed_distance_matrix, method='fdr_by')
-    
+
     # Return the corrected condensed matrix
     return corrected_pvalues
 
@@ -238,25 +239,25 @@ def update_childnode2clust(childnode2clust, old_clusters, new_clusters):
         new_clust = old2new[old_clust]
         childnode2clust_new.append((childnode, new_clust))
     return childnode2clust_new
-    
 
 
 
-def evaluate_similarity(idx1: int, idx2: int, 
-                        diffions: list[aq_diff_analysis.DifferentialIon], 
+
+def evaluate_similarity(idx1: int, idx2: int,
+                        diffions: list[aq_diff_analysis.DifferentialIon],
                         fcs: list[list[int]],
-                        normed_c1: aq_diff_background.BackGroundDistribution, 
+                        normed_c1: aq_diff_background.BackGroundDistribution,
                         normed_c2: aq_diff_background.BackGroundDistribution,
                         ion2diffDist: dict[str, aq_diff_background.SubtractedBackgrounds],
-                        p2z: dict[str, str], 
+                        p2z: dict[str, str],
                         deedpair2doublediffdist: dict[tuple[aq_diff_background.SubtractedBackgrounds, aq_diff_background.SubtractedBackgrounds],aq_diff_background.SubtractedBackgrounds],
                         fcfc_threshold: float) -> float:
     """
     Evaluate the statistical similarity between two sets of ions based on their properties and fold changes.
-    
+
     This function calculates a p-value representing the statistical similarity between two sets of ions,
     testing the null hypothesis that the two sets are not significantly different.
-    
+
     Args:
         idx1 (int): Index of the first set of ions in the diffions list.
         idx2 (int): Index of the second set of ions in the diffions list.
@@ -268,7 +269,7 @@ def evaluate_similarity(idx1: int, idx2: int,
         p2z (dict[str, str]): Dictionary for converting p-values to z-scores.
         deedpair2doublediffdist (dict[tuple[aq_diff_background.SubtractedBackgrounds, aq_diff_background.SubtractedBackgrounds], aq_diff_background.SubtractedBackgrounds]): Mapping of ion pairs to their double difference distributions.
         fcfc_threshold (float): Threshold for considering fold changes as similar.
-    
+
     Returns:
         float: A p-value where higher values suggest greater similarity between ion sets.
                Returns 0.99 for fold changes below fcfc_threshold.