21
21
LEVEL_NAMES = ['ion_type' , 'mod_seq_charge' , 'mod_seq' , 'seq' ]
22
22
MAPPING_DICT = {'SEQ' :'seq' , 'MOD' :'mod_seq' , 'CHARGE' :'mod_seq_charge' , 'MS1ISOTOPES' :'ms1_isotopes' ,'FRGION' :'frgion' , 'PRECURSOR' : 'precursor' }
23
23
FCDIFF_CUTOFF_CLUSTERMERGE = 0
24
- LEVEL2PVALTHRESH = {'ion_type' :0.01 , 'mod_seq_charge' :0.01 , 'mod_seq' :1e-20 , 'seq' :1e-20 } #the pval threshold is only set at the gene level, the rest of the levels are set as specified here. The threshold applies to the children of the node
25
24
25
+ LEVEL2PVALTHRESH = {'ion_type' :0.01 , 'mod_seq_charge' :0.01 , 'mod_seq' :1e-20 , 'seq' :0.2 } #the pval threshold is only set at the gene level, the rest of the levels are set as specified here. The threshold applies to the children of the node
26
26
27
27
28
28
29
29
30
- def get_scored_clusterselected_ions ( gene_name , diffions , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist , pval_threshold_basis , fcfc_threshold , take_median_ion ,
31
- fcdiff_cutoff_clustermerge ):
30
+
31
+ def get_scored_clusterselected_ions ( gene_name , diffions , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist , pval_threshold_basis , fcfc_threshold , take_median_ion , fcdiff_cutoff_clustermerge ):
32
32
#typefilter = TypeFilter('successive')
33
33
34
34
global FCDIFF_CUTOFF_CLUSTERMERGE
35
35
FCDIFF_CUTOFF_CLUSTERMERGE = fcdiff_cutoff_clustermerge
36
36
37
+
37
38
diffions = sorted (diffions , key = lambda x : x .name )
38
39
name2diffion = {x .name : x for x in diffions }
39
40
root_node = create_hierarchical_ion_grouping (gene_name , diffions )
@@ -87,13 +88,13 @@ def add_reduced_names_to_root(node):
87
88
node .name_reduced = node .name .replace (node .parent .name , "" )
88
89
else :
89
90
node .name_reduced = node .name
90
-
91
+
91
92
92
93
import pandas as pd
93
94
def cluster_along_specified_levels (root_node , ionname2diffion , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist , pval_threshold_basis , fcfc_threshold , take_median_ion ):#~60% of overall runtime
94
95
#typefilter object specifies filtering and clustering of the nodes
95
96
aqcluster_utils .assign_properties_to_base_ions (root_node , ionname2diffion , normed_c1 , normed_c2 )
96
-
97
+
97
98
for level_nodes in aqcluster_utils .iterate_through_tree_levels_bottom_to_top (root_node ):
98
99
nodetypes_at_level = list (set ([node .type for node in level_nodes ]))
99
100
if nodetypes_at_level == ["base" ]:
@@ -105,7 +106,7 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
105
106
for type_node in type_nodes : #this goes through each precursor individually and clusters the children
106
107
child_nodes = type_node .children
107
108
grouped_mainclust_leafs = aqcluster_utils .get_grouped_mainclust_leafs (child_nodes ) #leafs are excluded if they are not in the main cluster
108
-
109
+
109
110
if len (grouped_mainclust_leafs )== 0 : #this means the leafs were previously excluded
110
111
exclude_node (type_node )
111
112
continue
@@ -119,12 +120,12 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
119
120
childnode2clust = find_fold_change_clusters (type_node , diffions , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist , pval_threshold_basis , fcfc_threshold ) #the clustering is performed on the child nodes
120
121
childnode2clust = merge_similar_clusters_if_applicable (childnode2clust , type_node , fcdiff_cutoff_clustermerge = FCDIFF_CUTOFF_CLUSTERMERGE )
121
122
childnode2clust = aq_cluster_sorting .decide_cluster_order (childnode2clust )
122
-
123
+
123
124
aq_cluster_pfstats .add_proteoform_statistics_to_nodes (childnode2clust , take_median_ion , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist )
124
125
aqcluster_utils .assign_clusterstats_to_type_node (type_node , childnode2clust )
125
126
aqcluster_utils .annotate_mainclust_leaves (childnode2clust )
126
127
aqcluster_utils .assign_cluster_number (type_node , childnode2clust )
127
- aqcluster_utils .aggregate_node_properties (type_node ,only_use_mainclust = True , use_fewpeps_per_protein = True )
128
+ aqcluster_utils .aggregate_node_properties (type_node ,only_use_mainclust = True , peptide_outlier_filtering = False )
128
129
129
130
return root_node
130
131
@@ -153,11 +154,11 @@ def find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2dif
153
154
diffions_idxs = [[x ] for x in range (len (diffions ))]
154
155
diffions_fcs = aqcluster_utils .get_fcs_ions (diffions )
155
156
#mt_corrected_pval_thresh = pval_threshold_basis/len(diffions)
156
- condensed_similarity_matrix = scipy .spatial .distance .pdist (diffions_idxs , lambda idx1 , idx2 : evaluate_similarity (idx1 [0 ], idx2 [0 ], diffions , diffions_fcs , normed_c1 , normed_c2 , ion2diffDist ,p2z ,
157
+ condensed_similarity_matrix = scipy .spatial .distance .pdist (diffions_idxs , lambda idx1 , idx2 : evaluate_similarity (idx1 [0 ], idx2 [0 ], diffions , diffions_fcs , normed_c1 , normed_c2 , ion2diffDist ,p2z ,
157
158
deedpair2doublediffdist , fcfc_threshold )) #gives p-values of the pairwise comparisons of the ions
158
159
condensed_similarity_matrix_mt_corrected = get_multiple_testing_corrected_condensed_similarity_matrix (condensed_similarity_matrix )
159
160
condensed_distance_matrix_mt_corrected = 1 / condensed_similarity_matrix_mt_corrected
160
-
161
+
161
162
after_clust = scipy .cluster .hierarchy .ward (condensed_distance_matrix_mt_corrected )
162
163
clustered = scipy .cluster .hierarchy .fcluster (after_clust , 1 / (pval_threshold_basis ), criterion = 'distance' )
163
164
clustered = aqcluster_utils .exchange_cluster_idxs (clustered )
@@ -173,20 +174,20 @@ def get_pval_threshold_basis(type_node, pval_threshold_basis): #the pval thresho
173
174
return pval_threshold_basis
174
175
else :
175
176
return LEVEL2PVALTHRESH .get (type_node .level , 0.2 )
176
-
177
+
177
178
def get_multiple_testing_corrected_condensed_similarity_matrix(condensed_distance_matrix: np.ndarray) -> np.ndarray:
    """
    Apply a multiple-testing correction to the pairwise p-values of the ion comparisons.

    Despite the parameter name (kept for backward compatibility), the input holds
    p-values, i.e. similarities, in condensed (pdist-style) form. The pairwise
    comparisons are by definition dependent, which is why the Benjamini-Yekutieli
    procedure is used for the correction.

    Args:
        condensed_distance_matrix (np.ndarray): Condensed matrix containing the
            p-values of the pairwise comparisons.

    Returns:
        np.ndarray: The corrected p-values, as returned by multipletests for the
            given input.
    """
    # multipletests returns (reject, pvals_corrected, alphacSidak, alphacBonf);
    # only the corrected p-values (index 1) are needed here.
    corrected_pvalues = multitest.multipletests(condensed_distance_matrix, method='fdr_by')[1]

    # Return the corrected condensed matrix
    return corrected_pvalues
192
193
@@ -238,25 +239,25 @@ def update_childnode2clust(childnode2clust, old_clusters, new_clusters):
238
239
new_clust = old2new [old_clust ]
239
240
childnode2clust_new .append ((childnode , new_clust ))
240
241
return childnode2clust_new
241
-
242
242
243
243
244
244
245
- def evaluate_similarity (idx1 : int , idx2 : int ,
246
- diffions : list [aq_diff_analysis .DifferentialIon ],
245
+
246
+ def evaluate_similarity (idx1 : int , idx2 : int ,
247
+ diffions : list [aq_diff_analysis .DifferentialIon ],
247
248
fcs : list [list [int ]],
248
- normed_c1 : aq_diff_background .BackGroundDistribution ,
249
+ normed_c1 : aq_diff_background .BackGroundDistribution ,
249
250
normed_c2 : aq_diff_background .BackGroundDistribution ,
250
251
ion2diffDist : dict [str , aq_diff_background .SubtractedBackgrounds ],
251
- p2z : dict [str , str ],
252
+ p2z : dict [str , str ],
252
253
deedpair2doublediffdist : dict [tuple [aq_diff_background .SubtractedBackgrounds , aq_diff_background .SubtractedBackgrounds ],aq_diff_background .SubtractedBackgrounds ],
253
254
fcfc_threshold : float ) -> float :
254
255
"""
255
256
Evaluate the statistical similarity between two sets of ions based on their properties and fold changes.
256
-
257
+
257
258
This function calculates a p-value representing the statistical similarity between two sets of ions,
258
259
testing the null hypothesis that the two sets are not significantly different.
259
-
260
+
260
261
Args:
261
262
idx1 (int): Index of the first set of ions in the diffions list.
262
263
idx2 (int): Index of the second set of ions in the diffions list.
@@ -268,7 +269,7 @@ def evaluate_similarity(idx1: int, idx2: int,
268
269
p2z (dict[str, str]): Dictionary for converting p-values to z-scores.
269
270
deedpair2doublediffdist (dict[tuple[aq_diff_background.SubtractedBackgrounds, aq_diff_background.SubtractedBackgrounds], aq_diff_background.SubtractedBackgrounds]): Mapping of ion pairs to their double difference distributions.
270
271
fcfc_threshold (float): Threshold for considering fold changes as similar.
271
-
272
+
272
273
Returns:
273
274
float: A p-value where higher values suggest greater similarity between ion sets.
274
275
Returns 0.99 for fold changes below fcfc_threshold.
0 commit comments