Skip to content

Commit 4c6a6cc

Browse files
authored
Merge pull request #109 from MannLabs/outlier_ptm_treeviz
Outlier ptm treeviz
2 parents a6b1453 + 9998fd5 commit 4c6a6cc

24 files changed

+10693
-320
lines changed

alphaquant/benchm/benchmarking.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,7 @@ def format_condpair_input(samplemap_df, condpair, minrep, input_file):
352352
LOGGER.info(condpair)
353353
samples_c1, samples_c2 = aqdiffutils.get_samples_used_from_samplemap_df(samplemap_df, condpair[0], condpair[1])
354354
input_df_local = aq_condpair.get_unnormed_df_condpair(input_file = input_file, samplemap_df = samplemap_df, condpair = condpair, file_has_alphaquant_format = True)
355-
df_c1, df_c2 = aq_condpair.get_per_condition_dataframes(samples_c1, samples_c2, input_df_local, minrep_both=minrep)
355+
df_c1, df_c2 = aq_condpair.get_per_condition_dataframes(samples_c1, samples_c2, input_df_local, minrep, "both", None, None)
356356
return df_c1, df_c2, samples_c1, samples_c2
357357

358358
def get_filtered_protnodes(condpair, results_dir_unfiltered):

alphaquant/classify/classify_fragments.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,9 @@ def assign_predictability_scores_stacked(protein_nodes, results_dir, name, acqui
3232
#prepare the input table with all the relevant features for machine learning
3333

3434
protein_nodes = list(sorted(protein_nodes, key = lambda x : x.name))
35-
35+
3636
fragion_selector = FragionForTrainingSelector(protein_nodes, min_num_fragions = min_num_fragions)
37-
37+
3838
LOGGER.info(f"{fragion_selector.num_fragions_suitable_for_training} of {fragion_selector.num_fragions_total} selected for training")
3939

4040
if fragion_selector.num_fragions_suitable_for_training<100:
@@ -51,7 +51,7 @@ def assign_predictability_scores_stacked(protein_nodes, results_dir, name, acqui
5151

5252
featurenames_str = ', '.join(ml_input_for_training.featurenames)
5353
LOGGER.info(f"starting RF prediction using features {featurenames_str}")
54-
54+
5555
models = train_random_forest_ensemble(ml_input_for_training.X, ml_input_for_training.y, num_splits = 5, shorten_features_for_speed=shorten_features_for_speed)
5656

5757
y_pred = predict_on_models(models,ml_input_for_training.X)
@@ -73,10 +73,10 @@ def assign_predictability_scores_stacked(protein_nodes, results_dir, name, acqui
7373
aq_plot_classify.plot_value_histogram(y_pred_total, results_dir_plots)
7474

7575

76-
76+
7777
ionnames_total = ml_input_for_training.ionnames + ml_input_remaining.ionnames
7878
all_fragion_basenodes = fragion_selector.fragions_suitable_for_training + fragion_selector.fragions_not_suitable_for_training
79-
79+
8080
#annotate the fragion nodes
8181
annotate_fragion_basenodes(all_fragion_basenodes, ionnames_total, y_pred_total) #two new variables added to each node:
8282
#update_fold_change_of_the_fragion_iontype_node(fragion_selector.fragment_iontype_nodes)
@@ -107,7 +107,7 @@ def __init__(self, protein_nodes, min_num_fragions = 3 ):
107107
self.num_fragions_not_suitable_for_training = len(self.fragions_not_suitable_for_training)
108108
self.num_fragions_total = self.num_fragions_suitable_for_training + self.num_fragions_not_suitable_for_training
109109

110-
110+
111111
def _define_iontype_nodes(self):
112112
for protein_node in self._protein_nodes:
113113
iontype_nodes = anytree.search.findall(protein_node, filter_=lambda node: node.level == "ion_type")
@@ -134,7 +134,7 @@ def __init__(self, fragions, acquisition_info_df, define_y,replace_nans = False,
134134
self._numeric_threshold = numeric_threshold #fraction of non-nan values in a column, if less, the column is removed
135135

136136
self._merged_df = None
137-
137+
138138
self.X = None # the input for the ML model which has corresponding y values, so it is possible to train with this table
139139
self.y = None
140140
self.featurenames = None
@@ -165,18 +165,18 @@ def _collect_node_parameters(self):
165165

166166
def _define_ionnames(self):
167167
self.ionnames = list(self._merged_df[aq_conf_vars.QUANT_ID])
168-
168+
169169

170170
def _remove_non_numeric_columns_from_merged_df(self):
171171
columns_to_drop = []
172172
self._merged_df = self._merged_df.drop(columns=[aq_conf_vars.QUANT_ID])
173173
self._merged_df = self._merged_df.apply(lambda col: pd.to_numeric(col, errors='coerce')) #'coerce' will turn non-numeric values into NaN
174-
174+
175175
for column in self._merged_df.columns:
176176
proportion_non_nans = self._merged_df[column].notna().mean()
177177
if proportion_non_nans < self._numeric_threshold:
178178
columns_to_drop.append(column)
179-
179+
180180
self._merged_df = self._merged_df.drop(columns=columns_to_drop)
181181

182182
def _define_featurenames(self):
@@ -190,12 +190,12 @@ def _define_X(self):
190190
self.X = X_imputed
191191
else:
192192
self.X = X_df.to_numpy()
193-
193+
194194

195195
def _define_y(self):
196196
ion2fc = {x.name: self._get_fragnormed_fc(x) for x in self._fragions}
197197
self.y = np.array([ion2fc.get(ion) for ion in self.ionnames])
198-
198+
199199
@staticmethod
200200
def _get_fragnormed_fc(base_node):
201201
base_fc = base_node.fc
@@ -239,7 +239,7 @@ def train_random_forest_ensemble(X, y, shorten_features_for_speed, num_splits=5)
239239
max_features=max_features) # Reduce the number of features
240240
model.fit(X_train, y_train)
241241
models.append(model)
242-
242+
243243
return models
244244

245245

@@ -255,15 +255,15 @@ def annotate_fragion_basenodes(all_fragions_basenodes, ionnames_total, y_pred_to
255255
for fragion in all_fragions_basenodes:
256256
y_pred = ion2pred.get(fragion.name)
257257
fragion.ml_score_fragion = abs(y_pred)
258-
258+
259259

260260

261261
def update_fold_change_of_the_fragion_iontype_node(all_fragions_iontype_nodes): #the iontype nodes are the parents of the basenodes
262262
for fragion_iontype in all_fragions_iontype_nodes:
263263
weigths = [2**-abs(fragion.ml_score_fragion) for fragion in fragion_iontype.children]
264264
fcs = [fragion.fc for fragion in fragion_iontype.children]
265265
fragion_iontype.fc = np.average(fcs, weights=weigths)
266-
266+
267267

268268
def update_fold_change_of_the_mod_seq_ch_node(all_fragions_iontype_nodes):
269269
for fragion in all_fragions_iontype_nodes:
@@ -279,7 +279,7 @@ def propagate_new_fcs_along_the_tree(protein_nodes):
279279
if nodelevel == "ion_type":
280280
continue
281281
for level_node in level_nodes:
282-
aq_cluster_utils.aggregate_node_properties(level_node, only_use_mainclust=True, use_fewpeps_per_protein=True)
282+
aq_cluster_utils.aggregate_node_properties(level_node, only_use_mainclust=True, peptide_outlier_filtering=False)
283283

284284

285285
# def update_nodes_w_ml_score(protnodes):
@@ -298,7 +298,7 @@ def propagate_new_fcs_along_the_tree(protein_nodes):
298298
# had_ml_score = hasattr(child_nodes[0], 'ml_score')
299299
# if had_ml_score:
300300
# re_order_clusters_by_ml_score(child_nodes)
301-
# aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, use_fewpeps_per_protein=True)
301+
# aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, peptide_outlier_filtering=True)
302302
import copy
303303

304304
def re_order_fragion_iontype_nodes_by_score(fragion_iontype_nodes):
@@ -342,6 +342,6 @@ def propagate_new_clusters_along_the_tree(protein_nodes):
342342
if nodelevel == "base":
343343
continue
344344
for level_node in level_nodes:
345-
aq_cluster_utils.aggregate_node_properties(level_node, only_use_mainclust=True, use_fewpeps_per_protein=True)
345+
aq_cluster_utils.aggregate_node_properties(level_node, only_use_mainclust=True, peptide_outlier_filtering=True, fraction_highly_significant=0.08)
346346

347347

alphaquant/cluster/cluster_ions.py

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -21,19 +21,20 @@
2121
LEVEL_NAMES = ['ion_type', 'mod_seq_charge', 'mod_seq', 'seq']
2222
MAPPING_DICT = {'SEQ':'seq', 'MOD':'mod_seq', 'CHARGE':'mod_seq_charge', 'MS1ISOTOPES':'ms1_isotopes','FRGION':'frgion', 'PRECURSOR' : 'precursor'}
2323
FCDIFF_CUTOFF_CLUSTERMERGE = 0
24-
LEVEL2PVALTHRESH = {'ion_type':0.01, 'mod_seq_charge':0.01, 'mod_seq':1e-20, 'seq':1e-20} #the pval threshold is only set at the gene level, the rest of the levels are set as specified here. The threshold applies to the children of the node
2524

25+
LEVEL2PVALTHRESH = {'ion_type':0.01, 'mod_seq_charge':0.01, 'mod_seq':1e-20, 'seq':0.2} #the pval threshold is only set at the gene level, the rest of the levels are set as specified here. The threshold applies to the children of the node
2626

2727

2828

2929

30-
def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion,
31-
fcdiff_cutoff_clustermerge):
30+
31+
def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fcdiff_cutoff_clustermerge):
3232
#typefilter = TypeFilter('successive')
3333

3434
global FCDIFF_CUTOFF_CLUSTERMERGE
3535
FCDIFF_CUTOFF_CLUSTERMERGE = fcdiff_cutoff_clustermerge
3636

37+
3738
diffions = sorted(diffions, key = lambda x : x.name)
3839
name2diffion = {x.name : x for x in diffions}
3940
root_node = create_hierarchical_ion_grouping(gene_name, diffions)
@@ -87,13 +88,13 @@ def add_reduced_names_to_root(node):
8788
node.name_reduced = node.name.replace(node.parent.name, "")
8889
else:
8990
node.name_reduced = node.name
90-
91+
9192

9293
import pandas as pd
9394
def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion):#~60% of overall runtime
9495
#typefilter object specifies filtering and clustering of the nodes
9596
aqcluster_utils.assign_properties_to_base_ions(root_node, ionname2diffion, normed_c1, normed_c2)
96-
97+
9798
for level_nodes in aqcluster_utils.iterate_through_tree_levels_bottom_to_top(root_node):
9899
nodetypes_at_level = list(set([node.type for node in level_nodes]))
99100
if nodetypes_at_level == ["base"]:
@@ -105,7 +106,7 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
105106
for type_node in type_nodes: #this goes through each precursor individually and clusters the children
106107
child_nodes = type_node.children
107108
grouped_mainclust_leafs = aqcluster_utils.get_grouped_mainclust_leafs(child_nodes) #leafs are excluded if they are not in the main cluster
108-
109+
109110
if len(grouped_mainclust_leafs)==0: #this means the leafs were previously excluded
110111
exclude_node(type_node)
111112
continue
@@ -119,12 +120,12 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
119120
childnode2clust = find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold) #the clustering is performed on the child nodes
120121
childnode2clust = merge_similar_clusters_if_applicable(childnode2clust, type_node, fcdiff_cutoff_clustermerge = FCDIFF_CUTOFF_CLUSTERMERGE)
121122
childnode2clust = aq_cluster_sorting.decide_cluster_order(childnode2clust)
122-
123+
123124
aq_cluster_pfstats.add_proteoform_statistics_to_nodes(childnode2clust, take_median_ion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist)
124125
aqcluster_utils.assign_clusterstats_to_type_node(type_node, childnode2clust)
125126
aqcluster_utils.annotate_mainclust_leaves(childnode2clust)
126127
aqcluster_utils.assign_cluster_number(type_node, childnode2clust)
127-
aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, use_fewpeps_per_protein=True)
128+
aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, peptide_outlier_filtering=False)
128129

129130
return root_node
130131

@@ -153,11 +154,11 @@ def find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2dif
153154
diffions_idxs = [[x] for x in range(len(diffions))]
154155
diffions_fcs = aqcluster_utils.get_fcs_ions(diffions)
155156
#mt_corrected_pval_thresh = pval_threshold_basis/len(diffions)
156-
condensed_similarity_matrix = scipy.spatial.distance.pdist(diffions_idxs, lambda idx1, idx2: evaluate_similarity(idx1[0], idx2[0], diffions, diffions_fcs, normed_c1, normed_c2, ion2diffDist,p2z,
157+
condensed_similarity_matrix = scipy.spatial.distance.pdist(diffions_idxs, lambda idx1, idx2: evaluate_similarity(idx1[0], idx2[0], diffions, diffions_fcs, normed_c1, normed_c2, ion2diffDist,p2z,
157158
deedpair2doublediffdist, fcfc_threshold)) #gives p-values of the pairwise comparisons of the ions
158159
condensed_similarity_matrix_mt_corrected = get_multiple_testing_corrected_condensed_similarity_matrix(condensed_similarity_matrix)
159160
condensed_distance_matrix_mt_corrected = 1/condensed_similarity_matrix_mt_corrected
160-
161+
161162
after_clust = scipy.cluster.hierarchy.ward(condensed_distance_matrix_mt_corrected)
162163
clustered = scipy.cluster.hierarchy.fcluster(after_clust, 1/(pval_threshold_basis), criterion='distance')
163164
clustered = aqcluster_utils.exchange_cluster_idxs(clustered)
@@ -173,20 +174,20 @@ def get_pval_threshold_basis(type_node, pval_threshold_basis): #the pval thresho
173174
return pval_threshold_basis
174175
else:
175176
return LEVEL2PVALTHRESH.get(type_node.level, 0.2)
176-
177+
177178
def get_multiple_testing_corrected_condensed_similarity_matrix(condensed_distance_matrix: np.array):
178179
"""
179180
condensed_distance_matrix contains all p-values of the pairwise comparisons of the ions. They are by definition dependent.
180-
181+
181182
Args:
182183
condensed_distance_matrix (np.array): Condensed distance matrix containing p-values of pairwise comparisons.
183-
184+
184185
Returns:
185186
np.array: Corrected condensed distance matrix.
186187
"""
187188
# Apply Benjamini-Yekutieli correction
188189
_, corrected_pvalues, _, _ = multitest.multipletests(condensed_distance_matrix, method='fdr_by')
189-
190+
190191
# Return the corrected condensed matrix
191192
return corrected_pvalues
192193

@@ -238,25 +239,25 @@ def update_childnode2clust(childnode2clust, old_clusters, new_clusters):
238239
new_clust = old2new[old_clust]
239240
childnode2clust_new.append((childnode, new_clust))
240241
return childnode2clust_new
241-
242242

243243

244244

245-
def evaluate_similarity(idx1: int, idx2: int,
246-
diffions: list[aq_diff_analysis.DifferentialIon],
245+
246+
def evaluate_similarity(idx1: int, idx2: int,
247+
diffions: list[aq_diff_analysis.DifferentialIon],
247248
fcs: list[list[int]],
248-
normed_c1: aq_diff_background.BackGroundDistribution,
249+
normed_c1: aq_diff_background.BackGroundDistribution,
249250
normed_c2: aq_diff_background.BackGroundDistribution,
250251
ion2diffDist: dict[str, aq_diff_background.SubtractedBackgrounds],
251-
p2z: dict[str, str],
252+
p2z: dict[str, str],
252253
deedpair2doublediffdist: dict[tuple[aq_diff_background.SubtractedBackgrounds, aq_diff_background.SubtractedBackgrounds],aq_diff_background.SubtractedBackgrounds],
253254
fcfc_threshold: float) -> float:
254255
"""
255256
Evaluate the statistical similarity between two sets of ions based on their properties and fold changes.
256-
257+
257258
This function calculates a p-value representing the statistical similarity between two sets of ions,
258259
testing the null hypothesis that the two sets are not significantly different.
259-
260+
260261
Args:
261262
idx1 (int): Index of the first set of ions in the diffions list.
262263
idx2 (int): Index of the second set of ions in the diffions list.
@@ -268,7 +269,7 @@ def evaluate_similarity(idx1: int, idx2: int,
268269
p2z (dict[str, str]): Dictionary for converting p-values to z-scores.
269270
deedpair2doublediffdist (dict[tuple[aq_diff_background.SubtractedBackgrounds, aq_diff_background.SubtractedBackgrounds], aq_diff_background.SubtractedBackgrounds]): Mapping of ion pairs to their double difference distributions.
270271
fcfc_threshold (float): Threshold for considering fold changes as similar.
271-
272+
272273
Returns:
273274
float: A p-value where higher values suggest greater similarity between ion sets.
274275
Returns 0.99 for fold changes below fcfc_threshold.

0 commit comments

Comments
 (0)