Skip to content

Commit ac85fdb

Browse files
committed
improve PartitionFinder mAIC algorithm
1 parent ca9bece commit ac85fdb

File tree

2 files changed

+49
-44
lines changed

2 files changed

+49
-44
lines changed

main/phylotesting.cpp

Lines changed: 48 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -4220,12 +4220,16 @@ double PartitionFinder::getmAICforMergeScheme(vector<set<int> > gene_sets, StrVe
42204220

42214221
ModelPairSet PartitionFinder::getBetterPairsmAIC() {
42224222
cout << "Compute mAIC score of partition models for compatible partition merging schemes..." << endl;
4223+
double cpu_time = getCPUTime();
4224+
double real_time = getRealTime();
4225+
42234226
double cur_score_maic = 0;
4227+
ModelPairSet sorted_pairs_bu = sorted_pairs;
42244228
ModelPairSet cur_better_pairs;
42254229
set<int> part_ids;
4226-
4227-
double cpu_time = getCPUTime();
4228-
double real_time = getRealTime();
4230+
int better_df = dfsum;
4231+
vector<set<int> > better_gene_sets = gene_sets;
4232+
StrVector better_model_names = model_names;
42294233

42304234
// evaluate all pairs of partitions for merging
42314235
int index = 0;
@@ -4242,33 +4246,43 @@ ModelPairSet PartitionFinder::getBetterPairsmAIC() {
42424246

42434247
//compute mAIC
42444248
ModelPair cur_pair = it->second;
4245-
int merged_df = dfsum - dfvec[cur_pair.part1] - dfvec[cur_pair.part2] + it->second.df;
42464249

42474250
// prepare partition IDs and model_names for subsets
4248-
vector<set<int> > cur_gene_sets = gene_sets;
4249-
StrVector model_names;
4250-
model_names.resize(in_tree->size());
4251-
for (int i = 0; i < gene_sets.size(); i++) {
4252-
model_names[i] = in_tree->at(i)->aln->model_name;
4253-
}
4251+
auto it_bu = next(sorted_pairs_bu.begin(), index); // as partition index may change in this loop, use back up to get back correct information
4252+
int cur_df = better_df - dfvec[it_bu->second.part1] - dfvec[it_bu->second.part2] + it->second.df;
4253+
vector<set<int> > cur_gene_sets = better_gene_sets;
4254+
StrVector cur_model_names = better_model_names;
4255+
42544256
cur_gene_sets[cur_pair.part1] = cur_pair.merged_set;
4255-
model_names[cur_pair.part1] = cur_pair.model_name;
4257+
cur_model_names[cur_pair.part1] = cur_pair.model_name;
42564258
cur_gene_sets.erase(cur_gene_sets.begin() + cur_pair.part2);
4257-
model_names.erase(model_names.begin() + cur_pair.part2);
4259+
cur_model_names.erase(cur_model_names.begin() + cur_pair.part2);
42584260

4259-
cur_score_maic = getmAICforMergeScheme(cur_gene_sets, model_names, merged_df, true);
4261+
cur_score_maic = getmAICforMergeScheme(cur_gene_sets, cur_model_names, cur_df, true);
42604262

42614263
if (cur_score_maic < inf_score_maic) {
4262-
cout << it->second.set_name << " will be merged with mAIC score: " << cur_score_maic
4263-
<< " (Marginal LnL: " << lh_marginal << " df: " << merged_df << ")" << endl;
4264-
//<< ", AIC: "<< it->second.score << " ( LnL: " << it->second.logl << ")" << endl;
4264+
cout << "Merging " << it->second.set_name << " with mAIC score: " << cur_score_maic
4265+
<< " (Marginal LnL: " << lh_marginal << " df: " << cur_df << ")" << endl;
42654266
part_ids.insert(it->second.merged_set.begin(), it->second.merged_set.end());
4266-
cur_better_pairs.insertPair(it->second);
4267-
} /*else {
4267+
cur_better_pairs.insertPair(it_bu->second);
4268+
4269+
inf_score_maic = cur_score_maic;
4270+
better_df = cur_df;
4271+
better_gene_sets = cur_gene_sets;
4272+
better_model_names = cur_model_names;
4273+
4274+
// decrease part ID for all pairs beyond opt_pair.part2
4275+
auto next_pair = it;
4276+
for (next_pair++; next_pair != sorted_pairs.end(); next_pair++) {
4277+
if (next_pair->second.part1 > cur_pair.part2)
4278+
next_pair->second.part1--;
4279+
if (next_pair->second.part2 > cur_pair.part2)
4280+
next_pair->second.part2--;
4281+
}
4282+
}/* else {
42684283
cout << "[mAIC] "<< it->second.set_name << " will [NOT] be merged with mAIC score: " << cur_score_maic
4269-
<< " (Marginal LnL: " << lh_marginal << " df: " << merged_df << "), AIC: "
4270-
<< it->second.score << " ( LnL: " << it->second.logl << ")" << endl;
4271-
} */
4284+
<< " (Marginal LnL: " << lh_marginal << " df: " << cur_df << ")" << endl;
4285+
}*/
42724286
}
42734287

42744288

@@ -4906,8 +4920,6 @@ void PartitionFinder::test_PartitionModel() {
49064920
cout << "Full partition model " << criterionName(params->model_test_criterion)
49074921
<< " score: " << inf_score << " (LnL: " << lhsum << " df:" << dfsum << ")" << endl;
49084922

4909-
StrVector model_names;
4910-
49114923
if (params->marginal_lh_aic) {
49124924
inf_score_maic = getmAICforMergeScheme(gene_sets, model_names, dfsum, false);
49134925
cout << "Full partition model mAIC score: " << inf_score_maic << " (Marginal LnL: " << lh_marginal << " df:" << dfsum << ")" << endl;
@@ -4999,7 +5011,7 @@ void PartitionFinder::test_PartitionModel() {
49995011
bool proceed_stepwise_merge = perform_merge;
50005012

50015013
// variables for mAIC merging
5002-
bool switch_to_caic = false;
5014+
bool switched_to_caic = false;
50035015
double lhsum_bu;
50045016
int dfsum_bu;
50055017
vector<set<int> > gene_sets_bu;
@@ -5041,7 +5053,7 @@ void PartitionFinder::test_PartitionModel() {
50415053
bool is_maic_pairs_empty = better_pairs.empty();
50425054

50435055
if (is_maic_pairs_empty){
5044-
if (switch_to_caic) {
5056+
if (switched_to_caic) {
50455057
cout << "No better pairs based on mAIC after one round of cAIC merging and finish merging with previous optimal mAIC" << endl;
50465058
//load back back up
50475059
lhsum = lhsum_bu;
@@ -5060,7 +5072,7 @@ void PartitionFinder::test_PartitionModel() {
50605072
};
50615073
cout << "No better pairs based on mAIC, try merging better pairs based on cAIC" << endl;
50625074
cout << compatible_pairs.size() << " compatible better partition pairs found based on cAIC" << endl;
5063-
switch_to_caic = true;
5075+
switched_to_caic = true;
50645076
//back up
50655077
lhsum_bu = lhsum;
50665078
dfsum_bu = dfsum;
@@ -5072,9 +5084,9 @@ void PartitionFinder::test_PartitionModel() {
50725084
greedy_model_trees_bu = greedy_model_trees;
50735085
}
50745086
} else {
5075-
if (switch_to_caic) {
5087+
if (switched_to_caic) {
50765088
cout << "Better pairs based on mAIC are found after one round of cAIC merging" << endl;
5077-
switch_to_caic = false;
5089+
switched_to_caic = false;
50785090
}
50795091
compatible_pairs = better_pairs;
50805092
}
@@ -5088,7 +5100,7 @@ void PartitionFinder::test_PartitionModel() {
50885100
dfsum = dfsum - dfvec[opt_pair.part1] - dfvec[opt_pair.part2] + opt_pair.df;
50895101
inf_score = computeInformationScore(lhsum, dfsum, ssize, params->model_test_criterion);
50905102

5091-
if (!params->marginal_lh_aic || switch_to_caic) {
5103+
if (!params->marginal_lh_aic || switched_to_caic) {
50925104
ASSERT(inf_score <= opt_pair.score + 0.1);
50935105
cout << "Merging " << opt_pair.set_name << " with " << criterionName(params->model_test_criterion)
50945106
<< " score: " << inf_score << " (LnL: " << lhsum << " df: " << dfsum << ")" << endl;
@@ -5133,25 +5145,19 @@ void PartitionFinder::test_PartitionModel() {
51335145
}
51345146
// save and output mAIC after merging all pairs
51355147
if (params->marginal_lh_aic) {
5136-
if (!switch_to_caic) {
5137-
double cur_score_maic = getmAICforMergeScheme(gene_sets, model_names, dfsum, true);
5138-
if (cur_score_maic < inf_score_maic) {
5139-
cout << "Current partition model mAIC score: " << cur_score_maic
5140-
<< " (Marginal LnL: " << lh_marginal << " df:" << dfsum << ")" << endl;
5141-
} else {
5142-
cout << "[Rare case] Current partition model mAIC score: " << cur_score_maic
5143-
<< " (Marginal LnL: " << lh_marginal << " df:" << dfsum << ") is worse than "
5144-
<< inf_score_maic << endl;
5145-
}
5148+
double cur_score_maic = getmAICforMergeScheme(gene_sets, model_names, dfsum, true);
5149+
if (!switched_to_caic) {
5150+
cout << "Current partition model mAIC score: " << cur_score_maic
5151+
<< " (Marginal LnL: " << lh_marginal << " df:" << dfsum << "). Current cAIC score: "
5152+
<< inf_score << " (LnL: " << lhsum << " df: " << dfsum << ")" << endl;
51465153
inf_score_maic = cur_score_maic;
51475154
} else {
5148-
double cur_score_maic = getmAICforMergeScheme(gene_sets, model_names, dfsum, true);
5149-
if (cur_score_maic < inf_score_maic) {
5155+
if (cur_score_maic <= inf_score_maic) {
51505156
cout << "Current partition model mAIC score: " << cur_score_maic
51515157
<< " (Marginal LnL: " << lh_marginal << " df:" << dfsum << ") is better than "
51525158
<< inf_score_maic << ", back to merging with mAIC since next round" << endl;
51535159
inf_score_maic = cur_score_maic;
5154-
switch_to_caic = false;
5160+
switched_to_caic = false;
51555161
} else {
51565162
cout << "Current partition model mAIC score: " << cur_score_maic
51575163
<< " (Marginal LnL: " << lh_marginal << " df:" << dfsum << ") is worse than "
@@ -5161,8 +5167,6 @@ void PartitionFinder::test_PartitionModel() {
51615167
cout << endl;
51625168
}
51635169

5164-
5165-
51665170
// proceed to the next iteration if gene_sets.size() >= 2
51675171
proceed_stepwise_merge = (gene_sets.size() >= 2);
51685172
}

main/phylotesting.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,7 @@ class PartitionFinder {
588588
int64_t num_model;
589589
vector<SubsetPair> closest_pairs;
590590
vector<set<int> > gene_sets; // vector all merged subsets, each containing the ID of original partitions
591+
StrVector model_names;
591592
PhyloSuperTree* in_tree;
592593
size_t ssize;
593594
Params *params;

0 commit comments

Comments
 (0)