Skip to content

MixtureFinder update #47

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 32 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
f2c802b
allow mAIC calculation for 0 or 1 "overlapping" sequences.
HuaiyanRen Mar 4, 2025
96d6b53
allow mAIC calculation for 2 "overlapping" sequences.
HuaiyanRen Mar 7, 2025
7357f83
method: add one gappy sequence for "2-overlapping" cases.
HuaiyanRen Mar 19, 2025
aa60951
fix bugs for old method of "2-overlapping" cases. Although we don't u…
HuaiyanRen Mar 19, 2025
219fd3f
Fix protein ambiguous preblem for mAIC. Only compute mAIC for DNA and…
HuaiyanRen Mar 21, 2025
687002e
Allow mAIC calculation for binary and codon data. Make mAIC calculati…
HuaiyanRen Mar 26, 2025
13b6892
Allow to fix the parameters for RHAS when using mixture finder. Also …
thomaskf Apr 16, 2025
81f4e7a
Merge pull request #19 from thomaskf/mixFinder_update
thomaskf Apr 16, 2025
4710559
Fixed the issue happened when user specifies the RHAS model when runn…
thomaskf Apr 16, 2025
dd0c411
Merge pull request #20 from thomaskf/mixFinder_update
thomaskf Apr 16, 2025
f262dbd
Merge branch 'master' into mixFinder_update
thomaskf May 9, 2025
53ec25a
Merge branch 'master' into huaiyan
HuaiyanRen May 9, 2025
988531d
Allow the option -m MIX+MF, MIX+MFP, MF+MIX, MFP+MIX to run MixtureFi…
thomaskf May 9, 2025
61342be
Allow mixtureFinder on an alignment with a single partition
thomaskf May 9, 2025
016a6b8
Fixed the double commas inside the best_model.nex file
thomaskf May 9, 2025
0442a7a
Set the best model reported by mixturefinder to the first partition
thomaskf May 12, 2025
0ec0308
Save the model checkpoint to the file when mixtureFinder finishes
thomaskf May 12, 2025
d4aebb4
Update mixtureFinder for alignment with single partition
thomaskf May 12, 2025
17eecd6
Merge remote-tracking branch 'origin/mixFinder_update' into mixFinder…
HuaiyanRen May 12, 2025
e0e397c
report the mixture model in single partition in mixture model format.
HuaiyanRen May 12, 2025
98779b8
Fixed the issue of using these models LG4X,LG4M in -madd
thomaskf May 12, 2025
d29ff88
report the mixture model in only in single-partition alignment in mix…
HuaiyanRen May 12, 2025
64423b1
Copy the resulting final tree to the first partition for mixfinder wi…
thomaskf May 12, 2025
1e073ec
update the checkpoint file to keep the model parameters of the best m…
thomaskf May 14, 2025
0a9f5a5
Merge branch 'huaiyan' into mixFinder_update
HuaiyanRen May 15, 2025
c6b25ab
report the mixture model in only in single-partition alignment STILL …
HuaiyanRen May 15, 2025
66c5278
introduce an option -mfopt to further optimise nucl/aa frequencies in…
thomaskf May 15, 2025
6fc4e70
Add an option --writefreqchkpt to force writing nucl/aa freq to check…
thomaskf May 15, 2025
86c8cab
Fixed the issue in modelfinder
thomaskf May 20, 2025
781d602
Revert "Add an option --writefreqchkpt to force writing nucl/aa freq …
thomaskf May 22, 2025
2d18c90
Always output the frequencies to the checkpoint file.
thomaskf May 22, 2025
63e2ab4
Fix the issue for using mixtureFinder on alignment with single partition
thomaskf May 26, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions main/phyloanalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -928,8 +928,7 @@ void reportTree(ofstream &out, Params &params, PhyloTree &tree, double tree_lh,
int ntrees; //mix_df;
double mix_lh;

string maic_warning;
mix_lh = tree.getModelFactory()->computeMixLh(maic_warning);
mix_lh = tree.getModelFactory()->computeMarginalLh();
if (mix_lh < 0) {
PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
ntrees = stree->size();
Expand All @@ -940,13 +939,13 @@ void reportTree(ofstream &out, Params &params, PhyloTree &tree, double tree_lh,
computeInformationScores(mix_lh, df, ssize, mAIC, mAICc, mBIC);

out << endl;
out << "Mixture-based log-likelihood of the tree: " << mix_lh << endl;
out << "Marginal log-likelihood of the tree: " << mix_lh << endl;
out << "Marginal Akaike information criterion (mAIC) score: " << mAIC << endl;
//out << "Marginal corrected Akaike information criterion (mAICc) score: " << mAICc << endl;
//out << "Marginal Bayesian information criterion (mBIC) score: " << mBIC << endl;
} else {
out << endl;
out << maic_warning << endl;
out << "mAIC calculation is skipped because not all partition sequence types are same" << endl;
}
}

Expand Down Expand Up @@ -1239,7 +1238,7 @@ void printOutfilesInfo(Params &params, IQTree &tree) {
cout << " Concatenated alignment: " << params.out_prefix
<< ".conaln" << endl;
}
if ((params.model_name.find("TEST") != string::npos || params.model_name.substr(0,2) == "MF") && tree.isSuperTree()) {
if ((params.model_name.find("TEST") != string::npos || params.model_name.substr(0,2) == "MF" || params.model_name.substr(0,6) == "MIX+MF") && tree.isSuperTree()) {
cout << " Best partitioning scheme: " << params.out_prefix << ".best_scheme.nex" << endl;
bool raxml_format_printed = true;

Expand Down Expand Up @@ -1351,7 +1350,7 @@ void reportSubstitutionProcess(ostream &out, Params &params, IQTree &tree)
out << "Edge-unlinked partition model with ";
else
out << "Topology-unlinked partition model with ";

// if (params.model_joint)
if (!params.model_joint.empty())
out << "joint substitution model ";
Expand All @@ -1365,8 +1364,10 @@ void reportSubstitutionProcess(ostream &out, Params &params, IQTree &tree)

PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
PhyloSuperTree::iterator it;
it = stree->begin();

int part;

// force showing full params if running AliSim
bool show_full_params = tree.params->alisim_active;

Expand Down Expand Up @@ -1670,7 +1671,10 @@ void reportPhyloAnalysis(Params &params, IQTree &tree, ModelCheckpoint &model_in
<< endl;
*/
if (params.compute_ml_tree) {
if (params.model_name.find("ONLY") != string::npos || (params.model_name.substr(0,2) == "MF" && params.model_name.substr(0,3) != "MFP")) {
if (params.model_name == "MIX+MF" || params.model_name == "MF+MIX") {
out << "TREE USED FOR MixtureFinder" << endl
<< "---------------------------" << endl << endl;
} else if (params.model_name.find("ONLY") != string::npos || (params.model_name.substr(0,2) == "MF" && params.model_name.substr(0,3) != "MFP")) {
out << "TREE USED FOR ModelFinder" << endl
<< "-------------------------" << endl << endl;
} else if (params.min_iterations == 0) {
Expand Down
161 changes: 116 additions & 45 deletions main/phylotesting.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1292,6 +1292,11 @@ void runModelFinder(Params &params, IQTree &iqtree, ModelCheckpoint &model_info,
return;
}

if (params.model_name == "MIX+MF" || params.model_name == "MIX+MFP" || params.model_name == "MF+MIX" || params.model_name == "MFP+MIX") {
// mixture finder
return;
}

if (nest_network.size() == 0 && iqtree.aln->seq_type == SEQ_DNA) {
// build the nest relationship between the models
// we will use the optimized parameters of the last model which is nested by this model
Expand Down Expand Up @@ -1486,6 +1491,9 @@ void runModelFinder(Params &params, IQTree &iqtree, ModelCheckpoint &model_info,
iqtree.aln->model_name = best_model.getName();
best_subst_name = best_model.subst_name;
best_rate_name = best_model.rate_name;
string best_orig_rate_name = best_model.orig_rate_name;
if (under_mix_finder)
CKP_SAVE(best_orig_rate_name); // for mixture finder
// Checkpoint *checkpoint = &model_info;
string best_model_AIC, best_model_AICc, best_model_BIC;
CKP_RESTORE(best_model_AIC);
Expand All @@ -1502,9 +1510,6 @@ void runModelFinder(Params &params, IQTree &iqtree, ModelCheckpoint &model_info,
#endif
}

// remove key "OptModel" from the checkpoint file, which is only used for initialising models from the nested models.
iqtree.getCheckpoint()->eraseKeyPrefix("OptModel");

delete models_block;

// force to dump all checkpointing information
Expand Down Expand Up @@ -1984,8 +1989,10 @@ string CandidateModel::evaluate(Params &params,

// obtain the likelihood value from the (k-1)-class mixture model
string criteria_str = criterionName(params.model_test_criterion);
string best_model = in_model_info["best_model_" + criteria_str];
string best_model_logl_df = in_model_info[best_model];
string best_model;
ASSERT(in_model_info.getString("best_model_" + criteria_str, best_model));
string best_model_logl_df;
ASSERT(in_model_info.getString(best_model, best_model_logl_df));
stringstream ss (best_model_logl_df);
double pre_logl;
ss >> pre_logl;
Expand Down Expand Up @@ -2066,7 +2073,6 @@ string CandidateModel::evaluate(Params &params,
logl += new_logl;
string tree_string = iqtree->getTreeString();

//cout << "[optimized] " << iqtree->getModelFactory()->model->getNameParams(false) << endl;

if (syncChkPoint != nullptr)
iqtree->getModelFactory()->syncChkPoint = nullptr;
Expand Down Expand Up @@ -3246,6 +3252,10 @@ CandidateModel CandidateModelSet::test(Params &params, PhyloTree* in_tree, Model
CKP_SAVE(best_score_AIC);
CKP_SAVE(best_score_AICc);
CKP_SAVE(best_score_BIC);

// remove key "OptModel" from the checkpoint file, which is only used for initialising models from the nested models.
model_info.eraseKeyPrefix("OptModel");

checkpoint->dump();

delete [] model_rank;
Expand Down Expand Up @@ -6472,15 +6482,11 @@ CandidateModel findMixtureComponent(Params &params, IQTree &iqtree, ModelCheckpo
<< criterionName(params.model_test_criterion) << endl;

// remove key "OptModel" from the checkpoint file, which is only used for initialising models from the nested models.
iqtree.getCheckpoint()->eraseKeyPrefix("OptModel");
model_info.eraseKeyPrefix(model_info.getStructName() + "OptModel");

delete models_block;

// force to dump all checkpointing information
model_info.dump(true);

// transfer models parameters
transferModelFinderParameters(&iqtree, orig_checkpoint);
model_info.dump();
iqtree.setCheckpoint(orig_checkpoint);

params.model_set = orig_model_set;
Expand Down Expand Up @@ -6551,16 +6557,20 @@ void runMixtureFinderMain(Params &params, IQTree* &iqtree, ModelCheckpoint &mode
params.model_name = "";
bool under_mix_finder = true;
runModelFinder(params, *iqtree, model_info, best_subst_name, best_rate_name, nest_network, under_mix_finder);
string best_orig_rate_name;
ASSERT(model_info.getString("best_orig_rate_name", best_orig_rate_name));

// (cancel) Step 2: do tree search for this single-class model
// runTreeReconstruction(params, iqtree);
// curr_df = iqtree->getModelFactory()->getNParameters(BRLEN_OPTIMIZE);
// curr_loglike = iqtree->getCurScore();
// curr_score = computeInformationScore(curr_loglike, curr_df, ssize, params.model_test_criterion);
string best_model_logl_df = model_info[best_subst_name+best_rate_name];
string best_model_logl_df;
ASSERT(model_info.getString(best_subst_name+best_rate_name, best_model_logl_df));
stringstream ss (best_model_logl_df);
ss >> curr_loglike >> curr_df;
string best_score = model_info["best_score_" + criteria_str];
string best_score;
ASSERT(model_info.getString("best_score_" + criteria_str, best_score));
curr_score = convert_double(best_score.c_str());

cout << endl << "Model: " << best_subst_name << best_rate_name << "; df: " << curr_df << "; loglike: " << curr_loglike << "; " << criteria_str << " score: " << curr_score << endl;
Expand All @@ -6579,6 +6589,8 @@ void runMixtureFinderMain(Params &params, IQTree* &iqtree, ModelCheckpoint &mode
do_init_tree = false;
model_str = best_subst_name;
do {
if (params.optimize_from_given_params == false)
best_rate_name = best_orig_rate_name;
best_model = findMixtureComponent(params, *iqtree, model_info, MA_ADD_CLASS, do_init_tree, model_str, best_subst_name, best_rate_name, nest_network);
cout << endl << "Model: " << best_subst_name << best_rate_name << "; df: " << best_model.df << "; loglike: " << best_model.logl << "; " << criteria_str << " score: " << best_model.getScore() << ";";
if (params.opt_qmix_criteria == 1) {
Expand Down Expand Up @@ -6610,8 +6622,15 @@ void runMixtureFinderMain(Params &params, IQTree* &iqtree, ModelCheckpoint &mode
model_info.put("best_model_AIC", best_model_pre_AIC);
model_info.put("best_model_AICc", best_model_pre_AICc);
model_info.put("best_model_BIC", best_model_pre_BIC);


// overwrite the checkpoint by the best models
ModelCheckpoint best_model_info;
model_info.getSubCheckpoint(&best_model_info, model_info.getStructName() + "BestOfTheKClass");
model_info.putSubCheckpoint(&best_model_info, "");

best_subst_name = model_str;
if (params.optimize_from_given_params == false)
best_rate_name = best_orig_rate_name;
int n_class = getClassNum(model_str);
if (params.opt_rhas_again) {
if (n_class == 1) {
Expand All @@ -6630,29 +6649,64 @@ void runMixtureFinderMain(Params &params, IQTree* &iqtree, ModelCheckpoint &mode
}

model_str = best_subst_name+best_rate_name;

// force to dump all checkpointing information
model_info.dump(true);
}

// Optimisation of Q-Mixture model, including estimation of best number of classes in the mixture
void runMixtureFinder(Params &params, IQTree* &iqtree, ModelCheckpoint &model_info) {

IQTree* new_iqtree;
string model_str;
Alignment* aln;

if (params.model_name.substr(0,6) != "MIX+MF")
bool mix_finder_mode = (params.model_name == "MIX+MF" || params.model_name == "MIX+MFP" || params.model_name == "MF+MIX" || params.model_name == "MFP+MIX");

if (!mix_finder_mode)
return;

bool test_only = (params.model_name == "MIX+MF");
params.model_name = "";
string orig_model_name = params.model_name;
bool test_only = (params.model_name == "MIX+MF" || params.model_name == "MF+MIX");

if (MPIHelper::getInstance().getNumProcesses() > 1)
outError("Error! The option -m '" + params.model_name + "' does not support MPI parallelization");

if (iqtree->isSuperTree())
outError("Error! The option -m '" + params.model_name + "' cannot work on data set with partitions");
if (iqtree->isSuperTree()) {
SuperAlignment* saln = (SuperAlignment*)iqtree->aln;
if (saln->partitions.size() == 1) {
aln = saln->partitions[0];
model_info.startStruct(aln->name);
} else
outError("Error! The option -m '" + params.model_name + "' cannot work on data set with more than one partition");
} else {
aln = iqtree->aln;
}

if (iqtree->aln->seq_type != SEQ_DNA)
if (aln->seq_type != SEQ_DNA)
outError("Error! The option -m '" + params.model_name + "' can only work on DNA data set");

// create a new IQTree object for this mixture model
// allocate heterotachy tree if neccessary
int pos = posRateHeterotachy(aln->model_name);
if (params.num_mixlen > 1) {
new_iqtree = new PhyloTreeMixlen(aln, params.num_mixlen);
} else if (pos != string::npos) {
new_iqtree = new PhyloTreeMixlen(aln, 0);
} else {
new_iqtree = new IQTree(aln);
}
new_iqtree->setCheckpoint(iqtree->getCheckpoint());
if (!iqtree->constraintTree.empty())
new_iqtree->constraintTree.readConstraint(iqtree->constraintTree);
new_iqtree->removed_seqs = iqtree->removed_seqs;
new_iqtree->twin_seqs = iqtree->twin_seqs;
if (params.start_tree == STT_PLL_PARSIMONY || params.start_tree == STT_RANDOM_TREE || params.pll) {
/* Initialized all data structure for PLL*/
new_iqtree->initializePLL(params);
}
new_iqtree->setParams(&params);

cout << "--------------------------------------------------------------------" << endl;
cout << "| Running MixtureFinder |" << endl;
cout << "--------------------------------------------------------------------" << endl;
Expand All @@ -6664,9 +6718,30 @@ void runMixtureFinder(Params &params, IQTree* &iqtree, ModelCheckpoint &model_in
params.gbo_replicates = 0;
params.consensus_type = CT_NONE;
params.stop_condition = SC_UNSUCCESS_ITERATION;
params.model_name = "";

runMixtureFinderMain(params, iqtree, model_info, model_str);
runMixtureFinderMain(params, new_iqtree, model_info, model_str);

// transfer models parameters
Checkpoint *iqtree_chkpt = iqtree->getCheckpoint();
if (iqtree->isSuperTree()) {
string partmodel_name;
if (params.partition_type == BRLEN_SCALE || params.partition_type == BRLEN_FIX)
partmodel_name = "PartitionModelPlen";
else
partmodel_name = "PartitionModel";
iqtree_chkpt->startStruct(partmodel_name);
}
string structname = model_info.getStructName();
model_info.transferSubCheckpoint(iqtree_chkpt, structname + "Model", true);
model_info.transferSubCheckpoint(iqtree_chkpt, structname + "Rate", true);
model_info.transferSubCheckpoint(iqtree_chkpt, structname + "PhyloTree", true);
model_info.transferSubCheckpoint(iqtree_chkpt, structname + "best_model_", true);
model_info.transferSubCheckpoint(iqtree_chkpt, structname + "best_score_", true);
if (iqtree->isSuperTree()) {
iqtree_chkpt->endStruct();
}

// restore the original values
params.gbo_replicates = orig_gbo_replicates;
params.consensus_type = orig_consensus_type;
Expand All @@ -6676,32 +6751,28 @@ void runMixtureFinder(Params &params, IQTree* &iqtree, ModelCheckpoint &model_in
cout << " Best-fit Q-Mixture model: " << model_str << endl;
cout << "-------------------------------------------------------" << endl;

params.model_name = model_str;
iqtree->aln->model_name = model_str;

// create a new IQTree object for this mixture model
// allocate heterotachy tree if neccessary
int pos = posRateHeterotachy(iqtree->aln->model_name);
if (params.num_mixlen > 1) {
new_iqtree = new PhyloTreeMixlen(iqtree->aln, params.num_mixlen);
} else if (pos != string::npos) {
new_iqtree = new PhyloTreeMixlen(iqtree->aln, 0);
if (!iqtree->isSuperTree()) {
// alignment with no partition
iqtree->readTreeString(new_iqtree->getTreeString());
} else {
new_iqtree = new IQTree(iqtree->aln);
}
new_iqtree->setCheckpoint(iqtree->getCheckpoint());
if (!iqtree->constraintTree.empty())
new_iqtree->constraintTree.readConstraint(iqtree->constraintTree);
new_iqtree->removed_seqs = iqtree->removed_seqs;
new_iqtree->twin_seqs = iqtree->twin_seqs;
if (params.start_tree == STT_PLL_PARSIMONY || params.start_tree == STT_RANDOM_TREE || params.pll) {
/* Initialized all data structure for PLL*/
new_iqtree->initializePLL(params);
// partitioned alignment
((PhyloSuperTree*)iqtree)->at(0)->aln->model_name = model_str;
// ((PhyloSuperTree*)iqtree)->at(0)->readTreeString(new_iqtree->getTreeString());
if (params.partition_type == BRLEN_SCALE || params.partition_type == BRLEN_FIX)
((PhyloSuperTree*)iqtree)->readTreeString(new_iqtree->getTreeString());
else
((PhyloSuperTreeUnlinked*)iqtree)->readTreeString(new_iqtree->getTreeString()+new_iqtree->getTreeString());
model_info.endStruct();

((SuperAlignment*)iqtree->aln)->printBestPartition((string(params.out_prefix) + ".best_scheme.nex").c_str());
((SuperAlignment*)iqtree->aln)->printBestPartitionRaxml((string(params.out_prefix) + ".best_scheme").c_str());
}
new_iqtree->setParams(&params);
new_iqtree->copyPhyloTree(iqtree, false);
delete(iqtree);
iqtree = new_iqtree;

params.model_name = orig_model_name;
iqtree->saveCheckpoint();

delete(new_iqtree);

if (test_only) {
params.min_iterations = 0;
Expand Down
13 changes: 4 additions & 9 deletions model/modelfactory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,6 @@ ModelFactory::ModelFactory(Params &params, string &model_name, PhyloTree *tree,
for (mix_pos = 0; mix_pos < curr_model_str.length(); mix_pos++) {
size_t next_mix_pos = curr_model_str.find_first_of("+*", mix_pos);
string sub_model_str = curr_model_str.substr(mix_pos, next_mix_pos-mix_pos);
// cout << "mix_pos = "<< mix_pos << "; sub_model_str = " << sub_model_str << endl;
nxsmodel = models_block->findMixModel(sub_model_str);
if (nxsmodel) sub_model_str = nxsmodel->description;
new_model_str += sub_model_str;
Expand Down Expand Up @@ -1127,10 +1126,8 @@ bool ModelFactory::initFromNestedModel(map<string, vector<string> > nest_network
*/

for (i = 0; i < nested_models.size(); i++) {
map<string, string>::iterator itr = checkpoint->find(nested_models[i] + rate_name);
ASSERT(itr != checkpoint->end());

string best_model_logl_df = itr->second;
string best_model_logl_df;
ASSERT(checkpoint->getString(nested_models[i] + rate_name, best_model_logl_df));
stringstream ss(best_model_logl_df);
ss >> cur_logl;

Expand Down Expand Up @@ -1179,10 +1176,8 @@ bool ModelFactory::initFromNestedModel(map<string, vector<string> > nest_network

for (i = 0; i < nested_models.size(); i++) {
nested_mix_model = replaceLastQ(model_name, nested_models[i]);
map<string, string>::iterator itr = checkpoint->find(nested_mix_model + rate_name);
ASSERT(itr != checkpoint->end());

string best_model_logl_df = itr->second;
string best_model_logl_df;
ASSERT(checkpoint->getString(nested_mix_model + rate_name, best_model_logl_df));
stringstream ss(best_model_logl_df);
ss >> cur_logl;

Expand Down
3 changes: 1 addition & 2 deletions model/modelfactory.h
Original file line number Diff line number Diff line change
Expand Up @@ -297,9 +297,8 @@ class ModelFactory : public unordered_map<int, double*>, public Optimization, pu

/**
compute the mixture-based log-likelihood for mAIC, mAICc, mBIC calculation.
@param warning the warning message when mixture-based log-likelihood calculation is skipped.
*/
virtual double computeMixLh(string &warning) {return 0.0;}
virtual double computeMarginalLh() {return 0.0;}

protected:

Expand Down
Loading
Loading