From 6df055521c867e8dab423654c43d0ebc31008a94 Mon Sep 17 00:00:00 2001 From: HS6986 Date: Sun, 6 Apr 2025 14:35:17 +0900 Subject: [PATCH 01/17] extend MixtureFinder to codon, binary, multistate, and amino acid data --- main/phyloanalysis.cpp | 6 +- main/phylotesting.cpp | 74 +++++++++++++--- model/CMakeLists.txt | 1 + model/modelfactory.cpp | 3 + model/modelmixture.cpp | 7 +- model/modelmultistate.cpp | 174 ++++++++++++++++++++++++++++++++++++++ model/modelmultistate.h | 83 ++++++++++++++++++ 7 files changed, 332 insertions(+), 16 deletions(-) create mode 100644 model/modelmultistate.cpp create mode 100644 model/modelmultistate.h diff --git a/main/phyloanalysis.cpp b/main/phyloanalysis.cpp index 7fafdd978..6559512d4 100644 --- a/main/phyloanalysis.cpp +++ b/main/phyloanalysis.cpp @@ -48,6 +48,7 @@ //#include "modeltest_wrapper.h" #include "model/modelprotein.h" #include "model/modelbin.h" +#include "model/modelmultistate.h" #include "model/modelcodon.h" #include "utils/stoprule.h" @@ -249,6 +250,7 @@ void reportAlignment(ofstream &out, Alignment &alignment, int nremoved_seqs) { << alignment.getNSite() << " "; switch (alignment.seq_type) { case SEQ_BINARY: out << "binary"; break; + case SEQ_MULTISTATE: out << "multistate"; break; case SEQ_DNA: out << "nucleotide"; break; case SEQ_PROTEIN: out << "amino-acid"; break; case SEQ_CODON: out << "codon"; break; @@ -2064,7 +2066,7 @@ void exportAliSimCMD(Params ¶ms, IQTree &tree, ostream &out) { // make sure this method will not make IQTREE crashed if (!(params.aln_file || params.partition_file) || !params.out_prefix || !tree.aln || !tree.getModel() - || !(tree.aln->seq_type == SEQ_DNA || tree.aln->seq_type == SEQ_CODON || tree.aln->seq_type == SEQ_PROTEIN || tree.aln->seq_type == SEQ_BINARY || tree.aln->seq_type == SEQ_MORPH) + || !(tree.aln->seq_type == SEQ_DNA || tree.aln->seq_type == SEQ_CODON || tree.aln->seq_type == SEQ_PROTEIN || tree.aln->seq_type == SEQ_BINARY || tree.aln->seq_type == SEQ_MULTISTATE || tree.aln->seq_type == SEQ_MORPH) || tree.isTreeMix()) return; @@ -2076,7 +2078,7 @@ void exportAliSimCMD(Params ¶ms, IQTree &tree, ostream &out) // skip unsupported models if (tree.getModel()->isMixture() || tree.getRate()->isHeterotachy() || tree.getModel()->isLieMarkov() || tree.aln->seq_type == SEQ_CODON) { - out << "Currently, we only support exporting AliSim commands automatically from the analysis for common models of DNA, Protein, Binary, and Morphological data. To simulate data from other models (mixture, lie-markov, etc), please refer to the User Manual of AliSim. Thanks!" << endl << endl; + out << "Currently, we only support exporting AliSim commands automatically from the analysis for common models of DNA, Protein, Binary, Multistate, and Morphological data. To simulate data from other models (mixture, lie-markov, etc), please refer to the User Manual of AliSim. Thanks!" << endl << endl; out << more_info << endl << endl; return; } diff --git a/main/phylotesting.cpp b/main/phylotesting.cpp index f5720950c..78d736c52 100644 --- a/main/phylotesting.cpp +++ b/main/phylotesting.cpp @@ -29,6 +29,7 @@ //#include "modeltest_wrapper.h" #include "model/modelprotein.h" #include "model/modelbin.h" +#include "model/modelmultistate.h" #include "model/modelcodon.h" #include "model/modelmorphology.h" #include "model/modelmixture.h" @@ -61,6 +62,13 @@ const char* bin_model_names[] = {"GTR2", "JC2"}; +/******* Multistate model set ******/ +const char* multistate_model_names[] = {"GTRX", "MK"}; + +/* Multistate frequency set */ +const char* multistate_freq_names[] = {"FQ", "FO"}; + + /******* Morphological model set ******/ // 2018-08-20: don't test ORDERED model due to lots of numerical issues //const char* morph_model_names[] = {"MK", "ORDERED"}; @@ -206,6 +214,7 @@ const char* aa_usual_model = "LG"; const char* aa_usual_nonrev_model = "NQ.pfam"; const char* codon_usual_model = "GY+F3X4"; const char* bin_usual_model = "GTR2"; +const char* multistate_usual_model = "GTRX"; const char* morph_usual_model = "MK"; const char* pomo_usual_model = "GTR+P"; @@ -243,6 +252,7 @@ string getUsualModelSubst(SeqType seq_type) { return aa_usual_model; case SEQ_CODON: return codon_usual_model; case SEQ_BINARY: return bin_usual_model; + case SEQ_MULTISTATE: return multistate_usual_model; case SEQ_MORPH: return morph_usual_model; case SEQ_POMO: return pomo_usual_model; default: ASSERT(0 && "Unprocessed seq_type"); return ""; @@ -488,6 +498,9 @@ int detectSeqType(const char *model_name, SeqType &seq_type) { seq_type = SEQ_MORPH; break; } + if (model_str == "GTRX") { + seq_type = SEQ_MULTISTATE; + } copyCString(dna_model_names, sizeof(dna_model_names)/sizeof(char*), model_list, true); for (i = 0; i < model_list.size(); i++) if (model_str == model_list[i]) { @@ -566,6 +579,7 @@ string convertSeqTypeToSeqTypeName(SeqType seq_type) { switch (seq_type) { case SEQ_BINARY: return "BIN"; break; + case SEQ_MULTISTATE: return "MULTISTATE"; break; case SEQ_MORPH: return "MORPH"; break; case SEQ_DNA: return "DNA"; break; case SEQ_PROTEIN: return "AA"; break; @@ -943,6 +957,16 @@ void getModelSubst(SeqType seq_type, bool standard_code, string model_name, } else { convert_string_vec(model_set.c_str(), model_names); } + } else if (seq_type == SEQ_MULTISTATE) { + if (model_set.empty()) { + copyCString(multistate_model_names, sizeof(multistate_model_names) / sizeof(char*), model_names); + } else if (model_set[0] == '+') { + // append model_set into existing models + convert_string_vec(model_set.c_str()+1, model_names); + appendCString(multistate_model_names, sizeof(multistate_model_names) / sizeof(char*), model_names); + } else { + convert_string_vec(model_set.c_str(), model_names); + } } else if (seq_type == SEQ_MORPH) { if (model_set.empty()) { copyCString(morph_model_names, sizeof(morph_model_names) / sizeof(char*), model_names); @@ -1148,6 +1172,9 @@ void getStateFreqs(SeqType seq_type, char *state_freq_set, StrVector &freq_names } if (freq_names.empty()) { switch (seq_type) { + case SEQ_MULTISTATE: + copyCString(multistate_freq_names, sizeof(multistate_freq_names)/sizeof(char*), freq_names); + break; case SEQ_DNA: copyCString(dna_freq_names, sizeof(dna_freq_names)/sizeof(char*), freq_names); break; @@ -1214,13 +1241,13 @@ void getRateHet(SeqType seq_type, string model_name, double frac_invariant_sites if (with_new && rate_set != "1") { if (with_asc) test_options = test_options_asc_new; - else if (seq_type == SEQ_DNA || seq_type == SEQ_BINARY || seq_type == SEQ_MORPH) + else if (seq_type == SEQ_DNA || seq_type == SEQ_BINARY || seq_type == SEQ_MULTISTATE || seq_type == SEQ_MORPH) test_options = test_options_morph_new; else test_options = test_options_noASC_I_new; } else if (with_asc) test_options = test_options_asc; - else if (seq_type == SEQ_DNA || seq_type == SEQ_BINARY || seq_type == SEQ_MORPH) { + else if (seq_type == SEQ_DNA || seq_type == SEQ_BINARY || seq_type == SEQ_MULTISTATE || seq_type == SEQ_MORPH) { if (rate_set == "1") test_options = test_options_morph_fast; else @@ -6347,7 +6374,15 @@ CandidateModel runModelSelection(Params ¶ms, IQTree &iqtree, ModelCheckpoint } skip_all_when_drop = true; } else if (action == 3) { - char init_state_freq_set[] = "FO"; + char freq_set_multistate_protein[] = "FQ,FO"; + char freq_set_codon[] = "FQ,F,F1X4,F3X4"; + char freq_set_default[] = "FO"; + char* init_state_freq_set = + (iqtree.aln->seq_type == SEQ_MULTISTATE || iqtree.aln->seq_type == SEQ_PROTEIN) + ? freq_set_multistate_protein + : (iqtree.aln->seq_type == SEQ_CODON) + ? freq_set_codon + : freq_set_default; if (!params.state_freq_set) { params.state_freq_set = init_state_freq_set; } @@ -6513,7 +6548,15 @@ void optimiseQMixModel_method_update(Params ¶ms, IQTree* &iqtree, ModelCheck double LR, df_diff, pvalue; string criteria_str; - char init_state_freq_set[] = "FO"; + char freq_set_multistate_protein[] = "FQ,FO"; + char freq_set_codon[] = "FQ,F,F1X4,F3X4"; + char freq_set_default[] = "FO"; + char* init_state_freq_set = + (iqtree->aln->seq_type == SEQ_MULTISTATE || iqtree->aln->seq_type == SEQ_PROTEIN) + ? freq_set_multistate_protein + : (iqtree->aln->seq_type == SEQ_CODON) + ? freq_set_codon + : freq_set_default; if (!params.state_freq_set) { params.state_freq_set = init_state_freq_set; } @@ -6522,16 +6565,16 @@ void optimiseQMixModel_method_update(Params ¶ms, IQTree* &iqtree, ModelCheck ssize = iqtree->getAlnNSite(); criteria_str = criterionName(params.model_test_criterion); - // Step 0: (reorder candidate DNA models when -mset is used) build the nest-relationship network + // Step 0: (reorder candidate models when -mset is used) build the nest-relationship network map > nest_network; - if (iqtree->aln->seq_type == SEQ_DNA) { + //if (iqtree->aln->seq_type == SEQ_DNA) { StrVector model_names, freq_names; getModelSubst(iqtree->aln->seq_type, iqtree->aln->isStandardGeneticCode(), params.model_name, params.model_set, params.model_subset, model_names); getStateFreqs(iqtree->aln->seq_type, params.state_freq_set, freq_names); nest_network = generateNestNetwork(model_names, freq_names); - } + //} // Step 1: run ModelFinder params.model_name = ""; @@ -6638,8 +6681,8 @@ void optimiseQMixModel(Params ¶ms, IQTree* &iqtree, ModelCheckpoint &model_i if (iqtree->isSuperTree()) outError("Error! The option -m '" + params.model_name + "' cannot work on data set with partitions"); - if (iqtree->aln->seq_type != SEQ_DNA) - outError("Error! The option -m '" + params.model_name + "' can only work on DNA data set"); + if (!(iqtree->aln->seq_type == SEQ_DNA || iqtree->aln->seq_type == SEQ_CODON|| iqtree->aln->seq_type == SEQ_BINARY || iqtree->aln->seq_type == SEQ_MULTISTATE || iqtree->aln->seq_type == SEQ_MORPH || iqtree->aln->seq_type == SEQ_PROTEIN)) + outError("Error! The option -m '" + params.model_name + "' can only work on DNA, codon, binary, multistate, and amino acid data set"); cout << "--------------------------------------------------------------------" << endl; cout << "| Optimizing Q-mixture model |" << endl; @@ -6652,6 +6695,11 @@ void optimiseQMixModel(Params ¶ms, IQTree* &iqtree, ModelCheckpoint &model_i params.gbo_replicates = 0; params.consensus_type = CT_NONE; params.stop_condition = SC_UNSUCCESS_ITERATION; + + if (iqtree->aln->seq_type == SEQ_MULTISTATE || iqtree->aln->seq_type == SEQ_MORPH) { + cout << endl << "MixtureFinder will also test models with unequal rates and/or frequencies. Make sure your data are multistate." << endl; + iqtree->aln->seq_type = SEQ_MULTISTATE; + } optimiseQMixModel_method_update(params, iqtree, model_info, model_str); @@ -6755,15 +6803,15 @@ void reorderModelNames(StrVector& model_names, const char* model_set[], size_t s } bool isRateTypeNested(string rate_type1, string rate_type2) { - if (rate_type1.length() != 6) { + /*if (rate_type1.length() != 6) { outError("Incorrect DNA model rate type code: " + rate_type1); } if (rate_type2.length() != 6) { outError("Incorrect DNA model rate type code: " + rate_type2); - } + }*/ - for (int i = 0; i < 5; i++) { - for (int j = i; j < 6; j++ ){ + for (int i = 0; i < rate_type1.length()-1; i++) { + for (int j = i; j < rate_type1.length(); j++ ){ if (rate_type1[i] == rate_type1[j] && rate_type2[i] != rate_type2[j]){ return false; } diff --git a/model/CMakeLists.txt b/model/CMakeLists.txt index d1284e50b..e99a9fa05 100644 --- a/model/CMakeLists.txt +++ b/model/CMakeLists.txt @@ -1,6 +1,7 @@ add_library(model modelmarkov.cpp modelmarkov.h modelbin.cpp modelbin.h +modelmultistate.cpp modelmultistate.h modeldna.cpp modeldna.h modeldnaerror.cpp modeldnaerror.h modelfactory.cpp modelfactory.h diff --git a/model/modelfactory.cpp b/model/modelfactory.cpp index 36ce37af9..35968a281 100644 --- a/model/modelfactory.cpp +++ b/model/modelfactory.cpp @@ -26,6 +26,7 @@ #include "modeldna.h" #include "modelprotein.h" #include "modelbin.h" +#include "modelmultistate.h" #include "modelcodon.h" #include "modelmorphology.h" #include "modelpomo.h" @@ -167,6 +168,7 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, if (tree->aln->seq_type == SEQ_DNA) model_str = "HKY"; else if (tree->aln->seq_type == SEQ_PROTEIN) model_str = "LG"; else if (tree->aln->seq_type == SEQ_BINARY) model_str = "GTR2"; + else if (tree->aln->seq_type == SEQ_MULTISTATE) model_str = "GTRX"; else if (tree->aln->seq_type == SEQ_CODON) model_str = "GY"; else if (tree->aln->seq_type == SEQ_MORPH) model_str = "MK"; else if (tree->aln->seq_type == SEQ_POMO) model_str = "HKY+P"; @@ -436,6 +438,7 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, if (model_str.substr(0, 3) != "MIX" && freq_type == FREQ_UNKNOWN) { switch (tree->aln->seq_type) { case SEQ_BINARY: freq_type = FREQ_ESTIMATE; break; // default for binary: optimized frequencies + case SEQ_MULTISTATE: freq_type = FREQ_ESTIMATE; break; // default for multistate data: optimized frequencies case SEQ_PROTEIN: break; // let ModelProtein decide by itself case SEQ_MORPH: freq_type = FREQ_EQUAL; break; case SEQ_CODON: freq_type = FREQ_UNKNOWN; break; diff --git a/model/modelmixture.cpp b/model/modelmixture.cpp index 81d2bf12a..1d8a05650 100644 --- a/model/modelmixture.cpp +++ b/model/modelmixture.cpp @@ -10,6 +10,7 @@ #include "modeldnaerror.h" #include "modelprotein.h" #include "modelbin.h" +#include "modelmultistate.h" #include "modelcodon.h" #include "modelmorphology.h" #include "modelset.h" @@ -1138,6 +1139,7 @@ ModelSubst* createModel(string model_str, ModelsBlock *models_block, if ((model_str == "JC" && tree->aln->seq_type == SEQ_DNA) || (model_str == "POISSON" && tree->aln->seq_type == SEQ_PROTEIN) || (model_str == "JC2" && tree->aln->seq_type == SEQ_BINARY) || + (model_str == "MK" && tree->aln->seq_type == SEQ_MULTISTATE) || (model_str == "JCC" && tree->aln->seq_type == SEQ_CODON) || (model_str == "MK" && tree->aln->seq_type == SEQ_MORPH)) { @@ -1153,6 +1155,7 @@ ModelSubst* createModel(string model_str, ModelsBlock *models_block, cout << "PoMo mixture model for Gamma rate heterogeneity." << endl; // else if ((model_str == "GTR" && tree->aln->seq_type == SEQ_DNA) || // (model_str == "GTR2" && tree->aln->seq_type == SEQ_BINARY) || +// ((model_str == "GTR" || model_str == "GTRX") && model_str == SEQ_MULTISTATE) || // (model_str == "GTR20" && tree->aln->seq_type == SEQ_PROTEIN)) { // model = new ModelGTR(tree, count_rates); // if (freq_params != "") @@ -1165,7 +1168,9 @@ ModelSubst* createModel(string model_str, ModelsBlock *models_block, model = ModelMarkov::getModelByName(model_str, tree, model_params, freq_type, freq_params); } else if (tree->aln->seq_type == SEQ_BINARY) { model = new ModelBIN(model_str.c_str(), model_params, freq_type, freq_params, tree); - } else if (tree->aln->seq_type == SEQ_DNA) { + } else if (tree->aln->seq_type == SEQ_MULTISTATE) { + model = new ModelMultistate(model_str.c_str(), model_params, freq_type, freq_params, tree); + } else if (tree->aln->seq_type == SEQ_DNA) { if (seqerr.empty()) model = new ModelDNA(model_str.c_str(), model_params, freq_type, freq_params, tree); else diff --git a/model/modelmultistate.cpp b/model/modelmultistate.cpp new file mode 100644 index 000000000..4896417b9 --- /dev/null +++ b/model/modelmultistate.cpp @@ -0,0 +1,174 @@ +/* + * modelmultistate.cpp + * + * Created on: Apr 6, 2025 + * Author: Hiroaki Sato + * Original author: minh + */ + +#include "modelmultistate.h" + +ModelMultistate::ModelMultistate(const char *model_name, string model_params, StateFreqType freq, string freq_params, PhyloTree *tree) +: ModelMarkov(tree) +{ + init(model_name, model_params, freq, freq_params); +} + +void ModelMultistate::init(const char *model_name, string model_params, StateFreqType freq, string freq_params) +{ + name = model_name; + full_name = model_name; + if (name == "MK") { + // all were initialized + num_params = 0; + } else if (name == "ORDERED") { + int i, j, k = 0; + // only allow for substitution from state i to state i+1 and back. + for (i = 0; i < num_states-1; i++) { + rates[k++] = 1.0; + for (j = i+2; j < num_states; j++, k++) + rates[k] = 0.0; + } + num_params = 0; + } else if (name == "GTR" || name == "GTRX") { + outWarning("GTRX multistate model will estimate " + convertIntToString(getNumRateEntries()-1) + " substitution rates that might be overfitting!"); + outWarning("Please only use GTRX with very large data and always test for model fit!"); + name = "GTRX"; + } else { + // if name does not match, read the user-defined model + readParameters(model_name); + num_params = 0; + freq = FREQ_USER_DEFINED; + } + + // parse user-specified state frequencies (if any) + if (freq_params != "") + { + freq_type = FREQ_USER_DEFINED; + readStateFreq(freq_params); + } + + ModelMarkov::init(freq); +} + +void ModelMultistate::readRates(istream &in) noexcept(false) { + int nrates = getNumRateEntries(); + int row = 1, col = 0; + // since states for protein is stored in lower-triangle, special treatment is needed + for (int i = 0; i < nrates; i++, col++) { + if (col == row) { + row++; col = 0; + } + // switch col and row + int id = col*(2*num_states-col-1)/2 + (row-col-1); + if (id >= nrates) { + cout << row << " " << col << endl; + } + assert(id < nrates && id >= 0); // make sure that the conversion is correct + + string tmp_value; + in >> tmp_value; + if (tmp_value.length() == 0) + throw name+string(": Rate entries could not be read"); + rates[id] = convert_double_with_distribution(tmp_value.c_str(), true); + + if (rates[id] < 0.0) + throw "Negative rates found"; + } +} + +int ModelMultistate::getNDim() { + int ndim = num_params; + if (freq_type == FREQ_ESTIMATE) + ndim += num_states-1; + return ndim; +} + +ModelMultistate::~ModelMultistate() { +} + +void ModelMultistate::startCheckpoint() { + checkpoint->startStruct("ModelMorph"); +} + +void ModelMultistate::saveCheckpoint() { + startCheckpoint(); + if (num_params > 0) + CKP_ARRAY_SAVE(getNumRateEntries(), rates); + endCheckpoint(); + ModelMarkov::saveCheckpoint(); +} + +void ModelMultistate::restoreCheckpoint() { + ModelMarkov::restoreCheckpoint(); + startCheckpoint(); + if (num_params > 0) + CKP_ARRAY_RESTORE(getNumRateEntries(), rates); + endCheckpoint(); + decomposeRateMatrix(); + if (phylo_tree) + phylo_tree->clearAllPartialLH(); +} + +/** + * @return model name + */ +string ModelMultistate::getName() { + size_t pos_plus = name.find('+'); + if (pos_plus != string::npos) { + return name; + } else { + return ModelMarkov::getName(); + } +} + +string ModelMultistate::getNameParams(bool show_fixed_params) { + if (num_params == 0) return name; + ostringstream retname; + size_t pos_plus = name.find('+'); + if (pos_plus != string::npos) { + retname << name.substr(0, pos_plus) << '{'; + } else { + retname << name << '{'; + } + int nrates = getNumRateEntries(); + for (int i = 0; i < nrates; i++) { + if (i>0) retname << ','; + retname << rates[i]; + } + retname << '}'; + if (pos_plus != string::npos) { + retname << name.substr(pos_plus); + } else { + getNameParamsFreq(retname); + } + return retname.str(); +} + +void ModelMultistate::writeParameters(ostream &out) { + int i; + if (freq_type == FREQ_ESTIMATE) { + for (i = 0; i < num_states; i++) + out << "\t" << state_freq[i]; + } + if (num_params == 0) return; + int nrateout = getNumRateEntries() - 1; + for (i = 0; i < nrateout; i++) + out << "\t" << rates[i]; +} + +void ModelMultistate::writeInfo(ostream &out) { + if (num_params > 0) { + out << "Rate parameters:"; + int nrate = getNumRateEntries(); + for (int i = 0; i < nrate; i++) + out << " " << rates[i]; + out << endl; + } + if (freq_type != FREQ_EQUAL) { + out << "State frequencies:"; + for (int i = 0; i < num_states; i++) + out << " " << state_freq[i]; + out << endl; + } +} diff --git a/model/modelmultistate.h b/model/modelmultistate.h new file mode 100644 index 000000000..ce506ed84 --- /dev/null +++ b/model/modelmultistate.h @@ -0,0 +1,83 @@ +/* + * modelmultistate.h + * + * Created on: Apr 6, 2025 + * Author: Hiroaki Sato + * Original author: minh + */ + +#ifndef MODELMULTISTATE_H_ +#define MODELMULTISTATE_H_ + +#include "modelmarkov.h" + +class ModelMultistate: public ModelMarkov { +public: + /** + constructor + @param model_name model name, e.g., JC, HKY. + @param freq state frequency type + @param tree associated phylogenetic tree + */ + ModelMultistate(const char *model_name, string model_params, StateFreqType freq, string freq_params, PhyloTree *tree); + + + /** + initialization, called automatically by the constructor, no need to call it + @param model_name model name, e.g., JC, HKY. + @param freq state frequency type + */ + virtual void init(const char *model_name, string model_params, StateFreqType freq, string freq_params); + + /** + return the number of dimensions + */ + virtual int getNDim(); + + /** + start structure for checkpointing + */ + virtual void startCheckpoint(); + + /** + save object into the checkpoint + */ + virtual void saveCheckpoint(); + + /** + restore object from the checkpoint + */ + virtual void restoreCheckpoint(); + + /** + * @return model name + */ + virtual string getName(); + + /** + * @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f} + */ + virtual string getNameParams(bool show_fixed_params = false); + + /** + write information + @param out output stream + */ + virtual void writeInfo(ostream &out); + + /** + write parameters, used with modeltest + @param out output stream + */ + virtual void writeParameters(ostream &out); + + /** + read the rates from an input stream. it will throw error messages if failed + @param in input stream + */ + virtual void readRates(istream &in) noexcept(false); + + virtual ~ModelMultistate(); +}; + +#endif /* MODELMUTISTATE_H_ */ From d3e3e59c900e37a174934c85e57495dd5f679d1d Mon Sep 17 00:00:00 2001 From: HS6986 Date: Tue, 8 Apr 2025 23:45:16 +0900 Subject: [PATCH 02/17] fix frequency types --- main/phylotesting.cpp | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/main/phylotesting.cpp b/main/phylotesting.cpp index 78d736c52..a6d597b4c 100644 --- a/main/phylotesting.cpp +++ b/main/phylotesting.cpp @@ -6374,15 +6374,18 @@ CandidateModel runModelSelection(Params ¶ms, IQTree &iqtree, ModelCheckpoint } skip_all_when_drop = true; } else if (action == 3) { - char freq_set_multistate_protein[] = "FQ,FO"; - char freq_set_codon[] = "FQ,F,F1X4,F3X4"; - char freq_set_default[] = "FO"; - char* init_state_freq_set = - (iqtree.aln->seq_type == SEQ_MULTISTATE || iqtree.aln->seq_type == SEQ_PROTEIN) - ? freq_set_multistate_protein - : (iqtree.aln->seq_type == SEQ_CODON) - ? freq_set_codon - : freq_set_default; + char freq_set_codon[] = ",F1X4,F3X4"; + char freq_set_multistate[] = "FQ,FO"; + char freq_set_protein[] = ",FO"; + char freq_set_default[] = "FO"; + char* init_state_freq_set = + (iqtree.aln->seq_type == SEQ_CODON) + ? freq_set_codon + : (iqtree.aln->seq_type == SEQ_MULTISTATE) + ? freq_set_multistate + : (iqtree.aln->seq_type == SEQ_PROTEIN) + ? freq_set_protein + : freq_set_default; if (!params.state_freq_set) { params.state_freq_set = init_state_freq_set; } @@ -6548,15 +6551,18 @@ void optimiseQMixModel_method_update(Params ¶ms, IQTree* &iqtree, ModelCheck double LR, df_diff, pvalue; string criteria_str; - char freq_set_multistate_protein[] = "FQ,FO"; - char freq_set_codon[] = "FQ,F,F1X4,F3X4"; + char freq_set_codon[] = ",F1X4,F3X4"; + char freq_set_multistate[] = "FQ,FO"; + char freq_set_protein[] = ",FO"; char freq_set_default[] = "FO"; char* init_state_freq_set = - (iqtree->aln->seq_type == SEQ_MULTISTATE || iqtree->aln->seq_type == SEQ_PROTEIN) - ? freq_set_multistate_protein - : (iqtree->aln->seq_type == SEQ_CODON) - ? freq_set_codon - : freq_set_default; + (iqtree->aln->seq_type == SEQ_CODON) + ? freq_set_codon + : (iqtree->aln->seq_type == SEQ_MULTISTATE) + ? freq_set_multistate + : (iqtree->aln->seq_type == SEQ_PROTEIN) + ? freq_set_protein + : freq_set_default; if (!params.state_freq_set) { params.state_freq_set = init_state_freq_set; } From 3a4de2be77531869f71c2d796464224638299474 Mon Sep 17 00:00:00 2001 From: HS6986 Date: Tue, 15 Apr 2025 13:31:17 +0900 Subject: [PATCH 03/17] allow FU in mixture models --- main/phylotesting.cpp | 2 +- model/modelmixture.cpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/main/phylotesting.cpp b/main/phylotesting.cpp index a6d597b4c..4746f1570 100644 --- a/main/phylotesting.cpp +++ b/main/phylotesting.cpp @@ -6703,7 +6703,7 @@ void optimiseQMixModel(Params ¶ms, IQTree* &iqtree, ModelCheckpoint &model_i params.stop_condition = SC_UNSUCCESS_ITERATION; if (iqtree->aln->seq_type == SEQ_MULTISTATE || iqtree->aln->seq_type == SEQ_MORPH) { - cout << endl << "MixtureFinder will also test models with unequal rates and/or frequencies. Make sure your data are multistate." << endl; + cout << endl << "MixtureFinder will also test models with unequal rates and/or frequencies" << endl; iqtree->aln->seq_type = SEQ_MULTISTATE; } diff --git a/model/modelmixture.cpp b/model/modelmixture.cpp index 1d8a05650..4dec0a102 100644 --- a/model/modelmixture.cpp +++ b/model/modelmixture.cpp @@ -1408,6 +1408,8 @@ void ModelMixture::initMixture(string orig_model_name, string model_name, string } else { outError("The user defined frequency model is incorrect"); } + } else if (fstr == "+FU" || fstr == "+Fu") { + model_freq = FREQ_USER_DEFINED; } else if (fstr == "+FO" || fstr == "+Fo") { model_freq = FREQ_ESTIMATE; } else if (fstr == "+FQ" || fstr == "+Fq") { From 39c8a328b8281f86c56847ac892f57ae24c899bc Mon Sep 17 00:00:00 2001 From: HS6986 Date: Tue, 29 Apr 2025 00:38:09 +0900 Subject: [PATCH 04/17] delete the class ModelMultistate --- main/phyloanalysis.cpp | 6 +- main/phylotesting.cpp | 54 +++--------- model/CMakeLists.txt | 1 - model/modelfactory.cpp | 3 - model/modelmixture.cpp | 7 +- model/modelmultistate.cpp | 174 -------------------------------------- model/modelmultistate.h | 83 ------------------ 7 files changed, 15 insertions(+), 313 deletions(-) delete mode 100644 model/modelmultistate.cpp delete mode 100644 model/modelmultistate.h diff --git a/main/phyloanalysis.cpp b/main/phyloanalysis.cpp index 6559512d4..7fafdd978 100644 --- a/main/phyloanalysis.cpp +++ b/main/phyloanalysis.cpp @@ -48,7 +48,6 @@ //#include "modeltest_wrapper.h" #include "model/modelprotein.h" #include "model/modelbin.h" -#include "model/modelmultistate.h" #include "model/modelcodon.h" #include "utils/stoprule.h" @@ -250,7 +249,6 @@ void reportAlignment(ofstream &out, Alignment &alignment, int nremoved_seqs) { << alignment.getNSite() << " "; switch (alignment.seq_type) { case SEQ_BINARY: out << "binary"; break; - case SEQ_MULTISTATE: out << "multistate"; break; case SEQ_DNA: out << "nucleotide"; break; case SEQ_PROTEIN: out << "amino-acid"; break; case SEQ_CODON: out << "codon"; break; @@ -2066,7 +2064,7 @@ void exportAliSimCMD(Params ¶ms, IQTree &tree, ostream &out) { // make sure this method will not make IQTREE crashed if (!(params.aln_file || params.partition_file) || !params.out_prefix || !tree.aln || !tree.getModel() - || !(tree.aln->seq_type == SEQ_DNA || tree.aln->seq_type == SEQ_CODON || tree.aln->seq_type == SEQ_PROTEIN || tree.aln->seq_type == SEQ_BINARY || tree.aln->seq_type == SEQ_MULTISTATE || tree.aln->seq_type == SEQ_MORPH) + || !(tree.aln->seq_type == SEQ_DNA || tree.aln->seq_type == SEQ_CODON || tree.aln->seq_type == SEQ_PROTEIN || tree.aln->seq_type == SEQ_BINARY || tree.aln->seq_type == SEQ_MORPH) || tree.isTreeMix()) return; @@ -2078,7 +2076,7 @@ void exportAliSimCMD(Params ¶ms, IQTree &tree, ostream &out) // skip unsupported models if (tree.getModel()->isMixture() || tree.getRate()->isHeterotachy() || tree.getModel()->isLieMarkov() || tree.aln->seq_type == SEQ_CODON) { - out << "Currently, we only support exporting AliSim commands automatically from the analysis for common models of DNA, Protein, Binary, Multistate, and Morphological data. To simulate data from other models (mixture, lie-markov, etc), please refer to the User Manual of AliSim. Thanks!" << endl << endl; + out << "Currently, we only support exporting AliSim commands automatically from the analysis for common models of DNA, Protein, Binary, and Morphological data. To simulate data from other models (mixture, lie-markov, etc), please refer to the User Manual of AliSim. Thanks!" << endl << endl; out << more_info << endl << endl; return; } diff --git a/main/phylotesting.cpp b/main/phylotesting.cpp index 4746f1570..4152e7a37 100644 --- a/main/phylotesting.cpp +++ b/main/phylotesting.cpp @@ -29,7 +29,6 @@ //#include "modeltest_wrapper.h" #include "model/modelprotein.h" #include "model/modelbin.h" -#include "model/modelmultistate.h" #include "model/modelcodon.h" #include "model/modelmorphology.h" #include "model/modelmixture.h" @@ -62,13 +61,6 @@ const char* bin_model_names[] = {"GTR2", "JC2"}; -/******* Multistate model set ******/ -const char* multistate_model_names[] = {"GTRX", "MK"}; - -/* Multistate frequency set */ -const char* multistate_freq_names[] = {"FQ", "FO"}; - - /******* Morphological model set ******/ // 2018-08-20: don't test ORDERED model due to lots of numerical issues //const char* morph_model_names[] = {"MK", "ORDERED"}; @@ -214,7 +206,6 @@ const char* aa_usual_model = "LG"; const char* aa_usual_nonrev_model = "NQ.pfam"; const char* codon_usual_model = "GY+F3X4"; const char* bin_usual_model = "GTR2"; -const char* multistate_usual_model = "GTRX"; const char* morph_usual_model = "MK"; const char* pomo_usual_model = "GTR+P"; @@ -252,7 +243,6 @@ string getUsualModelSubst(SeqType seq_type) { return aa_usual_model; case SEQ_CODON: return codon_usual_model; case SEQ_BINARY: return bin_usual_model; - case SEQ_MULTISTATE: return multistate_usual_model; case SEQ_MORPH: return morph_usual_model; case SEQ_POMO: return pomo_usual_model; default: ASSERT(0 && "Unprocessed seq_type"); return ""; @@ -498,9 +488,6 @@ int detectSeqType(const char *model_name, SeqType &seq_type) { seq_type = SEQ_MORPH; break; } - if (model_str == "GTRX") { - seq_type = SEQ_MULTISTATE; - } copyCString(dna_model_names, sizeof(dna_model_names)/sizeof(char*), model_list, true); for (i = 0; i < model_list.size(); i++) if (model_str == model_list[i]) { @@ -579,7 +566,6 @@ string convertSeqTypeToSeqTypeName(SeqType seq_type) { switch (seq_type) { case SEQ_BINARY: return "BIN"; break; - case SEQ_MULTISTATE: return "MULTISTATE"; break; case SEQ_MORPH: return "MORPH"; break; case SEQ_DNA: return "DNA"; break; case SEQ_PROTEIN: return "AA"; break; @@ -957,16 +943,6 @@ void getModelSubst(SeqType seq_type, bool standard_code, string model_name, } else { convert_string_vec(model_set.c_str(), model_names); } - } else if (seq_type == SEQ_MULTISTATE) { - if (model_set.empty()) { - copyCString(multistate_model_names, sizeof(multistate_model_names) / sizeof(char*), model_names); - } else if (model_set[0] == '+') { - // append model_set into existing models - convert_string_vec(model_set.c_str()+1, model_names); - appendCString(multistate_model_names, sizeof(multistate_model_names) / sizeof(char*), model_names); - } else { - convert_string_vec(model_set.c_str(), model_names); - } } else if (seq_type == SEQ_MORPH) { if (model_set.empty()) { copyCString(morph_model_names, sizeof(morph_model_names) / sizeof(char*), model_names); @@ -1172,9 +1148,6 @@ void getStateFreqs(SeqType seq_type, char *state_freq_set, StrVector &freq_names } if (freq_names.empty()) { switch (seq_type) { - case SEQ_MULTISTATE: - copyCString(multistate_freq_names, sizeof(multistate_freq_names)/sizeof(char*), freq_names); - break; case SEQ_DNA: copyCString(dna_freq_names, sizeof(dna_freq_names)/sizeof(char*), freq_names); break; @@ -1241,13 +1214,13 @@ void getRateHet(SeqType seq_type, string model_name, double frac_invariant_sites if (with_new && rate_set != "1") { if (with_asc) test_options = test_options_asc_new; - else if (seq_type == SEQ_DNA || seq_type == SEQ_BINARY || seq_type == SEQ_MULTISTATE || seq_type == SEQ_MORPH) + else if (seq_type == SEQ_DNA || seq_type == SEQ_BINARY || seq_type == SEQ_MORPH) test_options = test_options_morph_new; else test_options = test_options_noASC_I_new; } else if (with_asc) test_options = test_options_asc; - else if (seq_type == SEQ_DNA || seq_type == SEQ_BINARY || seq_type == SEQ_MULTISTATE || seq_type == SEQ_MORPH) { + else if (seq_type == SEQ_DNA || seq_type == SEQ_BINARY || seq_type == SEQ_MORPH) { if (rate_set == "1") test_options = test_options_morph_fast; else @@ -6375,13 +6348,13 @@ CandidateModel runModelSelection(Params ¶ms, IQTree &iqtree, ModelCheckpoint skip_all_when_drop = true; } else if (action == 3) { char freq_set_codon[] = ",F1X4,F3X4"; - char freq_set_multistate[] = "FQ,FO"; + char freq_set_multistate[] = "FQ"; char freq_set_protein[] = ",FO"; char freq_set_default[] = "FO"; char* init_state_freq_set = (iqtree.aln->seq_type == SEQ_CODON) ? freq_set_codon - : (iqtree.aln->seq_type == SEQ_MULTISTATE) + : (iqtree.aln->seq_type == SEQ_MORPH) ? freq_set_multistate : (iqtree.aln->seq_type == SEQ_PROTEIN) ? freq_set_protein @@ -6552,13 +6525,13 @@ void optimiseQMixModel_method_update(Params ¶ms, IQTree* &iqtree, ModelCheck string criteria_str; char freq_set_codon[] = ",F1X4,F3X4"; - char freq_set_multistate[] = "FQ,FO"; + char freq_set_multistate[] = "FQ"; char freq_set_protein[] = ",FO"; char freq_set_default[] = "FO"; char* init_state_freq_set = (iqtree->aln->seq_type == SEQ_CODON) ? freq_set_codon - : (iqtree->aln->seq_type == SEQ_MULTISTATE) + : (iqtree->aln->seq_type == SEQ_MORPH) ? freq_set_multistate : (iqtree->aln->seq_type == SEQ_PROTEIN) ? freq_set_protein @@ -6578,7 +6551,12 @@ void optimiseQMixModel_method_update(Params ¶ms, IQTree* &iqtree, ModelCheck getModelSubst(iqtree->aln->seq_type, iqtree->aln->isStandardGeneticCode(), params.model_name, params.model_set, params.model_subset, model_names); getStateFreqs(iqtree->aln->seq_type, params.state_freq_set, freq_names); - + + if (std::all_of(model_names.begin(), model_names.end(), [](const std::string& s) { return s == "MK"; }) && + std::all_of(freq_names.begin(), freq_names.end(), [](const std::string& s) { return s == "+FQ"; })) { + outError("Error! Running MixtureFinder only with the MK model and the FQ frequency is completely meaningless. Please provide additional models and/or frequencies, such as GTRX, +F, and +FO, using -mset and/or -mfreq, if you really want to use MixtureFinder for your multistate data."); + } + nest_network = generateNestNetwork(model_names, freq_names); //} @@ -6687,9 +6665,6 @@ void optimiseQMixModel(Params ¶ms, IQTree* &iqtree, ModelCheckpoint &model_i if (iqtree->isSuperTree()) outError("Error! The option -m '" + params.model_name + "' cannot work on data set with partitions"); - if (!(iqtree->aln->seq_type == SEQ_DNA || iqtree->aln->seq_type == SEQ_CODON|| iqtree->aln->seq_type == SEQ_BINARY || iqtree->aln->seq_type == SEQ_MULTISTATE || iqtree->aln->seq_type == SEQ_MORPH || iqtree->aln->seq_type == SEQ_PROTEIN)) - outError("Error! The option -m '" + params.model_name + "' can only work on DNA, codon, binary, multistate, and amino acid data set"); - cout << "--------------------------------------------------------------------" << endl; cout << "| Optimizing Q-mixture model |" << endl; cout << "--------------------------------------------------------------------" << endl; @@ -6701,11 +6676,6 @@ void optimiseQMixModel(Params ¶ms, IQTree* &iqtree, ModelCheckpoint &model_i params.gbo_replicates = 0; params.consensus_type = CT_NONE; params.stop_condition = SC_UNSUCCESS_ITERATION; - - if (iqtree->aln->seq_type == SEQ_MULTISTATE || iqtree->aln->seq_type == SEQ_MORPH) { - cout << endl << "MixtureFinder will also test models with unequal rates and/or frequencies" << endl; - iqtree->aln->seq_type = SEQ_MULTISTATE; - } optimiseQMixModel_method_update(params, iqtree, model_info, model_str); diff --git a/model/CMakeLists.txt b/model/CMakeLists.txt index e99a9fa05..d1284e50b 100644 --- a/model/CMakeLists.txt +++ b/model/CMakeLists.txt @@ -1,7 +1,6 @@ add_library(model modelmarkov.cpp modelmarkov.h modelbin.cpp modelbin.h -modelmultistate.cpp modelmultistate.h modeldna.cpp modeldna.h modeldnaerror.cpp modeldnaerror.h modelfactory.cpp modelfactory.h diff --git a/model/modelfactory.cpp b/model/modelfactory.cpp index 35968a281..36ce37af9 100644 --- a/model/modelfactory.cpp +++ b/model/modelfactory.cpp @@ -26,7 +26,6 @@ #include "modeldna.h" #include "modelprotein.h" #include "modelbin.h" -#include "modelmultistate.h" #include "modelcodon.h" #include "modelmorphology.h" #include "modelpomo.h" @@ -168,7 +167,6 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, if (tree->aln->seq_type == SEQ_DNA) model_str = "HKY"; else if (tree->aln->seq_type == SEQ_PROTEIN) model_str = "LG"; else if (tree->aln->seq_type == SEQ_BINARY) model_str = "GTR2"; - else if (tree->aln->seq_type == SEQ_MULTISTATE) model_str = "GTRX"; else if (tree->aln->seq_type == SEQ_CODON) model_str = "GY"; else if (tree->aln->seq_type == SEQ_MORPH) model_str = "MK"; else if (tree->aln->seq_type == SEQ_POMO) model_str = "HKY+P"; @@ -438,7 +436,6 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, if (model_str.substr(0, 3) != "MIX" && freq_type == FREQ_UNKNOWN) { switch (tree->aln->seq_type) { case SEQ_BINARY: freq_type = FREQ_ESTIMATE; break; // default for binary: optimized frequencies - case SEQ_MULTISTATE: freq_type = FREQ_ESTIMATE; break; // default for multistate data: optimized frequencies case SEQ_PROTEIN: break; // let ModelProtein decide by itself case SEQ_MORPH: freq_type = FREQ_EQUAL; break; case SEQ_CODON: freq_type = FREQ_UNKNOWN; break; diff --git a/model/modelmixture.cpp b/model/modelmixture.cpp index 4dec0a102..fe75f3885 100644 --- a/model/modelmixture.cpp +++ b/model/modelmixture.cpp @@ -10,7 +10,6 @@ #include "modeldnaerror.h" #include "modelprotein.h" #include "modelbin.h" -#include "modelmultistate.h" #include "modelcodon.h" #include "modelmorphology.h" #include "modelset.h" @@ -1139,7 +1138,6 @@ ModelSubst* createModel(string model_str, ModelsBlock *models_block, if ((model_str == "JC" && tree->aln->seq_type == SEQ_DNA) || (model_str == "POISSON" && tree->aln->seq_type == SEQ_PROTEIN) || (model_str == "JC2" && tree->aln->seq_type == SEQ_BINARY) || - (model_str == "MK" && tree->aln->seq_type == SEQ_MULTISTATE) || (model_str == "JCC" && tree->aln->seq_type == SEQ_CODON) || (model_str == "MK" && tree->aln->seq_type == SEQ_MORPH)) { @@ -1155,7 +1153,6 @@ ModelSubst* createModel(string model_str, ModelsBlock *models_block, cout << "PoMo mixture model for Gamma rate heterogeneity." << endl; // else if ((model_str == "GTR" && tree->aln->seq_type == SEQ_DNA) || // (model_str == "GTR2" && tree->aln->seq_type == SEQ_BINARY) || -// ((model_str == "GTR" || model_str == "GTRX") && model_str == SEQ_MULTISTATE) || // (model_str == "GTR20" && tree->aln->seq_type == SEQ_PROTEIN)) { // model = new ModelGTR(tree, count_rates); // if (freq_params != "") @@ -1168,9 +1165,7 @@ ModelSubst* createModel(string model_str, ModelsBlock *models_block, model = ModelMarkov::getModelByName(model_str, tree, model_params, freq_type, freq_params); } else if (tree->aln->seq_type == SEQ_BINARY) { model = new ModelBIN(model_str.c_str(), model_params, freq_type, freq_params, tree); - } else if (tree->aln->seq_type == SEQ_MULTISTATE) { - model = new ModelMultistate(model_str.c_str(), model_params, freq_type, freq_params, tree); - } else if (tree->aln->seq_type == SEQ_DNA) { + } else if (tree->aln->seq_type == SEQ_DNA) { if (seqerr.empty()) model = new ModelDNA(model_str.c_str(), model_params, freq_type, freq_params, tree); else diff --git a/model/modelmultistate.cpp b/model/modelmultistate.cpp deleted file mode 100644 index 4896417b9..000000000 --- a/model/modelmultistate.cpp +++ /dev/null @@ -1,174 +0,0 @@ -/* - * modelmultistate.cpp - * - * Created on: Apr 6, 2025 - * Author: Hiroaki Sato - * Original author: minh - */ - -#include "modelmultistate.h" - -ModelMultistate::ModelMultistate(const char *model_name, string model_params, StateFreqType freq, string freq_params, PhyloTree *tree) -: ModelMarkov(tree) -{ - init(model_name, model_params, freq, freq_params); -} - -void ModelMultistate::init(const char *model_name, string model_params, StateFreqType freq, string freq_params) -{ - name = model_name; - full_name = model_name; - if (name == "MK") { - // all were initialized - num_params = 0; - } else if (name == "ORDERED") { - int i, j, k = 0; - // only allow for substitution from state i to state i+1 and back. - for (i = 0; i < num_states-1; i++) { - rates[k++] = 1.0; - for (j = i+2; j < num_states; j++, k++) - rates[k] = 0.0; - } - num_params = 0; - } else if (name == "GTR" || name == "GTRX") { - outWarning("GTRX multistate model will estimate " + convertIntToString(getNumRateEntries()-1) + " substitution rates that might be overfitting!"); - outWarning("Please only use GTRX with very large data and always test for model fit!"); - name = "GTRX"; - } else { - // if name does not match, read the user-defined model - readParameters(model_name); - num_params = 0; - freq = FREQ_USER_DEFINED; - } - - // parse user-specified state frequencies (if any) - if (freq_params != "") - { - freq_type = FREQ_USER_DEFINED; - readStateFreq(freq_params); - } - - ModelMarkov::init(freq); -} - -void ModelMultistate::readRates(istream &in) noexcept(false) { - int nrates = getNumRateEntries(); - int row = 1, col = 0; - // since states for protein is stored in lower-triangle, special treatment is needed - for (int i = 0; i < nrates; i++, col++) { - if (col == row) { - row++; col = 0; - } - // switch col and row - int id = col*(2*num_states-col-1)/2 + (row-col-1); - if (id >= nrates) { - cout << row << " " << col << endl; - } - assert(id < nrates && id >= 0); // make sure that the conversion is correct - - string tmp_value; - in >> tmp_value; - if (tmp_value.length() == 0) - throw name+string(": Rate entries could not be read"); - rates[id] = convert_double_with_distribution(tmp_value.c_str(), true); - - if (rates[id] < 0.0) - throw "Negative rates found"; - } -} - -int ModelMultistate::getNDim() { - int ndim = num_params; - if (freq_type == FREQ_ESTIMATE) - ndim += num_states-1; - return ndim; -} - -ModelMultistate::~ModelMultistate() { -} - -void ModelMultistate::startCheckpoint() { - checkpoint->startStruct("ModelMorph"); -} - -void ModelMultistate::saveCheckpoint() { - startCheckpoint(); - if (num_params > 0) - CKP_ARRAY_SAVE(getNumRateEntries(), rates); - endCheckpoint(); - ModelMarkov::saveCheckpoint(); -} - -void ModelMultistate::restoreCheckpoint() { - ModelMarkov::restoreCheckpoint(); - startCheckpoint(); - if (num_params > 0) - CKP_ARRAY_RESTORE(getNumRateEntries(), rates); - endCheckpoint(); - decomposeRateMatrix(); - if (phylo_tree) - phylo_tree->clearAllPartialLH(); -} - -/** - * @return model name - */ -string ModelMultistate::getName() { - size_t pos_plus = name.find('+'); - if (pos_plus != string::npos) { - return name; - } else { - return ModelMarkov::getName(); - } -} - -string ModelMultistate::getNameParams(bool show_fixed_params) { - if (num_params == 0) return name; - ostringstream retname; - size_t pos_plus = name.find('+'); - if (pos_plus != string::npos) { - retname << name.substr(0, pos_plus) << '{'; - } else { - retname << name << '{'; - } - int nrates = getNumRateEntries(); - for (int i = 0; i < nrates; i++) { - if (i>0) retname << ','; - retname << rates[i]; - } - retname << '}'; - if (pos_plus != string::npos) { - retname << name.substr(pos_plus); - } else { - getNameParamsFreq(retname); - } - return retname.str(); -} - -void ModelMultistate::writeParameters(ostream &out) { - int i; - if (freq_type == FREQ_ESTIMATE) { - for (i = 0; i < num_states; i++) - out << "\t" << state_freq[i]; - } - if (num_params == 0) return; - int nrateout = getNumRateEntries() - 1; - for (i = 0; i < nrateout; i++) - out << "\t" << rates[i]; -} - -void ModelMultistate::writeInfo(ostream &out) { - if (num_params > 0) { - out << "Rate parameters:"; - int nrate = getNumRateEntries(); - for (int i = 0; i < nrate; i++) - out << " " << rates[i]; - out << endl; - } - if (freq_type != FREQ_EQUAL) { - out << "State frequencies:"; - for (int i = 0; i < num_states; i++) - out << " " << state_freq[i]; - out << endl; - } -} diff --git a/model/modelmultistate.h b/model/modelmultistate.h deleted file mode 100644 index ce506ed84..000000000 --- a/model/modelmultistate.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * modelmultistate.h - * - * Created on: Apr 6, 2025 - * Author: Hiroaki Sato - * Original author: minh - */ - -#ifndef MODELMULTISTATE_H_ -#define MODELMULTISTATE_H_ - -#include "modelmarkov.h" - -class ModelMultistate: public ModelMarkov { -public: - /** - constructor - @param model_name model name, e.g., JC, HKY. - @param freq state frequency type - @param tree associated phylogenetic tree - */ - ModelMultistate(const char *model_name, string model_params, StateFreqType freq, string freq_params, PhyloTree *tree); - - - /** - initialization, called automatically by the constructor, no need to call it - @param model_name model name, e.g., JC, HKY. - @param freq state frequency type - */ - virtual void init(const char *model_name, string model_params, StateFreqType freq, string freq_params); - - /** - return the number of dimensions - */ - virtual int getNDim(); - - /** - start structure for checkpointing - */ - virtual void startCheckpoint(); - - /** - save object into the checkpoint - */ - virtual void saveCheckpoint(); - - /** - restore object from the checkpoint - */ - virtual void restoreCheckpoint(); - - /** - * @return model name - */ - virtual string getName(); - - /** - * @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f} - */ - virtual string getNameParams(bool show_fixed_params = false); - - /** - write information - @param out output stream - */ - virtual void writeInfo(ostream &out); - - /** - write parameters, used with modeltest - @param out output stream - */ - virtual void writeParameters(ostream &out); - - /** - read the rates from an input stream. it will throw error messages if failed - @param in input stream - */ - virtual void readRates(istream &in) noexcept(false); - - virtual ~ModelMultistate(); -}; - -#endif /* MODELMUTISTATE_H_ */ From 477b8b0cd2d84420b1f5e383d8b93413168479ce Mon Sep 17 00:00:00 2001 From: HS6986 Date: Tue, 29 Apr 2025 16:36:37 +0900 Subject: [PATCH 05/17] Changed the wording of the GTRX warnings; disabled the GTRX warnings if the number of states <= 6 && the number of the patterns in the alignment/partition >= 100 --- model/modelmorphology.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/model/modelmorphology.cpp b/model/modelmorphology.cpp index 0e3fe4a71..16ddf5484 100644 --- a/model/modelmorphology.cpp +++ b/model/modelmorphology.cpp @@ -30,9 +30,10 @@ void ModelMorphology::init(const char *model_name, string model_params, StateFre } num_params = 0; } else if (name == "GTR" || name == "GTRX") { - outWarning("GTRX multistate model will estimate " + convertIntToString(getNumRateEntries()-1) + " substitution rates that might be overfitting!"); - outWarning("Please only use GTRX with very large data and always test for model fit!"); - name = "GTRX"; + if (!(num_states <= 6 && phylo_tree->aln->getNPattern() >= 100)) { + outWarning("GTRX multistate model will estimate " + convertIntToString(getNumRateEntries()-1) + " substitution rates that might be overfitting for the length of your alignment/partition!"); + outWarning("Please only use GTRX with large data and always test for model fit!"); + } } else { // if name does not match, read the user-defined model readParameters(model_name); From 5e95d0e2c503021815d708a0150d39026d8713ec Mon Sep 17 00:00:00 2001 From: HS6986 Date: Tue, 29 Apr 2025 16:51:53 +0900 Subject: [PATCH 06/17] Fix indentation issues --- model/modelmorphology.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/model/modelmorphology.cpp b/model/modelmorphology.cpp index 16ddf5484..1250cd1d1 100644 --- a/model/modelmorphology.cpp +++ b/model/modelmorphology.cpp @@ -30,10 +30,10 @@ void ModelMorphology::init(const char *model_name, string model_params, StateFre } num_params = 0; } else if (name == "GTR" || name == "GTRX") { - if (!(num_states <= 6 && phylo_tree->aln->getNPattern() >= 100)) { - outWarning("GTRX multistate model will estimate " + convertIntToString(getNumRateEntries()-1) + " substitution rates that might be overfitting for the length of your alignment/partition!"); - outWarning("Please only use GTRX with large data and always test for model fit!"); - } + if (!(num_states <= 6 && phylo_tree->aln->getNPattern() >= 100)) { + outWarning("GTRX multistate model will estimate " + convertIntToString(getNumRateEntries()-1) + " substitution rates that might be overfitting for the length of your alignment/partition!"); + outWarning("Please only use GTRX with large data and always test for model fit!"); + } } else { // if name does not match, read the user-defined model readParameters(model_name); From 06fffe7bf53650c874284ec3bff61ad6bbb054b7 Mon Sep 17 00:00:00 2001 From: HuaiyanRen <100552380+HuaiyanRen@users.noreply.github.com> Date: Fri, 9 May 2025 15:42:59 +1000 Subject: [PATCH 07/17] Only generateNestNetwork for DNA models. --- main/phylotesting.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/main/phylotesting.cpp b/main/phylotesting.cpp index e85508ca6..4d8b46f4b 100644 --- a/main/phylotesting.cpp +++ b/main/phylotesting.cpp @@ -6544,7 +6544,7 @@ void optimiseQMixModel_method_update(Params ¶ms, IQTree* &iqtree, ModelCheck ssize = iqtree->getAlnNSite(); criteria_str = criterionName(params.model_test_criterion); - // Step 0: (reorder candidate models when -mset is used) build the nest-relationship network + // Step 0: (reorder candidate DNA models when -mset is used) build the nest-relationship network map > nest_network; //if (iqtree->aln->seq_type == SEQ_DNA) { StrVector model_names, freq_names; @@ -6556,8 +6556,10 @@ void optimiseQMixModel_method_update(Params ¶ms, IQTree* &iqtree, ModelCheck std::all_of(freq_names.begin(), freq_names.end(), [](const std::string& s) { return s == "+FQ"; })) { outError("Error! Running MixtureFinder only with the MK model and the FQ frequency is completely meaningless. Please provide additional models and/or frequencies, such as GTRX, +F, and +FO, using -mset and/or -mfreq, if you really want to use MixtureFinder for your multistate data."); } - - nest_network = generateNestNetwork(model_names, freq_names); + + if (iqtree->aln->seq_type == SEQ_DNA) { + nest_network = generateNestNetwork(model_names, freq_names); + } //} // Step 1: run ModelFinder From 16fc4172ebb44118a61770560db85dd66551ce5e Mon Sep 17 00:00:00 2001 From: HuaiyanRen <100552380+HuaiyanRen@users.noreply.github.com> Date: Fri, 9 May 2025 16:24:05 +1000 Subject: [PATCH 08/17] create a function to initialise MixtureFinder frequencies --- main/phylotesting.cpp | 40 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/main/phylotesting.cpp b/main/phylotesting.cpp index 4d8b46f4b..333c64554 100644 --- a/main/phylotesting.cpp +++ b/main/phylotesting.cpp @@ -6207,6 +6207,16 @@ void addModel(string model_str, string& new_model_str, string new_subst) { } } +// initialise model frequency set in MixtureFinder for different sequence types +char* initFreqSet(SeqType seq_type) { + switch (seq_type) { + case SEQ_CODON: return ",F1X4,F3X4"; + case SEQ_MORPH: return "FQ"; + case SEQ_PROTEIN: return ",FO"; + default: return "FO"; + } +} + // This function is similar to runModelFinder, but it is designed for optimisation of Q-Mixture model // action: 1 - estimate the RHAS model // 2 - estimate the number of classes in a mixture model @@ -6347,18 +6357,7 @@ CandidateModel runModelSelection(Params ¶ms, IQTree &iqtree, ModelCheckpoint } skip_all_when_drop = true; } else if (action == 3) { - char freq_set_codon[] = ",F1X4,F3X4"; - char freq_set_multistate[] = "FQ"; - char freq_set_protein[] = ",FO"; - char freq_set_default[] = "FO"; - char* init_state_freq_set = - (iqtree.aln->seq_type == SEQ_CODON) - ? freq_set_codon - : (iqtree.aln->seq_type == SEQ_MORPH) - ? freq_set_multistate - : (iqtree.aln->seq_type == SEQ_PROTEIN) - ? freq_set_protein - : freq_set_default; + char* init_state_freq_set = initFreqSet(iqtree.aln->seq_type); if (!params.state_freq_set) { params.state_freq_set = init_state_freq_set; } @@ -6367,6 +6366,7 @@ CandidateModel runModelSelection(Params ¶ms, IQTree &iqtree, ModelCheckpoint params.model_set, params.model_subset, model_names); if (model_names.empty()) + free(init_state_freq_set); return best_model; getStateFreqs(iqtree.aln->seq_type, params.state_freq_set, freq_names); @@ -6400,6 +6400,7 @@ CandidateModel runModelSelection(Params ¶ms, IQTree &iqtree, ModelCheckpoint candidate_models.push_back(CandidateModel(new_model_str, iqtree.getModelFactory()->site_rate->name, iqtree.aln, 0)); } + free(init_state_freq_set); skip_all_when_drop = false; } else { params.ratehet_set = best_rate_name; @@ -6524,18 +6525,7 @@ void optimiseQMixModel_method_update(Params ¶ms, IQTree* &iqtree, ModelCheck double LR, df_diff, pvalue; string criteria_str; - char freq_set_codon[] = ",F1X4,F3X4"; - char freq_set_multistate[] = "FQ"; - char freq_set_protein[] = ",FO"; - char freq_set_default[] = "FO"; - char* init_state_freq_set = - (iqtree->aln->seq_type == SEQ_CODON) - ? freq_set_codon - : (iqtree->aln->seq_type == SEQ_MORPH) - ? freq_set_multistate - : (iqtree->aln->seq_type == SEQ_PROTEIN) - ? freq_set_protein - : freq_set_default; + char* init_state_freq_set = initFreqSet(iqtree->aln->seq_type); if (!params.state_freq_set) { params.state_freq_set = init_state_freq_set; } @@ -6561,7 +6551,7 @@ void optimiseQMixModel_method_update(Params ¶ms, IQTree* &iqtree, ModelCheck nest_network = generateNestNetwork(model_names, freq_names); } //} - + free(init_state_freq_set); // Step 1: run ModelFinder params.model_name = ""; bool under_mix_finder = true; From 0b417c258297fc27aac999fdaf4969d4f239974b Mon Sep 17 00:00:00 2001 From: HS6986 Date: Sun, 11 May 2025 19:51:00 +0900 Subject: [PATCH 09/17] Restore a comment I accidentally deleted --- main/phylotesting.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/main/phylotesting.cpp b/main/phylotesting.cpp index fc37b0817..dd40d4bca 100644 --- a/main/phylotesting.cpp +++ b/main/phylotesting.cpp @@ -6207,6 +6207,7 @@ void addModel(string model_str, string& new_model_str, string new_subst) { } } +// initialise model frequency set in MixtureFinder for different sequence types char* initFreqSet(SeqType seq_type) { switch (seq_type) { case SEQ_CODON: return ",F1X4,F3X4"; From ce2b9a9f9b6ced9458bfa87ad6318c2c6972baec Mon Sep 17 00:00:00 2001 From: HS6986 Date: Mon, 12 May 2025 00:05:09 +0900 Subject: [PATCH 10/17] Refine the code according to the advice by StefanFlaumberg and bqminh; Temporarily comment out `free(init_state_freq_set);`, which HuaiyanRen added, as they cause an error --- main/phylotesting.cpp | 40 ++++++++++++++++++++++----------------- model/modelmorphology.cpp | 21 +++++++++++--------- 2 files changed, 35 insertions(+), 26 deletions(-) diff --git a/main/phylotesting.cpp b/main/phylotesting.cpp index dd40d4bca..668325d4b 100644 --- a/main/phylotesting.cpp +++ b/main/phylotesting.cpp @@ -6375,7 +6375,7 @@ CandidateModel findMixtureComponent(Params ¶ms, IQTree &iqtree, ModelCheckpo params.model_set, params.model_subset, model_names); if (model_names.empty()) - free(init_state_freq_set); + //free(init_state_freq_set); return best_model; getStateFreqs(iqtree.aln->seq_type, params.state_freq_set, freq_names); @@ -6409,7 +6409,7 @@ CandidateModel findMixtureComponent(Params ¶ms, IQTree &iqtree, ModelCheckpo candidate_models.push_back(CandidateModel(new_model_str, iqtree.getModelFactory()->site_rate->name, iqtree.aln, 0)); } - free(init_state_freq_set); + //free(init_state_freq_set); skip_all_when_drop = false; } else { params.ratehet_set = best_rate_name; @@ -6550,22 +6550,28 @@ void runMixtureFinderMain(Params ¶ms, IQTree* &iqtree, ModelCheckpoint &mode // Step 0: (reorder candidate DNA models when -mset is used) build the nest-relationship network map > nest_network; - //if (iqtree->aln->seq_type == SEQ_DNA) { - StrVector model_names, freq_names; - getModelSubst(iqtree->aln->seq_type, iqtree->aln->isStandardGeneticCode(), params.model_name, - params.model_set, params.model_subset, model_names); - getStateFreqs(iqtree->aln->seq_type, params.state_freq_set, freq_names); - - if (std::all_of(model_names.begin(), model_names.end(), [](const std::string& s) { return s == "MK"; }) && - std::all_of(freq_names.begin(), freq_names.end(), [](const std::string& s) { return s == "+FQ"; })) { - outError("Error! Running MixtureFinder only with the MK model and the FQ frequency is completely meaningless. Please provide additional models and/or frequencies, such as GTRX, +F, and +FO, using -mset and/or -mfreq, if you really want to use MixtureFinder for your multistate data."); - } + StrVector model_names, freq_names; + getModelSubst(iqtree->aln->seq_type, iqtree->aln->isStandardGeneticCode(), params.model_name, + params.model_set, params.model_subset, model_names); + getStateFreqs(iqtree->aln->seq_type, params.state_freq_set, freq_names); + + // check if all models names are "MK" and all frequencies are "+FQ" + auto isOnlyMKAndFQ = [&]() -> bool { + bool onlyMK = std::all_of(model_names.begin(), model_names.end(), + [](const std::string& s) { return s == "MK"; }); + bool onlyFQ = std::all_of(freq_names.begin(), freq_names.end(), + [](const std::string& s) { return s == "+FQ"; }); + return onlyMK && onlyFQ; + }; + if (isOnlyMKAndFQ()) { + outError("Error! Running MixtureFinder only with the MK model and the FQ frequency is completely meaningless. Please provide additional models and/or frequencies, such as GTRX, +F, and +FO, using -mset and/or -mfreq, if you really want to use MixtureFinder for your multistate data."); + } - if (iqtree->aln->seq_type == SEQ_DNA) { - nest_network = generateNestNetwork(model_names, freq_names); - } - //} - free(init_state_freq_set); + if (iqtree->aln->seq_type == SEQ_DNA) { + nest_network = generateNestNetwork(model_names, freq_names); + } + //free(init_state_freq_set); + // Step 1: run ModelFinder params.model_name = ""; bool under_mix_finder = true; diff --git a/model/modelmorphology.cpp b/model/modelmorphology.cpp index 1250cd1d1..d0bc1de63 100644 --- a/model/modelmorphology.cpp +++ b/model/modelmorphology.cpp @@ -30,9 +30,10 @@ void ModelMorphology::init(const char *model_name, string model_params, StateFre } num_params = 0; } else if (name == "GTR" || name == "GTRX") { - if (!(num_states <= 6 && phylo_tree->aln->getNPattern() >= 100)) { + if (num_states > 6 || phylo_tree->aln->getNPattern() < 100) { outWarning("GTRX multistate model will estimate " + convertIntToString(getNumRateEntries()-1) + " substitution rates that might be overfitting for the length of your alignment/partition!"); outWarning("Please only use GTRX with large data and always test for model fit!"); + name = "GTRX"; } } else { // if name does not match, read the user-defined model @@ -123,20 +124,22 @@ string ModelMorphology::getName() { } string ModelMorphology::getNameParams(bool show_fixed_params) { - if (num_params == 0) return name; ostringstream retname; size_t pos_plus = name.find('+'); if (pos_plus != string::npos) { - retname << name.substr(0, pos_plus) << '{'; + retname << name.substr(0, pos_plus); } else { - retname << name << '{'; + retname << name; } - int nrates = getNumRateEntries(); - for (int i = 0; i < nrates; i++) { - if (i>0) retname << ','; - retname << rates[i]; + if (num_params > 0 || show_fixed_params) { + retname << '{'; + int nrates = getNumRateEntries(); + for (int i = 0; i < nrates; i++) { + if (i > 0) retname << ','; + retname << rates[i]; + } + retname << '}'; } - retname << '}'; if (pos_plus != string::npos) { retname << name.substr(pos_plus); } else { From 727a9349e000d1de937adfdd45a0c590a8e15326 Mon Sep 17 00:00:00 2001 From: HS6986 Date: Mon, 12 May 2025 10:02:41 +0900 Subject: [PATCH 11/17] Delete free(init_state_freq_set) --- main/phylotesting.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/main/phylotesting.cpp b/main/phylotesting.cpp index 668325d4b..94f5a569b 100644 --- a/main/phylotesting.cpp +++ b/main/phylotesting.cpp @@ -6375,7 +6375,6 @@ CandidateModel findMixtureComponent(Params ¶ms, IQTree &iqtree, ModelCheckpo params.model_set, params.model_subset, model_names); if (model_names.empty()) - //free(init_state_freq_set); return best_model; getStateFreqs(iqtree.aln->seq_type, params.state_freq_set, freq_names); @@ -6409,7 +6408,6 @@ CandidateModel findMixtureComponent(Params ¶ms, IQTree &iqtree, ModelCheckpo candidate_models.push_back(CandidateModel(new_model_str, iqtree.getModelFactory()->site_rate->name, iqtree.aln, 0)); } - //free(init_state_freq_set); skip_all_when_drop = false; } else { params.ratehet_set = best_rate_name; @@ -6570,7 +6568,6 @@ void runMixtureFinderMain(Params ¶ms, IQTree* &iqtree, ModelCheckpoint &mode if (iqtree->aln->seq_type == SEQ_DNA) { nest_network = generateNestNetwork(model_names, freq_names); } - //free(init_state_freq_set); // Step 1: run ModelFinder params.model_name = ""; From 623874addcfbf5e79a44429735a336ccc65d95e9 Mon Sep 17 00:00:00 2001 From: HS6986 Date: Mon, 12 May 2025 10:13:43 +0900 Subject: [PATCH 12/17] Relocate the misplaced name = "GTRX"; --- model/modelmorphology.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/modelmorphology.cpp b/model/modelmorphology.cpp index d0bc1de63..d90dc3440 100644 --- a/model/modelmorphology.cpp +++ b/model/modelmorphology.cpp @@ -33,8 +33,8 @@ void ModelMorphology::init(const char *model_name, string model_params, StateFre if (num_states > 6 || phylo_tree->aln->getNPattern() < 100) { outWarning("GTRX multistate model will estimate " + convertIntToString(getNumRateEntries()-1) + " substitution rates that might be overfitting for the length of your alignment/partition!"); outWarning("Please only use GTRX with large data and always test for model fit!"); - name = "GTRX"; } + name = "GTRX"; } else { // if name does not match, read the user-defined model readParameters(model_name); From 3f01fd9828a1dfde00b37543da64ce0b00c18511 Mon Sep 17 00:00:00 2001 From: HS6986 Date: Mon, 12 May 2025 10:34:18 +0900 Subject: [PATCH 13/17] Restrict isRateTypeNested() only to DNA data --- main/phylotesting.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main/phylotesting.cpp b/main/phylotesting.cpp index 94f5a569b..cea23dad9 100644 --- a/main/phylotesting.cpp +++ b/main/phylotesting.cpp @@ -6786,15 +6786,15 @@ void reorderModelNames(StrVector& model_names, const char* model_set[], size_t s } bool isRateTypeNested(string rate_type1, string rate_type2) { - /*if (rate_type1.length() != 6) { + if (rate_type1.length() != 6) { outError("Incorrect DNA model rate type code: " + rate_type1); } if (rate_type2.length() != 6) { outError("Incorrect DNA model rate type code: " + rate_type2); - }*/ + } - for (int i = 0; i < rate_type1.length()-1; i++) { - for (int j = i; j < rate_type1.length(); j++ ){ + for (int i = 0; i < 5; i++) { + for (int j = i; j < 6; j++ ){ if (rate_type1[i] == rate_type1[j] && rate_type2[i] != rate_type2[j]){ return false; } From 4198d8953463b9e735e180dadd29ff569a274ed7 Mon Sep 17 00:00:00 2001 From: HS6986 Date: Wed, 21 May 2025 22:22:11 +0900 Subject: [PATCH 14/17] Refine the code a little according to the changes made in feature/HS6986/MorphMixtureFinder --- main/phylotesting.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/main/phylotesting.cpp b/main/phylotesting.cpp index cea23dad9..c8f5d714d 100644 --- a/main/phylotesting.cpp +++ b/main/phylotesting.cpp @@ -6553,16 +6553,15 @@ void runMixtureFinderMain(Params ¶ms, IQTree* &iqtree, ModelCheckpoint &mode params.model_set, params.model_subset, model_names); getStateFreqs(iqtree->aln->seq_type, params.state_freq_set, freq_names); - // check if all models names are "MK" and all frequencies are "+FQ" auto isOnlyMKAndFQ = [&]() -> bool { bool onlyMK = std::all_of(model_names.begin(), model_names.end(), - [](const std::string& s) { return s == "MK"; }); + [](const std::string& s) { return s == "MK"; }); bool onlyFQ = std::all_of(freq_names.begin(), freq_names.end(), - [](const std::string& s) { return s == "+FQ"; }); + [](const std::string& s) { return s == "+FQ"; }); return onlyMK && onlyFQ; }; if (isOnlyMKAndFQ()) { - outError("Error! Running MixtureFinder only with the MK model and the FQ frequency is completely meaningless. Please provide additional models and/or frequencies, such as GTRX, +F, and +FO, using -mset and/or -mfreq, if you really want to use MixtureFinder for your multistate data."); + outError("Error! Running MixtureFinder only with the MK model and the FQ frequency is completely meaningless.\nPlease provide additional models and/or frequencies, such as GTRX, +F, and +FO, using -mset and/or -mfreq, if you really want to use MixtureFinder for your multistate data."); } if (iqtree->aln->seq_type == SEQ_DNA) { From 375ab159a8ce8dc576fed832971e273bd95d3fb4 Mon Sep 17 00:00:00 2001 From: HS6986 Date: Wed, 21 May 2025 23:44:47 +0900 Subject: [PATCH 15/17] Set warnings according to the advice by StefanFlaumberg; Set an error that occurs when users try to apply MixtureFinder to amino acid data; Create the --force-aa-mix-finder option to force IQ-TREE to run MixtureFinder for amino acid data --- main/phylotesting.cpp | 9 +++++++++ utils/tools.cpp | 6 ++++++ utils/tools.h | 3 +++ 3 files changed, 18 insertions(+) diff --git a/main/phylotesting.cpp b/main/phylotesting.cpp index c8f5d714d..e8ea82f2d 100644 --- a/main/phylotesting.cpp +++ b/main/phylotesting.cpp @@ -6671,6 +6671,15 @@ void runMixtureFinder(Params ¶ms, IQTree* &iqtree, ModelCheckpoint &model_in if (iqtree->isSuperTree()) outError("Error! The option -m '" + params.model_name + "' cannot work on data set with partitions"); + if (iqtree->aln->seq_type != SEQ_DNA) + outWarning("MixtureFinder has not been tested for non-DNA data types. Be cautious about interpreting the results"); + + if (iqtree->aln->getMaxNumStates() > 6) + outWarning("Running MixtureFinder for the given data type can take much time. Please consider restricting the set of the models to test as much as possible"); + + if (iqtree->aln->seq_type == SEQ_PROTEIN && !params.force_aa_mix_finder) + outError("Error! We already have probably effective mixture frequency vectors C10–C60 for amino acid data, and they should explain data well.\nPlease make sure running MixtureFinder for your amino acid data makes sense.\nIf you are determined to do that, please add an option --force-aa-mix-finder to the command line."); + cout << "--------------------------------------------------------------------" << endl; cout << "| Running MixtureFinder |" << endl; cout << "--------------------------------------------------------------------" << endl; diff --git a/utils/tools.cpp b/utils/tools.cpp index cd4d16a7c..6b46f7c10 100644 --- a/utils/tools.cpp +++ b/utils/tools.cpp @@ -1487,6 +1487,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) { params.ignore_checkpoint = false; params.checkpoint_dump_interval = 60; params.force_unfinished = false; + params.force_aa_mix_finder = false; params.print_all_checkpoints = false; params.suppress_output_flags = 0; params.ufboot2corr = false; @@ -5475,6 +5476,11 @@ void parseArg(int argc, char *argv[], Params ¶ms) { params.force_unfinished = true; continue; } + + if (strcmp(argv[cnt], "-force-aa-mix-finder") == 0 || strcmp(argv[cnt], "--force-aa-mix-finder") == 0) { + params.force_aa_mix_finder = true; + continue; + } if (strcmp(argv[cnt], "-cptime") == 0 || strcmp(argv[cnt], "--cptime") == 0) { cnt++; diff --git a/utils/tools.h b/utils/tools.h index 749bf6cf4..4994c8b77 100644 --- a/utils/tools.h +++ b/utils/tools.h @@ -2453,6 +2453,9 @@ class Params { /** true if ignoring the "finished" flag in checkpoint file */ bool force_unfinished; + /** true if forcing IQ-TREE to run MixtureFinder for amino acid data */ + bool force_aa_mix_finder; + /** TRUE to print checkpoints to 1.ckp.gz, 2.ckp.gz,... */ bool print_all_checkpoints; From 3d01a12bdaac9b7901cc7169653de0d7c7938f71 Mon Sep 17 00:00:00 2001 From: HS6986 Date: Thu, 22 May 2025 03:22:21 +0900 Subject: [PATCH 16/17] Change the wording of an error --- main/phylotesting.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/phylotesting.cpp b/main/phylotesting.cpp index e8ea82f2d..dcffb5161 100644 --- a/main/phylotesting.cpp +++ b/main/phylotesting.cpp @@ -6678,7 +6678,7 @@ void runMixtureFinder(Params ¶ms, IQTree* &iqtree, ModelCheckpoint &model_in outWarning("Running MixtureFinder for the given data type can take much time. Please consider restricting the set of the models to test as much as possible"); if (iqtree->aln->seq_type == SEQ_PROTEIN && !params.force_aa_mix_finder) - outError("Error! We already have probably effective mixture frequency vectors C10–C60 for amino acid data, and they should explain data well.\nPlease make sure running MixtureFinder for your amino acid data makes sense.\nIf you are determined to do that, please add an option --force-aa-mix-finder to the command line."); + outError("Error! We already have the probably effective mixture frequency vectors C10–C60 for amino acid data, and they should explain data well.\nPlease make sure running MixtureFinder for your amino acid data makes sense.\nIf you are determined to do that, please add an option --force-aa-mix-finder to the command line."); cout << "--------------------------------------------------------------------" << endl; cout << "| Running MixtureFinder |" << endl; From 468832af13695d1c3c7cd8885a5e88607c06b260 Mon Sep 17 00:00:00 2001 From: HS6986 Date: Thu, 22 May 2025 05:04:53 +0900 Subject: [PATCH 17/17] Change the wording of an error --- main/phylotesting.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/phylotesting.cpp b/main/phylotesting.cpp index dcffb5161..4d6243aac 100644 --- a/main/phylotesting.cpp +++ b/main/phylotesting.cpp @@ -6678,7 +6678,7 @@ void runMixtureFinder(Params ¶ms, IQTree* &iqtree, ModelCheckpoint &model_in outWarning("Running MixtureFinder for the given data type can take much time. Please consider restricting the set of the models to test as much as possible"); if (iqtree->aln->seq_type == SEQ_PROTEIN && !params.force_aa_mix_finder) - outError("Error! We already have the probably effective mixture frequency vectors C10–C60 for amino acid data, and they should explain data well.\nPlease make sure running MixtureFinder for your amino acid data makes sense.\nIf you are determined to do that, please add an option --force-aa-mix-finder to the command line."); + outError("Error! We already have the mixture frequency vectors C10–C60 which are effective for modeling amino acid data.\nPlease make sure running MixtureFinder for your amino acid data makes sense.\nIf you are determined to do that, please add an option --force-aa-mix-finder to the command line."); cout << "--------------------------------------------------------------------" << endl; cout << "| Running MixtureFinder |" << endl;