Skip to content

Extend MixtureFinder to codon, binary, multistate, (and amino acid) data #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
6df0555
extend MixtureFinder to codon, binary, multistate, and amino acid data
Apr 6, 2025
d3e3e59
fix frequency types
Apr 8, 2025
3a4de2b
allow FU in mixture models
Apr 15, 2025
39c8a32
delete the class ModelMultistate
HS6986 Apr 28, 2025
4af564f
Merge branch 'master' into feature/HS6986/extend-MixtureFinder
HS6986 Apr 28, 2025
477b8b0
Changed the wording of the GTRX warnings; disabled the GTRX warnings …
HS6986 Apr 29, 2025
5e95d0e
Fix indentation issues
HS6986 Apr 29, 2025
06fffe7
Only generateNestNetwork for DNA models.
HuaiyanRen May 9, 2025
16fc417
create a function to initialise MixtureFinder frequencies
HuaiyanRen May 9, 2025
6bac548
Fix conflicts
HS6986 May 11, 2025
0b417c2
Restore a comment I accidentally deleted
HS6986 May 11, 2025
ce2b9a9
Refine the code according to the advice by StefanFlaumberg and bqminh…
HS6986 May 11, 2025
727a934
Delete free(init_state_freq_set)
HS6986 May 12, 2025
623874a
Relocate the misplaced name = "GTRX";
HS6986 May 12, 2025
3f01fd9
Restrict isRateTypeNested() only to DNA data
HS6986 May 12, 2025
4198d89
Refine the code a little according to the changes made in feature/HS6…
HS6986 May 21, 2025
baa25de
Merge branch 'master' into feature/HS6986/extend-MixtureFinder
HS6986 May 21, 2025
375ab15
Set warnings according to the advice by StefanFlaumberg; Set an error…
HS6986 May 21, 2025
3d01a12
Change the wording of an error
HS6986 May 21, 2025
468832a
Change the wording of an error
HS6986 May 21, 2025
b42a4cc
Merge remote-tracking branch 'upstream/master' into feature/HS6986/ex…
HS6986 May 26, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 38 additions & 11 deletions main/phylotesting.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6207,6 +6207,16 @@ void addModel(string model_str, string& new_model_str, string new_subst) {
}
}

// initialise model frequency set in MixtureFinder for different sequence types
char* initFreqSet(SeqType seq_type) {
switch (seq_type) {
case SEQ_CODON: return ",F1X4,F3X4";
case SEQ_MORPH: return "FQ";
case SEQ_PROTEIN: return ",FO";
default: return "FO";
}
}

/**
@brief Find the best component of a Q-mixture model while fixing other components
@note This function is similar to runModelFinder, but it is adapted for optimisation of Q-Mixture model
Expand Down Expand Up @@ -6355,8 +6365,8 @@ CandidateModel findMixtureComponent(Params &params, IQTree &iqtree, ModelCheckpo
}
}
skip_all_when_drop = true;
} else if (mixture_action == MA_FIND_CLASS) {
char init_state_freq_set[] = "FO";
} else if (mixture_action == MA_FIND_CLASS) {
char* init_state_freq_set = initFreqSet(iqtree.aln->seq_type);
if (!params.state_freq_set) {
params.state_freq_set = init_state_freq_set;
}
Expand Down Expand Up @@ -6527,7 +6537,7 @@ void runMixtureFinderMain(Params &params, IQTree* &iqtree, ModelCheckpoint &mode
double LR, df_diff, pvalue;
string criteria_str;

char init_state_freq_set[] = "FO";
char* init_state_freq_set = initFreqSet(iqtree->aln->seq_type);
if (!params.state_freq_set) {
params.state_freq_set = init_state_freq_set;
}
Expand All @@ -6538,15 +6548,26 @@ void runMixtureFinderMain(Params &params, IQTree* &iqtree, ModelCheckpoint &mode

// Step 0: (reorder candidate DNA models when -mset is used) build the nest-relationship network
map<string, vector<string> > nest_network;
if (iqtree->aln->seq_type == SEQ_DNA) {
StrVector model_names, freq_names;
getModelSubst(iqtree->aln->seq_type, iqtree->aln->isStandardGeneticCode(), params.model_name,
params.model_set, params.model_subset, model_names);
getStateFreqs(iqtree->aln->seq_type, params.state_freq_set, freq_names);
StrVector model_names, freq_names;
getModelSubst(iqtree->aln->seq_type, iqtree->aln->isStandardGeneticCode(), params.model_name,
params.model_set, params.model_subset, model_names);
getStateFreqs(iqtree->aln->seq_type, params.state_freq_set, freq_names);

auto isOnlyMKAndFQ = [&]() -> bool {
bool onlyMK = std::all_of(model_names.begin(), model_names.end(),
[](const std::string& s) { return s == "MK"; });
bool onlyFQ = std::all_of(freq_names.begin(), freq_names.end(),
[](const std::string& s) { return s == "+FQ"; });
return onlyMK && onlyFQ;
};
if (isOnlyMKAndFQ()) {
outError("Error! Running MixtureFinder only with the MK model and the FQ frequency is completely meaningless.\nPlease provide additional models and/or frequencies, such as GTRX, +F, and +FO, using -mset and/or -mfreq, if you really want to use MixtureFinder for your multistate data.");
}

if (iqtree->aln->seq_type == SEQ_DNA) {
nest_network = generateNestNetwork(model_names, freq_names);
}

// Step 1: run ModelFinder
params.model_name = "";
bool under_mix_finder = true;
Expand Down Expand Up @@ -6651,8 +6672,14 @@ void runMixtureFinder(Params &params, IQTree* &iqtree, ModelCheckpoint &model_in
outError("Error! The option -m '" + params.model_name + "' cannot work on data set with partitions");

if (iqtree->aln->seq_type != SEQ_DNA)
outError("Error! The option -m '" + params.model_name + "' can only work on DNA data set");

outWarning("MixtureFinder has not been tested for non-DNA data types. Be cautious about interpreting the results");

if (iqtree->aln->getMaxNumStates() > 6)
outWarning("Running MixtureFinder for the given data type can take much time. Please consider restricting the set of the models to test as much as possible");

if (iqtree->aln->seq_type == SEQ_PROTEIN && !params.force_aa_mix_finder)
outError("Error! We already have the mixture frequency vectors C10–C60 which are effective for modeling amino acid data.\nPlease make sure running MixtureFinder for your amino acid data makes sense.\nIf you are determined to do that, please add an option --force-aa-mix-finder to the command line.");

cout << "--------------------------------------------------------------------" << endl;
cout << "| Running MixtureFinder |" << endl;
cout << "--------------------------------------------------------------------" << endl;
Expand Down
2 changes: 2 additions & 0 deletions model/modelmixture.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1403,6 +1403,8 @@ void ModelMixture::initMixture(string orig_model_name, string model_name, string
} else {
outError("The user defined frequency model is incorrect");
}
} else if (fstr == "+FU" || fstr == "+Fu") {
model_freq = FREQ_USER_DEFINED;
} else if (fstr == "+FO" || fstr == "+Fo") {
model_freq = FREQ_ESTIMATE;
} else if (fstr == "+FQ" || fstr == "+Fq") {
Expand Down
26 changes: 15 additions & 11 deletions model/modelmorphology.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,11 @@ void ModelMorphology::init(const char *model_name, string model_params, StateFre
}
num_params = 0;
} else if (name == "GTR" || name == "GTRX") {
outWarning("GTRX multistate model will estimate " + convertIntToString(getNumRateEntries()-1) + " substitution rates that might be overfitting!");
outWarning("Please only use GTRX with very large data and always test for model fit!");
name = "GTRX";
if (num_states > 6 || phylo_tree->aln->getNPattern() < 100) {
outWarning("GTRX multistate model will estimate " + convertIntToString(getNumRateEntries()-1) + " substitution rates that might be overfitting for the length of your alignment/partition!");
outWarning("Please only use GTRX with large data and always test for model fit!");
}
name = "GTRX";
} else {
// if name does not match, read the user-defined model
readParameters(model_name);
Expand Down Expand Up @@ -122,20 +124,22 @@ string ModelMorphology::getName() {
}

string ModelMorphology::getNameParams(bool show_fixed_params) {
if (num_params == 0) return name;
ostringstream retname;
size_t pos_plus = name.find('+');
if (pos_plus != string::npos) {
retname << name.substr(0, pos_plus) << '{';
retname << name.substr(0, pos_plus);
} else {
retname << name << '{';
retname << name;
}
int nrates = getNumRateEntries();
for (int i = 0; i < nrates; i++) {
if (i>0) retname << ',';
retname << rates[i];
if (num_params > 0 || show_fixed_params) {
retname << '{';
int nrates = getNumRateEntries();
for (int i = 0; i < nrates; i++) {
if (i > 0) retname << ',';
retname << rates[i];
}
retname << '}';
}
retname << '}';
if (pos_plus != string::npos) {
retname << name.substr(pos_plus);
} else {
Expand Down
6 changes: 6 additions & 0 deletions utils/tools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1487,6 +1487,7 @@ void parseArg(int argc, char *argv[], Params &params) {
params.ignore_checkpoint = false;
params.checkpoint_dump_interval = 60;
params.force_unfinished = false;
params.force_aa_mix_finder = false;
params.print_all_checkpoints = false;
params.suppress_output_flags = 0;
params.ufboot2corr = false;
Expand Down Expand Up @@ -5475,6 +5476,11 @@ void parseArg(int argc, char *argv[], Params &params) {
params.force_unfinished = true;
continue;
}

if (strcmp(argv[cnt], "-force-aa-mix-finder") == 0 || strcmp(argv[cnt], "--force-aa-mix-finder") == 0) {
params.force_aa_mix_finder = true;
continue;
}

if (strcmp(argv[cnt], "-cptime") == 0 || strcmp(argv[cnt], "--cptime") == 0) {
cnt++;
Expand Down
3 changes: 3 additions & 0 deletions utils/tools.h
Original file line number Diff line number Diff line change
Expand Up @@ -2453,6 +2453,9 @@ class Params {
/** true if ignoring the "finished" flag in checkpoint file */
bool force_unfinished;

/** true if forcing IQ-TREE to run MixtureFinder for amino acid data */
bool force_aa_mix_finder;

/** TRUE to print checkpoints to 1.ckp.gz, 2.ckp.gz,... */
bool print_all_checkpoints;

Expand Down
Loading