3232 which_path ,
3333 get_odb_version ,
3434 rename_gff_contigs ,
35+ validate_busco_lineage ,
36+ validate_augustus_species ,
3537)
36- from .config import augustus_species , busco_taxonomy
37-
38-
39- def validate_augustus_species (species_name ):
40- """
41- Validate that the provided Augustus species is available in the config.
42-
43- Parameters:
44- - species_name (str): The Augustus species name to validate
45-
46- Returns:
47- - bool: True if valid, False otherwise
48- """
49- return species_name in augustus_species
50-
51-
52- def validate_busco_lineage (lineage_name ):
53- """
54- Validate that the provided BUSCO lineage is available in the config.
55-
56- Parameters:
57- - lineage_name (str): The BUSCO lineage name to validate
58-
59- Returns:
60- - bool: True if valid, False otherwise
61- """
62- return lineage_name in busco_taxonomy
38+ from .config import busco_taxonomy , augustus_species
6339
6440
6541def train (args ):
@@ -458,7 +434,9 @@ def count_multi_CDS_genes(indict):
458434 return len (indict ), counter
459435
460436
461- def selectTrainingModels (genome , train_dict , tmpdir = "/tmp" , flank_length = 1000 ):
437+ def selectTrainingModels (
438+ genome , train_dict , tmpdir = "/tmp" , flank_length = 1000 , mult_cds_threshold = 0.65
439+ ):
462440 """
463441 Filter and sort gene models from a GFF3 file based on completeness, non-overlapping nature, and number of exons.
464442
@@ -472,14 +450,18 @@ def selectTrainingModels(genome, train_dict, tmpdir="/tmp", flank_length=1000):
472450 - train_dict (dict): A dictionary containing gene models from a GFF3 file.
473451 - tmpdir (str, optional): Temporary directory for intermediate files (default is "/tmp").
474452 - flank_length (int, optional): Length of flanking regions to include (default is 1000).
453+ - mult_cds_threshold (float, optional): Threshold for ratio of multi-CDS genes (default is 0.65).
475454
476455 Returns:
477456 - dict: A dictionary of filtered and sorted gene models ready for training.
478457 """
479458
480- def _sortDict (d ):
459+ def _sortDictCDS (d ):
481460 return len (d [1 ]["CDS" ][0 ])
482461
462+ def _sortDict (d ):
463+ return (d [1 ]["contig" ], d [1 ]["location" ][0 ])
464+
483465 # setup interlap object
484466 gene_inter = defaultdict (InterLap )
485467
@@ -492,9 +474,14 @@ def _sortDict(d):
492474 countGenes , countGenesCDS = count_multi_CDS_genes (train_dict )
493475 logger .debug (f"{ countGenes } training set genes; { countGenesCDS } have multi-CDS" )
494476
477+ # calculate ratio of multi-CDS genes
478+ multiCDSratio = countGenesCDS / countGenes
495479 multiCDScheck = False
496- if countGenesCDS >= 20000 :
480+ if multiCDSratio >= mult_cds_threshold :
497481 multiCDScheck = True
482+ logger .debug (
483+ f"multi-CDS ratio is high ({ multiCDSratio :.2f} ), filtering out single-CDS genes for training"
484+ )
498485
499486 with open (proteins , "w" ) as protout :
500487 for k , v in natsorted (list (train_dict .items ())):
@@ -592,8 +579,11 @@ def _sortDict(d):
592579 else :
593580 GenesPass [k ] = v
594581
595- # now sort dictionary number of exons
596- sGenes = sorted (iter (GenesPass .items ()), key = _sortDict , reverse = True )
582+ # now sort dictionary number of exons if multiCDScheck else just location
583+ if multiCDScheck :
584+ sGenes = sorted (iter (GenesPass .items ()), key = _sortDictCDS , reverse = True )
585+ else :
586+ sGenes = sorted (iter (GenesPass .items ()), key = _sortDict )
597587 sortedGenes = OrderedDict (sGenes )
598588 logger .info (
599589 "{:,} of {:,} models pass training parameters" .format (
0 commit comments