Add --busco-lineage option to annotate, improve training model selection, fix CITATION.cff format

Jon Palmer · Jon Palmer · commit 5c3b7eafa3ba · 2025-07-06T14:12:30.000-07:00
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,5 @@ tests/unit/__pycache__*
 tests/__pycache__*
 tests/integration/__pycache__*
 output.json
+.coverage
+annotations.txt
diff --git a/CITATION.cff b/CITATION.cff
@@ -1,4 +1,4 @@
-cff-version: version = "25.7.1"
+cff-version: 1.2.0
 title: 'funannotate2: eukaryotic genome annotation'
 message: >-
   If you use this software, please cite it using the
@@ -17,5 +17,5 @@ keywords:
   - functional annotation
   - consensus gene models
 license: BSD-2-Clause
-version: version = "25.7.1"
+version: "25.7.1"
 date-released: '2025-07-01'
diff --git a/funannotate2/__main__.py b/funannotate2/__main__.py
@@ -514,6 +514,12 @@ def annotate_subparser(subparsers):
         help="Path to custom file with gene-specific annotations (tab-delimited: gene_id\tannotation_type\tannotation_value)",
         metavar="",
     )
+    optional_args.add_argument(
+        "--busco-lineage",
+        dest="busco_lineage",
+        help="BUSCO lineage to use, over-rides default auto selection",
+        metavar="",
+    )
     other_args = group.add_argument_group("Other arguments")
     other_args.add_argument(
         "-h",
diff --git a/funannotate2/annotate.py b/funannotate2/annotate.py
@@ -46,7 +46,9 @@
     lookup_taxonomy,
     naming_slug,
     get_odb_version,
+    validate_busco_lineage,
 )
+from .config import busco_taxonomy
 
 
 def _sortDict(d):
@@ -237,8 +239,19 @@ def annotate(args):
             # get taxonomy information
             taxonomy = lookup_taxonomy(args.species)
 
-        # choose best busco species
-        busco_species = choose_best_busco_species(taxonomy)
+        # validate and set busco lineage
+        if args.busco_lineage:
+            if not validate_busco_lineage(args.busco_lineage):
+                logger.critical(f"Invalid BUSCO lineage: {args.busco_lineage}")
+                logger.critical(
+                    f"Valid options are: {', '.join(sorted(busco_taxonomy.keys()))}"
+                )
+                raise SystemExit(1)
+            busco_species = args.busco_lineage
+            logger.info(f"Using user-specified BUSCO lineage: {busco_species}")
+        else:
+            # choose best busco species
+            busco_species = choose_best_busco_species(taxonomy)
         busco_model_path = os.path.join(
             env["FUNANNOTATE2_DB"], f"{busco_species}_{odb_version}"
         )
diff --git a/funannotate2/data/custom_annotations.template.txt b/funannotate2/data/custom_annotations.template.txt
@@ -0,0 +1,24 @@
+# Custom annotations for specific genes/transcripts
+# This file can be used with funannotate2 annotate --curated-names
+# Format: gene_id<tab>annotation_type<tab>annotation_value
+#
+# Annotation types:
+#   name - Gene name (replaces existing name)
+#   product - Product description (replaces existing product)
+#   note - Additional information (added to existing notes)
+#   go_term - Gene Ontology term (added to existing GO terms)
+#   ec_number - Enzyme Commission number (added to existing EC numbers)
+#   db_xref - Database cross-reference (added to existing db_xrefs)
+#
+# Note: For name and product, custom values replace existing ones
+# For other annotation types, custom values are added to existing ones
+#
+# Examples:
+gene123	name	ACT1
+gene123	product	Actin
+gene123	note	Manually curated annotation
+gene456	name	CDC42
+gene456	product	Cell division control protein 42
+gene789	go_term	GO:0005524
+gene789	ec_number	3.6.4.13
+# Add your custom annotations below:
diff --git a/funannotate2/predict.py b/funannotate2/predict.py
@@ -430,7 +430,7 @@ def predict(args):
     else:
         abinitio_scores = {}
         logger.info(
-            "Measuring assembly completeness with buscolite for all ab initio predictions"
+            f"Measuring assembly completeness with buscolite [lineage={os.path.basename(busco_model_path)}] for all ab initio predictions"
         )
         for ap in abinitio_preds:
             ProtPreds = os.path.join(misc_dir, os.path.basename(ap) + ".prots.fa")
@@ -568,7 +568,9 @@ def predict(args):
         "Annotation statistics:\n{}".format(json.dumps(consensus_stats, indent=2))
     )
     # we are finished here with coding sequences, lets check completeness
-    logger.info("Measuring assembly completeness with buscolite")
+    logger.info(
+        f"Measuring assembly completeness with buscolite [lineage={os.path.basename(busco_model_path)}]"
+    )
     d, m, stats, cfg = runbusco(
         finalProteins,
         busco_model_path,
diff --git a/funannotate2/train.py b/funannotate2/train.py
@@ -32,34 +32,10 @@
     which_path,
     get_odb_version,
     rename_gff_contigs,
+    validate_busco_lineage,
+    validate_augustus_species,
 )
-from .config import augustus_species, busco_taxonomy
-
-
-def validate_augustus_species(species_name):
-    """
-    Validate that the provided Augustus species is available in the config.
-
-    Parameters:
-    - species_name (str): The Augustus species name to validate
-
-    Returns:
-    - bool: True if valid, False otherwise
-    """
-    return species_name in augustus_species
-
-
-def validate_busco_lineage(lineage_name):
-    """
-    Validate that the provided BUSCO lineage is available in the config.
-
-    Parameters:
-    - lineage_name (str): The BUSCO lineage name to validate
-
-    Returns:
-    - bool: True if valid, False otherwise
-    """
-    return lineage_name in busco_taxonomy
+from .config import busco_taxonomy, augustus_species
 
 
 def train(args):
@@ -458,7 +434,9 @@ def count_multi_CDS_genes(indict):
     return len(indict), counter
 
 
-def selectTrainingModels(genome, train_dict, tmpdir="/tmp", flank_length=1000):
+def selectTrainingModels(
+    genome, train_dict, tmpdir="/tmp", flank_length=1000, mult_cds_threshold=0.65
+):
     """
     Filter and sort gene models from a GFF3 file based on completeness, non-overlapping nature, and number of exons.
 
@@ -472,14 +450,18 @@ def selectTrainingModels(genome, train_dict, tmpdir="/tmp", flank_length=1000):
     - train_dict (dict): A dictionary containing gene models from a GFF3 file.
     - tmpdir (str, optional): Temporary directory for intermediate files (default is "/tmp").
     - flank_length (int, optional): Length of flanking regions to include (default is 1000).
+    - mult_cds_threshold (float, optional): Ratio of multi-CDS genes to all training genes at or above which single-CDS genes are filtered out of training (default is 0.65).
 
     Returns:
     - dict: A dictionary of filtered and sorted gene models ready for training.
     """
 
-    def _sortDict(d):
+    def _sortDictCDS(d):
         return len(d[1]["CDS"][0])
 
+    def _sortDict(d):
+        return (d[1]["contig"], d[1]["location"][0])
+
     # setup interlap object
     gene_inter = defaultdict(InterLap)
 
@@ -492,9 +474,14 @@ def _sortDict(d):
     countGenes, countGenesCDS = count_multi_CDS_genes(train_dict)
     logger.debug(f"{countGenes} training set genes; {countGenesCDS} have multi-CDS")
 
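+    # ratio of multi-CDS genes to all training genes; NOTE(review): divides by zero if train_dict is empty -- confirm an earlier guard exists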
+    multiCDSratio = countGenesCDS / countGenes
     multiCDScheck = False
-    if countGenesCDS >= 20000:
+    if multiCDSratio >= mult_cds_threshold:
         multiCDScheck = True
+        logger.debug(
+            f"multi-CDS ratio is high ({multiCDSratio:.2f}), filtering out single-CDS genes for training"
+        )
 
     with open(proteins, "w") as protout:
         for k, v in natsorted(list(train_dict.items())):
@@ -592,8 +579,11 @@ def _sortDict(d):
                 else:
                     GenesPass[k] = v
 
-    # now sort dictionary number of exons
-    sGenes = sorted(iter(GenesPass.items()), key=_sortDict, reverse=True)
+    # sort by CDS count (descending) when multiCDScheck is set, otherwise by contig and location start
+    if multiCDScheck:
+        sGenes = sorted(iter(GenesPass.items()), key=_sortDictCDS, reverse=True)
+    else:
+        sGenes = sorted(iter(GenesPass.items()), key=_sortDict)
     sortedGenes = OrderedDict(sGenes)
     logger.info(
         "{:,} of {:,} models pass training parameters".format(
diff --git a/funannotate2/utilities.py b/funannotate2/utilities.py
@@ -292,6 +292,32 @@ def _download_ftp(url, file_name, timeout=60):
         return False
 
 
+def validate_augustus_species(species_name):
+    """
+    Validate that the provided Augustus species is available in the config.
+
+    Parameters:
+    - species_name (str): The Augustus species name to validate
+
+    Returns:
+    - bool: True if valid, False otherwise
+    """
+    return species_name in augustus_species
+
+
+def validate_busco_lineage(lineage_name):
+    """
+    Validate that the provided BUSCO lineage is available in the config.
+
+    Parameters:
+    - lineage_name (str): The BUSCO lineage name to validate
+
+    Returns:
+    - bool: True if valid, False otherwise
+    """
+    return lineage_name in busco_taxonomy
+
+
 def lookup_taxonomy(name):
     """
     Fetch taxonomy information for a given organism species name.