Skip to content

Commit 5c3b7ea

Browse files
author
Jon Palmer
committed
Add --busco-lineage option to annotate, improve training model selection, fix CITATION.cff format
1 parent f7b6379 commit 5c3b7ea

File tree

8 files changed

+101
-38
lines changed

8 files changed

+101
-38
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,5 @@ tests/unit/__pycache__*
77
tests/__pycache__*
88
tests/integration/__pycache__*
99
output.json
10+
.coverage
11+
annotations.txt

CITATION.cff

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
cff-version: version = "25.7.1"
1+
cff-version: 1.2.0
22
title: 'funannotate2: eukaryotic genome annotation'
33
message: >-
44
If you use this software, please cite it using the
@@ -17,5 +17,5 @@ keywords:
1717
- functional annotation
1818
- consensus gene models
1919
license: BSD-2-Clause
20-
version: version = "25.7.1"
20+
version: "25.7.1"
2121
date-released: '2025-07-01'

funannotate2/__main__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,12 @@ def annotate_subparser(subparsers):
514514
help="Path to custom file with gene-specific annotations (tab-delimited: gene_id\tannotation_type\tannotation_value)",
515515
metavar="",
516516
)
517+
optional_args.add_argument(
518+
"--busco-lineage",
519+
dest="busco_lineage",
520+
help="BUSCO lineage to use, over-rides default auto selection",
521+
metavar="",
522+
)
517523
other_args = group.add_argument_group("Other arguments")
518524
other_args.add_argument(
519525
"-h",

funannotate2/annotate.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,9 @@
4646
lookup_taxonomy,
4747
naming_slug,
4848
get_odb_version,
49+
validate_busco_lineage,
4950
)
51+
from .config import busco_taxonomy
5052

5153

5254
def _sortDict(d):
@@ -237,8 +239,19 @@ def annotate(args):
237239
# get taxonomy information
238240
taxonomy = lookup_taxonomy(args.species)
239241

240-
# choose best busco species
241-
busco_species = choose_best_busco_species(taxonomy)
242+
# validate and set busco lineage
243+
if args.busco_lineage:
244+
if not validate_busco_lineage(args.busco_lineage):
245+
logger.critical(f"Invalid BUSCO lineage: {args.busco_lineage}")
246+
logger.critical(
247+
f"Valid options are: {', '.join(sorted(busco_taxonomy.keys()))}"
248+
)
249+
raise SystemExit(1)
250+
busco_species = args.busco_lineage
251+
logger.info(f"Using user-specified BUSCO lineage: {busco_species}")
252+
else:
253+
# choose best busco species
254+
busco_species = choose_best_busco_species(taxonomy)
242255
busco_model_path = os.path.join(
243256
env["FUNANNOTATE2_DB"], f"{busco_species}_{odb_version}"
244257
)
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Custom annotations for specific genes/transcripts
2+
# This file can be used with funannotate2 annotate --curated-names
3+
# Format: gene_id<tab>annotation_type<tab>annotation_value
4+
#
5+
# Annotation types:
6+
# name - Gene name (replaces existing name)
7+
# product - Product description (replaces existing product)
8+
# note - Additional information (added to existing notes)
9+
# go_term - Gene Ontology term (added to existing GO terms)
10+
# ec_number - Enzyme Commission number (added to existing EC numbers)
11+
# db_xref - Database cross-reference (added to existing db_xrefs)
12+
#
13+
# Note: For name and product, custom values replace existing ones
14+
# For other annotation types, custom values are added to existing ones
15+
#
16+
# Examples:
17+
gene123 name ACT1
18+
gene123 product Actin
19+
gene123 note Manually curated annotation
20+
gene456 name CDC42
21+
gene456 product Cell division control protein 42
22+
gene789 go_term GO:0005524
23+
gene789 ec_number 3.6.4.13
24+
# Add your custom annotations below:

funannotate2/predict.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -430,7 +430,7 @@ def predict(args):
430430
else:
431431
abinitio_scores = {}
432432
logger.info(
433-
"Measuring assembly completeness with buscolite for all ab initio predictions"
433+
f"Measuring assembly completeness with buscolite [lineage={os.path.basename(busco_model_path)}] for all ab initio predictions"
434434
)
435435
for ap in abinitio_preds:
436436
ProtPreds = os.path.join(misc_dir, os.path.basename(ap) + ".prots.fa")
@@ -568,7 +568,9 @@ def predict(args):
568568
"Annotation statistics:\n{}".format(json.dumps(consensus_stats, indent=2))
569569
)
570570
# we are finished here with coding sequences, lets check completeness
571-
logger.info("Measuring assembly completeness with buscolite")
571+
logger.info(
572+
f"Measuring assembly completeness with buscolite [lineage={os.path.basename(busco_model_path)}]"
573+
)
572574
d, m, stats, cfg = runbusco(
573575
finalProteins,
574576
busco_model_path,

funannotate2/train.py

Lines changed: 22 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -32,34 +32,10 @@
3232
which_path,
3333
get_odb_version,
3434
rename_gff_contigs,
35+
validate_busco_lineage,
36+
validate_augustus_species,
3537
)
36-
from .config import augustus_species, busco_taxonomy
37-
38-
39-
def validate_augustus_species(species_name):
40-
"""
41-
Validate that the provided Augustus species is available in the config.
42-
43-
Parameters:
44-
- species_name (str): The Augustus species name to validate
45-
46-
Returns:
47-
- bool: True if valid, False otherwise
48-
"""
49-
return species_name in augustus_species
50-
51-
52-
def validate_busco_lineage(lineage_name):
53-
"""
54-
Validate that the provided BUSCO lineage is available in the config.
55-
56-
Parameters:
57-
- lineage_name (str): The BUSCO lineage name to validate
58-
59-
Returns:
60-
- bool: True if valid, False otherwise
61-
"""
62-
return lineage_name in busco_taxonomy
38+
from .config import busco_taxonomy, augustus_species
6339

6440

6541
def train(args):
@@ -458,7 +434,9 @@ def count_multi_CDS_genes(indict):
458434
return len(indict), counter
459435

460436

461-
def selectTrainingModels(genome, train_dict, tmpdir="/tmp", flank_length=1000):
437+
def selectTrainingModels(
438+
genome, train_dict, tmpdir="/tmp", flank_length=1000, mult_cds_threshold=0.65
439+
):
462440
"""
463441
Filter and sort gene models from a GFF3 file based on completeness, non-overlapping nature, and number of exons.
464442
@@ -472,14 +450,18 @@ def selectTrainingModels(genome, train_dict, tmpdir="/tmp", flank_length=1000):
472450
- train_dict (dict): A dictionary containing gene models from a GFF3 file.
473451
- tmpdir (str, optional): Temporary directory for intermediate files (default is "/tmp").
474452
- flank_length (int, optional): Length of flanking regions to include (default is 1000).
453+
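    - mult_cds_threshold (float, optional): Ratio of multi-CDS genes to all training genes at or above which the multi-CDS check is enabled and single-CDS genes are filtered out for training (default is 0.65).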
475454
476455
Returns:
477456
- dict: A dictionary of filtered and sorted gene models ready for training.
478457
"""
479458

480-
def _sortDict(d):
459+
def _sortDictCDS(d):
481460
return len(d[1]["CDS"][0])
482461

462+
def _sortDict(d):
463+
return (d[1]["contig"], d[1]["location"][0])
464+
483465
# setup interlap object
484466
gene_inter = defaultdict(InterLap)
485467

@@ -492,9 +474,14 @@ def _sortDict(d):
492474
countGenes, countGenesCDS = count_multi_CDS_genes(train_dict)
493475
logger.debug(f"{countGenes} training set genes; {countGenesCDS} have multi-CDS")
494476

477+
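    # calculate ratio of multi-CDS genes; NOTE(review): countGenes is len(train_dict), so an empty train_dict raises ZeroDivisionError here -- guard before dividing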
478+
multiCDSratio = countGenesCDS / countGenes
495479
multiCDScheck = False
496-
if countGenesCDS >= 20000:
480+
if multiCDSratio >= mult_cds_threshold:
497481
multiCDScheck = True
482+
logger.debug(
483+
f"multi-CDS ratio is high ({multiCDSratio:.2f}), filtering out single-CDS genes for training"
484+
)
498485

499486
with open(proteins, "w") as protout:
500487
for k, v in natsorted(list(train_dict.items())):
@@ -592,8 +579,11 @@ def _sortDict(d):
592579
else:
593580
GenesPass[k] = v
594581

595-
# now sort dictionary number of exons
596-
sGenes = sorted(iter(GenesPass.items()), key=_sortDict, reverse=True)
582+
    # now sort by number of CDS segments (descending) if multiCDScheck is set, otherwise by contig and start location
583+
if multiCDScheck:
584+
sGenes = sorted(iter(GenesPass.items()), key=_sortDictCDS, reverse=True)
585+
else:
586+
sGenes = sorted(iter(GenesPass.items()), key=_sortDict)
597587
sortedGenes = OrderedDict(sGenes)
598588
logger.info(
599589
"{:,} of {:,} models pass training parameters".format(

funannotate2/utilities.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,32 @@ def _download_ftp(url, file_name, timeout=60):
292292
return False
293293

294294

295+
def validate_augustus_species(species_name):
296+
"""
297+
Validate that the provided Augustus species is available in the config.
298+
299+
Parameters:
300+
- species_name (str): The Augustus species name to validate
301+
302+
Returns:
303+
- bool: True if valid, False otherwise
304+
"""
305+
return species_name in augustus_species
306+
307+
308+
def validate_busco_lineage(lineage_name):
309+
"""
310+
Validate that the provided BUSCO lineage is available in the config.
311+
312+
Parameters:
313+
- lineage_name (str): The BUSCO lineage name to validate
314+
315+
Returns:
316+
- bool: True if valid, False otherwise
317+
"""
318+
return lineage_name in busco_taxonomy
319+
320+
295321
def lookup_taxonomy(name):
296322
"""
297323
Fetch taxonomy information for a given organism species name.

0 commit comments

Comments
 (0)