diff --git a/config/r2dt.config b/config/r2dt.config
index 2ba11e393..8109e4239 100644
--- a/config/r2dt.config
+++ b/config/r2dt.config
@@ -4,6 +4,7 @@ params {
     sequence_chunks = 4000
     data_chunk_size = 1024 * 1000 * 1000
     sequence_chunk_size = 1000
+    sequence_count = 2000000
     tablename = 'traveler_sequences_to_analyze'
     publish = "$baseDir/r2dt"
     container = 'rnacentral/r2dt:latest'
diff --git a/files/r2dt/attempted.ctl b/files/r2dt/attempted.ctl
index a3c211abf..957342089 100644
--- a/files/r2dt/attempted.ctl
+++ b/files/r2dt/attempted.ctl
@@ -1,11 +1,13 @@
 LOAD CSV
 FROM ALL FILENAMES MATCHING ~
 HAVING FIELDS (
-    urs
+    urs,
+    r2dt_version
 )
 INTO {{PGDATABASE}}?load_traveler_attempted
 TARGET COLUMNS (
-    urs
+    urs,
+    r2dt_version
 )

 WITH
@@ -19,7 +21,8 @@ DROP TABLE IF EXISTS load_traveler_attempted;
 $$,
 $$
 CREATE TABLE load_traveler_attempted (
-    urs text primary key
+    urs text primary key,
+    r2dt_version text
 );
 $$

@@ -27,15 +30,18 @@ AFTER LOAD DO
 $$
 INSERT INTO pipeline_tracking_traveler (
     urs,
-    last_run
+    last_run,
+    r2dt_version
 ) (
 SELECT
     load.urs,
-    NOW()
+    NOW(),
+    load.r2dt_version
 FROM load_traveler_attempted load
 ) ON CONFLICT (urs) DO UPDATE
-SET
-    last_run = EXCLUDED.last_run
+SET
+    last_run = EXCLUDED.last_run,
+    r2dt_version = EXCLUDED.r2dt_version
 ;
 $$
 ;
diff --git a/files/r2dt/find-sequences.sql b/files/r2dt/find-sequences.sql
index 5cf4df31f..c7b0043dd 100644
--- a/files/r2dt/find-sequences.sql
+++ b/files/r2dt/find-sequences.sql
@@ -5,7 +5,8 @@ SELECT
     'sequence', COALESCE(rna.seq_short, rna.seq_long)
   )
 FROM rna
-WHERE
+WHERE
+  not exists(select 1 from pipeline_tracking_traveler track where track.urs = rna.upi)
   AND rna.len < :max_len
+  LIMIT :sequence_count
 ) TO STDOUT;
diff --git a/files/r2dt/load-models.ctl b/files/r2dt/load-models.ctl
index 3c1b4ef53..5c6f2e596 100644
--- a/files/r2dt/load-models.ctl
+++ b/files/r2dt/load-models.ctl
@@ -1,11 +1,11 @@
 LOAD CSV
-FROM ALL FILENAMES MATCHING ~
+FROM ALL FILENAMES MATCHING ~
 HAVING FIELDS (
     model_name,
     taxid,
+    cellular_location,
     rna_type,
-    so_term,
-    cell_location,
+    so_term_id,
     model_source,
     model_length,
     model_basepair_count
@@ -13,9 +13,9 @@ HAVING FIELDS (
 TARGET COLUMNS (
     model_name,
     taxid,
+    cellular_location,
     rna_type,
-    so_term,
-    cell_location,
+    so_term_id,
     model_source,
     model_length,
     model_basepair_count
@@ -33,9 +33,9 @@ $$
 create table load_secondary_layout_models (
     model_name text NOT NULL,
     taxid int NOT NULL,
+    cellular_location text,
     rna_type text NOT NULL,
-    so_term text NOT NULL,
-    cell_location text,
+    so_term_id text NOT NULL,
     model_source text not null,
     model_length int,
     model_basepair_count int
@@ -47,9 +47,9 @@ $$
 INSERT INTO rnc_secondary_structure_layout_models (
     model_name,
     taxid,
+    cellular_location,
     rna_type,
     so_term_id,
-    cellular_location,
     model_source,
     model_length,
     model_basepair_count
@@ -57,14 +57,23 @@ INSERT INTO rnc_secondary_structure_layout_models (
 SELECT
     model_name,
     taxid,
+    cellular_location,
     rna_type,
-    so_term,
-    cell_location,
+    so_term_id,
     model_source,
     model_length,
     model_basepair_count
 FROM load_secondary_layout_models load
-) ON CONFLICT (model_name) DO NOTHING
+) ON CONFLICT (model_name) DO UPDATE
+SET
+    taxid = EXCLUDED.taxid,
+    cellular_location = EXCLUDED.cellular_location,
+    rna_type = EXCLUDED.rna_type,
+    so_term_id = EXCLUDED.so_term_id,
+    model_source = EXCLUDED.model_source,
+    model_length = EXCLUDED.model_length,
+    model_basepair_count = EXCLUDED.model_basepair_count
+
 ;
 $$,
 $$
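The attempted.ctl change above is an upsert keyed on urs that now also records which R2DT version was attempted. A minimal psycopg2 sketch of the same statement, for reference only; the DSN is a placeholder and the table and column names are taken from the ctl file:

import psycopg2

# Same upsert that attempted.ctl runs in its AFTER LOAD block; the DSN is a placeholder.
UPSERT = """
INSERT INTO pipeline_tracking_traveler (urs, last_run, r2dt_version)
SELECT load.urs, NOW(), load.r2dt_version
FROM load_traveler_attempted load
ON CONFLICT (urs) DO UPDATE
SET last_run = EXCLUDED.last_run,
    r2dt_version = EXCLUDED.r2dt_version
"""

def mark_attempted(dsn: str) -> None:
    with psycopg2.connect(dsn) as conn, conn.cursor() as cur:
        cur.execute(UPSERT)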
diff --git a/r2dt-scan.nf b/r2dt-scan.nf
new file mode 100644
index 000000000..227b3ce84
--- /dev/null
+++ b/r2dt-scan.nf
@@ -0,0 +1,130 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl=2
+
+include { r2dt } from './workflows/r2dt'
+
+
+process parse_gtrnadb_model {
+
+  input:
+  path(model_path)
+
+  output:
+  path("model_data.csv")
+
+  script:
+  """
+  rnac r2dt model-info gtrnadb $model_path model_data.csv
+  """
+}
+
+process parse_ribovision_models {
+
+  input:
+  val(ribovision_metadata_url)
+
+  output:
+  path("model_data.csv")
+
+  script:
+  """
+  wget $ribovision_metadata_url
+
+  rnac r2dt model-info ribovision metadata.tsv model_data.csv
+  """
+
+}
+
+process parse_rnasep_models {
+
+  input:
+  val(rnasep_metadata_url)
+
+  output:
+  path("model_data.csv")
+
+  script:
+  """
+  wget $rnasep_metadata_url
+  sed -i 's/\\tNRC-1\\t/\\t/g' metadata.tsv
+  rnac r2dt model-info rnase-p metadata.tsv model_data.csv
+  """
+
+}
+
+process parse_rfam_models {
+
+  input:
+  path(all_models)
+
+  output:
+  path("model_data.csv")
+
+  script:
+  """
+  rnac r2dt model-info rfam $all_models $PGDATABASE model_data.csv
+  """
+}
+
+
+process parse_crw_models {
+
+  input:
+  tuple path(all_models), val(metadata)
+
+  output:
+  path("model_data.csv")
+
+  script:
+  """
+  wget $metadata -O metadata.tsv
+  sed -i 's/taxid rna_type/taxid\trna_type/g' metadata.tsv
+  rnac r2dt model-info crw $all_models metadata.tsv model_data.csv
+  """
+}
+
+process load_models {
+
+  input:
+  path(all_data)
+  path(ctl)
+
+  output:
+  val('models loaded')
+
+  script:
+  """
+  split-and-load $ctl $all_data ${params.import_data.chunk_size}
+  """
+}
+
+
+workflow {
+  rfam_models = Channel.of("$baseDir/singularity/bind/r2dt/data/cms/rfam/all.cm")
+  crw_models = Channel.of("$baseDir/singularity/bind/r2dt/data/cms/crw/all.cm")
+  crw_metadata = Channel.of("https://raw.githubusercontent.com/RNAcentral/R2DT/v1.3/data/crw-metadata.tsv")
+  gtrnadb_models = Channel.fromPath("$baseDir/singularity/bind/r2dt/data/cms/gtrnadb/*.cm")
+  ribovision_lsu_metadata_url = Channel.of("https://raw.githubusercontent.com/RNAcentral/R2DT/v1.3/data/ribovision-lsu/metadata.tsv")
+  ribovision_ssu_metadata_url = Channel.of("https://raw.githubusercontent.com/RNAcentral/R2DT/v1.3/data/ribovision-ssu/metadata.tsv")
+
+  rnasep_metadata_url = Channel.of("https://raw.githubusercontent.com/RNAcentral/R2DT/v1.3/data/rnasep/metadata.tsv")
+
+  load_ctl = Channel.of("$baseDir/files/r2dt/load-models.ctl")
+
+  rfam_models | parse_rfam_models | set { rfam_data }
+  crw_models.combine(crw_metadata) | parse_crw_models | set { crw_data }
+  gtrnadb_models | parse_gtrnadb_model | collectFile() { csvfile -> [csvfile.name, csvfile.text] } | set { gtrnadb_data }
+  ribovision_lsu_metadata_url.mix(ribovision_ssu_metadata_url) | parse_ribovision_models | set { ribovision_data }
+  rnasep_metadata_url | parse_rnasep_models | set { rnasep_data }
+
+  rfam_data.mix(crw_data, gtrnadb_data, ribovision_data, rnasep_data) | collectFile() { csvfile -> [csvfile.name, csvfile.text] } | set { all_data }
+
+  load_models(all_data, load_ctl) | set { model_load }
+
+  model_load | r2dt | set { done }
+
+}
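Every parse_* process above is expected to emit model_data.csv rows with the eight columns that load-models.ctl declares. A rough sketch of one such row; the values are invented examples, only the column order comes from the ctl file:

import csv
import sys

# One illustrative model_data.csv row; values are made up, order follows load-models.ctl.
row = [
    "E-Thr",          # model_name
    "2759",           # taxid
    "cytosol",        # cellular_location
    "threonyl_tRNA",  # rna_type
    "SO:0000270",     # so_term_id
    "gtrnadb",        # model_source
    "72",             # model_length
    "",               # model_basepair_count (may be empty)
]
csv.writer(sys.stdout).writerow(row)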
diff --git a/rnacentral_pipeline/cli/r2dt.py b/rnacentral_pipeline/cli/r2dt.py
index cce9a7178..57acf1e73 100644
--- a/rnacentral_pipeline/cli/r2dt.py
+++ b/rnacentral_pipeline/cli/r2dt.py
@@ -138,12 +138,13 @@ def model_info():

 @model_info.command("crw")
 @click.argument("filename", type=click.File("r"))
+@click.argument("metadata_url", type=str)
 @click.argument("output", default="-", type=click.File("w"))
-def crw_model_info(filename, output):
+def crw_model_info(filename, metadata_url, output):
     """
     Parse the CRW metadata file and produce
     """
-    r2dt.write_crw(filename, output)
+    r2dt.write_crw(filename, metadata_url, output)


 @model_info.command("ribovision")
@@ -158,7 +159,7 @@ def ribovision_model_info(filename, output):


 @model_info.command("gtrnadb")
-@click.argument("filename", type=click.File("r"))
+@click.argument("filename", type=click.File())
 @click.argument("output", default="-", type=click.File("w"))
 def gtrnadb_model_info(filename, output):
     """
@@ -181,20 +182,23 @@ def rnase_p_model_info(filename, output):

 @model_info.command("rfam")
 @click.argument("filename", type=click.File("r"))
+@click.argument("db_url", type=str)
 @click.argument("output", default="-", type=click.File("w"))
-def rnase_p_model_info(filename, output):
+def rnase_p_model_info(filename, db_url, output):
     """
     Parse the metadata.tsv file from R2DT for Ribovision models to
     produce something we can put in our database.
     """
-    r2dt.write_rfam(filename, output)
+    r2dt.write_rfam(filename, db_url, output)


 @cli.command("create-attempted")
 @click.argument("filename", type=click.File("r"))
+@click.argument("version", type=click.File("r"))
 @click.argument("output", default="-", type=click.File("w"))
-def r2dt_create_attempted(filename, output):
-    attempted.r2dt(filename, output)
+def r2dt_create_attempted(filename, version, output):
+    version_string = version.read().strip()
+    attempted.r2dt(filename, version_string, output)


 @cli.command("publish")
diff --git a/rnacentral_pipeline/rnacentral/attempted.py b/rnacentral_pipeline/rnacentral/attempted.py
index 3c33d0368..f4ab67105 100644
--- a/rnacentral_pipeline/rnacentral/attempted.py
+++ b/rnacentral_pipeline/rnacentral/attempted.py
@@ -62,10 +62,14 @@ def parse_rfam_version(handle: ty.IO) -> str:
     raise ValueError(f"Could not find version in file {handle}")


-def write(data: ty.Iterable[ty.List[str]], output: ty.IO, require_attempt=True):
+def write(
+    data: ty.Iterable[ty.List[str]], output: ty.IO, require_attempt=True, version=None
+):
     writer = csv.writer(output)
     seen = False
     for row in data:
+        if version:
+            row.append(version)
         writer.writerow(row)
         seen = True
     if not seen:
@@ -88,6 +92,6 @@ def qa(handle: ty.IO, name: str, version_file: ty.IO, output: ty.IO):
     write(data, output)


-def r2dt(handle: ty.IO, output: ty.IO):
+def r2dt(handle: ty.IO, version: str, output: ty.IO):
     data = fasta_parser(handle)
-    write(data, output)
+    write(data, output, version=version)
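create-attempted now reads the file written by `r2dt.py version` and threads its contents through attempted.write(), which simply appends the version string to every row. A small illustration of that behaviour; the URS ids and version value are made up:

import csv
import io

rows = [["URS0000000001"], ["URS0000000002"]]  # roughly what fasta_parser() yields
version = "1.3"                                # contents of the version file

out = io.StringIO()
writer = csv.writer(out)
for row in rows:
    row.append(version)  # this is what write(..., version=...) adds per row
    writer.writerow(row)

print(out.getvalue())
# URS0000000001,1.3
# URS0000000002,1.3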
diff --git a/rnacentral_pipeline/rnacentral/r2dt/__init__.py b/rnacentral_pipeline/rnacentral/r2dt/__init__.py
index 951d294c2..8832a8ad5 100644
--- a/rnacentral_pipeline/rnacentral/r2dt/__init__.py
+++ b/rnacentral_pipeline/rnacentral/r2dt/__init__.py
@@ -21,13 +21,14 @@

 import joblib

-from rnacentral_pipeline.rnacentral.r2dt import parser
-from rnacentral_pipeline.rnacentral.r2dt import should_show
-from rnacentral_pipeline.rnacentral.r2dt.models import crw
-from rnacentral_pipeline.rnacentral.r2dt.models import gtrnadb
-from rnacentral_pipeline.rnacentral.r2dt.models import ribovision
-from rnacentral_pipeline.rnacentral.r2dt.models import rnase_p
-from rnacentral_pipeline.rnacentral.r2dt.models import rfam
+from rnacentral_pipeline.rnacentral.r2dt import parser, should_show
+from rnacentral_pipeline.rnacentral.r2dt.models import (
+    crw,
+    gtrnadb,
+    rfam,
+    ribovision,
+    rnase_p,
+)


 def parse(model_mapping: ty.TextIO, directory: str, allow_missing=False):
@@ -36,7 +37,10 @@


 def write(
-    model_mapping: ty.TextIO, directory: str, output: ty.TextIO, allow_missing=False
+    model_mapping: ty.TextIO,
+    directory: str,
+    output: ty.TextIO,
+    allow_missing=False,
 ):
     """
     Parse all the secondary structure data from the given directory and write
@@ -101,8 +105,8 @@ def prepare_s3(
         raw.write("\n")


-def write_model(generator, handle, output):
-    data = generator(handle)
+def write_model(generator, handle, output, extra=None):
+    data = generator(handle, extra=extra)
     data = (d.writeable() for d in data)
     csv.writer(output).writerows(data)
@@ -115,16 +119,16 @@ def write_ribovision(handle, output):
     return write_model(ribovision.parse, handle, output)


-def write_crw(handle, output):
-    return write_model(crw.parse, handle, output)
+def write_crw(handle, db_url, output):
+    return write_model(crw.parse, handle, output, extra=db_url)


 def write_rnase_p(handle, output):
     return write_model(rnase_p.parse, handle, output)


-def write_rfam(handle, output):
-    return write_model(rfam.parse, handle, output)
+def write_rfam(handle, db_url, output):
+    return write_model(rfam.parse, handle, output, extra=db_url)


 def write_should_show(model: Path, handle: ty.IO, db_url: str, output: ty.IO):
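write_model() now passes an extra argument (a metadata path or database URL) into each parser and serialises the resulting ModelInfo objects via writeable(), whose new layout is defined in data.py below. A sketch of that round trip; the model values are illustrative, only the keyword names come from the parsers in this diff:

from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo, Source

# Illustrative values; keyword names match the parsers in this diff.
info = ModelInfo(
    model_name="d.5.b.E.coli",
    so_rna_type="SO:0000652",
    taxid=562,
    source=Source.crw,
    length=120,
    basepairs=None,
    cell_location=None,
)
# Roughly: ["d.5.b.E.coli", 562, None, "cytosolic_5S_rRNA", "SO:0000652", "crw", 120, ...]
print(info.writeable())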
diff --git a/rnacentral_pipeline/rnacentral/r2dt/data.py b/rnacentral_pipeline/rnacentral/r2dt/data.py
index d9d1313d1..aeefdac04 100644
--- a/rnacentral_pipeline/rnacentral/r2dt/data.py
+++ b/rnacentral_pipeline/rnacentral/r2dt/data.py
@@ -20,11 +20,10 @@
 import typing as ty
 from pathlib import Path

-from Bio import SeqIO
-
 import attr
 from attr.validators import instance_of as is_a
 from attr.validators import optional
+from Bio import SeqIO

 from rnacentral_pipeline.databases.data import RibovoreResult

@@ -55,6 +54,105 @@
     "SO:0000267",
 }

+SO_RNA_NAME_LOOKUP = {
+    "SO:0000254": "alanyl_tRNA",
+    "SO:0000259": "glutaminyl_tRNA",
+    "SO:0000268": "prolyl_tRNA",
+    "SO:0000260": "glutamyl_tRNA",
+    "SO:0000266": "methionyl_tRNA",
+    "SO:0000256": "asparaginyl_tRNA",
+    "SO:0000270": "threonyl_tRNA",
+    "SO:0000261": "glycyl_tRNA",
+    "SO:0000273": "valyl_tRNA",
+    "SO:0000272": "tyrosyl_tRNA",
+    "SO:0000258": "cysteinyl_tRNA",
+    "SO:0000263": "isoleucyl_tRNA",
+    "SO:0000269": "seryl_tRNA",
+    "SO:0000264": "leucyl_tRNA",
+    "SO:0000271": "tryptophanyl_tRNA",
+    "SO:0005857": "selenocysteinyl_tRNA",
+    "SO:0000766": "pyrrolysyl_tRNA",
+    "SO:0000265": "lysyl_tRNA",
+    "SO:0000257": "aspartyl_tRNA",
+    "SO:0001036": "arginyl_tRNA",
+    "SO:0000262": "histidyl_tRNA",
+    "SO:0001172": "tRNA_region",
+    "SO:0002129": "mt_tRNA",
+    "SO:0000267": "phenylalanyl_tRNA",
+    "SO:0001001": "cytosolic_23S_rRNA",
+    "SO:0000386": "RNase_P_RNA",
+    "SO:0000650": "cytosolic_SSU_rRNA",
+    "SO:0000652": "cytosolic_5S_rRNA",
+    "SO:0000587": "group_I_intron",
+    "SO:0000603": "group_II_intron",
+    "SO:0000651": "cytosolic_LSU_rRNA",
+    "SO:0002128": "mt_rRNA",
+    "SO:0000407": "cytosolic_18S_rRNA",
+    "SO:0002345": "mt_LSU_rRNA",
+    "SO:0002344": "mt_SSU_rRNA",
+    "SO:0000391": "U1_snRNA",
+    "SO:0000392": "U2_snRNA",  # below here added...
+    "SO:0000380": "hammerhead_ribozyme",
+    "SO:0001179": "U3_snoRNA",
+    "SO:0000393": "U4_snRNA",
+    "SO:0000593": "C_D_box_snoRNA",
+    "SO:0000590": "SRP_RNA",
+    "SO:0000405": "Y_RNA",
+    "SO:0000584": "tmRNA",
+    "SO:0000390": "telomerase_RNA",
+    "SO:0000396": "U6_snRNA",
+    "SO:0001244": "pre_miRNA",
+    "SO:0000385": "RNase_MRP_RNA",
+    "SO:1001274": "SECIS_element",
+    "SO:0000205": "three_prime_UTR",
+    "SO:0000655": "ncRNA",
+    "SO:0000644": "antisense_RNA",
+    "SO:0000204": "five_prime_UTR",
+    "SO:0000594": "H_ACA_box_snoRNA",
+    "SO:0000035": "riboswitch",
+    "SO:0000243": "internal_ribosome_entry_site",
+    "SO:0000274": "snRNA",
+    "SO:0000275": "snoRNA",
+    "SO:0000374": "ribozyme",
+    "SO:0005836": "regulatory_region",
+    "SO:0001459": "CRISPR",
+    "SO:0001427": "cis_regulatory_frameshift_element",
+    "SO:1001268": "recoding_stimulatory_region",
+    "SO:0000370": "small_regulatory_ncRNA",
+    "SO:0000837": "UTR_region",
+    "SO:0001263": "ncRNA_gene",
+    # From Rfam table...
+    "SO:0000122": "RNA_sequence_secondary_structure",
+    "SO:0000140": "attenuator",
+    "SO:0000376": "RNA_6S",
+    "SO:0000377": "CsrB_RsmB_RNA",
+    "SO:0000378": "DsrA_RNA",
+    "SO:0000379": "GcvB_RNA",
+    "SO:0000383": "MicF_RNA",
+    "SO:0000384": "OxyS_RNA",
+    "SO:0000387": "RprA_RNA",
+    "SO:0000388": "RRE_RNA",
+    "SO:0000389": "spot_42_RNA",
+    "SO:0000394": "U4atac_snRNA",
+    "SO:0000395": "U5_snRNA",
+    "SO:0000397": "U6atac_snRNA",
+    "SO:0000398": "U11_snRNA",
+    "SO:0000399": "U12_snRNA",
+    "SO:0000404": "vault_RNA",
+    "SO:0000588": "autocatalytically_spliced_intron",
+    "SO:0000726": "repeat_unit",
+    "SO:0000836": "mRNA_region",
+    "SO:0000990": "class_I_RNA",
+    "SO:0001055": "transcriptional_cis_regulatory_region",
+    "SO:0001877": "lncRNA",
+}
+

 @enum.unique
 class Source(enum.Enum):
@@ -97,11 +195,14 @@ class ModelInfo(object):
     source: Source = attr.ib(validator=is_a(Source))
     length: ty.Optional[int] = attr.ib(validator=optional(is_a(int)))
     basepairs: ty.Optional[int] = attr.ib(validator=optional(is_a(int)))
+    cell_location: ty.Optional[str] = attr.ib(validator=optional(is_a(str)))

     def writeable(self):
         return [
             self.model_name,
             self.taxid,
+            self.cell_location,
+            SO_RNA_NAME_LOOKUP[self.so_rna_type],
             self.so_rna_type,
             self.source.name,
             self.length,
diff --git a/rnacentral_pipeline/rnacentral/r2dt/models/crw.py b/rnacentral_pipeline/rnacentral/r2dt/models/crw.py
index 9fcc73fc8..b1bcb9607 100644
--- a/rnacentral_pipeline/rnacentral/r2dt/models/crw.py
+++ b/rnacentral_pipeline/rnacentral/r2dt/models/crw.py
@@ -13,33 +13,35 @@
 limitations under the License.
 """

-import re
 import csv
+import re
+import typing as ty

-from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo
-from rnacentral_pipeline.rnacentral.r2dt.data import Source
+from rnacentral_pipeline.databases.helpers.phylogeny import FailedTaxonId, taxid
+from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo, Source

 SO_TERM_MAPPING = {
-    "16S": "SO:0000650",
-    "23S": "SO:0000651",
-    "5S": "SO:0000652",
-    "I": "SO:0000587",
-    "IA1": "SO:0000587",
-    "IA2": "SO:0000587",
-    "IB": "SO:0000587",
-    "IB1": "SO:0000587",
-    "IB2": "SO:0000587",
-    "IB4": "SO:0000587",
-    "IC1": "SO:0000587",
-    "IC2": "SO:0000587",
-    "IC3": "SO:0000587",
-    "ID": "SO:0000587",
-    "IE": "SO:0000587",
-    "IIA": "SO:0000603",
-    "IIB": "SO:0000603",
+    "rRNA_16S": "SO:0000650",
+    "rRNA_5S": "SO:0000652",
+    "group_I_intron": "SO:0000587",
+    "group_II_intron": "SO:0000603",
+    "large_subunit_rRNA": "SO:0000651",
+    "small_subunit_rRNA": "SO:0000650",
+    "mt_LSU_rRNA": "SO:0002345",
+    "mt_SSU_rRNA": "SO:0002344",
+    "rRNA_18S": "SO:0000407",
+    "rRNA_21S": "SO:0002345",
+    "rRNA_23S": "SO:0001001",
 }


+def load_metadata(handle: str):
+    metadata = {}
+    for row in csv.DictReader(open(handle), delimiter="\t"):
+        metadata[row["model_name"]] = {**row}
+    return metadata
+
+
 def as_so_term(raw):
     if raw in SO_TERM_MAPPING:
         return SO_TERM_MAPPING[raw]
@@ -60,24 +62,50 @@ def as_taxid(raw):
     return int(raw)


-def models(raw):
-    for model_id in raw["structure"].split(" "):
-        data = dict(raw)
-        model_id = re.sub(r"\.ps$", "", model_id)
-        data["model_id"] = model_id
-        yield data
-
-
-def parse(handle):
-    for row in csv.DictReader(handle, delimiter="\t"):
-        for info in models(row):
-            intronic = info["rna_type"] == "I"
-            yield ModelInfo(
-                model_id=info["model_id"],
-                is_intronic=intronic,
-                so_term=as_so_term(info["rna_class"]),
-                taxid=as_taxid(info["tax_id"]),
-                accessions=row["accession(s)"].split(","),
-                source=Source.crw,
-                cell_location=info["cell_location"],
-            )
+def parse_model(handle, metadata) -> ModelInfo:
+    length: ty.Optional[str] = None
+    model_name: ty.Optional[str] = None
+    for line in handle:
+        line = line.strip()
+        if line == "CM":
+            break
+        key, value = re.split(r"\s+", line, maxsplit=1)
+
+        if key == "NAME":
+            model_name = value
+        if key == "CLEN":
+            length = value
+
+    if not model_name:
+        raise ValueError("Invalid name")
+
+    if not length:
+        raise ValueError("Invalid length for: %s" % model_name)
+
+    taxonomy_id = int(metadata[model_name]["taxid"])
+    so_type_name = metadata[model_name]["rna_type"]
+    if so_type_name == "mt_rRNA":
+        if ".16." in model_name:
+            so_type_name = "mt_SSU_rRNA"
+        else:
+            so_type_name = "mt_LSU_rRNA"
+
+    return ModelInfo(
+        model_name=model_name,
+        so_rna_type=as_so_term(so_type_name),
+        taxid=taxonomy_id,
+        source=Source.crw,
+        length=int(length),
+        basepairs=None,
+        cell_location=None,
+    )
+
+
+def parse(handle, extra=None):
+    metadata = load_metadata(extra)
+    for line in handle:
+        if line.startswith("INFERNAL"):
+            try:
+                yield parse_model(handle, metadata)
+            except KeyError:
+                continue
diff --git a/rnacentral_pipeline/rnacentral/r2dt/models/gtrnadb.py b/rnacentral_pipeline/rnacentral/r2dt/models/gtrnadb.py
index d1243ed6d..9f63b76b5 100644
--- a/rnacentral_pipeline/rnacentral/r2dt/models/gtrnadb.py
+++ b/rnacentral_pipeline/rnacentral/r2dt/models/gtrnadb.py
@@ -13,13 +13,12 @@
 limitations under the License.
 """

-import re
 import csv
-import typing as ty
 import operator as op
+import re
+import typing as ty

-from rnacentral_pipeline.rnacentral.r2dt.data import Source
-from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo
+from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo, Source

 DOMAINS = {
     "arch": ("A", 2157),
@@ -85,27 +84,27 @@ def parse_model(handle) -> ModelInfo:
     short_domain, taxid = DOMAINS[domain]
     so_term = TYPES[trna_type]

-    model_id = "%s-%s" % (short_domain, trna_type)
+    model_name = "%s-%s" % (short_domain, trna_type)
     return ModelInfo(
-        model_id=model_id,
-        is_intronic=False,
-        so_term=so_term,
+        model_name=model_name,
+        so_rna_type=so_term,
         taxid=taxid,
-        accessions=[],
         source=Source.gtrnadb,
         length=int(length),
         cell_location=loc,
+        basepairs=None,
     )


-def parse(handle):
+def parse(handle, extra=None):
     for line in handle:
         if line.startswith("INFERNAL"):
             yield parse_model(handle)


 def write(handle, output):
+    data = parse(handle)
     data = map(op.methodcaller("writeable"), data)
     csv.writer(output).writerows(data)
diff --git a/rnacentral_pipeline/rnacentral/r2dt/models/rfam.py b/rnacentral_pipeline/rnacentral/r2dt/models/rfam.py
index 30faacdb7..c56c02bfc 100644
--- a/rnacentral_pipeline/rnacentral/r2dt/models/rfam.py
+++ b/rnacentral_pipeline/rnacentral/r2dt/models/rfam.py
@@ -13,37 +13,76 @@
 limitations under the License.
 """

-import typing as ty
 import csv
+import re
+import typing as ty
+
+import psycopg2
+import psycopg2.extras

 from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo, Source

 RFAM_QUERY = """
-select
+select
     rfam_model_id,
     'rfam',
     131567,
     so_rna_type,
-    rna_type
-from rfam_models
+    rna_type
+from rfam_models
 where
     so_rna_type is not null
 """


-def load_info(db_url: str) -> ty.Dict[str, ty.Tuple[str, int]]:
-    return {}
+def load_info(db_url: str) -> ty.Dict[str, ty.Tuple[str, int, str, str]]:
+    conn = psycopg2.connect(db_url)
+    cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
+    cur.execute(RFAM_QUERY)
+    res = {}
+    for result in cur:
+        res[result["rfam_model_id"]] = (result[1], result[2], result[3], result[4])
+    cur.close()
+    conn.close()
+    return res
+
+
+def parse_model(handle, known_info) -> ModelInfo:
+    length: ty.Optional[str] = None
+    model_name: ty.Optional[str] = None
+    for line in handle:
+        line = line.strip()
+        if line == "CM":
+            break
+        key, value = re.split(r"\s+", line, maxsplit=1)
+
+        if key == "ACC":
+            model_name = value
+        if key == "CLEN":
+            length = value
+
+    if not model_name:
+        raise ValueError("Invalid name")
+
+    if not length:
+        raise ValueError("Invalid length for: %s" % model_name)
+
+    return ModelInfo(
+        model_name=model_name,
+        so_rna_type=known_info[model_name][2],
+        taxid=known_info[model_name][1],
+        source=Source.rfam,
+        length=int(length),
+        basepairs=None,
+        cell_location=None,
+    )


-def parse(cm_stat: ty.IO, db_url: str) -> ty.Iterable[ModelInfo]:
-    known_info = load_info(db_url)
-    for row in csv.reader(cm_stat):
-        info = known_info[row[0]]
-        yield ModelInfo(
-            model_name=row[0],
-            so_rna_type=info[0],
-            taxid=info[1],
-            source=Source.rfam,
-            length=int(row[1]),
-            basepairs=int(row[2]),
-        )
+def parse(cm_stat: ty.IO, extra: str = None) -> ty.Iterable[ModelInfo]:
+    known_info = load_info(extra)
+    for line in cm_stat:
+        if line.startswith("INFERNAL"):
+            try:
+                yield parse_model(cm_stat, known_info)
+            except KeyError:
+                continue
diff --git a/rnacentral_pipeline/rnacentral/r2dt/models/ribovision.py b/rnacentral_pipeline/rnacentral/r2dt/models/ribovision.py
index a9fb7d36c..051a932b0 100644
--- a/rnacentral_pipeline/rnacentral/r2dt/models/ribovision.py
+++ b/rnacentral_pipeline/rnacentral/r2dt/models/ribovision.py
@@ -15,8 +15,8 @@

 import csv

-from rnacentral_pipeline.rnacentral.r2dt.data import Source
-from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo
+from rnacentral_pipeline.databases.helpers.phylogeny import FailedTaxonId, taxid
+from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo, Source


 def lookup_taxid(species):
@@ -56,7 +56,10 @@ def lookup_taxid(species):
         return 7227
     if species == "Trypanosoma brucei":
         return 5691
-    raise ValueError("Unknown species name: " + species)
+    try:
+        return taxid(species)
+    except FailedTaxonId:
+        raise ValueError("Unknown species name: " + species)


 def as_location(raw):
@@ -85,23 +88,22 @@ def so_term(row):
     raise ValueError("Could not figure out SO term for: %s" % row)


-def parse(handle):
+def parse(handle, extra=None):
     for row in csv.DictReader(handle, delimiter="\t"):
         so_term_id = so_term(row)
-        if not row["taxid"]:
+        if not row.get("taxid", None):
             taxid = lookup_taxid(row["species"])
         else:
             taxid = int(row["taxid"])
-        location = as_location(row["cellular_location"])
+        # location = as_location(row["cellular_location"])
         yield ModelInfo(
-            model_id=row["model_name"],
-            is_intronic=False,
-            so_term=so_term_id,
+            model_name=row["model_name"],
+            so_rna_type=so_term_id,
             taxid=taxid,
-            accessions=[],
             source=Source.ribovision,
-            cell_location=location,
+            cell_location=None,
             length=None,
+            basepairs=None,
         )
diff --git a/rnacentral_pipeline/rnacentral/r2dt/models/rnase_p.py b/rnacentral_pipeline/rnacentral/r2dt/models/rnase_p.py
index f63f53fe6..0a1547002 100644
--- a/rnacentral_pipeline/rnacentral/r2dt/models/rnase_p.py
+++ b/rnacentral_pipeline/rnacentral/r2dt/models/rnase_p.py
@@ -13,23 +13,22 @@
 limitations under the License.
 """

-import typing as ty
 import csv
+import typing as ty

 from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo, Source


-def parse(handle) -> ty.Iterable[ModelInfo]:
+def parse(handle, extra=None) -> ty.Iterable[ModelInfo]:
     for row in csv.DictReader(handle, delimiter="\t"):
         so_term_id = "SO:0000386"
         taxid = int(row["taxid"])
         yield ModelInfo(
             model_name=row["model_name"],
-            is_intronic=False,
-            so_term=so_term_id,
+            so_rna_type=so_term_id,
             taxid=taxid,
-            accessions=[],
             source=Source.rnase_p,
             cell_location=None,
             length=None,
+            basepairs=None,
         )
diff --git a/rnacentral_pipeline/rnacentral/r2dt/should_show.py b/rnacentral_pipeline/rnacentral/r2dt/should_show.py
index d27e18ed1..b83eb7cd5 100644
--- a/rnacentral_pipeline/rnacentral/r2dt/should_show.py
+++ b/rnacentral_pipeline/rnacentral/r2dt/should_show.py
@@ -20,14 +20,13 @@
 from pathlib import Path

 import joblib
-from more_itertools import chunked
 import pandas as pd
-from pypika import Table, Query
 import psycopg2
 import psycopg2.extras
+from more_itertools import chunked
+from pypika import Query, Table
 from sklearn.ensemble import RandomForestClassifier
-from sklearn.model_selection import cross_val_score
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import cross_val_score, train_test_split

 LOGGER = logging.getLogger(__name__)
diff --git a/workflows/r2dt.nf b/workflows/r2dt.nf
index 3be666626..3654aa5e2 100644
--- a/workflows/r2dt.nf
+++ b/workflows/r2dt.nf
@@ -32,6 +32,7 @@ process extract_sequences {
     -v ON_ERROR_STOP=1 \
     -v 'tablename=${params.r2dt.tablename}' \
     -v max_len=10000 \
+    -v 'sequence_count=${params.r2dt.sequence_count}' \
     -f "$query" "$PGDATABASE" > raw.json
   mkdir parts/
   split --number=l/4000 --additional-suffix='.json' raw.json parts/
@@ -59,17 +60,18 @@ process layout_sequences {
   memory params.r2dt.layout.memory
   container params.r2dt.container
   containerOptions "--bind ${params.r2dt.cms_path}:/rna/r2dt/data/cms"
-  errorStrategy { task.exitStatus = 130 ? 'ignore' : 'terminate' }
+  errorStrategy { task.exitStatus == 130 ? 'ignore' : 'finish' }

   input:
   path(sequences)

   output:
-  tuple path("$sequences"), path('output')
+  tuple path("$sequences"), path('output'), path('version')

   """
   esl-sfetch --index $sequences
   r2dt.py draw $sequences output/
+  r2dt.py version | perl -ne 'm/(\\d\\.\\d)/ && print "\$1\\n"' > version
   """
 }

@@ -80,7 +82,7 @@ process publish_layout {
   queue 'datamover'

   input:
-    tuple path(sequences), path(output), path(mapping)
+    tuple path(sequences), path(output), path(_version), path(mapping)

   output:
     val 'done', emit: flag
@@ -94,7 +96,7 @@ process publish_layout {

 process parse_layout {
   input:
-    tuple path(sequences), path(to_parse), path(mapping)
+    tuple path(sequences), path(to_parse), path(version), path(mapping)

   errorStrategy "ignore"

   output:
@@ -103,7 +105,7 @@ process parse_layout {

   """
   rnac r2dt process-svgs --allow-missing $mapping $to_parse data.csv
-  rnac r2dt create-attempted $sequences attempted.csv
+  rnac r2dt create-attempted $sequences $version attempted.csv
   """
 }