From d0ce5e921764564e0b0e18056f15f86faf8594d1 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Tue, 25 Oct 2022 13:12:24 +0100 Subject: [PATCH 01/33] Changes to extract and propagate r2dt version info for hits Also, black has some changes to make --- files/r2dt/load.ctl | 16 +++++++---- rnacentral_pipeline/cli/r2dt.py | 8 ++++-- .../rnacentral/r2dt/__init__.py | 27 +++++++++++-------- rnacentral_pipeline/rnacentral/r2dt/data.py | 9 +++++-- rnacentral_pipeline/rnacentral/r2dt/parser.py | 4 +-- 5 files changed, 42 insertions(+), 22 deletions(-) diff --git a/files/r2dt/load.ctl b/files/r2dt/load.ctl index 1cd3e55ad..1091c7fd6 100644 --- a/files/r2dt/load.ctl +++ b/files/r2dt/load.ctl @@ -11,7 +11,8 @@ HAVING FIELDS ( sequence_start, sequence_stop, sequence_coverage, - inferred_should_show + inferred_should_show, + r2dt_version ) INTO {{PGDATABASE}}?load_secondary TARGET COLUMNS ( urs, @@ -24,7 +25,8 @@ TARGET COLUMNS ( sequence_start, sequence_stop, sequence_coverage, - inferred_should_show + inferred_should_show, + r2dt_version ) WITH @@ -49,7 +51,8 @@ create table load_secondary ( sequence_start int, sequence_stop int, sequence_coverage float, - inferred_should_show bool + inferred_should_show bool, + r2dt_version text ); $$ @@ -66,7 +69,8 @@ INSERT INTO rnc_secondary_structure_layout ( sequence_start, sequence_stop, sequence_coverage, - inferred_should_show + inferred_should_show, + r2dt_version ) ( SELECT urs, @@ -79,7 +83,8 @@ SELECT sequence_start, sequence_stop, sequence_coverage, - inferred_should_show + inferred_should_show, + r2dt_version FROM load_secondary ) ON CONFLICT (urs) DO UPDATE SET @@ -93,6 +98,7 @@ SET sequence_stop = EXCLUDED.sequence_stop, sequence_coverage = EXCLUDED.sequence_coverage, inferred_should_show = EXCLUDED.inferred_should_show + r2dt_version = EXCLUDED.r2dt_version ; $$, $$ diff --git a/rnacentral_pipeline/cli/r2dt.py b/rnacentral_pipeline/cli/r2dt.py index cce9a7178..7ff595fe2 100644 --- a/rnacentral_pipeline/cli/r2dt.py +++ 
b/rnacentral_pipeline/cli/r2dt.py @@ -33,13 +33,17 @@ def cli(): @click.option("--allow-missing", is_flag=True, default=False) @click.argument("model_info", type=click.File("r")) @click.argument("directory", type=click.Path()) +@click.argument("version", type=click.File("r")) @click.argument("output", type=click.File("w")) -def process_svgs(model_info, directory, output, allow_missing=False): +def process_svgs(model_info, directory, version, output, allow_missing=False): """ Process all SVG secondary structures in the given directory and produce a single data file that can be imported into the database. """ - r2dt.write(model_info, directory, output, allow_missing=allow_missing) + version_string = version.read().strip() + r2dt.write( + model_info, directory, version_string, output, allow_missing=allow_missing + ) @cli.group("should-show") diff --git a/rnacentral_pipeline/rnacentral/r2dt/__init__.py b/rnacentral_pipeline/rnacentral/r2dt/__init__.py index 951d294c2..d489e0d9e 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/__init__.py +++ b/rnacentral_pipeline/rnacentral/r2dt/__init__.py @@ -21,29 +21,34 @@ import joblib -from rnacentral_pipeline.rnacentral.r2dt import parser -from rnacentral_pipeline.rnacentral.r2dt import should_show -from rnacentral_pipeline.rnacentral.r2dt.models import crw -from rnacentral_pipeline.rnacentral.r2dt.models import gtrnadb -from rnacentral_pipeline.rnacentral.r2dt.models import ribovision -from rnacentral_pipeline.rnacentral.r2dt.models import rnase_p -from rnacentral_pipeline.rnacentral.r2dt.models import rfam +from rnacentral_pipeline.rnacentral.r2dt import parser, should_show +from rnacentral_pipeline.rnacentral.r2dt.models import ( + crw, + gtrnadb, + rfam, + ribovision, + rnase_p, +) -def parse(model_mapping: ty.TextIO, directory: str, allow_missing=False): +def parse(model_mapping: ty.TextIO, directory: str, version: str, allow_missing=False): path = Path(directory) - return parser.parse(model_mapping, path, 
allow_missing=allow_missing) + return parser.parse(model_mapping, path, version, allow_missing=allow_missing) def write( - model_mapping: ty.TextIO, directory: str, output: ty.TextIO, allow_missing=False + model_mapping: ty.TextIO, + directory: str, + version: str, + output: ty.TextIO, + allow_missing=False, ): """ Parse all the secondary structure data from the given directory and write it to the given file. """ - parsed = parse(model_mapping, directory, allow_missing=allow_missing) + parsed = parse(model_mapping, directory, version, allow_missing=allow_missing) writeable = (e.writeable() for e in parsed) csv.writer(output).writerows(writeable) diff --git a/rnacentral_pipeline/rnacentral/r2dt/data.py b/rnacentral_pipeline/rnacentral/r2dt/data.py index d9d1313d1..744a1c72d 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/data.py +++ b/rnacentral_pipeline/rnacentral/r2dt/data.py @@ -20,11 +20,10 @@ import typing as ty from pathlib import Path -from Bio import SeqIO - import attr from attr.validators import instance_of as is_a from attr.validators import optional +from Bio import SeqIO from rnacentral_pipeline.databases.data import RibovoreResult @@ -141,6 +140,7 @@ class R2DTResultInfo(object): db_info = attr.ib(validator=is_a(ModelDatabaseInfo)) source = attr.ib(validator=is_a(Source)) path = attr.ib(validator=is_a(Path)) + version = attr.ib(validator=is_a(str)) @property def model_name(self): @@ -269,6 +269,10 @@ def from_info(cls, info: R2DTResultInfo, hit_info=None): def urs(self): return self.info.urs + @property + def r2dt_version(self): + return self.info.version + @property def model_id(self): return self.info.model_db_id @@ -341,6 +345,7 @@ def writeable(self): sequence_stop, sequence_coverage, True, + self.r2dt_version, ] diff --git a/rnacentral_pipeline/rnacentral/r2dt/parser.py b/rnacentral_pipeline/rnacentral/r2dt/parser.py index bec61a023..fbc7a3843 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/parser.py +++ 
b/rnacentral_pipeline/rnacentral/r2dt/parser.py @@ -60,7 +60,7 @@ def load_hit_info(base: Path, allow_missing: bool): def parse( - info_path: ty.TextIO, base: Path, allow_missing=False + info_path: ty.TextIO, base: Path, version: str, allow_missing=False ) -> ty.Iterator[data.R2DTResult]: if not base.exists(): @@ -82,7 +82,7 @@ def parse( raise ValueError("No info for model %s", model_name) minfo = model_info[model_name] - info = data.R2DTResultInfo(urs, minfo, source, result_base) + info = data.R2DTResultInfo(urs, minfo, source, result_base, version) if info in seen: LOGGER.warn("Dupcliate line in metadata for, %s", info) continue From 2fd83ff3a57d578ed179e81ab60952da0c2ad291 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Tue, 25 Oct 2022 13:13:21 +0100 Subject: [PATCH 02/33] Version information for attempted --- files/r2dt/attempted.ctl | 12 ++++++++---- rnacentral_pipeline/cli/r2dt.py | 6 ++++-- rnacentral_pipeline/rnacentral/attempted.py | 10 +++++++--- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/files/r2dt/attempted.ctl b/files/r2dt/attempted.ctl index a3c211abf..f17730102 100644 --- a/files/r2dt/attempted.ctl +++ b/files/r2dt/attempted.ctl @@ -19,7 +19,8 @@ DROP TABLE IF EXISTS load_traveler_attempted; $$, $$ CREATE TABLE load_traveler_attempted ( - urs text primary key + urs text primary key, + r2dt_version text, ); $$ @@ -27,15 +28,18 @@ AFTER LOAD DO $$ INSERT INTO pipeline_tracking_traveler ( urs, - last_run + last_run, + r2dt_version ) ( SELECT load.urs, - NOW() + NOW(), + load.r2dt_version FROM load_traveler_attempted load ) ON CONFLICT (urs) DO UPDATE -SET +SET last_run = EXCLUDED.last_run + r2dt_version = EXCLUDED.r2dt_version ; $$ ; diff --git a/rnacentral_pipeline/cli/r2dt.py b/rnacentral_pipeline/cli/r2dt.py index 7ff595fe2..cdf83c1e2 100644 --- a/rnacentral_pipeline/cli/r2dt.py +++ b/rnacentral_pipeline/cli/r2dt.py @@ -196,9 +196,11 @@ def rnase_p_model_info(filename, output): @cli.command("create-attempted") 
@click.argument("filename", type=click.File("r")) +@click.argument("version", type=click.File("r")) @click.argument("output", default="-", type=click.File("w")) -def r2dt_create_attempted(filename, output): - attempted.r2dt(filename, output) +def r2dt_create_attempted(filename, version, output): + version_string = version.read().strip() + attempted.r2dt(filename, version_string, output) @cli.command("publish") diff --git a/rnacentral_pipeline/rnacentral/attempted.py b/rnacentral_pipeline/rnacentral/attempted.py index 3c33d0368..f4ab67105 100644 --- a/rnacentral_pipeline/rnacentral/attempted.py +++ b/rnacentral_pipeline/rnacentral/attempted.py @@ -62,10 +62,14 @@ def parse_rfam_version(handle: ty.IO) -> str: raise ValueError(f"Could not find version in file {handle}") -def write(data: ty.Iterable[ty.List[str]], output: ty.IO, require_attempt=True): +def write( + data: ty.Iterable[ty.List[str]], output: ty.IO, require_attempt=True, version=None +): writer = csv.writer(output) seen = False for row in data: + if version: + row.append(version) writer.writerow(row) seen = True if not seen: @@ -88,6 +92,6 @@ def qa(handle: ty.IO, name: str, version_file: ty.IO, output: ty.IO): write(data, output) -def r2dt(handle: ty.IO, output: ty.IO): +def r2dt(handle: ty.IO, version: str, output: ty.IO): data = fasta_parser(handle) - write(data, output) + write(data, output, version=version) From 82e0683644311864a5ea2ab79f37c3179924cdd2 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Tue, 25 Oct 2022 13:14:14 +0100 Subject: [PATCH 03/33] Tweak workflow to extract and propagate version info --- workflows/r2dt.nf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/workflows/r2dt.nf b/workflows/r2dt.nf index 3be666626..6fe9fa4a8 100644 --- a/workflows/r2dt.nf +++ b/workflows/r2dt.nf @@ -59,17 +59,18 @@ process layout_sequences { memory params.r2dt.layout.memory container params.r2dt.container containerOptions "--bind ${params.r2dt.cms_path}:/rna/r2dt/data/cms" - 
errorStrategy { task.exitStatus = 130 ? 'ignore' : 'terminate' } + errorStrategy { task.exitStatus = 130 ? 'ignore' : 'finish' } input: path(sequences) output: - tuple path("$sequences"), path('output') + tuple path("$sequences"), path('output'), path('version') """ esl-sfetch --index $sequences r2dt.py draw $sequences output/ + r2dt.py version | perl -ne 'm/(\d\.\d)/ && print "$1\n"' > version """ } @@ -94,7 +95,7 @@ process publish_layout { process parse_layout { input: - tuple path(sequences), path(to_parse), path(mapping) + tuple path(sequences), path(to_parse), path(version), path(mapping) errorStrategy "ignore" output: From 3d7c1403ff82a616aabad89bc54472825c151eee Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Tue, 25 Oct 2022 14:24:45 +0100 Subject: [PATCH 04/33] Remove version from r2dt hit info. It will only be stored in the attempted table now --- files/r2dt/load.ctl | 16 +++++----------- rnacentral_pipeline/cli/r2dt.py | 8 ++------ rnacentral_pipeline/rnacentral/r2dt/__init__.py | 7 +++---- rnacentral_pipeline/rnacentral/r2dt/data.py | 6 ------ rnacentral_pipeline/rnacentral/r2dt/parser.py | 4 ++-- 5 files changed, 12 insertions(+), 29 deletions(-) diff --git a/files/r2dt/load.ctl b/files/r2dt/load.ctl index 1091c7fd6..1cd3e55ad 100644 --- a/files/r2dt/load.ctl +++ b/files/r2dt/load.ctl @@ -11,8 +11,7 @@ HAVING FIELDS ( sequence_start, sequence_stop, sequence_coverage, - inferred_should_show, - r2dt_version + inferred_should_show ) INTO {{PGDATABASE}}?load_secondary TARGET COLUMNS ( urs, @@ -25,8 +24,7 @@ TARGET COLUMNS ( sequence_start, sequence_stop, sequence_coverage, - inferred_should_show, - r2dt_version + inferred_should_show ) WITH @@ -51,8 +49,7 @@ create table load_secondary ( sequence_start int, sequence_stop int, sequence_coverage float, - inferred_should_show bool, - r2dt_version text + inferred_should_show bool ); $$ @@ -69,8 +66,7 @@ INSERT INTO rnc_secondary_structure_layout ( sequence_start, sequence_stop, sequence_coverage, - 
inferred_should_show, - r2dt_version + inferred_should_show ) ( SELECT urs, @@ -83,8 +79,7 @@ SELECT sequence_start, sequence_stop, sequence_coverage, - inferred_should_show, - r2dt_version + inferred_should_show FROM load_secondary ) ON CONFLICT (urs) DO UPDATE SET @@ -98,7 +93,6 @@ SET sequence_stop = EXCLUDED.sequence_stop, sequence_coverage = EXCLUDED.sequence_coverage, inferred_should_show = EXCLUDED.inferred_should_show - r2dt_version = EXCLUDED.r2dt_version ; $$, $$ diff --git a/rnacentral_pipeline/cli/r2dt.py b/rnacentral_pipeline/cli/r2dt.py index cdf83c1e2..2fc993e24 100644 --- a/rnacentral_pipeline/cli/r2dt.py +++ b/rnacentral_pipeline/cli/r2dt.py @@ -33,17 +33,13 @@ def cli(): @click.option("--allow-missing", is_flag=True, default=False) @click.argument("model_info", type=click.File("r")) @click.argument("directory", type=click.Path()) -@click.argument("version", type=click.File("r")) @click.argument("output", type=click.File("w")) -def process_svgs(model_info, directory, version, output, allow_missing=False): +def process_svgs(model_info, directory, output, allow_missing=False): """ Process all SVG secondary structures in the given directory and produce a single data file that can be imported into the database. 
""" - version_string = version.read().strip() - r2dt.write( - model_info, directory, version_string, output, allow_missing=allow_missing - ) + r2dt.write(model_info, directory, output, allow_missing=allow_missing) @cli.group("should-show") diff --git a/rnacentral_pipeline/rnacentral/r2dt/__init__.py b/rnacentral_pipeline/rnacentral/r2dt/__init__.py index d489e0d9e..ff3870759 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/__init__.py +++ b/rnacentral_pipeline/rnacentral/r2dt/__init__.py @@ -31,15 +31,14 @@ ) -def parse(model_mapping: ty.TextIO, directory: str, version: str, allow_missing=False): +def parse(model_mapping: ty.TextIO, directory: str, allow_missing=False): path = Path(directory) - return parser.parse(model_mapping, path, version, allow_missing=allow_missing) + return parser.parse(model_mapping, path, allow_missing=allow_missing) def write( model_mapping: ty.TextIO, directory: str, - version: str, output: ty.TextIO, allow_missing=False, ): @@ -48,7 +47,7 @@ def write( it to the given file. 
""" - parsed = parse(model_mapping, directory, version, allow_missing=allow_missing) + parsed = parse(model_mapping, directory, allow_missing=allow_missing) writeable = (e.writeable() for e in parsed) csv.writer(output).writerows(writeable) diff --git a/rnacentral_pipeline/rnacentral/r2dt/data.py b/rnacentral_pipeline/rnacentral/r2dt/data.py index 744a1c72d..d71788968 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/data.py +++ b/rnacentral_pipeline/rnacentral/r2dt/data.py @@ -140,7 +140,6 @@ class R2DTResultInfo(object): db_info = attr.ib(validator=is_a(ModelDatabaseInfo)) source = attr.ib(validator=is_a(Source)) path = attr.ib(validator=is_a(Path)) - version = attr.ib(validator=is_a(str)) @property def model_name(self): @@ -269,10 +268,6 @@ def from_info(cls, info: R2DTResultInfo, hit_info=None): def urs(self): return self.info.urs - @property - def r2dt_version(self): - return self.info.version - @property def model_id(self): return self.info.model_db_id @@ -345,7 +340,6 @@ def writeable(self): sequence_stop, sequence_coverage, True, - self.r2dt_version, ] diff --git a/rnacentral_pipeline/rnacentral/r2dt/parser.py b/rnacentral_pipeline/rnacentral/r2dt/parser.py index fbc7a3843..bec61a023 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/parser.py +++ b/rnacentral_pipeline/rnacentral/r2dt/parser.py @@ -60,7 +60,7 @@ def load_hit_info(base: Path, allow_missing: bool): def parse( - info_path: ty.TextIO, base: Path, version: str, allow_missing=False + info_path: ty.TextIO, base: Path, allow_missing=False ) -> ty.Iterator[data.R2DTResult]: if not base.exists(): @@ -82,7 +82,7 @@ def parse( raise ValueError("No info for model %s", model_name) minfo = model_info[model_name] - info = data.R2DTResultInfo(urs, minfo, source, result_base, version) + info = data.R2DTResultInfo(urs, minfo, source, result_base) if info in seen: LOGGER.warn("Dupcliate line in metadata for, %s", info) continue From d044b85e311ea9854d7a9eb2b70f25b91a6fec7c Mon Sep 17 00:00:00 2001 From: Andrew 
Green Date: Wed, 26 Oct 2022 09:08:04 +0100 Subject: [PATCH 05/33] Add cell location as an optional parameter --- rnacentral_pipeline/rnacentral/r2dt/data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rnacentral_pipeline/rnacentral/r2dt/data.py b/rnacentral_pipeline/rnacentral/r2dt/data.py index d71788968..f739e77bf 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/data.py +++ b/rnacentral_pipeline/rnacentral/r2dt/data.py @@ -96,12 +96,14 @@ class ModelInfo(object): source: Source = attr.ib(validator=is_a(Source)) length: ty.Optional[int] = attr.ib(validator=optional(is_a(int))) basepairs: ty.Optional[int] = attr.ib(validator=optional(is_a(int))) + cell_location: ty.Optional[str] = attr.ib(validator=optional(is_a(str))) def writeable(self): return [ self.model_name, self.taxid, self.so_rna_type, + self.cell_location, self.source.name, self.length, self.basepairs, From 7dd45e8bed9ff8253fe1d6b827bb0629680dbc07 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Wed, 26 Oct 2022 09:08:36 +0100 Subject: [PATCH 06/33] Rfam cm parser to rnac database entry --- .../rnacentral/r2dt/models/rfam.py | 65 ++++++++++++++----- 1 file changed, 49 insertions(+), 16 deletions(-) diff --git a/rnacentral_pipeline/rnacentral/r2dt/models/rfam.py b/rnacentral_pipeline/rnacentral/r2dt/models/rfam.py index 30faacdb7..9a32a086a 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/models/rfam.py +++ b/rnacentral_pipeline/rnacentral/r2dt/models/rfam.py @@ -13,37 +13,70 @@ limitations under the License. 
""" -import typing as ty import csv +import typing as ty + +import psycopg2 +import psycopg2.extras from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo, Source RFAM_QUERY = """ -select +select rfam_model_id, 'rfam', 131567, so_rna_type, - rna_type -from rfam_models + rna_type +from rfam_models where so_rna_type is not null """ -def load_info(db_url: str) -> ty.Dict[str, ty.Tuple[str, int]]: - return {} +def load_info(db_url: str) -> ty.Dict[str, ty.Tuple[str, int, str, str]]: + conn = psycopg2.connect(db_url) + cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor) + cur.execute(RFAM_QUERY) + res = {} + for result in cur: + res[result["rfam_model_id"]] = (result[1], result[2], result[3], result[4]) + cur.close() + conn.close() + return res + + +def parse_model(handle, known_info) -> ModelInfo: + length: ty.Optional[str] = None + model_name: ty.Optional[str] = None + for line in handle: + line = line.strip() + if line == "CM": + break + key, value = re.split("\s+", line, maxsplit=1) + + if key == "ACC": + model_name = value + if key == "CLEN": + length = value + + if not model_name: + raise ValueError("Invalid name") + + if not length: + raise ValueError("Invalid length for: %s" % model_name) + + return ModelInfo( + model_name=model_name, + so_rna_type=known_info[model_name][2], + taxid=known_info[model_name][1], + source=Source.rfam, + length=int(length), + ) def parse(cm_stat: ty.IO, db_url: str) -> ty.Iterable[ModelInfo]: known_info = load_info(db_url) - for row in csv.reader(cm_stat): - info = known_info[row[0]] - yield ModelInfo( - model_name=row[0], - so_rna_type=info[0], - taxid=info[1], - source=Source.rfam, - length=int(row[1]), - basepairs=int(row[2]), - ) + for line in cm_stat: + if line.startswith("INFERNAL"): + yield parse_model(cm_stat, known_info) From 684031b7582be95e4895e2eaed6f24feea257230 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Wed, 26 Oct 2022 09:09:17 +0100 Subject: [PATCH 07/33] Fix gtrnadb parser for up to date model 
spec --- .../rnacentral/r2dt/models/gtrnadb.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/rnacentral_pipeline/rnacentral/r2dt/models/gtrnadb.py b/rnacentral_pipeline/rnacentral/r2dt/models/gtrnadb.py index d1243ed6d..ca0ec2bfd 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/models/gtrnadb.py +++ b/rnacentral_pipeline/rnacentral/r2dt/models/gtrnadb.py @@ -13,13 +13,12 @@ limitations under the License. """ -import re import csv -import typing as ty import operator as op +import re +import typing as ty -from rnacentral_pipeline.rnacentral.r2dt.data import Source -from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo +from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo, Source DOMAINS = { "arch": ("A", 2157), @@ -85,14 +84,12 @@ def parse_model(handle) -> ModelInfo: short_domain, taxid = DOMAINS[domain] so_term = TYPES[trna_type] - model_id = "%s-%s" % (short_domain, trna_type) + model_name = "%s-%s" % (short_domain, trna_type) return ModelInfo( - model_id=model_id, - is_intronic=False, - so_term=so_term, + model_name=model_name, + so_rna_type=so_term, taxid=taxid, - accessions=[], source=Source.gtrnadb, length=int(length), cell_location=loc, From 12a0ef3643dc5ff3f525e4bcfe9b5e36017f8c4b Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Wed, 26 Oct 2022 09:29:44 +0100 Subject: [PATCH 08/33] CM scanner for CRW --- .../rnacentral/r2dt/models/crw.py | 74 +++++++++++-------- 1 file changed, 42 insertions(+), 32 deletions(-) diff --git a/rnacentral_pipeline/rnacentral/r2dt/models/crw.py b/rnacentral_pipeline/rnacentral/r2dt/models/crw.py index 9fcc73fc8..76fb1b68d 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/models/crw.py +++ b/rnacentral_pipeline/rnacentral/r2dt/models/crw.py @@ -13,30 +13,16 @@ limitations under the License. 
""" -import re import csv +import re -from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo -from rnacentral_pipeline.rnacentral.r2dt.data import Source +from rnacentral_pipeline.databases.helpers.phylogeny import taxid +from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo, Source SO_TERM_MAPPING = { - "16S": "SO:0000650", - "23S": "SO:0000651", - "5S": "SO:0000652", - "I": "SO:0000587", - "IA1": "SO:0000587", - "IA2": "SO:0000587", - "IB": "SO:0000587", - "IB1": "SO:0000587", - "IB2": "SO:0000587", - "IB4": "SO:0000587", - "IC1": "SO:0000587", - "IC2": "SO:0000587", - "IC3": "SO:0000587", - "ID": "SO:0000587", - "IE": "SO:0000587", - "IIA": "SO:0000603", - "IIB": "SO:0000603", + "16": "SO:0000650", + "5": "SO:0000652", + "I1": "SO:0000587", } @@ -60,6 +46,39 @@ def as_taxid(raw): return int(raw) +def parse_model(handle) -> ModelInfo: + length: ty.Optional[str] = None + model_name: ty.Optional[str] = None + for line in handle: + line = line.strip() + if line == "CM": + break + key, value = re.split("\s+", line, maxsplit=1) + + if key == "NAME": + model_name = value + if key == "CLEN": + length = value + + if not model_name: + raise ValueError("Invalid name") + + if not length: + raise ValueError("Invalid length for: %s" % model_name) + + rna_type_key = model_name.split(".")[1] + + taxonomy_id = taxid(model_name.split(".")[3:4].join(" ")) + + return ModelInfo( + model_name=model_name, + so_rna_type=SO_TERM_MAPPING[rna_type_key], + taxid=taxonomy_id, + source=Source.rfam, + length=int(length), + ) + + def models(raw): for model_id in raw["structure"].split(" "): data = dict(raw) @@ -69,15 +88,6 @@ def models(raw): def parse(handle): - for row in csv.DictReader(handle, delimiter="\t"): - for info in models(row): - intronic = info["rna_type"] == "I" - yield ModelInfo( - model_id=info["model_id"], - is_intronic=intronic, - so_term=as_so_term(info["rna_class"]), - taxid=as_taxid(info["tax_id"]), - accessions=row["accession(s)"].split(","), - 
source=Source.crw, - cell_location=info["cell_location"], - ) + for line in handle: + if line.startswith("INFERNAL"): + yield parse_model(handle) From 205efcd6e07a966cf413018455649f7e4782aaa6 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Wed, 26 Oct 2022 12:33:56 +0100 Subject: [PATCH 09/33] Working parsers with correct forwarding of db_url --- .../rnacentral/r2dt/__init__.py | 12 ++--- .../rnacentral/r2dt/models/crw.py | 48 +++++++++++++++---- .../rnacentral/r2dt/models/gtrnadb.py | 4 +- .../rnacentral/r2dt/models/rfam.py | 12 +++-- 4 files changed, 57 insertions(+), 19 deletions(-) diff --git a/rnacentral_pipeline/rnacentral/r2dt/__init__.py b/rnacentral_pipeline/rnacentral/r2dt/__init__.py index ff3870759..8832a8ad5 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/__init__.py +++ b/rnacentral_pipeline/rnacentral/r2dt/__init__.py @@ -105,8 +105,8 @@ def prepare_s3( raw.write("\n") -def write_model(generator, handle, output): - data = generator(handle) +def write_model(generator, handle, output, extra=None): + data = generator(handle, extra=extra) data = (d.writeable() for d in data) csv.writer(output).writerows(data) @@ -119,16 +119,16 @@ def write_ribovision(handle, output): return write_model(ribovision.parse, handle, output) -def write_crw(handle, output): - return write_model(crw.parse, handle, output) +def write_crw(handle, db_url, output): + return write_model(crw.parse, handle, output, extra=db_url) def write_rnase_p(handle, output): return write_model(rnase_p.parse, handle, output) -def write_rfam(handle, output): - return write_model(rfam.parse, handle, output) +def write_rfam(handle, db_url, output): + return write_model(rfam.parse, handle, output, extra=db_url) def write_should_show(model: Path, handle: ty.IO, db_url: str, output: ty.IO): diff --git a/rnacentral_pipeline/rnacentral/r2dt/models/crw.py b/rnacentral_pipeline/rnacentral/r2dt/models/crw.py index 76fb1b68d..df9749e6d 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/models/crw.py +++ 
b/rnacentral_pipeline/rnacentral/r2dt/models/crw.py @@ -15,16 +15,42 @@ import csv import re +import typing as ty -from rnacentral_pipeline.databases.helpers.phylogeny import taxid +import psycopg2 +import psycopg2.extras +from Bio import SeqIO + +from rnacentral_pipeline.databases.helpers.phylogeny import FailedTaxonId, taxid from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo, Source SO_TERM_MAPPING = { "16": "SO:0000650", "5": "SO:0000652", "I1": "SO:0000587", + "I2": "SO:0000587", } +CRW_QUERY = """ +select xref.ac accession, rna.md5 md5, xref.taxid taxid, rnc_accessions.rna_type rna_type from rnc_accessions +join xref on xref.ac = rnc_accessions.accession +join rna on rna.upi = xref.upi +where xref.dbid = 45 +and xref.deleted = 'N' +""" + + +def load_info(db_url: str) -> ty.Dict[str, ty.Tuple[str, int, str]]: + conn = psycopg2.connect(db_url) + cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor) + cur.execute(CRW_QUERY) + res = {} + for result in cur: + res[result["accession"].split(":")[1]] = (result[1], result[2], result[3]) + cur.close() + conn.close() + return res + def as_so_term(raw): if raw in SO_TERM_MAPPING: @@ -46,7 +72,7 @@ def as_taxid(raw): return int(raw) -def parse_model(handle) -> ModelInfo: +def parse_model(handle, metadata) -> ModelInfo: length: ty.Optional[str] = None model_name: ty.Optional[str] = None for line in handle: @@ -66,16 +92,16 @@ def parse_model(handle) -> ModelInfo: if not length: raise ValueError("Invalid length for: %s" % model_name) - rna_type_key = model_name.split(".")[1] - - taxonomy_id = taxid(model_name.split(".")[3:4].join(" ")) + taxonomy_id = metadata[model_name][1] return ModelInfo( model_name=model_name, - so_rna_type=SO_TERM_MAPPING[rna_type_key], + so_rna_type=metadata[model_name][2], taxid=taxonomy_id, - source=Source.rfam, + source=Source.crw, length=int(length), + basepairs=None, + cell_location=None, ) @@ -87,7 +113,11 @@ def models(raw): yield data -def parse(handle): +def parse(handle, 
extra=None): + metadata = load_info(extra) for line in handle: if line.startswith("INFERNAL"): - yield parse_model(handle) + try: + yield parse_model(handle, metadata) + except KeyError: + continue diff --git a/rnacentral_pipeline/rnacentral/r2dt/models/gtrnadb.py b/rnacentral_pipeline/rnacentral/r2dt/models/gtrnadb.py index ca0ec2bfd..9f63b76b5 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/models/gtrnadb.py +++ b/rnacentral_pipeline/rnacentral/r2dt/models/gtrnadb.py @@ -93,16 +93,18 @@ def parse_model(handle) -> ModelInfo: source=Source.gtrnadb, length=int(length), cell_location=loc, + basepairs=None, ) -def parse(handle): +def parse(handle, extra=None): for line in handle: if line.startswith("INFERNAL"): yield parse_model(handle) def write(handle, output): + data = parse(handle) data = map(op.methodcaller("writeable"), data) csv.writer(output).writerows(data) diff --git a/rnacentral_pipeline/rnacentral/r2dt/models/rfam.py b/rnacentral_pipeline/rnacentral/r2dt/models/rfam.py index 9a32a086a..c56c02bfc 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/models/rfam.py +++ b/rnacentral_pipeline/rnacentral/r2dt/models/rfam.py @@ -14,6 +14,7 @@ """ import csv +import re import typing as ty import psycopg2 @@ -72,11 +73,16 @@ def parse_model(handle, known_info) -> ModelInfo: taxid=known_info[model_name][1], source=Source.rfam, length=int(length), + basepairs=None, + cell_location=None, ) -def parse(cm_stat: ty.IO, db_url: str) -> ty.Iterable[ModelInfo]: - known_info = load_info(db_url) +def parse(cm_stat: ty.IO, extra: str = None) -> ty.Iterable[ModelInfo]: + known_info = load_info(extra) for line in cm_stat: if line.startswith("INFERNAL"): - yield parse_model(cm_stat, known_info) + try: + yield parse_model(cm_stat, known_info) + except KeyError: + continue From 54c894ee516458dc97993ce9371ff2ab708b6e0d Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Wed, 26 Oct 2022 15:39:03 +0100 Subject: [PATCH 10/33] Use crw metadata file rather than database query and tidy up 
unused stuff --- .../rnacentral/r2dt/models/crw.py | 57 +++++++------------ 1 file changed, 19 insertions(+), 38 deletions(-) diff --git a/rnacentral_pipeline/rnacentral/r2dt/models/crw.py b/rnacentral_pipeline/rnacentral/r2dt/models/crw.py index df9749e6d..8a7aa8e57 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/models/crw.py +++ b/rnacentral_pipeline/rnacentral/r2dt/models/crw.py @@ -17,39 +17,28 @@ import re import typing as ty -import psycopg2 -import psycopg2.extras -from Bio import SeqIO - from rnacentral_pipeline.databases.helpers.phylogeny import FailedTaxonId, taxid from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo, Source SO_TERM_MAPPING = { - "16": "SO:0000650", - "5": "SO:0000652", - "I1": "SO:0000587", - "I2": "SO:0000587", + "rRNA_16S": "SO:0000650", + "rRNA_5S": "SO:0000652", + "group_I_intron": "SO:0000587", + "group_II_intron": "SO:0000603", + "large_subunit_rRNA": "SO:0000651", + "small_subunit_rRNA": "SO:0000650", + "mt_rRNA": "SO:0002128", + "rRNA_18S": "SO:0000407", + "rRNA_21S": "SO:0002345", + "rRNA_23S": "SO:0001001", } -CRW_QUERY = """ -select xref.ac accession, rna.md5 md5, xref.taxid taxid, rnc_accessions.rna_type rna_type from rnc_accessions -join xref on xref.ac = rnc_accessions.accession -join rna on rna.upi = xref.upi -where xref.dbid = 45 -and xref.deleted = 'N' -""" - -def load_info(db_url: str) -> ty.Dict[str, ty.Tuple[str, int, str]]: - conn = psycopg2.connect(db_url) - cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor) - cur.execute(CRW_QUERY) - res = {} - for result in cur: - res[result["accession"].split(":")[1]] = (result[1], result[2], result[3]) - cur.close() - conn.close() - return res +def load_metadata(handle: str): + metadata = {} + for row in csv.DictReader(open(handle), delimiter="\t"): + metadata[row["model_name"]] = {**row} + return metadata def as_so_term(raw): @@ -92,11 +81,11 @@ def parse_model(handle, metadata) -> ModelInfo: if not length: raise ValueError("Invalid length for: %s" % 
model_name) - taxonomy_id = metadata[model_name][1] + taxonomy_id = int(metadata[model_name]["taxid"]) return ModelInfo( model_name=model_name, - so_rna_type=metadata[model_name][2], + so_rna_type=as_so_term(metadata[model_name]["rna_type"]), taxid=taxonomy_id, source=Source.crw, length=int(length), @@ -105,19 +94,11 @@ def parse_model(handle, metadata) -> ModelInfo: ) -def models(raw): - for model_id in raw["structure"].split(" "): - data = dict(raw) - model_id = re.sub(r"\.ps$", "", model_id) - data["model_id"] = model_id - yield data - - def parse(handle, extra=None): - metadata = load_info(extra) + metadata = load_metadata(extra) for line in handle: if line.startswith("INFERNAL"): try: yield parse_model(handle, metadata) - except KeyError: + except KeyError as e: continue From 09a8289eba9ad6f7b01f91a7f5697d95a8198278 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Wed, 26 Oct 2022 15:39:48 +0100 Subject: [PATCH 11/33] Fix RNAse-p parser --- rnacentral_pipeline/rnacentral/r2dt/models/rnase_p.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/rnacentral_pipeline/rnacentral/r2dt/models/rnase_p.py b/rnacentral_pipeline/rnacentral/r2dt/models/rnase_p.py index f63f53fe6..0a1547002 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/models/rnase_p.py +++ b/rnacentral_pipeline/rnacentral/r2dt/models/rnase_p.py @@ -13,23 +13,22 @@ limitations under the License. 
""" -import typing as ty import csv +import typing as ty from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo, Source -def parse(handle) -> ty.Iterable[ModelInfo]: +def parse(handle, extra=None) -> ty.Iterable[ModelInfo]: for row in csv.DictReader(handle, delimiter="\t"): so_term_id = "SO:0000386" taxid = int(row["taxid"]) yield ModelInfo( model_name=row["model_name"], - is_intronic=False, - so_term=so_term_id, + so_rna_type=so_term_id, taxid=taxid, - accessions=[], source=Source.rnase_p, cell_location=None, length=None, + basepairs=None, ) From e51c9e100a44f3acc9e98015f47e4bf486ec8fba Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Wed, 26 Oct 2022 15:40:30 +0100 Subject: [PATCH 12/33] Fix ribovision parser and add lookup for more taxa --- .../rnacentral/r2dt/models/ribovision.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/rnacentral_pipeline/rnacentral/r2dt/models/ribovision.py b/rnacentral_pipeline/rnacentral/r2dt/models/ribovision.py index a9fb7d36c..051a932b0 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/models/ribovision.py +++ b/rnacentral_pipeline/rnacentral/r2dt/models/ribovision.py @@ -15,8 +15,8 @@ import csv -from rnacentral_pipeline.rnacentral.r2dt.data import Source -from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo +from rnacentral_pipeline.databases.helpers.phylogeny import FailedTaxonId, taxid +from rnacentral_pipeline.rnacentral.r2dt.data import ModelInfo, Source def lookup_taxid(species): @@ -56,7 +56,10 @@ def lookup_taxid(species): return 7227 if species == "Trypanosoma brucei": return 5691 - raise ValueError("Unknown species name: " + species) + try: + return taxid(species) + except FailedTaxonId: + raise ValueError("Unknown species name: " + species) def as_location(raw): @@ -85,23 +88,22 @@ def so_term(row): raise ValueError("Could not figure out SO term for: %s" % row) -def parse(handle): +def parse(handle, extra=None): for row in csv.DictReader(handle, delimiter="\t"): 
so_term_id = so_term(row) - if not row["taxid"]: + if not row.get("taxid", None): taxid = lookup_taxid(row["species"]) else: taxid = int(row["taxid"]) - location = as_location(row["cellular_location"]) + # location = as_location(row["cellular_location"]) yield ModelInfo( - model_id=row["model_name"], - is_intronic=False, - so_term=so_term_id, + model_name=row["model_name"], + so_rna_type=so_term_id, taxid=taxid, - accessions=[], source=Source.ribovision, - cell_location=location, + cell_location=None, length=None, + basepairs=None, ) From a29ed6a49ca9f8e7251ef686140c35936c237523 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Wed, 26 Oct 2022 15:41:53 +0100 Subject: [PATCH 13/33] Add metadata url to crw parser --- rnacentral_pipeline/cli/r2dt.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/rnacentral_pipeline/cli/r2dt.py b/rnacentral_pipeline/cli/r2dt.py index 2fc993e24..57acf1e73 100644 --- a/rnacentral_pipeline/cli/r2dt.py +++ b/rnacentral_pipeline/cli/r2dt.py @@ -138,12 +138,13 @@ def model_info(): @model_info.command("crw") @click.argument("filename", type=click.File("r")) +@click.argument("metadata_url", type=str) @click.argument("output", default="-", type=click.File("w")) -def crw_model_info(filename, output): +def crw_model_info(filename, metadata_url, output): """ Parse the CRW metadata file and produce """ - r2dt.write_crw(filename, output) + r2dt.write_crw(filename, metadata_url, output) @model_info.command("ribovision") @@ -158,7 +159,7 @@ def ribovision_model_info(filename, output): @model_info.command("gtrnadb") -@click.argument("filename", type=click.File("r")) +@click.argument("filename", type=click.File()) @click.argument("output", default="-", type=click.File("w")) def gtrnadb_model_info(filename, output): """ @@ -181,13 +182,14 @@ def rnase_p_model_info(filename, output): @model_info.command("rfam") @click.argument("filename", type=click.File("r")) +@click.argument("db_url", type=str) @click.argument("output", 
default="-", type=click.File("w")) -def rnase_p_model_info(filename, output): +def rnase_p_model_info(filename, db_url, output): """ Parse the metadata.tsv file from R2DT for Ribovision models to produce something we can put in our database. """ - r2dt.write_rfam(filename, output) + r2dt.write_rfam(filename, db_url, output) @cli.command("create-attempted") From 7883e8a454b00f6a3dc53e0c08374e0e1ec00463 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Wed, 26 Oct 2022 15:42:12 +0100 Subject: [PATCH 14/33] Fix bad escapes in r2dt version extraction --- workflows/r2dt.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/r2dt.nf b/workflows/r2dt.nf index 6fe9fa4a8..b837fa93b 100644 --- a/workflows/r2dt.nf +++ b/workflows/r2dt.nf @@ -70,7 +70,7 @@ process layout_sequences { """ esl-sfetch --index $sequences r2dt.py draw $sequences output/ - r2dt.py version | perl -ne 'm/(\d\.\d)/ && print "$1\n"' > version + r2dt.py version | perl -ne 'm/(\\d\\.\\d)/ && print "\$1\\n"' > version """ } From 98537a53e59c87c9b3d159a941efc45533016cf1 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Wed, 26 Oct 2022 15:42:58 +0100 Subject: [PATCH 15/33] Workflow to scan the database with r2dt Parses the model info and updates the models table first, then runs r2dt on everything --- r2dt-scan.nf | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 r2dt-scan.nf diff --git a/r2dt-scan.nf b/r2dt-scan.nf new file mode 100644 index 000000000..d437df0c9 --- /dev/null +++ b/r2dt-scan.nf @@ -0,0 +1,131 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl=2 + +include { r2dt } from './workflows/r2dt' + + +process parse_gtrnadb_model { + + input: + path(model_path) + output: + path("model_data.csv") + + script: + """ + rnac r2dt model-info gtrnadb $model_path model_data.csv + """ +} + +process parse_ribovision_models { + + input: + val(ribovision_metadata_url) + + + + output: + path("model_data.csv") + + script: + """ + 
wget $ribovision_metadata_url + + rnac r2dt model-info ribovision metadata.tsv model_data.csv + """ + +} + +process parse_rnasep_models { + + input: + val(rnasep_metadata_url) + + output: + path("model_data.csv") + + script: + """ + wget $rnasep_metadata_url + sed -i '' 's/\\tNRC-1\\t/\\t/g' metadata.tsv + rnac r2dt model-info rnase-p metadata.tsv model_data.csv + """ + +} + +process parse_rfam_models { + + input: + path(all_models) + output: + path("model_data.csv") + + script: + """ + rnac r2dt model-info rfam $all_models $PGDATABASE model_data.csv + """ +} + + +process parse_crw_models { + + input: + tuple path(all_models), val(metadata) + output: + path("model_data.csv") + + script: + """ + #wget $metadata -O metadata.tsv + #sed -i '' 's/taxid rna_type/taxid\trna_type/g' metadata.tsv + #rnac r2dt model-info crw $all_models metadata.tsv model_data.csv + rnac r2dt model-info crw $all_models $PGDATABASE model_data.csv + """ +} + +process load_models { + + input: + path(all_data) + path(ctl) + + output: + val('models loaded') + + script: + """ + split-and-load $ctl $all_data ${params.import_data.chunk_size} + """ +} + + + + + +workflow { + rfam_models = Channel.of("/Users/agreen/code/rnacentral-import-pipeline/singularity/bind/r2dt/cms/rfam/all.cm") + crw_models = Channel.of("$baseDir/singularity/bind/r2dt/cms/crw/all.cm") + crw_metadata = Channel.of("https://raw.githubusercontent.com/RNAcentral/R2DT/v1.3/data/crw-metadata.tsv") + gtrnadb_models = Channel.fromPath("/Users/agreen/code/rnacentral-import-pipeline/singularity/bind/r2dt/cms/gtrnadb/*.cm") + ribovision_lsu_metadata_url = Channel.of("https://raw.githubusercontent.com/RNAcentral/R2DT/v1.3/data/ribovision-lsu/metadata.tsv") + ribovision_ssu_metadata_url = Channel.of("https://raw.githubusercontent.com/RNAcentral/R2DT/v1.3/data/ribovision-ssu/metadata.tsv") + + rnasep_metadata_url = Channel.of("https://raw.githubusercontent.com/RNAcentral/R2DT/v1.3/data/rnasep/metadata.tsv") + + load_ctl = 
Channel.of("$baseDir/files/r2dt/load-models.ctl") + + rfam_models | parse_rfam_models | set { rfam_data } + crw_models.combine(crw_metadata) | parse_crw_models | set { crw_data } + gtrnadb_models | parse_gtrnadb_model | collectFile() {csvfile -> [csvfile.name, csvfile.text]} | set { gtrnadb_data } + ribovision_lsu_metadata_url.mix(ribovision_ssu_metadata_url) | parse_ribovision_models | set {ribovision_data } + rnasep_metadata_url | parse_rnasep_models | set {rnasep_data} + + rfam_data.mix(crw_data, gtrnadb_data, ribovision_data, rnasep_data) | collectFile() {csvfile -> [csvfile.name, csvfile.text]} | set { all_data } + + + load_models(all_data, load_ctl) | set { model_load } + + model_load | r2dt | set { done } + +} From 5e362e1848efa9739915d8b165d5c7195d7af85b Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Wed, 26 Oct 2022 16:27:07 +0100 Subject: [PATCH 16/33] Update models on conflict --- files/r2dt/load-models.ctl | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/files/r2dt/load-models.ctl b/files/r2dt/load-models.ctl index 3c1b4ef53..292d6f309 100644 --- a/files/r2dt/load-models.ctl +++ b/files/r2dt/load-models.ctl @@ -64,7 +64,16 @@ SELECT model_length, model_basepair_count FROM load_secondary_layout_models load -) ON CONFLICT (model_name) DO NOTHING +) ON CONFLICT (model_name) DO UPDATE +SET + taxid = EXCLUDED.taxid, + rna_type = EXCLUDED.rna_type, + so_term = EXCLUDED.so_term, + cell_location = EXCLUDED.cell_location, + model_source = EXCLUDED.model_source, + model_length = EXCLUDED.model_length, + model_basepair_count = EXCLUDED.model_basepair_count + ; $$, $$ From ffa4026b1fe423371f4c0a2fcd5be4ee05835e5d Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Wed, 26 Oct 2022 16:57:42 +0100 Subject: [PATCH 17/33] Switch pipeline back to use metadata --- r2dt-scan.nf | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/r2dt-scan.nf b/r2dt-scan.nf index d437df0c9..7531614ea 100644 --- a/r2dt-scan.nf +++ 
b/r2dt-scan.nf @@ -77,10 +77,9 @@ process parse_crw_models { script: """ - #wget $metadata -O metadata.tsv - #sed -i '' 's/taxid rna_type/taxid\trna_type/g' metadata.tsv - #rnac r2dt model-info crw $all_models metadata.tsv model_data.csv - rnac r2dt model-info crw $all_models $PGDATABASE model_data.csv + wget $metadata -O metadata.tsv + sed -i '' 's/taxid rna_type/taxid\trna_type/g' metadata.tsv + rnac r2dt model-info crw $all_models metadata.tsv model_data.csv """ } From 13ff828eed57bff410331dee5f78dc62e2738eca Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Wed, 26 Oct 2022 21:04:03 +0100 Subject: [PATCH 18/33] Fix sed command --- r2dt-scan.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r2dt-scan.nf b/r2dt-scan.nf index 7531614ea..4cd5186a2 100644 --- a/r2dt-scan.nf +++ b/r2dt-scan.nf @@ -48,7 +48,7 @@ process parse_rnasep_models { script: """ wget $rnasep_metadata_url - sed -i '' 's/\\tNRC-1\\t/\\t/g' metadata.tsv + sed -i 's/\\tNRC-1\\t/\\t/g' metadata.tsv rnac r2dt model-info rnase-p metadata.tsv model_data.csv """ @@ -78,7 +78,7 @@ process parse_crw_models { script: """ wget $metadata -O metadata.tsv - sed -i '' 's/taxid rna_type/taxid\trna_type/g' metadata.tsv + sed -i 's/taxid rna_type/taxid\trna_type/g' metadata.tsv rnac r2dt model-info crw $all_models metadata.tsv model_data.csv """ } From e3485ba60c396472ee324203cd54416a9836c835 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Wed, 26 Oct 2022 21:23:02 +0100 Subject: [PATCH 19/33] Fix cms paths --- r2dt-scan.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r2dt-scan.nf b/r2dt-scan.nf index 4cd5186a2..5c5cda82b 100644 --- a/r2dt-scan.nf +++ b/r2dt-scan.nf @@ -103,10 +103,10 @@ process load_models { workflow { - rfam_models = Channel.of("/Users/agreen/code/rnacentral-import-pipeline/singularity/bind/r2dt/cms/rfam/all.cm") + rfam_models = Channel.of("$baseDir/singularity/bind/r2dt/cms/rfam/all.cm") crw_models = 
Channel.of("$baseDir/singularity/bind/r2dt/cms/crw/all.cm") crw_metadata = Channel.of("https://raw.githubusercontent.com/RNAcentral/R2DT/v1.3/data/crw-metadata.tsv") - gtrnadb_models = Channel.fromPath("/Users/agreen/code/rnacentral-import-pipeline/singularity/bind/r2dt/cms/gtrnadb/*.cm") + gtrnadb_models = Channel.fromPath("$baseDir/singularity/bind/r2dt/cms/gtrnadb/*.cm") ribovision_lsu_metadata_url = Channel.of("https://raw.githubusercontent.com/RNAcentral/R2DT/v1.3/data/ribovision-lsu/metadata.tsv") ribovision_ssu_metadata_url = Channel.of("https://raw.githubusercontent.com/RNAcentral/R2DT/v1.3/data/ribovision-ssu/metadata.tsv") From 9443e2ea6113d74712e3a6f3c501f9e653d50ca5 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Thu, 27 Oct 2022 09:04:08 +0100 Subject: [PATCH 20/33] Fix cms path --- r2dt-scan.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r2dt-scan.nf b/r2dt-scan.nf index 5c5cda82b..227b3ce84 100644 --- a/r2dt-scan.nf +++ b/r2dt-scan.nf @@ -103,10 +103,10 @@ process load_models { workflow { - rfam_models = Channel.of("$baseDir/singularity/bind/r2dt/cms/rfam/all.cm") - crw_models = Channel.of("$baseDir/singularity/bind/r2dt/cms/crw/all.cm") + rfam_models = Channel.of("$baseDir/singularity/bind/r2dt/data/cms/rfam/all.cm") + crw_models = Channel.of("$baseDir/singularity/bind/r2dt/data/cms/crw/all.cm") crw_metadata = Channel.of("https://raw.githubusercontent.com/RNAcentral/R2DT/v1.3/data/crw-metadata.tsv") - gtrnadb_models = Channel.fromPath("$baseDir/singularity/bind/r2dt/cms/gtrnadb/*.cm") + gtrnadb_models = Channel.fromPath("$baseDir/singularity/bind/r2dt/data/cms/gtrnadb/*.cm") ribovision_lsu_metadata_url = Channel.of("https://raw.githubusercontent.com/RNAcentral/R2DT/v1.3/data/ribovision-lsu/metadata.tsv") ribovision_ssu_metadata_url = Channel.of("https://raw.githubusercontent.com/RNAcentral/R2DT/v1.3/data/ribovision-ssu/metadata.tsv") From cf91e65a6e78005f54bbbdb76e556d19265af740 Mon Sep 17 00:00:00 2001 From: Andrew 
Green Date: Thu, 27 Oct 2022 11:03:03 +0100 Subject: [PATCH 21/33] Add SO term to name lookup and reorganise writing --- rnacentral_pipeline/rnacentral/r2dt/data.py | 41 ++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/rnacentral_pipeline/rnacentral/r2dt/data.py b/rnacentral_pipeline/rnacentral/r2dt/data.py index f739e77bf..f0acfed77 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/data.py +++ b/rnacentral_pipeline/rnacentral/r2dt/data.py @@ -54,6 +54,44 @@ "SO:0000267", } +SO_RNA_NAME_LOOKUP = { + "SO:0000254": "alanyl_tRNA", + "SO:0000259": "glutaminyl_tRNA", + "SO:0000268": "prolyl_tRNA", + "SO:0000260": "glutamyl_tRNA", + "SO:0000266": "methionyl_tRNA", + "SO:0000256": "asparaginyl_tRNA", + "SO:0000270": "threonyl_tRNA", + "SO:0000261": "glycyl_tRNA", + "SO:0000273": "valyl_tRNA", + "SO:0000272": "tyrosyl_tRNA", + "SO:0000258": "cysteinyl_tRNA", + "SO:0000263": "isoleucyl_tRNA", + "SO:0000269": "seryl_tRNA", + "SO:0000264": "leucyl_tRNA", + "SO:0000271": "tryptophanyl_tRNA", + "SO:0005857": "selenocysteinyl_tRNA", + "SO:0000766": "pyrrolysyl_tRNA", + "SO:0000265": "lysyl_tRNA", + "SO:0000257": "aspartyl_tRNA", + "SO:0001036": "arginyl_tRNA", + "SO:0000262": "histidyl_tRNA", + "SO:0001172": "tRNA_region", + "SO:0002129": "mt_tRNA", + "SO:0000267": "phenylalanyl_tRNA", + "SO:0001001": "cytosolic_23S_rRNA", + "SO:0000386": "RNase_P_RNA", + "SO:0000650": "cytosolic_SSU_rRNA", + "SO:0000652": "cytosolic_5S_rRNA", + "SO:0000587": "group_I_intron", + "SO:0000603": "group_II_intron", + "SO:0000651": "cytosolic_LSU_rRNA", + "SO:0002128": "mt_rRNA", + "SO:0000407": "cytosolic_18S_rRNA", + "SO:0002345": "mt_LSU_rRNA", + "SO:0001001": "cytosolic_23S_rRNA", +} + @enum.unique class Source(enum.Enum): @@ -102,8 +140,9 @@ def writeable(self): return [ self.model_name, self.taxid, - self.so_rna_type, self.cell_location, + SO_RNA_NAME_LOOKUP[self.so_rna_type], + self.so_rna_type, self.source.name, self.length, self.basepairs, From 
e8aa886668267ed9b3eeaa19bf08ef8159d83e4b Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Thu, 27 Oct 2022 11:03:26 +0100 Subject: [PATCH 22/33] Fix filenames and field names in ctl --- files/r2dt/load-models.ctl | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/files/r2dt/load-models.ctl b/files/r2dt/load-models.ctl index 292d6f309..ab2adf8a0 100644 --- a/files/r2dt/load-models.ctl +++ b/files/r2dt/load-models.ctl @@ -1,11 +1,11 @@ LOAD CSV -FROM ALL FILENAMES MATCHING ~ +FROM ALL FILENAMES MATCHING ~ HAVING FIELDS ( model_name, taxid, - rna_type, - so_term, cell_location, + rna_type, + so_term_id, model_source, model_length, model_basepair_count @@ -13,9 +13,9 @@ HAVING FIELDS ( TARGET COLUMNS ( model_name, taxid, - rna_type, - so_term, cell_location, + rna_type, + so_term_id, model_source, model_length, model_basepair_count @@ -33,9 +33,9 @@ $$ create table load_secondary_layout_models ( model_name text NOT NULL, taxid int NOT NULL, - rna_type text NOT NULL, - so_term text NOT NULL, cell_location text, + rna_type text NOT NULL, + so_term_id text NOT NULL, model_source text not null, model_length int, model_basepair_count int @@ -47,9 +47,9 @@ $$ INSERT INTO rnc_secondary_structure_layout_models ( model_name, taxid, + cellular_location, rna_type, so_term_id, - cellular_location, model_source, model_length, model_basepair_count @@ -57,9 +57,9 @@ INSERT INTO rnc_secondary_structure_layout_models ( SELECT model_name, taxid, - rna_type, - so_term, cell_location, + rna_type, + so_term_id, model_source, model_length, model_basepair_count @@ -67,9 +67,9 @@ FROM load_secondary_layout_models load ) ON CONFLICT (model_name) DO UPDATE SET taxid = EXCLUDED.taxid, - rna_type = EXCLUDED.rna_type, - so_term = EXCLUDED.so_term, cell_location = EXCLUDED.cell_location, + rna_type = EXCLUDED.rna_type, + so_term_id = EXCLUDED.so_term_id, model_source = EXCLUDED.model_source, model_length = EXCLUDED.model_length, model_basepair_count = 
EXCLUDED.model_basepair_count From 10ee204df9b002a4932ce8353557ef7695b26974 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Thu, 27 Oct 2022 13:57:10 +0100 Subject: [PATCH 23/33] Add extra parsing to handle LSU and SSU for mt_rRNA Needed until https://github.com/RNAcentral/R2DT/issues/87 is resolved --- rnacentral_pipeline/rnacentral/r2dt/models/crw.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/rnacentral_pipeline/rnacentral/r2dt/models/crw.py b/rnacentral_pipeline/rnacentral/r2dt/models/crw.py index 8a7aa8e57..bbf1f1acf 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/models/crw.py +++ b/rnacentral_pipeline/rnacentral/r2dt/models/crw.py @@ -27,7 +27,8 @@ "group_II_intron": "SO:0000603", "large_subunit_rRNA": "SO:0000651", "small_subunit_rRNA": "SO:0000650", - "mt_rRNA": "SO:0002128", + "mt_LSU_rRNA": "SO:0002345", + "mt_SSU_rRNA": "SO:0002344", "rRNA_18S": "SO:0000407", "rRNA_21S": "SO:0002345", "rRNA_23S": "SO:0001001", @@ -82,10 +83,16 @@ def parse_model(handle, metadata) -> ModelInfo: raise ValueError("Invalid length for: %s" % model_name) taxonomy_id = int(metadata[model_name]["taxid"]) + so_type_name = metadata[model_name]["rna_type"] + if so_type_name == "mt_rRNA": + if model_name.contains(".16."): + so_type_name = "mt_SSU_rRNA" + else: + so_type_name = "mt_LSU_rRNA" return ModelInfo( model_name=model_name, - so_rna_type=as_so_term(metadata[model_name]["rna_type"]), + so_rna_type=as_so_term(so_type_name), taxid=taxonomy_id, source=Source.crw, length=int(length), From 30862c9e4cdd7f4dbb0074ea3870a4f9697610ec Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Fri, 28 Oct 2022 10:51:15 +0100 Subject: [PATCH 24/33] Update SO-rnaType name lookup with Rfam types --- rnacentral_pipeline/rnacentral/r2dt/data.py | 30 +++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/rnacentral_pipeline/rnacentral/r2dt/data.py b/rnacentral_pipeline/rnacentral/r2dt/data.py index f0acfed77..f577a8124 100644 --- 
a/rnacentral_pipeline/rnacentral/r2dt/data.py +++ b/rnacentral_pipeline/rnacentral/r2dt/data.py @@ -90,8 +90,38 @@ "SO:0000407": "cytosolic_18S_rRNA", "SO:0002345": "mt_LSU_rRNA", "SO:0001001": "cytosolic_23S_rRNA", + "SO:0000391": "U1_snRNA", + # From Rfam table... + "SO:0000122": "RNA_sequence_secondary_structure", + "SO:0000140": "attenuator", + "SO:0000376": "RNA_6S", + "SO:0000377": "CsrB_RsmB_RNA", + "SO:0000378": "DsrA_RNA", + "SO:0000379": "GcvB_RNA", + "SO:0000383": "MicF_RNA", + "SO:0000384": "OxyS_RNA", + "SO:0000387": "RprA_RNA", + "SO:0000388": "RRE_RNA", + "SO:0000389": "spot_42_RNA", + "SO:0000394": "U4atac_snRNA", + "SO:0000395": "U5_snRNA", + "SO:0000397": "U6atac_snRNA", + "SO:0000398": "U11_snRNA", + "SO:0000399": "U12_snRNA", + "SO:0000404": "vault_RNA", + "SO:0000588": "autocatalytically_spliced_intron", + "SO:0000726": "repeat_unit", + "SO:0000836": "mRNA_region", + "SO:0000990": "class_I_RNA", + "SO:0001055": "transcriptional_cis_regulatory_region", + "SO:0001877": "lncRNA", } +""" + + +""" + @enum.unique class Source(enum.Enum): From 9faf6c01ef63b8763e688cce653643c7df846903 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Fri, 28 Oct 2022 11:24:23 +0100 Subject: [PATCH 25/33] Adds another missing type to the mapping --- rnacentral_pipeline/rnacentral/r2dt/data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rnacentral_pipeline/rnacentral/r2dt/data.py b/rnacentral_pipeline/rnacentral/r2dt/data.py index f577a8124..e01bb31ee 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/data.py +++ b/rnacentral_pipeline/rnacentral/r2dt/data.py @@ -91,6 +91,7 @@ "SO:0002345": "mt_LSU_rRNA", "SO:0001001": "cytosolic_23S_rRNA", "SO:0000391": "U1_snRNA", + "SO:0000392": "U2_snRNA", # From Rfam table... "SO:0000122": "RNA_sequence_secondary_structure", "SO:0000140": "attenuator", From 70ec16794d6cadd4ab5d8e2280b153ed4189383c Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Fri, 28 Oct 2022 12:57:09 +0100 Subject: [PATCH 26/33] Check for .16. 
in model name properly --- rnacentral_pipeline/rnacentral/r2dt/models/crw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rnacentral_pipeline/rnacentral/r2dt/models/crw.py b/rnacentral_pipeline/rnacentral/r2dt/models/crw.py index bbf1f1acf..b1bcb9607 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/models/crw.py +++ b/rnacentral_pipeline/rnacentral/r2dt/models/crw.py @@ -85,7 +85,7 @@ def parse_model(handle, metadata) -> ModelInfo: taxonomy_id = int(metadata[model_name]["taxid"]) so_type_name = metadata[model_name]["rna_type"] if so_type_name == "mt_rRNA": - if model_name.contains(".16."): + if ".16." in model_name: so_type_name = "mt_SSU_rRNA" else: so_type_name = "mt_LSU_rRNA" From f06f883873728e0deddac3371aed3804ac8b1798 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Fri, 11 Nov 2022 11:02:50 +0000 Subject: [PATCH 27/33] Add sequence limit for r2dt processing Modified in params.r2dt.sequence_count Currently set to 2 million in config --- config/r2dt.config | 1 + files/r2dt/find-sequences.sql | 3 ++- workflows/r2dt.nf | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/config/r2dt.config b/config/r2dt.config index 2ba11e393..8109e4239 100644 --- a/config/r2dt.config +++ b/config/r2dt.config @@ -4,6 +4,7 @@ params { sequence_chunks = 4000 data_chunk_size = 1024 * 1000 * 1000 sequence_chunk_size = 1000 + sequence_count = 2000000 tablename = 'traveler_sequences_to_analyze' publish = "$baseDir/r2dt" container = 'rnacentral/r2dt:latest' diff --git a/files/r2dt/find-sequences.sql b/files/r2dt/find-sequences.sql index 5cf4df31f..c7b0043dd 100644 --- a/files/r2dt/find-sequences.sql +++ b/files/r2dt/find-sequences.sql @@ -5,7 +5,8 @@ SELECT 'sequence', COALESCE(rna.seq_short, rna.seq_long) ) FROM rna -WHERE +WHERE not exists(select 1 from pipeline_tracking_traveler track where track.urs = rna.upi) AND rna.len < :max_len + LIMIT :sequence_count ) TO STDOUT; diff --git a/workflows/r2dt.nf b/workflows/r2dt.nf index b837fa93b..cfc134c97 
100644 --- a/workflows/r2dt.nf +++ b/workflows/r2dt.nf @@ -32,6 +32,7 @@ process extract_sequences { -v ON_ERROR_STOP=1 \ -v 'tablename=${params.r2dt.tablename}' \ -v max_len=10000 \ + -v 'sequence_count=${params.r2dt.sequence_count}' \ -f "$query" "$PGDATABASE" > raw.json mkdir parts/ split --number=l/4000 --additional-suffix='.json' raw.json parts/ From 6309e3317cb2512949979d67fc0e3f4d7f12d376 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Fri, 11 Nov 2022 11:06:03 +0000 Subject: [PATCH 28/33] Add some missing SO -> name lookups From patches made during r2dt rescan --- rnacentral_pipeline/rnacentral/r2dt/data.py | 32 ++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/rnacentral_pipeline/rnacentral/r2dt/data.py b/rnacentral_pipeline/rnacentral/r2dt/data.py index e01bb31ee..aeefdac04 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/data.py +++ b/rnacentral_pipeline/rnacentral/r2dt/data.py @@ -89,9 +89,39 @@ "SO:0002128": "mt_rRNA", "SO:0000407": "cytosolic_18S_rRNA", "SO:0002345": "mt_LSU_rRNA", + "SO:0002344": "mt_SSU_rrNA", "SO:0001001": "cytosolic_23S_rRNA", "SO:0000391": "U1_snRNA", - "SO:0000392": "U2_snRNA", + "SO:0000392": "U2_snRNA", # below here added... 
+ "SO:0000380": "hammerhead_ribozyme", + "SO:0001179": "U3_snoRNA", + "SO:0000393": "U4_snRNA", + "SO:0000593": "C_D_box_snoRNA", + "SO:0000590": "SRP_RNA", + "SO:0000405": "Y_RNA", + "SO:0000584": "tmRNA", + "SO:0000390": "telomerase_RNA", + "SO:0000396": "U6_snRNA", + "SO:0001244": "pre_miRNA", + "SO:0000385": "RNase_MRP_RNA", + "SO:1001274": "SECIS_element", + "SO:0000205": "three_prime_UTR", + "SO:0000655": "ncRNA", + "SO:0000644": "antisense_RNA", + "SO:0000204": "five_prime_UTR", + "SO:0000594": "H_ACA_box_snoRNA", + "SO:0000035": "riboswitch", + "SO:0000243": "internal_ribosome_entry_site", + "SO:0000274": "snRNA", + "SO:0000275": "snoRNA", + "SO:0000374": "ribozyme", + "SO:0005836": "regulatory_region", + "SO:0001459": "CRISPR", + "SO:0001427": "cis_regulatory_frameshift_element", + "SO:1001268": "recoding_stimulatory_region", + "SO:0000370": "small_regulatory_ncRNA", + "SO:0000837": "UTR_region", + "SO:0001263": "ncRNA_gene", # From Rfam table... "SO:0000122": "RNA_sequence_secondary_structure", "SO:0000140": "attenuator", From 7cfffeac22055e25c2798ee6895f2dcf1fc03738 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Fri, 11 Nov 2022 11:07:43 +0000 Subject: [PATCH 29/33] Fix model loading ctl to be consistent with table cell_location -> cellular_location --- files/r2dt/load-models.ctl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/files/r2dt/load-models.ctl b/files/r2dt/load-models.ctl index ab2adf8a0..5c6f2e596 100644 --- a/files/r2dt/load-models.ctl +++ b/files/r2dt/load-models.ctl @@ -3,7 +3,7 @@ FROM ALL FILENAMES MATCHING ~ HAVING FIELDS ( model_name, taxid, - cell_location, + cellular_location, rna_type, so_term_id, model_source, @@ -13,7 +13,7 @@ HAVING FIELDS ( TARGET COLUMNS ( model_name, taxid, - cell_location, + cellular_location, rna_type, so_term_id, model_source, @@ -33,7 +33,7 @@ $$ create table load_secondary_layout_models ( model_name text NOT NULL, taxid int NOT NULL, - cell_location text, + cellular_location 
text, rna_type text NOT NULL, so_term_id text NOT NULL, model_source text not null, @@ -57,7 +57,7 @@ INSERT INTO rnc_secondary_structure_layout_models ( SELECT model_name, taxid, - cell_location, + cellular_location, rna_type, so_term_id, model_source, @@ -67,7 +67,7 @@ FROM load_secondary_layout_models load ) ON CONFLICT (model_name) DO UPDATE SET taxid = EXCLUDED.taxid, - cell_location = EXCLUDED.cell_location, + cellular_location = EXCLUDED.cellular_location, rna_type = EXCLUDED.rna_type, so_term_id = EXCLUDED.so_term_id, model_source = EXCLUDED.model_source, From 5df46d351729306a4ba0914f6ee7a3b5d7587c71 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Fri, 11 Nov 2022 15:59:47 +0000 Subject: [PATCH 30/33] Pass version correctly to create-attempted --- workflows/r2dt.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/r2dt.nf b/workflows/r2dt.nf index cfc134c97..bf9f0b0fe 100644 --- a/workflows/r2dt.nf +++ b/workflows/r2dt.nf @@ -105,7 +105,7 @@ process parse_layout { """ rnac r2dt process-svgs --allow-missing $mapping $to_parse data.csv - rnac r2dt create-attempted $sequences attempted.csv + rnac r2dt create-attempted $sequences $version attempted.csv """ } From 34e31c5912c92fce2b0d2c8c5a41fd081c4af801 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Fri, 11 Nov 2022 16:38:31 +0000 Subject: [PATCH 31/33] Ignore version in input to publish layout --- workflows/r2dt.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/r2dt.nf b/workflows/r2dt.nf index bf9f0b0fe..3654aa5e2 100644 --- a/workflows/r2dt.nf +++ b/workflows/r2dt.nf @@ -82,7 +82,7 @@ process publish_layout { queue 'datamover' input: - tuple path(sequences), path(output), path(mapping) + tuple path(sequences), path(output), path(_version), path(mapping) output: val 'done', emit: flag From 804528764a4dd37a68d1c806a36191c296a17b90 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Mon, 14 Nov 2022 16:46:43 +0000 Subject: [PATCH 32/33] Black 
reformatting --- rnacentral_pipeline/rnacentral/r2dt/should_show.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/rnacentral_pipeline/rnacentral/r2dt/should_show.py b/rnacentral_pipeline/rnacentral/r2dt/should_show.py index d27e18ed1..b83eb7cd5 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/should_show.py +++ b/rnacentral_pipeline/rnacentral/r2dt/should_show.py @@ -20,14 +20,13 @@ from pathlib import Path import joblib -from more_itertools import chunked import pandas as pd -from pypika import Table, Query import psycopg2 import psycopg2.extras +from more_itertools import chunked +from pypika import Query, Table from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import cross_val_score -from sklearn.model_selection import train_test_split +from sklearn.model_selection import cross_val_score, train_test_split LOGGER = logging.getLogger(__name__) From b7cd92b97981698881a31d09a4b6d0c2dfc89887 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Tue, 15 Nov 2022 15:17:45 +0000 Subject: [PATCH 33/33] Add version information to attempted ctl --- files/r2dt/attempted.ctl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/files/r2dt/attempted.ctl b/files/r2dt/attempted.ctl index f17730102..957342089 100644 --- a/files/r2dt/attempted.ctl +++ b/files/r2dt/attempted.ctl @@ -1,11 +1,13 @@ LOAD CSV FROM ALL FILENAMES MATCHING ~ HAVING FIELDS ( - urs + urs, + r2dt_version ) INTO {{PGDATABASE}}?load_traveler_attempted TARGET COLUMNS ( - urs + urs, + r2dt_version ) WITH @@ -20,7 +22,7 @@ $$, $$ CREATE TABLE load_traveler_attempted ( urs text primary key, - r2dt_version text, + r2dt_version text ); $$ @@ -38,7 +40,7 @@ SELECT FROM load_traveler_attempted load ) ON CONFLICT (urs) DO UPDATE SET - last_run = EXCLUDED.last_run + last_run = EXCLUDED.last_run, r2dt_version = EXCLUDED.r2dt_version ; $$