Skip to content

R2dt rescan #153

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 33 commits into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
d0ce5e9
Changes to extract and propagate r2dt version info for hits
afg1 Oct 25, 2022
2fd83ff
Version information for attempted
afg1 Oct 25, 2022
82e0683
Tweak workflow to extract and propagate version info
afg1 Oct 25, 2022
3d7c140
Remove version from r2dt hit info. It will only be stored in the atte…
afg1 Oct 25, 2022
d044b85
Add cell location as an optional parameter
afg1 Oct 26, 2022
7dd45e8
Rfam cm parser to rnac database entry
afg1 Oct 26, 2022
684031b
Fix gtrnadb parser for up to date model spec
afg1 Oct 26, 2022
12a0ef3
CM scanner for CRW
afg1 Oct 26, 2022
205efcd
Working parsers with correct forwarding of db_url
afg1 Oct 26, 2022
54c894e
Use crw metadata file rather than database query and tidy up unused s…
afg1 Oct 26, 2022
09a8289
Fix RNAse-p parser
afg1 Oct 26, 2022
e51c9e1
Fix ribovision parser and add lookup for more taxa
afg1 Oct 26, 2022
a29ed6a
Add metadata url to crw parser
afg1 Oct 26, 2022
7883e8a
Fix bad escapes in r2dt version extraction
afg1 Oct 26, 2022
98537a5
Workflow to scan the database with r2dt
afg1 Oct 26, 2022
5e362e1
Update models on conflict
afg1 Oct 26, 2022
ffa4026
Switch pipeline back to use metadata
afg1 Oct 26, 2022
13ff828
Fix sed command
afg1 Oct 26, 2022
e3485ba
Fix cms paths
afg1 Oct 26, 2022
9443e2e
Fix cms path
afg1 Oct 27, 2022
cf91e65
Add SO term to name lookup and reorganise writing
afg1 Oct 27, 2022
e8aa886
Fix filenames and field names in ctl
afg1 Oct 27, 2022
10ee204
Add extra parsing to handle LSU and SSU for mt_rRNA
afg1 Oct 27, 2022
30862c9
Update SO-rnaType name lookup with Rfam types
afg1 Oct 28, 2022
9faf6c0
Adds another missing type to the mapping
afg1 Oct 28, 2022
70ec167
Check for .16. in model name properly
afg1 Oct 28, 2022
f06f883
Add sequence limit for r2dt processing
afg1 Nov 11, 2022
6309e33
Add some missing SO -> name lookups
afg1 Nov 11, 2022
7cfffea
Fix model loading ctl to be consistent with table
afg1 Nov 11, 2022
5df46d3
Pass version correctly to create-attempted
afg1 Nov 11, 2022
34e31c5
Ignore version in input to publish layout
afg1 Nov 11, 2022
8045287
Black reformatting
afg1 Nov 14, 2022
b7cd92b
Add version information to attempted ctl
afg1 Nov 15, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/r2dt.config
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ params {
sequence_chunks = 4000
data_chunk_size = 1024 * 1000 * 1000
sequence_chunk_size = 1000
sequence_count = 2000000
tablename = 'traveler_sequences_to_analyze'
publish = "$baseDir/r2dt"
container = 'rnacentral/r2dt:latest'
Expand Down
20 changes: 13 additions & 7 deletions files/r2dt/attempted.ctl
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
LOAD CSV
FROM ALL FILENAMES MATCHING ~<r2dt-attempted.*csv$>
HAVING FIELDS (
urs
urs,
r2dt_version
)
INTO {{PGDATABASE}}?load_traveler_attempted
TARGET COLUMNS (
urs
urs,
r2dt_version
)

WITH
Expand All @@ -19,23 +21,27 @@ DROP TABLE IF EXISTS load_traveler_attempted;
$$,
$$
CREATE TABLE load_traveler_attempted (
urs text primary key
urs text primary key,
r2dt_version text
);
$$

AFTER LOAD DO
$$
INSERT INTO pipeline_tracking_traveler (
urs,
last_run
last_run,
r2dt_version
) (
SELECT
load.urs,
NOW()
NOW(),
load.r2dt_version
FROM load_traveler_attempted load
) ON CONFLICT (urs) DO UPDATE
SET
last_run = EXCLUDED.last_run
SET
last_run = EXCLUDED.last_run,
r2dt_version = EXCLUDED.r2dt_version
;
$$
;
3 changes: 2 additions & 1 deletion files/r2dt/find-sequences.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ SELECT
'sequence', COALESCE(rna.seq_short, rna.seq_long)
)
FROM rna
WHERE
WHERE
not exists(select 1 from pipeline_tracking_traveler track where track.urs = rna.upi)
AND rna.len < :max_len
LIMIT :sequence_count
) TO STDOUT;
31 changes: 20 additions & 11 deletions files/r2dt/load-models.ctl
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
LOAD CSV
FROM ALL FILENAMES MATCHING ~<models.*.csv$>
FROM ALL FILENAMES MATCHING ~<data.*.csv$>
HAVING FIELDS (
model_name,
taxid,
cellular_location,
rna_type,
so_term,
cell_location,
so_term_id,
model_source,
model_length,
model_basepair_count
) INTO {{PGDATABASE}}?load_secondary_layout_models
TARGET COLUMNS (
model_name,
taxid,
cellular_location,
rna_type,
so_term,
cell_location,
so_term_id,
model_source,
model_length,
model_basepair_count
Expand All @@ -33,9 +33,9 @@ $$
create table load_secondary_layout_models (
model_name text NOT NULL,
taxid int NOT NULL,
cellular_location text,
rna_type text NOT NULL,
so_term text NOT NULL,
cell_location text,
so_term_id text NOT NULL,
model_source text not null,
model_length int,
model_basepair_count int
Expand All @@ -47,24 +47,33 @@ $$
INSERT INTO rnc_secondary_structure_layout_models (
model_name,
taxid,
cellular_location,
rna_type,
so_term_id,
cellular_location,
model_source,
model_length,
model_basepair_count
) (
SELECT
model_name,
taxid,
cellular_location,
rna_type,
so_term,
cell_location,
so_term_id,
model_source,
model_length,
model_basepair_count
FROM load_secondary_layout_models load
) ON CONFLICT (model_name) DO NOTHING
) ON CONFLICT (model_name) DO UPDATE
SET
taxid = EXCLUDED.taxid,
cellular_location = EXCLUDED.cellular_location,
rna_type = EXCLUDED.rna_type,
so_term_id = EXCLUDED.so_term_id,
model_source = EXCLUDED.model_source,
model_length = EXCLUDED.model_length,
model_basepair_count = EXCLUDED.model_basepair_count

;
$$,
$$
Expand Down
130 changes: 130 additions & 0 deletions r2dt-scan.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#!/usr/bin/env nextflow

nextflow.enable.dsl=2

include { r2dt } from './workflows/r2dt'


// Parse one GtRNAdb covariance model file into model metadata rows.
// input:  model_path - a single .cm file staged into the task directory
// output: model_data.csv written by `rnac r2dt model-info gtrnadb`
// Every task emits a file with the same name; the workflow relies on that
// when it merges the per-model outputs with collectFile.
process parse_gtrnadb_model {

input:
path(model_path)
output:
path("model_data.csv")

script:
"""
rnac r2dt model-info gtrnadb $model_path model_data.csv
"""
}

/*
 * Download a RiboVision metadata table and convert it to model metadata rows.
 *
 * input:  ribovision_metadata_url - URL of a RiboVision metadata.tsv
 * output: model_data.csv written by `rnac r2dt model-info ribovision`
 */
process parse_ribovision_models {

    input:
    val(ribovision_metadata_url)

    output:
    path("model_data.csv")

    script:
    // Name the download explicitly: the parser call below hard-codes
    // metadata.tsv, so do not depend on wget deriving that name from the URL
    // (this also matches how parse_crw_models fetches its metadata).
    """
    wget "$ribovision_metadata_url" -O metadata.tsv

    rnac r2dt model-info ribovision metadata.tsv model_data.csv
    """

}

/*
 * Download the RNase P metadata table, clean it up and convert it to model
 * metadata rows.
 *
 * input:  rnasep_metadata_url - URL of the RNase P metadata.tsv
 * output: model_data.csv written by `rnac r2dt model-info rnase-p`
 */
process parse_rnasep_models {

    input:
    val(rnasep_metadata_url)

    output:
    path("model_data.csv")

    script:
    // Name the download explicitly: the sed and parser calls below hard-code
    // metadata.tsv, so do not depend on wget deriving that name from the URL.
    // The sed step collapses a tab-delimited "NRC-1" field into a single tab
    // before parsing. NOTE(review): presumably a stray extra column in the
    // upstream file - confirm against the current metadata.
    """
    wget "$rnasep_metadata_url" -O metadata.tsv
    sed -i 's/\\tNRC-1\\t/\\t/g' metadata.tsv
    rnac r2dt model-info rnase-p metadata.tsv model_data.csv
    """

}

// Parse the combined Rfam covariance model file into model metadata rows.
// input:  all_models - the Rfam all.cm file
// output: model_data.csv written by `rnac r2dt model-info rfam`
// The second CLI argument is the database URL (db_url in the rfam command).
// NOTE(review): PGDATABASE is not declared in this process; presumably it is
// resolved from the launch environment - confirm it is set where this runs.
process parse_rfam_models {

input:
path(all_models)
output:
path("model_data.csv")

script:
"""
rnac r2dt model-info rfam $all_models $PGDATABASE model_data.csv
"""
}


// Parse the CRW covariance models together with their metadata table.
// input:  tuple of (all_models, metadata) - the CRW all.cm file and the URL
//         of the CRW metadata TSV
// output: model_data.csv written by `rnac r2dt model-info crw`
process parse_crw_models {

input:
tuple path(all_models), val(metadata)
output:
path("model_data.csv")

script:
// The metadata is saved under a fixed name because the URL basename
// (crw-metadata.tsv) differs from the name used by the commands below.
// The sed step rewrites the "taxid rna_type" header so the two column names
// are tab-separated. Unlike parse_rnasep_models this uses a single-backslash
// \t, which Groovy turns into a literal tab before sed sees it.
// NOTE(review): confirm that difference is intended.
"""
wget $metadata -O metadata.tsv
sed -i 's/taxid rna_type/taxid\trna_type/g' metadata.tsv
rnac r2dt model-info crw $all_models metadata.tsv model_data.csv
"""
}

// Load the merged model metadata into the database.
// input:  all_data - the merged model CSV
//         ctl      - the control file describing the load
// output: the constant value 'models loaded', used only as a completion
//         signal for downstream steps
// NOTE(review): split-and-load and params.import_data.chunk_size are defined
// outside this file; presumably the helper splits the CSV into chunks of that
// size before loading - confirm.
process load_models {

input:
path(all_data)
path(ctl)

output:
val('models loaded')

script:
"""
split-and-load $ctl $all_data ${params.import_data.chunk_size}
"""
}





/*
 * Entry point: gather model metadata from every R2DT model source, merge it
 * into one CSV, load it into the database, and then hand over to the r2dt
 * workflow once the load has reported completion.
 */
workflow {
    // Base URL of the R2DT release that all remote metadata is read from.
    // Kept in one place so a version bump cannot leave the sources pointing
    // at different releases. Plain concatenation keeps the channel values as
    // ordinary Strings, exactly as before.
    r2dt_data_url = 'https://raw.githubusercontent.com/RNAcentral/R2DT/v1.3/data'

    rfam_models = Channel.of("$baseDir/singularity/bind/r2dt/data/cms/rfam/all.cm")
    crw_models = Channel.of("$baseDir/singularity/bind/r2dt/data/cms/crw/all.cm")
    crw_metadata = Channel.of(r2dt_data_url + '/crw-metadata.tsv')
    gtrnadb_models = Channel.fromPath("$baseDir/singularity/bind/r2dt/data/cms/gtrnadb/*.cm")
    ribovision_lsu_metadata_url = Channel.of(r2dt_data_url + '/ribovision-lsu/metadata.tsv')
    ribovision_ssu_metadata_url = Channel.of(r2dt_data_url + '/ribovision-ssu/metadata.tsv')

    rnasep_metadata_url = Channel.of(r2dt_data_url + '/rnasep/metadata.tsv')

    load_ctl = Channel.of("$baseDir/files/r2dt/load-models.ctl")

    // One parser per model source; each emits a file named model_data.csv.
    rfam_models | parse_rfam_models | set { rfam_data }
    crw_models.combine(crw_metadata) | parse_crw_models | set { crw_data }
    // GtRNAdb has one task per .cm file, so merge those outputs first.
    gtrnadb_models | parse_gtrnadb_model | collectFile() {csvfile -> [csvfile.name, csvfile.text]} | set { gtrnadb_data }
    ribovision_lsu_metadata_url.mix(ribovision_ssu_metadata_url) | parse_ribovision_models | set {ribovision_data }
    rnasep_metadata_url | parse_rnasep_models | set {rnasep_data}

    // Every source shares the same file name, so collectFile concatenates
    // them into a single CSV for loading.
    rfam_data.mix(crw_data, gtrnadb_data, ribovision_data, rnasep_data) | collectFile() {csvfile -> [csvfile.name, csvfile.text]} | set { all_data }


    load_models(all_data, load_ctl) | set { model_load }

    // The load's completion value gates the start of the r2dt workflow.
    model_load | r2dt | set { done }

}
18 changes: 11 additions & 7 deletions rnacentral_pipeline/cli/r2dt.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,12 +138,13 @@ def model_info():

@model_info.command("crw")
@click.argument("filename", type=click.File("r"))
@click.argument("metadata_url", type=str)
@click.argument("output", default="-", type=click.File("w"))
def crw_model_info(filename, output):
def crw_model_info(filename, metadata_url, output):
"""
Parse the CRW covariance models and their metadata file and produce
something we can put in our database.
"""
r2dt.write_crw(filename, output)
r2dt.write_crw(filename, metadata_url, output)


@model_info.command("ribovision")
Expand All @@ -158,7 +159,7 @@ def ribovision_model_info(filename, output):


@model_info.command("gtrnadb")
@click.argument("filename", type=click.File("r"))
@click.argument("filename", type=click.File())
@click.argument("output", default="-", type=click.File("w"))
def gtrnadb_model_info(filename, output):
"""
Expand All @@ -181,20 +182,23 @@ def rnase_p_model_info(filename, output):

@model_info.command("rfam")
@click.argument("filename", type=click.File("r"))
@click.argument("db_url", type=str)
@click.argument("output", default="-", type=click.File("w"))
def rnase_p_model_info(filename, output):
def rnase_p_model_info(filename, db_url, output):
"""
Parse the Rfam covariance model file used by R2DT to
produce something we can put in our database.
"""
r2dt.write_rfam(filename, output)
r2dt.write_rfam(filename, db_url, output)


@cli.command("create-attempted")
@click.argument("filename", type=click.File("r"))
@click.argument("version", type=click.File("r"))
@click.argument("output", default="-", type=click.File("w"))
def r2dt_create_attempted(filename, output):
attempted.r2dt(filename, output)
def r2dt_create_attempted(filename, version, output):
version_string = version.read().strip()
attempted.r2dt(filename, version_string, output)


@cli.command("publish")
Expand Down
10 changes: 7 additions & 3 deletions rnacentral_pipeline/rnacentral/attempted.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,14 @@ def parse_rfam_version(handle: ty.IO) -> str:
raise ValueError(f"Could not find version in file {handle}")


def write(data: ty.Iterable[ty.List[str]], output: ty.IO, require_attempt=True):
def write(
data: ty.Iterable[ty.List[str]], output: ty.IO, require_attempt=True, version=None
):
writer = csv.writer(output)
seen = False
for row in data:
if version:
row.append(version)
writer.writerow(row)
seen = True
if not seen:
Expand All @@ -88,6 +92,6 @@ def qa(handle: ty.IO, name: str, version_file: ty.IO, output: ty.IO):
write(data, output)


def r2dt(handle: ty.IO, output: ty.IO):
def r2dt(handle: ty.IO, version: str, output: ty.IO):
data = fasta_parser(handle)
write(data, output)
write(data, output, version=version)
Loading