diff --git a/bin/sra_runinfo_to_ftp.py b/bin/sra_runinfo_to_ftp.py deleted file mode 100755 index ef80ec80..00000000 --- a/bin/sra_runinfo_to_ftp.py +++ /dev/null @@ -1,162 +0,0 @@ -#!/usr/bin/env python - - -import argparse -import csv -import logging -import sys -from itertools import chain -from pathlib import Path - - -logger = logging.getLogger() - - -def parse_args(args=None): - Description = "Create samplesheet with FTP download links and md5ums from sample information obtained via 'sra_ids_to_runinfo.py' script." - Epilog = "Example usage: python sra_runinfo_to_ftp.py " - - parser = argparse.ArgumentParser(description=Description, epilog=Epilog) - parser.add_argument( - "files_in", - metavar="FILES_IN", - help="Comma-separated list of metadata file created from 'sra_ids_to_runinfo.py' script.", - ) - parser.add_argument( - "file_out", - metavar="FILE_OUT", - type=Path, - help="Output file containing paths to download FastQ files along with their associated md5sums.", - ) - parser.add_argument( - "-l", - "--log-level", - help="The desired log level (default WARNING).", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), - default="WARNING", - ) - return parser.parse_args(args) - - -def valid_fastq_extension(fastq): - return fastq.endswith("fastq.gz") - - -def parse_sra_runinfo(file_in): - runinfo = {} - columns = [ - "run_accession", - "experiment_accession", - "library_layout", - "fastq_ftp", - "fastq_md5", - ] - extensions = [ - "fastq_1", - "fastq_2", - "md5_1", - "md5_2", - "single_end", - ] - with open(file_in, "r", newline="") as fin: - reader = csv.DictReader(fin, delimiter="\t", skipinitialspace=True) - header = list(reader.fieldnames) - if missing := frozenset(columns).difference(frozenset(header)): - logger.critical(f"The following expected columns are missing from {file_in}: " f"{', '.join(missing)}.") - sys.exit(1) - for row in reader: - db_id = row["experiment_accession"] - if row["fastq_ftp"]: - fq_files = 
row["fastq_ftp"].split(";")[-2:] - fq_md5 = row["fastq_md5"].split(";")[-2:] - if len(fq_files) == 1: - assert fq_files[0].endswith(".fastq.gz"), f"Unexpected FastQ file format {file_in.name}." - if row["library_layout"] != "SINGLE": - logger.warning(f"The library layout '{row['library_layout']}' should be " f"'SINGLE'.") - sample = { - "fastq_1": fq_files[0], - "fastq_2": None, - "md5_1": fq_md5[0], - "md5_2": None, - "single_end": "true", - } - elif len(fq_files) == 2: - assert fq_files[0].endswith("_1.fastq.gz"), f"Unexpected FastQ file format {file_in.name}." - assert fq_files[1].endswith("_2.fastq.gz"), f"Unexpected FastQ file format {file_in.name}." - if row["library_layout"] != "PAIRED": - logger.warning(f"The library layout '{row['library_layout']}' should be " f"'PAIRED'.") - sample = { - "fastq_1": fq_files[0], - "fastq_2": fq_files[1], - "md5_1": fq_md5[0], - "md5_2": fq_md5[1], - "single_end": "false", - } - else: - raise RuntimeError(f"Unexpected number of FastQ files: {fq_files}.") - else: - # In some instances, FTP links don't exist for FastQ files. - # These have to be downloaded with the run accession using sra-tools. 
- sample = dict.fromkeys(extensions, None) - if row["library_layout"] == "SINGLE": - sample["single_end"] = "true" - elif row["library_layout"] == "PAIRED": - sample["single_end"] = "false" - - sample.update(row) - if db_id not in runinfo: - runinfo[db_id] = [sample] - else: - if sample in runinfo[db_id]: - logger.error( - f"Input run info file contains duplicate rows!\n" f"{', '.join([row[col] for col in header])}" - ) - else: - runinfo[db_id].append(sample) - - return runinfo, header + extensions - - -def sra_runinfo_to_ftp(files_in, file_out): - samplesheet = {} - header = [] - for file_in in files_in: - runinfo, sample_header = parse_sra_runinfo(file_in) - header.append(sample_header) - for db_id, rows in runinfo.items(): - if db_id not in samplesheet: - samplesheet[db_id] = rows - else: - logger.warning(f"Duplicate sample identifier found!\nID: '{db_id}'") - - # Create a combined header from all input files. - combined_header = header[0] + list(set().union(chain.from_iterable(header)).difference(header[0])) - combined_header.insert(0, "id") - - # Write samplesheet with paths to FastQ files and md5 sums. 
- if samplesheet: - with file_out.open("w", newline="") as fout: - writer = csv.DictWriter(fout, fieldnames=combined_header, delimiter="\t") - writer.writeheader() - for db_id in sorted(samplesheet): - for idx, row in enumerate(samplesheet[db_id], start=1): - row["id"] = f"{db_id}" - if "run_accession" in row: - row["id"] = f"{db_id}_{row['run_accession']}" - writer.writerow(row) - - -def main(args=None): - args = parse_args(args) - logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") - files = [Path(x.strip()) for x in args.files_in.split(",")] - for path in files: - if not path.is_file(): - logger.critical(f"The given input file {path} was not found!") - sys.exit(1) - args.file_out.parent.mkdir(parents=True, exist_ok=True) - sra_runinfo_to_ftp(files, args.file_out) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/modules/local/sra_runinfo_to_ftp/main.nf b/modules/local/sra_runinfo_to_ftp/main.nf index 9c83cf53..181af499 100644 --- a/modules/local/sra_runinfo_to_ftp/main.nf +++ b/modules/local/sra_runinfo_to_ftp/main.nf @@ -1,27 +1,131 @@ process SRA_RUNINFO_TO_FTP { - conda "conda-forge::python=3.9.5" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/python:3.9--1' : - 'biocontainers/python:3.9--1' }" - input: - path runinfo + path runinfo_file output: - path "*.tsv" , emit: tsv - path "versions.yml", emit: versions - - script: - """ - sra_runinfo_to_ftp.py \\ - ${runinfo.join(',')} \\ - ${runinfo.toString().tokenize(".")[0]}.runinfo_ftp.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ + path "*.runinfo_ftp.tsv", emit: tsv + + exec: + def (runinfo, header) = parseSraRuninfo(runinfo_file) + header.add(0, "id") + + def samplesheet = [:] + runinfo.each { db_id, rows -> + if( db_id !in samplesheet ) + samplesheet[db_id] = rows + else + log.warn("Duplicate sample identifier found -- ID: '${db_id}'") + } + + def prefix = runinfo_file.name.tokenize(".")[0] + def file_out = task.workDir.resolve("${prefix}.runinfo_ftp.tsv") + file_out << header.join("\t") << "\n" + + samplesheet + .sort { id, _rows -> id } + .each { id, rows -> + rows.each { row -> + row.id = row.run_accession + ? 
"${id}_${row.run_accession}" + : id + def values = header.collect { k -> row[k] } + file_out << values.join("\t") << "\n" + } + } +} + + +def parseSraRuninfo(file_in) { + def runinfo = [:] + def columns = [ + "run_accession", + "experiment_accession", + "library_layout", + "fastq_ftp", + "fastq_md5", + ] + def records = file_in.splitCsv(header: true, sep: "\t") + def header = file_in.readLines().first().tokenize("\t") + def missing = columns.findAll { c -> c !in header } + if( missing ) + throw new Exception("The following expected columns are missing from ${file_in}: ${missing.join(', ')}.") + + records.each { row -> + def db_id = row.experiment_accession + def sample = getSample(row, file_in.name) + + sample.putAll(row) + if( db_id !in runinfo ) { + runinfo[db_id] = [sample] + } + else { + if( sample in runinfo[db_id] ) + log.error("Input run info file contains duplicate rows: ${row}") + else + runinfo[db_id].append(sample) + } + } + + return [ runinfo, (header + getExtensions()).unique() ] +} + + +def getSample(row, filename) { + if( row.fastq_ftp ) { + def fq_files = row.fastq_ftp.tokenize(";") + def fq_md5 = row.fastq_md5.tokenize(";") + if( fq_files.size() == 1 ) { + assert fq_files[0].endsWith(".fastq.gz") : "Unexpected FastQ file format ${filename}." + if( row.library_layout != "SINGLE" ) + log.warn("The library layout '${row.library_layout}' should be 'SINGLE'.") + return [ + "fastq_1": fq_files[0], + "fastq_2": null, + "md5_1": fq_md5[0], + "md5_2": null, + "single_end": "true", + ] + } + + if( fq_files.size() == 2 ) { + assert fq_files[0].endsWith("_1.fastq.gz") : "Unexpected FastQ file format ${filename}." + assert fq_files[1].endsWith("_2.fastq.gz") : "Unexpected FastQ file format ${filename}." 
+ if( row.library_layout != "PAIRED" ) + log.warn("The library layout '${row.library_layout}' should be 'PAIRED'.") + return [ + "fastq_1": fq_files[0], + "fastq_2": fq_files[1], + "md5_1": fq_md5[0], + "md5_2": fq_md5[1], + "single_end": "false", + ] + } + + throw new Exception("Unexpected number of FastQ files: ${fq_files}") + } + + // In some instances, FTP links don't exist for FastQ files. + // These have to be downloaded with the run accession using sra-tools. + def sample = getExtensions().inject([:]) { acc, k -> + acc[k] = null + acc + } + if( row.library_layout == "SINGLE" ) + sample.single_end = "true" + else if( row.library_layout == "PAIRED" ) + sample.single_end = "false" + return sample +} + + +def getExtensions() { + return [ + "fastq_1", + "fastq_2", + "md5_1", + "md5_2", + "single_end", + ] }