diff --git a/pom.xml b/pom.xml
index 97157e9..9c9c91d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -17,14 +17,14 @@
   rice: an ADAM-based RNA-seq quantification pipeline
-    0.17.0
+    0.23.0
     1.7.6
-    2.2.0
-    1.7
-    1.7.0
-    1.2.1
+    2.7.3
+    1.8
+    1.8.2
+    1.6.3
-    0.2.2
+    0.2.13
@@ -236,7 +236,7 @@
       args4j
       args4j
-      2.0.23
+      2.0.31
     org.bdgenomics.rice
@@ -314,7 +314,7 @@
       org.scalatest
       scalatest_2.10
-      2.2.2
+      2.2.6
       test
@@ -322,11 +322,6 @@
       commons-configuration
       1.10
-
-      args4j
-      args4j
-      2.0.23
-
     org.apache.spark
     spark-core_2.10
diff --git a/rice-core/src/main/scala/org/bdgenomics/rice/algorithms/Index.scala b/rice-core/src/main/scala/org/bdgenomics/rice/algorithms/Index.scala
index b417054..a27f341 100644
--- a/rice-core/src/main/scala/org/bdgenomics/rice/algorithms/Index.scala
+++ b/rice-core/src/main/scala/org/bdgenomics/rice/algorithms/Index.scala
@@ -22,8 +22,8 @@
 import org.apache.spark.rdd.RDD
 import org.apache.spark.SparkContext._
 import org.bdgenomics.adam.rdd.ADAMContext._
 import org.bdgenomics.formats.avro.NucleotideContigFragment
-import org.bdgenomics.adam.models.{ Exon, Transcript }
 import org.bdgenomics.rice.Timers._
+import org.bdgenomics.rice.models.{ Exon, Transcript }
 import org.bdgenomics.adam.util.{ TwoBitFile, ReferenceFile }
 import scala.util.hashing.MurmurHash3
diff --git a/rice-core/src/main/scala/org/bdgenomics/rice/algorithms/Quantify.scala b/rice-core/src/main/scala/org/bdgenomics/rice/algorithms/Quantify.scala
index a60117d..4e29f72 100644
--- a/rice-core/src/main/scala/org/bdgenomics/rice/algorithms/Quantify.scala
+++ b/rice-core/src/main/scala/org/bdgenomics/rice/algorithms/Quantify.scala
@@ -21,8 +21,9 @@
 import org.apache.spark.Logging
 import org.apache.spark.rdd.RDD
 import org.apache.spark.SparkContext._
 import org.bdgenomics.adam.rdd.ADAMContext._
+import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD
 import org.bdgenomics.formats.avro.AlignmentRecord
-import org.bdgenomics.adam.models.Transcript
+import org.bdgenomics.rice.models.Transcript
 import org.bdgenomics.rice.Timers._
 
 object Quantify extends Serializable with Logging {
@@ -39,7 +40,7 @@ object Quantify extends Serializable with Logging {
    * Patro, Rob, Stephen M. Mount, and Carl Kingsford. "Sailfish enables alignment-free
    * isoform quantification from RNA-seq reads using lightweight algorithms." Nature biotechnology 32.5 (2014): 462-464.
    */
-  def apply(reads: RDD[AlignmentRecord],
+  def apply(reads: AlignmentRecordRDD,
             kmerToEquivalenceClass: RDD[(String, Long)],
             equivalenceClassToTranscript: RDD[(Long, Iterable[String])],
             transcripts: RDD[Transcript],
@@ -56,7 +57,7 @@ object Quantify extends Serializable with Logging {
 
     // cut reads into kmers and then calibrate if desired
     val readKmers = CountKmers.time {
-      reads.adamCountKmers(kmerLength)
+      reads.countKmers(kmerLength)
     }
     val calibratedKmers = if (calibrateKmerBias) {
       TareKmers.time {
@@ -175,7 +176,7 @@ object Quantify extends Serializable with Logging {
   private[algorithms] def initializeEM(equivalenceClassCounts: RDD[(Long, Long)],
                                        equivalenceClassToTranscript: RDD[(Long, Iterable[String])]): RDD[(Long, Iterable[(String, Double)])] = {
     equivalenceClassCounts.join(equivalenceClassToTranscript).map((x: (Long, (Long, Iterable[String]))) => {
-      val normCoverage: Double = x._2._1.toDouble / x._2._2.size()
+      val normCoverage: Double = x._2._1.toDouble / x._2._2.size
       val iter2: Iterable[(String, Double)] = x._2._2.map((y: String) => {
         (y, normCoverage)
       })
diff --git a/rice-core/src/main/scala/org/bdgenomics/rice/models/Gene.scala b/rice-core/src/main/scala/org/bdgenomics/rice/models/Gene.scala
new file mode 100644
index 0000000..bd5e32b
--- /dev/null
+++ b/rice-core/src/main/scala/org/bdgenomics/rice/models/Gene.scala
@@ -0,0 +1,233 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.rice.models
+
+import org.apache.spark.SparkContext
+import org.apache.spark.rdd.RDD
+import org.bdgenomics.adam.models.{ Alphabet, ReferenceRegion }
+import org.bdgenomics.adam.rdd.ADAMContext._
+import org.bdgenomics.formats.avro.{ Strand, Feature }
+
+/**
+ * A 'gene model' is a small, hierarchical collection of objects: Genes, Transcripts, and Exons.
+ * Each Gene contains a collection of Transcripts, and each Transcript contains a collection of
+ * Exons, and together they describe how the genome is transcribed and translated into a family
+ * of related proteins (or other RNA products that aren't translated at all).
+ *
+ * This review,
+ *   Gerstein et al. "What is a gene, post-ENCODE? History and updated definition" Genome Research (2007)
+ *   http://genome.cshlp.org/content/17/6/669.full
+ *
+ * is a reasonably good overview both of what the term 'gene' has meant in the past as well as
+ * where it might be headed in the future.
+ *
+ * Here, we aren't trying to answer any of these questions about "what is a gene," but rather to
+ * provide the routines necessary to _re-assemble_ hierarchical models of genes that have been
+ * flattened into features (GFF, GTF, or BED).
+ *
+ * @param id A name, presumably unique within a gene dataset, of a Gene
+ * @param names Common names for the gene, possibly shared with other genes (for historical or
+ *              ad hoc reasons)
+ * @param strand The strand of the Gene (this is from data, not derived from the Transcripts' strand(s), and
+ *               we leave open the possibility that a single Gene will have Transcripts in _both_ directions,
+ *               e.g. anti-sense transcripts)
+ * @param transcripts The Transcripts that are part of this gene model
+ */
+case class Gene(id: String, names: Seq[String], strand: Boolean, transcripts: Iterable[Transcript]) {
+
+  /**
+   * Finds the union of all the locations of the transcripts for this gene,
+   * across all the reference sequences indicated by the transcripts themselves.
+   * @return A Seq of ReferenceRegions
+   */
+  lazy val regions =
+    ReferenceUtils.unionReferenceSet(transcripts.map(_.region)).toSeq
+}
+
+/**
+ * A transcript model (here represented as a value of the Transcript class) is a simple,
+ * hierarchical model containing a collection of exon models as well as an associated
+ * gene identifier, transcript identifier, and a set of common names (synonyms).
+ *
+ * @param id the (unique) identifier of the Transcript
+ * @param names Common names for the transcript
+ * @param geneId The (unique) identifier of the gene to which the transcript belongs
+ * @param strand True if the transcript is transcribed from the forward (+) strand
+ * @param exons The set of exons in the transcript model; each of these contains a
+ *              reference region whose coordinates are in genomic space.
+ * @param cds the set of CDS regions (the subset of the exons that are coding) for this
+ *            transcript
+ * @param utrs the set of untranslated regions (UTRs) for this transcript
+ */
+case class Transcript(id: String,
+                      names: Seq[String],
+                      geneId: String,
+                      strand: Boolean,
+                      exons: Iterable[Exon],
+                      cds: Iterable[CDS],
+                      utrs: Iterable[UTR]) {
+
+  lazy val region = exons.map(_.region).reduceLeft[ReferenceRegion]((acc, ex) => acc.hull(ex))
+
+  /**
+   * Returns the transcribed sequence of the Transcript: the reference sequence spanning
+   * from the start of its first exon to the end of its last exon (introns included).
+   *
+   * @param referenceSequence The reference sequence (e.g. chromosomal sequence) with
+   *                          respect to which the Transcript's coordinates or locators
+   *                          are given
+   * @return the String representation of this Transcript's transcribed sequence
+   */
+  def extractTranscribedRNASequence(referenceSequence: String): String = {
+    val minStart = exons.map(_.region.start).toSeq.sorted.head.toInt
+    // take the maximum exon end by negating, sorting ascending, and negating back
+    val maxEnd = -exons.map(-_.region.end).toSeq.sorted.head.toInt
+    if (strand)
+      referenceSequence.substring(minStart, maxEnd)
+    else
+      Alphabet.dna.reverseComplement(referenceSequence.substring(minStart, maxEnd))
+  }
+
+  /**
+   * Returns the _coding sequence_ of the Transcript -- essentially, the subset of the
+   * exon(s) after the translation-start codon and before the translation-stop codon,
+   * as annotated in the Exon and CDS objects of the transcript.
+   *
+   * @param referenceSequence The reference sequence (e.g. chromosomal sequence) with
+   *                          respect to which the Transcript's coordinates or locators
+   *                          are given
+   * @return the String representation of this Transcript's coding sequence
+   */
+  def extractCodingSequence(referenceSequence: String): String = {
+    val builder = new StringBuilder()
+    val cdses =
+      if (strand)
+        cds.toSeq.sortBy(_.region.start)
+      else
+        cds.toSeq.sortBy(_.region.start).reverse
+
+    cdses.foreach(cds => builder.append(cds.extractSequence(referenceSequence)))
+    builder.toString()
+  }
+
+  /**
+   * Returns the _exonic_ sequence of the Transcript -- basically, the sequences of
+   * each exon concatenated together, with the intervening intronic sequences omitted.
+   *
+   * @param referenceSequence The reference sequence (e.g. chromosomal sequence) with
+   *                          respect to which the Transcript's coordinates or locators
+   *                          are given
+   * @return the String representation of this Transcript's spliced mRNA sequence
+   */
+  def extractSplicedmRNASequence(referenceSequence: String): String = {
+    val builder = new StringBuilder()
+    val exs =
+      if (strand)
+        exons.toSeq.sortBy(_.region.start)
+      else
+        exons.toSeq.sortBy(_.region.start).reverse
+
+    exs.foreach(exon => builder.append(exon.extractSequence(referenceSequence)))
+    builder.toString()
+  }
+
+}
+
+/**
+ * A trait for values (usually regions or collections of regions) that can be
+ * subsetted or extracted out of a larger region string -- for example, exons
+ * or transcripts which have a sequence defined in terms of their coordinates
+ * against a reference chromosome. Passing the sequence of the reference
+ * chromosome to a transcript's 'extractSequence' method will return the
+ * sequence of the transcript.
+ */
+trait Extractable {
+
+  /**
+   * Returns the subset of the given referenceSequence that corresponds to this
+   * Extractable (not necessarily a contiguous substring, and possibly reverse-
+   * complemented or transformed in other ways).
+   *
+   * @param referenceSequence The reference sequence (e.g. chromosomal sequence) with
+   *                          respect to which the Extractable's coordinates or locators
+   *                          are given
+   * @return the String representation of this Extractable
+   */
+  def extractSequence(referenceSequence: String): String
+}
+
+abstract class BlockExtractable(strand: Boolean, region: ReferenceRegion)
+  extends Extractable {
+
+  override def extractSequence(referenceSequence: String): String =
+    if (strand)
+      referenceSequence.substring(region.start.toInt, region.end.toInt)
+    else
+      Alphabet.dna.reverseComplement(referenceSequence.substring(region.start.toInt, region.end.toInt))
+}
+
+/**
+ * An exon model (here represented as a value of the Exon class) is a representation of a
+ * single exon from a transcript in genomic coordinates.
+ *
+ * NOTE: we're not handling shared exons here
+ *
+ * @param exonId the (unique) identifier of the exon
+ * @param transcriptId the (unique) identifier of the transcript to which the exon belongs
+ * @param strand True if the exon is on the forward (+) strand
+ * @param region The region (in genomic coordinates) to which the exon maps
+ */
+case class Exon(exonId: String, transcriptId: String, strand: Boolean, region: ReferenceRegion)
+  extends BlockExtractable(strand, region) {
+}
+
+/**
+ * A coding sequence (CDS) annotation; should be a subset of an Exon for a particular Transcript.
+ *
+ * @param transcriptId the (unique) identifier of the transcript to which the CDS region belongs
+ * @param strand True if the CDS region is on the forward (+) strand
+ * @param region The region (in genomic coordinates) to which the CDS region maps
+ */
+case class CDS(transcriptId: String, strand: Boolean, region: ReferenceRegion)
+  extends BlockExtractable(strand, region) {
+}
+
+/**
+ * An untranslated region (UTR) of a transcript.
+ *
+ * @param transcriptId the (unique) identifier of the transcript to which the UTR belongs
+ * @param strand True if the UTR is on the forward (+) strand
+ * @param region The region (in genomic coordinates) to which the UTR maps
+ */
+case class UTR(transcriptId: String, strand: Boolean, region: ReferenceRegion)
+  extends BlockExtractable(strand, region) {
+}
+
+object ReferenceUtils {
+
+  def unionReferenceSet(refs: Iterable[ReferenceRegion]): Iterable[ReferenceRegion] = {
+
+    def folder(acc: Seq[ReferenceRegion], tref: ReferenceRegion): Seq[ReferenceRegion] =
+      acc match {
+        case Seq() => Seq(tref)
+        case (first: ReferenceRegion) +: rest =>
+          if (first.overlaps(tref))
+            first.hull(tref) +: rest
+          else
+            tref +: first +: rest
+      }
+
+    refs.toSeq.sorted.foldLeft(Seq[ReferenceRegion]())(folder)
+  }
+}
diff --git a/rice-core/src/test/scala/org/bdgenomics/rice/algorithms/QuantifySuite.scala b/rice-core/src/test/scala/org/bdgenomics/rice/algorithms/QuantifySuite.scala
index 92c8147..f186755 100644
--- a/rice-core/src/test/scala/org/bdgenomics/rice/algorithms/QuantifySuite.scala
+++ b/rice-core/src/test/scala/org/bdgenomics/rice/algorithms/QuantifySuite.scala
@@ -18,8 +18,14 @@ package org.bdgenomics.rice.algorithms
 
 import org.apache.spark.rdd.RDD
-import org.bdgenomics.adam.models.{ Exon, ReferenceRegion, Transcript }
-import org.bdgenomics.rice.utils.riceFunSuite
+import org.bdgenomics.adam.models.{
+  RecordGroupDictionary,
+  ReferenceRegion,
+  SequenceDictionary
+}
+import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD
+import org.bdgenomics.rice.models.{ Exon, Transcript }
+import org.bdgenomics.rice.utils.RiceFunSuite
 import org.bdgenomics.rice.utils.{ ReadGenerator, TranscriptGenerator }
 import scala.collection.Map
 import scala.collection.immutable.HashMap
@@ -34,9 +40,12 @@ class TestingTwoBitFile(byteAccess: ByteAccess) extends ReferenceFile with Serializable {
   override def extract(region: ReferenceRegion): String = {
     testSeq.substring(region.start.toInt, region.end.toInt)
   }
+  override def sequences: SequenceDictionary = {
+    SequenceDictionary.empty
+  }
 }
 
-class QuantifySuite extends riceFunSuite {
+class QuantifySuite extends RiceFunSuite {
 
   def fpEquals(a: Double, b: Double, eps: Double = 1e-6): Boolean = {
     val passed = abs(a - b) <= eps
@@ -394,7 +403,8 @@
     val reads = ReadGenerator(transcripts, Seq(0.2, 0.1, 0.3, 0.2, 0.1, 0.1), 10000, 75, Some(4321L))
 
     // run quantification
-    val relativeAbundances = Quantify(sc.parallelize(reads),
+    val relativeAbundances = Quantify(
+      AlignmentRecordRDD(sc.parallelize(reads), SequenceDictionary.empty, RecordGroupDictionary.empty, Seq.empty),
       sc.parallelize(kmerMap.toSeq),
       sc.parallelize(classMap.toSeq),
       sc.parallelize(names.zip(tLen).map(p => Transcript(p._1,
@@ -512,7 +522,8 @@
       Some(5000L))
 
     // run quantification
-    val relativeAbundances = Quantify(sc.parallelize(reads),
+    val relativeAbundances = Quantify(
+      AlignmentRecordRDD(sc.parallelize(reads), SequenceDictionary.empty, RecordGroupDictionary.empty, Seq.empty),
       sc.parallelize(kmerMap.toSeq),
       sc.parallelize(classMap.toSeq),
       sc.parallelize(names.zip(tLen).map(p => Transcript(p._1,
@@ -557,7 +568,8 @@
     val reads = ReadGenerator(transcripts, Seq(0.2, 0.1, 0.3, 0.2, 0.1, 0.1), 10000, 75, Some(4321L))
 
     // run quantification
-    val relativeAbundances = Quantify(sc.parallelize(reads),
+    val relativeAbundances = Quantify(
+      AlignmentRecordRDD(sc.parallelize(reads), SequenceDictionary.empty, RecordGroupDictionary.empty, Seq.empty),
       sc.parallelize(kmerMap.toSeq),
       sc.parallelize(classMap.toSeq),
       sc.parallelize(names.zip(tLen).map(p => Transcript(p._1,
@@ -600,7 +612,8 @@
     val reads = ReadGenerator(transcripts, tLen.map(x => x / totLen), 10000, 75, Some(4321L))
 
     // run quantification
-    val relativeAbundances = Quantify(sc.parallelize(reads),
+    val relativeAbundances = Quantify(
+      AlignmentRecordRDD(sc.parallelize(reads), SequenceDictionary.empty, RecordGroupDictionary.empty, Seq.empty),
       sc.parallelize(kmerMap.toSeq),
       sc.parallelize(classMap.toSeq),
       sc.parallelize(names.zip(tLen).map(p => Transcript(p._1,
@@ -643,7 +656,8 @@
     val reads = ReadGenerator(transcripts, Seq(0.2, 0.1, 0.05, 0.2, 0.05, 0.4), 10000, 75, Some(4321L))
 
     // run quantification
-    val relativeAbundances = Quantify(sc.parallelize(reads),
+    val relativeAbundances = Quantify(
+      AlignmentRecordRDD(sc.parallelize(reads), SequenceDictionary.empty, RecordGroupDictionary.empty, Seq.empty),
       sc.parallelize(kmerMap.toSeq),
       sc.parallelize(classMap.toSeq),
       sc.parallelize(names.zip(tLen).map(p => Transcript(p._1,
diff --git a/rice-core/src/test/scala/org/bdgenomics/rice/algorithms/TareSuite.scala b/rice-core/src/test/scala/org/bdgenomics/rice/algorithms/TareSuite.scala
index aefac52..65464da 100644
--- a/rice-core/src/test/scala/org/bdgenomics/rice/algorithms/TareSuite.scala
+++ b/rice-core/src/test/scala/org/bdgenomics/rice/algorithms/TareSuite.scala
@@ -18,12 +18,12 @@ package org.bdgenomics.rice.algorithms
 
 import org.apache.spark.rdd.RDD
-import org.bdgenomics.rice.utils.riceFunSuite
+import org.bdgenomics.rice.utils.RiceFunSuite
 import org.bdgenomics.rice.utils.TranscriptGenerator
 import scala.math.{ abs, exp, log, max, min }
 import scala.util.Random
 
-class TareSuite extends riceFunSuite {
+class TareSuite extends RiceFunSuite {
 
   def fpEquals(a: Double, b: Double, eps: Double = 1e-6): Boolean = {
     val passed = abs(a - b) <= eps
diff --git a/rice-core/src/test/scala/org/bdgenomics/rice/utils/RNAdamFunSuite.scala b/rice-core/src/test/scala/org/bdgenomics/rice/utils/RiceFunSuite.scala
similarity index 93%
rename from rice-core/src/test/scala/org/bdgenomics/rice/utils/RNAdamFunSuite.scala
rename to rice-core/src/test/scala/org/bdgenomics/rice/utils/RiceFunSuite.scala
index 67103a4..93855a7 100644
--- a/rice-core/src/test/scala/org/bdgenomics/rice/utils/RNAdamFunSuite.scala
+++ b/rice-core/src/test/scala/org/bdgenomics/rice/utils/RiceFunSuite.scala
@@ -19,12 +19,12 @@ package org.bdgenomics.rice.utils
 
 import org.bdgenomics.utils.misc.SparkFunSuite
 
-trait riceFunSuite extends SparkFunSuite {
+trait RiceFunSuite extends SparkFunSuite {
   override val appName: String = "rice"
   override val properties: Map[String, String] = Map(("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
     ("spark.kryo.registrator", "org.bdgenomics.adam.serialization.ADAMKryoRegistrator"),
-    ("spark.kryoserializer.buffer.mb", "4"),
("spark.kryoserializer.buffer", "4"), ("spark.kryo.referenceTracking", "true")) }