diff --git a/pom.xml b/pom.xml
index 97157e9..9c9c91d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -17,14 +17,14 @@
rice: an ADAM-based RNA-seq quantification pipeline
- 0.17.0
+ 0.23.0
1.7.6
- 2.2.0
- 1.7
- 1.7.0
- 1.2.1
+ 2.7.3
+ 1.8
+ 1.8.2
+ 1.6.3
- 0.2.2
+ 0.2.13
@@ -236,7 +236,7 @@
args4j
args4j
- 2.0.23
+ 2.0.31
org.bdgenomics.rice
@@ -314,7 +314,7 @@
org.scalatest
scalatest_2.10
- 2.2.2
+ 2.2.6
test
@@ -322,11 +322,6 @@
commons-configuration
1.10
-
- args4j
- args4j
- 2.0.23
-
org.apache.spark
spark-core_2.10
diff --git a/rice-core/src/main/scala/org/bdgenomics/rice/algorithms/Index.scala b/rice-core/src/main/scala/org/bdgenomics/rice/algorithms/Index.scala
index b417054..a27f341 100644
--- a/rice-core/src/main/scala/org/bdgenomics/rice/algorithms/Index.scala
+++ b/rice-core/src/main/scala/org/bdgenomics/rice/algorithms/Index.scala
@@ -22,8 +22,8 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.formats.avro.NucleotideContigFragment
-import org.bdgenomics.adam.models.{ Exon, Transcript }
import org.bdgenomics.rice.Timers._
+import org.bdgenomics.rice.models.{ Exon, Transcript }
import org.bdgenomics.adam.util.{ TwoBitFile, ReferenceFile }
import scala.util.hashing.MurmurHash3
diff --git a/rice-core/src/main/scala/org/bdgenomics/rice/algorithms/Quantify.scala b/rice-core/src/main/scala/org/bdgenomics/rice/algorithms/Quantify.scala
index a60117d..4e29f72 100644
--- a/rice-core/src/main/scala/org/bdgenomics/rice/algorithms/Quantify.scala
+++ b/rice-core/src/main/scala/org/bdgenomics/rice/algorithms/Quantify.scala
@@ -21,8 +21,9 @@ import org.apache.spark.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._
import org.bdgenomics.adam.rdd.ADAMContext._
+import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD
import org.bdgenomics.formats.avro.AlignmentRecord
-import org.bdgenomics.adam.models.Transcript
+import org.bdgenomics.rice.models.Transcript
import org.bdgenomics.rice.Timers._
object Quantify extends Serializable with Logging {
@@ -39,7 +40,7 @@ object Quantify extends Serializable with Logging {
* Patro, Rob, Stephen M. Mount, and Carl Kingsford. "Sailfish enables alignment-free
* isoform quantification from RNA-seq reads using lightweight algorithms." Nature biotechnology 32.5 (2014): 462-464.
*/
- def apply(reads: RDD[AlignmentRecord],
+ def apply(reads: AlignmentRecordRDD,
kmerToEquivalenceClass: RDD[(String, Long)],
equivalenceClassToTranscript: RDD[(Long, Iterable[String])],
transcripts: RDD[Transcript],
@@ -56,7 +57,7 @@ object Quantify extends Serializable with Logging {
// cut reads into kmers and then calibrate if desired
val readKmers = CountKmers.time {
- reads.adamCountKmers(kmerLength)
+ reads.countKmers(kmerLength)
}
val calibratedKmers = if (calibrateKmerBias) {
TareKmers.time {
@@ -175,7 +176,7 @@ object Quantify extends Serializable with Logging {
private[algorithms] def initializeEM(equivalenceClassCounts: RDD[(Long, Long)],
equivalenceClassToTranscript: RDD[(Long, Iterable[String])]): RDD[(Long, Iterable[(String, Double)])] = {
equivalenceClassCounts.join(equivalenceClassToTranscript).map((x: (Long, (Long, Iterable[String]))) => {
- val normCoverage: Double = x._2._1.toDouble / x._2._2.size()
+ val normCoverage: Double = x._2._1.toDouble / x._2._2.size
val iter2: Iterable[(String, Double)] = x._2._2.map((y: String) => {
(y, normCoverage)
})
diff --git a/rice-core/src/main/scala/org/bdgenomics/rice/models/Gene.scala b/rice-core/src/main/scala/org/bdgenomics/rice/models/Gene.scala
new file mode 100644
index 0000000..bd5e32b
--- /dev/null
+++ b/rice-core/src/main/scala/org/bdgenomics/rice/models/Gene.scala
@@ -0,0 +1,233 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.rice.models
+
+import org.apache.spark.SparkContext
+import org.apache.spark.rdd.RDD
+import org.bdgenomics.adam.models.{ Alphabet, ReferenceRegion }
+import org.bdgenomics.adam.rdd.ADAMContext._
+import org.bdgenomics.formats.avro.{ Strand, Feature }
+
+/**
+ * A 'gene model' is a small, hierarchical collection of objects: Genes, Transcripts, and Exons.
+ * Each Gene contains a collection of Transcripts, each Transcript contains a collection of
+ * Exons, and together they describe how the genome is transcribed and translated into a family
+ * of related proteins (or other RNA products that aren't translated at all).
+ *
+ * This review,
+ * Gerstein et al. "What is a gene, post-ENCODE? History and updated definition" Genome Research (2007)
+ * http://genome.cshlp.org/content/17/6/669.full
+ *
+ * is a reasonably good overview both of what the term 'gene' has meant in the past and of
+ * where it might be headed in the future.
+ *
+ * Here, we aren't trying to answer any of these questions about "what is a gene," but rather to
+ * provide the routines necessary to _re-assemble_ hierarchical models of genes that have been
+ * flattened into features (GFF, GTF, or BED).
+ *
+ * @param id A name, presumably unique within a gene dataset, of a Gene
+ * @param names Common names for the gene, possibly shared with other genes (for historical or
+ * ad hoc reasons)
+ * @param strand The strand of the Gene (true for the forward strand; this is from data, not derived from the Transcripts' strand(s), and
+ * we leave open the possibility that a single Gene will have Transcripts in _both_ directions,
+ * e.g. anti-sense transcripts)
+ * @param transcripts The Transcripts that are part of this gene model
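+ *
+ * A minimal usage sketch (the identifiers and the transcripts collection below are
+ * hypothetical, assumed to have been assembled from parsed features):
+ * {{{
+ * val gene = Gene("gene1", Seq("myGene"), true, transcripts) // true = forward strand
+ * val mergedSpans: Seq[ReferenceRegion] = gene.regions
+ * }}}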
+ */
+case class Gene(id: String, names: Seq[String], strand: Boolean, transcripts: Iterable[Transcript]) {
+
+ /**
+ * Finds the union of all the locations of the transcripts for this gene,
+ * across all the reference sequences indicated by the transcripts themselves.
+ * @return A Seq of ReferenceRegions
+ */
+ lazy val regions =
+ ReferenceUtils.unionReferenceSet(transcripts.map(_.region)).toSeq
+}
+
+/**
+ * A transcript model (here represented as a value of the Transcript class) is a simple,
+ * hierarchical model containing a collection of exon models as well as an associated
+ * gene identifier, transcript identifier, and a set of common names (synonyms).
+ *
+ * @param id the (unique) identifier of the Transcript
+ * @param names Common names for the transcript
+ * @param geneId The (unique) identifier of the gene to which the transcript belongs
+ * @param strand The strand of the transcript (true for the forward strand)
+ * @param exons The set of exons in the transcript model; each of these contains a
+ * reference region whose coordinates are in genomic space.
+ * @param cds the set of CDS regions (the subset of the exons that are coding) for this
+ * transcript
+ * @param utrs the set of UTRs (untranslated regions) for this transcript
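+ *
+ * A minimal usage sketch (hypothetical identifiers and a toy 12-base reference string):
+ * {{{
+ * val exon1 = Exon("e1", "t1", true, ReferenceRegion("chr1", 0L, 4L))
+ * val exon2 = Exon("e2", "t1", true, ReferenceRegion("chr1", 8L, 12L))
+ * val tx = Transcript("t1", Seq("myTx"), "g1", true,
+ *   Iterable(exon1, exon2), Iterable.empty, Iterable.empty)
+ * tx.extractSplicedmRNASequence("ACGTACGTACGT") // "ACGTACGT" (intron omitted)
+ * }}}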
+ */
+case class Transcript(id: String,
+ names: Seq[String],
+ geneId: String,
+ strand: Boolean,
+ exons: Iterable[Exon],
+ cds: Iterable[CDS],
+ utrs: Iterable[UTR]) {
+
+ lazy val region = exons.map(_.region).reduceLeft[ReferenceRegion]((acc, ex) => acc.hull(ex))
+
+ /**
+ * Returns the transcribed (pre-mRNA) sequence of the Transcript: the contiguous
+ * reference span from the start of the first exon to the end of the last exon,
+ * reverse complemented if the transcript is on the reverse strand.
+ *
+ * @param referenceSequence The reference sequence (e.g. chromosomal sequence) with
+ * respect to which the Transcript's coordinates or locators
+ * are given
+ * @return the String representation of this Transcript's transcribed (unspliced) sequence
+ */
+ def extractTranscribedRNASequence(referenceSequence: String): String = {
+ // span from the smallest exon start to the largest exon end
+ val minStart = exons.map(_.region.start).min.toInt
+ val maxEnd = exons.map(_.region.end).max.toInt
+ if (strand)
+ referenceSequence.substring(minStart, maxEnd)
+ else
+ Alphabet.dna.reverseComplement(referenceSequence.substring(minStart, maxEnd))
+ }
+
+ /**
+ * Returns the _coding sequence_ of the Transcript -- essentially, the subset of the
+ * exon(s) after the translation-start codon and before the translation-stop codon,
+ * as annotated in the Exon and CDS objects of the transcript.
+ *
+ * @param referenceSequence The reference sequence (e.g. chromosomal sequence) with
+ * respect to which the Transcript's coordinates or locators
+ * are given
+ * @return the String representation of this Transcript's coding (CDS) sequence
+ */
+ def extractCodingSequence(referenceSequence: String): String = {
+ val builder = new StringBuilder()
+ val cdses =
+ if (strand)
+ cds.toSeq.sortBy(_.region.start)
+ else
+ cds.toSeq.sortBy(_.region.start).reverse
+
+ cdses.foreach(cds => builder.append(cds.extractSequence(referenceSequence)))
+ builder.toString()
+ }
+
+ /**
+ * Returns the _exonic_ sequence of the Transcript -- basically, the sequences of
+ * each exon concatenated together, with the intervening intronic sequences omitted.
+ *
+ * @param referenceSequence The reference sequence (e.g. chromosomal sequence) with
+ * respect to which the Transcript's coordinates or locators
+ * are given
+ * @return the String representation of this Transcript's spliced mRNA sequence
+ */
+ def extractSplicedmRNASequence(referenceSequence: String): String = {
+ val builder = new StringBuilder()
+ val exs =
+ if (strand)
+ exons.toSeq.sortBy(_.region.start)
+ else
+ exons.toSeq.sortBy(_.region.start).reverse
+
+ exs.foreach(exon => builder.append(exon.extractSequence(referenceSequence)))
+ builder.toString()
+ }
+
+}
+
+/**
+ * A trait for values (usually regions or collections of regions) that can be
+ * subsetted or extracted out of a larger reference sequence string -- for example, exons
+ * or transcripts which have a sequence defined in terms of their coordinates
+ * against a reference chromosome. Passing the sequence of the reference
+ * chromosome to a transcript's 'extractSequence' method will return the
+ * sequence of the transcript.
+ */
+trait Extractable {
+
+ /**
+ * Returns the subset of the given referenceSequence that corresponds to this
+ * Extractable (not necessarily a contiguous substring, and possibly reverse-
+ * complemented or transformed in other ways).
+ *
+ * @param referenceSequence The reference sequence (e.g. chromosomal sequence) with
+ * respect to which the Extractable's coordinates or locators
+ * are given
+ * @return the String representation of this Extractable
+ */
+ def extractSequence(referenceSequence: String): String
+}
+
+abstract class BlockExtractable(strand: Boolean, region: ReferenceRegion)
+ extends Extractable {
+
+ override def extractSequence(referenceSequence: String): String =
+ if (strand)
+ referenceSequence.substring(region.start.toInt, region.end.toInt)
+ else
+ Alphabet.dna.reverseComplement(referenceSequence.substring(region.start.toInt, region.end.toInt))
+}
+
+/**
+ * An exon model (here represented as a value of the Exon class) is a representation of a
+ * single exon from a transcript in genomic coordinates.
+ *
+ * NOTE: we're not handling shared exons here
+ *
+ * @param exonId the (unique) identifier of the exon
+ * @param transcriptId the (unique) identifier of the transcript to which the exon belongs
+ * @param strand The strand of the exon (true for the forward strand)
+ * @param region The region (in genomic coordinates) to which the exon maps
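+ *
+ * For example (hypothetical values), an exon on the reverse strand extracts as the
+ * reverse complement of its slice of the reference:
+ * {{{
+ * Exon("e1", "t1", false, ReferenceRegion("chr1", 0L, 4L)).extractSequence("AACGTT")
+ * // returns "CGTT", the reverse complement of "AACG"
+ * }}}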
+ */
+case class Exon(exonId: String, transcriptId: String, strand: Boolean, region: ReferenceRegion)
+ extends BlockExtractable(strand, region) {
+}
+
+/**
+ * A coding sequence (CDS) annotation; each CDS region should be a subset of an Exon
+ * of the corresponding Transcript.
+ *
+ * @param transcriptId the (unique) identifier of the transcript to which the CDS belongs
+ * @param strand The strand of the CDS (true for the forward strand)
+ * @param region The region (in genomic coordinates) to which the CDS maps
+ */
+case class CDS(transcriptId: String, strand: Boolean, region: ReferenceRegion)
+ extends BlockExtractable(strand, region) {
+}
+
+/**
+ * An untranslated region (UTR): an exonic region of a Transcript that is transcribed
+ * but not translated.
+ *
+ * @param transcriptId the (unique) identifier of the transcript to which the UTR belongs
+ * @param strand The strand of the UTR (true for the forward strand)
+ * @param region The region (in genomic coordinates) to which the UTR maps
+ */
+case class UTR(transcriptId: String, strand: Boolean, region: ReferenceRegion)
+ extends BlockExtractable(strand, region) {
+}
+
+object ReferenceUtils {
+
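+ /**
+ * Merges a collection of ReferenceRegions, collapsing any overlapping regions into
+ * their hull. For example (hypothetical coordinates), chr1:[0,10) and chr1:[5,20)
+ * merge into chr1:[0,20), while a non-overlapping chr1:[30,40) is kept as-is.
+ *
+ * @param refs The regions to merge
+ * @return The merged regions; no two returned regions overlap
+ */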
+ def unionReferenceSet(refs: Iterable[ReferenceRegion]): Iterable[ReferenceRegion] = {
+
+ def folder(acc: Seq[ReferenceRegion], tref: ReferenceRegion): Seq[ReferenceRegion] =
+ acc match {
+ case Seq() => Seq(tref)
+ case (first: ReferenceRegion) +: rest =>
+ if (first.overlaps(tref))
+ first.hull(tref) +: rest
+ else
+ tref +: first +: rest
+ }
+
+ refs.toSeq.sorted.foldLeft(Seq[ReferenceRegion]())(folder)
+ }
+}
diff --git a/rice-core/src/test/scala/org/bdgenomics/rice/algorithms/QuantifySuite.scala b/rice-core/src/test/scala/org/bdgenomics/rice/algorithms/QuantifySuite.scala
index 92c8147..f186755 100644
--- a/rice-core/src/test/scala/org/bdgenomics/rice/algorithms/QuantifySuite.scala
+++ b/rice-core/src/test/scala/org/bdgenomics/rice/algorithms/QuantifySuite.scala
@@ -18,8 +18,14 @@
package org.bdgenomics.rice.algorithms
import org.apache.spark.rdd.RDD
-import org.bdgenomics.adam.models.{ Exon, ReferenceRegion, Transcript }
-import org.bdgenomics.rice.utils.riceFunSuite
+import org.bdgenomics.adam.models.{
+ RecordGroupDictionary,
+ ReferenceRegion,
+ SequenceDictionary
+}
+import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD
+import org.bdgenomics.rice.models.{ Exon, Transcript }
+import org.bdgenomics.rice.utils.RiceFunSuite
import org.bdgenomics.rice.utils.{ ReadGenerator, TranscriptGenerator }
import scala.collection.Map
import scala.collection.immutable.HashMap
@@ -34,9 +40,12 @@ class TestingTwoBitFile(byteAccess: ByteAccess) extends ReferenceFile with Seria
override def extract(region: ReferenceRegion): String = {
testSeq.substring(region.start.toInt, region.end.toInt)
}
+ override def sequences: SequenceDictionary = {
+ SequenceDictionary.empty
+ }
}
-class QuantifySuite extends riceFunSuite {
+class QuantifySuite extends RiceFunSuite {
def fpEquals(a: Double, b: Double, eps: Double = 1e-6): Boolean = {
val passed = abs(a - b) <= eps
@@ -394,7 +403,8 @@ class QuantifySuite extends riceFunSuite {
val reads = ReadGenerator(transcripts, Seq(0.2, 0.1, 0.3, 0.2, 0.1, 0.1), 10000, 75, Some(4321L))
// run quantification
- val relativeAbundances = Quantify(sc.parallelize(reads),
+ val relativeAbundances = Quantify(
+ AlignmentRecordRDD(sc.parallelize(reads), SequenceDictionary.empty, RecordGroupDictionary.empty, Seq.empty),
sc.parallelize(kmerMap.toSeq),
sc.parallelize(classMap.toSeq),
sc.parallelize(names.zip(tLen).map(p => Transcript(p._1,
@@ -512,7 +522,8 @@ class QuantifySuite extends riceFunSuite {
Some(5000L))
// run quantification
- val relativeAbundances = Quantify(sc.parallelize(reads),
+ val relativeAbundances = Quantify(
+ AlignmentRecordRDD(sc.parallelize(reads), SequenceDictionary.empty, RecordGroupDictionary.empty, Seq.empty),
sc.parallelize(kmerMap.toSeq),
sc.parallelize(classMap.toSeq),
sc.parallelize(names.zip(tLen).map(p => Transcript(p._1,
@@ -557,7 +568,8 @@ class QuantifySuite extends riceFunSuite {
val reads = ReadGenerator(transcripts, Seq(0.2, 0.1, 0.3, 0.2, 0.1, 0.1), 10000, 75, Some(4321L))
// run quantification
- val relativeAbundances = Quantify(sc.parallelize(reads),
+ val relativeAbundances = Quantify(
+ AlignmentRecordRDD(sc.parallelize(reads), SequenceDictionary.empty, RecordGroupDictionary.empty, Seq.empty),
sc.parallelize(kmerMap.toSeq),
sc.parallelize(classMap.toSeq),
sc.parallelize(names.zip(tLen).map(p => Transcript(p._1,
@@ -600,7 +612,8 @@ class QuantifySuite extends riceFunSuite {
val reads = ReadGenerator(transcripts, tLen.map(x => x / totLen), 10000, 75, Some(4321L))
// run quantification
- val relativeAbundances = Quantify(sc.parallelize(reads),
+ val relativeAbundances = Quantify(
+ AlignmentRecordRDD(sc.parallelize(reads), SequenceDictionary.empty, RecordGroupDictionary.empty, Seq.empty),
sc.parallelize(kmerMap.toSeq),
sc.parallelize(classMap.toSeq),
sc.parallelize(names.zip(tLen).map(p => Transcript(p._1,
@@ -643,7 +656,8 @@ class QuantifySuite extends riceFunSuite {
val reads = ReadGenerator(transcripts, Seq(0.2, 0.1, 0.05, 0.2, 0.05, 0.4), 10000, 75, Some(4321L))
// run quantification
- val relativeAbundances = Quantify(sc.parallelize(reads),
+ val relativeAbundances = Quantify(
+ AlignmentRecordRDD(sc.parallelize(reads), SequenceDictionary.empty, RecordGroupDictionary.empty, Seq.empty),
sc.parallelize(kmerMap.toSeq),
sc.parallelize(classMap.toSeq),
sc.parallelize(names.zip(tLen).map(p => Transcript(p._1,
diff --git a/rice-core/src/test/scala/org/bdgenomics/rice/algorithms/TareSuite.scala b/rice-core/src/test/scala/org/bdgenomics/rice/algorithms/TareSuite.scala
index aefac52..65464da 100644
--- a/rice-core/src/test/scala/org/bdgenomics/rice/algorithms/TareSuite.scala
+++ b/rice-core/src/test/scala/org/bdgenomics/rice/algorithms/TareSuite.scala
@@ -18,12 +18,12 @@
package org.bdgenomics.rice.algorithms
import org.apache.spark.rdd.RDD
-import org.bdgenomics.rice.utils.riceFunSuite
+import org.bdgenomics.rice.utils.RiceFunSuite
import org.bdgenomics.rice.utils.TranscriptGenerator
import scala.math.{ abs, exp, log, max, min }
import scala.util.Random
-class TareSuite extends riceFunSuite {
+class TareSuite extends RiceFunSuite {
def fpEquals(a: Double, b: Double, eps: Double = 1e-6): Boolean = {
val passed = abs(a - b) <= eps
diff --git a/rice-core/src/test/scala/org/bdgenomics/rice/utils/RNAdamFunSuite.scala b/rice-core/src/test/scala/org/bdgenomics/rice/utils/RiceFunSuite.scala
similarity index 93%
rename from rice-core/src/test/scala/org/bdgenomics/rice/utils/RNAdamFunSuite.scala
rename to rice-core/src/test/scala/org/bdgenomics/rice/utils/RiceFunSuite.scala
index 67103a4..93855a7 100644
--- a/rice-core/src/test/scala/org/bdgenomics/rice/utils/RNAdamFunSuite.scala
+++ b/rice-core/src/test/scala/org/bdgenomics/rice/utils/RiceFunSuite.scala
@@ -19,12 +19,12 @@ package org.bdgenomics.rice.utils
import org.bdgenomics.utils.misc.SparkFunSuite
-trait riceFunSuite extends SparkFunSuite {
+trait RiceFunSuite extends SparkFunSuite {
override val appName: String = "rice"
override val properties: Map[String, String] = Map(("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
("spark.kryo.registrator", "org.bdgenomics.adam.serialization.ADAMKryoRegistrator"),
- ("spark.kryoserializer.buffer.mb", "4"),
+ ("spark.kryoserializer.buffer", "4"),
("spark.kryo.referenceTracking", "true"))
}