21 changes: 8 additions & 13 deletions pom.xml
@@ -17,14 +17,14 @@
<name>rice: an ADAM-based RNA-seq quantification pipeline</name>

<properties>
-<adam.version>0.17.0</adam.version>
+<adam.version>0.23.0</adam.version>
<avro.version>1.7.6</avro.version>
-<hadoop.version>2.2.0</hadoop.version>
-<java.version>1.7</java.version>
-<parquet.version>1.7.0</parquet.version>
-<spark.version>1.2.1</spark.version>
+<hadoop.version>2.7.3</hadoop.version>
+<java.version>1.8</java.version>
+<parquet.version>1.8.2</parquet.version>
+<spark.version>1.6.3</spark.version>
<!-- Edit the following line to configure the Hadoop (HDFS) version. -->
-<utils.version>0.2.2</utils.version>
+<utils.version>0.2.13</utils.version>
</properties>

<modules>
@@ -236,7 +236,7 @@
<dependency>
<groupId>args4j</groupId>
<artifactId>args4j</artifactId>
-<version>2.0.23</version>
+<version>2.0.31</version>
</dependency>
<dependency>
<groupId>org.bdgenomics.rice</groupId>
@@ -314,19 +314,14 @@
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_2.10</artifactId>
-<version>2.2.2</version>
+<version>2.2.6</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>commons-configuration</groupId>
<artifactId>commons-configuration</artifactId>
<version>1.10</version>
</dependency>
-<dependency>
-<groupId>args4j</groupId>
-<artifactId>args4j</artifactId>
-<version>2.0.23</version>
-</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.10</artifactId>
@@ -22,8 +22,8 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.formats.avro.NucleotideContigFragment
-import org.bdgenomics.adam.models.{ Exon, Transcript }
import org.bdgenomics.rice.Timers._
+import org.bdgenomics.rice.models.{ Exon, Transcript }
import org.bdgenomics.adam.util.{ TwoBitFile, ReferenceFile }
import scala.util.hashing.MurmurHash3

@@ -21,8 +21,9 @@ import org.apache.spark.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._
import org.bdgenomics.adam.rdd.ADAMContext._
+import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD
import org.bdgenomics.formats.avro.AlignmentRecord
-import org.bdgenomics.adam.models.Transcript
+import org.bdgenomics.rice.models.Transcript
import org.bdgenomics.rice.Timers._

object Quantify extends Serializable with Logging {
@@ -39,7 +40,7 @@ object Quantify extends Serializable with Logging {
* Patro, Rob, Stephen M. Mount, and Carl Kingsford. "Sailfish enables alignment-free
* isoform quantification from RNA-seq reads using lightweight algorithms." Nature biotechnology 32.5 (2014): 462-464.
*/
-def apply(reads: RDD[AlignmentRecord],
+def apply(reads: AlignmentRecordRDD,
kmerToEquivalenceClass: RDD[(String, Long)],
equivalenceClassToTranscript: RDD[(Long, Iterable[String])],
transcripts: RDD[Transcript],
@@ -56,7 +57,7 @@

// cut reads into kmers and then calibrate if desired
val readKmers = CountKmers.time {
-reads.adamCountKmers(kmerLength)
+reads.countKmers(kmerLength)
}
val calibratedKmers = if (calibrateKmerBias) {
TareKmers.time {
@@ -175,7 +176,7 @@
private[algorithms] def initializeEM(equivalenceClassCounts: RDD[(Long, Long)],
equivalenceClassToTranscript: RDD[(Long, Iterable[String])]): RDD[(Long, Iterable[(String, Double)])] = {
equivalenceClassCounts.join(equivalenceClassToTranscript).map((x: (Long, (Long, Iterable[String]))) => {
-val normCoverage: Double = x._2._1.toDouble / x._2._2.size()
+val normCoverage: Double = x._2._1.toDouble / x._2._2.size
val iter2: Iterable[(String, Double)] = x._2._2.map((y: String) => {
(y, normCoverage)
})
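To make the initializeEM change above concrete: each equivalence class's k-mer count is divided evenly across the transcripts in that class, and `.size()` becomes `.size` because Scala's `Iterable.size` is defined without parentheses. Below is a minimal sketch of the same computation on plain in-memory collections; it is not part of this PR, uses no Spark, and all names in it are illustrative.

// Sketch only -- not part of this PR. Mirrors initializeEM on in-memory
// collections instead of RDDs; all names below are illustrative.
object InitializeEMSketch {

  def initialize(classCounts: Map[Long, Long],
                 classToTranscripts: Map[Long, Iterable[String]]): Map[Long, Iterable[(String, Double)]] = {
    classCounts.flatMap {
      case (eqClass, count) =>
        classToTranscripts.get(eqClass).map { transcripts =>
          // spread the class's count uniformly over its member transcripts,
          // matching x._2._1.toDouble / x._2._2.size in the diff above
          val normCoverage = count.toDouble / transcripts.size
          eqClass -> transcripts.map(t => (t, normCoverage))
        }
    }
  }

  def main(args: Array[String]): Unit = {
    val counts = Map(1L -> 10L, 2L -> 4L)
    val members = Map(1L -> Seq("txA", "txB"), 2L -> Seq("txC"))
    // prints Map(1 -> List((txA,5.0), (txB,5.0)), 2 -> List((txC,4.0)))
    println(initialize(counts, members))
  }
}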
233 changes: 233 additions & 0 deletions rice-core/src/main/scala/org/bdgenomics/rice/models/Gene.scala
@@ -0,0 +1,233 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.rice.models

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.models.{ Alphabet, ReferenceRegion }
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.formats.avro.{ Strand, Feature }

/**
* A 'gene model' is a small, hierarchical collection of objects: Genes, Transcripts, and Exons.
* Each Gene contains a collection of Transcripts, and each Transcript contains a collection of
* Exons, and together they describe how the genome is transcribed and translated into a family
* of related proteins (or other RNA products that aren't translated at all).
*
* This review,
* Gerstein et al. "What is a gene, post-ENCODE? History and updated definition" Genome Research (2007)
* http://genome.cshlp.org/content/17/6/669.full
*
* is a reasonably good overview both of what the term 'gene' has meant in the past and of
* where it might be headed in the future.
*
* Here, we aren't trying to answer any of these questions about "what is a gene," but rather to
* provide the routines necessary to _re-assemble_ hierarchical models of genes that have been
* flattened into features (GFF, GTF, or BED).
*
* @param id A name, presumably unique within a gene dataset, of a Gene
* @param names Common names for the gene, possibly shared with other genes (for historical or
* ad hoc reasons)
* @param strand The strand of the Gene (this is from data, not derived from the Transcripts' strand(s), and
* we leave open the possibility that a single Gene will have Transcripts in _both_ directions,
* e.g. anti-sense transcripts)
* @param transcripts The Transcripts that are part of this gene model
*/
case class Gene(id: String, names: Seq[String], strand: Boolean, transcripts: Iterable[Transcript]) {

/**
* Finds the union of all the locations of the transcripts for this gene,
* across all the reference sequences indicated by the transcripts themselves.
* @return A Seq of ReferenceRegions
*/
lazy val regions =
ReferenceUtils.unionReferenceSet(transcripts.map(_.region)).toSeq
}

/**
* A transcript model (here represented as a value of the Transcript class) is a simple,
* hierarchical model containing a collection of exon models as well as an associated
* gene identifier, transcript identifier, and a set of common names (synonyms).
*
* @param id the (unique) identifier of the Transcript
* @param names Common names for the transcript
* @param geneId The (unique) identifier of the gene to which the transcript belongs
* @param strand The strand on which the transcript is transcribed (true indicates the forward strand)
* @param exons The set of exons in the transcript model; each of these contains a
* reference region whose coordinates are in genomic space.
* @param cds the set of CDS regions (the subset of the exons that are coding) for this
* transcript
* @param utrs the set of untranslated regions (UTRs) for this transcript
*/
case class Transcript(id: String,
names: Seq[String],
geneId: String,
strand: Boolean,
exons: Iterable[Exon],
cds: Iterable[CDS],
utrs: Iterable[UTR]) {

lazy val region = exons.map(_.region).reduceLeft[ReferenceRegion]((acc, ex) => acc.hull(ex))

/**
* Returns the full transcribed sequence of the Transcript, spanning from the start of its
* first exon to the end of its last exon (introns included).
*
* @param referenceSequence The reference sequence (e.g. chromosomal sequence) with
* respect to which the Transcript's coordinates or locators
* are given
* @return the String representation of this Transcript's transcribed (unspliced) RNA sequence
*/
def extractTranscribedRNASequence(referenceSequence: String): String = {
// take the smallest start and largest end across all exons
val minStart = exons.map(_.region.start).min.toInt
val maxEnd = exons.map(_.region.end).max.toInt
if (strand)
referenceSequence.substring(minStart, maxEnd)
else
Alphabet.dna.reverseComplement(referenceSequence.substring(minStart, maxEnd))
}

/**
* Returns the _coding sequence_ of the Transcript -- essentially, the subset of the
* exon(s) after the translation-start codon and before the translation-stop codon,
* as annotated in the Exon and CDS objects of the transcript.
*
* @param referenceSequence The reference sequence (e.g. chromosomal sequence) with
* respect to which the Transcript's coordinates or locators
* are given
* @return the String representation of this Transcript's coding (CDS) sequence
*/
def extractCodingSequence(referenceSequence: String): String = {
val builder = new StringBuilder()
val cdses =
if (strand)
cds.toSeq.sortBy(_.region.start)
else
cds.toSeq.sortBy(_.region.start).reverse

cdses.foreach(cds => builder.append(cds.extractSequence(referenceSequence)))
builder.toString()
}

/**
* Returns the _exonic_ sequence of the Transcript -- basically, the sequences of
* each exon concatenated together, with the intervening intronic sequences omitted.
*
* @param referenceSequence The reference sequence (e.g. chromosomal sequence) with
* respect to which the Transcript's coordinates or locators
* are given
* @return the String representation of this Transcript's spliced mRNA sequence
*/
def extractSplicedmRNASequence(referenceSequence: String): String = {
val builder = new StringBuilder()
val exs =
if (strand)
exons.toSeq.sortBy(_.region.start)
else
exons.toSeq.sortBy(_.region.start).reverse

exs.foreach(exon => builder.append(exon.extractSequence(referenceSequence)))
builder.toString()
}

}

/**
* A trait for values (usually regions or collections of regions) that can be
* subsetted or extracted out of a larger region string -- for example, exons
* or transcripts which have a sequence defined in terms of their coordinates
* against a reference chromosome. Passing the sequence of the reference
* chromosome to a transcript's 'extractSequence' method will return the
* sequence of the transcript.
*/
trait Extractable {

/**
* Returns the subset of the given referenceSequence that corresponds to this
* Extractable (not necessarily a contiguous substring, and possibly reverse-
* complemented or transformed in other ways).
*
* @param referenceSequence The reference sequence (e.g. chromosomal sequence) with
* respect to which the Extractable's coordinates or locators
* are given
* @return the String representation of this Extractable
*/
def extractSequence(referenceSequence: String): String
}

abstract class BlockExtractable(strand: Boolean, region: ReferenceRegion)
extends Extractable {

override def extractSequence(referenceSequence: String): String =
if (strand)
referenceSequence.substring(region.start.toInt, region.end.toInt)
else
Alphabet.dna.reverseComplement(referenceSequence.substring(region.start.toInt, region.end.toInt))
}

/**
* An exon model (here represented as a value of the Exon class) is a representation of a
* single exon from a transcript in genomic coordinates.
*
* NOTE: we're not handling shared exons here
*
* @param exonId the (unique) identifier of the exon
* @param transcriptId the (unique) identifier of the transcript to which the exon belongs
* @param strand the strand on which the exon lies (true indicates the forward strand)
* @param region The region (in genomic coordinates) to which the exon maps
*/
case class Exon(exonId: String, transcriptId: String, strand: Boolean, region: ReferenceRegion)
extends BlockExtractable(strand, region) {
}

/**
* Coding sequence (CDS) annotation; should be a subset of an Exon for a particular Transcript.
*
* @param transcriptId the (unique) identifier of the transcript to which this CDS belongs
* @param strand the strand on which the CDS lies (true indicates the forward strand)
* @param region the region (in genomic coordinates) to which the CDS maps
*/
case class CDS(transcriptId: String, strand: Boolean, region: ReferenceRegion)
extends BlockExtractable(strand, region) {
}

/**
* Untranslated region (UTR) annotation for a particular Transcript.
*
* @param transcriptId the (unique) identifier of the transcript to which this UTR belongs
* @param strand the strand on which the UTR lies (true indicates the forward strand)
* @param region the region (in genomic coordinates) to which the UTR maps
*/
case class UTR(transcriptId: String, strand: Boolean, region: ReferenceRegion)
extends BlockExtractable(strand, region) {
}

object ReferenceUtils {

/**
 * Collapses a set of reference regions into their union, merging regions that overlap.
 *
 * @param refs The regions to collapse
 * @return The union of the input regions, with overlapping regions merged
 */
def unionReferenceSet(refs: Iterable[ReferenceRegion]): Iterable[ReferenceRegion] = {

def folder(acc: Seq[ReferenceRegion], tref: ReferenceRegion): Seq[ReferenceRegion] =
acc match {
case Seq() => Seq(tref)
case (first: ReferenceRegion) +: rest =>
if (first.overlaps(tref))
first.hull(tref) +: rest
else
tref +: first +: rest
}

refs.toSeq.sorted.foldLeft(Seq[ReferenceRegion]())(folder)
}
}
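As a rough illustration of how the re-assembled gene models added in Gene.scala might be used downstream, here is a small hand-built example. This is a sketch only, not part of the PR: the reference string, coordinates, and object names are invented, and it assumes ADAM's ReferenceRegion(referenceName, start, end) constructor.

// Sketch only -- not part of this PR. Builds a tiny forward-strand transcript
// and extracts its sequences against a made-up reference string.
import org.bdgenomics.adam.models.ReferenceRegion
import org.bdgenomics.rice.models.{ Exon, Transcript }

object GeneModelSketch {
  def main(args: Array[String]): Unit = {
    // positions:      0123456789...
    val reference   = "ACGTTTTTACGTACGTACGT"

    // two forward-strand exons separated by a short intron (positions 4-7)
    val exon1 = Exon("e1", "tx1", true, ReferenceRegion("chr1", 0L, 4L))
    val exon2 = Exon("e2", "tx1", true, ReferenceRegion("chr1", 8L, 12L))

    val tx = Transcript("tx1", Seq("myTranscript"), "gene1", true,
      Seq(exon1, exon2), Seq(), Seq())

    // exonic sequence with the intron spliced out: "ACGT" + "ACGT"
    println(tx.extractSplicedmRNASequence(reference))
    // full transcribed region, intron included: positions 0 through 11
    println(tx.extractTranscribedRNASequence(reference))
  }
}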