clulab · kwalcock · May 10, 2025 · May 13, 2025 · May 14, 2025 · May 14, 2025
diff --git a/apps/src/main/java/org/clulab/processors/apps/ProcessorsJavaExample.java b/apps/src/main/java/org/clulab/processors/apps/ProcessorsJavaExample.java
@@ -8,6 +8,7 @@
 import org.clulab.utils.JavaUtils;
 
 import java.util.Iterator;
+import scala.collection.Seq;
 
 public class ProcessorsJavaExample {
     public static void main(String [] args) throws Exception {
@@ -20,25 +21,25 @@ public static void main(String [] args) throws Exception {
         // You are basically done.  The rest of this code simply prints out the annotations.
 
         // Let's print the sentence-level annotations.
-        for (int sentenceIndex = 0; sentenceIndex < doc.sentences().length; sentenceIndex++) {
-            Sentence sentence = doc.sentences()[sentenceIndex];
+        for (int sentenceIndex = 0; sentenceIndex < doc.sentences().length(); sentenceIndex++) {
+            Sentence sentence = doc.sentences().apply(sentenceIndex);
             System.out.println("Sentence #" + sentenceIndex + ":");
-            System.out.println("Tokens: " + mkString(sentence.words()));
-            System.out.println("Start character offsets: " + mkString(sentence.startOffsets()));
-            System.out.println("End character offsets: " + mkString(sentence.endOffsets()));
+            System.out.println("Tokens: " + mkStringStr(sentence.words()));
+            System.out.println("Start character offsets: " + mkStringInt(sentence.startOffsets()));
+            System.out.println("End character offsets: " + mkStringInt(sentence.endOffsets()));
 
             // These annotations are optional, so they are stored using Option objects,
             // hence the isDefined() and get() calls.
             if (sentence.lemmas().isDefined())
-                System.out.println("Lemmas: " + mkString(sentence.lemmas().get()));
+                System.out.println("Lemmas: " + mkStringStr(sentence.lemmas().get()));
             if (sentence.tags().isDefined())
-                System.out.println("POS tags: " + mkString(sentence.tags().get()));
+                System.out.println("POS tags: " + mkStringStr(sentence.tags().get()));
             if (sentence.chunks().isDefined())
-                System.out.println("Chunks: " + mkString(sentence.chunks().get()));
+                System.out.println("Chunks: " + mkStringStr(sentence.chunks().get()));
             if (sentence.entities().isDefined())
-                System.out.println("Named entities: " + mkString(sentence.entities().get()));
+                System.out.println("Named entities: " + mkStringStr(sentence.entities().get()));
             if (sentence.norms().isDefined())
-                System.out.println("Normalized entities: " + mkString(sentence.norms().get()));
+                System.out.println("Normalized entities: " + mkStringStr(sentence.norms().get()));
             if (sentence.dependencies().isDefined()) {
                 System.out.println("Syntactic dependencies:");
                 Iterator<scala.Tuple3<Object, Object, String>> iterator =
@@ -53,27 +54,27 @@ public static void main(String [] args) throws Exception {
         }
     }
 
-    public static String mkString(String[] strings, String sep) {
+    public static String mkStringStr(Seq<String> strings, String sep) {
         StringBuilder stringBuilder = new StringBuilder();
-        for (int i = 0; i < strings.length; i ++) {
+        for (int i = 0; i < strings.length(); i ++) {
             if (i > 0) stringBuilder.append(sep);
-            stringBuilder.append(strings[i]);
+            stringBuilder.append(strings.apply(i));
         }
         return stringBuilder.toString();
     }
 
-    public static String mkString(String[] strings) { return mkString(strings, " "); }
+    public static String mkStringStr(Seq<String> strings) { return mkStringStr(strings, " "); }
 
-    public static String mkString(int[] ints, String sep) {
+    public static String mkStringInt(Seq<Object> ints, String sep) {
         StringBuilder stringBuilder = new StringBuilder();
-        for (int i = 0; i < ints.length; i ++) {
+        for (int i = 0; i < ints.length(); i ++) {
             if (i > 0) stringBuilder.append(sep);
-            stringBuilder.append(ints[i]);
+            stringBuilder.append(ints.apply(i));
         }
         return stringBuilder.toString();
     }
 
-    public static String mkString(int[] ints) { return mkString(ints, " "); }
+    public static String mkStringInt(Seq<Object> ints) { return mkStringInt(ints, " "); }
 
     public static<T> Iterable<T> iteratorToIterable(Iterator<T> iterator) { return () -> iterator; }
 }
diff --git a/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala b/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala
@@ -2,6 +2,8 @@ package org.clulab.processors.apps
 
 import org.clulab.processors.{Document, Processor, Sentence}
 import org.clulab.processors.clu.BalaurProcessor
+import org.clulab.scala.WrappedArrayBuffer._
+import org.clulab.utils.WrappedArraySeq
 import org.slf4j.{Logger, LoggerFactory}
 
 import java.io.InputStream
@@ -17,101 +19,106 @@ class ColumnsToDocument
   * Last Modified: Fix compiler issue: import scala.io.Source.
   */
 object ColumnsToDocument {
-  val logger:Logger = LoggerFactory.getLogger(classOf[ColumnsToDocument])
+  type LabelSetter = (Sentence, Seq[String]) => Sentence
+  type Annotator = (Document) => Document
+  val logger: Logger = LoggerFactory.getLogger(classOf[ColumnsToDocument])
 
   val WORD_POS_CONLLX = 1
   val TAG_POS_CONLLX = 4
   val WORD_POS_CONLLU = 1
   val TAG_POS_CONLLU = 3
 
-  var proc:Processor = new BalaurProcessor()
+  var proc: Processor = new BalaurProcessor()
   var prevLang: String = "en"
 
-  def readFromFile(fn:String,
-                   wordPos:Int = WORD_POS_CONLLX,
-                   labelPos:Int = TAG_POS_CONLLX,
-                   setLabels: (Sentence, Array[String]) => Unit,
-                   annotate: (Document) => Unit,
-                   filterOutContractions:Boolean = false,
-                   lang: String = "en"
-                  ): Document = {
-
-    // redefine proc acording to the language used
+  protected def setProcessor(lang: String): Unit = {
     if (lang != prevLang) {
       if (lang == "pt") {
         println("Using Portuguese processors")
         throw new RuntimeException(s"ERROR: language '$lang' not supported!")
         //this.proc = new PortugueseCluProcessor()
-      } else if (lang == "es") {
+      }
+      else if (lang == "es") {
         println("Using Spanish processors")
         //this.proc = new SpanishCluProcessor()
         throw new RuntimeException(s"ERROR: language '$lang' not supported!")
-      } else {
+      }
+      else {
         println("Using English processors")
         this.proc = new BalaurProcessor()
       }
       this.prevLang = lang
     }
+  }
 
+  def readFromFile(
+    fn: String,
+    wordPos: Int = WORD_POS_CONLLX,
+    labelPos: Int = TAG_POS_CONLLX,
+    setLabels: LabelSetter,
+    annotate: Annotator,
+    filterOutContractions: Boolean = false,
+    lang: String = "en"
+  ): Document = {
+    setProcessor(lang)
     Using.resource(Source.fromFile(fn)) { source =>
       readFromSource(source, wordPos, labelPos, setLabels, annotate, filterOutContractions)
     }
   }
 
-  def readFromStream(stream:InputStream,
-                     wordPos:Int = WORD_POS_CONLLX,
-                     labelPos:Int = TAG_POS_CONLLX,
-                     setLabels: (Sentence, Array[String]) => Unit,
-                     annotate: (Document) => Unit,
-                     filterOutContractions:Boolean = false,
-                     lang: String = "en"): Document = {
-
-    // redefine proc acording to the language used
-    if (lang == "pt"){
-      println("Using Portuguese processors")
-      //this.proc = new PortugueseCluProcessor()
-      throw new RuntimeException(s"ERROR: language '$lang' not supported!")
-    } else if(lang == "es") {
-      println("Using Spanish processors")
-      //this.proc = new SpanishCluProcessor()
-      throw new RuntimeException(s"ERROR: language '$lang' not supported!")
-    } else {
-      println("Using English processors")
-      this.proc = new BalaurProcessor()
-    }
-
+  def readFromStream(
+    stream: InputStream,
+    wordPos: Int = WORD_POS_CONLLX,
+    labelPos: Int = TAG_POS_CONLLX,
+    setLabels: LabelSetter,
+    annotate: Annotator,
+    filterOutContractions: Boolean = false,
+    lang: String = "en"
+  ): Document = {
+    setProcessor(lang)
     Using.resource(Source.fromInputStream(stream)) { source =>
       readFromSource(source, wordPos, labelPos, setLabels, annotate, filterOutContractions)
     }
   }
 
-  def readFromSource(source:Source,
-                     wordPos:Int,
-                     labelPos:Int,
-                     setLabels: (Sentence, Array[String]) => Unit,
-                     annotate: (Document) => Unit,
-                     filterOutContractions:Boolean): Document = {
-    var words = new ArrayBuffer[String]()
-    var startOffsets = new ArrayBuffer[Int]()
-    var endOffsets = new ArrayBuffer[Int]()
-    var labels = new ArrayBuffer[String]()
-    var charOffset = 0
+  def readFromSource(
+    source: Source,
+    wordPos: Int,
+    labelPos: Int,
+    setLabels: LabelSetter,
+    annotate: Annotator,
+    filterOutContractions:Boolean
+  ): Document = {
+    val words = new ArrayBuffer[String]()
+    val startOffsets = new ArrayBuffer[Int]()
+    val endOffsets = new ArrayBuffer[Int]()
+    val labels = new ArrayBuffer[String]()
     val sentences = new ArrayBuffer[Sentence]()
-    for(line <- source.getLines()) {
-      val l = line.trim
+    var charOffset = 0
+
+    def mkSentence(): Sentence = {
+      val wordsSeq = new WrappedArraySeq(words.toArray).toImmutableSeq
+      val unlabeledSentence = new Sentence(wordsSeq, startOffsets, endOffsets, wordsSeq)
+
+      words.clear()
+      startOffsets.clear()
+      endOffsets.clear()
+
+      val labeledSentence = setLabels(unlabeledSentence, labels.toSeq)
+
+      labels.clear()
+      labeledSentence
+    }
+
+    source.getLines().map(_.trim).foreach { l =>
       if (l.isEmpty) {
         // end of sentence
         if (words.nonEmpty) {
-          val s = new Sentence(words.toArray, startOffsets.toArray, endOffsets.toArray, words.toArray)
-          setLabels(s, labels.toArray)
-          sentences += s
-          words = new ArrayBuffer[String]()
-          startOffsets = new ArrayBuffer[Int]()
-          endOffsets = new ArrayBuffer[Int]()
-          labels = new ArrayBuffer[String]()
+          sentences += mkSentence()
           charOffset += 1
         }
-      } else {
+      }
+      else {
         // within the same sentence
         val bits = l.split("\\s+")
         if (bits.length < 2)
@@ -125,52 +132,28 @@ object ColumnsToDocument {
         //   10	as	o	DET	_	Gender=Fem|Number=Plur	11	det	_	_
         //
         val offset = bits(0) // we assume token offsets are always in column 0!
-        if(! filterOutContractions || ! offset.contains("-")) {
+        if (!filterOutContractions || ! offset.contains("-")) {
           words += bits(wordPos)
           labels += bits(labelPos)
           startOffsets += charOffset
-          charOffset = bits(wordPos).length
+          charOffset += bits(wordPos).length // advance (not reset) so the end offset is start offset + token length
           endOffsets += charOffset
           charOffset += 1
-        } else {
+        }
+        else {
           // println("Skipped line: " + l)
         }
       }
     }
-    if(words.nonEmpty) {
-      val s = new Sentence(words.toArray, startOffsets.toArray, endOffsets.toArray, words.toArray)
-      s.tags = Some(labels.toArray)
-      sentences += s
-    }
+    if (words.nonEmpty)
+      sentences += mkSentence()
     logger.debug(s"Loaded ${sentences.size} sentences.")
 
-    val d = new Document(sentences.toArray)
-    annotate(d)
-
-    d
-
-  }
-
-  def setTags(s:Sentence, tags:Array[String]): Unit = {
-    s.tags = Some(tags)
-  }
-
-  def setChunks(s:Sentence, chunks:Array[String]): Unit = {
-    s.chunks = Some(chunks)
-  }
-
-  def setEntities(s:Sentence, entities:Array[String]): Unit = {
-    s.entities = Some(entities)
-  }
-
-  def annotateLemmas(doc:Document): Unit = {
-    proc.lemmatize(doc) // some features use lemmas, which are not available in the CoNLL data
-  }
+    val unannotatedDocument = new Document(sentences)
+    val annotatedDocument = annotate(unannotatedDocument)
 
-  def annotateLemmmaTags(doc:Document): Unit = {
-    proc.lemmatize(doc)
-    proc.tagPartsOfSpeech(doc)
+    annotatedDocument
   }
 
-  def annotateNil(doc:Document): Unit = {}
+  def annotateNil(document: Document): Document = document
 }
diff --git a/apps/src/main/scala/org/clulab/processors/apps/CommandLineInterface.scala b/apps/src/main/scala/org/clulab/processors/apps/CommandLineInterface.scala
@@ -3,7 +3,7 @@ package org.clulab.processors.apps
 import org.clulab.processors.Document
 import org.clulab.processors.clu.BalaurProcessor
 import org.clulab.serialization.CoNLLUSerializer
-import org.clulab.utils.{FileUtils, StringUtils}
+import org.clulab.utils.{FileUtils, StringUtils, WrappedArraySeq}
 
 import java.io.PrintWriter
 import scala.util.Using
@@ -36,7 +36,11 @@ object CommandLineInterface extends App {
       } else if(props.containsKey(TOKENS)) {
         // one sentence per line; sentences are tokenized
         val sents = FileUtils.getLinesFromFile(props.getProperty(INPUT))
-        val tokenizedSents = sents.map(_.split("\\s+").toIterable)
+        val tokenizedSents = sents.map { sent =>
+          val tokens = sent.split("\\s+")
+
+          WrappedArraySeq(tokens).toImmutableSeq
+        }
         proc.annotateFromTokens(tokenizedSents)
       } else {
         // assume raw text

diff --git a/apps/src/main/scala/org/clulab/processors/apps/EvalTimeNormApp.scala b/apps/src/main/scala/org/clulab/processors/apps/EvalTimeNormApp.scala
diff --git a/apps/src/main/scala/org/clulab/processors/apps/InfiniteParallelProcessorsExample.scala b/apps/src/main/scala/org/clulab/processors/apps/InfiniteParallelProcessorsExample.scala
@@ -2,23 +2,21 @@ package org.clulab.processors.apps
 
 import org.clulab.processors.Document
 import org.clulab.processors.Processor
+import org.clulab.processors.clu.{BalaurProcessor, DocumentPrettyPrinter}
 import org.clulab.serialization.DocumentSerializer
 import org.clulab.utils.{FileUtils, StringUtils, ThreadUtils, Timer}
 
-import java.io.BufferedOutputStream
 import java.io.File
-import java.io.FileOutputStream
 import java.io.PrintWriter
+import scala.collection.compat._
 import scala.collection.parallel.ParSeq
 import scala.util.Using
-import org.clulab.processors.clu.BalaurProcessor
 
 object InfiniteParallelProcessorsExample {
 
   class ProcessorProvider(reuseProcessor: Boolean) {
     protected val processorOpt: Option[Processor] =
-        if (reuseProcessor) Some(new BalaurProcessor())
-        else None
+        Option.when(reuseProcessor)(new BalaurProcessor())
 
     def newOrReusedProcessor: Processor =
         if (reuseProcessor) processorOpt.get
@@ -37,17 +35,14 @@ object InfiniteParallelProcessorsExample {
     val documentSerializer = new DocumentSerializer
 
     def processFiles(parFiles: ParSeq[File], processor: Processor): Unit = {
-
-      def printDocument(document: Document, printWriter: PrintWriter): Unit = document.prettyPrint(printWriter)
-
       parFiles.foreach { file =>
         println(s"Processing ${file.getName}...")
 
         val text = FileUtils.getTextFromFile(file)
         val outputFile = new File(outputDir + "/" + file.getName)
         val document = processor.annotate(text)
         val printedDocument = StringUtils.viaPrintWriter { printWriter =>
-          printDocument(document, printWriter)
+          new DocumentPrettyPrinter(printWriter).print(document)
         }
         val savedDocument = documentSerializer.save(document)
         val outputDocument = printedDocument + savedDocument