Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
5225cc6
Update scala and sbt
kwalcock May 10, 2025
1d18c8a
Clean up BalaurProcessor
kwalcock May 13, 2025
b540f25
Stop assigning to a val in Document
kwalcock May 14, 2025
7543b9c
Pass the tests
kwalcock May 14, 2025
9ccca36
Compile for Scala 3
kwalcock May 23, 2025
98c8115
Pass Scala3 tests
kwalcock May 23, 2025
211cd2a
NumericUtils
kwalcock May 23, 2025
8ea1301
GraphMapType
kwalcock May 23, 2025
1996cf3
Scala2
kwalcock May 23, 2025
39a9dca
Check in Balaur as well
kwalcock May 23, 2025
cec4087
Start with very basic compatibility
kwalcock May 23, 2025
57d1fa5
Down to last 13
kwalcock May 24, 2025
ed80611
Finish compiling library
kwalcock May 25, 2025
741307c
Compile for other Scala versions
kwalcock May 25, 2025
38369e3
Compile other projects for other Scalas
kwalcock May 25, 2025
deb244b
Compile tests
kwalcock May 25, 2025
e9876cf
Pass tests
kwalcock May 25, 2025
bec8f18
Clean, get webapp to work
kwalcock May 26, 2025
737e538
Remove dead code
kwalcock May 26, 2025
4cfd518
Maintenance
kwalcock May 26, 2025
dbfe52b
Document, Sentence
kwalcock May 26, 2025
2c19b03
Balaur
kwalcock May 26, 2025
55eb202
Remove Scala-specific GraphMap
kwalcock May 26, 2025
3c3f3db
More GraphMap
kwalcock May 26, 2025
61d871d
SeqView again
kwalcock May 26, 2025
e9979ea
Remove spaces
kwalcock May 26, 2025
9c80f42
Update sbt again
kwalcock May 27, 2025
70b031b
Fix a toSeq
kwalcock May 27, 2025
de0041f
Account for immutable doc in some tests
kwalcock May 27, 2025
db9b5e5
Move evaluation resources to app
kwalcock May 27, 2025
7239f2b
Fix test
kwalcock May 27, 2025
8b1c2f3
Make DocumentAttachments immutable
kwalcock May 27, 2025
e986420
Fix test compilation warning
kwalcock May 27, 2025
7d4fec1
Use Option.when
kwalcock May 29, 2025
4f06301
Extract the DocumentPrinter
kwalcock May 29, 2025
0b33f20
Clean up DocumentMaker
kwalcock May 29, 2025
276e894
Fix ColumnsToDocument
kwalcock May 30, 2025
0b93379
Remove unused and duplicate code in NumericUtils
kwalcock May 30, 2025
e826275
Fix typos
kwalcock May 30, 2025
b355af7
Combine named entity without exposing array
kwalcock Jun 2, 2025
000f0ed
Update sbt again
kwalcock Jun 2, 2025
1432985
Fix test
kwalcock Jun 2, 2025
b7da704
Run the cask server
kwalcock Jun 3, 2025
bb3f9f8
Update scalatags, include webapp2
kwalcock Jun 3, 2025
28143ec
Update library versions, cross compile
kwalcock Jun 4, 2025
32ef451
Use scalatags for mention
kwalcock Jun 4, 2025
cac887f
Use scalatags for parse
kwalcock Jun 4, 2025
d04a32d
Cross compile webapp2
kwalcock Jun 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.clulab.utils.JavaUtils;

import java.util.Iterator;
import scala.collection.Seq;

public class ProcessorsJavaExample {
public static void main(String [] args) throws Exception {
Expand All @@ -20,25 +21,25 @@ public static void main(String [] args) throws Exception {
// You are basically done. The rest of this code simply prints out the annotations.

// Let's print the sentence-level annotations.
for (int sentenceIndex = 0; sentenceIndex < doc.sentences().length; sentenceIndex++) {
Sentence sentence = doc.sentences()[sentenceIndex];
for (int sentenceIndex = 0; sentenceIndex < doc.sentences().length(); sentenceIndex++) {
Sentence sentence = doc.sentences().apply(sentenceIndex);
System.out.println("Sentence #" + sentenceIndex + ":");
System.out.println("Tokens: " + mkString(sentence.words()));
System.out.println("Start character offsets: " + mkString(sentence.startOffsets()));
System.out.println("End character offsets: " + mkString(sentence.endOffsets()));
System.out.println("Tokens: " + mkStringStr(sentence.words()));
System.out.println("Start character offsets: " + mkStringInt(sentence.startOffsets()));
System.out.println("End character offsets: " + mkStringInt(sentence.endOffsets()));

// These annotations are optional, so they are stored using Option objects,
// hence the isDefined() and get() calls.
if (sentence.lemmas().isDefined())
System.out.println("Lemmas: " + mkString(sentence.lemmas().get()));
System.out.println("Lemmas: " + mkStringStr(sentence.lemmas().get()));
if (sentence.tags().isDefined())
System.out.println("POS tags: " + mkString(sentence.tags().get()));
System.out.println("POS tags: " + mkStringStr(sentence.tags().get()));
if (sentence.chunks().isDefined())
System.out.println("Chunks: " + mkString(sentence.chunks().get()));
System.out.println("Chunks: " + mkStringStr(sentence.chunks().get()));
if (sentence.entities().isDefined())
System.out.println("Named entities: " + mkString(sentence.entities().get()));
System.out.println("Named entities: " + mkStringStr(sentence.entities().get()));
if (sentence.norms().isDefined())
System.out.println("Normalized entities: " + mkString(sentence.norms().get()));
System.out.println("Normalized entities: " + mkStringStr(sentence.norms().get()));
if (sentence.dependencies().isDefined()) {
System.out.println("Syntactic dependencies:");
Iterator<scala.Tuple3<Object, Object, String>> iterator =
Expand All @@ -53,27 +54,27 @@ public static void main(String [] args) throws Exception {
}
}

public static String mkString(String[] strings, String sep) {
public static String mkStringStr(Seq<String> strings, String sep) {
StringBuilder stringBuilder = new StringBuilder();
for (int i = 0; i < strings.length; i ++) {
for (int i = 0; i < strings.length(); i ++) {
if (i > 0) stringBuilder.append(sep);
stringBuilder.append(strings[i]);
stringBuilder.append(strings.apply(i));
}
return stringBuilder.toString();
}

public static String mkString(String[] strings) { return mkString(strings, " "); }
public static String mkStringStr(Seq<String> strings) { return mkStringStr(strings, " "); }

public static String mkString(int[] ints, String sep) {
public static String mkStringInt(Seq<Object> ints, String sep) {
StringBuilder stringBuilder = new StringBuilder();
for (int i = 0; i < ints.length; i ++) {
for (int i = 0; i < ints.length(); i ++) {
if (i > 0) stringBuilder.append(sep);
stringBuilder.append(ints[i]);
stringBuilder.append(ints.apply(i));
}
return stringBuilder.toString();
}

public static String mkString(int[] ints) { return mkString(ints, " "); }
public static String mkStringInt(Seq<Object> ints) { return mkStringInt(ints, " "); }

public static<T> Iterable<T> iteratorToIterable(Iterator<T> iterator) { return () -> iterator; }
}
163 changes: 73 additions & 90 deletions apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package org.clulab.processors.apps

import org.clulab.processors.{Document, Processor, Sentence}
import org.clulab.processors.clu.BalaurProcessor
import org.clulab.scala.WrappedArrayBuffer._
import org.clulab.utils.WrappedArraySeq
import org.slf4j.{Logger, LoggerFactory}

import java.io.InputStream
Expand All @@ -17,101 +19,106 @@ class ColumnsToDocument
* Last Modified: Fix compiler issue: import scala.io.Source.
*/
object ColumnsToDocument {
val logger:Logger = LoggerFactory.getLogger(classOf[ColumnsToDocument])
type LabelSetter = (Sentence, Seq[String]) => Sentence
type Annotator = (Document) => Document
val logger: Logger = LoggerFactory.getLogger(classOf[ColumnsToDocument])

val WORD_POS_CONLLX = 1
val TAG_POS_CONLLX = 4
val WORD_POS_CONLLU = 1
val TAG_POS_CONLLU = 3

var proc:Processor = new BalaurProcessor()
var proc: Processor = new BalaurProcessor()
var prevLang: String = "en"

def readFromFile(fn:String,
wordPos:Int = WORD_POS_CONLLX,
labelPos:Int = TAG_POS_CONLLX,
setLabels: (Sentence, Array[String]) => Unit,
annotate: (Document) => Unit,
filterOutContractions:Boolean = false,
lang: String = "en"
): Document = {

// redefine proc according to the language used
protected def setProcessor(lang: String): Unit = {
if (lang != prevLang) {
if (lang == "pt") {
println("Using Portuguese processors")
throw new RuntimeException(s"ERROR: language '$lang' not supported!")
//this.proc = new PortugueseCluProcessor()
} else if (lang == "es") {
}
else if (lang == "es") {
println("Using Spanish processors")
//this.proc = new SpanishCluProcessor()
throw new RuntimeException(s"ERROR: language '$lang' not supported!")
} else {
}
else {
println("Using English processors")
this.proc = new BalaurProcessor()
}
this.prevLang = lang
}
}

def readFromFile(
fn: String,
wordPos: Int = WORD_POS_CONLLX,
labelPos: Int = TAG_POS_CONLLX,
setLabels: LabelSetter,
annotate: Annotator,
filterOutContractions: Boolean = false,
lang: String = "en"
): Document = {
setProcessor(lang)
Using.resource(Source.fromFile(fn)) { source =>
readFromSource(source, wordPos, labelPos, setLabels, annotate, filterOutContractions)
}
}

def readFromStream(stream:InputStream,
wordPos:Int = WORD_POS_CONLLX,
labelPos:Int = TAG_POS_CONLLX,
setLabels: (Sentence, Array[String]) => Unit,
annotate: (Document) => Unit,
filterOutContractions:Boolean = false,
lang: String = "en"): Document = {

// redefine proc according to the language used
if (lang == "pt"){
println("Using Portuguese processors")
//this.proc = new PortugueseCluProcessor()
throw new RuntimeException(s"ERROR: language '$lang' not supported!")
} else if(lang == "es") {
println("Using Spanish processors")
//this.proc = new SpanishCluProcessor()
throw new RuntimeException(s"ERROR: language '$lang' not supported!")
} else {
println("Using English processors")
this.proc = new BalaurProcessor()
}

def readFromStream(
stream: InputStream,
wordPos: Int = WORD_POS_CONLLX,
labelPos: Int = TAG_POS_CONLLX,
setLabels: LabelSetter,
annotate: Annotator,
filterOutContractions: Boolean = false,
lang: String = "en"
): Document = {
setProcessor(lang)
Using.resource(Source.fromInputStream(stream)) { source =>
readFromSource(source, wordPos, labelPos, setLabels, annotate, filterOutContractions)
}
}

def readFromSource(source:Source,
wordPos:Int,
labelPos:Int,
setLabels: (Sentence, Array[String]) => Unit,
annotate: (Document) => Unit,
filterOutContractions:Boolean): Document = {
var words = new ArrayBuffer[String]()
var startOffsets = new ArrayBuffer[Int]()
var endOffsets = new ArrayBuffer[Int]()
var labels = new ArrayBuffer[String]()
var charOffset = 0
def readFromSource(
source: Source,
wordPos: Int,
labelPos: Int,
setLabels: LabelSetter,
annotate: Annotator,
filterOutContractions:Boolean
): Document = {
val words = new ArrayBuffer[String]()
val startOffsets = new ArrayBuffer[Int]()
val endOffsets = new ArrayBuffer[Int]()
val labels = new ArrayBuffer[String]()
val sentences = new ArrayBuffer[Sentence]()
for(line <- source.getLines()) {
val l = line.trim
var charOffset = 0

def mkSentence(): Sentence = {
val wordsSeq = new WrappedArraySeq(words.toArray).toImmutableSeq
val unlabeledSentence = new Sentence(wordsSeq, startOffsets, endOffsets, wordsSeq)

words.clear()
startOffsets.clear()
endOffsets.clear()

val labeledSentence = setLabels(unlabeledSentence, labels.toSeq)

labels.clear()
labeledSentence
}

source.getLines().map(_.trim).foreach { l =>
if (l.isEmpty) {
// end of sentence
if (words.nonEmpty) {
val s = new Sentence(words.toArray, startOffsets.toArray, endOffsets.toArray, words.toArray)
setLabels(s, labels.toArray)
sentences += s
words = new ArrayBuffer[String]()
startOffsets = new ArrayBuffer[Int]()
endOffsets = new ArrayBuffer[Int]()
labels = new ArrayBuffer[String]()
sentences += mkSentence()
charOffset += 1
}
} else {
}
else {
// within the same sentence
val bits = l.split("\\s+")
if (bits.length < 2)
Expand All @@ -125,52 +132,28 @@ object ColumnsToDocument {
// 10 as o DET _ Gender=Fem|Number=Plur 11 det _ _
//
val offset = bits(0) // we assume token offsets are always in column 0!
if(! filterOutContractions || ! offset.contains("-")) {
if (!filterOutContractions || ! offset.contains("-")) {
words += bits(wordPos)
labels += bits(labelPos)
startOffsets += charOffset
charOffset = bits(wordPos).length
endOffsets += charOffset
charOffset += 1
} else {
}
else {
// println("Skipped line: " + l)
}
}
}
if(words.nonEmpty) {
val s = new Sentence(words.toArray, startOffsets.toArray, endOffsets.toArray, words.toArray)
s.tags = Some(labels.toArray)
sentences += s
}
if (words.nonEmpty)
sentences += mkSentence()
logger.debug(s"Loaded ${sentences.size} sentences.")

val d = new Document(sentences.toArray)
annotate(d)

d

}

def setTags(s:Sentence, tags:Array[String]): Unit = {
s.tags = Some(tags)
}

def setChunks(s:Sentence, chunks:Array[String]): Unit = {
s.chunks = Some(chunks)
}

def setEntities(s:Sentence, entities:Array[String]): Unit = {
s.entities = Some(entities)
}

def annotateLemmas(doc:Document): Unit = {
proc.lemmatize(doc) // some features use lemmas, which are not available in the CoNLL data
}
val unannotatedSentence = new Document(sentences)
val annotatedSentence = annotate(unannotatedSentence)

def annotateLemmmaTags(doc:Document): Unit = {
proc.lemmatize(doc)
proc.tagPartsOfSpeech(doc)
annotatedSentence
}

def annotateNil(doc:Document): Unit = {}
def annotateNil(document: Document): Document = document
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package org.clulab.processors.apps
import org.clulab.processors.Document
import org.clulab.processors.clu.BalaurProcessor
import org.clulab.serialization.CoNLLUSerializer
import org.clulab.utils.{FileUtils, StringUtils}
import org.clulab.utils.{FileUtils, StringUtils, WrappedArraySeq}

import java.io.PrintWriter
import scala.util.Using
Expand Down Expand Up @@ -36,7 +36,11 @@ object CommandLineInterface extends App {
} else if(props.containsKey(TOKENS)) {
// one sentence per line; sentences are tokenized
val sents = FileUtils.getLinesFromFile(props.getProperty(INPUT))
val tokenizedSents = sents.map(_.split("\\s+").toIterable)
val tokenizedSents = sents.map { sent =>
val tokens = sent.split("\\s+")

WrappedArraySeq(tokens).toImmutableSeq
}
proc.annotateFromTokens(tokenizedSents)
} else {
// assume raw text
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,21 @@ package org.clulab.processors.apps

import org.clulab.processors.Document
import org.clulab.processors.Processor
import org.clulab.processors.clu.{BalaurProcessor, DocumentPrettyPrinter}
import org.clulab.serialization.DocumentSerializer
import org.clulab.utils.{FileUtils, StringUtils, ThreadUtils, Timer}

import java.io.BufferedOutputStream
import java.io.File
import java.io.FileOutputStream
import java.io.PrintWriter
import scala.collection.compat._
import scala.collection.parallel.ParSeq
import scala.util.Using
import org.clulab.processors.clu.BalaurProcessor

object InfiniteParallelProcessorsExample {

class ProcessorProvider(reuseProcessor: Boolean) {
protected val processorOpt: Option[Processor] =
if (reuseProcessor) Some(new BalaurProcessor())
else None
Option.when(reuseProcessor)(new BalaurProcessor())

def newOrReusedProcessor: Processor =
if (reuseProcessor) processorOpt.get
Expand All @@ -37,17 +35,14 @@ object InfiniteParallelProcessorsExample {
val documentSerializer = new DocumentSerializer

def processFiles(parFiles: ParSeq[File], processor: Processor): Unit = {

def printDocument(document: Document, printWriter: PrintWriter): Unit = document.prettyPrint(printWriter)

parFiles.foreach { file =>
println(s"Processing ${file.getName}...")

val text = FileUtils.getTextFromFile(file)
val outputFile = new File(outputDir + "/" + file.getName)
val document = processor.annotate(text)
val printedDocument = StringUtils.viaPrintWriter { printWriter =>
printDocument(document, printWriter)
new DocumentPrettyPrinter(printWriter).print(document)
}
val savedDocument = documentSerializer.save(document)
val outputDocument = printedDocument + savedDocument
Expand Down
Loading