clulab
diff --git a/‎main/src/main/python/__init__.py‎ b/‎main/src/main/python/__init__.py‎
diff --git a/‎main/src/main/python/pytorch/__init__.py‎ b/‎main/src/main/python/pytorch/__init__.py‎
diff --git a/‎main/src/main/python/pytorch/metal.py‎ b/‎main/src/main/python/pytorch/metal.py‎
diff --git a/‎main/src/main/python/pytorch/taskManager.py‎
Lines changed: 213 additions & 0 deletions b/‎main/src/main/python/pytorch/taskManager.py‎
Lines changed: 213 additions & 0 deletions
diff --git a/‎main/src/main/python/run.py‎
Lines changed: 25 additions & 0 deletions b/‎main/src/main/python/run.py‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎main/src/main/python/sequences/__init__.py‎ b/‎main/src/main/python/sequences/__init__.py‎
diff --git a/‎main/src/main/python/sequences/columnReader.py‎
Lines changed: 48 additions & 0 deletions b/‎main/src/main/python/sequences/columnReader.py‎
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,213 @@
+import random
+import math
+from sequences.columnReader import ColumnReader
+
+# Numeric identifiers for the task types; TaskManager.parseType maps the
+# config strings "basic" and "dual" onto these values.
+TYPE_BASIC = 0
+TYPE_DUAL = 1
+
+class TaskManager:
+  """Reads all tasks named in the configuration and interleaves their training shards.
+
+  Configuration keys read (default in parentheses where the code supplies one):
+  mtl.shardsPerEpoch (10), mtl.maxEpochs (100), mtl.epochPatience (5),
+  mtl.numberOfTasks, and for each 1-based task number N: mtl.taskN.name,
+  mtl.taskN.train, mtl.taskN.dev, mtl.taskN.test, mtl.taskN.type ("basic")
+  and mtl.taskN.weight (1.0).
+  """
+
+  def __init__(self, config, seed):
+    """Load every task from `config` and build the interleaved shard list.
+
+    config -- configuration object offering get_int/get_string/get_float and `in`
+    seed   -- random seed handed to every SentenceIterator made by getSentences
+    """
+    self.config = config
+    self.random = seed
+
+    # How many shards to have per epoch
+    self.shardsPerEpoch: int = config.get_int("mtl.shardsPerEpoch", 10)
+
+    # Total number of epochs
+    self.maxEpochs: int = config.get_int("mtl.maxEpochs", 100)
+
+    # Training patience in number of epochs
+    self.epochPatience: int = config.get_int("mtl.epochPatience", 5)
+
+    # Array of all tasks to be managed
+    self.tasks = self.readTasks()
+
+    self.taskCount = len(self.tasks)
+    self.indices = range(self.taskCount)
+
+    # Training shards from all tasks
+    self.shards = self.mkShards()
+
+  # Construct training shards by interleaving shards from all tasks
+  def mkShards(self):
+    """Return one list holding shard 0 of every task, then shard 1 of every task, and so on."""
+    shardsByTasks = list()
+
+    # construct the shards for each task
+    for i in self.indices:
+      shardsByTasks.append(self.tasks[i].mkShards())
+      # the interleaving below indexes every task with the same shard numbers
+      assert len(shardsByTasks[i]) == self.shardsPerEpoch, \
+        f"task {i} produced {len(shardsByTasks[i])} shards, expected {self.shardsPerEpoch}"
+
+    # now interleave the tasks
+    return [shardsByTasks[j][i]
+            for i in range(self.shardsPerEpoch)
+            for j in self.indices]
+
+  # Iterator over all sentences coming from all interleaved shards
+  def getSentences(self):
+    """Return a fresh SentenceIterator over all tasks' training sentences."""
+    return SentenceIterator(self.tasks, self.shards, self.random)
+
+  # Reads all tasks from disk in memory
+  def readTasks(self):
+    """Read tasks 1..mtl.numberOfTasks from the configuration and return them as a list."""
+    numberOfTasks = self.config.get_int("mtl.numberOfTasks", None)
+    # task numbers in the config file are 1-based
+    tasks = [self.readTask(i + 1) for i in range(numberOfTasks)]
+
+    print (f"Read {numberOfTasks} tasks from config file.")
+    return tasks
+
+  def readTask(self, taskNumber):
+    """Build the Task described under the `mtl.task<taskNumber>.*` keys (taskNumber is 1-based)."""
+    prefix = f"mtl.task{taskNumber}"
+    taskName = self.config.get_string(f"{prefix}.name", None)
+    train = self.config.get_string(f"{prefix}.train", None)
+
+    # dev and test files are optional
+    dev = self.config.get_string(f"{prefix}.dev", None) if f"{prefix}.dev" in self.config else None
+    test = self.config.get_string(f"{prefix}.test", None) if f"{prefix}.test" in self.config else None
+
+    taskType = self.parseType(self.config.get_string(f"{prefix}.type", "basic"))
+
+    weight = self.config.get_float(f"{prefix}.weight", 1.0)
+
+    # Task ids are 0-based so they can index self.tasks
+    return Task(taskNumber - 1, taskName, taskType, self.shardsPerEpoch, weight, train, dev, test)
+
+  def parseType(self, inf):
+    """Map the config string "basic"/"dual" to TYPE_BASIC/TYPE_DUAL; raise ValueError otherwise."""
+    if inf == "basic": return TYPE_BASIC
+    elif inf == "dual": return TYPE_DUAL
+    else: raise ValueError(f"ERROR: unknown task type {inf}!")
+
+  def debugTraversal(self):
+    """Walk through every epoch and print how many consecutive sentences came from each task."""
+    for epoch in range(self.maxEpochs):
+      print (f"Started epoch {epoch}")
+      sentCount = 0
+      taskId = 0
+      totalSents = 0
+      # each item is a (taskId, sentence) pair
+      for sentence in self.getSentences():
+        totalSents += 1
+        if sentence[0] != taskId:
+          print (f"Read {sentCount} sentences from task {taskId}")
+          taskId = sentence[0]
+          sentCount = 1
+        else:
+          sentCount += 1
+      print (f"Read {sentCount} sentences from task {taskId}")
+      print (f"Read {totalSents} sentences in epoch {epoch}.")
+
+class SentenceIterator(object):
+  """Iterates over the training sentences of all tasks in randomized order.
+
+  The shards are shuffled first, then the sentence positions inside each shard.
+  Each item produced is a `(taskId, sentence)` tuple. Every new iterator
+  reseeds the global `random` module with the given seed, so two iterators
+  built from the same shards and seed yield the same order.
+  """
+
+  class Sentence:
+    """Location of one training sentence: the owning task and its position in that task."""
+    def __init__(self, taskId, sentencePosition):
+      self.taskId = taskId
+      self.sentencePosition = sentencePosition
+
+  def __init__(self, tasks, shards, random):
+    """Store the inputs and compute the randomized traversal order.
+
+    tasks  -- sequence indexed by task id; each item exposes `trainSentences`
+    shards -- objects exposing `taskId`, `startPosition` and `endPosition`
+    random -- the random seed (NOT the `random` module, which this name shadows here)
+    """
+    self.tasks = tasks
+    self.shards = shards
+    self.random = random #random seed
+
+    # Offset in randomizedSentencePositions array
+    self.sentenceOffset = 0
+    self.randomizedSentencePositions = self.randomizeSentences()
+
+  # Randomizes all sentences across all tasks
+  def randomizeSentences(self):
+    """Return a list of Sentence objects covering every shard, in shuffled order."""
+    random.seed(self.random)
+    # first, randomize the shards; random.shuffle works in place and returns
+    # None, so shuffle a copy to leave the caller's shard list untouched
+    randomizedShards = list(self.shards)
+    random.shuffle(randomizedShards)
+    randomizedSents = list()
+    for shard in randomizedShards:
+      # second, randomize the sentences inside each shard
+      sents = list(range(shard.startPosition, shard.endPosition))
+      random.shuffle(sents)
+      # store the randomized sentences
+      randomizedSents.extend(self.Sentence(shard.taskId, sent) for sent in sents)
+    return randomizedSents
+
+  def __len__(self):
+    return len(self.randomizedSentencePositions)
+
+  def __iter__(self):
+    return self
+
+  def hasNext(self):
+    """Return True while at least one sentence is left to read."""
+    return self.sentenceOffset < len(self.randomizedSentencePositions)
+
+  def __next__(self):
+    """Return the next `(taskId, sentence)` pair; raise StopIteration when exhausted."""
+    # the iterator protocol needs StopIteration so that `for` loops terminate
+    if not self.hasNext():
+      raise StopIteration
+
+    s = self.randomizedSentencePositions[self.sentenceOffset]
+    tid = s.taskId
+    sentence = self.tasks[tid].trainSentences[s.sentencePosition]
+    self.sentenceOffset += 1
+
+    return (tid, sentence)
+
+class Shard:
+  """A range of training-sentence positions belonging to one task."""
+
+  def __init__(self, taskId, startPosition, endPosition):
+    # keep the owning task's id together with the bounds of the range
+    self.taskId, self.startPosition, self.endPosition = (
+      taskId, startPosition, endPosition)
+
+class Task:
+  """One task: its train/dev/test sentences plus the data needed to shard the training set."""
+
+  def __init__(self,
+  taskId, # this starts at 0 so we can use it as an index in the array of tasks
+  taskName:str,
+  taskType:int,
+  shardsPerEpoch:int,
+  taskWeight:float,
+  trainFileName:str,
+  devFileName:str = None,
+  testFileName:str = None):
+    """Read the task's data files and print a summary.
+
+    taskId         -- 0-based id of the task
+    taskName       -- name shown in the log output
+    taskType       -- TYPE_BASIC or TYPE_DUAL
+    shardsPerEpoch -- number of shards the training sentences are divided by
+    taskWeight     -- weight of the task (stored and printed here)
+    trainFileName  -- training file, read with ColumnReader.readColumns
+    devFileName    -- optional development file
+    testFileName   -- optional testing file
+    """
+    self.taskId = taskId
+    self.taskName = taskName
+    self.taskWeight = taskWeight
+    taskNumber = taskId + 1
+    print (f"Reading task {taskNumber} ({taskName})...")
+    self.trainSentences = ColumnReader.readColumns(trainFileName)
+    self.devSentences = ColumnReader.readColumns(devFileName) if devFileName else None
+    self.testSentences = ColumnReader.readColumns(testFileName) if testFileName else None
+
+    self.isBasic: bool = taskType == TYPE_BASIC
+    self.isDual: bool = taskType == TYPE_DUAL
+
+    # human-readable name of the task type, for logging
+    self.prettyType = {TYPE_BASIC: "basic", TYPE_DUAL: "dual"}.get(taskType, "unknown")
+
+    # The size of the training shard for this task
+    self.shardSize = math.ceil(len(self.trainSentences) / shardsPerEpoch)
+
+    # Current position in the training sentences when we iterate during training
+    self.currentTrainingSentencePosition = 0
+
+    print (f"============ starting task {taskNumber} ============")
+    print (f"Read {len(self.trainSentences)} training sentences for task {taskNumber}, with shard size {self.shardSize}.")
+    if self.devSentences is not None:
+      print (f"Read {len(self.devSentences)} development sentences for task {taskNumber}.")
+    if self.testSentences is not None:
+      print (f"Read {len(self.testSentences)} testing sentences for task {taskNumber}.")
+    print (f"Using taskWeight = {taskWeight}")
+    print (f"Task type = {self.prettyType}.")
+    print (f"============ completed task {taskNumber} ============")
+
+  # Construct the shards from all training sentences in this task
+  def mkShards(self):
+    """Return consecutive Shards of at most `shardSize` sentences covering all training sentences."""
+    shards = list()
+    total = len(self.trainSentences)
+    crtPos = 0
+    while crtPos < total:
+      # the last shard may be shorter than shardSize
+      endPos = min(crtPos + self.shardSize, total)
+      shards.append(Shard(self.taskId, crtPos, endPos))
+      crtPos = endPos
+    return shards
@@ -0,0 +1,25 @@
+from pyhocon import ConfigFactory
+import argparse
+from pytorch.taskManager import TaskManager
+
+if __name__ == '__main__':
+    # Command-line entry point: parses the options and, in --train mode,
+    # loads the configuration and builds the TaskManager.
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model_file', type=str, help='Filename of the model.')
+    parser.add_argument('--train', action='store_true', help='Set the code to training purpose.')
+    parser.add_argument('--test', action='store_true', help='Set the code to testing purpose.')
+    parser.add_argument('--shell', action='store_true', help='Set the code to shell mode.')
+    parser.add_argument('--config', type=str, help='Filename of the configuration.')
+    parser.add_argument('--seed', type=int, default=1234)
+    args = parser.parse_args()
+
+    if args.train:
+        # without --config the path below would silently become ".../None.conf"
+        if not args.config:
+            parser.error('--config is required together with --train')
+        # the path is relative to the current working directory
+        config = ConfigFactory.parse_file(f'../resources/org/clulab/{args.config}.conf')
+        taskManager = TaskManager(config, args.seed)
+        # TODO: training itself is not wired in yet:
+        # modelName = args.model_file
+        # mtl = Metal(taskManager, parameters, None)
+        # mtl.train(modelName)
+    elif args.test:
+        pass
+    elif args.shell:
+        pass
@@ -0,0 +1,48 @@
+#-----------------------------------------------------------
+#  Reads the CoNLL-like column format
+#-----------------------------------------------------------
+class ColumnReader:
+  """Reader for the CoNLL-like column format: one token per line, blank lines between sentences."""
+
+  @staticmethod
+  def readColumns(source):
+    """Read all sentences from `source`.
+
+    source -- a file name (str), or an already opened iterable of lines that
+              offers close(); the stream is closed before returning
+
+    Returns a list of sentences, each sentence being a list of Row objects.
+    Raises RuntimeError for a non-blank line with fewer than two columns.
+    """
+    if isinstance(source, str):
+      source = open(source)
+    sentence = list()
+    sentences = list()
+    try:
+      for line in source:
+        l = line.strip()
+        if not l:
+          # end of sentence
+          if sentence:
+            sentences.append(sentence)
+            sentence = list()
+        else:
+          # within the same sentence; columns are separated by whitespace
+          bits = l.split()
+          if len(bits) < 2:
+            raise RuntimeError(f"ERROR: invalid line {l}!")
+          sentence.append(Row(bits))
+    finally:
+      # close the stream even when a malformed line aborts the read
+      source.close()
+
+    # the last sentence may not be followed by a blank line
+    if sentence:
+      sentences.append(sentence)
+
+    return sentences
+
+# -----------------------------------------------------------
+# Stores training data for sequence modeling
+# Mandatory columns: 0 - word, 1 - label
+# Optional columns: 2 - POS tag, 3+ SRL arguments
+# @param tokens the column values of one line, in column order
+# -----------------------------------------------------------
+
+class Row:
+  """The column values (tokens) of one line of a column-formatted file."""
+
+  def __init__(self, tokens):
+    """Store `tokens` and cache their count in `length`."""
+    self.tokens = tokens
+    self.length = len(tokens)
+
+  def get(self, idx):
+    """Return the value of column `idx`; raise RuntimeError if the row has no such column."""
+    if idx >= self.length:
+      raise RuntimeError(f"ERROR: trying to read field #{idx}, which does not exist in this row: {self.tokens}!")
+    return self.tokens[idx]