import numpy as np
import math
import torch.nn as nn
import torch


class WordEmbeddingMap:
    """Wraps a frozen nn.Embedding built from a GloVe-style text matrix."""

    def __init__(self, config):
        # load() returns (word -> vector dict, embedding size, word -> row index, nn.Embedding)
        self.emb_dict, self.dim, self.w2i, self.emb = load(config)

    def isOutOfVocabulary(self, word):
        """True if the word has no row in the pretrained embedding matrix."""
        return word not in self.w2i


def load(config):
    """Read the embedding matrix named by config key "glove.matrixResourceName".

    Each line is "<word><delim><v1><delim><v2>...", tab- or space-delimited.
    Lines with exactly two whitespace tokens are skipped: they are assumed to
    be a word2vec-style "<rows> <cols>" header rather than a 1-d embedding.

    Returns (emb_dict, embedding_size, w2i, emb) where emb is a frozen
    nn.Embedding whose row order follows w2i.

    Raises ValueError if the file contains no embedding rows.
    """
    emb_dict = dict()
    w2i = {}
    i = 0
    embedding_size = None
    # "with" guarantees the file is closed; the original leaked the handle.
    with open(config.get_string("glove.matrixResourceName")) as matrix_file:
        for line in matrix_file:
            if len(line.split()) == 2:
                continue  # word2vec-style "<rows> <cols>" header line
            delimiter = "\t" if "\t" in line else " "
            # NOTE(review): split(" ") yields empty tokens on runs of spaces;
            # assumes the file is strictly single-space delimited — confirm.
            word, *rest = line.rstrip().split(delimiter)
            w2i[word] = i
            i += 1
            vector = np.array(list(map(float, rest)))  # deliberately NOT normalized
            embedding_size = vector.shape[0]
            emb_dict[word] = vector

    if embedding_size is None:
        # Original code raised an opaque NameError here (embedding_size unbound).
        raise ValueError("empty embedding matrix: no vector rows found")

    weights = np.zeros((len(emb_dict), embedding_size))
    for w, row in w2i.items():
        weights[row] = emb_dict[w]
    emb = nn.Embedding.from_pretrained(torch.FloatTensor(weights), freeze=True)
    return emb_dict, embedding_size, w2i, emb

# (start of constEmbeddingsGlove.py in the original patch)
from dataclasses import dataclass
import torch.nn as nn
from embeddings.wordEmbeddingMap import *
from pyhocon import ConfigFactory
import torch


@dataclass
class ConstEmbeddingParameters:
    """Bundle of the frozen pretrained embedding table and its word->row map."""
    emb: nn.Embedding
    w2i: dict


class _ConstEmbeddingsGlove:
    """Process-wide holder of the frozen GloVe embeddings.

    Instantiated exactly once at module import time (ConstEmbeddingsGlove below),
    so the embedding matrix is read from disk a single time per process.
    """

    def __init__(self):
        self.SINGLETON_WORD_EMBEDDING_MAP = None
        self.cep = None
        # NOTE(review): relative path assumes the CWD is the python/ source
        # directory — confirm before running from elsewhere.
        config = ConfigFactory.parse_file('../resources/org/clulab/glove.conf')
        self.load(config)
        self.dim = self.SINGLETON_WORD_EMBEDDING_MAP.dim

    def load(self, config):
        # Load lazily and only once; later calls are no-ops.
        if self.SINGLETON_WORD_EMBEDDING_MAP is None:
            self.SINGLETON_WORD_EMBEDDING_MAP = WordEmbeddingMap(config)
            self.cep = ConstEmbeddingParameters(self.SINGLETON_WORD_EMBEDDING_MAP.emb, self.SINGLETON_WORD_EMBEDDING_MAP.w2i)

    def get_ConstLookupParams(self):
        return self.cep


ConstEmbeddingsGlove = _ConstEmbeddingsGlove()
# (end of constEmbeddingsGlove.py; start of embeddingLayer.py)
from pytorch.initialLayer import InitialLayer
import random
from pytorch.utils import *
import torch.nn as nn
import torch
from pytorch.constEmbeddingsGlove import ConstEmbeddingsGlove

DEFAULT_DROPOUT_PROB: float = DEFAULT_DROPOUT_PROBABILITY
DEFAULT_LEARNED_WORD_EMBEDDING_SIZE: int = 128
DEFAULT_CHAR_EMBEDDING_SIZE: int = 32
DEFAULT_CHAR_RNN_STATE_SIZE: int = 16
DEFAULT_POS_TAG_EMBEDDING_SIZE: int = -1   # no POS tag embeddings by default
DEFAULT_NE_TAG_EMBEDDING_SIZE: int = -1    # no NE tag embeddings by default
DEFAULT_DISTANCE_EMBEDDING_SIZE: int = -1  # no distance embeddings by default
DEFAULT_POSITION_EMBEDDING_SIZE: int = -1  # no position embeddings by default
DEFAULT_DISTANCE_WINDOW_SIZE: int = -1
DEFAULT_USE_IS_PREDICATE: int = -1
random.seed(RANDOM_SEED)


class EmbeddingLayer(InitialLayer):
    """Initial layer: turns a sentence into per-token embedding vectors.

    The final per-token vector is the concatenation of: frozen GloVe vector,
    learned word embedding, char biLSTM state, and (when configured) POS tag,
    NE tag, distance-to-predicate, position, and is-predicate features.
    """

    def __init__(self, w2i,                      # word to index
                 w2f,                            # word to frequency
                 c2i,                            # character to index
                 tag2i,                          # POS tag to index
                 ne2i,                           # NE tag to index
                 learnedWordEmbeddingSize,       # size of the learned word embedding
                 charEmbeddingSize,              # size of the character embedding
                 charRnnStateSize,               # size of each one of the char-level RNNs
                 posTagEmbeddingSize,            # size of the POS tag embedding
                 neTagEmbeddingSize,             # size of the NE tag embedding
                 distanceEmbeddingSize,
                 distanceWindowSize,             # window considered for distance values (relative to predicate)
                 positionEmbeddingSize,
                 useIsPredicate,                 # if true, add a Boolean bit to indicate if current word is the predicate
                 wordLookupParameters,
                 charLookupParameters,
                 charRnnBuilder,                 # (input, state, layers, bi, dropout) tuple for the char LSTM
                 posTagLookupParameters,
                 neTagLookupParameters,
                 distanceLookupParameters,
                 positionLookupParameters,
                 dropoutProb):
        super().__init__()
        self.w2i = w2i
        self.w2f = w2f
        self.c2i = c2i
        self.tag2i = tag2i
        self.ne2i = ne2i
        self.learnedWordEmbeddingSize = learnedWordEmbeddingSize
        self.charEmbeddingSize = charEmbeddingSize
        self.charRnnStateSize = charRnnStateSize
        self.posTagEmbeddingSize = posTagEmbeddingSize
        self.neTagEmbeddingSize = neTagEmbeddingSize
        self.distanceEmbeddingSize = distanceEmbeddingSize
        self.distanceWindowSize = distanceWindowSize
        self.positionEmbeddingSize = positionEmbeddingSize
        self.useIsPredicate = useIsPredicate
        self.wordLookupParameters = wordLookupParameters
        self.charLookupParameters = charLookupParameters
        # the tuple is expanded into an actual nn.LSTM here
        self.charRnnBuilder = mkBuilder(*charRnnBuilder)
        self.posTagLookupParameters = posTagLookupParameters
        self.neTagLookupParameters = neTagLookupParameters
        self.distanceLookupParameters = distanceLookupParameters
        self.positionLookupParameters = positionLookupParameters
        self.dropoutProb = dropoutProb

        # optional feature widths contribute 0 when the lookup table is absent
        posTagDim = posTagEmbeddingSize if posTagLookupParameters else 0
        neTagDim = neTagEmbeddingSize if neTagLookupParameters else 0
        distanceDim = distanceEmbeddingSize if distanceLookupParameters else 0
        predicateDim = 1 if distanceLookupParameters and useIsPredicate else 0
        positionDim = positionEmbeddingSize if positionLookupParameters else 0
        # char biLSTM contributes 2 * state size (forward + backward)
        self.outDim = ConstEmbeddingsGlove.dim + learnedWordEmbeddingSize + charRnnStateSize * 2 + posTagDim + neTagDim + distanceDim + positionDim + predicateDim

    def forward(self, sentence, constEmbeddings, doDropout):
        """Embed one sentence; returns a (len(words), outDim) tensor."""
        words = sentence.words
        tags = sentence.posTags
        nes = sentence.neTags
        headPositions = sentence.headPositions

        # const word embeddings such as GloVe
        constEmbeddingsExpressions = self.mkConstEmbeddings(words, constEmbeddings)
        assert(constEmbeddingsExpressions.size(0) == len(words))
        if tags: assert(len(tags) == len(words))
        if nes: assert(len(nes) == len(words))
        if headPositions: assert(len(headPositions) == len(words))

        # build the word embeddings one by one
        embeddings = self.mkEmbeddings(words, constEmbeddingsExpressions, doDropout, tags, nes, headPositions)
        return embeddings

    def mkConstEmbeddings(self, words, constEmbeddings):
        """Look up the frozen pretrained vectors; OOV words map to row 0."""
        idxs = torch.LongTensor([constEmbeddings.w2i[word] if word in constEmbeddings.w2i else 0 for word in words])
        embeddings = constEmbeddings.emb(idxs)
        return embeddings

    def mkEmbeddings(self, words, constEmbeddings, doDropout, tags=None, nes=None, headPositions=None):
        """Concatenate all configured per-token features (see class docstring)."""
        #
        # Learned word embeddings: initialized randomly, updated during backprop
        #
        ids = []
        for word in words:
            wordId = self.w2i.get(word, 0)  # 0 reserved for UNK in the vocab ("id" shadowed a builtin before)
            # sample uniformly with prob 0.5 from singletons; move all other singletons to UNK
            if doDropout and wordId > 0 and self.w2f[word] == 1 and random.random() < 0.5:
                wordId = 0
            ids.append(wordId)
        learnedWordEmbeddings = self.wordLookupParameters(torch.LongTensor(ids))

        #
        # biLSTM over character embeddings
        #
        charEmbedding = torch.stack([mkCharacterEmbedding(word, self.c2i, self.charLookupParameters, self.charRnnBuilder) for word in words])

        #
        # POS tag embedding
        #
        if tags and self.posTagLookupParameters:
            posTagEmbed = self.posTagLookupParameters(torch.LongTensor([self.tag2i.get(tag, 0) for tag in tags]))
        else:
            posTagEmbed = None

        #
        # NE tag embedding
        #
        if nes and self.neTagLookupParameters:
            neTagEmbed = self.neTagLookupParameters(torch.LongTensor([self.ne2i.get(ne, 0) for ne in nes]))
        else:
            neTagEmbed = None

        #
        # 1 if this word is the predicate
        #
        if headPositions and self.useIsPredicate:
            predEmbed = torch.FloatTensor([1 if i == predicatePosition else 0 for i, predicatePosition in enumerate(headPositions)]).unsqueeze(1)
        else:
            predEmbed = None

        #
        # Distance embedding, relative to the distance to the predicate.
        # Distances are clipped to the window [-distanceWindowSize, +distanceWindowSize].
        #
        if headPositions and self.distanceLookupParameters:
            dists = [max(i - predicatePosition + self.distanceWindowSize + 1, 0) if i - predicatePosition <= self.distanceWindowSize else 2 * self.distanceWindowSize + 2 for i, predicatePosition in enumerate(headPositions)]
            distanceEmbedding = self.distanceLookupParameters(torch.LongTensor(dists))
        else:
            distanceEmbedding = None

        #
        # Embedding that captures the absolute position of the token in the sentence
        # (capped at 100, matching the 101-row lookup table)
        #
        if self.positionLookupParameters:
            values = [i if i < 100 else 100 for i in range(len(words))]
            positionEmbedding = self.positionLookupParameters(torch.LongTensor(values))
        else:
            positionEmbedding = None

        # The final word embedding is a concatenation of all of these
        embedParts = [constEmbeddings, learnedWordEmbeddings, charEmbedding, posTagEmbed, neTagEmbed, distanceEmbedding, positionEmbedding, predEmbed]
        embedParts = [ep for ep in embedParts if ep is not None]
        embed = torch.cat(embedParts, dim=1)
        return embed

    def saveX2i(self):
        """Serialize the layer's vocabularies and hyper-parameters (not weights)."""
        x2i = dict()
        x2i['w2i'] = self.w2i
        x2i['w2f'] = self.w2f
        x2i['c2i'] = self.c2i
        if self.tag2i:
            x2i['hasTag2i'] = 1
            x2i['tag2i'] = self.tag2i
        else:
            x2i['hasTag2i'] = 0
        if self.ne2i:
            x2i['hasNe2i'] = 1
            x2i['ne2i'] = self.ne2i
        else:
            x2i['hasNe2i'] = 0
        x2i['learnedWordEmbeddingSize'] = self.learnedWordEmbeddingSize
        x2i['charEmbeddingSize'] = self.charEmbeddingSize
        x2i['charRnnStateSize'] = self.charRnnStateSize
        x2i['posTagEmbeddingSize'] = self.posTagEmbeddingSize
        x2i['neTagEmbeddingSize'] = self.neTagEmbeddingSize
        x2i['distanceEmbeddingSize'] = self.distanceEmbeddingSize
        x2i['distanceWindowSize'] = self.distanceWindowSize
        x2i['useIsPredicate'] = 1 if self.useIsPredicate else 0
        x2i['positionEmbeddingSize'] = self.positionEmbeddingSize
        x2i['dropoutProb'] = self.dropoutProb

        return x2i

    def __str__(self):
        return f"EmbeddingLayer({self.outDim})"

    @classmethod
    def load(cls, x2i):
        """Rebuild the layer from saveX2i() output; weights are restored separately via load_state_dict."""
        w2i = x2i['w2i']
        w2f = x2i['w2f']
        c2i = x2i['c2i']
        tag2i = x2i['tag2i'] if x2i['hasTag2i'] == 1 else None
        ne2i = x2i['ne2i'] if x2i['hasNe2i'] == 1 else None

        learnedWordEmbeddingSize = x2i.get('learnedWordEmbeddingSize', DEFAULT_LEARNED_WORD_EMBEDDING_SIZE)
        charEmbeddingSize = x2i.get('charEmbeddingSize', DEFAULT_CHAR_EMBEDDING_SIZE)
        charRnnStateSize = x2i.get('charRnnStateSize', DEFAULT_CHAR_RNN_STATE_SIZE)
        posTagEmbeddingSize = x2i.get('posTagEmbeddingSize', DEFAULT_POS_TAG_EMBEDDING_SIZE)
        neTagEmbeddingSize = x2i.get('neTagEmbeddingSize', DEFAULT_NE_TAG_EMBEDDING_SIZE)
        distanceEmbeddingSize = x2i.get('distanceEmbeddingSize', DEFAULT_DISTANCE_EMBEDDING_SIZE)
        distanceWindowSize = x2i.get('distanceWindowSize', DEFAULT_DISTANCE_WINDOW_SIZE)
        useIsPredicate = x2i.get('useIsPredicate', DEFAULT_USE_IS_PREDICATE) == 1
        positionEmbeddingSize = x2i.get('positionEmbeddingSize', DEFAULT_POSITION_EMBEDDING_SIZE)
        dropoutProb = x2i.get('dropoutProb', DEFAULT_DROPOUT_PROB)

        # make the loadable parameters (no explicit init: weights come from the checkpoint)
        wordLookupParameters = nn.Embedding(len(w2i), learnedWordEmbeddingSize)
        charLookupParameters = nn.Embedding(len(c2i), charEmbeddingSize)

        charRnnBuilder = (charEmbeddingSize, charRnnStateSize, 1, True, dropoutProb)

        posTagLookupParameters = nn.Embedding(len(tag2i), posTagEmbeddingSize) if x2i['hasTag2i'] == 1 else None
        neTagLookupParameters = nn.Embedding(len(ne2i), neTagEmbeddingSize) if x2i['hasNe2i'] == 1 else None
        # distance table: 2*window + 2 in-window buckets plus 1 overflow bucket
        distanceLookupParameters = nn.Embedding(distanceWindowSize * 2 + 3, distanceEmbeddingSize) if distanceEmbeddingSize > 0 else None
        positionLookupParameters = nn.Embedding(101, positionEmbeddingSize) if positionEmbeddingSize > 0 else None

        return cls(w2i, w2f, c2i, tag2i, ne2i,
                   learnedWordEmbeddingSize,
                   charEmbeddingSize,
                   charRnnStateSize,
                   posTagEmbeddingSize,
                   neTagEmbeddingSize,
                   distanceEmbeddingSize,
                   distanceWindowSize,
                   positionEmbeddingSize,
                   useIsPredicate,
                   wordLookupParameters,
                   charLookupParameters,
                   charRnnBuilder,
                   posTagLookupParameters,
                   neTagLookupParameters,
                   distanceLookupParameters,
                   positionLookupParameters,
                   dropoutProb)

    @classmethod
    def initialize(cls, config, paramPrefix, wordCounter):
        """Build a fresh layer from config, or None when the prefix is absent."""
        if not config.contains(paramPrefix):
            return None

        learnedWordEmbeddingSize = config.get_int(paramPrefix + ".learnedWordEmbeddingSize", DEFAULT_LEARNED_WORD_EMBEDDING_SIZE)
        charEmbeddingSize = config.get_int(paramPrefix + ".charEmbeddingSize", DEFAULT_CHAR_EMBEDDING_SIZE)
        charRnnStateSize = config.get_int(paramPrefix + ".charRnnStateSize", DEFAULT_CHAR_RNN_STATE_SIZE)
        posTagEmbeddingSize = config.get_int(paramPrefix + ".posTagEmbeddingSize", DEFAULT_POS_TAG_EMBEDDING_SIZE)
        neTagEmbeddingSize = config.get_int(paramPrefix + ".neTagEmbeddingSize", DEFAULT_NE_TAG_EMBEDDING_SIZE)
        distanceEmbeddingSize = config.get_int(paramPrefix + ".distanceEmbeddingSize", DEFAULT_DISTANCE_EMBEDDING_SIZE)
        distanceWindowSize = config.get_int(paramPrefix + ".distanceWindowSize", DEFAULT_DISTANCE_WINDOW_SIZE)
        useIsPredicate = config.get_bool(paramPrefix + ".useIsPredicate", DEFAULT_USE_IS_PREDICATE == 1)
        positionEmbeddingSize = config.get_int(paramPrefix + ".positionEmbeddingSize", DEFAULT_POSITION_EMBEDDING_SIZE)
        dropoutProb = config.get_float(paramPrefix + ".dropoutProb", DEFAULT_DROPOUT_PROB)

        wordList = [UNK_WORD] + sorted(wordCounter.keys())
        w2i = {w: i for i, w in enumerate(wordList)}

        wordLookupParameters = nn.Embedding(len(w2i), learnedWordEmbeddingSize)
        nn.init.xavier_uniform_(wordLookupParameters.weight)

        # NOTE(review): default path differs in style from tag2i/ne2i below
        # (no "../resources/" prefix) — confirm which is intended.
        c2iFilename = config.get_string(paramPrefix + ".c2i", "org/clulab/c2i-en.txt")
        c2i = readChar2Ids(c2iFilename)

        charLookupParameters = nn.Embedding(len(c2i), charEmbeddingSize)
        nn.init.xavier_uniform_(charLookupParameters.weight)
        charRnnBuilder = (charEmbeddingSize, charRnnStateSize, 1, True, dropoutProb)

        if posTagEmbeddingSize > 0:
            tag2i = readString2Ids(config.get_string(paramPrefix + ".tag2i", "../resources/org/clulab/tag2i-en.txt"))
            posTagLookupParameters = nn.Embedding(len(tag2i), posTagEmbeddingSize)
            nn.init.xavier_uniform_(posTagLookupParameters.weight)
        else:
            tag2i = None
            posTagLookupParameters = None

        if neTagEmbeddingSize > 0:
            ne2i = readString2Ids(config.get_string(paramPrefix + ".ne2i", "../resources/org/clulab/ne2i-en.txt"))
            neTagLookupParameters = nn.Embedding(len(ne2i), neTagEmbeddingSize)
            # FIX: xavier init was missing here, unlike every other lookup table
            nn.init.xavier_uniform_(neTagLookupParameters.weight)
        else:
            ne2i = None
            neTagLookupParameters = None

        if distanceEmbeddingSize > 0:
            distanceLookupParameters = nn.Embedding(distanceWindowSize * 2 + 3, distanceEmbeddingSize)
            nn.init.xavier_uniform_(distanceLookupParameters.weight)
        else:
            distanceLookupParameters = None

        if positionEmbeddingSize > 0:
            positionLookupParameters = nn.Embedding(101, positionEmbeddingSize)
            nn.init.xavier_uniform_(positionLookupParameters.weight)
        else:
            positionLookupParameters = None

        return cls(w2i, wordCounter, c2i, tag2i, ne2i,
                   learnedWordEmbeddingSize,
                   charEmbeddingSize,
                   charRnnStateSize,
                   posTagEmbeddingSize,
                   neTagEmbeddingSize,
                   distanceEmbeddingSize,
                   distanceWindowSize,
                   positionEmbeddingSize,
                   useIsPredicate,
                   wordLookupParameters,
                   charLookupParameters,
                   charRnnBuilder,
                   posTagLookupParameters,
                   neTagLookupParameters,
                   distanceLookupParameters,
                   positionLookupParameters,
                   dropoutProb)


def mkBuilder(inputSize, rnnStateSize, numLayers, bi, dropoutProb):
    """Build the char-level LSTM. NOTE: with numLayers == 1 PyTorch ignores
    (and warns about) a non-zero dropout — kept for signature compatibility."""
    return nn.LSTM(inputSize, rnnStateSize, numLayers, bidirectional=bi, dropout=dropoutProb)

# (start of finalLayer.py)
import torch
import torch.nn as nn


class FinalLayer(nn.Module):
    """Abstract last layer of a Layers pipeline: states -> emission scores."""

    def __init__(self):
        super().__init__()
        self.inDim = None   # set by concrete subclasses
        self.outDim = None  # set by concrete subclasses

    def forward(self, inputExpressions, headPositionsOpt, doDropout):
        raise NotImplementedError

    def loss(self, emissionScoresAsExpression, goldLabels):
        raise NotImplementedError

    def inference(self, emissionScores):
        raise NotImplementedError

    def inferenceWithScores(self, emissionScores):
        raise NotImplementedError

# (start of forwardLayer.py)
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

from pytorch.finalLayer import FinalLayer

from pytorch.utils import *


class ForwardLayer(FinalLayer):
    """Linear projection (optionally over arg+predicate pairs when isDual)."""

    def __init__(self, inputSize, isDual, t2i, i2t, actualInputSize, nonlinearity, dropoutProb, spans = None):
        super().__init__()
        self.inputSize = inputSize
        self.isDual = isDual
        self.t2i = t2i
        self.i2t = i2t
        self.spans = spans
        self.nonlinearity = nonlinearity

        self.pH = nn.Linear(actualInputSize, len(t2i))
        nn.init.xavier_uniform_(self.pH.weight)
        # Variable() is a no-op since torch 0.4; kept for fidelity.
        self.pRoot = Variable(torch.rand(inputSize))  # TODO: not sure about the shape here
the shape here + self.dropout = nn.Dropout(dropoutProb) + + self.inDim = spanLength(spans) if spans is not None else inputSize + self.outDim = len(t2i) + + # remove pick span part to simplify the ONNX converting + # def pickSpan(self, v, i): + # if self.spans is None: + # return v + # else: + # # Zheng: Will spans overlap? + # vs = list() + # for span in self.spans: + # e = torch.index_select(v, i, torch.tensor(range(span[0], span[1]))) + # vs.append(e) + # return torch.cat(vs, dim=i) + + def forward(self, inputExpressions, headPositionsOpt = None): + if not self.isDual: + # Zheng: Why the for loop here? Can we just use matrix manipulation? + argExp = self.dropout(inputExpressions) + emissionScores = self.dropout(self.pH(argExp)) + if self.nonlinearity == NONLIN_TANH: + emissionScores = F.tanh(emissionScores) + elif self.nonlinearity == NONLIN_RELU: + emissionScores = F.relu(emissionScores) + else: + emissionScores = list() + if headPositionsOpt is None: + raise RuntimeError("ERROR: dual task without information about head positions!") + for i, e in enumerate(inputExpressions): + headPosition = headPositionsOpt[i] + argExp = self.dropout(e) + if headPosition >= 0: + # there is an explicit head in the sentence + predExp = self.dropout(inputExpressions[headPosition]) + else: + # the head is root. 
we used a dedicated Parameter for root + predExp = self.dropout(self.pRoot) + ss = torch.cat([argExp, predExp]) + l1 = self.dropout(self.pH(ss)) + if self.nonlinearity == NONLIN_TANH: + l1 = F.tanh(l1) + elif self.nonlinearity == NONLIN_RELU: + l1 = F.relu(l1) + emissionScores.append(l1) + emissionScores = torch.stack(emissionScores) + return emissionScores + + @staticmethod + def load(x2i): + from pytorch.greedyForwardLayer import GreedyForwardLayer + from pytorch.viterbiForwardLayer import ViterbiForwardLayer + inferenceType = x2i["inferenceType"] + if inferenceType == TYPE_VITERBI or inferenceType == TYPE_VITERBI_STRING:#this is a temporary solution to handle a typo in viterbi forward layer... + return ViterbiForwardLayer.load(x2i) + elif inferenceType == TYPE_GREEDY or inferenceType == TYPE_GREEDY_STRING: + return GreedyForwardLayer.load(x2i) + else: + raise RuntimeError(f"ERROR: unknown forward layer type {inferenceType}!") + + @staticmethod + def initialize(config, paramPrefix, labelCounter, isDual, inputSize): + from pytorch.greedyForwardLayer import GreedyForwardLayer + from pytorch.viterbiForwardLayer import ViterbiForwardLayer + if(not config.contains(paramPrefix)): + return None + + inferenceType = config.get_string(paramPrefix + ".inference", "greedy") + dropoutProb = config.get_float(paramPrefix + ".dropoutProb", DEFAULT_DROPOUT_PROBABILITY) + + nonlinAsString = config.get_string(paramPrefix + ".nonlinearity", "") + if nonlinAsString in nonlin_map: + nonlin = nonlin_map[nonlinAsString] + else: + raise RuntimeError(f"ERROR: unknown non-linearity {nonlinAsString}!") + + t2i = {t:i for i, t in enumerate(labelCounter.keys())} + i2t = {i:t for t, i in t2i.items()} + + spanConfig = config.get_string(paramPrefix + ".span", "") + if spanConfig is "": + span = None + else: + span = parseSpan(spanConfig) + + if span: + l = spanLength(span) + actualInputSize = 2*l if isDual else l + else: + actualInputSize = 2*inputSize if isDual else inputSize + + if 
inferenceType == TYPE_GREEDY_STRING: + return GreedyForwardLayer(inputSize, isDual, t2i, i2t, actualInputSize, nonlin, dropoutProb, span) + elif inferenceType == TYPE_VITERBI_STRING: + layer = ViterbiForwardLayer(inputSize, isDual, t2i, i2t, actualInputSize, nonlin, dropoutProb, span) + return layer + else: + raise RuntimeError(f"ERROR: unknown inference type {inferenceType}!") + +def spanLength(spans): + return sum(end - start for start, end in spans) + +def parseSpan(spanParam, inputSize=None): + # Zheng: Why do we need inputSize here? + spans = list() + spanParamTokens = spanParam.split(",") + for spanParamToken in spanParamTokens: + # spanTokens = spanParamToken.split('-') + # assert(len(spanTokens) == 2) + # spans.append((int(spanTokens[0]), int(spanTokens[1]))) + token1, token2 = map(int, spanParamToken.split('-')) + spans.append((token1, token2)) + return spans + +def spanToString(spans): + return ','.join(f'{start}-{end}' for start, end in spans) + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/main/src/main/python/pytorch/greedyForwardLayer.py b/main/src/main/python/pytorch/greedyForwardLayer.py new file mode 100644 index 000000000..145878c91 --- /dev/null +++ b/main/src/main/python/pytorch/greedyForwardLayer.py @@ -0,0 +1,56 @@ +from pytorch.forwardLayer import * +from pytorch.utils import * +import numpy as np + +class GreedyForwardLayer(ForwardLayer): + def __init__(self, inputSize, isDual, t2i, i2t, actualInputSize, nonlinearity, dropoutProb, spans = None): + super().__init__(inputSize, isDual, t2i, i2t, actualInputSize, nonlinearity, dropoutProb, spans) + + def loss(self, finalStates, goldLabelStrings): + goldLabels = [self.t2i[gs] for gs in goldLabelStrings] + return sentenceLossGreedy(finalStates, goldLabels) + + def saveX2i(self): + x2i = dict() + x2i["inferenceType"] = TYPE_GREEDY + x2i["inputSize"] = self.inputSize + x2i["isDual"] = 1 if self.isDual else 0 + x2i["span"] = spanToString(self.spans) if self.spans else "" + 
x2i["nonlinearity"] = self.nonlinearity + x2i["t2i"] = self.t2i + + return x2i + + def __str__(self): + return f"GreedyForwardLayer({self.inDim}, {self.outDim})" + + def inference(self, emissionScores): + emissionScores = emissionScoresToArrays(emissionScores) + return [self.i2t[np.argmax(es)] for es in emissionScores] + + def inference2(self, emissionScores): + return torch.argmax(emissionScores, dim=1) + + def inferenceWithScores(self, emissionScores): + emissionScores = emissionScoresToArrays(emissionScores) + return [sorted([(i, s) for i, s in enumerate(scoresForPosition)], key=lambda x: x[1]) for scoresForPosition in emissionScores] + + @classmethod + def load(cls, x2i): + inputSize = x2i["inputSize"] + isDual = x2i.get("isDual", DEFAULT_IS_DUAL) == 1 + sapnValue = x2i.get("span", "") + spans = None if sapnValue == "" else parseSpan(sapnValue, inputSize) + nonlinearity = x2i.get("nonlinearity", NONLIN_NONE) + t2i = x2i["t2i"] + i2t = {i:t for t, i in t2i.items()} + dropoutProb = x2i.get("dropoutProb", DEFAULT_DROPOUT_PROBABILITY) + + if spans: + l = spanLength(spans) + actualInputSize = 2*l if isDual else l + else: + actualInputSize = 2*inputSize if isDual else inputSize + + return cls(inputSize, isDual, t2i, i2t, actualInputSize, nonlinearity, dropoutProb, spans) + \ No newline at end of file diff --git a/main/src/main/python/pytorch/initialLayer.py b/main/src/main/python/pytorch/initialLayer.py new file mode 100644 index 000000000..39db90d28 --- /dev/null +++ b/main/src/main/python/pytorch/initialLayer.py @@ -0,0 +1,11 @@ +import torch +import torch.nn as nn + +class InitialLayer(nn.Module): + + def __init__(self): + super().__init__() + self.outDim = None + + def forward(self, sentence, constEmbeddings, doDropout): + raise NotImplementedError \ No newline at end of file diff --git a/main/src/main/python/pytorch/intermediateLayer.py b/main/src/main/python/pytorch/intermediateLayer.py new file mode 100644 index 000000000..48ea53377 --- /dev/null +++ 
b/main/src/main/python/pytorch/intermediateLayer.py @@ -0,0 +1,12 @@ +import torch +import torch.nn as nn + +class IntermediateLayer(nn.Module): + + def __init__(self): + super().__init__() + self.inDim = None + self.outDim = None + + def forward(self, inputExpressions, doDropout): + raise NotImplementedError \ No newline at end of file diff --git a/main/src/main/python/pytorch/layers.py b/main/src/main/python/pytorch/layers.py new file mode 100644 index 000000000..926bc6606 --- /dev/null +++ b/main/src/main/python/pytorch/layers.py @@ -0,0 +1,317 @@ +import torch.nn as nn +from pytorch.utils import * +from pytorch.embeddingLayer import EmbeddingLayer +from pytorch.rnnLayer import RnnLayer +from pytorch.forwardLayer import ForwardLayer +from pytorch.constEmbeddingsGlove import ConstEmbeddingsGlove + +class Layers(object): + def __init__(self, initialLayer, intermediateLayers, finalLayer): + if finalLayer: + self.outDim = finalLayer.outDim + elif intermediateLayers: + self.outDim = intermediateLayers[-1].outDim + elif initialLayer: + self.outDim = initialLayer.outDim + else: + self.outDim = None + + self.nonEmpty = initialLayer is not None and intermediateLayers is not None and finalLayer is not None + self.isEmpty = not self.nonEmpty + + self.initialLayer = initialLayer + self.intermediateLayers = intermediateLayers + self.finalLayer = finalLayer + + def __str__(self): + s = "" + started = False + if(self.initialLayer is not None): + s += "initial = " + str(self.initialLayer) + started = True + for i in range(len(self.intermediateLayers)): + if(started): s += " " + s += f"intermediate ({i+1}) = " + str(self.intermediateLayers[i]) + started = True + if(self.finalLayer is not None): + if(started): s += " " + s += "final = " + str(self.finalLayer) + return s + + def get_parameters(self): + parameters = list() + if self.initialLayer is not None: + parameters += [p for p in self.initialLayer.named_parameters()] + for il in self.intermediateLayers: + parameters += [p for 
p in il.named_parameters()] + if self.finalLayer is not None: + parameters += [p for p in self.finalLayer.named_parameters()] + + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in parameters + if not any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': WEIGHT_DECAY}, + {'params': [p for n, p in parameters + if any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': 0.0} + ] + return optimizer_grouped_parameters + + def start_train(self): + if self.initialLayer is not None: + self.initialLayer.train() + for il in self.intermediateLayers: + il.train() + if self.finalLayer is not None: + self.finalLayer.train() + + def start_eval(self): + if self.initialLayer is not None: + self.initialLayer.eval() + for il in self.intermediateLayers: + il.eval() + if self.finalLayer is not None: + self.finalLayer.eval() + + def get_state_dict(self): + params = dict() + if self.initialLayer is not None: + params['initialLayer'] = self.initialLayer.state_dict() + if self.intermediateLayers: + params['intermediateLayers'] = list() + for il in self.intermediateLayers: + params['intermediateLayers'].append(il.state_dict()) + if self.finalLayer is not None: + params['finalLayer'] = self.finalLayer.state_dict() + return params + + def load_state_dict(self, params): + if self.initialLayer is not None: + self.initialLayer.load_state_dict(params['initialLayer']) + for i, il in enumerate(self.intermediateLayers): + il.load_state_dict(params['intermediateLayers'][i]) + if self.finalLayer is not None: + self.finalLayer.load_state_dict(params['finalLayer']) + + def add_state_dict(self, layers): + if self.initialLayer is not None: + for key in self.initialLayer.state_dict(): + if self.initialLayer.state_dict()[key].data.dtype == torch.float32: + self.initialLayer.state_dict()[key].data += layers.initialLayer.state_dict()[key].data.clone() + for i, il in enumerate(self.intermediateLayers): + for key 
def avg_state_dict(self, num_models):
    """Average parameters previously accumulated with add_state_dict():
    divide every float32 tensor in this Layers object by num_models, in place."""
    def _avg(module):
        # state_dict() returns references to the live tensors, so the
        # in-place division mutates the module's actual parameters
        if module is None:
            return
        for tensor in module.state_dict().values():
            if tensor.data.dtype == torch.float32:
                tensor.data /= num_models
    _avg(self.initialLayer)
    for intermediate in self.intermediateLayers:
        _avg(intermediate)
    _avg(self.finalLayer)

def forward(self, sentence, constEmbeddings, doDropout):
    """Run the full pipeline on one sentence: initial layer -> intermediate
    layers -> final layer (if present). Requires an initial layer."""
    if self.initialLayer is None:
        raise RuntimeError(f"ERROR: you can't call forward() on a Layers object that does not have an initial layer: {self}!")
    states = self.initialLayer(sentence, constEmbeddings, doDropout)
    for intermediateLayer in self.intermediateLayers:
        states = intermediateLayer(states, doDropout)
    if self.finalLayer is not None:
        states = self.finalLayer(states, sentence.headPositions)
    return states

def forwardFrom(self, inStates, headPositions, doDropout):
    """Continue the pipeline from precomputed states. Used by task-specific
    Layers objects stacked on top of a shared Layers object, which therefore
    must NOT have their own initial layer."""
    if self.initialLayer is not None:
        raise RuntimeError(f"ERROR: you can't call forwardFrom() on a Layers object that has an initial layer: {self}")
    states = inStates
    for intermediateLayer in self.intermediateLayers:
        states = intermediateLayer(states, doDropout)
    if self.finalLayer is not None:
        states = self.finalLayer(states, headPositions)
    return states

def saveX2i(self):
    """Serialize the metadata (x2i maps) of all sub-layers into one dict,
    mirrored by loadX2i()."""
    x2i = dict()
    if self.initialLayer is not None:
        x2i['hasInitial'] = 1
        x2i['initialLayer'] = self.initialLayer.saveX2i()
    else:
        x2i['hasInitial'] = 0
    x2i['intermediateCount'] = len(self.intermediateLayers)
    x2i['intermediateLayers'] = [il.saveX2i() for il in self.intermediateLayers]
    if self.finalLayer is not None:
        x2i['hasFinal'] = 1
        x2i['finalLayer'] = self.finalLayer.saveX2i()
    else:
        x2i['hasFinal'] = 0
    return x2i

@classmethod
def apply(cls, config, paramPrefix, wordCounter, labelCounter, isDual, providedInputSize):
    """Construct a Layers object from configuration.

    config: config wrapper exposing get_* accessors (a TaskManager in practice)
    paramPrefix: config key prefix, e.g. "mtl.task1.layers"
    wordCounter/labelCounter: training vocabulary/label counts
    isDual: whether the final layer operates on (modifier, head) pairs
    providedInputSize: input width when there is no initial layer
    """
    initialLayer = EmbeddingLayer.initialize(config, paramPrefix + ".initial", wordCounter)

    if initialLayer:
        inputSize = initialLayer.outDim
    elif providedInputSize:
        inputSize = providedInputSize
    else:
        inputSize = None

    intermediateLayers = list()
    MAX_INTERMEDIATE_LAYERS = 10

    # intermediate layers are numbered from 1; stop at the first gap
    for i in range(1, MAX_INTERMEDIATE_LAYERS):
        if inputSize is None:
            raise RuntimeError("ERROR: trying to construct an intermediate layer without a known input size!")
        intermediateLayer = RnnLayer.initialize(config, paramPrefix + f".intermediate{i}", inputSize)
        if not intermediateLayer:
            break
        intermediateLayers.append(intermediateLayer)
        inputSize = intermediateLayer.outDim

    if labelCounter:
        if inputSize is None:
            raise RuntimeError("ERROR: trying to construct a final layer without a known input size!")
        finalLayer = ForwardLayer.initialize(config, paramPrefix + ".final", labelCounter, isDual, inputSize)
    else:
        finalLayer = None

    return cls(initialLayer, intermediateLayers, finalLayer)

@classmethod
def loadX2i(cls, x2i):
    """Rebuild a Layers object from the dict produced by saveX2i()."""
    hasInitial = x2i['hasInitial']
    initialLayer = EmbeddingLayer.load(x2i['initialLayer']) if hasInitial == 1 else None

    intermediateLayers = list()
    intermediateCount = x2i['intermediateCount']
    for i in range(intermediateCount):
        intermediateLayers.append(RnnLayer.load(x2i['intermediateLayers'][i]))

    hasFinal = x2i['hasFinal']
    finalLayer = ForwardLayer.load(x2i['finalLayer']) if hasFinal == 1 else None

    return cls(initialLayer, intermediateLayers, finalLayer)

@staticmethod
def predictJointly(layers, sentence, constEmbeddings):
    """Run inference for ALL tasks on one sentence.
    layers[0] holds the shared layers (may be empty); layers[i] (i >= 1) the
    task-specific ones. Returns one label sequence per task."""
    labelsPerTask = list()
    if layers[0]:
        sharedStates = layers[0].forward(sentence, constEmbeddings, doDropout=False)
        for i in range(1, len(layers)):
            states = layers[i].forwardFrom(sharedStates, sentence.headPositions, doDropout=False)
            labelsPerTask.append(layers[i].finalLayer.inference(states))
    else:
        # no shared layer
        for i in range(1, len(layers)):
            # BUG FIX: the original passed sentence.headPositions where
            # forward() expects constEmbeddings
            states = layers[i].forward(sentence, constEmbeddings, doDropout=False)
            labelsPerTask.append(layers[i].finalLayer.inference(states))
    return labelsPerTask

@staticmethod
def forwardForTask(layers, taskId, sentence, constEmbeddings, doDropout):
    """Forward pass for a single task, going through the shared layers first
    when they exist."""
    if layers[0]:
        sharedStates = layers[0].forward(sentence, constEmbeddings, doDropout)
        states = layers[taskId + 1].forwardFrom(sharedStates, sentence.headPositions, doDropout)
    else:
        states = layers[taskId + 1].forward(sentence, constEmbeddings, doDropout)
    return states

@staticmethod
def predict(layers, taskId, sentence, constEmbeddings):
    """Greedy/Viterbi labels for one task on one sentence (no dropout)."""
    states = Layers.forwardForTask(layers, taskId, sentence, constEmbeddings, doDropout=False)
    return layers[taskId + 1].finalLayer.inference(states)

@staticmethod
def predictWithScores(layers, taskId, sentence, constEmbeddings):
    """Like predict(), but returns per-label scores as well."""
    states = Layers.forwardForTask(layers, taskId, sentence, constEmbeddings, doDropout=False)
    return layers[taskId + 1].finalLayer.inferenceWithScores(states)

@staticmethod
def parse(layers, sentence, constEmbeddings):
    """Two-pass dependency parsing: task 1 predicts relative head positions,
    task 2 predicts labels conditioned on the predicted heads.
    Returns an iterator of (head, label) pairs, one per word."""
    # first get the output of the layers that are shared between the two tasks
    assert layers[0].nonEmpty
    sharedStates = layers[0].forward(sentence, constEmbeddings, doDropout=False)

    # now predict the heads (first task)
    headStates = layers[1].forwardFrom(sharedStates, None, doDropout=False)
    headScores = layers[1].finalLayer.inference(headStates)

    # pick, for each word, the highest-scoring head prediction that falls
    # within the boundaries of the current sentence
    heads = list()
    for wi, predictionsForThisWord in enumerate(headScores):
        headPosition = None
        for relative in predictionsForThisWord:
            try:
                relativeHead = int(relative[0])
            except ValueError:
                # BUG FIX: the original raised RuntimeError here; some
                # predictions are legitimately non-numeric (e.g., start/stop
                # tags) and should simply be skipped in favor of the next-best
                continue
            if relativeHead == 0:
                # this word is the root
                # NOTE(review): 1 is kept from the original as the root marker;
                # other implementations of this algorithm use -1 — confirm
                headPosition = 1
                break
            candidate = wi + relativeHead
            # BUG FIX: enforce the sentence-boundary check the original
            # comment promised but never performed
            if 0 <= candidate < len(headScores):
                headPosition = candidate
                break
        if headPosition is None:
            # nothing valid was found; be safe and assume root
            headPosition = 1
        heads.append(headPosition)

    # next, predict the labels using the predicted heads
    labelStates = layers[2].forwardFrom(sharedStates, heads, doDropout=False)
    labels = layers[2].finalLayer.inference(labelStates)
    assert len(labels) == len(heads)

    return zip(heads, labels)

@staticmethod
def loss(layers, taskId, sentence, goldLabels):
    """Training loss for one sentence of one task (dropout enabled).
    NOTE(review, kept from original): fetching const embeddings per call may
    be suboptimal — revisit."""
    constEmbeddings = ConstEmbeddingsGlove.get_ConstLookupParams()
    states = Layers.forwardForTask(layers, taskId, sentence, constEmbeddings, doDropout=True)
    return layers[taskId + 1].finalLayer.loss(states, goldLabels)
class Metal(object):
    """Multi-task learning (MTL) trainer/evaluator.

    self.model is a list of Layers objects: model[0] holds the layers shared
    between all tasks (possibly empty); model[tid + 1] holds the layers
    specific to task tid.
    """

    def __init__(self, taskManager, modelOpt):
        # taskManager: TaskManager holding config + datasets for all tasks
        # modelOpt: a preloaded list of Layers, or None to build a fresh model
        self.taskManager = taskManager
        if modelOpt:
            self.model = modelOpt
        else:
            self.model = self.initialize()

    def initialize(self):
        """Build one Layers object per task plus the shared one at index 0."""
        taskWords, taskLabels = self.mkVocabularies()

        layersPerTask = [None for _ in range(self.taskManager.taskCount + 1)]
        layersPerTask[0] = Layers.apply(self.taskManager, "mtl.layers", taskWords[0], None, False, None)
        inputSize = layersPerTask[0].outDim

        for i in self.taskManager.indices:
            layersPerTask[i + 1] = Layers.apply(
                self.taskManager, f"mtl.task{i+1}.layers",
                taskWords[i + 1], taskLabels[i + 1],
                self.taskManager.tasks[i].isDual, inputSize)

        for i in range(len(layersPerTask)):
            print(f"Summary of layersPerTask({i}):")
            print(layersPerTask[i])

        return layersPerTask

    def mkVocabularies(self):
        """Count words and labels per task.
        Index 0 is reserved for the shared Layers (aggregates all tasks);
        tid + 1 corresponds to each task. labels[0] is unused, since only
        task-specific layers have a final layer."""
        labels = [Counter() for _ in range(self.taskManager.taskCount + 1)]
        for i in range(1, len(labels)):
            labels[i][START_TAG] += 1
            labels[i][STOP_TAG] += 1

        words = [Counter() for _ in range(self.taskManager.taskCount + 1)]
        reader = MetalRowReader()

        for tid in self.taskManager.indices:
            for sentence in self.taskManager.tasks[tid].trainSentences:
                for annotatedSentence, sentenceLabels in reader.toAnnotatedSentences(sentence):
                    for i, word in enumerate(annotatedSentence.words):
                        words[tid + 1][word] += 1
                        words[0][word] += 1
                        labels[tid + 1][sentenceLabels[i]] += 1

        return words, labels

    def train(self, modelNamePrefix):
        """Train all tasks jointly; saves a checkpoint after every epoch and
        applies early stopping based on average dev F1."""
        learningRate = self.taskManager.get_float("mtl.learningRate", 1e-5)
        trainerType = self.taskManager.get_string("mtl.trainer", "adam")
        batchSize = self.taskManager.get_int("mtl.batchSize", 1)

        torch.manual_seed(self.taskManager.random)
        random.seed(self.taskManager.random)

        assert batchSize > 0

        parameters = list()
        for layers in self.model:
            parameters += layers.get_parameters()

        if trainerType == "adam":
            trainer = Adam(parameters, lr=learningRate)
        elif trainerType == "rmsprop":
            trainer = RMSprop(parameters, lr=learningRate)
        elif trainerType == "sgd":
            # BUG FIX: the original referenced the undefined name "SDG"
            trainer = SGD(parameters, lr=learningRate)
        else:
            raise RuntimeError(f"ERROR: unknown trainer {trainerType}!")

        scheduler = ExponentialLR(trainer, gamma=0.9)
        reader = MetalRowReader()

        cummulativeLoss = 0.0
        numTagged = 0
        maxAvgAcc = 0.0
        maxAvgF1 = 0.0
        bestEpoch = 0

        allEpochScores = list()
        epochPatience = self.taskManager.epochPatience

        for epoch in range(0, self.taskManager.maxEpochs):
            if epochPatience <= 0:
                break

            # fetches randomized training sentences from all tasks
            sentenceIterator = self.taskManager.getSentences()
            sentCount = 0

            for layers in self.model:
                layers.start_train()
            trainer.zero_grad()

            batchLoss = 0
            i = 0

            # traverse all training sentences
            for taskId, sentence in sentenceIterator:
                sentCount += 1

                annotatedSentences = reader.toAnnotatedSentences(sentence)
                assert annotatedSentences is not None

                unweightedLoss = 0
                for a_sent in annotatedSentences:
                    unweightedLoss += Layers.loss(self.model, taskId, a_sent[0], a_sent[1])
                loss = unweightedLoss * self.taskManager.tasks[taskId].taskWeight

                batchLoss += loss
                i += 1

                if i >= batchSize:
                    cummulativeLoss += batchLoss.item()
                    batchLoss.backward()
                    trainer.step()
                    # BUG FIX: gradients must be cleared after each step,
                    # otherwise they accumulate across batches
                    trainer.zero_grad()
                    batchLoss = 0
                    i = 0

                numTagged += len(sentence)

                if sentCount % 1000 == 0:
                    print(f"Cumulative loss: {cummulativeLoss/numTagged} ({sentCount} sentences)")
                    cummulativeLoss = 0.0
                    numTagged = 0

            # we may have an incomplete batch here
            if batchLoss:
                # BUG FIX: was "=", dropping the loss already accumulated
                cummulativeLoss += batchLoss.item()
                batchLoss.backward()
                trainer.step()
                trainer.zero_grad()
                batchLoss = 0
                i = 0
            scheduler.step()

            # check dev performance in this epoch, for all tasks
            totalAcc = 0.0
            totalPrec = 0.0
            totalRec = 0.0
            totalF1 = 0.0
            for taskId in range(0, self.taskManager.taskCount):
                taskName = self.taskManager.tasks[taskId].taskName
                devSentences = self.taskManager.tasks[taskId].devSentences
                if devSentences:
                    acc, prec, rec, f1 = self.evaluate(taskId, taskName, devSentences, "development", epoch)
                    totalAcc += acc
                    totalPrec += prec
                    totalRec += rec
                    totalF1 += f1

            avgAcc = totalAcc / self.taskManager.taskCount
            avgPrec = totalPrec / self.taskManager.taskCount
            avgRec = totalRec / self.taskManager.taskCount
            avgF1 = totalF1 / self.taskManager.taskCount

            print(f"Average accuracy across {self.taskManager.taskCount} tasks in epoch {epoch}: {avgAcc}")
            # BUG FIX: "$epoch" was a leftover Scala string interpolation
            print(f"Average P/R/F1 across {self.taskManager.taskCount} tasks in epoch {epoch}: {avgPrec} / {avgRec} / {avgF1}")

            allEpochScores.append((epoch, avgF1))

            if avgF1 > maxAvgF1:
                maxAvgF1 = avgF1
                maxAvgAcc = avgAcc
                bestEpoch = epoch
                epochPatience = self.taskManager.epochPatience
            else:
                epochPatience -= 1

            self.save(f"{modelNamePrefix}-epoch{epoch}")

        # BUG FIX: the message promises descending order; sort accordingly
        allEpochScores.sort(key=lambda x: x[1], reverse=True)
        print("Epochs in descending order of scores:")
        for t in allEpochScores:
            print(f"Epoch #{t[0]}: {t[1]}")

    def evaluate(self, taskId, taskName, sentences, name, epoch=-1):
        """Score one task on one dataset; writes CoNLL output to a file.
        Returns (accuracy, precision, recall, f1)."""
        scoreCountsByLabel = ScoreCountsByLabel()
        taskNumber = taskId + 1
        sentCount = 0

        print(f"Started evaluation on the {name} dataset for task {taskNumber} ({taskName})...")

        if epoch >= 0:
            outputFileName = f"task{taskNumber}.dev.output.{epoch}"
        else:
            outputFileName = f"task{taskNumber}.test.output"

        reader = MetalRowReader()

        # use a context manager so the file is closed even on errors
        with open(outputFileName, "w") as pw:
            for sent in sentences:
                sentCount += 1
                annotatedSentences = reader.toAnnotatedSentences(sent)

                # NOTE(review): only the first annotated sentence is scored,
                # as in the original ([:1]) — confirm this is intended
                for asent in annotatedSentences[:1]:
                    sentence = asent[0]
                    goldLabels = asent[1]

                    constEmbeddings = ConstEmbeddingsGlove.get_ConstLookupParams()
                    preds = self.predict(taskId, sentence, constEmbeddings)

                    sc = SeqScorer.f1(goldLabels, preds)
                    scoreCountsByLabel.incAll(sc)

                    printCoNLLOutput(pw, sentence.words, goldLabels, preds)

        print(f"Accuracy on {len(sentences)} {name} sentences for task {taskNumber} ({taskName}): {scoreCountsByLabel.accuracy()}")
        print(f"Precision on {len(sentences)} {name} sentences for task {taskNumber} ({taskName}): {scoreCountsByLabel.precision()}")
        print(f"Recall on {len(sentences)} {name} sentences for task {taskNumber} ({taskName}): {scoreCountsByLabel.recall()}")
        print(f"Micro F1 on {len(sentences)} {name} sentences for task {taskNumber} ({taskName}): {scoreCountsByLabel.f1()}")
        for label in scoreCountsByLabel.labels():
            print(f"\tP/R/F1 for label {label} ({scoreCountsByLabel.map[label].gold}): {scoreCountsByLabel.precision(label)} / {scoreCountsByLabel.recall(label)} / {scoreCountsByLabel.f1(label)}")

        return (scoreCountsByLabel.accuracy(), scoreCountsByLabel.precision(), scoreCountsByLabel.recall(), scoreCountsByLabel.f1())

    def predictJointly(self, sentence, constEmbeddings):
        """Inference for all tasks on one sentence."""
        return Layers.predictJointly(self.model, sentence, constEmbeddings)

    def predict(self, taskId, sentence, constEmbeddings):
        """Inference for one task on one sentence."""
        return Layers.predict(self.model, taskId, sentence, constEmbeddings)

    def predictWithScores(self, taskId, sentence, constEmbeddings):
        """Inference for one task, with per-label scores."""
        return Layers.predictWithScores(self.model, taskId, sentence, constEmbeddings)

    def parse(self, sentence, constEmbeddings):
        """Custom method for the parsing algorithm.
        @param sentence Input sentence
        @param constEmbeddings Constant embeddings for this sentence
        @return Tuple of (head, label) for each word in the sentence"""
        # BUG FIX: the original dropped the result on the floor
        return Layers.parse(self.model, sentence, constEmbeddings)

    def test(self):
        """Evaluate all tasks on their test partitions."""
        for layers in self.model:
            layers.start_eval()
        for taskId in range(0, self.taskManager.taskCount):
            taskName = self.taskManager.tasks[taskId].taskName
            testSentences = self.taskManager.tasks[taskId].testSentences
            if testSentences:
                self.evaluate(taskId, taskName, testSentences, "testing")

    def save(self, baseFilename):
        """Save all parameters to <base>.torch; on the first epoch, also dump
        the x2i metadata to a companion .json file (written once per run)."""
        isFirstEpoch = "-epoch0" in baseFilename
        params = list()
        j_params = list()
        for layers in self.model:
            params.append(layers.get_state_dict())
            if isFirstEpoch:
                j_params.append({"x2i": layers.saveX2i()})

        # torch pickle save; best-effort by design — keep training alive
        try:
            torch.save(params, baseFilename + ".torch")
            print("model saved to {}".format(baseFilename + ".torch"))
        except BaseException:
            print("[Warning: Saving failed... continuing anyway.]")

        # we also save the metadata as a text json file
        if isFirstEpoch:
            with open(baseFilename.replace("-epoch0", "") + ".json", "w") as f:
                f.write(json.dumps(j_params))

    @classmethod
    def load(cls, modelFilenamePrefix):
        """Load one saved model: <prefix>.torch (weights) + <prefix>.json (x2i)."""
        print(f"Loading MTL model from {modelFilenamePrefix}...")
        layersSeq = list()
        checkpoint = torch.load(modelFilenamePrefix + ".torch")
        with open(modelFilenamePrefix + ".json") as f:
            # BUG FIX: was "josn.load" (NameError at runtime)
            x2i = json.load(f)
        for i, param in enumerate(checkpoint):
            layers = Layers.loadX2i(x2i[i])
            layers.load_state_dict(param)
            layersSeq.append(layers)

        print(f"Loading MTL model from {modelFilenamePrefix} complete.")
        return layersSeq

    @classmethod
    def load_multi(cls, models):
        """Load several saved models and average their parameters (ensemble)."""
        print(f"Loading MTL models from {models}...")
        layersSeq = list()
        for model in models:
            checkpoint = torch.load(model + ".torch")
            with open(model + ".json") as f:
                # BUG FIX: was "josn.load" (NameError at runtime)
                x2i = json.load(f)
            for i, param in enumerate(checkpoint):
                layers = Layers.loadX2i(x2i[i])
                layers.load_state_dict(param)
                # NOTE(review): SOURCE is truncated/corrupted at this point;
                # the merge/average logic below is reconstructed from the
                # add_state_dict/avg_state_dict API — verify against upstream.
                if len(layersSeq) <= i:
                    layersSeq.append(layers)
                else:
                    layersSeq[i].add_state_dict(layers)
        for layers in layersSeq:
            layers.avg_state_dict(len(models))

        print(f"Loading MTL models from {models} complete.")
        return layersSeq
# Task types
TYPE_BASIC = 0
TYPE_DUAL = 1

class TaskManager():
    """Reads all tasks from a pyhocon config and interleaves their training
    shards so every epoch mixes sentences from all tasks."""

    def __init__(self, config, seed):
        # config: pyhocon Config; seed: RNG seed used for shard shuffling
        self.config = config
        self.random = seed

        # How many shards to have per epoch
        self.shardsPerEpoch = config.get_int("mtl.shardsPerEpoch", 10)

        # Total number of epochs
        # BUG FIX: the original annotated these attributes with ":Int".
        # "Int" is undefined in Python, and annotations on attribute targets
        # ARE evaluated at runtime, so this raised a NameError.
        self.maxEpochs = config.get_int("mtl.maxEpochs", 100)

        # Training patience in number of epochs
        self.epochPatience = config.get_int("mtl.epochPatience", 5)

        # Array of all tasks to be managed
        self.tasks = self.readTasks()
        self.taskCount = len(self.tasks)
        self.indices = range(self.taskCount)

        # Training shards from all tasks
        self.shards = self.mkShards()

    def contains(self, paramPrefix):
        """True if the config defines this key."""
        return paramPrefix in self.config

    # Thin typed accessors over the underlying config.
    # BUG FIX: the parameter was misspelled "defualt"; keyword callers would
    # have had to reproduce the typo.
    def get_int(self, x, default=None):
        return self.config.get_int(x, default)

    def get_string(self, x, default=None):
        return self.config.get_string(x, default)

    def get_float(self, x, default=None):
        return self.config.get_float(x, default)

    def get_bool(self, x, default=None):
        return self.config.get_bool(x, default)

    def get_list(self, x, default=None):
        return self.config.get_list(x, default)

    def get_config(self, x, default=None):
        return self.config.get_config(x, default)

    def mkShards(self):
        """Construct training shards by interleaving shards from all tasks."""
        shardsByTasks = list()

        # construct the shards for each task
        for i in self.indices:
            shardsByTasks.append(self.tasks[i].mkShards())
            assert len(shardsByTasks[i]) == self.shardsPerEpoch

        # now interleave the tasks: shard 0 of every task, then shard 1, ...
        interleavedShards = list()
        for i in range(self.shardsPerEpoch):
            for j in self.indices:
                interleavedShards.append(shardsByTasks[j][i])

        return interleavedShards

    def getSentences(self):
        """Iterator over (taskId, sentence) pairs from all interleaved shards,
        shuffled at both the shard and the sentence level."""
        # NOTE(review): re-seeding on every call makes each epoch's shuffle
        # identical — confirm this determinism is intended
        random.seed(self.random)
        randomizedShards = random.sample(self.shards, len(self.shards))
        for shard in randomizedShards:
            positions = random.sample(
                range(shard.startPosition, shard.endPosition),
                shard.endPosition - shard.startPosition)
            for sent in positions:
                yield (shard.taskId, self.tasks[shard.taskId].trainSentences[sent])

    def readTasks(self):
        """Reads all tasks from disk into memory."""
        numberOfTasks = self.config.get_int("mtl.numberOfTasks", None)
        tasks = list()
        for i in range(numberOfTasks):
            tasks.append(self.readTask(i + 1))

        print(f"Read {numberOfTasks} tasks from config file.")
        return tasks

    def readTask(self, taskNumber):
        """Build one Task from its "mtl.task<N>.*" config section."""
        taskName = self.config.get_string(f"mtl.task{taskNumber}.name", None)
        train = self.config.get_string(f"mtl.task{taskNumber}.train", None)

        dev = self.config.get_string(f"mtl.task{taskNumber}.dev", None) if f"mtl.task{taskNumber}.dev" in self.config else None
        test = self.config.get_string(f"mtl.task{taskNumber}.test", None) if f"mtl.task{taskNumber}.test" in self.config else None

        taskType = self.parseType(self.config.get_string(f"mtl.task{taskNumber}.type", "basic"))
        weight = self.config.get_float(f"mtl.task{taskNumber}.weight", 1.0)

        return Task(taskNumber - 1, taskName, taskType, self.shardsPerEpoch, weight, train, dev, test)

    def parseType(self, inf):
        """Map a config "type" string to a TYPE_* constant; raises on unknown."""
        if inf == "basic":
            return TYPE_BASIC
        elif inf == "dual":
            return TYPE_DUAL
        else:
            raise ValueError(f"ERROR: unknown task type {inf}!")

    def debugTraversal(self):
        """Debug helper: walk all epochs and report sentence counts per task."""
        for epoch in range(self.maxEpochs):
            print(f"Started epoch {epoch}")
            sentCount = 0
            taskId = 0
            totalSents = 0
            for sentence in self.getSentences():
                totalSents += 1
                if sentence[0] != taskId:
                    print(f"Read {sentCount} sentences from task {taskId}")
                    taskId = sentence[0]
                    sentCount = 1
                else:
                    sentCount += 1
            print(f"Read {sentCount} sentences from task {taskId}")
            print(f"Read {totalSents} sentences in epoch {epoch}.")

@dataclass
class Shard:
    """半-open range [startPosition, endPosition) of training sentences for one task."""
    taskId: int
    startPosition: int
    endPosition: int

class Task:
    """One task's datasets plus its sharding/weighting metadata."""

    def __init__(self,
                 taskId,  # this starts at 0 so we can use it as an index in the array of tasks
                 taskName: str,
                 taskType: int,
                 shardsPerEpoch: int,
                 taskWeight: float,
                 trainFileName: str,
                 devFileName: str = None,
                 testFileName: str = None):
        self.taskId = taskId
        taskNumber = taskId + 1
        print(f"Reading task {taskNumber} ({taskName})...")
        self.trainSentences = ColumnReader.readColumns(trainFileName)
        self.devSentences = ColumnReader.readColumns(devFileName) if devFileName else None
        self.testSentences = ColumnReader.readColumns(testFileName) if testFileName else None

        self.isBasic = taskType == TYPE_BASIC
        self.isDual = taskType == TYPE_DUAL

        if taskType == TYPE_BASIC:
            self.prettyType = "basic"
        elif taskType == TYPE_DUAL:
            self.prettyType = "dual"
        else:
            self.prettyType = "unknown"

        # The size of the training shard for this task
        self.shardSize = math.ceil(len(self.trainSentences) / shardsPerEpoch)

        # Current position in the training sentences when we iterate during training
        # BUG FIX: the original assigned a local variable, so the attribute was
        # never created on the instance
        self.currentTrainingSentencePosition = 0

        self.taskWeight = taskWeight
        self.taskName = taskName

        print(f"============ starting task {taskNumber} ============")
        print(f"Read {len(self.trainSentences)} training sentences for task {taskNumber}, with shard size {self.shardSize}.")
        if self.devSentences is not None:
            print(f"Read {len(self.devSentences)} development sentences for task {taskNumber}.")
        if self.testSentences is not None:
            print(f"Read {len(self.testSentences)} testing sentences for task {taskNumber}.")
        print(f"Using taskWeight = {taskWeight}")
        print(f"Task type = {self.prettyType}.")
        print(f"============ completed task {taskNumber} ============")

    def mkShards(self):
        """Construct the shards from all training sentences in this task."""
        shards = list()
        crtPos = 0
        while crtPos < len(self.trainSentences):
            endPos = min(crtPos + self.shardSize, len(self.trainSentences))
            shards.append(Shard(self.taskId, crtPos, endPos))
            crtPos = endPos
        return shards
concatenateCount = 0

# Special vocabulary tokens.
# BUG FIX / NOTE(review): in SOURCE these four constants were all the empty
# string — the angle-bracket tokens were evidently stripped by markup
# mangling. START_TAG == STOP_TAG == "" would collapse the CRF's start and
# stop tags into one t2i entry, breaking Viterbi decoding. Restored to the
# conventional distinct tokens — verify against the upstream repository.
UNK_WORD = "<UNK>"
EOS_WORD = "<EOS>"

UNK_EMBEDDING = 0

START_TAG = "<START>"
STOP_TAG = "<STOP>"

RANDOM_SEED = 2522620396
WEIGHT_DECAY = 0.01

# used as the "minus infinity" value in log-space computations
LOG_MIN_VALUE = -10000.0

# NOTE(review): the original comment said "no dropout by default" but the
# value is 0.1 — confirm which is intended
DEFAULT_DROPOUT_PROBABILITY = 0.1

# inference types for final layers
TYPE_VITERBI = 1
TYPE_GREEDY = 2

# nonlinearity codes
NONLIN_NONE = 0
NONLIN_RELU = 1
NONLIN_TANH = 2

nonlin_map = {"relu": NONLIN_RELU, "tanh": NONLIN_TANH, "": NONLIN_NONE}

TYPE_GREEDY_STRING = "greedy"
TYPE_VITERBI_STRING = "viterbi"

DEFAULT_IS_DUAL = 0

def save(file, values, comment):
    """Write a commented key/value section (tab-separated), ending with a
    blank line. Readable back with readString2Ids()/readChar2Ids()."""
    file.write("# " + comment + "\n")
    for key, value in values.items():
        file.write(f"{key}\t{value}\n")
    file.write("\n")

def mkCharacterEmbedding(word, c2i, charLookupParameters, charRnnBuilder):
    """Embed a word from its characters: look up each character id (unknown
    chars map to UNK_EMBEDDING) and return the char-RNN's last output."""
    charEmbeddings = charLookupParameters(torch.LongTensor([c2i.get(c, UNK_EMBEDDING) for c in word]))
    output, _ = charRnnBuilder(charEmbeddings.unsqueeze(1))
    return output.squeeze(1)[-1]

def mkCharacterEmbedding2(char_ids, charLookupParameters, charRnnBuilder):
    """Same as mkCharacterEmbedding(), but starting from precomputed char ids
    (used by the ONNX export path)."""
    charEmbeddings = charLookupParameters(char_ids)
    output, _ = charRnnBuilder(charEmbeddings.unsqueeze(1))
    return output.squeeze(1)[-1]

def readString2Ids(s2iFilename):
    """Read a tab-separated string-to-id map written by save()."""
    s2i = dict()
    with open(s2iFilename) as f:
        for line in f:
            # skip comment headers and blank lines
            if not line.startswith("# ") and line.rstrip():
                k, v = line.strip().split('\t')
                s2i[k] = int(v)
    return s2i

def readChar2Ids(s2iFilename):
    """Read a char-to-id map; keys are stored as integer code points."""
    s2i = dict()
    with open(s2iFilename) as f:
        for line in f:
            if not line.startswith("# ") and line.rstrip():
                k, v = line.strip().split('\t')
                s2i[chr(int(k))] = int(v)
    return s2i

def sentenceLossGreedy(emissionScoresForSeq, golds):
    """Cross-entropy loss over a sentence's emission scores (greedy head)."""
    assert emissionScoresForSeq.size(0) == len(golds)
    criterion = nn.CrossEntropyLoss()
    golds = Variable(torch.LongTensor(golds))
    return criterion(emissionScoresForSeq, golds)

def emissionScoresToArrays(expressions):
    """Convert a sequence of score tensors to plain Python lists."""
    return [expr.data.tolist() for expr in expressions]

def printCoNLLOutput(pw, words, golds, preds):
    """Write one sentence in CoNLL format ("word gold pred" per line),
    terminated by a blank line."""
    assert len(words) == len(golds)
    assert len(words) == len(preds)
    for word, gold, pred in zip(words, golds, preds):
        pw.write(f"{word} {gold} {pred}\n")
    pw.write("\n")

def argmax(vec):
    """Return the argmax of a 1xN tensor as a python int."""
    _, idx = torch.max(vec, 1)
    return idx.item()

def log_sum_exp(vec):
    """Numerically stable log-sum-exp of a 1xN tensor (subtracts the max
    before exponentiating)."""
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))
def _forward_alg(self, feats):
    """Forward algorithm: log partition function over all tag sequences.
    feats: (seq_len, outDim) emission scores."""
    init_alphas = torch.full((1, self.outDim), -10000.)
    # START_TAG has all of the score
    init_alphas[0][self.t2i[START_TAG]] = 0.

    # wrap in a variable so that we will get automatic backprop
    forward_var = init_alphas

    # iterate through the sentence
    for feat in feats:
        alphas_t = []  # the forward tensors at this timestep
        for next_tag in range(self.outDim):
            # broadcast the emission score: it is the same regardless of the previous tag
            emit_score = feat[next_tag].view(1, -1).expand(1, self.outDim)
            # the ith entry of trans_score is the score of transitioning to next_tag from i
            trans_score = self.transitions[next_tag].view(1, -1)
            # the ith entry of next_tag_var is the value for the
            # edge (i -> next_tag) before we do log-sum-exp
            next_tag_var = forward_var + trans_score + emit_score
            # the forward variable for this tag is log-sum-exp of all the scores
            alphas_t.append(log_sum_exp(next_tag_var).view(1))
        forward_var = torch.cat(alphas_t).view(1, -1)
    terminal_var = forward_var + self.transitions[self.t2i[STOP_TAG]]
    return log_sum_exp(terminal_var)

def _score_sentence(self, feats, tags):
    """Score of one provided tag sequence: sum of transition + emission scores."""
    score = torch.zeros(1)
    # prepend START so tags[i] -> tags[i+1] covers the initial transition
    tags = torch.cat([torch.tensor([self.t2i[START_TAG]], dtype=torch.long), tags])
    for i, feat in enumerate(feats):
        score = score + \
            self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
    score = score + self.transitions[self.t2i[STOP_TAG], tags[-1]]
    return score

def _viterbi_decode(self, feats):
    """Viterbi decoding: returns (best_path_score, best_tag_id_sequence)."""
    backpointers = []

    # initialize the viterbi variables in log space
    init_vvars = torch.full((1, self.outDim), -10000.)
    init_vvars[0][self.t2i[START_TAG]] = 0

    # forward_var at step i holds the viterbi variables for step i-1
    forward_var = init_vvars
    for feat in feats:
        bptrs_t = []  # holds the backpointers for this step
        viterbivars_t = []  # holds the viterbi variables for this step

        for next_tag in range(self.outDim):
            # next_tag_var[i] holds the viterbi variable for tag i at the
            # previous step, plus the score of transitioning from tag i to
            # next_tag. Emission scores are added below because the max does
            # not depend on them.
            next_tag_var = forward_var + self.transitions[next_tag]
            best_tag_id = argmax(next_tag_var)
            bptrs_t.append(best_tag_id)
            viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
        # now add in the emission scores, and assign forward_var to the set
        # of viterbi variables we just computed
        forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
        backpointers.append(bptrs_t)

    # transition to STOP_TAG
    terminal_var = forward_var + self.transitions[self.t2i[STOP_TAG]]
    best_tag_id = argmax(terminal_var)
    path_score = terminal_var[0][best_tag_id]

    # follow the back pointers to decode the best path
    best_path = [best_tag_id]
    for bptrs_t in reversed(backpointers):
        best_tag_id = bptrs_t[best_tag_id]
        best_path.append(best_tag_id)
    # pop off the start tag (we don't want to return that to the caller)
    start = best_path.pop()
    assert start == self.t2i[START_TAG]  # sanity check
    best_path.reverse()
    return path_score, best_path

def loss(self, finalStates, goldLabelStrings):
    """CRF negative log-likelihood: log partition minus gold-sequence score."""
    goldLabels = torch.tensor([self.t2i[gs] for gs in goldLabelStrings], dtype=torch.long)
    forward_score = self._forward_alg(finalStates)
    gold_score = self._score_sentence(finalStates, goldLabels)
    return forward_score - gold_score

def saveX2i(self):
    """Serialize the metadata needed by load().
    NOTE(review): dropoutProb is not persisted although load() reads it (and
    falls back to DEFAULT_DROPOUT_PROBABILITY) — confirm this is intended."""
    x2i = dict()
    x2i["inferenceType"] = TYPE_VITERBI
    x2i["inputSize"] = self.inputSize
    x2i["isDual"] = 1 if self.isDual else 0
    x2i["span"] = spanToString(self.spans) if self.spans else ""
    x2i["nonlinearity"] = self.nonlinearity
    x2i["t2i"] = self.t2i
    return x2i

def __str__(self):
    return f"ViterbiForwardLayer({self.inDim}, {self.outDim})"

def inference(self, emissionScores):
    """Viterbi-decode emission scores into label strings."""
    score, labelsIds = self._viterbi_decode(emissionScores)
    return [self.i2t[i] for i in labelsIds]

def inference2(self, emissionScores):
    """Greedy per-position argmax (ignores transitions)."""
    return torch.argmax(emissionScores, dim=1)

def inferenceWithScores(self, emissionScores):
    # BUG FIX: the original was missing "self", so any call on an instance
    # raised TypeError instead of the intended RuntimeError
    raise RuntimeError("ERROR: inferenceWithScores not supported for ViterbiLayer!")

@classmethod
def load(cls, x2i):
    """Rebuild a ViterbiForwardLayer from the dict produced by saveX2i()."""
    inputSize = x2i["inputSize"]
    isDual = x2i.get("isDual", DEFAULT_IS_DUAL) == 1
    spanValue = x2i.get("span", "")  # renamed from the original "sapnValue" typo
    spans = None if spanValue == "" else parseSpan(spanValue, inputSize)
    nonlinearity = x2i.get("nonlinearity", NONLIN_NONE)
    t2i = x2i["t2i"]
    i2t = {i: t for t, i in t2i.items()}
    dropoutProb = x2i.get("dropoutProb", DEFAULT_DROPOUT_PROBABILITY)

    # dual layers see the concatenation of (modifier, head) representations
    if spans:
        l = spanLength(spans)
        actualInputSize = 2 * l if isDual else l
    else:
        actualInputSize = 2 * inputSize if isDual else inputSize

    return cls(inputSize, isDual, t2i, i2t, actualInputSize, nonlinearity, dropoutProb, spans)
def to_numpy(tensor):
    # Detach before converting when the tensor still tracks gradients.
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

class Char_RNN(torch.nn.Module):
    # Wraps the character lookup table + character RNN of the model's initial
    # layer so they can be exported to ONNX as a standalone module.
    # NOTE(review): if several Layers objects have an initial layer, only the
    # LAST one's parameters are kept — confirm there is at most one.

    def __init__(self, model):
        super().__init__()
        for i, layers in enumerate(model):
            if layers.initialLayer is not None:
                self.char_lookup = layers.initialLayer.charLookupParameters
                self.char_rnn = layers.initialLayer.charRnnBuilder

    def forward(self, char_ids):
        # char_ids: LongTensor of character indices for one word
        charEmbedding = mkCharacterEmbedding2(char_ids, self.char_lookup, self.char_rnn)
        return charEmbedding

class Saving_Model(torch.nn.Module):
    """ONNX-exportable wrapper around the full MTL model: word lookup +
    per-task intermediate layers + per-task final layers. Character
    embeddings are produced separately by Char_RNN and passed in as input."""
    def __init__(self, model):
        super().__init__()
        self.model_length = len(model)
        self.intermediateLayerss = [None for _ in range(self.model_length)]
        self.finalLayers = [None for _ in range(self.model_length)]
        for i, layers in enumerate(model):
            if layers.initialLayer is not None:
                self.word_lookup = layers.initialLayer.wordLookupParameters
            self.intermediateLayerss[i] = nn.ModuleList(layers.intermediateLayers)
            self.finalLayers[i] = layers.finalLayer
        # re-wrap in ModuleList so parameters are registered for export
        self.intermediateLayerss = nn.ModuleList(self.intermediateLayerss)
        self.finalLayers = nn.ModuleList(self.finalLayers)
    def forward(self, embeddings, word_ids, charEmbedding):
        # Can I assume there is only one initial layer?
        learnedWordEmbeddings = self.word_lookup(word_ids)
        # concatenate const embeddings + learned embeddings + char embeddings
        embedParts = [embeddings, learnedWordEmbeddings, charEmbedding]
        embedParts = [ep for ep in embedParts if ep is not None]
        state = torch.cat(embedParts, dim=1)
        for i in range(self.model_length):
            for il in self.intermediateLayerss[i]:
                state = il(state, False)
            if self.finalLayers[i]:
                state = self.finalLayers[i](state, None)#headPositions set to be None for now, we can add it in input list later
        # CRF transition matrix of the last final layer, exported alongside
        # the emission scores so decoding can happen outside PyTorch
        transitions = self.finalLayers[-1].transitions
        return state, transitions

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--model_file', type=str, help='Filename of the model.', nargs='+')
    parser.add_argument('--config', type=str, help='Filename of the configuration.')
    parser.add_argument('--seed', type=int, default=1234)
    args = parser.parse_args()

    config = ConfigFactory.parse_file(f'../resources/org/clulab/{args.config}.conf')
    taskManager = TaskManager(config, args.seed)
    modelName = args.model_file
    # one file -> plain load; several files -> averaged ensemble load
    if len(modelName)==1:
        model = Metal.load(modelName[0])
    else:
        model = Metal.load_multi(modelName)

    for layers in model:
        layers.start_eval()
    constEmbeddings = ConstEmbeddingsGlove.get_ConstLookupParams()

    # freeze both export wrappers: inference only
    export_char = Char_RNN(model)
    export_model = Saving_Model(model)
    export_model.eval()
    export_char.eval()
    for param in export_model.parameters():
        param.requires_grad = False
    for param in export_char.parameters():
        param.requires_grad = False

    torch.manual_seed(taskManager.random)
    random.seed(taskManager.random)

    # metadata saved next to the checkpoint by Metal.save()
    x2i = json.load(open(args.model_file[0]+".json"))

    c2i = x2i[0]['x2i']['initialLayer']['c2i']
    w2i = x2i[0]['x2i']['initialLayer']['w2i']
    t2i = x2i[1]['x2i']['finalLayer']["t2i"]
    i2t = {i:t for t, i in t2i.items()}

    for taskId in range(0, taskManager.taskCount):
        taskName = taskManager.tasks[taskId].taskName
        testSentences = taskManager.tasks[taskId].testSentences
        if testSentences:
            # use one real test sentence as the tracing input for the export
            reader = MetalRowReader()
            annotatedSentences = reader.toAnnotatedSentences(testSentences[1])

            asent = annotatedSentences[0]
            sentence = asent[0]
            goldLabels = asent[1]

            words = sentence.words

            # character embeddings, one per word, via the Char_RNN wrapper
            char_embs = []
            for word in words:
                char_ids = torch.LongTensor([c2i.get(c, UNK_EMBEDDING) for c in word])
                char_out = export_char(char_ids)
                char_embs.append(char_out)
            char_embs = torch.stack(char_embs)
            embed_ids = torch.LongTensor([constEmbeddings.w2i[word] if word in constEmbeddings.w2i else 0 for word in words])
            embeddings = constEmbeddings.emb(embed_ids)
            word_ids = torch.LongTensor([w2i[word] if word in w2i else 0 for word in words])
            # reference outputs used below to validate the ONNX runtime
            state, transitions = export_model(embeddings, word_ids, char_embs)
            dummy_input = (embeddings, word_ids, char_embs)

            torch.onnx.export(export_char,
                              char_ids,
                              "char.onnx",
                              export_params=True,
                              do_constant_folding=True,
                              input_names = ['char_ids'],
                              output_names = ['chars'],
                              dynamic_axes = {"char_ids": {0: 'word length'}})

            torch.onnx.export(export_model,              # model being run
                              dummy_input,               # model input (or a tuple for multiple inputs)
                              "model.onnx",              # where to save the model (can be a file or file-like object)
                              export_params=True,        # store the trained parameter weights inside the model file
                              opset_version=10,          # the ONNX version to export the model to
                              do_constant_folding=True,  # whether to execute constant folding for optimization
                              input_names = ['embed', 'words', 'chars'],       # the model's input names
                              output_names = ['state', 'transitions'],         # the model's output names
                              dynamic_axes = {'embed' : {0 : 'sentence length'},
                                              'words' : {0 : 'sentence length'},
                                              'chars' : {0 : 'sentence length'},
                                              'state': {0 : 'sentence length'}})

            # structural validation of both exported graphs
            onnx_model = onnx.load("model.onnx")
            onnx.checker.check_model(onnx_model)
            char_model = onnx.load("char.onnx")
            onnx.checker.check_model(char_model)

            ort_session = onnxruntime.InferenceSession("model.onnx")
            ort_char = onnxruntime.InferenceSession("char.onnx")
            # compute ONNX Runtime output prediction

            ort_inputs = {ort_char.get_inputs()[i].name: to_numpy(x) for i, x in enumerate([char_ids])}
            ort_outs = ort_char.run(None, ort_inputs)
            try:
                np.testing.assert_allclose(to_numpy(char_out), ort_outs[0], rtol=1e-03, atol=1e-05)
            except AssertionError as e:
                print (e)
            ort_inputs = {ort_session.get_inputs()[i].name: to_numpy(x) for i, x in enumerate(dummy_input)}
            ort_outs = ort_session.run(None, ort_inputs)

            # compare PyTorch and ONNX Runtime emission scores
            try:
                np.testing.assert_allclose(state.detach().cpu().numpy(), ort_outs[0], rtol=1e-03, atol=1e-05)
            except AssertionError as e:
                print (e)

            print("Exported model has been tested with ONNXRuntime, and the result looks good!")
args.seed) + modelName = args.model_file + if len(modelName)==1: + model = Metal.load(modelName[0]) + else: + model = Metal.load_multi(modelName) + mtl = Metal(taskManager, model) + mtl.test() + elif args.shell: + pass \ No newline at end of file diff --git a/main/src/main/python/sequences/__init__.py b/main/src/main/python/sequences/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/main/src/main/python/sequences/columnReader.py b/main/src/main/python/sequences/columnReader.py new file mode 100644 index 000000000..e162316f7 --- /dev/null +++ b/main/src/main/python/sequences/columnReader.py @@ -0,0 +1,47 @@ +#----------------------------------------------------------- +# Reads the CoNLL-like column format +#----------------------------------------------------------- +class ColumnReader: + + def readColumns(source): + if type(source) is str: + source = open(source) + sentence = list() + sentences = list() + for line in source: + l = line.strip() + if (l is ""): + # end of sentence + if (sentence): + sentences += [sentence] + sentence = list() + else: + # within the same sentence + bits = l.split("\t") + if (len(bits) < 2): + raise RuntimeError(f"ERROR: invalid line {l}!") + sentence += [Row(bits)] + + if (sentence): + sentences += [sentence] + + source.close() + return sentences + +# ----------------------------------------------------------- +# Stores training data for sequence modeling +# Mandatory columns: 0 - word, 1 - label +# Optional columns: 2 - POS tag, 3+ SRL arguments +# @param tokens +# ----------------------------------------------------------- + +class Row: + + def __init__(self, tokens): + self.tokens = tokens + self.length = len(tokens) + + def get(self, idx): + if(idx >= self.length): + raise RuntimeError(f"ERROR: trying to read field #{idx}, which does not exist in this row: {tokens}!") + return self.tokens[idx] diff --git a/main/src/main/python/sequences/rowReaders.py b/main/src/main/python/sequences/rowReaders.py new file 
class AnnotatedSentence:
    """Holds one sentence's words and optional per-token annotations."""

    def __init__(self, words, posTags = None, neTags = None, headPositions = None):
        self.words = words
        self.posTags = posTags
        self.neTags = neTags
        self.headPositions = headPositions
        self.size = len(words)
        # NOTE: attribute name keeps its original (misspelled) form because
        # external code may already depend on it
        self.indicies = range(self.size)

class RowReader(object):
    """Abstract reader turning Rows into (AnnotatedSentence, labels) pairs."""

    def __init__(self):
        raise NotImplementedError

    def toAnnotatedSentences(self, rows):
        raise NotImplementedError

class MetalRowReader(RowReader):
    """Reader for the Metal column formats (2, 4, or 5+ columns)."""

    def __init__(self):
        # column positions in the Metal format
        self.WORD_POSITION = 0
        self.POS_TAG_POSITION = 1
        self.NE_LABEL_POSITION = 2
        self.LABEL_START_OFFSET = 3

    def toAnnotatedSentences(self, rows):
        """Dispatch on column count: 2 = simple, 4 = extended, 5+ = full."""
        if rows[0].length == 2:
            return self.parseSimple(rows)
        elif rows[0].length == 4:
            return self.parseSimpleExtended(rows)
        elif rows[0].length >= 5:
            return self.parseFull(rows)
        else:
            raise RuntimeError("ERROR: the Metal format expects 2, 4, or 5+ columns!")

    # Parser for the simple format: word, label
    def parseSimple(self, rows):
        assert(rows[0].length == 2)
        words = list()
        labels = list()

        for row in rows:
            words += [row.get(self.WORD_POSITION)]
            labels += [row.get(self.WORD_POSITION + 1)]

        return [(AnnotatedSentence(words), labels)]

    # Parser for the simple extended format: word, POS tag, NE label, label
    def parseSimpleExtended(self, rows):
        assert(rows[0].length == 4)
        words = list()
        posTags = list()
        neLabels = list()
        labels = list()

        for row in rows:
            words += [row.get(self.WORD_POSITION)]
            posTags += [row.get(self.POS_TAG_POSITION)]
            neLabels += [row.get(self.NE_LABEL_POSITION)]
            labels += [row.get(self.LABEL_START_OFFSET)]

        return [(AnnotatedSentence(words, posTags, neLabels), labels)]

    # Parser for the full format: word, POS tag, NE label, (label head)+
    def parseFull(self, rows):
        assert(rows[0].length >= 5)
        # each (label, head) pair past the first 3 columns is one sentence
        # view; integer divmod replaces the original float division plus
        # numSent == int(numSent) assert, keeping AssertionError semantics
        numSent, rem = divmod(rows[0].length - 3, 2)
        assert(rem == 0)
        assert(numSent >= 1)

        words = list()
        posTags = list()
        neLabels = list()
        headPositions = [list() for _ in range(numSent)]
        labels = [list() for _ in range(numSent)]

        for row in rows:
            words += [row.get(self.WORD_POSITION)]
            posTags += [row.get(self.POS_TAG_POSITION)]
            neLabels += [row.get(self.NE_LABEL_POSITION)]

            for j in range(numSent):
                labels[j] += [row.get(self.LABEL_START_OFFSET + (j * 2))]
                headField = row.get(self.LABEL_START_OFFSET + (j * 2) + 1)
                try:
                    headPositions[j] += [int(headField)]
                except ValueError as e:
                    # was a bare `except: raise RuntimeError`, which hid the
                    # offending value; keep the RuntimeError type but chain it
                    raise RuntimeError(f"ERROR: invalid head position {headField!r}!") from e

        sentences = list()
        for i in range(numSent):
            annotatedSent = AnnotatedSentence(words, posTags, neLabels, headPositions[i])
            sentLabels = labels[i]
            sentences += [(annotatedSent, sentLabels)]

        return sentences
+ # We don't include the emission scores here because the max + # does not depend on them (we add them in below) + next_tag_var = forward_var + transitions[next_tag] + best_tag_id = np.argmax(next_tag_var, 1)[0] + bptrs_t.append(best_tag_id) + viterbivars_t.append(next_tag_var[0][best_tag_id].reshape(1)) + # Now add in the emission scores, and assign forward_var to the set + # of viterbi variables we just computed + forward_var = (np.concatenate(viterbivars_t) + feat).reshape(1, -1) + backpointers.append(bptrs_t) + + # Transition to STOP_TAG + terminal_var = forward_var + transitions[t2i[STOP_TAG]] + best_tag_id = np.argmax(terminal_var, 1)[0] + path_score = terminal_var[0][best_tag_id] + + # Follow the back pointers to decode the best path. + best_path = [best_tag_id] + for bptrs_t in reversed(backpointers): + best_tag_id = bptrs_t[best_tag_id] + best_path.append(best_tag_id) + # Pop off the start tag (we dont want to return that to the caller) + start = best_path.pop() + assert start == t2i[START_TAG] # Sanity check + best_path.reverse() + return path_score, best_path + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('--model_file', type=str, help='Filename of the model.') + parser.add_argument('--config', type=str, help='Filename of the configuration.') + parser.add_argument('--seed', type=int, default=1234) + args = parser.parse_args() + + config = ConfigFactory.parse_file(f'../resources/org/clulab/{args.config}.conf') + taskManager = TaskManager(config, args.seed) + constEmbeddings = ConstEmbeddingsGlove.get_ConstLookupParams() + + x2i = json.load(open(args.model_file+".json")) + + c2i = x2i[0]['x2i']['initialLayer']['c2i'] + w2i = x2i[0]['x2i']['initialLayer']['w2i'] + t2i = x2i[1]['x2i']['finalLayer']["t2i"] + i2t = {i:t for t, i in t2i.items()} + + torch.manual_seed(taskManager.random) + random.seed(taskManager.random) + + onnx_model = onnx.load("model.onnx") + onnx.checker.check_model(onnx_model) + char_model = 
onnx.load("char.onnx") + onnx.checker.check_model(char_model) + + ort_session = onnxruntime.InferenceSession("model.onnx") + ort_char = onnxruntime.InferenceSession("char.onnx") + + scoreCountsByLabel = ScoreCountsByLabel() + start_time = time.time() + for taskId in range(0, taskManager.taskCount): + taskName = taskManager.tasks[taskId].taskName + sentences = taskManager.tasks[taskId].testSentences + if sentences: + reader = MetalRowReader() + for sent in sentences: + annotatedSentences = reader.toAnnotatedSentences(sent) + + for asent in annotatedSentences: + sentence = asent[0] + goldLabels = asent[1] + + words = sentence.words + + char_embs = [] + for word in words: + char_ids = np.array([c2i.get(c, UNK_EMBEDDING) for c in word]) + ort_inputs = {ort_char.get_inputs()[i].name: x for i, x in enumerate([char_ids])} + ort_outs = ort_char.run(None, ort_inputs) + char_embs.append(ort_outs[0]) + char_embs = np.stack(char_embs) + embed_ids = torch.LongTensor([constEmbeddings.w2i[word] if word in constEmbeddings.w2i else 0 for word in words]) + embeddings = constEmbeddings.emb(embed_ids).detach().cpu().numpy() + word_ids = np.array([w2i[word] if word in w2i else 0 for word in words]) + + dummy_input = (embeddings, word_ids, char_embs) + + ort_inputs = {ort_session.get_inputs()[i].name: x for i, x in enumerate(dummy_input)} + ort_outs = ort_session.run(None, ort_inputs) + + _, ids = viterbi_decode(ort_outs[0], ort_outs[1], t2i) + + preds = [i2t[i] for i in ids] + + sc = SeqScorer.f1(goldLabels, preds) + scoreCountsByLabel.incAll(sc) + + + print (f"Accuracy : {scoreCountsByLabel.accuracy()}") + print (f"Precision : {scoreCountsByLabel.precision()}") + print (f"Recall on : {scoreCountsByLabel.recall()}") + print (f"Micro F1 : {scoreCountsByLabel.f1()}") + for label in scoreCountsByLabel.labels(): + print (f"\tP/R/F1 for label {label} ({scoreCountsByLabel.map[label].gold}): {scoreCountsByLabel.precision(label)} / {scoreCountsByLabel.recall(label)} / 
{scoreCountsByLabel.f1(label)}") + duration = time.time() - start_time + print (duration) + \ No newline at end of file diff --git a/main/src/main/resources/org/clulab/glove.conf b/main/src/main/resources/org/clulab/glove.conf index 22f1e4b36..6b8bd08b0 100644 --- a/main/src/main/resources/org/clulab/glove.conf +++ b/main/src/main/resources/org/clulab/glove.conf @@ -1,5 +1,5 @@ glove { - matrixResourceName = "/org/clulab/glove/glove.840B.300d.10f" + matrixResourceName = "glove.840B.300d.10f.txt" isResource = true } \ No newline at end of file diff --git a/main/src/main/resources/org/clulab/mtl-en-ner.conf b/main/src/main/resources/org/clulab/mtl-en-ner.conf index 6cd5eecc2..aacc06054 100644 --- a/main/src/main/resources/org/clulab/mtl-en-ner.conf +++ b/main/src/main/resources/org/clulab/mtl-en-ner.conf @@ -8,7 +8,7 @@ mtl { learnedWordEmbeddingSize = 128 charEmbeddingSize = 32 charRnnStateSize = 16 - c2i = "org/clulab/c2i-en.txt" + c2i = "../resources/org/clulab/c2i-en.txt" } intermediate1 { @@ -20,9 +20,9 @@ mtl { task1 { name = "En NER" - train = "dynet/en/ner/train.txt" - dev = "dynet/en/ner/dev.txt" - test = "dynet/en/ner/test.txt" + train = "ner/train.txt" + dev = "ner/dev.txt" + test = "ner/test.txt" layers { final { diff --git a/main/src/main/resources/org/clulab/mtl-en-pos-chunk-srlp.conf b/main/src/main/resources/org/clulab/mtl-en-pos-chunk-srlp.conf index 828fd973d..b23692fff 100644 --- a/main/src/main/resources/org/clulab/mtl-en-pos-chunk-srlp.conf +++ b/main/src/main/resources/org/clulab/mtl-en-pos-chunk-srlp.conf @@ -9,7 +9,7 @@ mtl { learnedWordEmbeddingSize = 128 charEmbeddingSize = 32 charRnnStateSize = 16 - c2i = "org/clulab/c2i-en.txt" + c2i = "../resources/org/clulab/c2i-en.txt" } intermediate1 { @@ -21,9 +21,9 @@ mtl { task1 { name = "En POS tagging" - train = "dynet/en/pos/train.txt" - dev = "dynet/en/pos/dev.txt" - test = "dynet/en/pos/test.txt" + train = "/data/nlp/corpora/processors-dynet/en/pos/train.txt" + dev = 
"/data/nlp/corpora/processors-dynet/en/pos/dev.txt" + test = "/data/nlp/corpora/processors-dynet/en/pos/test.txt" layers { final { @@ -34,9 +34,9 @@ mtl { task2 { name = "En chunking" - train = "dynet/en/chunking/train.txt" - dev = "dynet/en/chunking/test.txt" - test = "dynet/en/chunking/test.txt" + train = "/data/nlp/corpora/processors-dynet/en/chunking/train.txt" + dev = "/data/nlp/corpora/processors-dynet/en/chunking/test.txt" + test = "/data/nlp/corpora/processors-dynet/en/chunking/test.txt" layers { final { @@ -47,9 +47,9 @@ mtl { task3 { name = "En SRL predicates" - train = "dynet/en/srl/train.preds" - dev = "dynet/en/srl/dev.preds" - test = "dynet/en/srl/test-wsj.preds" + train = "/data/nlp/corpora/processors-dynet/en/srl/train.preds" + dev = "/data/nlp/corpora/processors-dynet/en/srl/dev.preds" + test = "/data/nlp/corpora/processors-dynet/en/srl/test-wsj.preds" layers { final { diff --git a/main/src/main/resources/org/clulab/mtl-en-srla.conf b/main/src/main/resources/org/clulab/mtl-en-srla.conf index 6dcf5bbd8..8f5181484 100644 --- a/main/src/main/resources/org/clulab/mtl-en-srla.conf +++ b/main/src/main/resources/org/clulab/mtl-en-srla.conf @@ -16,9 +16,9 @@ mtl { distanceEmbeddingSize = 16 distanceWindowSize = 50 useIsPredicate = true - c2i = "org/clulab/c2i-en.txt" - tag2i = "org/clulab/tag2i-en.txt" - ne2i = "org/clulab/ne2i-en.txt" + c2i = "../resources/org/clulab/c2i-en.txt" + tag2i = "../resources/org/clulab/tag2i-en.txt" + ne2i = "../resources/org/clulab/ne2i-en.txt" } intermediate1 { @@ -31,9 +31,9 @@ mtl { task1 { name = "En SRL arguments" - train = "dynet/en/srl/train.args" - dev = "dynet/en/srl/dev.args" - test = "dynet/en/srl/test-wsj.args" + train = "/data/nlp/corpora/processors-dynet/en/srl/train.args" + dev = "/data/nlp/corpora/processors-dynet/en/srl/dev.args" + test = "/data/nlp/corpora/processors-dynet/en/srl/test-wsj.args" type = "dual" layers {