import numpy as np
import math
import torch.nn as nn
import torch


class WordEmbeddingMap:
    """Wraps a frozen nn.Embedding built from a GloVe-style text matrix."""

    def __init__(self, config):
        # load() returns (word -> vector dict, embedding size, word -> row index, nn.Embedding)
        self.emb_dict, self.dim, self.w2i, self.emb = load(config)

    def isOutOfVocabulary(self, word):
        """True if the word has no row in the pretrained embedding matrix."""
        return word not in self.w2i


def load(config):
    """Read the embedding matrix named by config key "glove.matrixResourceName".

    Each line is "<word><delim><v1><delim><v2>...", tab- or space-delimited.
    Lines with exactly two whitespace tokens are skipped: they are assumed to
    be a word2vec-style "<rows> <cols>" header rather than a 1-d embedding.

    Returns (emb_dict, embedding_size, w2i, emb) where emb is a frozen
    nn.Embedding whose row order follows w2i.

    Raises ValueError if the file contains no embedding rows.
    """
    emb_dict = dict()
    w2i = {}
    i = 0
    embedding_size = None
    # "with" guarantees the file is closed; the original leaked the handle.
    with open(config.get_string("glove.matrixResourceName")) as matrix_file:
        for line in matrix_file:
            if len(line.split()) == 2:
                continue  # word2vec-style "<rows> <cols>" header line
            delimiter = "\t" if "\t" in line else " "
            # NOTE(review): split(" ") yields empty tokens on runs of spaces;
            # assumes the file is strictly single-space delimited — confirm.
            word, *rest = line.rstrip().split(delimiter)
            w2i[word] = i
            i += 1
            vector = np.array(list(map(float, rest)))  # deliberately NOT normalized
            embedding_size = vector.shape[0]
            emb_dict[word] = vector

    if embedding_size is None:
        # Original code raised an opaque NameError here (embedding_size unbound).
        raise ValueError("empty embedding matrix: no vector rows found")

    weights = np.zeros((len(emb_dict), embedding_size))
    for w, row in w2i.items():
        weights[row] = emb_dict[w]
    emb = nn.Embedding.from_pretrained(torch.FloatTensor(weights), freeze=True)
    return emb_dict, embedding_size, w2i, emb

# (start of constEmbeddingsGlove.py in the original patch)
from dataclasses import dataclass
import torch.nn as nn
from embeddings.wordEmbeddingMap import *
from pyhocon import ConfigFactory
import torch


@dataclass
class ConstEmbeddingParameters:
    """Bundle of the frozen pretrained embedding table and its word->row map."""
    emb: nn.Embedding
    w2i: dict


class _ConstEmbeddingsGlove:
    """Process-wide holder of the frozen GloVe embeddings.

    Instantiated exactly once at module import time (ConstEmbeddingsGlove below),
    so the embedding matrix is read from disk a single time per process.
    """

    def __init__(self):
        self.SINGLETON_WORD_EMBEDDING_MAP = None
        self.cep = None
        # NOTE(review): relative path assumes the CWD is the python/ source
        # directory — confirm before running from elsewhere.
        config = ConfigFactory.parse_file('../resources/org/clulab/glove.conf')
        self.load(config)
        self.dim = self.SINGLETON_WORD_EMBEDDING_MAP.dim

    def load(self, config):
        # Load lazily and only once; later calls are no-ops.
        if self.SINGLETON_WORD_EMBEDDING_MAP is None:
            self.SINGLETON_WORD_EMBEDDING_MAP = WordEmbeddingMap(config)
            self.cep = ConstEmbeddingParameters(self.SINGLETON_WORD_EMBEDDING_MAP.emb, self.SINGLETON_WORD_EMBEDDING_MAP.w2i)

    def get_ConstLookupParams(self):
        return self.cep


ConstEmbeddingsGlove = _ConstEmbeddingsGlove()
# (end of constEmbeddingsGlove.py; start of embeddingLayer.py)
from pytorch.initialLayer import InitialLayer
import random
from pytorch.utils import *
import torch.nn as nn
import torch
from pytorch.constEmbeddingsGlove import ConstEmbeddingsGlove

DEFAULT_DROPOUT_PROB: float = DEFAULT_DROPOUT_PROBABILITY
DEFAULT_LEARNED_WORD_EMBEDDING_SIZE: int = 128
DEFAULT_CHAR_EMBEDDING_SIZE: int = 32
DEFAULT_CHAR_RNN_STATE_SIZE: int = 16
DEFAULT_POS_TAG_EMBEDDING_SIZE: int = -1   # no POS tag embeddings by default
DEFAULT_NE_TAG_EMBEDDING_SIZE: int = -1    # no NE tag embeddings by default
DEFAULT_DISTANCE_EMBEDDING_SIZE: int = -1  # no distance embeddings by default
DEFAULT_POSITION_EMBEDDING_SIZE: int = -1  # no position embeddings by default
DEFAULT_DISTANCE_WINDOW_SIZE: int = -1
DEFAULT_USE_IS_PREDICATE: int = -1
random.seed(RANDOM_SEED)


class EmbeddingLayer(InitialLayer):
    """Initial layer: turns a sentence into per-token embedding vectors.

    The final per-token vector is the concatenation of: frozen GloVe vector,
    learned word embedding, char biLSTM state, and (when configured) POS tag,
    NE tag, distance-to-predicate, position, and is-predicate features.
    """

    def __init__(self, w2i,                      # word to index
                 w2f,                            # word to frequency
                 c2i,                            # character to index
                 tag2i,                          # POS tag to index
                 ne2i,                           # NE tag to index
                 learnedWordEmbeddingSize,       # size of the learned word embedding
                 charEmbeddingSize,              # size of the character embedding
                 charRnnStateSize,               # size of each one of the char-level RNNs
                 posTagEmbeddingSize,            # size of the POS tag embedding
                 neTagEmbeddingSize,             # size of the NE tag embedding
                 distanceEmbeddingSize,
                 distanceWindowSize,             # window considered for distance values (relative to predicate)
                 positionEmbeddingSize,
                 useIsPredicate,                 # if true, add a Boolean bit to indicate if current word is the predicate
                 wordLookupParameters,
                 charLookupParameters,
                 charRnnBuilder,                 # (input, state, layers, bi, dropout) tuple for the char LSTM
                 posTagLookupParameters,
                 neTagLookupParameters,
                 distanceLookupParameters,
                 positionLookupParameters,
                 dropoutProb):
        super().__init__()
        self.w2i = w2i
        self.w2f = w2f
        self.c2i = c2i
        self.tag2i = tag2i
        self.ne2i = ne2i
        self.learnedWordEmbeddingSize = learnedWordEmbeddingSize
        self.charEmbeddingSize = charEmbeddingSize
        self.charRnnStateSize = charRnnStateSize
        self.posTagEmbeddingSize = posTagEmbeddingSize
        self.neTagEmbeddingSize = neTagEmbeddingSize
        self.distanceEmbeddingSize = distanceEmbeddingSize
        self.distanceWindowSize = distanceWindowSize
        self.positionEmbeddingSize = positionEmbeddingSize
        self.useIsPredicate = useIsPredicate
        self.wordLookupParameters = wordLookupParameters
        self.charLookupParameters = charLookupParameters
        # the tuple is expanded into an actual nn.LSTM here
        self.charRnnBuilder = mkBuilder(*charRnnBuilder)
        self.posTagLookupParameters = posTagLookupParameters
        self.neTagLookupParameters = neTagLookupParameters
        self.distanceLookupParameters = distanceLookupParameters
        self.positionLookupParameters = positionLookupParameters
        self.dropoutProb = dropoutProb

        # optional feature widths contribute 0 when the lookup table is absent
        posTagDim = posTagEmbeddingSize if posTagLookupParameters else 0
        neTagDim = neTagEmbeddingSize if neTagLookupParameters else 0
        distanceDim = distanceEmbeddingSize if distanceLookupParameters else 0
        predicateDim = 1 if distanceLookupParameters and useIsPredicate else 0
        positionDim = positionEmbeddingSize if positionLookupParameters else 0
        # char biLSTM contributes 2 * state size (forward + backward)
        self.outDim = ConstEmbeddingsGlove.dim + learnedWordEmbeddingSize + charRnnStateSize * 2 + posTagDim + neTagDim + distanceDim + positionDim + predicateDim

    def forward(self, sentence, constEmbeddings, doDropout):
        """Embed one sentence; returns a (len(words), outDim) tensor."""
        words = sentence.words
        tags = sentence.posTags
        nes = sentence.neTags
        headPositions = sentence.headPositions

        # const word embeddings such as GloVe
        constEmbeddingsExpressions = self.mkConstEmbeddings(words, constEmbeddings)
        assert(constEmbeddingsExpressions.size(0) == len(words))
        if tags: assert(len(tags) == len(words))
        if nes: assert(len(nes) == len(words))
        if headPositions: assert(len(headPositions) == len(words))

        # build the word embeddings one by one
        embeddings = self.mkEmbeddings(words, constEmbeddingsExpressions, doDropout, tags, nes, headPositions)
        return embeddings

    def mkConstEmbeddings(self, words, constEmbeddings):
        """Look up the frozen pretrained vectors; OOV words map to row 0."""
        idxs = torch.LongTensor([constEmbeddings.w2i[word] if word in constEmbeddings.w2i else 0 for word in words])
        embeddings = constEmbeddings.emb(idxs)
        return embeddings

    def mkEmbeddings(self, words, constEmbeddings, doDropout, tags=None, nes=None, headPositions=None):
        """Concatenate all configured per-token features (see class docstring)."""
        #
        # Learned word embeddings: initialized randomly, updated during backprop
        #
        ids = []
        for word in words:
            wordId = self.w2i.get(word, 0)  # 0 reserved for UNK in the vocab ("id" shadowed a builtin before)
            # sample uniformly with prob 0.5 from singletons; move all other singletons to UNK
            if doDropout and wordId > 0 and self.w2f[word] == 1 and random.random() < 0.5:
                wordId = 0
            ids.append(wordId)
        learnedWordEmbeddings = self.wordLookupParameters(torch.LongTensor(ids))

        #
        # biLSTM over character embeddings
        #
        charEmbedding = torch.stack([mkCharacterEmbedding(word, self.c2i, self.charLookupParameters, self.charRnnBuilder) for word in words])

        #
        # POS tag embedding
        #
        if tags and self.posTagLookupParameters:
            posTagEmbed = self.posTagLookupParameters(torch.LongTensor([self.tag2i.get(tag, 0) for tag in tags]))
        else:
            posTagEmbed = None

        #
        # NE tag embedding
        #
        if nes and self.neTagLookupParameters:
            neTagEmbed = self.neTagLookupParameters(torch.LongTensor([self.ne2i.get(ne, 0) for ne in nes]))
        else:
            neTagEmbed = None

        #
        # 1 if this word is the predicate
        #
        if headPositions and self.useIsPredicate:
            predEmbed = torch.FloatTensor([1 if i == predicatePosition else 0 for i, predicatePosition in enumerate(headPositions)]).unsqueeze(1)
        else:
            predEmbed = None

        #
        # Distance embedding, relative to the distance to the predicate.
        # Distances are clipped to the window [-distanceWindowSize, +distanceWindowSize].
        #
        if headPositions and self.distanceLookupParameters:
            dists = [max(i - predicatePosition + self.distanceWindowSize + 1, 0) if i - predicatePosition <= self.distanceWindowSize else 2 * self.distanceWindowSize + 2 for i, predicatePosition in enumerate(headPositions)]
            distanceEmbedding = self.distanceLookupParameters(torch.LongTensor(dists))
        else:
            distanceEmbedding = None

        #
        # Embedding that captures the absolute position of the token in the sentence
        # (capped at 100, matching the 101-row lookup table)
        #
        if self.positionLookupParameters:
            values = [i if i < 100 else 100 for i in range(len(words))]
            positionEmbedding = self.positionLookupParameters(torch.LongTensor(values))
        else:
            positionEmbedding = None

        # The final word embedding is a concatenation of all of these
        embedParts = [constEmbeddings, learnedWordEmbeddings, charEmbedding, posTagEmbed, neTagEmbed, distanceEmbedding, positionEmbedding, predEmbed]
        embedParts = [ep for ep in embedParts if ep is not None]
        embed = torch.cat(embedParts, dim=1)
        return embed

    def saveX2i(self):
        """Serialize the layer's vocabularies and hyper-parameters (not weights)."""
        x2i = dict()
        x2i['w2i'] = self.w2i
        x2i['w2f'] = self.w2f
        x2i['c2i'] = self.c2i
        if self.tag2i:
            x2i['hasTag2i'] = 1
            x2i['tag2i'] = self.tag2i
        else:
            x2i['hasTag2i'] = 0
        if self.ne2i:
            x2i['hasNe2i'] = 1
            x2i['ne2i'] = self.ne2i
        else:
            x2i['hasNe2i'] = 0
        x2i['learnedWordEmbeddingSize'] = self.learnedWordEmbeddingSize
        x2i['charEmbeddingSize'] = self.charEmbeddingSize
        x2i['charRnnStateSize'] = self.charRnnStateSize
        x2i['posTagEmbeddingSize'] = self.posTagEmbeddingSize
        x2i['neTagEmbeddingSize'] = self.neTagEmbeddingSize
        x2i['distanceEmbeddingSize'] = self.distanceEmbeddingSize
        x2i['distanceWindowSize'] = self.distanceWindowSize
        x2i['useIsPredicate'] = 1 if self.useIsPredicate else 0
        x2i['positionEmbeddingSize'] = self.positionEmbeddingSize
        x2i['dropoutProb'] = self.dropoutProb

        return x2i

    def __str__(self):
        return f"EmbeddingLayer({self.outDim})"

    @classmethod
    def load(cls, x2i):
        """Rebuild the layer from saveX2i() output; weights are restored separately via load_state_dict."""
        w2i = x2i['w2i']
        w2f = x2i['w2f']
        c2i = x2i['c2i']
        tag2i = x2i['tag2i'] if x2i['hasTag2i'] == 1 else None
        ne2i = x2i['ne2i'] if x2i['hasNe2i'] == 1 else None

        learnedWordEmbeddingSize = x2i.get('learnedWordEmbeddingSize', DEFAULT_LEARNED_WORD_EMBEDDING_SIZE)
        charEmbeddingSize = x2i.get('charEmbeddingSize', DEFAULT_CHAR_EMBEDDING_SIZE)
        charRnnStateSize = x2i.get('charRnnStateSize', DEFAULT_CHAR_RNN_STATE_SIZE)
        posTagEmbeddingSize = x2i.get('posTagEmbeddingSize', DEFAULT_POS_TAG_EMBEDDING_SIZE)
        neTagEmbeddingSize = x2i.get('neTagEmbeddingSize', DEFAULT_NE_TAG_EMBEDDING_SIZE)
        distanceEmbeddingSize = x2i.get('distanceEmbeddingSize', DEFAULT_DISTANCE_EMBEDDING_SIZE)
        distanceWindowSize = x2i.get('distanceWindowSize', DEFAULT_DISTANCE_WINDOW_SIZE)
        useIsPredicate = x2i.get('useIsPredicate', DEFAULT_USE_IS_PREDICATE) == 1
        positionEmbeddingSize = x2i.get('positionEmbeddingSize', DEFAULT_POSITION_EMBEDDING_SIZE)
        dropoutProb = x2i.get('dropoutProb', DEFAULT_DROPOUT_PROB)

        # make the loadable parameters (no explicit init: weights come from the checkpoint)
        wordLookupParameters = nn.Embedding(len(w2i), learnedWordEmbeddingSize)
        charLookupParameters = nn.Embedding(len(c2i), charEmbeddingSize)

        charRnnBuilder = (charEmbeddingSize, charRnnStateSize, 1, True, dropoutProb)

        posTagLookupParameters = nn.Embedding(len(tag2i), posTagEmbeddingSize) if x2i['hasTag2i'] == 1 else None
        neTagLookupParameters = nn.Embedding(len(ne2i), neTagEmbeddingSize) if x2i['hasNe2i'] == 1 else None
        # distance table: 2*window + 2 in-window buckets plus 1 overflow bucket
        distanceLookupParameters = nn.Embedding(distanceWindowSize * 2 + 3, distanceEmbeddingSize) if distanceEmbeddingSize > 0 else None
        positionLookupParameters = nn.Embedding(101, positionEmbeddingSize) if positionEmbeddingSize > 0 else None

        return cls(w2i, w2f, c2i, tag2i, ne2i,
                   learnedWordEmbeddingSize,
                   charEmbeddingSize,
                   charRnnStateSize,
                   posTagEmbeddingSize,
                   neTagEmbeddingSize,
                   distanceEmbeddingSize,
                   distanceWindowSize,
                   positionEmbeddingSize,
                   useIsPredicate,
                   wordLookupParameters,
                   charLookupParameters,
                   charRnnBuilder,
                   posTagLookupParameters,
                   neTagLookupParameters,
                   distanceLookupParameters,
                   positionLookupParameters,
                   dropoutProb)

    @classmethod
    def initialize(cls, config, paramPrefix, wordCounter):
        """Build a fresh layer from config, or None when the prefix is absent."""
        if not config.contains(paramPrefix):
            return None

        learnedWordEmbeddingSize = config.get_int(paramPrefix + ".learnedWordEmbeddingSize", DEFAULT_LEARNED_WORD_EMBEDDING_SIZE)
        charEmbeddingSize = config.get_int(paramPrefix + ".charEmbeddingSize", DEFAULT_CHAR_EMBEDDING_SIZE)
        charRnnStateSize = config.get_int(paramPrefix + ".charRnnStateSize", DEFAULT_CHAR_RNN_STATE_SIZE)
        posTagEmbeddingSize = config.get_int(paramPrefix + ".posTagEmbeddingSize", DEFAULT_POS_TAG_EMBEDDING_SIZE)
        neTagEmbeddingSize = config.get_int(paramPrefix + ".neTagEmbeddingSize", DEFAULT_NE_TAG_EMBEDDING_SIZE)
        distanceEmbeddingSize = config.get_int(paramPrefix + ".distanceEmbeddingSize", DEFAULT_DISTANCE_EMBEDDING_SIZE)
        distanceWindowSize = config.get_int(paramPrefix + ".distanceWindowSize", DEFAULT_DISTANCE_WINDOW_SIZE)
        useIsPredicate = config.get_bool(paramPrefix + ".useIsPredicate", DEFAULT_USE_IS_PREDICATE == 1)
        positionEmbeddingSize = config.get_int(paramPrefix + ".positionEmbeddingSize", DEFAULT_POSITION_EMBEDDING_SIZE)
        dropoutProb = config.get_float(paramPrefix + ".dropoutProb", DEFAULT_DROPOUT_PROB)

        wordList = [UNK_WORD] + sorted(wordCounter.keys())
        w2i = {w: i for i, w in enumerate(wordList)}

        wordLookupParameters = nn.Embedding(len(w2i), learnedWordEmbeddingSize)
        nn.init.xavier_uniform_(wordLookupParameters.weight)

        # NOTE(review): default path differs in style from tag2i/ne2i below
        # (no "../resources/" prefix) — confirm which is intended.
        c2iFilename = config.get_string(paramPrefix + ".c2i", "org/clulab/c2i-en.txt")
        c2i = readChar2Ids(c2iFilename)

        charLookupParameters = nn.Embedding(len(c2i), charEmbeddingSize)
        nn.init.xavier_uniform_(charLookupParameters.weight)
        charRnnBuilder = (charEmbeddingSize, charRnnStateSize, 1, True, dropoutProb)

        if posTagEmbeddingSize > 0:
            tag2i = readString2Ids(config.get_string(paramPrefix + ".tag2i", "../resources/org/clulab/tag2i-en.txt"))
            posTagLookupParameters = nn.Embedding(len(tag2i), posTagEmbeddingSize)
            nn.init.xavier_uniform_(posTagLookupParameters.weight)
        else:
            tag2i = None
            posTagLookupParameters = None

        if neTagEmbeddingSize > 0:
            ne2i = readString2Ids(config.get_string(paramPrefix + ".ne2i", "../resources/org/clulab/ne2i-en.txt"))
            neTagLookupParameters = nn.Embedding(len(ne2i), neTagEmbeddingSize)
            # FIX: xavier init was missing here, unlike every other lookup table
            nn.init.xavier_uniform_(neTagLookupParameters.weight)
        else:
            ne2i = None
            neTagLookupParameters = None

        if distanceEmbeddingSize > 0:
            distanceLookupParameters = nn.Embedding(distanceWindowSize * 2 + 3, distanceEmbeddingSize)
            nn.init.xavier_uniform_(distanceLookupParameters.weight)
        else:
            distanceLookupParameters = None

        if positionEmbeddingSize > 0:
            positionLookupParameters = nn.Embedding(101, positionEmbeddingSize)
            nn.init.xavier_uniform_(positionLookupParameters.weight)
        else:
            positionLookupParameters = None

        return cls(w2i, wordCounter, c2i, tag2i, ne2i,
                   learnedWordEmbeddingSize,
                   charEmbeddingSize,
                   charRnnStateSize,
                   posTagEmbeddingSize,
                   neTagEmbeddingSize,
                   distanceEmbeddingSize,
                   distanceWindowSize,
                   positionEmbeddingSize,
                   useIsPredicate,
                   wordLookupParameters,
                   charLookupParameters,
                   charRnnBuilder,
                   posTagLookupParameters,
                   neTagLookupParameters,
                   distanceLookupParameters,
                   positionLookupParameters,
                   dropoutProb)


def mkBuilder(inputSize, rnnStateSize, numLayers, bi, dropoutProb):
    """Build the char-level LSTM. NOTE: with numLayers == 1 PyTorch ignores
    (and warns about) a non-zero dropout — kept for signature compatibility."""
    return nn.LSTM(inputSize, rnnStateSize, numLayers, bidirectional=bi, dropout=dropoutProb)

# (start of finalLayer.py)
import torch
import torch.nn as nn


class FinalLayer(nn.Module):
    """Abstract last layer of a Layers pipeline: states -> emission scores."""

    def __init__(self):
        super().__init__()
        self.inDim = None   # set by concrete subclasses
        self.outDim = None  # set by concrete subclasses

    def forward(self, inputExpressions, headPositionsOpt, doDropout):
        raise NotImplementedError

    def loss(self, emissionScoresAsExpression, goldLabels):
        raise NotImplementedError

    def inference(self, emissionScores):
        raise NotImplementedError

    def inferenceWithScores(self, emissionScores):
        raise NotImplementedError

# (start of forwardLayer.py)
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

from pytorch.finalLayer import FinalLayer

from pytorch.utils import *


class ForwardLayer(FinalLayer):
    """Linear projection (optionally over arg+predicate pairs when isDual)."""

    def __init__(self, inputSize, isDual, t2i, i2t, actualInputSize, nonlinearity, dropoutProb, spans = None):
        super().__init__()
        self.inputSize = inputSize
        self.isDual = isDual
        self.t2i = t2i
        self.i2t = i2t
        self.spans = spans
        self.nonlinearity = nonlinearity

        self.pH = nn.Linear(actualInputSize, len(t2i))
        nn.init.xavier_uniform_(self.pH.weight)
        # Variable() is a no-op since torch 0.4; kept for fidelity.
        self.pRoot = Variable(torch.rand(inputSize))  # TODO: not sure about the shape here
the shape here + self.dropout = nn.Dropout(dropoutProb) + + self.inDim = spanLength(spans) if spans is not None else inputSize + self.outDim = len(t2i) + + # remove pick span part to simplify the ONNX converting + # def pickSpan(self, v, i): + # if self.spans is None: + # return v + # else: + # # Zheng: Will spans overlap? + # vs = list() + # for span in self.spans: + # e = torch.index_select(v, i, torch.tensor(range(span[0], span[1]))) + # vs.append(e) + # return torch.cat(vs, dim=i) + + def forward(self, inputExpressions, headPositionsOpt = None): + if not self.isDual: + # Zheng: Why the for loop here? Can we just use matrix manipulation? + argExp = self.dropout(inputExpressions) + emissionScores = self.dropout(self.pH(argExp)) + if self.nonlinearity == NONLIN_TANH: + emissionScores = F.tanh(emissionScores) + elif self.nonlinearity == NONLIN_RELU: + emissionScores = F.relu(emissionScores) + else: + emissionScores = list() + if headPositionsOpt is None: + raise RuntimeError("ERROR: dual task without information about head positions!") + for i, e in enumerate(inputExpressions): + headPosition = headPositionsOpt[i] + argExp = self.dropout(e) + if headPosition >= 0: + # there is an explicit head in the sentence + predExp = self.dropout(inputExpressions[headPosition]) + else: + # the head is root. 
we used a dedicated Parameter for root + predExp = self.dropout(self.pRoot) + ss = torch.cat([argExp, predExp]) + l1 = self.dropout(self.pH(ss)) + if self.nonlinearity == NONLIN_TANH: + l1 = F.tanh(l1) + elif self.nonlinearity == NONLIN_RELU: + l1 = F.relu(l1) + emissionScores.append(l1) + emissionScores = torch.stack(emissionScores) + return emissionScores + + @staticmethod + def load(x2i): + from pytorch.greedyForwardLayer import GreedyForwardLayer + from pytorch.viterbiForwardLayer import ViterbiForwardLayer + inferenceType = x2i["inferenceType"] + if inferenceType == TYPE_VITERBI or inferenceType == TYPE_VITERBI_STRING:#this is a temporary solution to handle a typo in viterbi forward layer... + return ViterbiForwardLayer.load(x2i) + elif inferenceType == TYPE_GREEDY or inferenceType == TYPE_GREEDY_STRING: + return GreedyForwardLayer.load(x2i) + else: + raise RuntimeError(f"ERROR: unknown forward layer type {inferenceType}!") + + @staticmethod + def initialize(config, paramPrefix, labelCounter, isDual, inputSize): + from pytorch.greedyForwardLayer import GreedyForwardLayer + from pytorch.viterbiForwardLayer import ViterbiForwardLayer + if(not config.contains(paramPrefix)): + return None + + inferenceType = config.get_string(paramPrefix + ".inference", "greedy") + dropoutProb = config.get_float(paramPrefix + ".dropoutProb", DEFAULT_DROPOUT_PROBABILITY) + + nonlinAsString = config.get_string(paramPrefix + ".nonlinearity", "") + if nonlinAsString in nonlin_map: + nonlin = nonlin_map[nonlinAsString] + else: + raise RuntimeError(f"ERROR: unknown non-linearity {nonlinAsString}!") + + t2i = {t:i for i, t in enumerate(labelCounter.keys())} + i2t = {i:t for t, i in t2i.items()} + + spanConfig = config.get_string(paramPrefix + ".span", "") + if spanConfig is "": + span = None + else: + span = parseSpan(spanConfig) + + if span: + l = spanLength(span) + actualInputSize = 2*l if isDual else l + else: + actualInputSize = 2*inputSize if isDual else inputSize + + if 
inferenceType == TYPE_GREEDY_STRING: + return GreedyForwardLayer(inputSize, isDual, t2i, i2t, actualInputSize, nonlin, dropoutProb, span) + elif inferenceType == TYPE_VITERBI_STRING: + layer = ViterbiForwardLayer(inputSize, isDual, t2i, i2t, actualInputSize, nonlin, dropoutProb, span) + return layer + else: + raise RuntimeError(f"ERROR: unknown inference type {inferenceType}!") + +def spanLength(spans): + return sum(end - start for start, end in spans) + +def parseSpan(spanParam, inputSize=None): + # Zheng: Why do we need inputSize here? + spans = list() + spanParamTokens = spanParam.split(",") + for spanParamToken in spanParamTokens: + # spanTokens = spanParamToken.split('-') + # assert(len(spanTokens) == 2) + # spans.append((int(spanTokens[0]), int(spanTokens[1]))) + token1, token2 = map(int, spanParamToken.split('-')) + spans.append((token1, token2)) + return spans + +def spanToString(spans): + return ','.join(f'{start}-{end}' for start, end in spans) + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/main/src/main/python/pytorch/greedyForwardLayer.py b/main/src/main/python/pytorch/greedyForwardLayer.py new file mode 100644 index 000000000..145878c91 --- /dev/null +++ b/main/src/main/python/pytorch/greedyForwardLayer.py @@ -0,0 +1,56 @@ +from pytorch.forwardLayer import * +from pytorch.utils import * +import numpy as np + +class GreedyForwardLayer(ForwardLayer): + def __init__(self, inputSize, isDual, t2i, i2t, actualInputSize, nonlinearity, dropoutProb, spans = None): + super().__init__(inputSize, isDual, t2i, i2t, actualInputSize, nonlinearity, dropoutProb, spans) + + def loss(self, finalStates, goldLabelStrings): + goldLabels = [self.t2i[gs] for gs in goldLabelStrings] + return sentenceLossGreedy(finalStates, goldLabels) + + def saveX2i(self): + x2i = dict() + x2i["inferenceType"] = TYPE_GREEDY + x2i["inputSize"] = self.inputSize + x2i["isDual"] = 1 if self.isDual else 0 + x2i["span"] = spanToString(self.spans) if self.spans else "" + 
x2i["nonlinearity"] = self.nonlinearity + x2i["t2i"] = self.t2i + + return x2i + + def __str__(self): + return f"GreedyForwardLayer({self.inDim}, {self.outDim})" + + def inference(self, emissionScores): + emissionScores = emissionScoresToArrays(emissionScores) + return [self.i2t[np.argmax(es)] for es in emissionScores] + + def inference2(self, emissionScores): + return torch.argmax(emissionScores, dim=1) + + def inferenceWithScores(self, emissionScores): + emissionScores = emissionScoresToArrays(emissionScores) + return [sorted([(i, s) for i, s in enumerate(scoresForPosition)], key=lambda x: x[1]) for scoresForPosition in emissionScores] + + @classmethod + def load(cls, x2i): + inputSize = x2i["inputSize"] + isDual = x2i.get("isDual", DEFAULT_IS_DUAL) == 1 + sapnValue = x2i.get("span", "") + spans = None if sapnValue == "" else parseSpan(sapnValue, inputSize) + nonlinearity = x2i.get("nonlinearity", NONLIN_NONE) + t2i = x2i["t2i"] + i2t = {i:t for t, i in t2i.items()} + dropoutProb = x2i.get("dropoutProb", DEFAULT_DROPOUT_PROBABILITY) + + if spans: + l = spanLength(spans) + actualInputSize = 2*l if isDual else l + else: + actualInputSize = 2*inputSize if isDual else inputSize + + return cls(inputSize, isDual, t2i, i2t, actualInputSize, nonlinearity, dropoutProb, spans) + \ No newline at end of file diff --git a/main/src/main/python/pytorch/initialLayer.py b/main/src/main/python/pytorch/initialLayer.py new file mode 100644 index 000000000..39db90d28 --- /dev/null +++ b/main/src/main/python/pytorch/initialLayer.py @@ -0,0 +1,11 @@ +import torch +import torch.nn as nn + +class InitialLayer(nn.Module): + + def __init__(self): + super().__init__() + self.outDim = None + + def forward(self, sentence, constEmbeddings, doDropout): + raise NotImplementedError \ No newline at end of file diff --git a/main/src/main/python/pytorch/intermediateLayer.py b/main/src/main/python/pytorch/intermediateLayer.py new file mode 100644 index 000000000..48ea53377 --- /dev/null +++ 
b/main/src/main/python/pytorch/intermediateLayer.py @@ -0,0 +1,12 @@ +import torch +import torch.nn as nn + +class IntermediateLayer(nn.Module): + + def __init__(self): + super().__init__() + self.inDim = None + self.outDim = None + + def forward(self, inputExpressions, doDropout): + raise NotImplementedError \ No newline at end of file diff --git a/main/src/main/python/pytorch/layers.py b/main/src/main/python/pytorch/layers.py new file mode 100644 index 000000000..926bc6606 --- /dev/null +++ b/main/src/main/python/pytorch/layers.py @@ -0,0 +1,317 @@ +import torch.nn as nn +from pytorch.utils import * +from pytorch.embeddingLayer import EmbeddingLayer +from pytorch.rnnLayer import RnnLayer +from pytorch.forwardLayer import ForwardLayer +from pytorch.constEmbeddingsGlove import ConstEmbeddingsGlove + +class Layers(object): + def __init__(self, initialLayer, intermediateLayers, finalLayer): + if finalLayer: + self.outDim = finalLayer.outDim + elif intermediateLayers: + self.outDim = intermediateLayers[-1].outDim + elif initialLayer: + self.outDim = initialLayer.outDim + else: + self.outDim = None + + self.nonEmpty = initialLayer is not None and intermediateLayers is not None and finalLayer is not None + self.isEmpty = not self.nonEmpty + + self.initialLayer = initialLayer + self.intermediateLayers = intermediateLayers + self.finalLayer = finalLayer + + def __str__(self): + s = "" + started = False + if(self.initialLayer is not None): + s += "initial = " + str(self.initialLayer) + started = True + for i in range(len(self.intermediateLayers)): + if(started): s += " " + s += f"intermediate ({i+1}) = " + str(self.intermediateLayers[i]) + started = True + if(self.finalLayer is not None): + if(started): s += " " + s += "final = " + str(self.finalLayer) + return s + + def get_parameters(self): + parameters = list() + if self.initialLayer is not None: + parameters += [p for p in self.initialLayer.named_parameters()] + for il in self.intermediateLayers: + parameters += [p for 
p in il.named_parameters()] + if self.finalLayer is not None: + parameters += [p for p in self.finalLayer.named_parameters()] + + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in parameters + if not any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': WEIGHT_DECAY}, + {'params': [p for n, p in parameters + if any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': 0.0} + ] + return optimizer_grouped_parameters + + def start_train(self): + if self.initialLayer is not None: + self.initialLayer.train() + for il in self.intermediateLayers: + il.train() + if self.finalLayer is not None: + self.finalLayer.train() + + def start_eval(self): + if self.initialLayer is not None: + self.initialLayer.eval() + for il in self.intermediateLayers: + il.eval() + if self.finalLayer is not None: + self.finalLayer.eval() + + def get_state_dict(self): + params = dict() + if self.initialLayer is not None: + params['initialLayer'] = self.initialLayer.state_dict() + if self.intermediateLayers: + params['intermediateLayers'] = list() + for il in self.intermediateLayers: + params['intermediateLayers'].append(il.state_dict()) + if self.finalLayer is not None: + params['finalLayer'] = self.finalLayer.state_dict() + return params + + def load_state_dict(self, params): + if self.initialLayer is not None: + self.initialLayer.load_state_dict(params['initialLayer']) + for i, il in enumerate(self.intermediateLayers): + il.load_state_dict(params['intermediateLayers'][i]) + if self.finalLayer is not None: + self.finalLayer.load_state_dict(params['finalLayer']) + + def add_state_dict(self, layers): + if self.initialLayer is not None: + for key in self.initialLayer.state_dict(): + if self.initialLayer.state_dict()[key].data.dtype == torch.float32: + self.initialLayer.state_dict()[key].data += layers.initialLayer.state_dict()[key].data.clone() + for i, il in enumerate(self.intermediateLayers): + for key 
def avg_state_dict(self, num_models):
    """Average parameters previously accumulated with add_state_dict():
    divide every float32 tensor in this Layers object by num_models, in place."""
    def _avg(module):
        # state_dict() returns references to the live tensors, so the
        # in-place division mutates the module's actual parameters
        if module is None:
            return
        for tensor in module.state_dict().values():
            if tensor.data.dtype == torch.float32:
                tensor.data /= num_models
    _avg(self.initialLayer)
    for intermediate in self.intermediateLayers:
        _avg(intermediate)
    _avg(self.finalLayer)

def forward(self, sentence, constEmbeddings, doDropout):
    """Run the full pipeline on one sentence: initial layer -> intermediate
    layers -> final layer (if present). Requires an initial layer."""
    if self.initialLayer is None:
        raise RuntimeError(f"ERROR: you can't call forward() on a Layers object that does not have an initial layer: {self}!")
    states = self.initialLayer(sentence, constEmbeddings, doDropout)
    for intermediateLayer in self.intermediateLayers:
        states = intermediateLayer(states, doDropout)
    if self.finalLayer is not None:
        states = self.finalLayer(states, sentence.headPositions)
    return states

def forwardFrom(self, inStates, headPositions, doDropout):
    """Continue the pipeline from precomputed states. Used by task-specific
    Layers objects stacked on top of a shared Layers object, which therefore
    must NOT have their own initial layer."""
    if self.initialLayer is not None:
        raise RuntimeError(f"ERROR: you can't call forwardFrom() on a Layers object that has an initial layer: {self}")
    states = inStates
    for intermediateLayer in self.intermediateLayers:
        states = intermediateLayer(states, doDropout)
    if self.finalLayer is not None:
        states = self.finalLayer(states, headPositions)
    return states

def saveX2i(self):
    """Serialize the metadata (x2i maps) of all sub-layers into one dict,
    mirrored by loadX2i()."""
    x2i = dict()
    if self.initialLayer is not None:
        x2i['hasInitial'] = 1
        x2i['initialLayer'] = self.initialLayer.saveX2i()
    else:
        x2i['hasInitial'] = 0
    x2i['intermediateCount'] = len(self.intermediateLayers)
    x2i['intermediateLayers'] = [il.saveX2i() for il in self.intermediateLayers]
    if self.finalLayer is not None:
        x2i['hasFinal'] = 1
        x2i['finalLayer'] = self.finalLayer.saveX2i()
    else:
        x2i['hasFinal'] = 0
    return x2i

@classmethod
def apply(cls, config, paramPrefix, wordCounter, labelCounter, isDual, providedInputSize):
    """Construct a Layers object from configuration.

    config: config wrapper exposing get_* accessors (a TaskManager in practice)
    paramPrefix: config key prefix, e.g. "mtl.task1.layers"
    wordCounter/labelCounter: training vocabulary/label counts
    isDual: whether the final layer operates on (modifier, head) pairs
    providedInputSize: input width when there is no initial layer
    """
    initialLayer = EmbeddingLayer.initialize(config, paramPrefix + ".initial", wordCounter)

    if initialLayer:
        inputSize = initialLayer.outDim
    elif providedInputSize:
        inputSize = providedInputSize
    else:
        inputSize = None

    intermediateLayers = list()
    MAX_INTERMEDIATE_LAYERS = 10

    # intermediate layers are numbered from 1; stop at the first gap
    for i in range(1, MAX_INTERMEDIATE_LAYERS):
        if inputSize is None:
            raise RuntimeError("ERROR: trying to construct an intermediate layer without a known input size!")
        intermediateLayer = RnnLayer.initialize(config, paramPrefix + f".intermediate{i}", inputSize)
        if not intermediateLayer:
            break
        intermediateLayers.append(intermediateLayer)
        inputSize = intermediateLayer.outDim

    if labelCounter:
        if inputSize is None:
            raise RuntimeError("ERROR: trying to construct a final layer without a known input size!")
        finalLayer = ForwardLayer.initialize(config, paramPrefix + ".final", labelCounter, isDual, inputSize)
    else:
        finalLayer = None

    return cls(initialLayer, intermediateLayers, finalLayer)

@classmethod
def loadX2i(cls, x2i):
    """Rebuild a Layers object from the dict produced by saveX2i()."""
    hasInitial = x2i['hasInitial']
    initialLayer = EmbeddingLayer.load(x2i['initialLayer']) if hasInitial == 1 else None

    intermediateLayers = list()
    intermediateCount = x2i['intermediateCount']
    for i in range(intermediateCount):
        intermediateLayers.append(RnnLayer.load(x2i['intermediateLayers'][i]))

    hasFinal = x2i['hasFinal']
    finalLayer = ForwardLayer.load(x2i['finalLayer']) if hasFinal == 1 else None

    return cls(initialLayer, intermediateLayers, finalLayer)

@staticmethod
def predictJointly(layers, sentence, constEmbeddings):
    """Run inference for ALL tasks on one sentence.
    layers[0] holds the shared layers (may be empty); layers[i] (i >= 1) the
    task-specific ones. Returns one label sequence per task."""
    labelsPerTask = list()
    if layers[0]:
        sharedStates = layers[0].forward(sentence, constEmbeddings, doDropout=False)
        for i in range(1, len(layers)):
            states = layers[i].forwardFrom(sharedStates, sentence.headPositions, doDropout=False)
            labelsPerTask.append(layers[i].finalLayer.inference(states))
    else:
        # no shared layer
        for i in range(1, len(layers)):
            # BUG FIX: the original passed sentence.headPositions where
            # forward() expects constEmbeddings
            states = layers[i].forward(sentence, constEmbeddings, doDropout=False)
            labelsPerTask.append(layers[i].finalLayer.inference(states))
    return labelsPerTask

@staticmethod
def forwardForTask(layers, taskId, sentence, constEmbeddings, doDropout):
    """Forward pass for a single task, going through the shared layers first
    when they exist."""
    if layers[0]:
        sharedStates = layers[0].forward(sentence, constEmbeddings, doDropout)
        states = layers[taskId + 1].forwardFrom(sharedStates, sentence.headPositions, doDropout)
    else:
        states = layers[taskId + 1].forward(sentence, constEmbeddings, doDropout)
    return states

@staticmethod
def predict(layers, taskId, sentence, constEmbeddings):
    """Greedy/Viterbi labels for one task on one sentence (no dropout)."""
    states = Layers.forwardForTask(layers, taskId, sentence, constEmbeddings, doDropout=False)
    return layers[taskId + 1].finalLayer.inference(states)

@staticmethod
def predictWithScores(layers, taskId, sentence, constEmbeddings):
    """Like predict(), but returns per-label scores as well."""
    states = Layers.forwardForTask(layers, taskId, sentence, constEmbeddings, doDropout=False)
    return layers[taskId + 1].finalLayer.inferenceWithScores(states)

@staticmethod
def parse(layers, sentence, constEmbeddings):
    """Two-pass dependency parsing: task 1 predicts relative head positions,
    task 2 predicts labels conditioned on the predicted heads.
    Returns an iterator of (head, label) pairs, one per word."""
    # first get the output of the layers that are shared between the two tasks
    assert layers[0].nonEmpty
    sharedStates = layers[0].forward(sentence, constEmbeddings, doDropout=False)

    # now predict the heads (first task)
    headStates = layers[1].forwardFrom(sharedStates, None, doDropout=False)
    headScores = layers[1].finalLayer.inference(headStates)

    # pick, for each word, the highest-scoring head prediction that falls
    # within the boundaries of the current sentence
    heads = list()
    for wi, predictionsForThisWord in enumerate(headScores):
        headPosition = None
        for relative in predictionsForThisWord:
            try:
                relativeHead = int(relative[0])
            except ValueError:
                # BUG FIX: the original raised RuntimeError here; some
                # predictions are legitimately non-numeric (e.g., start/stop
                # tags) and should simply be skipped in favor of the next-best
                continue
            if relativeHead == 0:
                # this word is the root
                # NOTE(review): 1 is kept from the original as the root marker;
                # other implementations of this algorithm use -1 — confirm
                headPosition = 1
                break
            candidate = wi + relativeHead
            # BUG FIX: enforce the sentence-boundary check the original
            # comment promised but never performed
            if 0 <= candidate < len(headScores):
                headPosition = candidate
                break
        if headPosition is None:
            # nothing valid was found; be safe and assume root
            headPosition = 1
        heads.append(headPosition)

    # next, predict the labels using the predicted heads
    labelStates = layers[2].forwardFrom(sharedStates, heads, doDropout=False)
    labels = layers[2].finalLayer.inference(labelStates)
    assert len(labels) == len(heads)

    return zip(heads, labels)

@staticmethod
def loss(layers, taskId, sentence, goldLabels):
    """Training loss for one sentence of one task (dropout enabled).
    NOTE(review, kept from original): fetching const embeddings per call may
    be suboptimal — revisit."""
    constEmbeddings = ConstEmbeddingsGlove.get_ConstLookupParams()
    states = Layers.forwardForTask(layers, taskId, sentence, constEmbeddings, doDropout=True)
    return layers[taskId + 1].finalLayer.loss(states, goldLabels)
class Metal(object):
    """Multi-task learning (MTL) trainer/evaluator.

    self.model is a list of Layers objects: model[0] holds the layers shared
    between all tasks (possibly empty); model[tid + 1] holds the layers
    specific to task tid.
    """

    def __init__(self, taskManager, modelOpt):
        # taskManager: TaskManager holding config + datasets for all tasks
        # modelOpt: a preloaded list of Layers, or None to build a fresh model
        self.taskManager = taskManager
        if modelOpt:
            self.model = modelOpt
        else:
            self.model = self.initialize()

    def initialize(self):
        """Build one Layers object per task plus the shared one at index 0."""
        taskWords, taskLabels = self.mkVocabularies()

        layersPerTask = [None for _ in range(self.taskManager.taskCount + 1)]
        layersPerTask[0] = Layers.apply(self.taskManager, "mtl.layers", taskWords[0], None, False, None)
        inputSize = layersPerTask[0].outDim

        for i in self.taskManager.indices:
            layersPerTask[i + 1] = Layers.apply(
                self.taskManager, f"mtl.task{i+1}.layers",
                taskWords[i + 1], taskLabels[i + 1],
                self.taskManager.tasks[i].isDual, inputSize)

        for i in range(len(layersPerTask)):
            print(f"Summary of layersPerTask({i}):")
            print(layersPerTask[i])

        return layersPerTask

    def mkVocabularies(self):
        """Count words and labels per task.
        Index 0 is reserved for the shared Layers (aggregates all tasks);
        tid + 1 corresponds to each task. labels[0] is unused, since only
        task-specific layers have a final layer."""
        labels = [Counter() for _ in range(self.taskManager.taskCount + 1)]
        for i in range(1, len(labels)):
            labels[i][START_TAG] += 1
            labels[i][STOP_TAG] += 1

        words = [Counter() for _ in range(self.taskManager.taskCount + 1)]
        reader = MetalRowReader()

        for tid in self.taskManager.indices:
            for sentence in self.taskManager.tasks[tid].trainSentences:
                for annotatedSentence, sentenceLabels in reader.toAnnotatedSentences(sentence):
                    for i, word in enumerate(annotatedSentence.words):
                        words[tid + 1][word] += 1
                        words[0][word] += 1
                        labels[tid + 1][sentenceLabels[i]] += 1

        return words, labels

    def train(self, modelNamePrefix):
        """Train all tasks jointly; saves a checkpoint after every epoch and
        applies early stopping based on average dev F1."""
        learningRate = self.taskManager.get_float("mtl.learningRate", 1e-5)
        trainerType = self.taskManager.get_string("mtl.trainer", "adam")
        batchSize = self.taskManager.get_int("mtl.batchSize", 1)

        torch.manual_seed(self.taskManager.random)
        random.seed(self.taskManager.random)

        assert batchSize > 0

        parameters = list()
        for layers in self.model:
            parameters += layers.get_parameters()

        if trainerType == "adam":
            trainer = Adam(parameters, lr=learningRate)
        elif trainerType == "rmsprop":
            trainer = RMSprop(parameters, lr=learningRate)
        elif trainerType == "sgd":
            # BUG FIX: the original referenced the undefined name "SDG"
            trainer = SGD(parameters, lr=learningRate)
        else:
            raise RuntimeError(f"ERROR: unknown trainer {trainerType}!")

        scheduler = ExponentialLR(trainer, gamma=0.9)
        reader = MetalRowReader()

        cummulativeLoss = 0.0
        numTagged = 0
        maxAvgAcc = 0.0
        maxAvgF1 = 0.0
        bestEpoch = 0

        allEpochScores = list()
        epochPatience = self.taskManager.epochPatience

        for epoch in range(0, self.taskManager.maxEpochs):
            if epochPatience <= 0:
                break

            # fetches randomized training sentences from all tasks
            sentenceIterator = self.taskManager.getSentences()
            sentCount = 0

            for layers in self.model:
                layers.start_train()
            trainer.zero_grad()

            batchLoss = 0
            i = 0

            # traverse all training sentences
            for taskId, sentence in sentenceIterator:
                sentCount += 1

                annotatedSentences = reader.toAnnotatedSentences(sentence)
                assert annotatedSentences is not None

                unweightedLoss = 0
                for a_sent in annotatedSentences:
                    unweightedLoss += Layers.loss(self.model, taskId, a_sent[0], a_sent[1])
                loss = unweightedLoss * self.taskManager.tasks[taskId].taskWeight

                batchLoss += loss
                i += 1

                if i >= batchSize:
                    cummulativeLoss += batchLoss.item()
                    batchLoss.backward()
                    trainer.step()
                    # BUG FIX: gradients must be cleared after each step,
                    # otherwise they accumulate across batches
                    trainer.zero_grad()
                    batchLoss = 0
                    i = 0

                numTagged += len(sentence)

                if sentCount % 1000 == 0:
                    print(f"Cumulative loss: {cummulativeLoss/numTagged} ({sentCount} sentences)")
                    cummulativeLoss = 0.0
                    numTagged = 0

            # we may have an incomplete batch here
            if batchLoss:
                # BUG FIX: was "=", dropping the loss already accumulated
                cummulativeLoss += batchLoss.item()
                batchLoss.backward()
                trainer.step()
                trainer.zero_grad()
                batchLoss = 0
                i = 0
            scheduler.step()

            # check dev performance in this epoch, for all tasks
            totalAcc = 0.0
            totalPrec = 0.0
            totalRec = 0.0
            totalF1 = 0.0
            for taskId in range(0, self.taskManager.taskCount):
                taskName = self.taskManager.tasks[taskId].taskName
                devSentences = self.taskManager.tasks[taskId].devSentences
                if devSentences:
                    acc, prec, rec, f1 = self.evaluate(taskId, taskName, devSentences, "development", epoch)
                    totalAcc += acc
                    totalPrec += prec
                    totalRec += rec
                    totalF1 += f1

            avgAcc = totalAcc / self.taskManager.taskCount
            avgPrec = totalPrec / self.taskManager.taskCount
            avgRec = totalRec / self.taskManager.taskCount
            avgF1 = totalF1 / self.taskManager.taskCount

            print(f"Average accuracy across {self.taskManager.taskCount} tasks in epoch {epoch}: {avgAcc}")
            # BUG FIX: "$epoch" was a leftover Scala string interpolation
            print(f"Average P/R/F1 across {self.taskManager.taskCount} tasks in epoch {epoch}: {avgPrec} / {avgRec} / {avgF1}")

            allEpochScores.append((epoch, avgF1))

            if avgF1 > maxAvgF1:
                maxAvgF1 = avgF1
                maxAvgAcc = avgAcc
                bestEpoch = epoch
                epochPatience = self.taskManager.epochPatience
            else:
                epochPatience -= 1

            self.save(f"{modelNamePrefix}-epoch{epoch}")

        # BUG FIX: the message promises descending order; sort accordingly
        allEpochScores.sort(key=lambda x: x[1], reverse=True)
        print("Epochs in descending order of scores:")
        for t in allEpochScores:
            print(f"Epoch #{t[0]}: {t[1]}")

    def evaluate(self, taskId, taskName, sentences, name, epoch=-1):
        """Score one task on one dataset; writes CoNLL output to a file.
        Returns (accuracy, precision, recall, f1)."""
        scoreCountsByLabel = ScoreCountsByLabel()
        taskNumber = taskId + 1
        sentCount = 0

        print(f"Started evaluation on the {name} dataset for task {taskNumber} ({taskName})...")

        if epoch >= 0:
            outputFileName = f"task{taskNumber}.dev.output.{epoch}"
        else:
            outputFileName = f"task{taskNumber}.test.output"

        reader = MetalRowReader()

        # use a context manager so the file is closed even on errors
        with open(outputFileName, "w") as pw:
            for sent in sentences:
                sentCount += 1
                annotatedSentences = reader.toAnnotatedSentences(sent)

                # NOTE(review): only the first annotated sentence is scored,
                # as in the original ([:1]) — confirm this is intended
                for asent in annotatedSentences[:1]:
                    sentence = asent[0]
                    goldLabels = asent[1]

                    constEmbeddings = ConstEmbeddingsGlove.get_ConstLookupParams()
                    preds = self.predict(taskId, sentence, constEmbeddings)

                    sc = SeqScorer.f1(goldLabels, preds)
                    scoreCountsByLabel.incAll(sc)

                    printCoNLLOutput(pw, sentence.words, goldLabels, preds)

        print(f"Accuracy on {len(sentences)} {name} sentences for task {taskNumber} ({taskName}): {scoreCountsByLabel.accuracy()}")
        print(f"Precision on {len(sentences)} {name} sentences for task {taskNumber} ({taskName}): {scoreCountsByLabel.precision()}")
        print(f"Recall on {len(sentences)} {name} sentences for task {taskNumber} ({taskName}): {scoreCountsByLabel.recall()}")
        print(f"Micro F1 on {len(sentences)} {name} sentences for task {taskNumber} ({taskName}): {scoreCountsByLabel.f1()}")
        for label in scoreCountsByLabel.labels():
            print(f"\tP/R/F1 for label {label} ({scoreCountsByLabel.map[label].gold}): {scoreCountsByLabel.precision(label)} / {scoreCountsByLabel.recall(label)} / {scoreCountsByLabel.f1(label)}")

        return (scoreCountsByLabel.accuracy(), scoreCountsByLabel.precision(), scoreCountsByLabel.recall(), scoreCountsByLabel.f1())

    def predictJointly(self, sentence, constEmbeddings):
        """Inference for all tasks on one sentence."""
        return Layers.predictJointly(self.model, sentence, constEmbeddings)

    def predict(self, taskId, sentence, constEmbeddings):
        """Inference for one task on one sentence."""
        return Layers.predict(self.model, taskId, sentence, constEmbeddings)

    def predictWithScores(self, taskId, sentence, constEmbeddings):
        """Inference for one task, with per-label scores."""
        return Layers.predictWithScores(self.model, taskId, sentence, constEmbeddings)

    def parse(self, sentence, constEmbeddings):
        """Custom method for the parsing algorithm.
        @param sentence Input sentence
        @param constEmbeddings Constant embeddings for this sentence
        @return Tuple of (head, label) for each word in the sentence"""
        # BUG FIX: the original dropped the result on the floor
        return Layers.parse(self.model, sentence, constEmbeddings)

    def test(self):
        """Evaluate all tasks on their test partitions."""
        for layers in self.model:
            layers.start_eval()
        for taskId in range(0, self.taskManager.taskCount):
            taskName = self.taskManager.tasks[taskId].taskName
            testSentences = self.taskManager.tasks[taskId].testSentences
            if testSentences:
                self.evaluate(taskId, taskName, testSentences, "testing")

    def save(self, baseFilename):
        """Save all parameters to <base>.torch; on the first epoch, also dump
        the x2i metadata to a companion .json file (written once per run)."""
        isFirstEpoch = "-epoch0" in baseFilename
        params = list()
        j_params = list()
        for layers in self.model:
            params.append(layers.get_state_dict())
            if isFirstEpoch:
                j_params.append({"x2i": layers.saveX2i()})

        # torch pickle save; best-effort by design — keep training alive
        try:
            torch.save(params, baseFilename + ".torch")
            print("model saved to {}".format(baseFilename + ".torch"))
        except BaseException:
            print("[Warning: Saving failed... continuing anyway.]")

        # we also save the metadata as a text json file
        if isFirstEpoch:
            with open(baseFilename.replace("-epoch0", "") + ".json", "w") as f:
                f.write(json.dumps(j_params))

    @classmethod
    def load(cls, modelFilenamePrefix):
        """Load one saved model: <prefix>.torch (weights) + <prefix>.json (x2i)."""
        print(f"Loading MTL model from {modelFilenamePrefix}...")
        layersSeq = list()
        checkpoint = torch.load(modelFilenamePrefix + ".torch")
        with open(modelFilenamePrefix + ".json") as f:
            # BUG FIX: was "josn.load" (NameError at runtime)
            x2i = json.load(f)
        for i, param in enumerate(checkpoint):
            layers = Layers.loadX2i(x2i[i])
            layers.load_state_dict(param)
            layersSeq.append(layers)

        print(f"Loading MTL model from {modelFilenamePrefix} complete.")
        return layersSeq

    @classmethod
    def load_multi(cls, models):
        """Load several saved models and average their parameters (ensemble)."""
        print(f"Loading MTL models from {models}...")
        layersSeq = list()
        for model in models:
            checkpoint = torch.load(model + ".torch")
            with open(model + ".json") as f:
                # BUG FIX: was "josn.load" (NameError at runtime)
                x2i = json.load(f)
            for i, param in enumerate(checkpoint):
                layers = Layers.loadX2i(x2i[i])
                layers.load_state_dict(param)
                # NOTE(review): SOURCE is truncated/corrupted at this point;
                # the merge/average logic below is reconstructed from the
                # add_state_dict/avg_state_dict API — verify against upstream.
                if len(layersSeq) <= i:
                    layersSeq.append(layers)
                else:
                    layersSeq[i].add_state_dict(layers)
        for layers in layersSeq:
            layers.avg_state_dict(len(models))

        print(f"Loading MTL models from {models} complete.")
        return layersSeq
# Task types
TYPE_BASIC = 0
TYPE_DUAL = 1

class TaskManager():
    """Reads all tasks from a pyhocon config and interleaves their training
    shards so every epoch mixes sentences from all tasks."""

    def __init__(self, config, seed):
        # config: pyhocon Config; seed: RNG seed used for shard shuffling
        self.config = config
        self.random = seed

        # How many shards to have per epoch
        self.shardsPerEpoch = config.get_int("mtl.shardsPerEpoch", 10)

        # Total number of epochs
        # BUG FIX: the original annotated these attributes with ":Int".
        # "Int" is undefined in Python, and annotations on attribute targets
        # ARE evaluated at runtime, so this raised a NameError.
        self.maxEpochs = config.get_int("mtl.maxEpochs", 100)

        # Training patience in number of epochs
        self.epochPatience = config.get_int("mtl.epochPatience", 5)

        # Array of all tasks to be managed
        self.tasks = self.readTasks()
        self.taskCount = len(self.tasks)
        self.indices = range(self.taskCount)

        # Training shards from all tasks
        self.shards = self.mkShards()

    def contains(self, paramPrefix):
        """True if the config defines this key."""
        return paramPrefix in self.config

    # Thin typed accessors over the underlying config.
    # BUG FIX: the parameter was misspelled "defualt"; keyword callers would
    # have had to reproduce the typo.
    def get_int(self, x, default=None):
        return self.config.get_int(x, default)

    def get_string(self, x, default=None):
        return self.config.get_string(x, default)

    def get_float(self, x, default=None):
        return self.config.get_float(x, default)

    def get_bool(self, x, default=None):
        return self.config.get_bool(x, default)

    def get_list(self, x, default=None):
        return self.config.get_list(x, default)

    def get_config(self, x, default=None):
        return self.config.get_config(x, default)

    def mkShards(self):
        """Construct training shards by interleaving shards from all tasks."""
        shardsByTasks = list()

        # construct the shards for each task
        for i in self.indices:
            shardsByTasks.append(self.tasks[i].mkShards())
            assert len(shardsByTasks[i]) == self.shardsPerEpoch

        # now interleave the tasks: shard 0 of every task, then shard 1, ...
        interleavedShards = list()
        for i in range(self.shardsPerEpoch):
            for j in self.indices:
                interleavedShards.append(shardsByTasks[j][i])

        return interleavedShards

    def getSentences(self):
        """Iterator over (taskId, sentence) pairs from all interleaved shards,
        shuffled at both the shard and the sentence level."""
        # NOTE(review): re-seeding on every call makes each epoch's shuffle
        # identical — confirm this determinism is intended
        random.seed(self.random)
        randomizedShards = random.sample(self.shards, len(self.shards))
        for shard in randomizedShards:
            positions = random.sample(
                range(shard.startPosition, shard.endPosition),
                shard.endPosition - shard.startPosition)
            for sent in positions:
                yield (shard.taskId, self.tasks[shard.taskId].trainSentences[sent])

    def readTasks(self):
        """Reads all tasks from disk into memory."""
        numberOfTasks = self.config.get_int("mtl.numberOfTasks", None)
        tasks = list()
        for i in range(numberOfTasks):
            tasks.append(self.readTask(i + 1))

        print(f"Read {numberOfTasks} tasks from config file.")
        return tasks

    def readTask(self, taskNumber):
        """Build one Task from its "mtl.task<N>.*" config section."""
        taskName = self.config.get_string(f"mtl.task{taskNumber}.name", None)
        train = self.config.get_string(f"mtl.task{taskNumber}.train", None)

        dev = self.config.get_string(f"mtl.task{taskNumber}.dev", None) if f"mtl.task{taskNumber}.dev" in self.config else None
        test = self.config.get_string(f"mtl.task{taskNumber}.test", None) if f"mtl.task{taskNumber}.test" in self.config else None

        taskType = self.parseType(self.config.get_string(f"mtl.task{taskNumber}.type", "basic"))
        weight = self.config.get_float(f"mtl.task{taskNumber}.weight", 1.0)

        return Task(taskNumber - 1, taskName, taskType, self.shardsPerEpoch, weight, train, dev, test)

    def parseType(self, inf):
        """Map a config "type" string to a TYPE_* constant; raises on unknown."""
        if inf == "basic":
            return TYPE_BASIC
        elif inf == "dual":
            return TYPE_DUAL
        else:
            raise ValueError(f"ERROR: unknown task type {inf}!")

    def debugTraversal(self):
        """Debug helper: walk all epochs and report sentence counts per task."""
        for epoch in range(self.maxEpochs):
            print(f"Started epoch {epoch}")
            sentCount = 0
            taskId = 0
            totalSents = 0
            for sentence in self.getSentences():
                totalSents += 1
                if sentence[0] != taskId:
                    print(f"Read {sentCount} sentences from task {taskId}")
                    taskId = sentence[0]
                    sentCount = 1
                else:
                    sentCount += 1
            print(f"Read {sentCount} sentences from task {taskId}")
            print(f"Read {totalSents} sentences in epoch {epoch}.")

@dataclass
class Shard:
    """半-open range [startPosition, endPosition) of training sentences for one task."""
    taskId: int
    startPosition: int
    endPosition: int

class Task:
    """One task's datasets plus its sharding/weighting metadata."""

    def __init__(self,
                 taskId,  # this starts at 0 so we can use it as an index in the array of tasks
                 taskName: str,
                 taskType: int,
                 shardsPerEpoch: int,
                 taskWeight: float,
                 trainFileName: str,
                 devFileName: str = None,
                 testFileName: str = None):
        self.taskId = taskId
        taskNumber = taskId + 1
        print(f"Reading task {taskNumber} ({taskName})...")
        self.trainSentences = ColumnReader.readColumns(trainFileName)
        self.devSentences = ColumnReader.readColumns(devFileName) if devFileName else None
        self.testSentences = ColumnReader.readColumns(testFileName) if testFileName else None

        self.isBasic = taskType == TYPE_BASIC
        self.isDual = taskType == TYPE_DUAL

        if taskType == TYPE_BASIC:
            self.prettyType = "basic"
        elif taskType == TYPE_DUAL:
            self.prettyType = "dual"
        else:
            self.prettyType = "unknown"

        # The size of the training shard for this task
        self.shardSize = math.ceil(len(self.trainSentences) / shardsPerEpoch)

        # Current position in the training sentences when we iterate during training
        # BUG FIX: the original assigned a local variable, so the attribute was
        # never created on the instance
        self.currentTrainingSentencePosition = 0

        self.taskWeight = taskWeight
        self.taskName = taskName

        print(f"============ starting task {taskNumber} ============")
        print(f"Read {len(self.trainSentences)} training sentences for task {taskNumber}, with shard size {self.shardSize}.")
        if self.devSentences is not None:
            print(f"Read {len(self.devSentences)} development sentences for task {taskNumber}.")
        if self.testSentences is not None:
            print(f"Read {len(self.testSentences)} testing sentences for task {taskNumber}.")
        print(f"Using taskWeight = {taskWeight}")
        print(f"Task type = {self.prettyType}.")
        print(f"============ completed task {taskNumber} ============")

    def mkShards(self):
        """Construct the shards from all training sentences in this task."""
        shards = list()
        crtPos = 0
        while crtPos < len(self.trainSentences):
            endPos = min(crtPos + self.shardSize, len(self.trainSentences))
            shards.append(Shard(self.taskId, crtPos, endPos))
            crtPos = endPos
        return shards
concatenateCount = 0

# Special vocabulary tokens.
# BUG FIX / NOTE(review): in SOURCE these four constants were all the empty
# string — the angle-bracket tokens were evidently stripped by markup
# mangling. START_TAG == STOP_TAG == "" would collapse the CRF's start and
# stop tags into one t2i entry, breaking Viterbi decoding. Restored to the
# conventional distinct tokens — verify against the upstream repository.
UNK_WORD = "<UNK>"
EOS_WORD = "<EOS>"

UNK_EMBEDDING = 0

START_TAG = "<START>"
STOP_TAG = "<STOP>"

RANDOM_SEED = 2522620396
WEIGHT_DECAY = 0.01

# used as the "minus infinity" value in log-space computations
LOG_MIN_VALUE = -10000.0

# NOTE(review): the original comment said "no dropout by default" but the
# value is 0.1 — confirm which is intended
DEFAULT_DROPOUT_PROBABILITY = 0.1

# inference types for final layers
TYPE_VITERBI = 1
TYPE_GREEDY = 2

# nonlinearity codes
NONLIN_NONE = 0
NONLIN_RELU = 1
NONLIN_TANH = 2

nonlin_map = {"relu": NONLIN_RELU, "tanh": NONLIN_TANH, "": NONLIN_NONE}

TYPE_GREEDY_STRING = "greedy"
TYPE_VITERBI_STRING = "viterbi"

DEFAULT_IS_DUAL = 0

def save(file, values, comment):
    """Write a commented key/value section (tab-separated), ending with a
    blank line. Readable back with readString2Ids()/readChar2Ids()."""
    file.write("# " + comment + "\n")
    for key, value in values.items():
        file.write(f"{key}\t{value}\n")
    file.write("\n")

def mkCharacterEmbedding(word, c2i, charLookupParameters, charRnnBuilder):
    """Embed a word from its characters: look up each character id (unknown
    chars map to UNK_EMBEDDING) and return the char-RNN's last output."""
    charEmbeddings = charLookupParameters(torch.LongTensor([c2i.get(c, UNK_EMBEDDING) for c in word]))
    output, _ = charRnnBuilder(charEmbeddings.unsqueeze(1))
    return output.squeeze(1)[-1]

def mkCharacterEmbedding2(char_ids, charLookupParameters, charRnnBuilder):
    """Same as mkCharacterEmbedding(), but starting from precomputed char ids
    (used by the ONNX export path)."""
    charEmbeddings = charLookupParameters(char_ids)
    output, _ = charRnnBuilder(charEmbeddings.unsqueeze(1))
    return output.squeeze(1)[-1]

def readString2Ids(s2iFilename):
    """Read a tab-separated string-to-id map written by save()."""
    s2i = dict()
    with open(s2iFilename) as f:
        for line in f:
            # skip comment headers and blank lines
            if not line.startswith("# ") and line.rstrip():
                k, v = line.strip().split('\t')
                s2i[k] = int(v)
    return s2i

def readChar2Ids(s2iFilename):
    """Read a char-to-id map; keys are stored as integer code points."""
    s2i = dict()
    with open(s2iFilename) as f:
        for line in f:
            if not line.startswith("# ") and line.rstrip():
                k, v = line.strip().split('\t')
                s2i[chr(int(k))] = int(v)
    return s2i

def sentenceLossGreedy(emissionScoresForSeq, golds):
    """Cross-entropy loss over a sentence's emission scores (greedy head)."""
    assert emissionScoresForSeq.size(0) == len(golds)
    criterion = nn.CrossEntropyLoss()
    golds = Variable(torch.LongTensor(golds))
    return criterion(emissionScoresForSeq, golds)

def emissionScoresToArrays(expressions):
    """Convert a sequence of score tensors to plain Python lists."""
    return [expr.data.tolist() for expr in expressions]

def printCoNLLOutput(pw, words, golds, preds):
    """Write one sentence in CoNLL format ("word gold pred" per line),
    terminated by a blank line."""
    assert len(words) == len(golds)
    assert len(words) == len(preds)
    for word, gold, pred in zip(words, golds, preds):
        pw.write(f"{word} {gold} {pred}\n")
    pw.write("\n")

def argmax(vec):
    """Return the argmax of a 1xN tensor as a python int."""
    _, idx = torch.max(vec, 1)
    return idx.item()

def log_sum_exp(vec):
    """Numerically stable log-sum-exp of a 1xN tensor (subtracts the max
    before exponentiating)."""
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))
def _forward_alg(self, feats):
    """Forward algorithm: log partition function over all tag sequences.
    feats: (seq_len, outDim) emission scores."""
    init_alphas = torch.full((1, self.outDim), -10000.)
    # START_TAG has all of the score
    init_alphas[0][self.t2i[START_TAG]] = 0.

    # wrap in a variable so that we will get automatic backprop
    forward_var = init_alphas

    # iterate through the sentence
    for feat in feats:
        alphas_t = []  # the forward tensors at this timestep
        for next_tag in range(self.outDim):
            # broadcast the emission score: it is the same regardless of the previous tag
            emit_score = feat[next_tag].view(1, -1).expand(1, self.outDim)
            # the ith entry of trans_score is the score of transitioning to next_tag from i
            trans_score = self.transitions[next_tag].view(1, -1)
            # the ith entry of next_tag_var is the value for the
            # edge (i -> next_tag) before we do log-sum-exp
            next_tag_var = forward_var + trans_score + emit_score
            # the forward variable for this tag is log-sum-exp of all the scores
            alphas_t.append(log_sum_exp(next_tag_var).view(1))
        forward_var = torch.cat(alphas_t).view(1, -1)
    terminal_var = forward_var + self.transitions[self.t2i[STOP_TAG]]
    return log_sum_exp(terminal_var)

def _score_sentence(self, feats, tags):
    """Score of one provided tag sequence: sum of transition + emission scores."""
    score = torch.zeros(1)
    # prepend START so tags[i] -> tags[i+1] covers the initial transition
    tags = torch.cat([torch.tensor([self.t2i[START_TAG]], dtype=torch.long), tags])
    for i, feat in enumerate(feats):
        score = score + \
            self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
    score = score + self.transitions[self.t2i[STOP_TAG], tags[-1]]
    return score

def _viterbi_decode(self, feats):
    """Viterbi decoding: returns (best_path_score, best_tag_id_sequence)."""
    backpointers = []

    # initialize the viterbi variables in log space
    init_vvars = torch.full((1, self.outDim), -10000.)
    init_vvars[0][self.t2i[START_TAG]] = 0

    # forward_var at step i holds the viterbi variables for step i-1
    forward_var = init_vvars
    for feat in feats:
        bptrs_t = []  # holds the backpointers for this step
        viterbivars_t = []  # holds the viterbi variables for this step

        for next_tag in range(self.outDim):
            # next_tag_var[i] holds the viterbi variable for tag i at the
            # previous step, plus the score of transitioning from tag i to
            # next_tag. Emission scores are added below because the max does
            # not depend on them.
            next_tag_var = forward_var + self.transitions[next_tag]
            best_tag_id = argmax(next_tag_var)
            bptrs_t.append(best_tag_id)
            viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
        # now add in the emission scores, and assign forward_var to the set
        # of viterbi variables we just computed
        forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
        backpointers.append(bptrs_t)

    # transition to STOP_TAG
    terminal_var = forward_var + self.transitions[self.t2i[STOP_TAG]]
    best_tag_id = argmax(terminal_var)
    path_score = terminal_var[0][best_tag_id]

    # follow the back pointers to decode the best path
    best_path = [best_tag_id]
    for bptrs_t in reversed(backpointers):
        best_tag_id = bptrs_t[best_tag_id]
        best_path.append(best_tag_id)
    # pop off the start tag (we don't want to return that to the caller)
    start = best_path.pop()
    assert start == self.t2i[START_TAG]  # sanity check
    best_path.reverse()
    return path_score, best_path

def loss(self, finalStates, goldLabelStrings):
    """CRF negative log-likelihood: log partition minus gold-sequence score."""
    goldLabels = torch.tensor([self.t2i[gs] for gs in goldLabelStrings], dtype=torch.long)
    forward_score = self._forward_alg(finalStates)
    gold_score = self._score_sentence(finalStates, goldLabels)
    return forward_score - gold_score

def saveX2i(self):
    """Serialize the metadata needed by load().
    NOTE(review): dropoutProb is not persisted although load() reads it (and
    falls back to DEFAULT_DROPOUT_PROBABILITY) — confirm this is intended."""
    x2i = dict()
    x2i["inferenceType"] = TYPE_VITERBI
    x2i["inputSize"] = self.inputSize
    x2i["isDual"] = 1 if self.isDual else 0
    x2i["span"] = spanToString(self.spans) if self.spans else ""
    x2i["nonlinearity"] = self.nonlinearity
    x2i["t2i"] = self.t2i
    return x2i

def __str__(self):
    return f"ViterbiForwardLayer({self.inDim}, {self.outDim})"

def inference(self, emissionScores):
    """Viterbi-decode emission scores into label strings."""
    score, labelsIds = self._viterbi_decode(emissionScores)
    return [self.i2t[i] for i in labelsIds]

def inference2(self, emissionScores):
    """Greedy per-position argmax (ignores transitions)."""
    return torch.argmax(emissionScores, dim=1)

def inferenceWithScores(self, emissionScores):
    # BUG FIX: the original was missing "self", so any call on an instance
    # raised TypeError instead of the intended RuntimeError
    raise RuntimeError("ERROR: inferenceWithScores not supported for ViterbiLayer!")

@classmethod
def load(cls, x2i):
    """Rebuild a ViterbiForwardLayer from the dict produced by saveX2i()."""
    inputSize = x2i["inputSize"]
    isDual = x2i.get("isDual", DEFAULT_IS_DUAL) == 1
    spanValue = x2i.get("span", "")  # renamed from the original "sapnValue" typo
    spans = None if spanValue == "" else parseSpan(spanValue, inputSize)
    nonlinearity = x2i.get("nonlinearity", NONLIN_NONE)
    t2i = x2i["t2i"]
    i2t = {i: t for t, i in t2i.items()}
    dropoutProb = x2i.get("dropoutProb", DEFAULT_DROPOUT_PROBABILITY)

    # dual layers see the concatenation of (modifier, head) representations
    if spans:
        l = spanLength(spans)
        actualInputSize = 2 * l if isDual else l
    else:
        actualInputSize = 2 * inputSize if isDual else inputSize

    return cls(inputSize, isDual, t2i, i2t, actualInputSize, nonlinearity, dropoutProb, spans)
def to_numpy(tensor):
    # Detach before converting when the tensor still tracks gradients.
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

class Char_RNN(torch.nn.Module):
    # Wraps the character lookup table + character RNN of the model's initial
    # layer so they can be exported to ONNX as a standalone module.
    # NOTE(review): if several Layers objects have an initial layer, only the
    # LAST one's parameters are kept — confirm there is at most one.

    def __init__(self, model):
        super().__init__()
        for i, layers in enumerate(model):
            if layers.initialLayer is not None:
                self.char_lookup = layers.initialLayer.charLookupParameters
                self.char_rnn = layers.initialLayer.charRnnBuilder

    def forward(self, char_ids):
        # char_ids: LongTensor of character indices for one word
        charEmbedding = mkCharacterEmbedding2(char_ids, self.char_lookup, self.char_rnn)
        return charEmbedding

class Saving_Model(torch.nn.Module):
    """ONNX-exportable wrapper around the full MTL model: word lookup +
    per-task intermediate layers + per-task final layers. Character
    embeddings are produced separately by Char_RNN and passed in as input."""
    def __init__(self, model):
        super().__init__()
        self.model_length = len(model)
        self.intermediateLayerss = [None for _ in range(self.model_length)]
        self.finalLayers = [None for _ in range(self.model_length)]
        for i, layers in enumerate(model):
            if layers.initialLayer is not None:
                self.word_lookup = layers.initialLayer.wordLookupParameters
            self.intermediateLayerss[i] = nn.ModuleList(layers.intermediateLayers)
            self.finalLayers[i] = layers.finalLayer
        # re-wrap in ModuleList so parameters are registered for export
        self.intermediateLayerss = nn.ModuleList(self.intermediateLayerss)
        self.finalLayers = nn.ModuleList(self.finalLayers)
    def forward(self, embeddings, word_ids, charEmbedding):
        # Can I assume there is only one initial layer?
        learnedWordEmbeddings = self.word_lookup(word_ids)
        # concatenate const embeddings + learned embeddings + char embeddings
        embedParts = [embeddings, learnedWordEmbeddings, charEmbedding]
        embedParts = [ep for ep in embedParts if ep is not None]
        state = torch.cat(embedParts, dim=1)
        for i in range(self.model_length):
            for il in self.intermediateLayerss[i]:
                state = il(state, False)
            if self.finalLayers[i]:
                state = self.finalLayers[i](state, None)#headPositions set to be None for now, we can add it in input list later
        # CRF transition matrix of the last final layer, exported alongside
        # the emission scores so decoding can happen outside PyTorch
        transitions = self.finalLayers[-1].transitions
        return state, transitions

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--model_file', type=str, help='Filename of the model.', nargs='+')
    parser.add_argument('--config', type=str, help='Filename of the configuration.')
    parser.add_argument('--seed', type=int, default=1234)
    args = parser.parse_args()

    config = ConfigFactory.parse_file(f'../resources/org/clulab/{args.config}.conf')
    taskManager = TaskManager(config, args.seed)
    modelName = args.model_file
    # one file -> plain load; several files -> averaged ensemble load
    if len(modelName)==1:
        model = Metal.load(modelName[0])
    else:
        model = Metal.load_multi(modelName)

    for layers in model:
        layers.start_eval()
    constEmbeddings = ConstEmbeddingsGlove.get_ConstLookupParams()

    # freeze both export wrappers: inference only
    export_char = Char_RNN(model)
    export_model = Saving_Model(model)
    export_model.eval()
    export_char.eval()
    for param in export_model.parameters():
        param.requires_grad = False
    for param in export_char.parameters():
        param.requires_grad = False

    torch.manual_seed(taskManager.random)
    random.seed(taskManager.random)

    # metadata saved next to the checkpoint by Metal.save()
    x2i = json.load(open(args.model_file[0]+".json"))

    c2i = x2i[0]['x2i']['initialLayer']['c2i']
    w2i = x2i[0]['x2i']['initialLayer']['w2i']
    t2i = x2i[1]['x2i']['finalLayer']["t2i"]
    i2t = {i:t for t, i in t2i.items()}

    for taskId in range(0, taskManager.taskCount):
        taskName = taskManager.tasks[taskId].taskName
        testSentences = taskManager.tasks[taskId].testSentences
        if testSentences:
            # use one real test sentence as the tracing input for the export
            reader = MetalRowReader()
            annotatedSentences = reader.toAnnotatedSentences(testSentences[1])

            asent = annotatedSentences[0]
            sentence = asent[0]
            goldLabels = asent[1]

            words = sentence.words

            # character embeddings, one per word, via the Char_RNN wrapper
            char_embs = []
            for word in words:
                char_ids = torch.LongTensor([c2i.get(c, UNK_EMBEDDING) for c in word])
                char_out = export_char(char_ids)
                char_embs.append(char_out)
            char_embs = torch.stack(char_embs)
            embed_ids = torch.LongTensor([constEmbeddings.w2i[word] if word in constEmbeddings.w2i else 0 for word in words])
            embeddings = constEmbeddings.emb(embed_ids)
            word_ids = torch.LongTensor([w2i[word] if word in w2i else 0 for word in words])
            # reference outputs used below to validate the ONNX runtime
            state, transitions = export_model(embeddings, word_ids, char_embs)
            dummy_input = (embeddings, word_ids, char_embs)

            torch.onnx.export(export_char,
                              char_ids,
                              "char.onnx",
                              export_params=True,
                              do_constant_folding=True,
                              input_names = ['char_ids'],
                              output_names = ['chars'],
                              dynamic_axes = {"char_ids": {0: 'word length'}})

            torch.onnx.export(export_model,              # model being run
                              dummy_input,               # model input (or a tuple for multiple inputs)
                              "model.onnx",              # where to save the model (can be a file or file-like object)
                              export_params=True,        # store the trained parameter weights inside the model file
                              opset_version=10,          # the ONNX version to export the model to
                              do_constant_folding=True,  # whether to execute constant folding for optimization
                              input_names = ['embed', 'words', 'chars'],       # the model's input names
                              output_names = ['state', 'transitions'],         # the model's output names
                              dynamic_axes = {'embed' : {0 : 'sentence length'},
                                              'words' : {0 : 'sentence length'},
                                              'chars' : {0 : 'sentence length'},
                                              'state': {0 : 'sentence length'}})

            # structural validation of both exported graphs
            onnx_model = onnx.load("model.onnx")
            onnx.checker.check_model(onnx_model)
            char_model = onnx.load("char.onnx")
            onnx.checker.check_model(char_model)

            ort_session = onnxruntime.InferenceSession("model.onnx")
            ort_char = onnxruntime.InferenceSession("char.onnx")
            # compute ONNX Runtime output prediction

            ort_inputs = {ort_char.get_inputs()[i].name: to_numpy(x) for i, x in enumerate([char_ids])}
            ort_outs = ort_char.run(None, ort_inputs)
            try:
                np.testing.assert_allclose(to_numpy(char_out), ort_outs[0], rtol=1e-03, atol=1e-05)
            except AssertionError as e:
                print (e)
            ort_inputs = {ort_session.get_inputs()[i].name: to_numpy(x) for i, x in enumerate(dummy_input)}
            ort_outs = ort_session.run(None, ort_inputs)

            # compare PyTorch and ONNX Runtime emission scores
            try:
                np.testing.assert_allclose(state.detach().cpu().numpy(), ort_outs[0], rtol=1e-03, atol=1e-05)
            except AssertionError as e:
                print (e)

            print("Exported model has been tested with ONNXRuntime, and the result looks good!")
args.seed) + modelName = args.model_file + if len(modelName)==1: + model = Metal.load(modelName[0]) + else: + model = Metal.load_multi(modelName) + mtl = Metal(taskManager, model) + mtl.test() + elif args.shell: + pass \ No newline at end of file diff --git a/main/src/main/python/sequences/__init__.py b/main/src/main/python/sequences/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/main/src/main/python/sequences/columnReader.py b/main/src/main/python/sequences/columnReader.py new file mode 100644 index 000000000..e162316f7 --- /dev/null +++ b/main/src/main/python/sequences/columnReader.py @@ -0,0 +1,47 @@ +#----------------------------------------------------------- +# Reads the CoNLL-like column format +#----------------------------------------------------------- +class ColumnReader: + + def readColumns(source): + if type(source) is str: + source = open(source) + sentence = list() + sentences = list() + for line in source: + l = line.strip() + if (l is ""): + # end of sentence + if (sentence): + sentences += [sentence] + sentence = list() + else: + # within the same sentence + bits = l.split("\t") + if (len(bits) < 2): + raise RuntimeError(f"ERROR: invalid line {l}!") + sentence += [Row(bits)] + + if (sentence): + sentences += [sentence] + + source.close() + return sentences + +# ----------------------------------------------------------- +# Stores training data for sequence modeling +# Mandatory columns: 0 - word, 1 - label +# Optional columns: 2 - POS tag, 3+ SRL arguments +# @param tokens +# ----------------------------------------------------------- + +class Row: + + def __init__(self, tokens): + self.tokens = tokens + self.length = len(tokens) + + def get(self, idx): + if(idx >= self.length): + raise RuntimeError(f"ERROR: trying to read field #{idx}, which does not exist in this row: {tokens}!") + return self.tokens[idx] diff --git a/main/src/main/python/sequences/rowReaders.py b/main/src/main/python/sequences/rowReaders.py new file 
class AnnotatedSentence:
    """Holds one sentence's words and optional per-token annotations."""

    def __init__(self, words, posTags = None, neTags = None, headPositions = None):
        self.words = words
        self.posTags = posTags
        self.neTags = neTags
        self.headPositions = headPositions
        self.size = len(words)
        # NOTE: attribute name keeps its original (misspelled) form because
        # external code may already depend on it
        self.indicies = range(self.size)

class RowReader(object):
    """Abstract reader turning Rows into (AnnotatedSentence, labels) pairs."""

    def __init__(self):
        raise NotImplementedError

    def toAnnotatedSentences(self, rows):
        raise NotImplementedError

class MetalRowReader(RowReader):
    """Reader for the Metal column formats (2, 4, or 5+ columns)."""

    def __init__(self):
        # column positions in the Metal format
        self.WORD_POSITION = 0
        self.POS_TAG_POSITION = 1
        self.NE_LABEL_POSITION = 2
        self.LABEL_START_OFFSET = 3

    def toAnnotatedSentences(self, rows):
        """Dispatch on column count: 2 = simple, 4 = extended, 5+ = full."""
        if rows[0].length == 2:
            return self.parseSimple(rows)
        elif rows[0].length == 4:
            return self.parseSimpleExtended(rows)
        elif rows[0].length >= 5:
            return self.parseFull(rows)
        else:
            raise RuntimeError("ERROR: the Metal format expects 2, 4, or 5+ columns!")

    # Parser for the simple format: word, label
    def parseSimple(self, rows):
        assert(rows[0].length == 2)
        words = list()
        labels = list()

        for row in rows:
            words += [row.get(self.WORD_POSITION)]
            labels += [row.get(self.WORD_POSITION + 1)]

        return [(AnnotatedSentence(words), labels)]

    # Parser for the simple extended format: word, POS tag, NE label, label
    def parseSimpleExtended(self, rows):
        assert(rows[0].length == 4)
        words = list()
        posTags = list()
        neLabels = list()
        labels = list()

        for row in rows:
            words += [row.get(self.WORD_POSITION)]
            posTags += [row.get(self.POS_TAG_POSITION)]
            neLabels += [row.get(self.NE_LABEL_POSITION)]
            labels += [row.get(self.LABEL_START_OFFSET)]

        return [(AnnotatedSentence(words, posTags, neLabels), labels)]

    # Parser for the full format: word, POS tag, NE label, (label head)+
    def parseFull(self, rows):
        assert(rows[0].length >= 5)
        # each (label, head) pair past the first 3 columns is one sentence
        # view; integer divmod replaces the original float division plus
        # numSent == int(numSent) assert, keeping AssertionError semantics
        numSent, rem = divmod(rows[0].length - 3, 2)
        assert(rem == 0)
        assert(numSent >= 1)

        words = list()
        posTags = list()
        neLabels = list()
        headPositions = [list() for _ in range(numSent)]
        labels = [list() for _ in range(numSent)]

        for row in rows:
            words += [row.get(self.WORD_POSITION)]
            posTags += [row.get(self.POS_TAG_POSITION)]
            neLabels += [row.get(self.NE_LABEL_POSITION)]

            for j in range(numSent):
                labels[j] += [row.get(self.LABEL_START_OFFSET + (j * 2))]
                headField = row.get(self.LABEL_START_OFFSET + (j * 2) + 1)
                try:
                    headPositions[j] += [int(headField)]
                except ValueError as e:
                    # was a bare `except: raise RuntimeError`, which hid the
                    # offending value; keep the RuntimeError type but chain it
                    raise RuntimeError(f"ERROR: invalid head position {headField!r}!") from e

        sentences = list()
        for i in range(numSent):
            annotatedSent = AnnotatedSentence(words, posTags, neLabels, headPositions[i])
            sentLabels = labels[i]
            sentences += [(annotatedSent, sentLabels)]

        return sentences
+ # We don't include the emission scores here because the max + # does not depend on them (we add them in below) + next_tag_var = forward_var + transitions[next_tag] + best_tag_id = np.argmax(next_tag_var, 1)[0] + bptrs_t.append(best_tag_id) + viterbivars_t.append(next_tag_var[0][best_tag_id].reshape(1)) + # Now add in the emission scores, and assign forward_var to the set + # of viterbi variables we just computed + forward_var = (np.concatenate(viterbivars_t) + feat).reshape(1, -1) + backpointers.append(bptrs_t) + + # Transition to STOP_TAG + terminal_var = forward_var + transitions[t2i[STOP_TAG]] + best_tag_id = np.argmax(terminal_var, 1)[0] + path_score = terminal_var[0][best_tag_id] + + # Follow the back pointers to decode the best path. + best_path = [best_tag_id] + for bptrs_t in reversed(backpointers): + best_tag_id = bptrs_t[best_tag_id] + best_path.append(best_tag_id) + # Pop off the start tag (we dont want to return that to the caller) + start = best_path.pop() + assert start == t2i[START_TAG] # Sanity check + best_path.reverse() + return path_score, best_path + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('--model_file', type=str, help='Filename of the model.') + parser.add_argument('--config', type=str, help='Filename of the configuration.') + parser.add_argument('--seed', type=int, default=1234) + args = parser.parse_args() + + config = ConfigFactory.parse_file(f'../resources/org/clulab/{args.config}.conf') + taskManager = TaskManager(config, args.seed) + constEmbeddings = ConstEmbeddingsGlove.get_ConstLookupParams() + + x2i = json.load(open(args.model_file+".json")) + + c2i = x2i[0]['x2i']['initialLayer']['c2i'] + w2i = x2i[0]['x2i']['initialLayer']['w2i'] + t2i = x2i[1]['x2i']['finalLayer']["t2i"] + i2t = {i:t for t, i in t2i.items()} + + torch.manual_seed(taskManager.random) + random.seed(taskManager.random) + + onnx_model = onnx.load("model.onnx") + onnx.checker.check_model(onnx_model) + char_model = 
onnx.load("char.onnx") + onnx.checker.check_model(char_model) + + ort_session = onnxruntime.InferenceSession("model.onnx") + ort_char = onnxruntime.InferenceSession("char.onnx") + + scoreCountsByLabel = ScoreCountsByLabel() + start_time = time.time() + for taskId in range(0, taskManager.taskCount): + taskName = taskManager.tasks[taskId].taskName + sentences = taskManager.tasks[taskId].testSentences + if sentences: + reader = MetalRowReader() + for sent in sentences: + annotatedSentences = reader.toAnnotatedSentences(sent) + + for asent in annotatedSentences: + sentence = asent[0] + goldLabels = asent[1] + + words = sentence.words + + char_embs = [] + for word in words: + char_ids = np.array([c2i.get(c, UNK_EMBEDDING) for c in word]) + ort_inputs = {ort_char.get_inputs()[i].name: x for i, x in enumerate([char_ids])} + ort_outs = ort_char.run(None, ort_inputs) + char_embs.append(ort_outs[0]) + char_embs = np.stack(char_embs) + embed_ids = torch.LongTensor([constEmbeddings.w2i[word] if word in constEmbeddings.w2i else 0 for word in words]) + embeddings = constEmbeddings.emb(embed_ids).detach().cpu().numpy() + word_ids = np.array([w2i[word] if word in w2i else 0 for word in words]) + + dummy_input = (embeddings, word_ids, char_embs) + + ort_inputs = {ort_session.get_inputs()[i].name: x for i, x in enumerate(dummy_input)} + ort_outs = ort_session.run(None, ort_inputs) + + _, ids = viterbi_decode(ort_outs[0], ort_outs[1], t2i) + + preds = [i2t[i] for i in ids] + + sc = SeqScorer.f1(goldLabels, preds) + scoreCountsByLabel.incAll(sc) + + + print (f"Accuracy : {scoreCountsByLabel.accuracy()}") + print (f"Precision : {scoreCountsByLabel.precision()}") + print (f"Recall on : {scoreCountsByLabel.recall()}") + print (f"Micro F1 : {scoreCountsByLabel.f1()}") + for label in scoreCountsByLabel.labels(): + print (f"\tP/R/F1 for label {label} ({scoreCountsByLabel.map[label].gold}): {scoreCountsByLabel.precision(label)} / {scoreCountsByLabel.recall(label)} / 
{scoreCountsByLabel.f1(label)}") + duration = time.time() - start_time + print (duration) + \ No newline at end of file diff --git a/main/src/main/resources/org/clulab/glove.conf b/main/src/main/resources/org/clulab/glove.conf index 22f1e4b36..6b8bd08b0 100644 --- a/main/src/main/resources/org/clulab/glove.conf +++ b/main/src/main/resources/org/clulab/glove.conf @@ -1,5 +1,5 @@ glove { - matrixResourceName = "/org/clulab/glove/glove.840B.300d.10f" + matrixResourceName = "glove.840B.300d.10f.txt" isResource = true } \ No newline at end of file diff --git a/main/src/main/resources/org/clulab/mtl-en-ner.conf b/main/src/main/resources/org/clulab/mtl-en-ner.conf index 6cd5eecc2..aacc06054 100644 --- a/main/src/main/resources/org/clulab/mtl-en-ner.conf +++ b/main/src/main/resources/org/clulab/mtl-en-ner.conf @@ -8,7 +8,7 @@ mtl { learnedWordEmbeddingSize = 128 charEmbeddingSize = 32 charRnnStateSize = 16 - c2i = "org/clulab/c2i-en.txt" + c2i = "../resources/org/clulab/c2i-en.txt" } intermediate1 { @@ -20,9 +20,9 @@ mtl { task1 { name = "En NER" - train = "dynet/en/ner/train.txt" - dev = "dynet/en/ner/dev.txt" - test = "dynet/en/ner/test.txt" + train = "ner/train.txt" + dev = "ner/dev.txt" + test = "ner/test.txt" layers { final { diff --git a/main/src/main/resources/org/clulab/mtl-en-pos-chunk-srlp.conf b/main/src/main/resources/org/clulab/mtl-en-pos-chunk-srlp.conf index 828fd973d..b23692fff 100644 --- a/main/src/main/resources/org/clulab/mtl-en-pos-chunk-srlp.conf +++ b/main/src/main/resources/org/clulab/mtl-en-pos-chunk-srlp.conf @@ -9,7 +9,7 @@ mtl { learnedWordEmbeddingSize = 128 charEmbeddingSize = 32 charRnnStateSize = 16 - c2i = "org/clulab/c2i-en.txt" + c2i = "../resources/org/clulab/c2i-en.txt" } intermediate1 { @@ -21,9 +21,9 @@ mtl { task1 { name = "En POS tagging" - train = "dynet/en/pos/train.txt" - dev = "dynet/en/pos/dev.txt" - test = "dynet/en/pos/test.txt" + train = "/data/nlp/corpora/processors-dynet/en/pos/train.txt" + dev = 
"/data/nlp/corpora/processors-dynet/en/pos/dev.txt" + test = "/data/nlp/corpora/processors-dynet/en/pos/test.txt" layers { final { @@ -34,9 +34,9 @@ mtl { task2 { name = "En chunking" - train = "dynet/en/chunking/train.txt" - dev = "dynet/en/chunking/test.txt" - test = "dynet/en/chunking/test.txt" + train = "/data/nlp/corpora/processors-dynet/en/chunking/train.txt" + dev = "/data/nlp/corpora/processors-dynet/en/chunking/test.txt" + test = "/data/nlp/corpora/processors-dynet/en/chunking/test.txt" layers { final { @@ -47,9 +47,9 @@ mtl { task3 { name = "En SRL predicates" - train = "dynet/en/srl/train.preds" - dev = "dynet/en/srl/dev.preds" - test = "dynet/en/srl/test-wsj.preds" + train = "/data/nlp/corpora/processors-dynet/en/srl/train.preds" + dev = "/data/nlp/corpora/processors-dynet/en/srl/dev.preds" + test = "/data/nlp/corpora/processors-dynet/en/srl/test-wsj.preds" layers { final { diff --git a/main/src/main/resources/org/clulab/mtl-en-srla.conf b/main/src/main/resources/org/clulab/mtl-en-srla.conf index 6dcf5bbd8..8f5181484 100644 --- a/main/src/main/resources/org/clulab/mtl-en-srla.conf +++ b/main/src/main/resources/org/clulab/mtl-en-srla.conf @@ -16,9 +16,9 @@ mtl { distanceEmbeddingSize = 16 distanceWindowSize = 50 useIsPredicate = true - c2i = "org/clulab/c2i-en.txt" - tag2i = "org/clulab/tag2i-en.txt" - ne2i = "org/clulab/ne2i-en.txt" + c2i = "../resources/org/clulab/c2i-en.txt" + tag2i = "../resources/org/clulab/tag2i-en.txt" + ne2i = "../resources/org/clulab/ne2i-en.txt" } intermediate1 { @@ -31,9 +31,9 @@ mtl { task1 { name = "En SRL arguments" - train = "dynet/en/srl/train.args" - dev = "dynet/en/srl/dev.args" - test = "dynet/en/srl/test-wsj.args" + train = "/data/nlp/corpora/processors-dynet/en/srl/train.args" + dev = "/data/nlp/corpora/processors-dynet/en/srl/dev.args" + test = "/data/nlp/corpora/processors-dynet/en/srl/test-wsj.args" type = "dual" layers {