From 6c213d7618df6e9c2cea215325fb039bc0291565 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Wed, 8 Jun 2022 06:22:27 +0530 Subject: [PATCH 1/5] Add very rough implementation --- .isort.cfg | 2 +- .pre-commit-config.yaml | 2 +- src/models/gatedcnn_nci.py | 436 +++++++++++++++++++++++++++++++++++++ 3 files changed, 438 insertions(+), 2 deletions(-) create mode 100644 src/models/gatedcnn_nci.py diff --git a/.isort.cfg b/.isort.cfg index 82cd117..cf67fba 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -1,2 +1,2 @@ [settings] -known_third_party = gensim,nltk,numpy,pandas,sklearn,streamlit,torch,torchsummaryX,tqdm,yaml +known_third_party = embeddings,gensim,nltk,numpy,pandas,sklearn,streamlit,torch,torchsummaryX,tqdm,yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4adfd09..7cd4c15 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: hooks: - id: black args: ["--config", "./pyproject.toml"] - language_version: python3.7 + language_version: python3 - repo: https://github.com/asottile/seed-isort-config rev: v2.2.0 diff --git a/src/models/gatedcnn_nci.py b/src/models/gatedcnn_nci.py new file mode 100644 index 0000000..38d0016 --- /dev/null +++ b/src/models/gatedcnn_nci.py @@ -0,0 +1,436 @@ +import torch +import torch.nn.functional as F +from torch import nn + + +class GatedCNNencoder(nn.Module): + def __init__(self, args, Y, dicts): + super(GatedCNNencoder, self).__init__() + self.args = args + self.max_length = args.MAX_LENGTH + self.dropout = args.dropout + self.ninp = args.embed_size + self.nhid = args.nhid + self.nout = args.nout + self.bidirectional = args.bidirectional + + self.word_rep = WordRep(args, Y, dicts) + self.encoder = GatedCNN(args, Y, dicts, self.ninp, self.nout) + self.network = nn.ModuleList([self.encoder]) + if self.bidirectional: + self.output_layer = OutputLayer(args, Y, dicts, self.nout * 2) + else: + self.output_layer = OutputLayer(args, Y, dicts, self.nout) + self.var_drop = VariationalDropout() + + def freeze_net(self): + for p in self.word_rep.embed.parameters(): + p.requires_grad = False + + def _reverse_seq(self, X, mask, seq_max_len): + """ + X -> batch, seq_len, dim + mask -> batch, seq_len + """ + mask_sum = torch.sum(mask, 1).int() + xfs = [] + for x, c in zip(X, mask_sum): + xf = torch.flip(x[:c], [0]) + xfs.append(xf) + padded_rev = torch.zeros((len(xfs), X.size(1), X.size(2))).cuda() + for i, mat in enumerate(xfs): + padded_rev[i][: len(mat), :] = mat + return padded_rev + + def forward(self, data, target, mask, hidden, desc): + """ + :param data: The input sequence, with dimesion (N, L) + :param target: labels + :param mask: input sequence mask + :param hidden: The initial hidden state (h, c) + :param desc: Whether to use code description + :return: logits, loss, hidden + """ + emb = self.word_rep(data, target) + if self.bidirectional: + emb_reverse = self._reverse_seq(emb, mask, self.max_length) + emb = emb.transpose(1, 2) # emb: [bs, 100, len] + if self.bidirectional: + emb_reverse = emb_reverse.transpose( + 1, 2 + ) # emb_reverse: [bs, 100, len] + cnn_encoder = self.network[0] + raw_output, hidden = cnn_encoder(emb, hidden) + if self.bidirectional: + raw_out_re, hidden = cnn_encoder(emb_reverse, hidden) + output = self.var_drop(raw_output, self.dropout) + if self.bidirectional: + output_re = self._reverse_seq(raw_out_re, mask, self.max_length) + output_re = self.var_drop(output_re, self.dropout) + if self.bidirectional: + output = torch.cat([output, output_re], dim=2) + if self.args.desc: + 
logits, loss, _, interaction = self.output_layer( + output, target, desc + ) + else: + logits, loss, _, interaction = self.output_layer( + output, target, None + ) + return logits, loss, hidden, interaction + + def init_hidden(self, bsz): + h_size = self.nhid + self.nout + weight = next(self.parameters()).data + return ( + weight.new(bsz, h_size, 1).zero_(), + weight.new(bsz, h_size, 1).zero_(), + ) + + +from typing import Tuple + +import torch.nn as nn +from embeddings import build_pretrain_embedding, load_embeddings +from torch import Tensor +from torch.nn.init import kaiming_uniform_, normal_, xavier_uniform_ + + +class WordEmbeddingLayer(nn.Module): + """ + A Word Embedding Layer. This layer loads a pre-trained word embedding matrix + , and copies its weights to an nn.Embedding layer. + + Args: + embed_dir (str): A directory containing the pre-trained word embedding + matrix, among other things. Please see + https://github.com/dalgu90/icd-coding-benchmark/blob/main/src/modules/embeddings.py#L17 + for more details. + dropout (float): The dropout probability. + """ + + def __init__(self, embed_dir, dropout, num_filter_maps): + super(WordEmbeddingLayer, self).__init__() + logger.debug( + f"Initialising {self.__class__.__name__} with " + f"embed_dir = {embed_dir}, dropout = {dropout}" + ) + + # Note: This should be changed, since we won't always use Word2Vec. + embedding_cls = ConfigMapper.get_object("embeddings", "word2vec") + + W = torch.Tensor(embedding_cls.load_emb_matrix(embed_dir)) + self.embed = nn.Embedding(W.size()[0], W.size()[1], padding_idx=0) + self.embed.weight.data = W.clone() + + self.embedding_size = self.embed.embedding_dim + + self.dropout = nn.Dropout(dropout) + + self.conv_dict = { + 1: [self.embedding_size, num_filter_maps], + 2: [self.embedding_size, 100, num_filter_maps], + 3: [self.embedding_size, 150, 100, num_filter_maps], + 4: [self.embedding_size, 200, 150, 100, num_filter_maps], + } + + def forward(self, x): + embedding = self.embed(x) + x = self.dropout(embedding) + return x + + +class VariationalHidDropout(nn.Module): + def __init__(self, dropout=0.0): + """ + Hidden-to-hidden (VD-based) dropout that applies the same mask at every + time step and every layer of TrellisNet. + + Args: + dropout (float): The dropout probability. + """ + super(VariationalHidDropout, self).__init__() + self.dropout_probability = dropout + self.mask = None + + def reset_mask(self, input): + + # Dimension (N, C, L) + m = input.data.new(input.size(0), input.size(1), 1).bernoulli_( + 1 - self.dropout_probability + ) + with torch.no_grad(): + mask = m / (1 - self.dropout_probability) + self.mask = mask + return mask + + def forward(self, input): + # We don't apply dropout if the model is in eval mode. + if not self.training or self.dropout_probability == 0: + return input + + assert ( + self.mask is not None + ), "You need to reset mask before using VariationalHidDropout" + mask = self.mask.expand_as(input) # Make sure the dimension matches + return mask * input + + +class WeightShareConv1d(nn.Module): + def __init__( + self, + input_dim, + hidden_dim, + out_channels, + kernel_size, + dropout=0.0, + init_mean=0.0, + init_std=0.01, + ): + """ + The weight-tied 1D convolution used in TrellisNet. + + Args: + input_dim (int): The dimension of the input. This is equivalent to + the number of input channels in the first + convolutional layer. + hidden_dim (int): The dimension of the hidden state. This is + equivalent to the number of input channels in the + second convolutional layer. 
+ out_channels (int): The number of output channels in both + convolutional layers. + kernel_size (int): The size of the filter used in both + convolutional layers. + dropout (float): Dropout probability for the hidden-to-hidden + dropout layer. + init_mean (float): The mean of the normal distribution with which + weights of the convolutional layers are + initialised. + init_std (float): The standard deviation of the normal distribution + with which weights of the convolutional layers are + initialised. + """ + super(WeightShareConv1d, self).__init__() + + self.input_dim = input_dim + self.kernel_size = kernel_size + + self._dict = {} + + conv_layer_1 = nn.Conv1d( + in_channels=input_dim, + out_channels=out_channels, + kernel_size=kernel_size, + ) + self.weight_1 = conv_layer_1.weight + + conv_layer_2 = nn.Conv1d( + in_channels=hidden_dim, + out_channels=out_channels, + kernel_size=kernel_size, + ) + self.weight_2 = conv2.weight + self.bias_2 = conv2.bias + + self.init_conv_weights(init_mean, init_std) + + self.dropout = VariationalHidDropout(dropout=dropout) + + def init_conv_weights(self, init_mean, init_std): + self.weight_1.data.normal_(mean=init_mean, std=init_std) + self.weight_2.data.normal_(mean=init_mean, std=init_std) + self.bias_2.data.normal_(mean=init_mean, std=init_std) + + def forward(self, input, dilation, hid): + batch_size = input.size(0) + + padding = (self.kernel_size - 1) * dilation # Padding size. + x = F.pad(input=input, pad=(padding, 0)) # Pad with zeros. + + x_1 = x[:, : self.input_dim] + z_1 = x[:, self.input_dim :] + z_1[:, :, :padding] = hid[:batch_size, :, :].repeat(1, 1, padding) + + device = x_1.get_device() + + if (dilation, device) not in self.dict or self.dict[ + (dilation, device) + ] is None: + self.dict[(dilation, device)] = F.conv1d( + input=x_1, weight=self.weight1, dilation=dilation + ) + + z_1 = self.dropout(z_1) + injected = self.dict[(dilation, device)] + F.conv1d( + input=z_1, weight=self.weight2, bias=self.bias2, dilation=dilation + ) + return injected + + +class GatedCNN(nn.Module): + def __init__( + self, + input_dim, + hidden_dim, + output_dim, + kernel_size, + dropout, + init_mean, + init_std, + levels, + ): + """ + Gated CNN module. + + Args: + input_dim (int): The dimension of the input. + hidden_dim (int): The hidden dimension. The hidden dimension for the + weight-shared Conv1D layer is + `hidden_dim + output_dim`. + output_dim (int): The output dimension. The number of output + channels of the weight-shared Conv1D layer is + `4 * (hidden_dim + output_dim)`. + kernel_size (int): The size of the filter used in + `WeightSharedConv1D`. + dropout (float): Dropout probability for the `WeightSharedConv1D`. + init_mean (float): The mean of the normal distribution with which + weights of the `WeightSharedConv1D` layer are + initialised. + init_std (float): The standard deviation of the normal distribution + with which weights of the `WeightSharedConv1D` + layer are initialised. 
+ """ + super(GatedCNN, self).__init__() + self.input_dim = input_dim + self.hidden_dim = args.hidden_dim + self.output_dim = output_dim + self.levels = levels + + self.hidden_dim_for_conv = hidden_dim + output_dim + + self.dilations = [i + 1 for i in range(levels)] + + self.full_conv = WeightShareConv1d( + input_dim=input_dim, + hidden_dim=self.hidden_dim_for_conv, + output_channels=4 * self.hidden_dim_for_conv, + kernel_size=kernel_size, + dropout=dropout, + init_mean=init_mean, + init_std=init_std, + ) + + self.ht = None + + def transform_input(self, X): + device = X.get_device() + if device == -1: + device = "cpu" + + batch_size = X.size(0) + seq_len = X.size(2) + + ht = torch.zeros(batch_size, self.hidden_dim_for_conv, seq_len).to( + device + ) + self.ct = torch.zeros(batch_size, self.hidden_dim_for_conv, seq_len).to( + device + ) + return torch.cat((X, ht), dim=1) + + def gating(self, Z, dilation=1, hc=None): + batch_size = Z.size(0) + (hid, cell) = hc + + out = self.full_conv(input=Z, dilation=dilation, hid=hid) + + ct_1 = F.pad(self.ct, (dilation, 0))[:, :, :-dilation] + ct_1[:, :, :dilation] = cell[:batch_size].repeat(1, 1, dilation) + + it = torch.sigmoid(out[:, : self.hidden_dim_for_conv]) + ot = torch.sigmoid( + out[:, self.hidden_dim_for_conv : 2 * self.hidden_dim_for_conv] + ) + gt = torch.tanh( + out[:, 2 * self.hidden_dim_for_conv : 3 * self.hidden_dim_for_conv] + ) + ft = torch.sigmoid( + out[:, 3 * self.hidden_dim_for_conv : 4 * self.hidden_dim_for_conv] + ) + self.ct = ft * ct_1 + it * gt + ht = ot * torch.tanh(self.ct) + + Z = torch.cat((Z[:, : self.input_dim], ht), dim=1) + return Z + + def forward(self, emb, hc): + Z = self.transform_input(emb) + for key in self.full_conv.dict: + if key[1] == emb.get_device(): + self.full_conv.dict[key] = None + self.full_conv.drop.reset_mask(Z[:, self.input_dim :]) + + for dilation_per_level in self.dilations: + Z = self.gating(Z, dilation=dilation_per_level, hc=hc) + + out = Z[:, -self.output_dim :].transpose(1, 2) + hc = (Z[:, self.input_dim :, -1:], self.ct[:, :, -1:]) + return out, hc + + +class OutputLayer(nn.Module): + def __init__( + self, input_size, num_labels, embed_dir, dropout, num_filter_maps + ): + super(OutputLayer, self).__init__() + + self.word_embedding_layer = WordEmbeddingLayer( + embed_dir, dropout, num_filter_maps + ) + + self.U = nn.Linear(input_size, num_labels) + self.final = nn.Linear(input_size, num_labels) + self.proj_layer = nn.Linear(input_size, 1, bias=False) + + xavier_uniform_(self.U.weight) + xavier_uniform_(self.final.weight) + + def forward(self, x, desc): + if desc is not None: + desc_vec = self.word_rep(desc, target) + desc_vec = torch.mean(desc_vec, dim=1).unsqueeze(0) + mmt = desc_vec.matmul(x.transpose(1, 2)) + else: + mmt = self.U.weight.matmul(x.transpose(1, 2)) + + m = mmt.matmul(x) + + y = self.final.weight.mul(m) + logits = self.proj_layer(y).squeeze(-1).add(self.final.bias) + + return logits + + +class VariationalDropout(nn.Module): + def __init__(self): + """ + Feed-forward version of variational dropout that applies the same mask + at every time step. 
+ """ + super(VariationalDropout, self).__init__() + + def forward(self, x, dropout=0.5, dim=3): + if not self.training or not dropout: + return x + if dim == 4: + # Dimension (M, N, L, C), where C stands for channels + m = x.data.new(x.size(0), x.size(1), 1, x.size(3)).bernoulli_( + 1 - dropout + ) + else: + # Dimension (N, L, C) + m = x.data.new(x.size(0), 1, x.size(2)).bernoulli_(1 - dropout) + with torch.no_grad(): + mask = m / (1 - dropout) + mask = mask.expand_as(x) + return mask * x From fb1b981038ffa0304f99fa3853f69179317cbc32 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Tue, 14 Jun 2022 22:38:36 +0530 Subject: [PATCH 2/5] Add configs --- .isort.cfg | 2 +- configs/{ => caml}/caml_mimic3_50.yml | 0 configs/{ => caml}/caml_mimic3_50_old.yml | 0 configs/{ => caml}/caml_mimic3_full.yml | 0 configs/{ => caml}/caml_mimic3_full_old.yml | 0 configs/{ => caml}/cnn_mimic3_50.yml | 0 configs/{ => caml}/drcaml_mimic3_50.yml | 0 configs/gatedcnn_nci/gatedcnn_nci.py | 512 ++++++++++++++++++ .../gatedcnn_nci/gatedcnn_nci_mimic3_50.yml | 127 +++++ .../gatedcnn_nci_mimic3_50_old.yml | 127 +++++ .../gatedcnn_nci/gatedcnn_nci_mimic3_full.yml | 127 +++++ .../gatedcnn_nci_mimic3_full_old.yml | 127 +++++ src/models/fusion.py | 2 +- src/models/gatedcnn_nci.py | 392 ++++++++------ src/modules/metrics.py | 1 + src/utils/caml_utils.py | 15 + 16 files changed, 1274 insertions(+), 158 deletions(-) rename configs/{ => caml}/caml_mimic3_50.yml (100%) rename configs/{ => caml}/caml_mimic3_50_old.yml (100%) rename configs/{ => caml}/caml_mimic3_full.yml (100%) rename configs/{ => caml}/caml_mimic3_full_old.yml (100%) rename configs/{ => caml}/cnn_mimic3_50.yml (100%) rename configs/{ => caml}/drcaml_mimic3_50.yml (100%) create mode 100644 configs/gatedcnn_nci/gatedcnn_nci.py create mode 100644 configs/gatedcnn_nci/gatedcnn_nci_mimic3_50.yml create mode 100644 configs/gatedcnn_nci/gatedcnn_nci_mimic3_50_old.yml create mode 100644 configs/gatedcnn_nci/gatedcnn_nci_mimic3_full.yml create mode 100644 configs/gatedcnn_nci/gatedcnn_nci_mimic3_full_old.yml diff --git a/.isort.cfg b/.isort.cfg index cf67fba..82cd117 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -1,2 +1,2 @@ [settings] -known_third_party = embeddings,gensim,nltk,numpy,pandas,sklearn,streamlit,torch,torchsummaryX,tqdm,yaml +known_third_party = gensim,nltk,numpy,pandas,sklearn,streamlit,torch,torchsummaryX,tqdm,yaml diff --git a/configs/caml_mimic3_50.yml b/configs/caml/caml_mimic3_50.yml similarity index 100% rename from configs/caml_mimic3_50.yml rename to configs/caml/caml_mimic3_50.yml diff --git a/configs/caml_mimic3_50_old.yml b/configs/caml/caml_mimic3_50_old.yml similarity index 100% rename from configs/caml_mimic3_50_old.yml rename to configs/caml/caml_mimic3_50_old.yml diff --git a/configs/caml_mimic3_full.yml b/configs/caml/caml_mimic3_full.yml similarity index 100% rename from configs/caml_mimic3_full.yml rename to configs/caml/caml_mimic3_full.yml diff --git a/configs/caml_mimic3_full_old.yml b/configs/caml/caml_mimic3_full_old.yml similarity index 100% rename from configs/caml_mimic3_full_old.yml rename to configs/caml/caml_mimic3_full_old.yml diff --git a/configs/cnn_mimic3_50.yml b/configs/caml/cnn_mimic3_50.yml similarity index 100% rename from configs/cnn_mimic3_50.yml rename to configs/caml/cnn_mimic3_50.yml diff --git a/configs/drcaml_mimic3_50.yml b/configs/caml/drcaml_mimic3_50.yml similarity index 100% rename from configs/drcaml_mimic3_50.yml rename to configs/caml/drcaml_mimic3_50.yml diff --git 
a/configs/gatedcnn_nci/gatedcnn_nci.py b/configs/gatedcnn_nci/gatedcnn_nci.py new file mode 100644 index 0000000..4014637 --- /dev/null +++ b/configs/gatedcnn_nci/gatedcnn_nci.py @@ -0,0 +1,512 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.init import normal_, xavier_uniform_ + + +class GatedCNNEncoder(nn.Module): + def __init__(self, config): + super(GatedCNNEncoder, self).__init__() + self.max_length = config.max_length + self.dropout = config.dropout + self.input_dim = config.embed_size + self.hidden_dim = config.hidden_dim + self.output_dim = config.output_dim + self.bidirectional = config.bidirectional + self.use_description = config.use_description + + self.word_embedding_layer = WordEmbeddingLayer( + embed_dir=config.embed_dir, + dataset_dir=config.dataset_dir, + mimic_dir=config.mimic_dir, + static_dir=config.static_dir, + dropout=config.dropout, + pad_token=config.pad_token, + unk_token=config.unk_token, + ) + self.desc_vecs = self.word_embedding_layer.desc_vecs + + self.encoder = GatedCNN( + input_dim=config.input_dim, + hidden_dim=config.hidden_dim, + output_dim=config.output_dim, + kernel_size=config.kernel_size, + dropout=config.dropout, + init_mean=config.init_mean, + init_std=config.init_std, + levels=config.levels, + ) + + if self.bidirectional: + self.output_layer = OutputLayer( + input_dim=2 * config.input_dim, + num_labels=config.num_labels, + embed_dir=config.embed_dir, + ) + else: + self.output_layer = OutputLayer( + input_dim=config.input_dim, + num_labels=config.num_labels, + embed_dir=config.embed_dir, + ) + + self.variational_dropout = VariationalDropout(dropout=config.dropout) + + self.hidden = None + + def freeze_net(self): + for p in self.word_embedding_layer.embed.parameters(): + p.requires_grad = False + + def init_hidden(self, batch_size): + h_size = self.hidden_dim + self.output_dim + weight = next(self.parameters()).data + return ( + weight.new(batch_size, h_size, 1).zero_(), + weight.new(batch_size, h_size, 1).zero_(), + ) + + def _reverse_seq(self, X, mask, seq_max_len): + """ + X -> batch, seq_len, dim + mask -> batch, seq_len + """ + mask_sum = torch.sum(mask, 1).int() + xfs = [] + for x, c in zip(X, mask_sum): + xf = torch.flip(x[:c], [0]) + xfs.append(xf) + padded_rev = torch.zeros((len(xfs), X.size(1), X.size(2))).cuda() + for i, mat in enumerate(xfs): + padded_rev[i][: len(mat), :] = mat + return padded_rev + + def forward(self, data, desc): + """ + :param data: The input sequence, with dimesion (N, L) + :param desc: Whether to use code description + :return: logits, loss, hidden + """ + # If this is the first forward pass, we will initialise the hidden + # state. + if self.hidden is None: + self.init_hidden_flag = True + self.hidden = self.init_hidden(data.size(0)) + + # Look up the embeddings of all the tokens using the WordEmbeddingLayer. + # `emb` shape: (batch_size, max_length, embed_size) + emb, mask = self.word_embedding_layer(data) + + # If we want a bidirectional model, we reverse the sequence of + # tokens. + if self.bidirectional: + # `emb_reverse` shape: (batch_size, max_length, embed_size) + emb_reverse = self._reverse_seq(emb, mask, self.max_length) + # `emb_reverse` shape`: [batch_size, embed_size, max_length] + emb_reverse = emb_reverse.transpose(1, 2) + # `emb` shape: (batch_size, embed_size, max_length) + emb = emb.transpose(1, 2) + + # Pass the embeddings through the encoder. If the model is + # bidirectional, we pass the reverse embeddings as well. 
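+        # `raw_output` comes back as (batch_size, max_length, output_dim); the
+        # encoder transposes its result so the sequence dimension is second.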
+ raw_output, self.hidden = self.encoder(emb, self.hidden) + if self.bidirectional: + raw_out_reverse, self.hidden = self.encoder( + emb_reverse, self.hidden + ) + + output = self.variational_dropout(raw_output) + if self.bidirectional: + output_reverse = self._reverse_seq( + raw_out_reverse, mask, self.max_length + ) + output_reverse = self.variational_dropout(output_reverse) + output = torch.cat([output, output_reverse], dim=2) + + if self.use_description: + logits = self.output_layer(output, self.desc_vecs) + else: + logits = self.output_layer(output, None) + return logits + + +class WordEmbeddingLayer(nn.Module): + """ + A Word Embedding Layer. This layer loads a pre-trained word embedding matrix + , and copies its weights to an nn.Embedding layer. + + Args: + embed_dir (str): A directory containing the pre-trained word embedding + matrix, among other things. Please see + https://github.com/dalgu90/icd-coding-benchmark/blob/main/src/modules/embeddings.py#L17 + for more details. + dropout (float): The dropout probability. + """ + + def __init__( + self, + embed_dir, + dataset_dir, + mimic_dir, + static_dir, + version, + dropout, + pad_token="", + unk_token="", + return_pad_mask=True, + use_description=True, + ): + super(WordEmbeddingLayer, self).__init__() + logger.debug( + f"Initialising {self.__class__.__name__} with " + f"embed_dir = {embed_dir}, dropout = {dropout}" + ) + + self.return_pad_mask = return_pad_mask + + # Note: This should be changed, since we won't always use Word2Vec. + embedding_cls = ConfigMapper.get_object("embeddings", "word2vec") + vocab = embedding_cls.load_vocab(embed_dir) + self.pad_token_id = vocab[pad_token] + self.unk_token_id = vocab[unk_token] + + W = torch.Tensor(embedding_cls.load_emb_matrix(embed_dir)) + self.embed = nn.Embedding(W.size()[0], W.size()[1], padding_idx=0) + self.embed.weight.data = W.clone() + + self.embedding_size = self.embed.embedding_dim + + self.dropout = nn.Dropout(dropout) + + if use_description: + dicts = load_lookups( + dataset_dir=dataset_dir, + mimic_dir=mimic_dir, + static_dir=static_dir, + word2vec_dir=embed_dir, + version=version, + ) + ind2c = dicts["ind2c"] + w2ind = dicts["w2ind"] + desc_dict = dicts["desc"] + self.desc_vecs = [] + for i, c in ind2c.items(): + self.desc_vecs.append( + [ + w2ind[w] if w in w2ind else self.unk_token_id + for w in desc_dict[c] + ] + ) + + # Pad and convert to torch tensor. + self.desc_vecs = torch.Tensor( + list(zip(*itertools.zip_longest(*self.desc_vecs, fillvalue=0))) + ) + + def forward(self, x): + embedding = self.embed(x) + x = self.dropout(embedding) + if self.return_pad_mask: + pad_mask = ~(batch == pad_token_id) + return x, pad_mask + return x + + +class VariationalHidDropout(nn.Module): + """ + Hidden-to-hidden (VD-based) dropout that applies the same mask at every + time step and every layer of TrellisNet. + + Args: + dropout (float): The dropout probability. + """ + + def __init__(self, dropout=0.0): + super(VariationalHidDropout, self).__init__() + self.dropout_probability = dropout + self.mask = None + + def reset_mask(self, input): + + # Dimension (N, C, L) + m = input.data.new(input.size(0), input.size(1), 1).bernoulli_( + 1 - self.dropout_probability + ) + with torch.no_grad(): + mask = m / (1 - self.dropout_probability) + self.mask = mask + return mask + + def forward(self, input): + # We don't apply dropout if the model is in eval mode. 
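+        # Otherwise, the per-channel mask sampled in reset_mask() is broadcast
+        # over the length dimension, so the same channels are dropped at every
+        # time step of the sequence.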
+ if not self.training or self.dropout_probability == 0: + return input + + assert ( + self.mask is not None + ), "You need to reset mask before using VariationalHidDropout" + mask = self.mask.expand_as(input) # Make sure the dimension matches + return mask * input + + +class WeightShareConv1d(nn.Module): + """ + The weight-tied 1D convolution used in TrellisNet. + + Args: + input_dim (int): The dimension of the input. This is equivalent to + the number of input channels in the first + convolutional layer. + hidden_dim (int): The dimension of the hidden state. This is + equivalent to the number of input channels in the + second convolutional layer. + out_channels (int): The number of output channels in both + convolutional layers. + kernel_size (int): The size of the filter used in both + convolutional layers. + dropout (float): Dropout probability for the hidden-to-hidden + dropout layer. + init_mean (float): The mean of the normal distribution with which + weights of the convolutional layers are + initialised. + init_std (float): The standard deviation of the normal distribution + with which weights of the convolutional layers are + initialised. + """ + + def __init__( + self, + input_dim, + hidden_dim, + out_channels, + kernel_size, + dropout=0.0, + init_mean=0.0, + init_std=0.01, + ): + super(WeightShareConv1d, self).__init__() + + self.input_dim = input_dim + self.kernel_size = kernel_size + + self._dict = {} + + conv_layer_1 = nn.Conv1d( + in_channels=input_dim, + out_channels=out_channels, + kernel_size=kernel_size, + ) + self.weight_1 = conv_layer_1.weight + + conv_layer_2 = nn.Conv1d( + in_channels=hidden_dim, + out_channels=out_channels, + kernel_size=kernel_size, + ) + self.weight_2 = conv2.weight + self.bias_2 = conv2.bias + + self.init_conv_weights(init_mean, init_std) + + self.dropout = VariationalHidDropout(dropout=dropout) + + def init_conv_weights(self, init_mean, init_std): + self.weight_1.data.normal_(mean=init_mean, std=init_std) + self.weight_2.data.normal_(mean=init_mean, std=init_std) + self.bias_2.data.normal_(mean=init_mean, std=init_std) + + def forward(self, input, dilation, hid): + batch_size = input.size(0) + + padding = (self.kernel_size - 1) * dilation # Padding size. + x = F.pad(input=input, pad=(padding, 0)) # Pad with zeros. + + x_1 = x[:, : self.input_dim] + z_1 = x[:, self.input_dim :] + z_1[:, :, :padding] = hid[:batch_size, :, :].repeat(1, 1, padding) + + device = x_1.get_device() + + if (dilation, device) not in self.dict or self.dict[ + (dilation, device) + ] is None: + self.dict[(dilation, device)] = F.conv1d( + input=x_1, weight=self.weight1, dilation=dilation + ) + + z_1 = self.dropout(z_1) + injected = self.dict[(dilation, device)] + F.conv1d( + input=z_1, weight=self.weight2, bias=self.bias2, dilation=dilation + ) + return injected + + +class GatedCNN(nn.Module): + """ + Gated CNN module. + + Args: + input_dim (int): The dimension of the input. + hidden_dim (int): The hidden dimension. The hidden dimension for the + weight-shared Conv1D layer is + `hidden_dim + output_dim`. + output_dim (int): The output dimension. The number of output + channels of the weight-shared Conv1D layer is + `4 * (hidden_dim + output_dim)`. + kernel_size (int): The size of the filter used in + `WeightSharedConv1D`. + dropout (float): Dropout probability for the `WeightSharedConv1D`. + init_mean (float): The mean of the normal distribution with which + weights of the `WeightSharedConv1D` layer are + initialised. 
+ init_std (float): The standard deviation of the normal distribution + with which weights of the `WeightSharedConv1D` + layer are initialised. + """ + + def __init__( + self, + input_dim, + hidden_dim, + output_dim, + kernel_size, + dropout, + init_mean, + init_std, + levels, + ): + super(GatedCNN, self).__init__() + self.input_dim = input_dim + self.hidden_dim = hidden_dim + self.output_dim = output_dim + self.levels = levels + + self.hidden_dim_for_conv = hidden_dim + output_dim + + self.dilations = [i + 1 for i in range(levels)] + + self.full_conv = WeightShareConv1d( + input_dim=input_dim, + hidden_dim=self.hidden_dim_for_conv, + output_channels=4 * self.hidden_dim_for_conv, + kernel_size=kernel_size, + dropout=dropout, + init_mean=init_mean, + init_std=init_std, + ) + + self.ht = None + + def transform_input(self, X): + device = X.get_device() + if device == -1: + device = "cpu" + + batch_size = X.size(0) + seq_len = X.size(2) + + ht = torch.zeros(batch_size, self.hidden_dim_for_conv, seq_len).to( + device + ) + self.ct = torch.zeros(batch_size, self.hidden_dim_for_conv, seq_len).to( + device + ) + return torch.cat((X, ht), dim=1) + + def gating(self, Z, dilation=1, hc=None): + batch_size = Z.size(0) + (hid, cell) = hc + + out = self.full_conv(input=Z, dilation=dilation, hid=hid) + + ct_1 = F.pad(self.ct, (dilation, 0))[:, :, :-dilation] + ct_1[:, :, :dilation] = cell[:batch_size].repeat(1, 1, dilation) + + it = torch.sigmoid(out[:, : self.hidden_dim_for_conv]) + ot = torch.sigmoid( + out[:, self.hidden_dim_for_conv : 2 * self.hidden_dim_for_conv] + ) + gt = torch.tanh( + out[:, 2 * self.hidden_dim_for_conv : 3 * self.hidden_dim_for_conv] + ) + ft = torch.sigmoid( + out[:, 3 * self.hidden_dim_for_conv : 4 * self.hidden_dim_for_conv] + ) + self.ct = ft * ct_1 + it * gt + ht = ot * torch.tanh(self.ct) + + Z = torch.cat((Z[:, : self.input_dim], ht), dim=1) + return Z + + def forward(self, emb, hc): + Z = self.transform_input(emb) + for key in self.full_conv.dict: + if key[1] == emb.get_device(): + self.full_conv.dict[key] = None + self.full_conv.drop.reset_mask(Z[:, self.input_dim :]) + + for dilation_per_level in self.dilations: + Z = self.gating(Z, dilation=dilation_per_level, hc=hc) + + out = Z[:, -self.output_dim :].transpose(1, 2) + hc = (Z[:, self.input_dim :, -1:], self.ct[:, :, -1:]) + return out, hc + + +class VariationalDropout(nn.Module): + """ + Feed-forward version of variational dropout that applies the same mask + at every time step. 
+ """ + + def __init__(self, dropout=0.5, dim=3): + super(VariationalDropout, self).__init__() + assert dim in (3, 4), "`dim` should be either 3 or 4" + self.dropout = dropout + self.dim = dim + + def forward(self, x): + if not self.training or not self.dropout: + return x + + if self.dim == 4: + # Dimension (M, N, L, C), where C stands for channels + m = x.data.new(x.size(0), x.size(1), 1, x.size(3)).bernoulli_( + 1 - self.dropout + ) + else: + # Dimension (N, L, C) + m = x.data.new(x.size(0), 1, x.size(2)).bernoulli_(1 - self.dropout) + with torch.no_grad(): + mask = m / (1 - dropout) + mask = mask.expand_as(x) + return mask * x + + +class OutputLayer(nn.Module): + def __init__(self, input_dim, num_labels, embed_dir, dropout): + super(OutputLayer, self).__init__() + + self.word_embedding_layer = WordEmbeddingLayer(embed_dir, dropout) + + self.U = nn.Linear(input_dim, num_labels) + self.final = nn.Linear(input_dim, num_labels) + self.proj_layer = nn.Linear(input_dim, 1, bias=False) + + xavier_uniform_(self.U.weight) + xavier_uniform_(self.final.weight) + + def forward(self, x, desc): + if desc is not None: + desc_vec, _ = self.word_embedding_layer(desc) + desc_vec = torch.mean(desc_vec, dim=1).unsqueeze(0) + mmt = desc_vec.matmul(x.transpose(1, 2)) + else: + mmt = self.U.weight.matmul(x.transpose(1, 2)) + + m = mmt.matmul(x) + + y = self.final.weight.mul(m) + logits = self.proj_layer(y).squeeze(-1).add(self.final.bias) + + return logits diff --git a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50.yml b/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50.yml new file mode 100644 index 0000000..3613885 --- /dev/null +++ b/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50.yml @@ -0,0 +1,127 @@ +paths: + mimic_dir: &mimic_dir datasets/mimic3/csv + static_dir: &static_dir datasets/mimic3/static + dataset_dir: &dataset_dir datasets/mimic3_50 + word2vec_dir: &word2vec_dir datasets/mimic3_50/word2vec + output_dir: &output_dir results/gatedcnn_nci_mimic3_50 + +dataset: + name: base_dataset + data_common: &data_common + column_names: + hadm_id: "HADM_ID" + clinical_note: "TEXT" + labels: "LABELS" + word2vec_dir: *word2vec_dir + pad_token: "" + unk_token: "" + dataset_dir: *dataset_dir + label_file: labels.json + max_length: 2500 + params: + train: + <<: *data_common + data_file: train.json + val: + <<: *data_common + data_file: val.json + test: + <<: *data_common + data_file: test.json + +model: + name: gatedcnn_nci + params: + version: mimic3 + dataset_dir: *dataset_dir + mimic_dir: *mimic_dir + static_dir: *static_dir + embed_dir: *word2vec_dir + max_length: 2500 + dropout: 0.2 + input_dim: 100 + hidden_dim: 100 + output_dim: 50 + bidirectional: true + use_description: true + pad_token: "" + unk_token: "" + kernel_size: 3 + init_mean: 0 + init_std: 0.01 + levels: 3 + +trainer: + name: base_trainer + params: + output_dir: *output_dir + data_loader: + batch_size: 16 + num_workers: 4 + shuffle: false + drop_last: true + loss: + name: BinaryCrossEntropyLoss + params: null + optimizer: + name: adam + params: + lr: 0.0001 + weight_decay: 0.0 + max_epochs: 200 + lr_scheduler: null + stopping_criterion: + metric: + name: prec_at_8 + desired: max + patience: 10 + checkpoint_saver: + name: base_saver + params: + checkpoint_dir: *output_dir + interval: 1 + max_to_keep: 5 + ckpt_fname_format: "ckpt-{}.pth" + best_fname_format: "best-{}.pth" + metric: + name: prec_at_8 + class: prec_at_k + params: + k: 8 + desired: max + eval_metrics: &eval_metrics + - name: prec_at_5 + class: prec_at_k + params: + k: 5 + - name: prec_at_8 + 
class: prec_at_k + params: + k: 8 + - name: macro_f1 + - name: micro_f1 + - name: macro_auc + - name: micro_auc + graph: + writer: + name: tensorboard + params: + log_dir: *output_dir + train: + interval: 100 + interval_unit: step + metric: + - name: loss + val: + interval: 1 + interval_unit: epoch + metric: + - name: loss + - name: prec_at_5 + - name: prec_at_8 + - name: macro_f1 + - name: micro_f1 + - name: macro_auc + - name: micro_auc + seed: 1337 + use_gpu: true diff --git a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50_old.yml b/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50_old.yml new file mode 100644 index 0000000..4ccca3a --- /dev/null +++ b/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50_old.yml @@ -0,0 +1,127 @@ +paths: + mimic_dir: &mimic_dir datasets/mimic3/csv + static_dir: &static_dir datasets/mimic3/static + dataset_dir: &dataset_dir datasets/mimic3_50_old + word2vec_dir: &word2vec_dir datasets/mimic3_50_old/word2vec + output_dir: &output_dir results/gatedcnn_nci_mimic3_50_old + +dataset: + name: base_dataset + data_common: &data_common + column_names: + hadm_id: "HADM_ID" + clinical_note: "TEXT" + labels: "LABELS" + word2vec_dir: *word2vec_dir + pad_token: "" + unk_token: "" + dataset_dir: *dataset_dir + label_file: labels.json + max_length: 2500 + params: + train: + <<: *data_common + data_file: train.json + val: + <<: *data_common + data_file: val.json + test: + <<: *data_common + data_file: test.json + +model: + name: gatedcnn_nci + params: + version: mimic3 + dataset_dir: *dataset_dir + mimic_dir: *mimic_dir + static_dir: *static_dir + embed_dir: *word2vec_dir + max_length: 2500 + dropout: 0.2 + input_dim: 100 + hidden_dim: 100 + output_dim: 50 + bidirectional: true + use_description: true + pad_token: "" + unk_token: "" + kernel_size: 3 + init_mean: 0 + init_std: 0.01 + levels: 3 + +trainer: + name: base_trainer + params: + output_dir: *output_dir + data_loader: + batch_size: 16 + num_workers: 4 + shuffle: false + drop_last: true + loss: + name: BinaryCrossEntropyLoss + params: null + optimizer: + name: adam + params: + lr: 0.000001 + weight_decay: 0.0 + max_epochs: 200 + lr_scheduler: null + stopping_criterion: + metric: + name: prec_at_8 + desired: max + patience: 10 + checkpoint_saver: + name: base_saver + params: + checkpoint_dir: *output_dir + interval: 1 + max_to_keep: 5 + ckpt_fname_format: "ckpt-{}.pth" + best_fname_format: "best-{}.pth" + metric: + name: prec_at_8 + class: prec_at_k + params: + k: 8 + desired: max + eval_metrics: &eval_metrics + - name: prec_at_5 + class: prec_at_k + params: + k: 5 + - name: prec_at_8 + class: prec_at_k + params: + k: 8 + - name: macro_f1 + - name: micro_f1 + - name: macro_auc + - name: micro_auc + graph: + writer: + name: tensorboard + params: + log_dir: *output_dir + train: + interval: 100 + interval_unit: step + metric: + - name: loss + val: + interval: 1 + interval_unit: epoch + metric: + - name: loss + - name: prec_at_5 + - name: prec_at_8 + - name: macro_f1 + - name: micro_f1 + - name: macro_auc + - name: micro_auc + seed: 1337 + use_gpu: true diff --git a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full.yml b/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full.yml new file mode 100644 index 0000000..de9b1ab --- /dev/null +++ b/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full.yml @@ -0,0 +1,127 @@ +paths: + mimic_dir: &mimic_dir datasets/mimic3/csv + static_dir: &static_dir datasets/mimic3/static + dataset_dir: &dataset_dir datasets/mimic3_full + word2vec_dir: &word2vec_dir datasets/mimic3_full/word2vec + output_dir: &output_dir 
results/gatedcnn_nci_mimic3_full + +dataset: + name: base_dataset + data_common: &data_common + column_names: + hadm_id: "HADM_ID" + clinical_note: "TEXT" + labels: "LABELS" + word2vec_dir: *word2vec_dir + pad_token: "" + unk_token: "" + dataset_dir: *dataset_dir + label_file: labels.json + max_length: 2500 + params: + train: + <<: *data_common + data_file: train.json + val: + <<: *data_common + data_file: val.json + test: + <<: *data_common + data_file: test.json + +model: + name: CAML + params: + version: mimic3 + dataset_dir: *dataset_dir + mimic_dir: *mimic_dir + static_dir: *static_dir + embed_dir: *word2vec_dir + max_length: 2500 + dropout: 0.2 + input_dim: 100 + hidden_dim: 100 + output_dim: 8922 + bidirectional: true + use_description: true + pad_token: "" + unk_token: "" + kernel_size: 3 + init_mean: 0 + init_std: 0.01 + levels: 3 + +trainer: + name: base_trainer + params: + output_dir: *output_dir + data_loader: + batch_size: 16 + num_workers: 4 + shuffle: false + drop_last: true + loss: + name: BinaryCrossEntropyLoss + params: null + optimizer: + name: adam + params: + lr: 0.0001 + weight_decay: 0.0 + max_epochs: 200 + lr_scheduler: null + stopping_criterion: + metric: + name: prec_at_8 + desired: max + patience: 10 + checkpoint_saver: + name: base_saver + params: + checkpoint_dir: *output_dir + interval: 1 + max_to_keep: 5 + ckpt_fname_format: "ckpt-{}.pth" + best_fname_format: "best-{}.pth" + metric: + name: prec_at_8 + class: prec_at_k + params: + k: 8 + desired: max + eval_metrics: &eval_metrics + - name: prec_at_8 + class: prec_at_k + params: + k: 8 + - name: prec_at_15 + class: prec_at_k + params: + k: 15 + - name: macro_f1 + - name: micro_f1 + - name: macro_auc + - name: micro_auc + graph: + writer: + name: tensorboard + params: + log_dir: *output_dir + train: + interval: 100 + interval_unit: step + metric: + - name: loss + val: + interval: 1 + interval_unit: epoch + metric: + - name: loss + - name: prec_at_8 + - name: prec_at_15 + - name: macro_f1 + - name: micro_f1 + - name: macro_auc + - name: micro_auc + seed: 1337 + use_gpu: true diff --git a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full_old.yml b/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full_old.yml new file mode 100644 index 0000000..c95f4aa --- /dev/null +++ b/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full_old.yml @@ -0,0 +1,127 @@ +paths: + mimic_dir: &mimic_dir datasets/mimic3/csv + static_dir: &static_dir datasets/mimic3/static + dataset_dir: &dataset_dir datasets/mimic3_full_old + word2vec_dir: &word2vec_dir datasets/mimic3_full_old/word2vec + output_dir: &output_dir results/gatedcnn_nci_mimic3_full_old + +dataset: + name: base_dataset + data_common: &data_common + column_names: + hadm_id: "HADM_ID" + clinical_note: "TEXT" + labels: "LABELS" + word2vec_dir: *word2vec_dir + pad_token: "" + unk_token: "" + dataset_dir: *dataset_dir + label_file: labels.json + max_length: 2500 + params: + train: + <<: *data_common + data_file: train.json + val: + <<: *data_common + data_file: val.json + test: + <<: *data_common + data_file: test.json + +model: + name: gatedcnn_nci + params: + version: mimic3 + dataset_dir: *dataset_dir + mimic_dir: *mimic_dir + static_dir: *static_dir + embed_dir: *word2vec_dir + max_length: 2500 + dropout: 0.2 + input_dim: 100 + hidden_dim: 100 + output_dim: 8922 + bidirectional: true + use_description: true + pad_token: "" + unk_token: "" + kernel_size: 3 + init_mean: 0 + init_std: 0.01 + levels: 3 + +trainer: + name: base_trainer + params: + output_dir: *output_dir + data_loader: + batch_size: 
16 + num_workers: 4 + shuffle: false + drop_last: true + loss: + name: BinaryCrossEntropyLoss + params: null + optimizer: + name: adam + params: + lr: 0.0001 + weight_decay: 0.0 + max_epochs: 200 + lr_scheduler: null + stopping_criterion: + metric: + name: prec_at_8 + desired: max + patience: 10 + checkpoint_saver: + name: base_saver + params: + checkpoint_dir: *output_dir + interval: 1 + max_to_keep: 5 + ckpt_fname_format: "ckpt-{}.pth" + best_fname_format: "best-{}.pth" + metric: + name: prec_at_8 + class: prec_at_k + params: + k: 8 + desired: max + eval_metrics: &eval_metrics + - name: prec_at_8 + class: prec_at_k + params: + k: 8 + - name: prec_at_15 + class: prec_at_k + params: + k: 15 + - name: macro_f1 + - name: micro_f1 + - name: macro_auc + - name: micro_auc + graph: + writer: + name: tensorboard + params: + log_dir: *output_dir + train: + interval: 100 + interval_unit: step + metric: + - name: loss + val: + interval: 1 + interval_unit: epoch + metric: + - name: loss + - name: prec_at_8 + - name: prec_at_15 + - name: macro_f1 + - name: micro_f1 + - name: macro_auc + - name: micro_auc + seed: 1337 + use_gpu: true diff --git a/src/models/fusion.py b/src/models/fusion.py index 69fed90..5236047 100755 --- a/src/models/fusion.py +++ b/src/models/fusion.py @@ -415,7 +415,7 @@ class Fusion(nn.Module): def __init__(self, config): super(Fusion, self).__init__() - logger.info(f"Initialising %s", self.__class__.__name__) + logger.info("Initialising %s", self.__class__.__name__) logger.debug( "Initialising %s with config: %s", self.__class__.__name__, config ) diff --git a/src/models/gatedcnn_nci.py b/src/models/gatedcnn_nci.py index 38d0016..72114d6 100644 --- a/src/models/gatedcnn_nci.py +++ b/src/models/gatedcnn_nci.py @@ -1,32 +1,75 @@ import torch +import torch.nn as nn import torch.nn.functional as F -from torch import nn - - -class GatedCNNencoder(nn.Module): - def __init__(self, args, Y, dicts): - super(GatedCNNencoder, self).__init__() - self.args = args - self.max_length = args.MAX_LENGTH - self.dropout = args.dropout - self.ninp = args.embed_size - self.nhid = args.nhid - self.nout = args.nout - self.bidirectional = args.bidirectional - - self.word_rep = WordRep(args, Y, dicts) - self.encoder = GatedCNN(args, Y, dicts, self.ninp, self.nout) - self.network = nn.ModuleList([self.encoder]) +from torch.nn.init import normal_, xavier_uniform_ + +from src.utils.caml_utils import load_lookups, pad_desc_vecs +from src.utils.mapper import ConfigMapper + + +@ConfigMapper.map("models", "gatedcnn_nci") +class GatedCNNEncoder(nn.Module): + def __init__(self, config): + super(GatedCNNEncoder, self).__init__() + self.max_length = config.max_length + self.dropout = config.dropout + self.input_dim = config.embed_size + self.hidden_dim = config.hidden_dim + self.output_dim = config.output_dim + self.bidirectional = config.bidirectional + self.use_description = config.use_description + + self.word_embedding_layer = WordEmbeddingLayer( + embed_dir=config.embed_dir, + dataset_dir=config.dataset_dir, + mimic_dir=config.mimic_dir, + static_dir=config.static_dir, + dropout=config.dropout, + pad_token=config.pad_token, + unk_token=config.unk_token, + ) + self.desc_vecs = self.word_embedding_layer.desc_vecs + + self.encoder = GatedCNN( + input_dim=config.input_dim, + hidden_dim=config.hidden_dim, + output_dim=config.output_dim, + kernel_size=config.kernel_size, + dropout=config.dropout, + init_mean=config.init_mean, + init_std=config.init_std, + levels=config.levels, + ) + if self.bidirectional: - 
self.output_layer = OutputLayer(args, Y, dicts, self.nout * 2) + self.output_layer = OutputLayer( + input_dim=2 * config.input_dim, + num_labels=config.output_dim, + embed_dir=config.embed_dir, + ) else: - self.output_layer = OutputLayer(args, Y, dicts, self.nout) - self.var_drop = VariationalDropout() + self.output_layer = OutputLayer( + input_dim=config.input_dim, + num_labels=config.num_labels, + embed_dir=config.embed_dir, + ) + + self.variational_dropout = VariationalDropout(dropout=config.dropout) + + self.hidden = None def freeze_net(self): - for p in self.word_rep.embed.parameters(): + for p in self.word_embedding_layer.embed.parameters(): p.requires_grad = False + def init_hidden(self, batch_size): + h_size = self.hidden_dim + self.output_dim + weight = next(self.parameters()).data + return ( + weight.new(batch_size, h_size, 1).zero_(), + weight.new(batch_size, h_size, 1).zero_(), + ) + def _reverse_seq(self, X, mask, seq_max_len): """ X -> batch, seq_len, dim @@ -42,58 +85,53 @@ def _reverse_seq(self, X, mask, seq_max_len): padded_rev[i][: len(mat), :] = mat return padded_rev - def forward(self, data, target, mask, hidden, desc): + def forward(self, data, desc): """ :param data: The input sequence, with dimesion (N, L) - :param target: labels - :param mask: input sequence mask - :param hidden: The initial hidden state (h, c) :param desc: Whether to use code description :return: logits, loss, hidden """ - emb = self.word_rep(data, target) + # If this is the first forward pass, we will initialise the hidden + # state. + if self.hidden is None: + self.init_hidden_flag = True + self.hidden = self.init_hidden(data.size(0)) + + # Look up the embeddings of all the tokens using the WordEmbeddingLayer. + # `emb` shape: (batch_size, max_length, embed_size) + emb, mask = self.word_embedding_layer(data) + + # If we want a bidirectional model, we reverse the sequence of + # tokens. if self.bidirectional: + # `emb_reverse` shape: (batch_size, max_length, embed_size) emb_reverse = self._reverse_seq(emb, mask, self.max_length) - emb = emb.transpose(1, 2) # emb: [bs, 100, len] - if self.bidirectional: - emb_reverse = emb_reverse.transpose( - 1, 2 - ) # emb_reverse: [bs, 100, len] - cnn_encoder = self.network[0] - raw_output, hidden = cnn_encoder(emb, hidden) - if self.bidirectional: - raw_out_re, hidden = cnn_encoder(emb_reverse, hidden) - output = self.var_drop(raw_output, self.dropout) - if self.bidirectional: - output_re = self._reverse_seq(raw_out_re, mask, self.max_length) - output_re = self.var_drop(output_re, self.dropout) + # `emb_reverse` shape`: [batch_size, embed_size, max_length] + emb_reverse = emb_reverse.transpose(1, 2) + # `emb` shape: (batch_size, embed_size, max_length) + emb = emb.transpose(1, 2) + + # Pass the embeddings through the encoder. If the model is + # bidirectional, we pass the reverse embeddings as well. 
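+        # Note that the same GatedCNN instance (and hence the same weights) is
+        # used for both directions; only the token order differs.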
+ raw_output, self.hidden = self.encoder(emb, self.hidden) if self.bidirectional: - output = torch.cat([output, output_re], dim=2) - if self.args.desc: - logits, loss, _, interaction = self.output_layer( - output, target, desc + raw_out_reverse, self.hidden = self.encoder( + emb_reverse, self.hidden ) - else: - logits, loss, _, interaction = self.output_layer( - output, target, None - ) - return logits, loss, hidden, interaction - - def init_hidden(self, bsz): - h_size = self.nhid + self.nout - weight = next(self.parameters()).data - return ( - weight.new(bsz, h_size, 1).zero_(), - weight.new(bsz, h_size, 1).zero_(), - ) + output = self.variational_dropout(raw_output) + if self.bidirectional: + output_reverse = self._reverse_seq( + raw_out_reverse, mask, self.max_length + ) + output_reverse = self.variational_dropout(output_reverse) + output = torch.cat([output, output_reverse], dim=2) -from typing import Tuple - -import torch.nn as nn -from embeddings import build_pretrain_embedding, load_embeddings -from torch import Tensor -from torch.nn.init import kaiming_uniform_, normal_, xavier_uniform_ + if self.use_description: + logits = self.output_layer(output, self.desc_vecs) + else: + logits = self.output_layer(output, None) + return logits class WordEmbeddingLayer(nn.Module): @@ -109,15 +147,32 @@ class WordEmbeddingLayer(nn.Module): dropout (float): The dropout probability. """ - def __init__(self, embed_dir, dropout, num_filter_maps): + def __init__( + self, + embed_dir, + dataset_dir, + mimic_dir, + static_dir, + version, + dropout, + pad_token="", + unk_token="", + return_pad_mask=True, + use_description=True, + ): super(WordEmbeddingLayer, self).__init__() logger.debug( f"Initialising {self.__class__.__name__} with " f"embed_dir = {embed_dir}, dropout = {dropout}" ) + self.return_pad_mask = return_pad_mask + # Note: This should be changed, since we won't always use Word2Vec. embedding_cls = ConfigMapper.get_object("embeddings", "word2vec") + vocab = embedding_cls.load_vocab(embed_dir) + self.pad_token_id = vocab[pad_token] + self.unk_token_id = vocab[unk_token] W = torch.Tensor(embedding_cls.load_emb_matrix(embed_dir)) self.embed = nn.Embedding(W.size()[0], W.size()[1], padding_idx=0) @@ -127,28 +182,50 @@ def __init__(self, embed_dir, dropout, num_filter_maps): self.dropout = nn.Dropout(dropout) - self.conv_dict = { - 1: [self.embedding_size, num_filter_maps], - 2: [self.embedding_size, 100, num_filter_maps], - 3: [self.embedding_size, 150, 100, num_filter_maps], - 4: [self.embedding_size, 200, 150, 100, num_filter_maps], - } + if use_description: + dicts = load_lookups( + dataset_dir=dataset_dir, + mimic_dir=mimic_dir, + static_dir=static_dir, + word2vec_dir=embed_dir, + version=version, + ) + ind2c = dicts["ind2c"] + w2ind = dicts["w2ind"] + desc_dict = dicts["desc"] + self.desc_vecs = [] + for i, c in ind2c.items(): + self.desc_vecs.append( + [ + w2ind[w] if w in w2ind else self.unk_token_id + for w in desc_dict[c] + ] + ) + + # Pad and convert to torch tensor. + self.desc_vecs = torch.Tensor( + list(zip(*itertools.zip_longest(*self.desc_vecs, fillvalue=0))) + ) def forward(self, x): embedding = self.embed(x) x = self.dropout(embedding) + if self.return_pad_mask: + pad_mask = ~(batch == pad_token_id) + return x, pad_mask return x class VariationalHidDropout(nn.Module): - def __init__(self, dropout=0.0): - """ - Hidden-to-hidden (VD-based) dropout that applies the same mask at every - time step and every layer of TrellisNet. 
+ """ + Hidden-to-hidden (VD-based) dropout that applies the same mask at every + time step and every layer of TrellisNet. - Args: - dropout (float): The dropout probability. - """ + Args: + dropout (float): The dropout probability. + """ + + def __init__(self, dropout=0.0): super(VariationalHidDropout, self).__init__() self.dropout_probability = dropout self.mask = None @@ -177,6 +254,30 @@ def forward(self, input): class WeightShareConv1d(nn.Module): + """ + The weight-tied 1D convolution used in TrellisNet. + + Args: + input_dim (int): The dimension of the input. This is equivalent to + the number of input channels in the first + convolutional layer. + hidden_dim (int): The dimension of the hidden state. This is + equivalent to the number of input channels in the + second convolutional layer. + out_channels (int): The number of output channels in both + convolutional layers. + kernel_size (int): The size of the filter used in both + convolutional layers. + dropout (float): Dropout probability for the hidden-to-hidden + dropout layer. + init_mean (float): The mean of the normal distribution with which + weights of the convolutional layers are + initialised. + init_std (float): The standard deviation of the normal distribution + with which weights of the convolutional layers are + initialised. + """ + def __init__( self, input_dim, @@ -187,29 +288,6 @@ def __init__( init_mean=0.0, init_std=0.01, ): - """ - The weight-tied 1D convolution used in TrellisNet. - - Args: - input_dim (int): The dimension of the input. This is equivalent to - the number of input channels in the first - convolutional layer. - hidden_dim (int): The dimension of the hidden state. This is - equivalent to the number of input channels in the - second convolutional layer. - out_channels (int): The number of output channels in both - convolutional layers. - kernel_size (int): The size of the filter used in both - convolutional layers. - dropout (float): Dropout probability for the hidden-to-hidden - dropout layer. - init_mean (float): The mean of the normal distribution with which - weights of the convolutional layers are - initialised. - init_std (float): The standard deviation of the normal distribution - with which weights of the convolutional layers are - initialised. - """ super(WeightShareConv1d, self).__init__() self.input_dim = input_dim @@ -268,6 +346,28 @@ def forward(self, input, dilation, hid): class GatedCNN(nn.Module): + """ + Gated CNN module. + + Args: + input_dim (int): The dimension of the input. + hidden_dim (int): The hidden dimension. The hidden dimension for the + weight-shared Conv1D layer is + `hidden_dim + output_dim`. + output_dim (int): The output dimension. The number of output + channels of the weight-shared Conv1D layer is + `4 * (hidden_dim + output_dim)`. + kernel_size (int): The size of the filter used in + `WeightSharedConv1D`. + dropout (float): Dropout probability for the `WeightSharedConv1D`. + init_mean (float): The mean of the normal distribution with which + weights of the `WeightSharedConv1D` layer are + initialised. + init_std (float): The standard deviation of the normal distribution + with which weights of the `WeightSharedConv1D` + layer are initialised. + """ + def __init__( self, input_dim, @@ -279,30 +379,9 @@ def __init__( init_std, levels, ): - """ - Gated CNN module. - - Args: - input_dim (int): The dimension of the input. - hidden_dim (int): The hidden dimension. The hidden dimension for the - weight-shared Conv1D layer is - `hidden_dim + output_dim`. 
- output_dim (int): The output dimension. The number of output - channels of the weight-shared Conv1D layer is - `4 * (hidden_dim + output_dim)`. - kernel_size (int): The size of the filter used in - `WeightSharedConv1D`. - dropout (float): Dropout probability for the `WeightSharedConv1D`. - init_mean (float): The mean of the normal distribution with which - weights of the `WeightSharedConv1D` layer are - initialised. - init_std (float): The standard deviation of the normal distribution - with which weights of the `WeightSharedConv1D` - layer are initialised. - """ super(GatedCNN, self).__init__() self.input_dim = input_dim - self.hidden_dim = args.hidden_dim + self.hidden_dim = hidden_dim self.output_dim = output_dim self.levels = levels @@ -378,26 +457,52 @@ def forward(self, emb, hc): return out, hc +class VariationalDropout(nn.Module): + """ + Feed-forward version of variational dropout that applies the same mask + at every time step. + """ + + def __init__(self, dropout=0.5, dim=3): + super(VariationalDropout, self).__init__() + assert dim in (3, 4), "`dim` should be either 3 or 4" + self.dropout = dropout + self.dim = dim + + def forward(self, x): + if not self.training or not self.dropout: + return x + + if self.dim == 4: + # Dimension (M, N, L, C), where C stands for channels + m = x.data.new(x.size(0), x.size(1), 1, x.size(3)).bernoulli_( + 1 - self.dropout + ) + else: + # Dimension (N, L, C) + m = x.data.new(x.size(0), 1, x.size(2)).bernoulli_(1 - self.dropout) + with torch.no_grad(): + mask = m / (1 - dropout) + mask = mask.expand_as(x) + return mask * x + + class OutputLayer(nn.Module): - def __init__( - self, input_size, num_labels, embed_dir, dropout, num_filter_maps - ): + def __init__(self, input_dim, num_labels, embed_dir, dropout): super(OutputLayer, self).__init__() - self.word_embedding_layer = WordEmbeddingLayer( - embed_dir, dropout, num_filter_maps - ) + self.word_embedding_layer = WordEmbeddingLayer(embed_dir, dropout) - self.U = nn.Linear(input_size, num_labels) - self.final = nn.Linear(input_size, num_labels) - self.proj_layer = nn.Linear(input_size, 1, bias=False) + self.U = nn.Linear(input_dim, num_labels) + self.final = nn.Linear(input_dim, num_labels) + self.proj_layer = nn.Linear(input_dim, 1, bias=False) xavier_uniform_(self.U.weight) xavier_uniform_(self.final.weight) def forward(self, x, desc): if desc is not None: - desc_vec = self.word_rep(desc, target) + desc_vec, _ = self.word_embedding_layer(desc) desc_vec = torch.mean(desc_vec, dim=1).unsqueeze(0) mmt = desc_vec.matmul(x.transpose(1, 2)) else: @@ -409,28 +514,3 @@ def forward(self, x, desc): logits = self.proj_layer(y).squeeze(-1).add(self.final.bias) return logits - - -class VariationalDropout(nn.Module): - def __init__(self): - """ - Feed-forward version of variational dropout that applies the same mask - at every time step. 
- """ - super(VariationalDropout, self).__init__() - - def forward(self, x, dropout=0.5, dim=3): - if not self.training or not dropout: - return x - if dim == 4: - # Dimension (M, N, L, C), where C stands for channels - m = x.data.new(x.size(0), x.size(1), 1, x.size(3)).bernoulli_( - 1 - dropout - ) - else: - # Dimension (N, L, C) - m = x.data.new(x.size(0), 1, x.size(2)).bernoulli_(1 - dropout) - with torch.no_grad(): - mask = m / (1 - dropout) - mask = mask.expand_as(x) - return mask * x diff --git a/src/modules/metrics.py b/src/modules/metrics.py index 04d2341..a5edda6 100755 --- a/src/modules/metrics.py +++ b/src/modules/metrics.py @@ -23,6 +23,7 @@ def to_np_array(array): array = np.array(array) return array + def _auc_job(x): return roc_auc_score(x[0], x[1]) diff --git a/src/utils/caml_utils.py b/src/utils/caml_utils.py index a165a4d..58fd76d 100644 --- a/src/utils/caml_utils.py +++ b/src/utils/caml_utils.py @@ -101,3 +101,18 @@ def pad_desc_vecs(desc_vecs): for vec in desc_vecs: pad_vecs.append(vec + [0] * (desc_len - len(vec))) return pad_vecs + + +def load_description_tokens(lookup_dict, vocab_json): + # load description one-hot vectors from file + dv_dict = {} + + with open("%s/description_vectors.vocab" % (data_dir), "r") as vfile: + r = csv.reader(vfile, delimiter=" ") + # header + next(r) + for row in r: + code = row[0] + vec = [int(x) for x in row[1:]] + dv_dict[code] = vec + return dv_dict From 8f994bdceefa7cca8511b48306b8ff0595bc3fef Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Tue, 14 Jun 2022 23:45:10 +0530 Subject: [PATCH 3/5] Fix more bugs --- src/models/__init__.py | 1 + src/models/gatedcnn_nci.py | 84 ++++++++++++++++++++++++++++++-------- 2 files changed, 67 insertions(+), 18 deletions(-) diff --git a/src/models/__init__.py b/src/models/__init__.py index 04f7630..9cf8c04 100644 --- a/src/models/__init__.py +++ b/src/models/__init__.py @@ -2,3 +2,4 @@ from src.models.caml import VanillaConv as CNN from src.models.dcan import DCAN from src.models.fusion import Fusion +from src.models.gatedcnn_nci import GatedCNNNCI diff --git a/src/models/gatedcnn_nci.py b/src/models/gatedcnn_nci.py index 72114d6..84ae5e8 100644 --- a/src/models/gatedcnn_nci.py +++ b/src/models/gatedcnn_nci.py @@ -1,3 +1,5 @@ +import itertools + import torch import torch.nn as nn import torch.nn.functional as F @@ -5,15 +7,18 @@ from src.utils.caml_utils import load_lookups, pad_desc_vecs from src.utils.mapper import ConfigMapper +from src.utils.text_loggers import get_logger + +logger = get_logger(__name__) @ConfigMapper.map("models", "gatedcnn_nci") -class GatedCNNEncoder(nn.Module): +class GatedCNNNCI(nn.Module): def __init__(self, config): - super(GatedCNNEncoder, self).__init__() + super(GatedCNNNCI, self).__init__() self.max_length = config.max_length self.dropout = config.dropout - self.input_dim = config.embed_size + self.input_dim = config.input_dim self.hidden_dim = config.hidden_dim self.output_dim = config.output_dim self.bidirectional = config.bidirectional @@ -24,6 +29,7 @@ def __init__(self, config): dataset_dir=config.dataset_dir, mimic_dir=config.mimic_dir, static_dir=config.static_dir, + version=config.version, dropout=config.dropout, pad_token=config.pad_token, unk_token=config.unk_token, @@ -43,15 +49,29 @@ def __init__(self, config): if self.bidirectional: self.output_layer = OutputLayer( + embed_dir=config.embed_dir, + dataset_dir=config.dataset_dir, + mimic_dir=config.mimic_dir, + static_dir=config.static_dir, + version=config.version, input_dim=2 * config.input_dim, 
num_labels=config.output_dim, - embed_dir=config.embed_dir, + dropout=config.dropout, + pad_token=config.pad_token, + unk_token=config.unk_token, ) else: self.output_layer = OutputLayer( + embed_dir=config.embed_dir, + dataset_dir=config.dataset_dir, + mimic_dir=config.mimic_dir, + static_dir=config.static_dir, + version=config.version, input_dim=config.input_dim, num_labels=config.num_labels, - embed_dir=config.embed_dir, + dropout=config.dropout, + pad_token=config.pad_token, + unk_token=config.unk_token, ) self.variational_dropout = VariationalDropout(dropout=config.dropout) @@ -85,12 +105,16 @@ def _reverse_seq(self, X, mask, seq_max_len): padded_rev[i][: len(mat), :] = mat return padded_rev - def forward(self, data, desc): + def forward(self, data): """ :param data: The input sequence, with dimesion (N, L) :param desc: Whether to use code description :return: logits, loss, hidden """ + device = data.get_device() + if device == -1: + device = "cpu" + # If this is the first forward pass, we will initialise the hidden # state. if self.hidden is None: @@ -128,7 +152,7 @@ def forward(self, data, desc): output = torch.cat([output, output_reverse], dim=2) if self.use_description: - logits = self.output_layer(output, self.desc_vecs) + logits = self.output_layer(output, self.desc_vecs.to(device)) else: logits = self.output_layer(output, None) return logits @@ -205,13 +229,14 @@ def __init__( # Pad and convert to torch tensor. self.desc_vecs = torch.Tensor( list(zip(*itertools.zip_longest(*self.desc_vecs, fillvalue=0))) - ) + ).long() def forward(self, x): + if self.return_pad_mask: + pad_mask = ~(x == self.pad_token_id) embedding = self.embed(x) x = self.dropout(embedding) if self.return_pad_mask: - pad_mask = ~(batch == pad_token_id) return x, pad_mask return x @@ -307,13 +332,15 @@ def __init__( out_channels=out_channels, kernel_size=kernel_size, ) - self.weight_2 = conv2.weight - self.bias_2 = conv2.bias + self.weight_2 = conv_layer_2.weight + self.bias_2 = conv_layer_2.bias self.init_conv_weights(init_mean, init_std) self.dropout = VariationalHidDropout(dropout=dropout) + self.dict = {} + def init_conv_weights(self, init_mean, init_std): self.weight_1.data.normal_(mean=init_mean, std=init_std) self.weight_2.data.normal_(mean=init_mean, std=init_std) @@ -335,12 +362,12 @@ def forward(self, input, dilation, hid): (dilation, device) ] is None: self.dict[(dilation, device)] = F.conv1d( - input=x_1, weight=self.weight1, dilation=dilation + input=x_1, weight=self.weight_1, dilation=dilation ) z_1 = self.dropout(z_1) injected = self.dict[(dilation, device)] + F.conv1d( - input=z_1, weight=self.weight2, bias=self.bias2, dilation=dilation + input=z_1, weight=self.weight_2, bias=self.bias_2, dilation=dilation ) return injected @@ -392,7 +419,7 @@ def __init__( self.full_conv = WeightShareConv1d( input_dim=input_dim, hidden_dim=self.hidden_dim_for_conv, - output_channels=4 * self.hidden_dim_for_conv, + out_channels=4 * self.hidden_dim_for_conv, kernel_size=kernel_size, dropout=dropout, init_mean=init_mean, @@ -447,7 +474,7 @@ def forward(self, emb, hc): for key in self.full_conv.dict: if key[1] == emb.get_device(): self.full_conv.dict[key] = None - self.full_conv.drop.reset_mask(Z[:, self.input_dim :]) + self.full_conv.dropout.reset_mask(Z[:, self.input_dim :]) for dilation_per_level in self.dilations: Z = self.gating(Z, dilation=dilation_per_level, hc=hc) @@ -482,16 +509,37 @@ def forward(self, x): # Dimension (N, L, C) m = x.data.new(x.size(0), 1, x.size(2)).bernoulli_(1 - self.dropout) with 
torch.no_grad(): - mask = m / (1 - dropout) + mask = m / (1 - self.dropout) mask = mask.expand_as(x) return mask * x class OutputLayer(nn.Module): - def __init__(self, input_dim, num_labels, embed_dir, dropout): + def __init__( + self, + embed_dir, + dataset_dir, + mimic_dir, + static_dir, + version, + input_dim, + num_labels, + dropout=0.2, + pad_token="", + unk_token="", + ): super(OutputLayer, self).__init__() - self.word_embedding_layer = WordEmbeddingLayer(embed_dir, dropout) + self.word_embedding_layer = WordEmbeddingLayer( + embed_dir=embed_dir, + dataset_dir=dataset_dir, + mimic_dir=mimic_dir, + static_dir=static_dir, + version=version, + dropout=dropout, + pad_token=pad_token, + unk_token=unk_token, + ) self.U = nn.Linear(input_dim, num_labels) self.final = nn.Linear(input_dim, num_labels) From ec76041314ad81663c2d612c92939a4ac278698d Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Wed, 15 Jun 2022 19:24:14 +0530 Subject: [PATCH 4/5] Fix bugs --- configs/caml/caml_mimic3_50.yml | 1 + configs/caml/caml_mimic3_50_old.yml | 1 + configs/caml/caml_mimic3_full.yml | 1 + configs/caml/caml_mimic3_full_old.yml | 1 + configs/caml/cnn_mimic3_50.yml | 1 + configs/caml/drcaml_mimic3_50.yml | 1 + configs/dcan/mimic3_50.yml | 1 + configs/dcan/mimic3_50_old.yml | 1 + configs/dcan/mimic3_full.yml | 1 + configs/dcan/mimic3_full_old.yml | 1 + configs/fusion/mimic3_50.yml | 1 + configs/fusion/mimic3_50_old.yml | 1 + configs/fusion/mimic3_full.yml | 1 + configs/fusion/mimic3_full_old.yml | 1 + configs/gatedcnn_nci/gatedcnn_nci.py | 512 ------------------ ...tedcnn_nci_mimic3_50.yml => mimic3_50.yml} | 3 +- ...ci_mimic3_50_old.yml => mimic3_50_old.yml} | 3 +- ...nn_nci_mimic3_full.yml => mimic3_full.yml} | 3 +- ...imic3_full_old.yml => mimic3_full_old.yml} | 3 +- src/models/gatedcnn_nci.py | 14 +- src/trainers/base_trainer.py | 4 + 21 files changed, 35 insertions(+), 521 deletions(-) delete mode 100644 configs/gatedcnn_nci/gatedcnn_nci.py rename configs/gatedcnn_nci/{gatedcnn_nci_mimic3_50.yml => mimic3_50.yml} (97%) rename configs/gatedcnn_nci/{gatedcnn_nci_mimic3_50_old.yml => mimic3_50_old.yml} (97%) rename configs/gatedcnn_nci/{gatedcnn_nci_mimic3_full.yml => mimic3_full.yml} (97%) rename configs/gatedcnn_nci/{gatedcnn_nci_mimic3_full_old.yml => mimic3_full_old.yml} (97%) diff --git a/configs/caml/caml_mimic3_50.yml b/configs/caml/caml_mimic3_50.yml index 84f704c..47cc2e8 100644 --- a/configs/caml/caml_mimic3_50.yml +++ b/configs/caml/caml_mimic3_50.yml @@ -121,3 +121,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: false diff --git a/configs/caml/caml_mimic3_50_old.yml b/configs/caml/caml_mimic3_50_old.yml index bb2bd2f..6d9bf50 100644 --- a/configs/caml/caml_mimic3_50_old.yml +++ b/configs/caml/caml_mimic3_50_old.yml @@ -121,3 +121,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: false diff --git a/configs/caml/caml_mimic3_full.yml b/configs/caml/caml_mimic3_full.yml index 9a66d1c..7a4633f 100644 --- a/configs/caml/caml_mimic3_full.yml +++ b/configs/caml/caml_mimic3_full.yml @@ -121,3 +121,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: false diff --git a/configs/caml/caml_mimic3_full_old.yml b/configs/caml/caml_mimic3_full_old.yml index 39ba0ed..7d37859 100644 --- a/configs/caml/caml_mimic3_full_old.yml +++ b/configs/caml/caml_mimic3_full_old.yml @@ -121,3 +121,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: false diff --git 
a/configs/caml/cnn_mimic3_50.yml b/configs/caml/cnn_mimic3_50.yml index 1bd742e..074758e 100644 --- a/configs/caml/cnn_mimic3_50.yml +++ b/configs/caml/cnn_mimic3_50.yml @@ -117,3 +117,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: false diff --git a/configs/caml/drcaml_mimic3_50.yml b/configs/caml/drcaml_mimic3_50.yml index 979fd04..ee90972 100644 --- a/configs/caml/drcaml_mimic3_50.yml +++ b/configs/caml/drcaml_mimic3_50.yml @@ -121,3 +121,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: false diff --git a/configs/dcan/mimic3_50.yml b/configs/dcan/mimic3_50.yml index 788f329..e5a0a80 100644 --- a/configs/dcan/mimic3_50.yml +++ b/configs/dcan/mimic3_50.yml @@ -125,3 +125,4 @@ trainer: - name: micro_auc seed: 1 use_gpu: true + initialise_hidden_states: false diff --git a/configs/dcan/mimic3_50_old.yml b/configs/dcan/mimic3_50_old.yml index 380aacf..12ddfea 100644 --- a/configs/dcan/mimic3_50_old.yml +++ b/configs/dcan/mimic3_50_old.yml @@ -125,3 +125,4 @@ trainer: - name: micro_auc seed: 1 use_gpu: true + initialise_hidden_states: false diff --git a/configs/dcan/mimic3_full.yml b/configs/dcan/mimic3_full.yml index 03e6e11..95f6fc8 100644 --- a/configs/dcan/mimic3_full.yml +++ b/configs/dcan/mimic3_full.yml @@ -125,3 +125,4 @@ trainer: - name: micro_auc seed: 1 use_gpu: true + initialise_hidden_states: false diff --git a/configs/dcan/mimic3_full_old.yml b/configs/dcan/mimic3_full_old.yml index e21f402..2e69608 100644 --- a/configs/dcan/mimic3_full_old.yml +++ b/configs/dcan/mimic3_full_old.yml @@ -125,3 +125,4 @@ trainer: - name: micro_auc seed: 1 use_gpu: true + initialise_hidden_states: false diff --git a/configs/fusion/mimic3_50.yml b/configs/fusion/mimic3_50.yml index 05236c1..69e2276 100644 --- a/configs/fusion/mimic3_50.yml +++ b/configs/fusion/mimic3_50.yml @@ -126,3 +126,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: true diff --git a/configs/fusion/mimic3_50_old.yml b/configs/fusion/mimic3_50_old.yml index 53532cd..3ff6583 100644 --- a/configs/fusion/mimic3_50_old.yml +++ b/configs/fusion/mimic3_50_old.yml @@ -126,3 +126,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: false diff --git a/configs/fusion/mimic3_full.yml b/configs/fusion/mimic3_full.yml index af05013..a0f43f4 100644 --- a/configs/fusion/mimic3_full.yml +++ b/configs/fusion/mimic3_full.yml @@ -126,3 +126,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: false diff --git a/configs/fusion/mimic3_full_old.yml b/configs/fusion/mimic3_full_old.yml index 13b4659..9a953f4 100644 --- a/configs/fusion/mimic3_full_old.yml +++ b/configs/fusion/mimic3_full_old.yml @@ -126,3 +126,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: false diff --git a/configs/gatedcnn_nci/gatedcnn_nci.py b/configs/gatedcnn_nci/gatedcnn_nci.py deleted file mode 100644 index 4014637..0000000 --- a/configs/gatedcnn_nci/gatedcnn_nci.py +++ /dev/null @@ -1,512 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.nn.init import normal_, xavier_uniform_ - - -class GatedCNNEncoder(nn.Module): - def __init__(self, config): - super(GatedCNNEncoder, self).__init__() - self.max_length = config.max_length - self.dropout = config.dropout - self.input_dim = config.embed_size - self.hidden_dim = config.hidden_dim - self.output_dim = config.output_dim - self.bidirectional = config.bidirectional - self.use_description = 
config.use_description - - self.word_embedding_layer = WordEmbeddingLayer( - embed_dir=config.embed_dir, - dataset_dir=config.dataset_dir, - mimic_dir=config.mimic_dir, - static_dir=config.static_dir, - dropout=config.dropout, - pad_token=config.pad_token, - unk_token=config.unk_token, - ) - self.desc_vecs = self.word_embedding_layer.desc_vecs - - self.encoder = GatedCNN( - input_dim=config.input_dim, - hidden_dim=config.hidden_dim, - output_dim=config.output_dim, - kernel_size=config.kernel_size, - dropout=config.dropout, - init_mean=config.init_mean, - init_std=config.init_std, - levels=config.levels, - ) - - if self.bidirectional: - self.output_layer = OutputLayer( - input_dim=2 * config.input_dim, - num_labels=config.num_labels, - embed_dir=config.embed_dir, - ) - else: - self.output_layer = OutputLayer( - input_dim=config.input_dim, - num_labels=config.num_labels, - embed_dir=config.embed_dir, - ) - - self.variational_dropout = VariationalDropout(dropout=config.dropout) - - self.hidden = None - - def freeze_net(self): - for p in self.word_embedding_layer.embed.parameters(): - p.requires_grad = False - - def init_hidden(self, batch_size): - h_size = self.hidden_dim + self.output_dim - weight = next(self.parameters()).data - return ( - weight.new(batch_size, h_size, 1).zero_(), - weight.new(batch_size, h_size, 1).zero_(), - ) - - def _reverse_seq(self, X, mask, seq_max_len): - """ - X -> batch, seq_len, dim - mask -> batch, seq_len - """ - mask_sum = torch.sum(mask, 1).int() - xfs = [] - for x, c in zip(X, mask_sum): - xf = torch.flip(x[:c], [0]) - xfs.append(xf) - padded_rev = torch.zeros((len(xfs), X.size(1), X.size(2))).cuda() - for i, mat in enumerate(xfs): - padded_rev[i][: len(mat), :] = mat - return padded_rev - - def forward(self, data, desc): - """ - :param data: The input sequence, with dimesion (N, L) - :param desc: Whether to use code description - :return: logits, loss, hidden - """ - # If this is the first forward pass, we will initialise the hidden - # state. - if self.hidden is None: - self.init_hidden_flag = True - self.hidden = self.init_hidden(data.size(0)) - - # Look up the embeddings of all the tokens using the WordEmbeddingLayer. - # `emb` shape: (batch_size, max_length, embed_size) - emb, mask = self.word_embedding_layer(data) - - # If we want a bidirectional model, we reverse the sequence of - # tokens. - if self.bidirectional: - # `emb_reverse` shape: (batch_size, max_length, embed_size) - emb_reverse = self._reverse_seq(emb, mask, self.max_length) - # `emb_reverse` shape`: [batch_size, embed_size, max_length] - emb_reverse = emb_reverse.transpose(1, 2) - # `emb` shape: (batch_size, embed_size, max_length) - emb = emb.transpose(1, 2) - - # Pass the embeddings through the encoder. If the model is - # bidirectional, we pass the reverse embeddings as well. - raw_output, self.hidden = self.encoder(emb, self.hidden) - if self.bidirectional: - raw_out_reverse, self.hidden = self.encoder( - emb_reverse, self.hidden - ) - - output = self.variational_dropout(raw_output) - if self.bidirectional: - output_reverse = self._reverse_seq( - raw_out_reverse, mask, self.max_length - ) - output_reverse = self.variational_dropout(output_reverse) - output = torch.cat([output, output_reverse], dim=2) - - if self.use_description: - logits = self.output_layer(output, self.desc_vecs) - else: - logits = self.output_layer(output, None) - return logits - - -class WordEmbeddingLayer(nn.Module): - """ - A Word Embedding Layer. 
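Aside on _reverse_seq, which the forward pass above uses twice (once on the embeddings before the reverse pass, once on the reverse pass's raw output, so both directions line up token-for-token before concatenation): it flips only the non-padded prefix of every sequence, given the padding mask, and re-pads the result. A hedged sketch of the same idea that keeps the buffer on the input's device instead of calling .cuda() unconditionally, as the corresponding fix to src/models/gatedcnn_nci.py later in this patch also does; the function name here is illustrative:

import torch


def reverse_padded_sequences(x, mask):
    # x:    (batch, seq_len, dim) token representations
    # mask: (batch, seq_len), 1 for real tokens and 0 for padding
    lengths = mask.sum(dim=1).long().tolist()
    reversed_x = torch.zeros_like(x)  # same shape, dtype and device as x
    for i, length in enumerate(lengths):
        # Flip only the real tokens; padded positions stay zero.
        reversed_x[i, :length] = torch.flip(x[i, :length], dims=[0])
    return reversed_x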
This layer loads a pre-trained word embedding matrix - , and copies its weights to an nn.Embedding layer. - - Args: - embed_dir (str): A directory containing the pre-trained word embedding - matrix, among other things. Please see - https://github.com/dalgu90/icd-coding-benchmark/blob/main/src/modules/embeddings.py#L17 - for more details. - dropout (float): The dropout probability. - """ - - def __init__( - self, - embed_dir, - dataset_dir, - mimic_dir, - static_dir, - version, - dropout, - pad_token="", - unk_token="", - return_pad_mask=True, - use_description=True, - ): - super(WordEmbeddingLayer, self).__init__() - logger.debug( - f"Initialising {self.__class__.__name__} with " - f"embed_dir = {embed_dir}, dropout = {dropout}" - ) - - self.return_pad_mask = return_pad_mask - - # Note: This should be changed, since we won't always use Word2Vec. - embedding_cls = ConfigMapper.get_object("embeddings", "word2vec") - vocab = embedding_cls.load_vocab(embed_dir) - self.pad_token_id = vocab[pad_token] - self.unk_token_id = vocab[unk_token] - - W = torch.Tensor(embedding_cls.load_emb_matrix(embed_dir)) - self.embed = nn.Embedding(W.size()[0], W.size()[1], padding_idx=0) - self.embed.weight.data = W.clone() - - self.embedding_size = self.embed.embedding_dim - - self.dropout = nn.Dropout(dropout) - - if use_description: - dicts = load_lookups( - dataset_dir=dataset_dir, - mimic_dir=mimic_dir, - static_dir=static_dir, - word2vec_dir=embed_dir, - version=version, - ) - ind2c = dicts["ind2c"] - w2ind = dicts["w2ind"] - desc_dict = dicts["desc"] - self.desc_vecs = [] - for i, c in ind2c.items(): - self.desc_vecs.append( - [ - w2ind[w] if w in w2ind else self.unk_token_id - for w in desc_dict[c] - ] - ) - - # Pad and convert to torch tensor. - self.desc_vecs = torch.Tensor( - list(zip(*itertools.zip_longest(*self.desc_vecs, fillvalue=0))) - ) - - def forward(self, x): - embedding = self.embed(x) - x = self.dropout(embedding) - if self.return_pad_mask: - pad_mask = ~(batch == pad_token_id) - return x, pad_mask - return x - - -class VariationalHidDropout(nn.Module): - """ - Hidden-to-hidden (VD-based) dropout that applies the same mask at every - time step and every layer of TrellisNet. - - Args: - dropout (float): The dropout probability. - """ - - def __init__(self, dropout=0.0): - super(VariationalHidDropout, self).__init__() - self.dropout_probability = dropout - self.mask = None - - def reset_mask(self, input): - - # Dimension (N, C, L) - m = input.data.new(input.size(0), input.size(1), 1).bernoulli_( - 1 - self.dropout_probability - ) - with torch.no_grad(): - mask = m / (1 - self.dropout_probability) - self.mask = mask - return mask - - def forward(self, input): - # We don't apply dropout if the model is in eval mode. - if not self.training or self.dropout_probability == 0: - return input - - assert ( - self.mask is not None - ), "You need to reset mask before using VariationalHidDropout" - mask = self.mask.expand_as(input) # Make sure the dimension matches - return mask * input - - -class WeightShareConv1d(nn.Module): - """ - The weight-tied 1D convolution used in TrellisNet. - - Args: - input_dim (int): The dimension of the input. This is equivalent to - the number of input channels in the first - convolutional layer. - hidden_dim (int): The dimension of the hidden state. This is - equivalent to the number of input channels in the - second convolutional layer. - out_channels (int): The number of output channels in both - convolutional layers. 
- kernel_size (int): The size of the filter used in both - convolutional layers. - dropout (float): Dropout probability for the hidden-to-hidden - dropout layer. - init_mean (float): The mean of the normal distribution with which - weights of the convolutional layers are - initialised. - init_std (float): The standard deviation of the normal distribution - with which weights of the convolutional layers are - initialised. - """ - - def __init__( - self, - input_dim, - hidden_dim, - out_channels, - kernel_size, - dropout=0.0, - init_mean=0.0, - init_std=0.01, - ): - super(WeightShareConv1d, self).__init__() - - self.input_dim = input_dim - self.kernel_size = kernel_size - - self._dict = {} - - conv_layer_1 = nn.Conv1d( - in_channels=input_dim, - out_channels=out_channels, - kernel_size=kernel_size, - ) - self.weight_1 = conv_layer_1.weight - - conv_layer_2 = nn.Conv1d( - in_channels=hidden_dim, - out_channels=out_channels, - kernel_size=kernel_size, - ) - self.weight_2 = conv2.weight - self.bias_2 = conv2.bias - - self.init_conv_weights(init_mean, init_std) - - self.dropout = VariationalHidDropout(dropout=dropout) - - def init_conv_weights(self, init_mean, init_std): - self.weight_1.data.normal_(mean=init_mean, std=init_std) - self.weight_2.data.normal_(mean=init_mean, std=init_std) - self.bias_2.data.normal_(mean=init_mean, std=init_std) - - def forward(self, input, dilation, hid): - batch_size = input.size(0) - - padding = (self.kernel_size - 1) * dilation # Padding size. - x = F.pad(input=input, pad=(padding, 0)) # Pad with zeros. - - x_1 = x[:, : self.input_dim] - z_1 = x[:, self.input_dim :] - z_1[:, :, :padding] = hid[:batch_size, :, :].repeat(1, 1, padding) - - device = x_1.get_device() - - if (dilation, device) not in self.dict or self.dict[ - (dilation, device) - ] is None: - self.dict[(dilation, device)] = F.conv1d( - input=x_1, weight=self.weight1, dilation=dilation - ) - - z_1 = self.dropout(z_1) - injected = self.dict[(dilation, device)] + F.conv1d( - input=z_1, weight=self.weight2, bias=self.bias2, dilation=dilation - ) - return injected - - -class GatedCNN(nn.Module): - """ - Gated CNN module. - - Args: - input_dim (int): The dimension of the input. - hidden_dim (int): The hidden dimension. The hidden dimension for the - weight-shared Conv1D layer is - `hidden_dim + output_dim`. - output_dim (int): The output dimension. The number of output - channels of the weight-shared Conv1D layer is - `4 * (hidden_dim + output_dim)`. - kernel_size (int): The size of the filter used in - `WeightSharedConv1D`. - dropout (float): Dropout probability for the `WeightSharedConv1D`. - init_mean (float): The mean of the normal distribution with which - weights of the `WeightSharedConv1D` layer are - initialised. - init_std (float): The standard deviation of the normal distribution - with which weights of the `WeightSharedConv1D` - layer are initialised. 
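The WeightShareConv1d.forward deleted above (like its counterpart in src/models/gatedcnn_nci.py) is built on causal dilated 1-D convolution: the input is padded on the left by (kernel_size - 1) * dilation, so position t never sees future positions and the output keeps the input length. An illustrative sketch of just that step, with toy shapes chosen for the example:

import torch
import torch.nn.functional as F


def causal_dilated_conv1d(x, weight, bias=None, dilation=1):
    # x:      (batch, in_channels, seq_len)
    # weight: (out_channels, in_channels, kernel_size)
    kernel_size = weight.size(-1)
    padding = (kernel_size - 1) * dilation
    # Pad on the left only, so the convolution is causal and length-preserving.
    x = F.pad(x, (padding, 0))
    return F.conv1d(x, weight, bias=bias, dilation=dilation)


# Toy check: the sequence length is preserved for any dilation.
x = torch.randn(2, 8, 50)
w = torch.randn(16, 8, 3)
assert causal_dilated_conv1d(x, w, dilation=4).shape == (2, 16, 50)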
- """ - - def __init__( - self, - input_dim, - hidden_dim, - output_dim, - kernel_size, - dropout, - init_mean, - init_std, - levels, - ): - super(GatedCNN, self).__init__() - self.input_dim = input_dim - self.hidden_dim = hidden_dim - self.output_dim = output_dim - self.levels = levels - - self.hidden_dim_for_conv = hidden_dim + output_dim - - self.dilations = [i + 1 for i in range(levels)] - - self.full_conv = WeightShareConv1d( - input_dim=input_dim, - hidden_dim=self.hidden_dim_for_conv, - output_channels=4 * self.hidden_dim_for_conv, - kernel_size=kernel_size, - dropout=dropout, - init_mean=init_mean, - init_std=init_std, - ) - - self.ht = None - - def transform_input(self, X): - device = X.get_device() - if device == -1: - device = "cpu" - - batch_size = X.size(0) - seq_len = X.size(2) - - ht = torch.zeros(batch_size, self.hidden_dim_for_conv, seq_len).to( - device - ) - self.ct = torch.zeros(batch_size, self.hidden_dim_for_conv, seq_len).to( - device - ) - return torch.cat((X, ht), dim=1) - - def gating(self, Z, dilation=1, hc=None): - batch_size = Z.size(0) - (hid, cell) = hc - - out = self.full_conv(input=Z, dilation=dilation, hid=hid) - - ct_1 = F.pad(self.ct, (dilation, 0))[:, :, :-dilation] - ct_1[:, :, :dilation] = cell[:batch_size].repeat(1, 1, dilation) - - it = torch.sigmoid(out[:, : self.hidden_dim_for_conv]) - ot = torch.sigmoid( - out[:, self.hidden_dim_for_conv : 2 * self.hidden_dim_for_conv] - ) - gt = torch.tanh( - out[:, 2 * self.hidden_dim_for_conv : 3 * self.hidden_dim_for_conv] - ) - ft = torch.sigmoid( - out[:, 3 * self.hidden_dim_for_conv : 4 * self.hidden_dim_for_conv] - ) - self.ct = ft * ct_1 + it * gt - ht = ot * torch.tanh(self.ct) - - Z = torch.cat((Z[:, : self.input_dim], ht), dim=1) - return Z - - def forward(self, emb, hc): - Z = self.transform_input(emb) - for key in self.full_conv.dict: - if key[1] == emb.get_device(): - self.full_conv.dict[key] = None - self.full_conv.drop.reset_mask(Z[:, self.input_dim :]) - - for dilation_per_level in self.dilations: - Z = self.gating(Z, dilation=dilation_per_level, hc=hc) - - out = Z[:, -self.output_dim :].transpose(1, 2) - hc = (Z[:, self.input_dim :, -1:], self.ct[:, :, -1:]) - return out, hc - - -class VariationalDropout(nn.Module): - """ - Feed-forward version of variational dropout that applies the same mask - at every time step. 
- """ - - def __init__(self, dropout=0.5, dim=3): - super(VariationalDropout, self).__init__() - assert dim in (3, 4), "`dim` should be either 3 or 4" - self.dropout = dropout - self.dim = dim - - def forward(self, x): - if not self.training or not self.dropout: - return x - - if self.dim == 4: - # Dimension (M, N, L, C), where C stands for channels - m = x.data.new(x.size(0), x.size(1), 1, x.size(3)).bernoulli_( - 1 - self.dropout - ) - else: - # Dimension (N, L, C) - m = x.data.new(x.size(0), 1, x.size(2)).bernoulli_(1 - self.dropout) - with torch.no_grad(): - mask = m / (1 - dropout) - mask = mask.expand_as(x) - return mask * x - - -class OutputLayer(nn.Module): - def __init__(self, input_dim, num_labels, embed_dir, dropout): - super(OutputLayer, self).__init__() - - self.word_embedding_layer = WordEmbeddingLayer(embed_dir, dropout) - - self.U = nn.Linear(input_dim, num_labels) - self.final = nn.Linear(input_dim, num_labels) - self.proj_layer = nn.Linear(input_dim, 1, bias=False) - - xavier_uniform_(self.U.weight) - xavier_uniform_(self.final.weight) - - def forward(self, x, desc): - if desc is not None: - desc_vec, _ = self.word_embedding_layer(desc) - desc_vec = torch.mean(desc_vec, dim=1).unsqueeze(0) - mmt = desc_vec.matmul(x.transpose(1, 2)) - else: - mmt = self.U.weight.matmul(x.transpose(1, 2)) - - m = mmt.matmul(x) - - y = self.final.weight.mul(m) - logits = self.proj_layer(y).squeeze(-1).add(self.final.bias) - - return logits diff --git a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50.yml b/configs/gatedcnn_nci/mimic3_50.yml similarity index 97% rename from configs/gatedcnn_nci/gatedcnn_nci_mimic3_50.yml rename to configs/gatedcnn_nci/mimic3_50.yml index 3613885..5f0b893 100644 --- a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50.yml +++ b/configs/gatedcnn_nci/mimic3_50.yml @@ -42,7 +42,7 @@ model: input_dim: 100 hidden_dim: 100 output_dim: 50 - bidirectional: true + bidirectional: false use_description: true pad_token: "" unk_token: "" @@ -125,3 +125,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: true diff --git a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50_old.yml b/configs/gatedcnn_nci/mimic3_50_old.yml similarity index 97% rename from configs/gatedcnn_nci/gatedcnn_nci_mimic3_50_old.yml rename to configs/gatedcnn_nci/mimic3_50_old.yml index 4ccca3a..ec8f0b2 100644 --- a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50_old.yml +++ b/configs/gatedcnn_nci/mimic3_50_old.yml @@ -42,7 +42,7 @@ model: input_dim: 100 hidden_dim: 100 output_dim: 50 - bidirectional: true + bidirectional: false use_description: true pad_token: "" unk_token: "" @@ -125,3 +125,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: true diff --git a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full.yml b/configs/gatedcnn_nci/mimic3_full.yml similarity index 97% rename from configs/gatedcnn_nci/gatedcnn_nci_mimic3_full.yml rename to configs/gatedcnn_nci/mimic3_full.yml index de9b1ab..8a833ad 100644 --- a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full.yml +++ b/configs/gatedcnn_nci/mimic3_full.yml @@ -42,7 +42,7 @@ model: input_dim: 100 hidden_dim: 100 output_dim: 8922 - bidirectional: true + bidirectional: false use_description: true pad_token: "" unk_token: "" @@ -125,3 +125,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: true diff --git a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full_old.yml b/configs/gatedcnn_nci/mimic3_full_old.yml similarity index 97% rename from 
configs/gatedcnn_nci/gatedcnn_nci_mimic3_full_old.yml rename to configs/gatedcnn_nci/mimic3_full_old.yml index c95f4aa..d76c03b 100644 --- a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full_old.yml +++ b/configs/gatedcnn_nci/mimic3_full_old.yml @@ -42,7 +42,7 @@ model: input_dim: 100 hidden_dim: 100 output_dim: 8922 - bidirectional: true + bidirectional: false use_description: true pad_token: "" unk_token: "" @@ -125,3 +125,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: true diff --git a/src/models/gatedcnn_nci.py b/src/models/gatedcnn_nci.py index 84ae5e8..bd1d765 100644 --- a/src/models/gatedcnn_nci.py +++ b/src/models/gatedcnn_nci.py @@ -68,7 +68,7 @@ def __init__(self, config): static_dir=config.static_dir, version=config.version, input_dim=config.input_dim, - num_labels=config.num_labels, + num_labels=config.output_dim, dropout=config.dropout, pad_token=config.pad_token, unk_token=config.unk_token, @@ -85,7 +85,7 @@ def freeze_net(self): def init_hidden(self, batch_size): h_size = self.hidden_dim + self.output_dim weight = next(self.parameters()).data - return ( + self.hidden = ( weight.new(batch_size, h_size, 1).zero_(), weight.new(batch_size, h_size, 1).zero_(), ) @@ -95,12 +95,16 @@ def _reverse_seq(self, X, mask, seq_max_len): X -> batch, seq_len, dim mask -> batch, seq_len """ + device = X.get_device() + if device == -1: + device = "cpu" + mask_sum = torch.sum(mask, 1).int() xfs = [] for x, c in zip(X, mask_sum): xf = torch.flip(x[:c], [0]) xfs.append(xf) - padded_rev = torch.zeros((len(xfs), X.size(1), X.size(2))).cuda() + padded_rev = torch.zeros((len(xfs), X.size(1), X.size(2))).to(device) for i, mat in enumerate(xfs): padded_rev[i][: len(mat), :] = mat return padded_rev @@ -552,11 +556,11 @@ def forward(self, x, desc): if desc is not None: desc_vec, _ = self.word_embedding_layer(desc) desc_vec = torch.mean(desc_vec, dim=1).unsqueeze(0) - mmt = desc_vec.matmul(x.transpose(1, 2)) + mmt = x.matmul(desc_vec) else: mmt = self.U.weight.matmul(x.transpose(1, 2)) - m = mmt.matmul(x) + m = x.transpose(1, 2).matmul(mmt) y = self.final.weight.mul(m) logits = self.proj_layer(y).squeeze(-1).add(self.final.bias) diff --git a/src/trainers/base_trainer.py b/src/trainers/base_trainer.py index a1a70dc..731e527 100755 --- a/src/trainers/base_trainer.py +++ b/src/trainers/base_trainer.py @@ -174,6 +174,10 @@ def train(self, model, train_dataset, val_dataset=None): batch_inputs = batch_inputs.cuda() batch_labels = batch_labels.cuda() + # Initialise the hidden states. 
+ if self.config.initialise_hidden_states: + model.init_hidden(batch_inputs.size(0)) + batch_outputs = model(batch_inputs) batch_loss = self.loss_fn( input=batch_outputs, target=batch_labels From a98a0d40cdaf78cc5e701164cc1218e69a74796c Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Wed, 22 Jun 2022 18:37:29 +0530 Subject: [PATCH 5/5] Fix test issue --- configs/gatedcnn_nci/mimic3_50_old.yml | 4 ++-- src/trainers/base_trainer.py | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/configs/gatedcnn_nci/mimic3_50_old.yml b/configs/gatedcnn_nci/mimic3_50_old.yml index ec8f0b2..4eef858 100644 --- a/configs/gatedcnn_nci/mimic3_50_old.yml +++ b/configs/gatedcnn_nci/mimic3_50_old.yml @@ -66,9 +66,9 @@ trainer: optimizer: name: adam params: - lr: 0.000001 + lr: 0.01 weight_decay: 0.0 - max_epochs: 200 + max_epochs: 100 lr_scheduler: null stopping_criterion: metric: diff --git a/src/trainers/base_trainer.py b/src/trainers/base_trainer.py index 731e527..506cd47 100755 --- a/src/trainers/base_trainer.py +++ b/src/trainers/base_trainer.py @@ -397,6 +397,11 @@ def _forward_epoch(self, model, dataset=None, dataloader=None): if self.config.use_gpu: batch_inputs = batch_inputs.cuda() batch_labels = batch_labels.cuda() + + # Initialise the hidden states. + if self.config.initialise_hidden_states: + model.init_hidden(batch_inputs.size(0)) + batch_outputs = model(batch_inputs) epoch_labels.append(batch_labels.cpu()) epoch_outputs.append(batch_outputs.cpu())
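End-to-end, the trainer changes in this series re-zero the recurrent state of GatedCNNNCI once per batch (init_hidden now stores the (h, c) pair on the module instead of returning it), guarded by the new initialise_hidden_states flag so the other models are unaffected. A condensed, hypothetical sketch of that interaction; the loop and argument names are placeholders, not the repository's trainer API:

def train_one_epoch(model, dataloader, loss_fn, optimizer, initialise_hidden_states=True):
    model.train()
    for batch_inputs, batch_labels in dataloader:
        # Reset (h, c) to zeros sized for this batch, so hidden state never
        # leaks across batches of different sizes.
        if initialise_hidden_states and hasattr(model, "init_hidden"):
            model.init_hidden(batch_inputs.size(0))
        logits = model(batch_inputs)
        loss = loss_fn(input=logits, target=batch_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()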