From 6c213d7618df6e9c2cea215325fb039bc0291565 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Wed, 8 Jun 2022 06:22:27 +0530 Subject: [PATCH 1/5] Add very rough implementation --- .isort.cfg | 2 +- .pre-commit-config.yaml | 2 +- src/models/gatedcnn_nci.py | 436 +++++++++++++++++++++++++++++++++++++ 3 files changed, 438 insertions(+), 2 deletions(-) create mode 100644 src/models/gatedcnn_nci.py diff --git a/.isort.cfg b/.isort.cfg index 82cd117..cf67fba 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -1,2 +1,2 @@ [settings] -known_third_party = gensim,nltk,numpy,pandas,sklearn,streamlit,torch,torchsummaryX,tqdm,yaml +known_third_party = embeddings,gensim,nltk,numpy,pandas,sklearn,streamlit,torch,torchsummaryX,tqdm,yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4adfd09..7cd4c15 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: hooks: - id: black args: ["--config", "./pyproject.toml"] - language_version: python3.7 + language_version: python3 - repo: https://github.com/asottile/seed-isort-config rev: v2.2.0 diff --git a/src/models/gatedcnn_nci.py b/src/models/gatedcnn_nci.py new file mode 100644 index 0000000..38d0016 --- /dev/null +++ b/src/models/gatedcnn_nci.py @@ -0,0 +1,436 @@ +import torch +import torch.nn.functional as F +from torch import nn + + +class GatedCNNencoder(nn.Module): + def __init__(self, args, Y, dicts): + super(GatedCNNencoder, self).__init__() + self.args = args + self.max_length = args.MAX_LENGTH + self.dropout = args.dropout + self.ninp = args.embed_size + self.nhid = args.nhid + self.nout = args.nout + self.bidirectional = args.bidirectional + + self.word_rep = WordRep(args, Y, dicts) + self.encoder = GatedCNN(args, Y, dicts, self.ninp, self.nout) + self.network = nn.ModuleList([self.encoder]) + if self.bidirectional: + self.output_layer = OutputLayer(args, Y, dicts, self.nout * 2) + else: + self.output_layer = OutputLayer(args, Y, dicts, self.nout) + self.var_drop = VariationalDropout() + + def freeze_net(self): + for p in self.word_rep.embed.parameters(): + p.requires_grad = False + + def _reverse_seq(self, X, mask, seq_max_len): + """ + X -> batch, seq_len, dim + mask -> batch, seq_len + """ + mask_sum = torch.sum(mask, 1).int() + xfs = [] + for x, c in zip(X, mask_sum): + xf = torch.flip(x[:c], [0]) + xfs.append(xf) + padded_rev = torch.zeros((len(xfs), X.size(1), X.size(2))).cuda() + for i, mat in enumerate(xfs): + padded_rev[i][: len(mat), :] = mat + return padded_rev + + def forward(self, data, target, mask, hidden, desc): + """ + :param data: The input sequence, with dimesion (N, L) + :param target: labels + :param mask: input sequence mask + :param hidden: The initial hidden state (h, c) + :param desc: Whether to use code description + :return: logits, loss, hidden + """ + emb = self.word_rep(data, target) + if self.bidirectional: + emb_reverse = self._reverse_seq(emb, mask, self.max_length) + emb = emb.transpose(1, 2) # emb: [bs, 100, len] + if self.bidirectional: + emb_reverse = emb_reverse.transpose( + 1, 2 + ) # emb_reverse: [bs, 100, len] + cnn_encoder = self.network[0] + raw_output, hidden = cnn_encoder(emb, hidden) + if self.bidirectional: + raw_out_re, hidden = cnn_encoder(emb_reverse, hidden) + output = self.var_drop(raw_output, self.dropout) + if self.bidirectional: + output_re = self._reverse_seq(raw_out_re, mask, self.max_length) + output_re = self.var_drop(output_re, self.dropout) + if self.bidirectional: + output = torch.cat([output, output_re], dim=2) + if self.args.desc: + 
logits, loss, _, interaction = self.output_layer( + output, target, desc + ) + else: + logits, loss, _, interaction = self.output_layer( + output, target, None + ) + return logits, loss, hidden, interaction + + def init_hidden(self, bsz): + h_size = self.nhid + self.nout + weight = next(self.parameters()).data + return ( + weight.new(bsz, h_size, 1).zero_(), + weight.new(bsz, h_size, 1).zero_(), + ) + + +from typing import Tuple + +import torch.nn as nn +from embeddings import build_pretrain_embedding, load_embeddings +from torch import Tensor +from torch.nn.init import kaiming_uniform_, normal_, xavier_uniform_ + + +class WordEmbeddingLayer(nn.Module): + """ + A Word Embedding Layer. This layer loads a pre-trained word embedding matrix + , and copies its weights to an nn.Embedding layer. + + Args: + embed_dir (str): A directory containing the pre-trained word embedding + matrix, among other things. Please see + https://github.com/dalgu90/icd-coding-benchmark/blob/main/src/modules/embeddings.py#L17 + for more details. + dropout (float): The dropout probability. + """ + + def __init__(self, embed_dir, dropout, num_filter_maps): + super(WordEmbeddingLayer, self).__init__() + logger.debug( + f"Initialising {self.__class__.__name__} with " + f"embed_dir = {embed_dir}, dropout = {dropout}" + ) + + # Note: This should be changed, since we won't always use Word2Vec. + embedding_cls = ConfigMapper.get_object("embeddings", "word2vec") + + W = torch.Tensor(embedding_cls.load_emb_matrix(embed_dir)) + self.embed = nn.Embedding(W.size()[0], W.size()[1], padding_idx=0) + self.embed.weight.data = W.clone() + + self.embedding_size = self.embed.embedding_dim + + self.dropout = nn.Dropout(dropout) + + self.conv_dict = { + 1: [self.embedding_size, num_filter_maps], + 2: [self.embedding_size, 100, num_filter_maps], + 3: [self.embedding_size, 150, 100, num_filter_maps], + 4: [self.embedding_size, 200, 150, 100, num_filter_maps], + } + + def forward(self, x): + embedding = self.embed(x) + x = self.dropout(embedding) + return x + + +class VariationalHidDropout(nn.Module): + def __init__(self, dropout=0.0): + """ + Hidden-to-hidden (VD-based) dropout that applies the same mask at every + time step and every layer of TrellisNet. + + Args: + dropout (float): The dropout probability. + """ + super(VariationalHidDropout, self).__init__() + self.dropout_probability = dropout + self.mask = None + + def reset_mask(self, input): + + # Dimension (N, C, L) + m = input.data.new(input.size(0), input.size(1), 1).bernoulli_( + 1 - self.dropout_probability + ) + with torch.no_grad(): + mask = m / (1 - self.dropout_probability) + self.mask = mask + return mask + + def forward(self, input): + # We don't apply dropout if the model is in eval mode. + if not self.training or self.dropout_probability == 0: + return input + + assert ( + self.mask is not None + ), "You need to reset mask before using VariationalHidDropout" + mask = self.mask.expand_as(input) # Make sure the dimension matches + return mask * input + + +class WeightShareConv1d(nn.Module): + def __init__( + self, + input_dim, + hidden_dim, + out_channels, + kernel_size, + dropout=0.0, + init_mean=0.0, + init_std=0.01, + ): + """ + The weight-tied 1D convolution used in TrellisNet. + + Args: + input_dim (int): The dimension of the input. This is equivalent to + the number of input channels in the first + convolutional layer. + hidden_dim (int): The dimension of the hidden state. This is + equivalent to the number of input channels in the + second convolutional layer. 
+ out_channels (int): The number of output channels in both + convolutional layers. + kernel_size (int): The size of the filter used in both + convolutional layers. + dropout (float): Dropout probability for the hidden-to-hidden + dropout layer. + init_mean (float): The mean of the normal distribution with which + weights of the convolutional layers are + initialised. + init_std (float): The standard deviation of the normal distribution + with which weights of the convolutional layers are + initialised. + """ + super(WeightShareConv1d, self).__init__() + + self.input_dim = input_dim + self.kernel_size = kernel_size + + self._dict = {} + + conv_layer_1 = nn.Conv1d( + in_channels=input_dim, + out_channels=out_channels, + kernel_size=kernel_size, + ) + self.weight_1 = conv_layer_1.weight + + conv_layer_2 = nn.Conv1d( + in_channels=hidden_dim, + out_channels=out_channels, + kernel_size=kernel_size, + ) + self.weight_2 = conv2.weight + self.bias_2 = conv2.bias + + self.init_conv_weights(init_mean, init_std) + + self.dropout = VariationalHidDropout(dropout=dropout) + + def init_conv_weights(self, init_mean, init_std): + self.weight_1.data.normal_(mean=init_mean, std=init_std) + self.weight_2.data.normal_(mean=init_mean, std=init_std) + self.bias_2.data.normal_(mean=init_mean, std=init_std) + + def forward(self, input, dilation, hid): + batch_size = input.size(0) + + padding = (self.kernel_size - 1) * dilation # Padding size. + x = F.pad(input=input, pad=(padding, 0)) # Pad with zeros. + + x_1 = x[:, : self.input_dim] + z_1 = x[:, self.input_dim :] + z_1[:, :, :padding] = hid[:batch_size, :, :].repeat(1, 1, padding) + + device = x_1.get_device() + + if (dilation, device) not in self.dict or self.dict[ + (dilation, device) + ] is None: + self.dict[(dilation, device)] = F.conv1d( + input=x_1, weight=self.weight1, dilation=dilation + ) + + z_1 = self.dropout(z_1) + injected = self.dict[(dilation, device)] + F.conv1d( + input=z_1, weight=self.weight2, bias=self.bias2, dilation=dilation + ) + return injected + + +class GatedCNN(nn.Module): + def __init__( + self, + input_dim, + hidden_dim, + output_dim, + kernel_size, + dropout, + init_mean, + init_std, + levels, + ): + """ + Gated CNN module. + + Args: + input_dim (int): The dimension of the input. + hidden_dim (int): The hidden dimension. The hidden dimension for the + weight-shared Conv1D layer is + `hidden_dim + output_dim`. + output_dim (int): The output dimension. The number of output + channels of the weight-shared Conv1D layer is + `4 * (hidden_dim + output_dim)`. + kernel_size (int): The size of the filter used in + `WeightSharedConv1D`. + dropout (float): Dropout probability for the `WeightSharedConv1D`. + init_mean (float): The mean of the normal distribution with which + weights of the `WeightSharedConv1D` layer are + initialised. + init_std (float): The standard deviation of the normal distribution + with which weights of the `WeightSharedConv1D` + layer are initialised. 
+ """ + super(GatedCNN, self).__init__() + self.input_dim = input_dim + self.hidden_dim = args.hidden_dim + self.output_dim = output_dim + self.levels = levels + + self.hidden_dim_for_conv = hidden_dim + output_dim + + self.dilations = [i + 1 for i in range(levels)] + + self.full_conv = WeightShareConv1d( + input_dim=input_dim, + hidden_dim=self.hidden_dim_for_conv, + output_channels=4 * self.hidden_dim_for_conv, + kernel_size=kernel_size, + dropout=dropout, + init_mean=init_mean, + init_std=init_std, + ) + + self.ht = None + + def transform_input(self, X): + device = X.get_device() + if device == -1: + device = "cpu" + + batch_size = X.size(0) + seq_len = X.size(2) + + ht = torch.zeros(batch_size, self.hidden_dim_for_conv, seq_len).to( + device + ) + self.ct = torch.zeros(batch_size, self.hidden_dim_for_conv, seq_len).to( + device + ) + return torch.cat((X, ht), dim=1) + + def gating(self, Z, dilation=1, hc=None): + batch_size = Z.size(0) + (hid, cell) = hc + + out = self.full_conv(input=Z, dilation=dilation, hid=hid) + + ct_1 = F.pad(self.ct, (dilation, 0))[:, :, :-dilation] + ct_1[:, :, :dilation] = cell[:batch_size].repeat(1, 1, dilation) + + it = torch.sigmoid(out[:, : self.hidden_dim_for_conv]) + ot = torch.sigmoid( + out[:, self.hidden_dim_for_conv : 2 * self.hidden_dim_for_conv] + ) + gt = torch.tanh( + out[:, 2 * self.hidden_dim_for_conv : 3 * self.hidden_dim_for_conv] + ) + ft = torch.sigmoid( + out[:, 3 * self.hidden_dim_for_conv : 4 * self.hidden_dim_for_conv] + ) + self.ct = ft * ct_1 + it * gt + ht = ot * torch.tanh(self.ct) + + Z = torch.cat((Z[:, : self.input_dim], ht), dim=1) + return Z + + def forward(self, emb, hc): + Z = self.transform_input(emb) + for key in self.full_conv.dict: + if key[1] == emb.get_device(): + self.full_conv.dict[key] = None + self.full_conv.drop.reset_mask(Z[:, self.input_dim :]) + + for dilation_per_level in self.dilations: + Z = self.gating(Z, dilation=dilation_per_level, hc=hc) + + out = Z[:, -self.output_dim :].transpose(1, 2) + hc = (Z[:, self.input_dim :, -1:], self.ct[:, :, -1:]) + return out, hc + + +class OutputLayer(nn.Module): + def __init__( + self, input_size, num_labels, embed_dir, dropout, num_filter_maps + ): + super(OutputLayer, self).__init__() + + self.word_embedding_layer = WordEmbeddingLayer( + embed_dir, dropout, num_filter_maps + ) + + self.U = nn.Linear(input_size, num_labels) + self.final = nn.Linear(input_size, num_labels) + self.proj_layer = nn.Linear(input_size, 1, bias=False) + + xavier_uniform_(self.U.weight) + xavier_uniform_(self.final.weight) + + def forward(self, x, desc): + if desc is not None: + desc_vec = self.word_rep(desc, target) + desc_vec = torch.mean(desc_vec, dim=1).unsqueeze(0) + mmt = desc_vec.matmul(x.transpose(1, 2)) + else: + mmt = self.U.weight.matmul(x.transpose(1, 2)) + + m = mmt.matmul(x) + + y = self.final.weight.mul(m) + logits = self.proj_layer(y).squeeze(-1).add(self.final.bias) + + return logits + + +class VariationalDropout(nn.Module): + def __init__(self): + """ + Feed-forward version of variational dropout that applies the same mask + at every time step. 
+ """ + super(VariationalDropout, self).__init__() + + def forward(self, x, dropout=0.5, dim=3): + if not self.training or not dropout: + return x + if dim == 4: + # Dimension (M, N, L, C), where C stands for channels + m = x.data.new(x.size(0), x.size(1), 1, x.size(3)).bernoulli_( + 1 - dropout + ) + else: + # Dimension (N, L, C) + m = x.data.new(x.size(0), 1, x.size(2)).bernoulli_(1 - dropout) + with torch.no_grad(): + mask = m / (1 - dropout) + mask = mask.expand_as(x) + return mask * x From fb1b981038ffa0304f99fa3853f69179317cbc32 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Tue, 14 Jun 2022 22:38:36 +0530 Subject: [PATCH 2/5] Add configs --- .isort.cfg | 2 +- configs/{ => caml}/caml_mimic3_50.yml | 0 configs/{ => caml}/caml_mimic3_50_old.yml | 0 configs/{ => caml}/caml_mimic3_full.yml | 0 configs/{ => caml}/caml_mimic3_full_old.yml | 0 configs/{ => caml}/cnn_mimic3_50.yml | 0 configs/{ => caml}/drcaml_mimic3_50.yml | 0 configs/gatedcnn_nci/gatedcnn_nci.py | 512 ++++++++++++++++++ .../gatedcnn_nci/gatedcnn_nci_mimic3_50.yml | 127 +++++ .../gatedcnn_nci_mimic3_50_old.yml | 127 +++++ .../gatedcnn_nci/gatedcnn_nci_mimic3_full.yml | 127 +++++ .../gatedcnn_nci_mimic3_full_old.yml | 127 +++++ src/models/fusion.py | 2 +- src/models/gatedcnn_nci.py | 392 ++++++++------ src/modules/metrics.py | 1 + src/utils/caml_utils.py | 15 + 16 files changed, 1274 insertions(+), 158 deletions(-) rename configs/{ => caml}/caml_mimic3_50.yml (100%) rename configs/{ => caml}/caml_mimic3_50_old.yml (100%) rename configs/{ => caml}/caml_mimic3_full.yml (100%) rename configs/{ => caml}/caml_mimic3_full_old.yml (100%) rename configs/{ => caml}/cnn_mimic3_50.yml (100%) rename configs/{ => caml}/drcaml_mimic3_50.yml (100%) create mode 100644 configs/gatedcnn_nci/gatedcnn_nci.py create mode 100644 configs/gatedcnn_nci/gatedcnn_nci_mimic3_50.yml create mode 100644 configs/gatedcnn_nci/gatedcnn_nci_mimic3_50_old.yml create mode 100644 configs/gatedcnn_nci/gatedcnn_nci_mimic3_full.yml create mode 100644 configs/gatedcnn_nci/gatedcnn_nci_mimic3_full_old.yml diff --git a/.isort.cfg b/.isort.cfg index cf67fba..82cd117 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -1,2 +1,2 @@ [settings] -known_third_party = embeddings,gensim,nltk,numpy,pandas,sklearn,streamlit,torch,torchsummaryX,tqdm,yaml +known_third_party = gensim,nltk,numpy,pandas,sklearn,streamlit,torch,torchsummaryX,tqdm,yaml diff --git a/configs/caml_mimic3_50.yml b/configs/caml/caml_mimic3_50.yml similarity index 100% rename from configs/caml_mimic3_50.yml rename to configs/caml/caml_mimic3_50.yml diff --git a/configs/caml_mimic3_50_old.yml b/configs/caml/caml_mimic3_50_old.yml similarity index 100% rename from configs/caml_mimic3_50_old.yml rename to configs/caml/caml_mimic3_50_old.yml diff --git a/configs/caml_mimic3_full.yml b/configs/caml/caml_mimic3_full.yml similarity index 100% rename from configs/caml_mimic3_full.yml rename to configs/caml/caml_mimic3_full.yml diff --git a/configs/caml_mimic3_full_old.yml b/configs/caml/caml_mimic3_full_old.yml similarity index 100% rename from configs/caml_mimic3_full_old.yml rename to configs/caml/caml_mimic3_full_old.yml diff --git a/configs/cnn_mimic3_50.yml b/configs/caml/cnn_mimic3_50.yml similarity index 100% rename from configs/cnn_mimic3_50.yml rename to configs/caml/cnn_mimic3_50.yml diff --git a/configs/drcaml_mimic3_50.yml b/configs/caml/drcaml_mimic3_50.yml similarity index 100% rename from configs/drcaml_mimic3_50.yml rename to configs/caml/drcaml_mimic3_50.yml diff --git 
a/configs/gatedcnn_nci/gatedcnn_nci.py b/configs/gatedcnn_nci/gatedcnn_nci.py new file mode 100644 index 0000000..4014637 --- /dev/null +++ b/configs/gatedcnn_nci/gatedcnn_nci.py @@ -0,0 +1,512 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.init import normal_, xavier_uniform_ + + +class GatedCNNEncoder(nn.Module): + def __init__(self, config): + super(GatedCNNEncoder, self).__init__() + self.max_length = config.max_length + self.dropout = config.dropout + self.input_dim = config.embed_size + self.hidden_dim = config.hidden_dim + self.output_dim = config.output_dim + self.bidirectional = config.bidirectional + self.use_description = config.use_description + + self.word_embedding_layer = WordEmbeddingLayer( + embed_dir=config.embed_dir, + dataset_dir=config.dataset_dir, + mimic_dir=config.mimic_dir, + static_dir=config.static_dir, + dropout=config.dropout, + pad_token=config.pad_token, + unk_token=config.unk_token, + ) + self.desc_vecs = self.word_embedding_layer.desc_vecs + + self.encoder = GatedCNN( + input_dim=config.input_dim, + hidden_dim=config.hidden_dim, + output_dim=config.output_dim, + kernel_size=config.kernel_size, + dropout=config.dropout, + init_mean=config.init_mean, + init_std=config.init_std, + levels=config.levels, + ) + + if self.bidirectional: + self.output_layer = OutputLayer( + input_dim=2 * config.input_dim, + num_labels=config.num_labels, + embed_dir=config.embed_dir, + ) + else: + self.output_layer = OutputLayer( + input_dim=config.input_dim, + num_labels=config.num_labels, + embed_dir=config.embed_dir, + ) + + self.variational_dropout = VariationalDropout(dropout=config.dropout) + + self.hidden = None + + def freeze_net(self): + for p in self.word_embedding_layer.embed.parameters(): + p.requires_grad = False + + def init_hidden(self, batch_size): + h_size = self.hidden_dim + self.output_dim + weight = next(self.parameters()).data + return ( + weight.new(batch_size, h_size, 1).zero_(), + weight.new(batch_size, h_size, 1).zero_(), + ) + + def _reverse_seq(self, X, mask, seq_max_len): + """ + X -> batch, seq_len, dim + mask -> batch, seq_len + """ + mask_sum = torch.sum(mask, 1).int() + xfs = [] + for x, c in zip(X, mask_sum): + xf = torch.flip(x[:c], [0]) + xfs.append(xf) + padded_rev = torch.zeros((len(xfs), X.size(1), X.size(2))).cuda() + for i, mat in enumerate(xfs): + padded_rev[i][: len(mat), :] = mat + return padded_rev + + def forward(self, data, desc): + """ + :param data: The input sequence, with dimesion (N, L) + :param desc: Whether to use code description + :return: logits, loss, hidden + """ + # If this is the first forward pass, we will initialise the hidden + # state. + if self.hidden is None: + self.init_hidden_flag = True + self.hidden = self.init_hidden(data.size(0)) + + # Look up the embeddings of all the tokens using the WordEmbeddingLayer. + # `emb` shape: (batch_size, max_length, embed_size) + emb, mask = self.word_embedding_layer(data) + + # If we want a bidirectional model, we reverse the sequence of + # tokens. + if self.bidirectional: + # `emb_reverse` shape: (batch_size, max_length, embed_size) + emb_reverse = self._reverse_seq(emb, mask, self.max_length) + # `emb_reverse` shape`: [batch_size, embed_size, max_length] + emb_reverse = emb_reverse.transpose(1, 2) + # `emb` shape: (batch_size, embed_size, max_length) + emb = emb.transpose(1, 2) + + # Pass the embeddings through the encoder. If the model is + # bidirectional, we pass the reverse embeddings as well. 
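+        # `raw_output` comes back as (batch_size, max_length, output_dim); the
+        # encoder transposes its result so the sequence dimension is second.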
+ raw_output, self.hidden = self.encoder(emb, self.hidden) + if self.bidirectional: + raw_out_reverse, self.hidden = self.encoder( + emb_reverse, self.hidden + ) + + output = self.variational_dropout(raw_output) + if self.bidirectional: + output_reverse = self._reverse_seq( + raw_out_reverse, mask, self.max_length + ) + output_reverse = self.variational_dropout(output_reverse) + output = torch.cat([output, output_reverse], dim=2) + + if self.use_description: + logits = self.output_layer(output, self.desc_vecs) + else: + logits = self.output_layer(output, None) + return logits + + +class WordEmbeddingLayer(nn.Module): + """ + A Word Embedding Layer. This layer loads a pre-trained word embedding matrix + , and copies its weights to an nn.Embedding layer. + + Args: + embed_dir (str): A directory containing the pre-trained word embedding + matrix, among other things. Please see + https://github.com/dalgu90/icd-coding-benchmark/blob/main/src/modules/embeddings.py#L17 + for more details. + dropout (float): The dropout probability. + """ + + def __init__( + self, + embed_dir, + dataset_dir, + mimic_dir, + static_dir, + version, + dropout, + pad_token="", + unk_token="", + return_pad_mask=True, + use_description=True, + ): + super(WordEmbeddingLayer, self).__init__() + logger.debug( + f"Initialising {self.__class__.__name__} with " + f"embed_dir = {embed_dir}, dropout = {dropout}" + ) + + self.return_pad_mask = return_pad_mask + + # Note: This should be changed, since we won't always use Word2Vec. + embedding_cls = ConfigMapper.get_object("embeddings", "word2vec") + vocab = embedding_cls.load_vocab(embed_dir) + self.pad_token_id = vocab[pad_token] + self.unk_token_id = vocab[unk_token] + + W = torch.Tensor(embedding_cls.load_emb_matrix(embed_dir)) + self.embed = nn.Embedding(W.size()[0], W.size()[1], padding_idx=0) + self.embed.weight.data = W.clone() + + self.embedding_size = self.embed.embedding_dim + + self.dropout = nn.Dropout(dropout) + + if use_description: + dicts = load_lookups( + dataset_dir=dataset_dir, + mimic_dir=mimic_dir, + static_dir=static_dir, + word2vec_dir=embed_dir, + version=version, + ) + ind2c = dicts["ind2c"] + w2ind = dicts["w2ind"] + desc_dict = dicts["desc"] + self.desc_vecs = [] + for i, c in ind2c.items(): + self.desc_vecs.append( + [ + w2ind[w] if w in w2ind else self.unk_token_id + for w in desc_dict[c] + ] + ) + + # Pad and convert to torch tensor. + self.desc_vecs = torch.Tensor( + list(zip(*itertools.zip_longest(*self.desc_vecs, fillvalue=0))) + ) + + def forward(self, x): + embedding = self.embed(x) + x = self.dropout(embedding) + if self.return_pad_mask: + pad_mask = ~(batch == pad_token_id) + return x, pad_mask + return x + + +class VariationalHidDropout(nn.Module): + """ + Hidden-to-hidden (VD-based) dropout that applies the same mask at every + time step and every layer of TrellisNet. + + Args: + dropout (float): The dropout probability. + """ + + def __init__(self, dropout=0.0): + super(VariationalHidDropout, self).__init__() + self.dropout_probability = dropout + self.mask = None + + def reset_mask(self, input): + + # Dimension (N, C, L) + m = input.data.new(input.size(0), input.size(1), 1).bernoulli_( + 1 - self.dropout_probability + ) + with torch.no_grad(): + mask = m / (1 - self.dropout_probability) + self.mask = mask + return mask + + def forward(self, input): + # We don't apply dropout if the model is in eval mode. 
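+        # Otherwise, the per-channel mask sampled in reset_mask() is broadcast
+        # over the length dimension, so the same channels are dropped at every
+        # time step of the sequence.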
+ if not self.training or self.dropout_probability == 0: + return input + + assert ( + self.mask is not None + ), "You need to reset mask before using VariationalHidDropout" + mask = self.mask.expand_as(input) # Make sure the dimension matches + return mask * input + + +class WeightShareConv1d(nn.Module): + """ + The weight-tied 1D convolution used in TrellisNet. + + Args: + input_dim (int): The dimension of the input. This is equivalent to + the number of input channels in the first + convolutional layer. + hidden_dim (int): The dimension of the hidden state. This is + equivalent to the number of input channels in the + second convolutional layer. + out_channels (int): The number of output channels in both + convolutional layers. + kernel_size (int): The size of the filter used in both + convolutional layers. + dropout (float): Dropout probability for the hidden-to-hidden + dropout layer. + init_mean (float): The mean of the normal distribution with which + weights of the convolutional layers are + initialised. + init_std (float): The standard deviation of the normal distribution + with which weights of the convolutional layers are + initialised. + """ + + def __init__( + self, + input_dim, + hidden_dim, + out_channels, + kernel_size, + dropout=0.0, + init_mean=0.0, + init_std=0.01, + ): + super(WeightShareConv1d, self).__init__() + + self.input_dim = input_dim + self.kernel_size = kernel_size + + self._dict = {} + + conv_layer_1 = nn.Conv1d( + in_channels=input_dim, + out_channels=out_channels, + kernel_size=kernel_size, + ) + self.weight_1 = conv_layer_1.weight + + conv_layer_2 = nn.Conv1d( + in_channels=hidden_dim, + out_channels=out_channels, + kernel_size=kernel_size, + ) + self.weight_2 = conv2.weight + self.bias_2 = conv2.bias + + self.init_conv_weights(init_mean, init_std) + + self.dropout = VariationalHidDropout(dropout=dropout) + + def init_conv_weights(self, init_mean, init_std): + self.weight_1.data.normal_(mean=init_mean, std=init_std) + self.weight_2.data.normal_(mean=init_mean, std=init_std) + self.bias_2.data.normal_(mean=init_mean, std=init_std) + + def forward(self, input, dilation, hid): + batch_size = input.size(0) + + padding = (self.kernel_size - 1) * dilation # Padding size. + x = F.pad(input=input, pad=(padding, 0)) # Pad with zeros. + + x_1 = x[:, : self.input_dim] + z_1 = x[:, self.input_dim :] + z_1[:, :, :padding] = hid[:batch_size, :, :].repeat(1, 1, padding) + + device = x_1.get_device() + + if (dilation, device) not in self.dict or self.dict[ + (dilation, device) + ] is None: + self.dict[(dilation, device)] = F.conv1d( + input=x_1, weight=self.weight1, dilation=dilation + ) + + z_1 = self.dropout(z_1) + injected = self.dict[(dilation, device)] + F.conv1d( + input=z_1, weight=self.weight2, bias=self.bias2, dilation=dilation + ) + return injected + + +class GatedCNN(nn.Module): + """ + Gated CNN module. + + Args: + input_dim (int): The dimension of the input. + hidden_dim (int): The hidden dimension. The hidden dimension for the + weight-shared Conv1D layer is + `hidden_dim + output_dim`. + output_dim (int): The output dimension. The number of output + channels of the weight-shared Conv1D layer is + `4 * (hidden_dim + output_dim)`. + kernel_size (int): The size of the filter used in + `WeightSharedConv1D`. + dropout (float): Dropout probability for the `WeightSharedConv1D`. + init_mean (float): The mean of the normal distribution with which + weights of the `WeightSharedConv1D` layer are + initialised. 
+ init_std (float): The standard deviation of the normal distribution + with which weights of the `WeightSharedConv1D` + layer are initialised. + """ + + def __init__( + self, + input_dim, + hidden_dim, + output_dim, + kernel_size, + dropout, + init_mean, + init_std, + levels, + ): + super(GatedCNN, self).__init__() + self.input_dim = input_dim + self.hidden_dim = hidden_dim + self.output_dim = output_dim + self.levels = levels + + self.hidden_dim_for_conv = hidden_dim + output_dim + + self.dilations = [i + 1 for i in range(levels)] + + self.full_conv = WeightShareConv1d( + input_dim=input_dim, + hidden_dim=self.hidden_dim_for_conv, + output_channels=4 * self.hidden_dim_for_conv, + kernel_size=kernel_size, + dropout=dropout, + init_mean=init_mean, + init_std=init_std, + ) + + self.ht = None + + def transform_input(self, X): + device = X.get_device() + if device == -1: + device = "cpu" + + batch_size = X.size(0) + seq_len = X.size(2) + + ht = torch.zeros(batch_size, self.hidden_dim_for_conv, seq_len).to( + device + ) + self.ct = torch.zeros(batch_size, self.hidden_dim_for_conv, seq_len).to( + device + ) + return torch.cat((X, ht), dim=1) + + def gating(self, Z, dilation=1, hc=None): + batch_size = Z.size(0) + (hid, cell) = hc + + out = self.full_conv(input=Z, dilation=dilation, hid=hid) + + ct_1 = F.pad(self.ct, (dilation, 0))[:, :, :-dilation] + ct_1[:, :, :dilation] = cell[:batch_size].repeat(1, 1, dilation) + + it = torch.sigmoid(out[:, : self.hidden_dim_for_conv]) + ot = torch.sigmoid( + out[:, self.hidden_dim_for_conv : 2 * self.hidden_dim_for_conv] + ) + gt = torch.tanh( + out[:, 2 * self.hidden_dim_for_conv : 3 * self.hidden_dim_for_conv] + ) + ft = torch.sigmoid( + out[:, 3 * self.hidden_dim_for_conv : 4 * self.hidden_dim_for_conv] + ) + self.ct = ft * ct_1 + it * gt + ht = ot * torch.tanh(self.ct) + + Z = torch.cat((Z[:, : self.input_dim], ht), dim=1) + return Z + + def forward(self, emb, hc): + Z = self.transform_input(emb) + for key in self.full_conv.dict: + if key[1] == emb.get_device(): + self.full_conv.dict[key] = None + self.full_conv.drop.reset_mask(Z[:, self.input_dim :]) + + for dilation_per_level in self.dilations: + Z = self.gating(Z, dilation=dilation_per_level, hc=hc) + + out = Z[:, -self.output_dim :].transpose(1, 2) + hc = (Z[:, self.input_dim :, -1:], self.ct[:, :, -1:]) + return out, hc + + +class VariationalDropout(nn.Module): + """ + Feed-forward version of variational dropout that applies the same mask + at every time step. 
+ """ + + def __init__(self, dropout=0.5, dim=3): + super(VariationalDropout, self).__init__() + assert dim in (3, 4), "`dim` should be either 3 or 4" + self.dropout = dropout + self.dim = dim + + def forward(self, x): + if not self.training or not self.dropout: + return x + + if self.dim == 4: + # Dimension (M, N, L, C), where C stands for channels + m = x.data.new(x.size(0), x.size(1), 1, x.size(3)).bernoulli_( + 1 - self.dropout + ) + else: + # Dimension (N, L, C) + m = x.data.new(x.size(0), 1, x.size(2)).bernoulli_(1 - self.dropout) + with torch.no_grad(): + mask = m / (1 - dropout) + mask = mask.expand_as(x) + return mask * x + + +class OutputLayer(nn.Module): + def __init__(self, input_dim, num_labels, embed_dir, dropout): + super(OutputLayer, self).__init__() + + self.word_embedding_layer = WordEmbeddingLayer(embed_dir, dropout) + + self.U = nn.Linear(input_dim, num_labels) + self.final = nn.Linear(input_dim, num_labels) + self.proj_layer = nn.Linear(input_dim, 1, bias=False) + + xavier_uniform_(self.U.weight) + xavier_uniform_(self.final.weight) + + def forward(self, x, desc): + if desc is not None: + desc_vec, _ = self.word_embedding_layer(desc) + desc_vec = torch.mean(desc_vec, dim=1).unsqueeze(0) + mmt = desc_vec.matmul(x.transpose(1, 2)) + else: + mmt = self.U.weight.matmul(x.transpose(1, 2)) + + m = mmt.matmul(x) + + y = self.final.weight.mul(m) + logits = self.proj_layer(y).squeeze(-1).add(self.final.bias) + + return logits diff --git a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50.yml b/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50.yml new file mode 100644 index 0000000..3613885 --- /dev/null +++ b/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50.yml @@ -0,0 +1,127 @@ +paths: + mimic_dir: &mimic_dir datasets/mimic3/csv + static_dir: &static_dir datasets/mimic3/static + dataset_dir: &dataset_dir datasets/mimic3_50 + word2vec_dir: &word2vec_dir datasets/mimic3_50/word2vec + output_dir: &output_dir results/gatedcnn_nci_mimic3_50 + +dataset: + name: base_dataset + data_common: &data_common + column_names: + hadm_id: "HADM_ID" + clinical_note: "TEXT" + labels: "LABELS" + word2vec_dir: *word2vec_dir + pad_token: "" + unk_token: "" + dataset_dir: *dataset_dir + label_file: labels.json + max_length: 2500 + params: + train: + <<: *data_common + data_file: train.json + val: + <<: *data_common + data_file: val.json + test: + <<: *data_common + data_file: test.json + +model: + name: gatedcnn_nci + params: + version: mimic3 + dataset_dir: *dataset_dir + mimic_dir: *mimic_dir + static_dir: *static_dir + embed_dir: *word2vec_dir + max_length: 2500 + dropout: 0.2 + input_dim: 100 + hidden_dim: 100 + output_dim: 50 + bidirectional: true + use_description: true + pad_token: "" + unk_token: "" + kernel_size: 3 + init_mean: 0 + init_std: 0.01 + levels: 3 + +trainer: + name: base_trainer + params: + output_dir: *output_dir + data_loader: + batch_size: 16 + num_workers: 4 + shuffle: false + drop_last: true + loss: + name: BinaryCrossEntropyLoss + params: null + optimizer: + name: adam + params: + lr: 0.0001 + weight_decay: 0.0 + max_epochs: 200 + lr_scheduler: null + stopping_criterion: + metric: + name: prec_at_8 + desired: max + patience: 10 + checkpoint_saver: + name: base_saver + params: + checkpoint_dir: *output_dir + interval: 1 + max_to_keep: 5 + ckpt_fname_format: "ckpt-{}.pth" + best_fname_format: "best-{}.pth" + metric: + name: prec_at_8 + class: prec_at_k + params: + k: 8 + desired: max + eval_metrics: &eval_metrics + - name: prec_at_5 + class: prec_at_k + params: + k: 5 + - name: prec_at_8 + 
class: prec_at_k + params: + k: 8 + - name: macro_f1 + - name: micro_f1 + - name: macro_auc + - name: micro_auc + graph: + writer: + name: tensorboard + params: + log_dir: *output_dir + train: + interval: 100 + interval_unit: step + metric: + - name: loss + val: + interval: 1 + interval_unit: epoch + metric: + - name: loss + - name: prec_at_5 + - name: prec_at_8 + - name: macro_f1 + - name: micro_f1 + - name: macro_auc + - name: micro_auc + seed: 1337 + use_gpu: true diff --git a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50_old.yml b/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50_old.yml new file mode 100644 index 0000000..4ccca3a --- /dev/null +++ b/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50_old.yml @@ -0,0 +1,127 @@ +paths: + mimic_dir: &mimic_dir datasets/mimic3/csv + static_dir: &static_dir datasets/mimic3/static + dataset_dir: &dataset_dir datasets/mimic3_50_old + word2vec_dir: &word2vec_dir datasets/mimic3_50_old/word2vec + output_dir: &output_dir results/gatedcnn_nci_mimic3_50_old + +dataset: + name: base_dataset + data_common: &data_common + column_names: + hadm_id: "HADM_ID" + clinical_note: "TEXT" + labels: "LABELS" + word2vec_dir: *word2vec_dir + pad_token: "" + unk_token: "" + dataset_dir: *dataset_dir + label_file: labels.json + max_length: 2500 + params: + train: + <<: *data_common + data_file: train.json + val: + <<: *data_common + data_file: val.json + test: + <<: *data_common + data_file: test.json + +model: + name: gatedcnn_nci + params: + version: mimic3 + dataset_dir: *dataset_dir + mimic_dir: *mimic_dir + static_dir: *static_dir + embed_dir: *word2vec_dir + max_length: 2500 + dropout: 0.2 + input_dim: 100 + hidden_dim: 100 + output_dim: 50 + bidirectional: true + use_description: true + pad_token: "" + unk_token: "" + kernel_size: 3 + init_mean: 0 + init_std: 0.01 + levels: 3 + +trainer: + name: base_trainer + params: + output_dir: *output_dir + data_loader: + batch_size: 16 + num_workers: 4 + shuffle: false + drop_last: true + loss: + name: BinaryCrossEntropyLoss + params: null + optimizer: + name: adam + params: + lr: 0.000001 + weight_decay: 0.0 + max_epochs: 200 + lr_scheduler: null + stopping_criterion: + metric: + name: prec_at_8 + desired: max + patience: 10 + checkpoint_saver: + name: base_saver + params: + checkpoint_dir: *output_dir + interval: 1 + max_to_keep: 5 + ckpt_fname_format: "ckpt-{}.pth" + best_fname_format: "best-{}.pth" + metric: + name: prec_at_8 + class: prec_at_k + params: + k: 8 + desired: max + eval_metrics: &eval_metrics + - name: prec_at_5 + class: prec_at_k + params: + k: 5 + - name: prec_at_8 + class: prec_at_k + params: + k: 8 + - name: macro_f1 + - name: micro_f1 + - name: macro_auc + - name: micro_auc + graph: + writer: + name: tensorboard + params: + log_dir: *output_dir + train: + interval: 100 + interval_unit: step + metric: + - name: loss + val: + interval: 1 + interval_unit: epoch + metric: + - name: loss + - name: prec_at_5 + - name: prec_at_8 + - name: macro_f1 + - name: micro_f1 + - name: macro_auc + - name: micro_auc + seed: 1337 + use_gpu: true diff --git a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full.yml b/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full.yml new file mode 100644 index 0000000..de9b1ab --- /dev/null +++ b/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full.yml @@ -0,0 +1,127 @@ +paths: + mimic_dir: &mimic_dir datasets/mimic3/csv + static_dir: &static_dir datasets/mimic3/static + dataset_dir: &dataset_dir datasets/mimic3_full + word2vec_dir: &word2vec_dir datasets/mimic3_full/word2vec + output_dir: &output_dir 
results/gatedcnn_nci_mimic3_full + +dataset: + name: base_dataset + data_common: &data_common + column_names: + hadm_id: "HADM_ID" + clinical_note: "TEXT" + labels: "LABELS" + word2vec_dir: *word2vec_dir + pad_token: "" + unk_token: "" + dataset_dir: *dataset_dir + label_file: labels.json + max_length: 2500 + params: + train: + <<: *data_common + data_file: train.json + val: + <<: *data_common + data_file: val.json + test: + <<: *data_common + data_file: test.json + +model: + name: CAML + params: + version: mimic3 + dataset_dir: *dataset_dir + mimic_dir: *mimic_dir + static_dir: *static_dir + embed_dir: *word2vec_dir + max_length: 2500 + dropout: 0.2 + input_dim: 100 + hidden_dim: 100 + output_dim: 8922 + bidirectional: true + use_description: true + pad_token: "" + unk_token: "" + kernel_size: 3 + init_mean: 0 + init_std: 0.01 + levels: 3 + +trainer: + name: base_trainer + params: + output_dir: *output_dir + data_loader: + batch_size: 16 + num_workers: 4 + shuffle: false + drop_last: true + loss: + name: BinaryCrossEntropyLoss + params: null + optimizer: + name: adam + params: + lr: 0.0001 + weight_decay: 0.0 + max_epochs: 200 + lr_scheduler: null + stopping_criterion: + metric: + name: prec_at_8 + desired: max + patience: 10 + checkpoint_saver: + name: base_saver + params: + checkpoint_dir: *output_dir + interval: 1 + max_to_keep: 5 + ckpt_fname_format: "ckpt-{}.pth" + best_fname_format: "best-{}.pth" + metric: + name: prec_at_8 + class: prec_at_k + params: + k: 8 + desired: max + eval_metrics: &eval_metrics + - name: prec_at_8 + class: prec_at_k + params: + k: 8 + - name: prec_at_15 + class: prec_at_k + params: + k: 15 + - name: macro_f1 + - name: micro_f1 + - name: macro_auc + - name: micro_auc + graph: + writer: + name: tensorboard + params: + log_dir: *output_dir + train: + interval: 100 + interval_unit: step + metric: + - name: loss + val: + interval: 1 + interval_unit: epoch + metric: + - name: loss + - name: prec_at_8 + - name: prec_at_15 + - name: macro_f1 + - name: micro_f1 + - name: macro_auc + - name: micro_auc + seed: 1337 + use_gpu: true diff --git a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full_old.yml b/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full_old.yml new file mode 100644 index 0000000..c95f4aa --- /dev/null +++ b/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full_old.yml @@ -0,0 +1,127 @@ +paths: + mimic_dir: &mimic_dir datasets/mimic3/csv + static_dir: &static_dir datasets/mimic3/static + dataset_dir: &dataset_dir datasets/mimic3_full_old + word2vec_dir: &word2vec_dir datasets/mimic3_full_old/word2vec + output_dir: &output_dir results/gatedcnn_nci_mimic3_full_old + +dataset: + name: base_dataset + data_common: &data_common + column_names: + hadm_id: "HADM_ID" + clinical_note: "TEXT" + labels: "LABELS" + word2vec_dir: *word2vec_dir + pad_token: "" + unk_token: "" + dataset_dir: *dataset_dir + label_file: labels.json + max_length: 2500 + params: + train: + <<: *data_common + data_file: train.json + val: + <<: *data_common + data_file: val.json + test: + <<: *data_common + data_file: test.json + +model: + name: gatedcnn_nci + params: + version: mimic3 + dataset_dir: *dataset_dir + mimic_dir: *mimic_dir + static_dir: *static_dir + embed_dir: *word2vec_dir + max_length: 2500 + dropout: 0.2 + input_dim: 100 + hidden_dim: 100 + output_dim: 8922 + bidirectional: true + use_description: true + pad_token: "" + unk_token: "" + kernel_size: 3 + init_mean: 0 + init_std: 0.01 + levels: 3 + +trainer: + name: base_trainer + params: + output_dir: *output_dir + data_loader: + batch_size: 
16 + num_workers: 4 + shuffle: false + drop_last: true + loss: + name: BinaryCrossEntropyLoss + params: null + optimizer: + name: adam + params: + lr: 0.0001 + weight_decay: 0.0 + max_epochs: 200 + lr_scheduler: null + stopping_criterion: + metric: + name: prec_at_8 + desired: max + patience: 10 + checkpoint_saver: + name: base_saver + params: + checkpoint_dir: *output_dir + interval: 1 + max_to_keep: 5 + ckpt_fname_format: "ckpt-{}.pth" + best_fname_format: "best-{}.pth" + metric: + name: prec_at_8 + class: prec_at_k + params: + k: 8 + desired: max + eval_metrics: &eval_metrics + - name: prec_at_8 + class: prec_at_k + params: + k: 8 + - name: prec_at_15 + class: prec_at_k + params: + k: 15 + - name: macro_f1 + - name: micro_f1 + - name: macro_auc + - name: micro_auc + graph: + writer: + name: tensorboard + params: + log_dir: *output_dir + train: + interval: 100 + interval_unit: step + metric: + - name: loss + val: + interval: 1 + interval_unit: epoch + metric: + - name: loss + - name: prec_at_8 + - name: prec_at_15 + - name: macro_f1 + - name: micro_f1 + - name: macro_auc + - name: micro_auc + seed: 1337 + use_gpu: true diff --git a/src/models/fusion.py b/src/models/fusion.py index 69fed90..5236047 100755 --- a/src/models/fusion.py +++ b/src/models/fusion.py @@ -415,7 +415,7 @@ class Fusion(nn.Module): def __init__(self, config): super(Fusion, self).__init__() - logger.info(f"Initialising %s", self.__class__.__name__) + logger.info("Initialising %s", self.__class__.__name__) logger.debug( "Initialising %s with config: %s", self.__class__.__name__, config ) diff --git a/src/models/gatedcnn_nci.py b/src/models/gatedcnn_nci.py index 38d0016..72114d6 100644 --- a/src/models/gatedcnn_nci.py +++ b/src/models/gatedcnn_nci.py @@ -1,32 +1,75 @@ import torch +import torch.nn as nn import torch.nn.functional as F -from torch import nn - - -class GatedCNNencoder(nn.Module): - def __init__(self, args, Y, dicts): - super(GatedCNNencoder, self).__init__() - self.args = args - self.max_length = args.MAX_LENGTH - self.dropout = args.dropout - self.ninp = args.embed_size - self.nhid = args.nhid - self.nout = args.nout - self.bidirectional = args.bidirectional - - self.word_rep = WordRep(args, Y, dicts) - self.encoder = GatedCNN(args, Y, dicts, self.ninp, self.nout) - self.network = nn.ModuleList([self.encoder]) +from torch.nn.init import normal_, xavier_uniform_ + +from src.utils.caml_utils import load_lookups, pad_desc_vecs +from src.utils.mapper import ConfigMapper + + +@ConfigMapper.map("models", "gatedcnn_nci") +class GatedCNNEncoder(nn.Module): + def __init__(self, config): + super(GatedCNNEncoder, self).__init__() + self.max_length = config.max_length + self.dropout = config.dropout + self.input_dim = config.embed_size + self.hidden_dim = config.hidden_dim + self.output_dim = config.output_dim + self.bidirectional = config.bidirectional + self.use_description = config.use_description + + self.word_embedding_layer = WordEmbeddingLayer( + embed_dir=config.embed_dir, + dataset_dir=config.dataset_dir, + mimic_dir=config.mimic_dir, + static_dir=config.static_dir, + dropout=config.dropout, + pad_token=config.pad_token, + unk_token=config.unk_token, + ) + self.desc_vecs = self.word_embedding_layer.desc_vecs + + self.encoder = GatedCNN( + input_dim=config.input_dim, + hidden_dim=config.hidden_dim, + output_dim=config.output_dim, + kernel_size=config.kernel_size, + dropout=config.dropout, + init_mean=config.init_mean, + init_std=config.init_std, + levels=config.levels, + ) + if self.bidirectional: - 
self.output_layer = OutputLayer(args, Y, dicts, self.nout * 2) + self.output_layer = OutputLayer( + input_dim=2 * config.input_dim, + num_labels=config.output_dim, + embed_dir=config.embed_dir, + ) else: - self.output_layer = OutputLayer(args, Y, dicts, self.nout) - self.var_drop = VariationalDropout() + self.output_layer = OutputLayer( + input_dim=config.input_dim, + num_labels=config.num_labels, + embed_dir=config.embed_dir, + ) + + self.variational_dropout = VariationalDropout(dropout=config.dropout) + + self.hidden = None def freeze_net(self): - for p in self.word_rep.embed.parameters(): + for p in self.word_embedding_layer.embed.parameters(): p.requires_grad = False + def init_hidden(self, batch_size): + h_size = self.hidden_dim + self.output_dim + weight = next(self.parameters()).data + return ( + weight.new(batch_size, h_size, 1).zero_(), + weight.new(batch_size, h_size, 1).zero_(), + ) + def _reverse_seq(self, X, mask, seq_max_len): """ X -> batch, seq_len, dim @@ -42,58 +85,53 @@ def _reverse_seq(self, X, mask, seq_max_len): padded_rev[i][: len(mat), :] = mat return padded_rev - def forward(self, data, target, mask, hidden, desc): + def forward(self, data, desc): """ :param data: The input sequence, with dimesion (N, L) - :param target: labels - :param mask: input sequence mask - :param hidden: The initial hidden state (h, c) :param desc: Whether to use code description :return: logits, loss, hidden """ - emb = self.word_rep(data, target) + # If this is the first forward pass, we will initialise the hidden + # state. + if self.hidden is None: + self.init_hidden_flag = True + self.hidden = self.init_hidden(data.size(0)) + + # Look up the embeddings of all the tokens using the WordEmbeddingLayer. + # `emb` shape: (batch_size, max_length, embed_size) + emb, mask = self.word_embedding_layer(data) + + # If we want a bidirectional model, we reverse the sequence of + # tokens. if self.bidirectional: + # `emb_reverse` shape: (batch_size, max_length, embed_size) emb_reverse = self._reverse_seq(emb, mask, self.max_length) - emb = emb.transpose(1, 2) # emb: [bs, 100, len] - if self.bidirectional: - emb_reverse = emb_reverse.transpose( - 1, 2 - ) # emb_reverse: [bs, 100, len] - cnn_encoder = self.network[0] - raw_output, hidden = cnn_encoder(emb, hidden) - if self.bidirectional: - raw_out_re, hidden = cnn_encoder(emb_reverse, hidden) - output = self.var_drop(raw_output, self.dropout) - if self.bidirectional: - output_re = self._reverse_seq(raw_out_re, mask, self.max_length) - output_re = self.var_drop(output_re, self.dropout) + # `emb_reverse` shape`: [batch_size, embed_size, max_length] + emb_reverse = emb_reverse.transpose(1, 2) + # `emb` shape: (batch_size, embed_size, max_length) + emb = emb.transpose(1, 2) + + # Pass the embeddings through the encoder. If the model is + # bidirectional, we pass the reverse embeddings as well. 
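+        # Note that the same GatedCNN instance (and hence the same weights) is
+        # used for both directions; only the token order differs.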
+ raw_output, self.hidden = self.encoder(emb, self.hidden) if self.bidirectional: - output = torch.cat([output, output_re], dim=2) - if self.args.desc: - logits, loss, _, interaction = self.output_layer( - output, target, desc + raw_out_reverse, self.hidden = self.encoder( + emb_reverse, self.hidden ) - else: - logits, loss, _, interaction = self.output_layer( - output, target, None - ) - return logits, loss, hidden, interaction - - def init_hidden(self, bsz): - h_size = self.nhid + self.nout - weight = next(self.parameters()).data - return ( - weight.new(bsz, h_size, 1).zero_(), - weight.new(bsz, h_size, 1).zero_(), - ) + output = self.variational_dropout(raw_output) + if self.bidirectional: + output_reverse = self._reverse_seq( + raw_out_reverse, mask, self.max_length + ) + output_reverse = self.variational_dropout(output_reverse) + output = torch.cat([output, output_reverse], dim=2) -from typing import Tuple - -import torch.nn as nn -from embeddings import build_pretrain_embedding, load_embeddings -from torch import Tensor -from torch.nn.init import kaiming_uniform_, normal_, xavier_uniform_ + if self.use_description: + logits = self.output_layer(output, self.desc_vecs) + else: + logits = self.output_layer(output, None) + return logits class WordEmbeddingLayer(nn.Module): @@ -109,15 +147,32 @@ class WordEmbeddingLayer(nn.Module): dropout (float): The dropout probability. """ - def __init__(self, embed_dir, dropout, num_filter_maps): + def __init__( + self, + embed_dir, + dataset_dir, + mimic_dir, + static_dir, + version, + dropout, + pad_token="", + unk_token="", + return_pad_mask=True, + use_description=True, + ): super(WordEmbeddingLayer, self).__init__() logger.debug( f"Initialising {self.__class__.__name__} with " f"embed_dir = {embed_dir}, dropout = {dropout}" ) + self.return_pad_mask = return_pad_mask + # Note: This should be changed, since we won't always use Word2Vec. embedding_cls = ConfigMapper.get_object("embeddings", "word2vec") + vocab = embedding_cls.load_vocab(embed_dir) + self.pad_token_id = vocab[pad_token] + self.unk_token_id = vocab[unk_token] W = torch.Tensor(embedding_cls.load_emb_matrix(embed_dir)) self.embed = nn.Embedding(W.size()[0], W.size()[1], padding_idx=0) @@ -127,28 +182,50 @@ def __init__(self, embed_dir, dropout, num_filter_maps): self.dropout = nn.Dropout(dropout) - self.conv_dict = { - 1: [self.embedding_size, num_filter_maps], - 2: [self.embedding_size, 100, num_filter_maps], - 3: [self.embedding_size, 150, 100, num_filter_maps], - 4: [self.embedding_size, 200, 150, 100, num_filter_maps], - } + if use_description: + dicts = load_lookups( + dataset_dir=dataset_dir, + mimic_dir=mimic_dir, + static_dir=static_dir, + word2vec_dir=embed_dir, + version=version, + ) + ind2c = dicts["ind2c"] + w2ind = dicts["w2ind"] + desc_dict = dicts["desc"] + self.desc_vecs = [] + for i, c in ind2c.items(): + self.desc_vecs.append( + [ + w2ind[w] if w in w2ind else self.unk_token_id + for w in desc_dict[c] + ] + ) + + # Pad and convert to torch tensor. + self.desc_vecs = torch.Tensor( + list(zip(*itertools.zip_longest(*self.desc_vecs, fillvalue=0))) + ) def forward(self, x): embedding = self.embed(x) x = self.dropout(embedding) + if self.return_pad_mask: + pad_mask = ~(batch == pad_token_id) + return x, pad_mask return x class VariationalHidDropout(nn.Module): - def __init__(self, dropout=0.0): - """ - Hidden-to-hidden (VD-based) dropout that applies the same mask at every - time step and every layer of TrellisNet. 
+ """ + Hidden-to-hidden (VD-based) dropout that applies the same mask at every + time step and every layer of TrellisNet. - Args: - dropout (float): The dropout probability. - """ + Args: + dropout (float): The dropout probability. + """ + + def __init__(self, dropout=0.0): super(VariationalHidDropout, self).__init__() self.dropout_probability = dropout self.mask = None @@ -177,6 +254,30 @@ def forward(self, input): class WeightShareConv1d(nn.Module): + """ + The weight-tied 1D convolution used in TrellisNet. + + Args: + input_dim (int): The dimension of the input. This is equivalent to + the number of input channels in the first + convolutional layer. + hidden_dim (int): The dimension of the hidden state. This is + equivalent to the number of input channels in the + second convolutional layer. + out_channels (int): The number of output channels in both + convolutional layers. + kernel_size (int): The size of the filter used in both + convolutional layers. + dropout (float): Dropout probability for the hidden-to-hidden + dropout layer. + init_mean (float): The mean of the normal distribution with which + weights of the convolutional layers are + initialised. + init_std (float): The standard deviation of the normal distribution + with which weights of the convolutional layers are + initialised. + """ + def __init__( self, input_dim, @@ -187,29 +288,6 @@ def __init__( init_mean=0.0, init_std=0.01, ): - """ - The weight-tied 1D convolution used in TrellisNet. - - Args: - input_dim (int): The dimension of the input. This is equivalent to - the number of input channels in the first - convolutional layer. - hidden_dim (int): The dimension of the hidden state. This is - equivalent to the number of input channels in the - second convolutional layer. - out_channels (int): The number of output channels in both - convolutional layers. - kernel_size (int): The size of the filter used in both - convolutional layers. - dropout (float): Dropout probability for the hidden-to-hidden - dropout layer. - init_mean (float): The mean of the normal distribution with which - weights of the convolutional layers are - initialised. - init_std (float): The standard deviation of the normal distribution - with which weights of the convolutional layers are - initialised. - """ super(WeightShareConv1d, self).__init__() self.input_dim = input_dim @@ -268,6 +346,28 @@ def forward(self, input, dilation, hid): class GatedCNN(nn.Module): + """ + Gated CNN module. + + Args: + input_dim (int): The dimension of the input. + hidden_dim (int): The hidden dimension. The hidden dimension for the + weight-shared Conv1D layer is + `hidden_dim + output_dim`. + output_dim (int): The output dimension. The number of output + channels of the weight-shared Conv1D layer is + `4 * (hidden_dim + output_dim)`. + kernel_size (int): The size of the filter used in + `WeightSharedConv1D`. + dropout (float): Dropout probability for the `WeightSharedConv1D`. + init_mean (float): The mean of the normal distribution with which + weights of the `WeightSharedConv1D` layer are + initialised. + init_std (float): The standard deviation of the normal distribution + with which weights of the `WeightSharedConv1D` + layer are initialised. + """ + def __init__( self, input_dim, @@ -279,30 +379,9 @@ def __init__( init_std, levels, ): - """ - Gated CNN module. - - Args: - input_dim (int): The dimension of the input. - hidden_dim (int): The hidden dimension. The hidden dimension for the - weight-shared Conv1D layer is - `hidden_dim + output_dim`. 
- output_dim (int): The output dimension. The number of output - channels of the weight-shared Conv1D layer is - `4 * (hidden_dim + output_dim)`. - kernel_size (int): The size of the filter used in - `WeightSharedConv1D`. - dropout (float): Dropout probability for the `WeightSharedConv1D`. - init_mean (float): The mean of the normal distribution with which - weights of the `WeightSharedConv1D` layer are - initialised. - init_std (float): The standard deviation of the normal distribution - with which weights of the `WeightSharedConv1D` - layer are initialised. - """ super(GatedCNN, self).__init__() self.input_dim = input_dim - self.hidden_dim = args.hidden_dim + self.hidden_dim = hidden_dim self.output_dim = output_dim self.levels = levels @@ -378,26 +457,52 @@ def forward(self, emb, hc): return out, hc +class VariationalDropout(nn.Module): + """ + Feed-forward version of variational dropout that applies the same mask + at every time step. + """ + + def __init__(self, dropout=0.5, dim=3): + super(VariationalDropout, self).__init__() + assert dim in (3, 4), "`dim` should be either 3 or 4" + self.dropout = dropout + self.dim = dim + + def forward(self, x): + if not self.training or not self.dropout: + return x + + if self.dim == 4: + # Dimension (M, N, L, C), where C stands for channels + m = x.data.new(x.size(0), x.size(1), 1, x.size(3)).bernoulli_( + 1 - self.dropout + ) + else: + # Dimension (N, L, C) + m = x.data.new(x.size(0), 1, x.size(2)).bernoulli_(1 - self.dropout) + with torch.no_grad(): + mask = m / (1 - dropout) + mask = mask.expand_as(x) + return mask * x + + class OutputLayer(nn.Module): - def __init__( - self, input_size, num_labels, embed_dir, dropout, num_filter_maps - ): + def __init__(self, input_dim, num_labels, embed_dir, dropout): super(OutputLayer, self).__init__() - self.word_embedding_layer = WordEmbeddingLayer( - embed_dir, dropout, num_filter_maps - ) + self.word_embedding_layer = WordEmbeddingLayer(embed_dir, dropout) - self.U = nn.Linear(input_size, num_labels) - self.final = nn.Linear(input_size, num_labels) - self.proj_layer = nn.Linear(input_size, 1, bias=False) + self.U = nn.Linear(input_dim, num_labels) + self.final = nn.Linear(input_dim, num_labels) + self.proj_layer = nn.Linear(input_dim, 1, bias=False) xavier_uniform_(self.U.weight) xavier_uniform_(self.final.weight) def forward(self, x, desc): if desc is not None: - desc_vec = self.word_rep(desc, target) + desc_vec, _ = self.word_embedding_layer(desc) desc_vec = torch.mean(desc_vec, dim=1).unsqueeze(0) mmt = desc_vec.matmul(x.transpose(1, 2)) else: @@ -409,28 +514,3 @@ def forward(self, x, desc): logits = self.proj_layer(y).squeeze(-1).add(self.final.bias) return logits - - -class VariationalDropout(nn.Module): - def __init__(self): - """ - Feed-forward version of variational dropout that applies the same mask - at every time step. 
- """ - super(VariationalDropout, self).__init__() - - def forward(self, x, dropout=0.5, dim=3): - if not self.training or not dropout: - return x - if dim == 4: - # Dimension (M, N, L, C), where C stands for channels - m = x.data.new(x.size(0), x.size(1), 1, x.size(3)).bernoulli_( - 1 - dropout - ) - else: - # Dimension (N, L, C) - m = x.data.new(x.size(0), 1, x.size(2)).bernoulli_(1 - dropout) - with torch.no_grad(): - mask = m / (1 - dropout) - mask = mask.expand_as(x) - return mask * x diff --git a/src/modules/metrics.py b/src/modules/metrics.py index 04d2341..a5edda6 100755 --- a/src/modules/metrics.py +++ b/src/modules/metrics.py @@ -23,6 +23,7 @@ def to_np_array(array): array = np.array(array) return array + def _auc_job(x): return roc_auc_score(x[0], x[1]) diff --git a/src/utils/caml_utils.py b/src/utils/caml_utils.py index a165a4d..58fd76d 100644 --- a/src/utils/caml_utils.py +++ b/src/utils/caml_utils.py @@ -101,3 +101,18 @@ def pad_desc_vecs(desc_vecs): for vec in desc_vecs: pad_vecs.append(vec + [0] * (desc_len - len(vec))) return pad_vecs + + +def load_description_tokens(lookup_dict, vocab_json): + # load description one-hot vectors from file + dv_dict = {} + + with open("%s/description_vectors.vocab" % (data_dir), "r") as vfile: + r = csv.reader(vfile, delimiter=" ") + # header + next(r) + for row in r: + code = row[0] + vec = [int(x) for x in row[1:]] + dv_dict[code] = vec + return dv_dict From 8f994bdceefa7cca8511b48306b8ff0595bc3fef Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Tue, 14 Jun 2022 23:45:10 +0530 Subject: [PATCH 3/5] Fix more bugs --- src/models/__init__.py | 1 + src/models/gatedcnn_nci.py | 84 ++++++++++++++++++++++++++++++-------- 2 files changed, 67 insertions(+), 18 deletions(-) diff --git a/src/models/__init__.py b/src/models/__init__.py index 04f7630..9cf8c04 100644 --- a/src/models/__init__.py +++ b/src/models/__init__.py @@ -2,3 +2,4 @@ from src.models.caml import VanillaConv as CNN from src.models.dcan import DCAN from src.models.fusion import Fusion +from src.models.gatedcnn_nci import GatedCNNNCI diff --git a/src/models/gatedcnn_nci.py b/src/models/gatedcnn_nci.py index 72114d6..84ae5e8 100644 --- a/src/models/gatedcnn_nci.py +++ b/src/models/gatedcnn_nci.py @@ -1,3 +1,5 @@ +import itertools + import torch import torch.nn as nn import torch.nn.functional as F @@ -5,15 +7,18 @@ from src.utils.caml_utils import load_lookups, pad_desc_vecs from src.utils.mapper import ConfigMapper +from src.utils.text_loggers import get_logger + +logger = get_logger(__name__) @ConfigMapper.map("models", "gatedcnn_nci") -class GatedCNNEncoder(nn.Module): +class GatedCNNNCI(nn.Module): def __init__(self, config): - super(GatedCNNEncoder, self).__init__() + super(GatedCNNNCI, self).__init__() self.max_length = config.max_length self.dropout = config.dropout - self.input_dim = config.embed_size + self.input_dim = config.input_dim self.hidden_dim = config.hidden_dim self.output_dim = config.output_dim self.bidirectional = config.bidirectional @@ -24,6 +29,7 @@ def __init__(self, config): dataset_dir=config.dataset_dir, mimic_dir=config.mimic_dir, static_dir=config.static_dir, + version=config.version, dropout=config.dropout, pad_token=config.pad_token, unk_token=config.unk_token, @@ -43,15 +49,29 @@ def __init__(self, config): if self.bidirectional: self.output_layer = OutputLayer( + embed_dir=config.embed_dir, + dataset_dir=config.dataset_dir, + mimic_dir=config.mimic_dir, + static_dir=config.static_dir, + version=config.version, input_dim=2 * config.input_dim, 
num_labels=config.output_dim, - embed_dir=config.embed_dir, + dropout=config.dropout, + pad_token=config.pad_token, + unk_token=config.unk_token, ) else: self.output_layer = OutputLayer( + embed_dir=config.embed_dir, + dataset_dir=config.dataset_dir, + mimic_dir=config.mimic_dir, + static_dir=config.static_dir, + version=config.version, input_dim=config.input_dim, num_labels=config.num_labels, - embed_dir=config.embed_dir, + dropout=config.dropout, + pad_token=config.pad_token, + unk_token=config.unk_token, ) self.variational_dropout = VariationalDropout(dropout=config.dropout) @@ -85,12 +105,16 @@ def _reverse_seq(self, X, mask, seq_max_len): padded_rev[i][: len(mat), :] = mat return padded_rev - def forward(self, data, desc): + def forward(self, data): """ :param data: The input sequence, with dimesion (N, L) :param desc: Whether to use code description :return: logits, loss, hidden """ + device = data.get_device() + if device == -1: + device = "cpu" + # If this is the first forward pass, we will initialise the hidden # state. if self.hidden is None: @@ -128,7 +152,7 @@ def forward(self, data, desc): output = torch.cat([output, output_reverse], dim=2) if self.use_description: - logits = self.output_layer(output, self.desc_vecs) + logits = self.output_layer(output, self.desc_vecs.to(device)) else: logits = self.output_layer(output, None) return logits @@ -205,13 +229,14 @@ def __init__( # Pad and convert to torch tensor. self.desc_vecs = torch.Tensor( list(zip(*itertools.zip_longest(*self.desc_vecs, fillvalue=0))) - ) + ).long() def forward(self, x): + if self.return_pad_mask: + pad_mask = ~(x == self.pad_token_id) embedding = self.embed(x) x = self.dropout(embedding) if self.return_pad_mask: - pad_mask = ~(batch == pad_token_id) return x, pad_mask return x @@ -307,13 +332,15 @@ def __init__( out_channels=out_channels, kernel_size=kernel_size, ) - self.weight_2 = conv2.weight - self.bias_2 = conv2.bias + self.weight_2 = conv_layer_2.weight + self.bias_2 = conv_layer_2.bias self.init_conv_weights(init_mean, init_std) self.dropout = VariationalHidDropout(dropout=dropout) + self.dict = {} + def init_conv_weights(self, init_mean, init_std): self.weight_1.data.normal_(mean=init_mean, std=init_std) self.weight_2.data.normal_(mean=init_mean, std=init_std) @@ -335,12 +362,12 @@ def forward(self, input, dilation, hid): (dilation, device) ] is None: self.dict[(dilation, device)] = F.conv1d( - input=x_1, weight=self.weight1, dilation=dilation + input=x_1, weight=self.weight_1, dilation=dilation ) z_1 = self.dropout(z_1) injected = self.dict[(dilation, device)] + F.conv1d( - input=z_1, weight=self.weight2, bias=self.bias2, dilation=dilation + input=z_1, weight=self.weight_2, bias=self.bias_2, dilation=dilation ) return injected @@ -392,7 +419,7 @@ def __init__( self.full_conv = WeightShareConv1d( input_dim=input_dim, hidden_dim=self.hidden_dim_for_conv, - output_channels=4 * self.hidden_dim_for_conv, + out_channels=4 * self.hidden_dim_for_conv, kernel_size=kernel_size, dropout=dropout, init_mean=init_mean, @@ -447,7 +474,7 @@ def forward(self, emb, hc): for key in self.full_conv.dict: if key[1] == emb.get_device(): self.full_conv.dict[key] = None - self.full_conv.drop.reset_mask(Z[:, self.input_dim :]) + self.full_conv.dropout.reset_mask(Z[:, self.input_dim :]) for dilation_per_level in self.dilations: Z = self.gating(Z, dilation=dilation_per_level, hc=hc) @@ -482,16 +509,37 @@ def forward(self, x): # Dimension (N, L, C) m = x.data.new(x.size(0), 1, x.size(2)).bernoulli_(1 - self.dropout) with 
torch.no_grad(): - mask = m / (1 - dropout) + mask = m / (1 - self.dropout) mask = mask.expand_as(x) return mask * x class OutputLayer(nn.Module): - def __init__(self, input_dim, num_labels, embed_dir, dropout): + def __init__( + self, + embed_dir, + dataset_dir, + mimic_dir, + static_dir, + version, + input_dim, + num_labels, + dropout=0.2, + pad_token="", + unk_token="", + ): super(OutputLayer, self).__init__() - self.word_embedding_layer = WordEmbeddingLayer(embed_dir, dropout) + self.word_embedding_layer = WordEmbeddingLayer( + embed_dir=embed_dir, + dataset_dir=dataset_dir, + mimic_dir=mimic_dir, + static_dir=static_dir, + version=version, + dropout=dropout, + pad_token=pad_token, + unk_token=unk_token, + ) self.U = nn.Linear(input_dim, num_labels) self.final = nn.Linear(input_dim, num_labels) From ec76041314ad81663c2d612c92939a4ac278698d Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Wed, 15 Jun 2022 19:24:14 +0530 Subject: [PATCH 4/5] Fix bugs --- configs/caml/caml_mimic3_50.yml | 1 + configs/caml/caml_mimic3_50_old.yml | 1 + configs/caml/caml_mimic3_full.yml | 1 + configs/caml/caml_mimic3_full_old.yml | 1 + configs/caml/cnn_mimic3_50.yml | 1 + configs/caml/drcaml_mimic3_50.yml | 1 + configs/dcan/mimic3_50.yml | 1 + configs/dcan/mimic3_50_old.yml | 1 + configs/dcan/mimic3_full.yml | 1 + configs/dcan/mimic3_full_old.yml | 1 + configs/fusion/mimic3_50.yml | 1 + configs/fusion/mimic3_50_old.yml | 1 + configs/fusion/mimic3_full.yml | 1 + configs/fusion/mimic3_full_old.yml | 1 + configs/gatedcnn_nci/gatedcnn_nci.py | 512 ------------------ ...tedcnn_nci_mimic3_50.yml => mimic3_50.yml} | 3 +- ...ci_mimic3_50_old.yml => mimic3_50_old.yml} | 3 +- ...nn_nci_mimic3_full.yml => mimic3_full.yml} | 3 +- ...imic3_full_old.yml => mimic3_full_old.yml} | 3 +- src/models/gatedcnn_nci.py | 14 +- src/trainers/base_trainer.py | 4 + 21 files changed, 35 insertions(+), 521 deletions(-) delete mode 100644 configs/gatedcnn_nci/gatedcnn_nci.py rename configs/gatedcnn_nci/{gatedcnn_nci_mimic3_50.yml => mimic3_50.yml} (97%) rename configs/gatedcnn_nci/{gatedcnn_nci_mimic3_50_old.yml => mimic3_50_old.yml} (97%) rename configs/gatedcnn_nci/{gatedcnn_nci_mimic3_full.yml => mimic3_full.yml} (97%) rename configs/gatedcnn_nci/{gatedcnn_nci_mimic3_full_old.yml => mimic3_full_old.yml} (97%) diff --git a/configs/caml/caml_mimic3_50.yml b/configs/caml/caml_mimic3_50.yml index 84f704c..47cc2e8 100644 --- a/configs/caml/caml_mimic3_50.yml +++ b/configs/caml/caml_mimic3_50.yml @@ -121,3 +121,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: false diff --git a/configs/caml/caml_mimic3_50_old.yml b/configs/caml/caml_mimic3_50_old.yml index bb2bd2f..6d9bf50 100644 --- a/configs/caml/caml_mimic3_50_old.yml +++ b/configs/caml/caml_mimic3_50_old.yml @@ -121,3 +121,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: false diff --git a/configs/caml/caml_mimic3_full.yml b/configs/caml/caml_mimic3_full.yml index 9a66d1c..7a4633f 100644 --- a/configs/caml/caml_mimic3_full.yml +++ b/configs/caml/caml_mimic3_full.yml @@ -121,3 +121,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: false diff --git a/configs/caml/caml_mimic3_full_old.yml b/configs/caml/caml_mimic3_full_old.yml index 39ba0ed..7d37859 100644 --- a/configs/caml/caml_mimic3_full_old.yml +++ b/configs/caml/caml_mimic3_full_old.yml @@ -121,3 +121,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: false diff --git 
a/configs/caml/cnn_mimic3_50.yml b/configs/caml/cnn_mimic3_50.yml index 1bd742e..074758e 100644 --- a/configs/caml/cnn_mimic3_50.yml +++ b/configs/caml/cnn_mimic3_50.yml @@ -117,3 +117,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: false diff --git a/configs/caml/drcaml_mimic3_50.yml b/configs/caml/drcaml_mimic3_50.yml index 979fd04..ee90972 100644 --- a/configs/caml/drcaml_mimic3_50.yml +++ b/configs/caml/drcaml_mimic3_50.yml @@ -121,3 +121,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: false diff --git a/configs/dcan/mimic3_50.yml b/configs/dcan/mimic3_50.yml index 788f329..e5a0a80 100644 --- a/configs/dcan/mimic3_50.yml +++ b/configs/dcan/mimic3_50.yml @@ -125,3 +125,4 @@ trainer: - name: micro_auc seed: 1 use_gpu: true + initialise_hidden_states: false diff --git a/configs/dcan/mimic3_50_old.yml b/configs/dcan/mimic3_50_old.yml index 380aacf..12ddfea 100644 --- a/configs/dcan/mimic3_50_old.yml +++ b/configs/dcan/mimic3_50_old.yml @@ -125,3 +125,4 @@ trainer: - name: micro_auc seed: 1 use_gpu: true + initialise_hidden_states: false diff --git a/configs/dcan/mimic3_full.yml b/configs/dcan/mimic3_full.yml index 03e6e11..95f6fc8 100644 --- a/configs/dcan/mimic3_full.yml +++ b/configs/dcan/mimic3_full.yml @@ -125,3 +125,4 @@ trainer: - name: micro_auc seed: 1 use_gpu: true + initialise_hidden_states: false diff --git a/configs/dcan/mimic3_full_old.yml b/configs/dcan/mimic3_full_old.yml index e21f402..2e69608 100644 --- a/configs/dcan/mimic3_full_old.yml +++ b/configs/dcan/mimic3_full_old.yml @@ -125,3 +125,4 @@ trainer: - name: micro_auc seed: 1 use_gpu: true + initialise_hidden_states: false diff --git a/configs/fusion/mimic3_50.yml b/configs/fusion/mimic3_50.yml index 05236c1..69e2276 100644 --- a/configs/fusion/mimic3_50.yml +++ b/configs/fusion/mimic3_50.yml @@ -126,3 +126,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: true diff --git a/configs/fusion/mimic3_50_old.yml b/configs/fusion/mimic3_50_old.yml index 53532cd..3ff6583 100644 --- a/configs/fusion/mimic3_50_old.yml +++ b/configs/fusion/mimic3_50_old.yml @@ -126,3 +126,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: false diff --git a/configs/fusion/mimic3_full.yml b/configs/fusion/mimic3_full.yml index af05013..a0f43f4 100644 --- a/configs/fusion/mimic3_full.yml +++ b/configs/fusion/mimic3_full.yml @@ -126,3 +126,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: false diff --git a/configs/fusion/mimic3_full_old.yml b/configs/fusion/mimic3_full_old.yml index 13b4659..9a953f4 100644 --- a/configs/fusion/mimic3_full_old.yml +++ b/configs/fusion/mimic3_full_old.yml @@ -126,3 +126,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: false diff --git a/configs/gatedcnn_nci/gatedcnn_nci.py b/configs/gatedcnn_nci/gatedcnn_nci.py deleted file mode 100644 index 4014637..0000000 --- a/configs/gatedcnn_nci/gatedcnn_nci.py +++ /dev/null @@ -1,512 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.nn.init import normal_, xavier_uniform_ - - -class GatedCNNEncoder(nn.Module): - def __init__(self, config): - super(GatedCNNEncoder, self).__init__() - self.max_length = config.max_length - self.dropout = config.dropout - self.input_dim = config.embed_size - self.hidden_dim = config.hidden_dim - self.output_dim = config.output_dim - self.bidirectional = config.bidirectional - self.use_description = 
config.use_description - - self.word_embedding_layer = WordEmbeddingLayer( - embed_dir=config.embed_dir, - dataset_dir=config.dataset_dir, - mimic_dir=config.mimic_dir, - static_dir=config.static_dir, - dropout=config.dropout, - pad_token=config.pad_token, - unk_token=config.unk_token, - ) - self.desc_vecs = self.word_embedding_layer.desc_vecs - - self.encoder = GatedCNN( - input_dim=config.input_dim, - hidden_dim=config.hidden_dim, - output_dim=config.output_dim, - kernel_size=config.kernel_size, - dropout=config.dropout, - init_mean=config.init_mean, - init_std=config.init_std, - levels=config.levels, - ) - - if self.bidirectional: - self.output_layer = OutputLayer( - input_dim=2 * config.input_dim, - num_labels=config.num_labels, - embed_dir=config.embed_dir, - ) - else: - self.output_layer = OutputLayer( - input_dim=config.input_dim, - num_labels=config.num_labels, - embed_dir=config.embed_dir, - ) - - self.variational_dropout = VariationalDropout(dropout=config.dropout) - - self.hidden = None - - def freeze_net(self): - for p in self.word_embedding_layer.embed.parameters(): - p.requires_grad = False - - def init_hidden(self, batch_size): - h_size = self.hidden_dim + self.output_dim - weight = next(self.parameters()).data - return ( - weight.new(batch_size, h_size, 1).zero_(), - weight.new(batch_size, h_size, 1).zero_(), - ) - - def _reverse_seq(self, X, mask, seq_max_len): - """ - X -> batch, seq_len, dim - mask -> batch, seq_len - """ - mask_sum = torch.sum(mask, 1).int() - xfs = [] - for x, c in zip(X, mask_sum): - xf = torch.flip(x[:c], [0]) - xfs.append(xf) - padded_rev = torch.zeros((len(xfs), X.size(1), X.size(2))).cuda() - for i, mat in enumerate(xfs): - padded_rev[i][: len(mat), :] = mat - return padded_rev - - def forward(self, data, desc): - """ - :param data: The input sequence, with dimesion (N, L) - :param desc: Whether to use code description - :return: logits, loss, hidden - """ - # If this is the first forward pass, we will initialise the hidden - # state. - if self.hidden is None: - self.init_hidden_flag = True - self.hidden = self.init_hidden(data.size(0)) - - # Look up the embeddings of all the tokens using the WordEmbeddingLayer. - # `emb` shape: (batch_size, max_length, embed_size) - emb, mask = self.word_embedding_layer(data) - - # If we want a bidirectional model, we reverse the sequence of - # tokens. - if self.bidirectional: - # `emb_reverse` shape: (batch_size, max_length, embed_size) - emb_reverse = self._reverse_seq(emb, mask, self.max_length) - # `emb_reverse` shape`: [batch_size, embed_size, max_length] - emb_reverse = emb_reverse.transpose(1, 2) - # `emb` shape: (batch_size, embed_size, max_length) - emb = emb.transpose(1, 2) - - # Pass the embeddings through the encoder. If the model is - # bidirectional, we pass the reverse embeddings as well. - raw_output, self.hidden = self.encoder(emb, self.hidden) - if self.bidirectional: - raw_out_reverse, self.hidden = self.encoder( - emb_reverse, self.hidden - ) - - output = self.variational_dropout(raw_output) - if self.bidirectional: - output_reverse = self._reverse_seq( - raw_out_reverse, mask, self.max_length - ) - output_reverse = self.variational_dropout(output_reverse) - output = torch.cat([output, output_reverse], dim=2) - - if self.use_description: - logits = self.output_layer(output, self.desc_vecs) - else: - logits = self.output_layer(output, None) - return logits - - -class WordEmbeddingLayer(nn.Module): - """ - A Word Embedding Layer. 
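Aside on _reverse_seq, which the forward pass above uses twice (once on the embeddings before the reverse pass, once on the reverse pass's raw output, so both directions line up token-for-token before concatenation): it flips only the non-padded prefix of every sequence, given the padding mask, and re-pads the result. A hedged sketch of the same idea that keeps the buffer on the input's device instead of calling .cuda() unconditionally, as the corresponding fix to src/models/gatedcnn_nci.py later in this patch also does; the function name here is illustrative:

import torch


def reverse_padded_sequences(x, mask):
    # x:    (batch, seq_len, dim) token representations
    # mask: (batch, seq_len), 1 for real tokens and 0 for padding
    lengths = mask.sum(dim=1).long().tolist()
    reversed_x = torch.zeros_like(x)  # same shape, dtype and device as x
    for i, length in enumerate(lengths):
        # Flip only the real tokens; padded positions stay zero.
        reversed_x[i, :length] = torch.flip(x[i, :length], dims=[0])
    return reversed_x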
This layer loads a pre-trained word embedding matrix - , and copies its weights to an nn.Embedding layer. - - Args: - embed_dir (str): A directory containing the pre-trained word embedding - matrix, among other things. Please see - https://github.com/dalgu90/icd-coding-benchmark/blob/main/src/modules/embeddings.py#L17 - for more details. - dropout (float): The dropout probability. - """ - - def __init__( - self, - embed_dir, - dataset_dir, - mimic_dir, - static_dir, - version, - dropout, - pad_token="", - unk_token="", - return_pad_mask=True, - use_description=True, - ): - super(WordEmbeddingLayer, self).__init__() - logger.debug( - f"Initialising {self.__class__.__name__} with " - f"embed_dir = {embed_dir}, dropout = {dropout}" - ) - - self.return_pad_mask = return_pad_mask - - # Note: This should be changed, since we won't always use Word2Vec. - embedding_cls = ConfigMapper.get_object("embeddings", "word2vec") - vocab = embedding_cls.load_vocab(embed_dir) - self.pad_token_id = vocab[pad_token] - self.unk_token_id = vocab[unk_token] - - W = torch.Tensor(embedding_cls.load_emb_matrix(embed_dir)) - self.embed = nn.Embedding(W.size()[0], W.size()[1], padding_idx=0) - self.embed.weight.data = W.clone() - - self.embedding_size = self.embed.embedding_dim - - self.dropout = nn.Dropout(dropout) - - if use_description: - dicts = load_lookups( - dataset_dir=dataset_dir, - mimic_dir=mimic_dir, - static_dir=static_dir, - word2vec_dir=embed_dir, - version=version, - ) - ind2c = dicts["ind2c"] - w2ind = dicts["w2ind"] - desc_dict = dicts["desc"] - self.desc_vecs = [] - for i, c in ind2c.items(): - self.desc_vecs.append( - [ - w2ind[w] if w in w2ind else self.unk_token_id - for w in desc_dict[c] - ] - ) - - # Pad and convert to torch tensor. - self.desc_vecs = torch.Tensor( - list(zip(*itertools.zip_longest(*self.desc_vecs, fillvalue=0))) - ) - - def forward(self, x): - embedding = self.embed(x) - x = self.dropout(embedding) - if self.return_pad_mask: - pad_mask = ~(batch == pad_token_id) - return x, pad_mask - return x - - -class VariationalHidDropout(nn.Module): - """ - Hidden-to-hidden (VD-based) dropout that applies the same mask at every - time step and every layer of TrellisNet. - - Args: - dropout (float): The dropout probability. - """ - - def __init__(self, dropout=0.0): - super(VariationalHidDropout, self).__init__() - self.dropout_probability = dropout - self.mask = None - - def reset_mask(self, input): - - # Dimension (N, C, L) - m = input.data.new(input.size(0), input.size(1), 1).bernoulli_( - 1 - self.dropout_probability - ) - with torch.no_grad(): - mask = m / (1 - self.dropout_probability) - self.mask = mask - return mask - - def forward(self, input): - # We don't apply dropout if the model is in eval mode. - if not self.training or self.dropout_probability == 0: - return input - - assert ( - self.mask is not None - ), "You need to reset mask before using VariationalHidDropout" - mask = self.mask.expand_as(input) # Make sure the dimension matches - return mask * input - - -class WeightShareConv1d(nn.Module): - """ - The weight-tied 1D convolution used in TrellisNet. - - Args: - input_dim (int): The dimension of the input. This is equivalent to - the number of input channels in the first - convolutional layer. - hidden_dim (int): The dimension of the hidden state. This is - equivalent to the number of input channels in the - second convolutional layer. - out_channels (int): The number of output channels in both - convolutional layers. 
- kernel_size (int): The size of the filter used in both - convolutional layers. - dropout (float): Dropout probability for the hidden-to-hidden - dropout layer. - init_mean (float): The mean of the normal distribution with which - weights of the convolutional layers are - initialised. - init_std (float): The standard deviation of the normal distribution - with which weights of the convolutional layers are - initialised. - """ - - def __init__( - self, - input_dim, - hidden_dim, - out_channels, - kernel_size, - dropout=0.0, - init_mean=0.0, - init_std=0.01, - ): - super(WeightShareConv1d, self).__init__() - - self.input_dim = input_dim - self.kernel_size = kernel_size - - self._dict = {} - - conv_layer_1 = nn.Conv1d( - in_channels=input_dim, - out_channels=out_channels, - kernel_size=kernel_size, - ) - self.weight_1 = conv_layer_1.weight - - conv_layer_2 = nn.Conv1d( - in_channels=hidden_dim, - out_channels=out_channels, - kernel_size=kernel_size, - ) - self.weight_2 = conv2.weight - self.bias_2 = conv2.bias - - self.init_conv_weights(init_mean, init_std) - - self.dropout = VariationalHidDropout(dropout=dropout) - - def init_conv_weights(self, init_mean, init_std): - self.weight_1.data.normal_(mean=init_mean, std=init_std) - self.weight_2.data.normal_(mean=init_mean, std=init_std) - self.bias_2.data.normal_(mean=init_mean, std=init_std) - - def forward(self, input, dilation, hid): - batch_size = input.size(0) - - padding = (self.kernel_size - 1) * dilation # Padding size. - x = F.pad(input=input, pad=(padding, 0)) # Pad with zeros. - - x_1 = x[:, : self.input_dim] - z_1 = x[:, self.input_dim :] - z_1[:, :, :padding] = hid[:batch_size, :, :].repeat(1, 1, padding) - - device = x_1.get_device() - - if (dilation, device) not in self.dict or self.dict[ - (dilation, device) - ] is None: - self.dict[(dilation, device)] = F.conv1d( - input=x_1, weight=self.weight1, dilation=dilation - ) - - z_1 = self.dropout(z_1) - injected = self.dict[(dilation, device)] + F.conv1d( - input=z_1, weight=self.weight2, bias=self.bias2, dilation=dilation - ) - return injected - - -class GatedCNN(nn.Module): - """ - Gated CNN module. - - Args: - input_dim (int): The dimension of the input. - hidden_dim (int): The hidden dimension. The hidden dimension for the - weight-shared Conv1D layer is - `hidden_dim + output_dim`. - output_dim (int): The output dimension. The number of output - channels of the weight-shared Conv1D layer is - `4 * (hidden_dim + output_dim)`. - kernel_size (int): The size of the filter used in - `WeightSharedConv1D`. - dropout (float): Dropout probability for the `WeightSharedConv1D`. - init_mean (float): The mean of the normal distribution with which - weights of the `WeightSharedConv1D` layer are - initialised. - init_std (float): The standard deviation of the normal distribution - with which weights of the `WeightSharedConv1D` - layer are initialised. 
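The WeightShareConv1d.forward deleted above (like its counterpart in src/models/gatedcnn_nci.py) is built on causal dilated 1-D convolution: the input is padded on the left by (kernel_size - 1) * dilation, so position t never sees future positions and the output keeps the input length. An illustrative sketch of just that step, with toy shapes chosen for the example:

import torch
import torch.nn.functional as F


def causal_dilated_conv1d(x, weight, bias=None, dilation=1):
    # x:      (batch, in_channels, seq_len)
    # weight: (out_channels, in_channels, kernel_size)
    kernel_size = weight.size(-1)
    padding = (kernel_size - 1) * dilation
    # Pad on the left only, so the convolution is causal and length-preserving.
    x = F.pad(x, (padding, 0))
    return F.conv1d(x, weight, bias=bias, dilation=dilation)


# Toy check: the sequence length is preserved for any dilation.
x = torch.randn(2, 8, 50)
w = torch.randn(16, 8, 3)
assert causal_dilated_conv1d(x, w, dilation=4).shape == (2, 16, 50)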
- """ - - def __init__( - self, - input_dim, - hidden_dim, - output_dim, - kernel_size, - dropout, - init_mean, - init_std, - levels, - ): - super(GatedCNN, self).__init__() - self.input_dim = input_dim - self.hidden_dim = hidden_dim - self.output_dim = output_dim - self.levels = levels - - self.hidden_dim_for_conv = hidden_dim + output_dim - - self.dilations = [i + 1 for i in range(levels)] - - self.full_conv = WeightShareConv1d( - input_dim=input_dim, - hidden_dim=self.hidden_dim_for_conv, - output_channels=4 * self.hidden_dim_for_conv, - kernel_size=kernel_size, - dropout=dropout, - init_mean=init_mean, - init_std=init_std, - ) - - self.ht = None - - def transform_input(self, X): - device = X.get_device() - if device == -1: - device = "cpu" - - batch_size = X.size(0) - seq_len = X.size(2) - - ht = torch.zeros(batch_size, self.hidden_dim_for_conv, seq_len).to( - device - ) - self.ct = torch.zeros(batch_size, self.hidden_dim_for_conv, seq_len).to( - device - ) - return torch.cat((X, ht), dim=1) - - def gating(self, Z, dilation=1, hc=None): - batch_size = Z.size(0) - (hid, cell) = hc - - out = self.full_conv(input=Z, dilation=dilation, hid=hid) - - ct_1 = F.pad(self.ct, (dilation, 0))[:, :, :-dilation] - ct_1[:, :, :dilation] = cell[:batch_size].repeat(1, 1, dilation) - - it = torch.sigmoid(out[:, : self.hidden_dim_for_conv]) - ot = torch.sigmoid( - out[:, self.hidden_dim_for_conv : 2 * self.hidden_dim_for_conv] - ) - gt = torch.tanh( - out[:, 2 * self.hidden_dim_for_conv : 3 * self.hidden_dim_for_conv] - ) - ft = torch.sigmoid( - out[:, 3 * self.hidden_dim_for_conv : 4 * self.hidden_dim_for_conv] - ) - self.ct = ft * ct_1 + it * gt - ht = ot * torch.tanh(self.ct) - - Z = torch.cat((Z[:, : self.input_dim], ht), dim=1) - return Z - - def forward(self, emb, hc): - Z = self.transform_input(emb) - for key in self.full_conv.dict: - if key[1] == emb.get_device(): - self.full_conv.dict[key] = None - self.full_conv.drop.reset_mask(Z[:, self.input_dim :]) - - for dilation_per_level in self.dilations: - Z = self.gating(Z, dilation=dilation_per_level, hc=hc) - - out = Z[:, -self.output_dim :].transpose(1, 2) - hc = (Z[:, self.input_dim :, -1:], self.ct[:, :, -1:]) - return out, hc - - -class VariationalDropout(nn.Module): - """ - Feed-forward version of variational dropout that applies the same mask - at every time step. 
- """ - - def __init__(self, dropout=0.5, dim=3): - super(VariationalDropout, self).__init__() - assert dim in (3, 4), "`dim` should be either 3 or 4" - self.dropout = dropout - self.dim = dim - - def forward(self, x): - if not self.training or not self.dropout: - return x - - if self.dim == 4: - # Dimension (M, N, L, C), where C stands for channels - m = x.data.new(x.size(0), x.size(1), 1, x.size(3)).bernoulli_( - 1 - self.dropout - ) - else: - # Dimension (N, L, C) - m = x.data.new(x.size(0), 1, x.size(2)).bernoulli_(1 - self.dropout) - with torch.no_grad(): - mask = m / (1 - dropout) - mask = mask.expand_as(x) - return mask * x - - -class OutputLayer(nn.Module): - def __init__(self, input_dim, num_labels, embed_dir, dropout): - super(OutputLayer, self).__init__() - - self.word_embedding_layer = WordEmbeddingLayer(embed_dir, dropout) - - self.U = nn.Linear(input_dim, num_labels) - self.final = nn.Linear(input_dim, num_labels) - self.proj_layer = nn.Linear(input_dim, 1, bias=False) - - xavier_uniform_(self.U.weight) - xavier_uniform_(self.final.weight) - - def forward(self, x, desc): - if desc is not None: - desc_vec, _ = self.word_embedding_layer(desc) - desc_vec = torch.mean(desc_vec, dim=1).unsqueeze(0) - mmt = desc_vec.matmul(x.transpose(1, 2)) - else: - mmt = self.U.weight.matmul(x.transpose(1, 2)) - - m = mmt.matmul(x) - - y = self.final.weight.mul(m) - logits = self.proj_layer(y).squeeze(-1).add(self.final.bias) - - return logits diff --git a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50.yml b/configs/gatedcnn_nci/mimic3_50.yml similarity index 97% rename from configs/gatedcnn_nci/gatedcnn_nci_mimic3_50.yml rename to configs/gatedcnn_nci/mimic3_50.yml index 3613885..5f0b893 100644 --- a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50.yml +++ b/configs/gatedcnn_nci/mimic3_50.yml @@ -42,7 +42,7 @@ model: input_dim: 100 hidden_dim: 100 output_dim: 50 - bidirectional: true + bidirectional: false use_description: true pad_token: "" unk_token: "" @@ -125,3 +125,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: true diff --git a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50_old.yml b/configs/gatedcnn_nci/mimic3_50_old.yml similarity index 97% rename from configs/gatedcnn_nci/gatedcnn_nci_mimic3_50_old.yml rename to configs/gatedcnn_nci/mimic3_50_old.yml index 4ccca3a..ec8f0b2 100644 --- a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_50_old.yml +++ b/configs/gatedcnn_nci/mimic3_50_old.yml @@ -42,7 +42,7 @@ model: input_dim: 100 hidden_dim: 100 output_dim: 50 - bidirectional: true + bidirectional: false use_description: true pad_token: "" unk_token: "" @@ -125,3 +125,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: true diff --git a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full.yml b/configs/gatedcnn_nci/mimic3_full.yml similarity index 97% rename from configs/gatedcnn_nci/gatedcnn_nci_mimic3_full.yml rename to configs/gatedcnn_nci/mimic3_full.yml index de9b1ab..8a833ad 100644 --- a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full.yml +++ b/configs/gatedcnn_nci/mimic3_full.yml @@ -42,7 +42,7 @@ model: input_dim: 100 hidden_dim: 100 output_dim: 8922 - bidirectional: true + bidirectional: false use_description: true pad_token: "" unk_token: "" @@ -125,3 +125,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: true diff --git a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full_old.yml b/configs/gatedcnn_nci/mimic3_full_old.yml similarity index 97% rename from 
configs/gatedcnn_nci/gatedcnn_nci_mimic3_full_old.yml rename to configs/gatedcnn_nci/mimic3_full_old.yml index c95f4aa..d76c03b 100644 --- a/configs/gatedcnn_nci/gatedcnn_nci_mimic3_full_old.yml +++ b/configs/gatedcnn_nci/mimic3_full_old.yml @@ -42,7 +42,7 @@ model: input_dim: 100 hidden_dim: 100 output_dim: 8922 - bidirectional: true + bidirectional: false use_description: true pad_token: "" unk_token: "" @@ -125,3 +125,4 @@ trainer: - name: micro_auc seed: 1337 use_gpu: true + initialise_hidden_states: true diff --git a/src/models/gatedcnn_nci.py b/src/models/gatedcnn_nci.py index 84ae5e8..bd1d765 100644 --- a/src/models/gatedcnn_nci.py +++ b/src/models/gatedcnn_nci.py @@ -68,7 +68,7 @@ def __init__(self, config): static_dir=config.static_dir, version=config.version, input_dim=config.input_dim, - num_labels=config.num_labels, + num_labels=config.output_dim, dropout=config.dropout, pad_token=config.pad_token, unk_token=config.unk_token, @@ -85,7 +85,7 @@ def freeze_net(self): def init_hidden(self, batch_size): h_size = self.hidden_dim + self.output_dim weight = next(self.parameters()).data - return ( + self.hidden = ( weight.new(batch_size, h_size, 1).zero_(), weight.new(batch_size, h_size, 1).zero_(), ) @@ -95,12 +95,16 @@ def _reverse_seq(self, X, mask, seq_max_len): X -> batch, seq_len, dim mask -> batch, seq_len """ + device = X.get_device() + if device == -1: + device = "cpu" + mask_sum = torch.sum(mask, 1).int() xfs = [] for x, c in zip(X, mask_sum): xf = torch.flip(x[:c], [0]) xfs.append(xf) - padded_rev = torch.zeros((len(xfs), X.size(1), X.size(2))).cuda() + padded_rev = torch.zeros((len(xfs), X.size(1), X.size(2))).to(device) for i, mat in enumerate(xfs): padded_rev[i][: len(mat), :] = mat return padded_rev @@ -552,11 +556,11 @@ def forward(self, x, desc): if desc is not None: desc_vec, _ = self.word_embedding_layer(desc) desc_vec = torch.mean(desc_vec, dim=1).unsqueeze(0) - mmt = desc_vec.matmul(x.transpose(1, 2)) + mmt = x.matmul(desc_vec) else: mmt = self.U.weight.matmul(x.transpose(1, 2)) - m = mmt.matmul(x) + m = x.transpose(1, 2).matmul(mmt) y = self.final.weight.mul(m) logits = self.proj_layer(y).squeeze(-1).add(self.final.bias) diff --git a/src/trainers/base_trainer.py b/src/trainers/base_trainer.py index a1a70dc..731e527 100755 --- a/src/trainers/base_trainer.py +++ b/src/trainers/base_trainer.py @@ -174,6 +174,10 @@ def train(self, model, train_dataset, val_dataset=None): batch_inputs = batch_inputs.cuda() batch_labels = batch_labels.cuda() + # Initialise the hidden states. 
+ if self.config.initialise_hidden_states: + model.init_hidden(batch_inputs.size(0)) + batch_outputs = model(batch_inputs) batch_loss = self.loss_fn( input=batch_outputs, target=batch_labels From a98a0d40cdaf78cc5e701164cc1218e69a74796c Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Wed, 22 Jun 2022 18:37:29 +0530 Subject: [PATCH 5/5] Fix test issue --- configs/gatedcnn_nci/mimic3_50_old.yml | 4 ++-- src/trainers/base_trainer.py | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/configs/gatedcnn_nci/mimic3_50_old.yml b/configs/gatedcnn_nci/mimic3_50_old.yml index ec8f0b2..4eef858 100644 --- a/configs/gatedcnn_nci/mimic3_50_old.yml +++ b/configs/gatedcnn_nci/mimic3_50_old.yml @@ -66,9 +66,9 @@ trainer: optimizer: name: adam params: - lr: 0.000001 + lr: 0.01 weight_decay: 0.0 - max_epochs: 200 + max_epochs: 100 lr_scheduler: null stopping_criterion: metric: diff --git a/src/trainers/base_trainer.py b/src/trainers/base_trainer.py index 731e527..506cd47 100755 --- a/src/trainers/base_trainer.py +++ b/src/trainers/base_trainer.py @@ -397,6 +397,11 @@ def _forward_epoch(self, model, dataset=None, dataloader=None): if self.config.use_gpu: batch_inputs = batch_inputs.cuda() batch_labels = batch_labels.cuda() + + # Initialise the hidden states. + if self.config.initialise_hidden_states: + model.init_hidden(batch_inputs.size(0)) + batch_outputs = model(batch_inputs) epoch_labels.append(batch_labels.cpu()) epoch_outputs.append(batch_outputs.cpu())
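End-to-end, the trainer changes in this series re-zero the recurrent state of GatedCNNNCI once per batch (init_hidden now stores the (h, c) pair on the module instead of returning it), guarded by the new initialise_hidden_states flag so the other models are unaffected. A condensed, hypothetical sketch of that interaction; the loop and argument names are placeholders, not the repository's trainer API:

def train_one_epoch(model, dataloader, loss_fn, optimizer, initialise_hidden_states=True):
    model.train()
    for batch_inputs, batch_labels in dataloader:
        # Reset (h, c) to zeros sized for this batch, so hidden state never
        # leaks across batches of different sizes.
        if initialise_hidden_states and hasattr(model, "init_hidden"):
            model.init_hidden(batch_inputs.size(0))
        logits = model(batch_inputs)
        loss = loss_fn(input=logits, target=batch_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()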