diff --git a/src/models/__init__.py b/src/models/__init__.py
index 4f5ec99..52bcc95 100644
--- a/src/models/__init__.py
+++ b/src/models/__init__.py
@@ -1,3 +1,3 @@
 from src.models.caml import ConvAttnPool as CAML
 from src.models.caml import VanillaConv as CNN
-from src.models.dcan import DCAN
+from src.models.dcan.model import DCAN
diff --git a/src/models/dcan.py b/src/models/dcan.py
deleted file mode 100644
index b2e27ed..0000000
--- a/src/models/dcan.py
+++ /dev/null
@@ -1,148 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.nn.init import xavier_uniform_
-
-from src.modules.activations import *
-from src.modules.layers.label_wise_attn import LabelWiseAttn
-from src.modules.layers.temporal_conv_net import TemporalConvNet
-from src.modules.layers.word_embedding_layer import WordEmbeddingLayer
-from src.utils.mapper import ConfigMapper
-from src.utils.text_loggers import get_logger
-
-logger = get_logger(__name__)
-
-
-@ConfigMapper.map("models", "dcan")
-class DCAN(nn.Module):
-    """
-    This class is used to create the DCAN model.
-    References:
-        Paper: https://aclanthology.org/2020.clinicalnlp-1.8/
-        GitHub Repository: https://github.com/shaoxiongji/DCAN
-    For the parameters related to convolutional layers, please see this:
-    https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html.
-    Args:
-        num_classes (int): Number of classes (ICD codes).
-        conv_channel_sizes (list): List of lists of integers. Each list
-                                   represents the channel sizes of convolutional
-                                   layers in a `TemporalBlock`. So, for example,
-                                   if the list is [[100, 600, 600],
-                                                   [600, 600, 600]].
-                                   the `TemporalConvNet` layer will have 2
-                                   `TemporalBlock`s, each temporal block have
-                                   2 convolutional layers:
-                                   Conv(100, 600), Conv(600, 600) for the first
-                                   one, and Conv(600, 600), Conv(600, 600). If
-                                   the `add_emb_size_to_channel_sizes`, we don't
-                                   have to pass the input channel size. So, in
-                                   the above case, we can just pass
-                                   [[600, 600], [600, 600, 600]].
-        add_emb_size_to_channel_sizes (bool): If True, you need not specify
-                                              the input channel size. Please
-                                              see the description of
-                                              `conv_channel_sizes`.
-        kernel_sizes (list): List of list of integers (same format as
-                             `conv_channel_sizes`). Each integer represents the
-                             kernel size/filter size of the respective
-                             convolutional layer in `TemporalBlock` layer.
-        strides (list): List of list of integers (same format as
-                        `conv_channel_sizes`). Each integer represents the
-                        stride of the respective convolutional layer in
-                        `TemporalBlock` layer.
-        paddings (list): List of list of integers (same format as
-                         `conv_channel_sizes`). Each integer represents the
-                         padding of the respective convolutional layer in
-                         `TemporalBlock` layer. in DCAN, this value is set to
-                         "(kernel_size - 1) * dilation_size".
-        dilations (list): List of list of integers (same format as
-                          `conv_channel_sizes`). Each integer represents the
-                          dilation size of the respective convolutional layer
-                          `TemporalBlock` layer.` In DCAN, this value is
-                          "2^(temporal_block_level)".
-        dropouts (list): List of list of floats (same format as
-                         `conv_channel_sizes`). Each float represents the
-                         dropout probability of the respective convolutional
-                         `TemporalBlock` layer.
-        weight_norm (bool): If True, apply weight normalization to the
-                            convolutional layers.
-        activation (str): Activation function to use. Should be one of "relu",
-                          "elu", "leaky_relu".
-    """
-
-    def __init__(self, config):
-        super(DCAN, self).__init__()
-        logger.info(f"Initialising {self.__class__.__name__}")
-        logger.debug(
-            f"Initialising {self.__class__.__name__} with " f"config: {config}"
-        )
-
-        self.config = config
-
-        self.word_embedding_layer = WordEmbeddingLayer(
-            **config.word_representation_layer.params.init_params.as_dict()
-        )
-        if config.word_representation_layer.params.freeze_layer:
-            self.freeze_layer(self.word_embedding_layer.embed)
-
-        num_levels = len(config.kernel_sizes)
-        num_inner_conv_levels = len(config.kernel_sizes[0])
-
-        conv_channel_sizes = config.conv_channel_sizes
-        if config.add_emb_size_to_channel_sizes:
-            conv_channel_sizes[0] = [
-                self.word_embedding_layer.embedding_size
-            ] + conv_channel_sizes[0]
-        dropouts = [
-            [config.dropout for _ in range(num_inner_conv_levels)]
-            for _ in range(num_levels)
-        ]
-
-        self.temporal_conv_net = TemporalConvNet(
-            conv_channel_sizes_=conv_channel_sizes,
-            kernel_sizes_=config.kernel_sizes,
-            strides_=config.strides,
-            paddings_=config.paddings,
-            dilations_=config.dilations,
-            dropouts_=dropouts,
-            weight_norm=config.weight_norm,
-            activation=config.activation,
-        )
-
-        self.linear_layer = nn.Linear(
-            conv_channel_sizes[-1][-1], config.projection_size
-        )
-        self.activation = ConfigMapper.get_object(
-            "activations", config.activation
-        )()
-
-        self.output_layer = OutputLayer(
-            config.projection_size, config.num_classes
-        )
-
-        xavier_uniform_(self.linear_layer.weight)
-
-    def forward(self, data):
-        x = self.word_embedding_layer(data)
-        hid_seq = self.temporal_conv_net(x.transpose(1, 2)).transpose(1, 2)
-        hid_seq = self.activation(self.linear_layer(hid_seq))
-        logits = self.output_layer(hid_seq)
-        return logits
-
-    def freeze_layer(self, layer):
-        for param in layer.parameters():
-            param.requires_grad = False
-
-
-class OutputLayer(nn.Module):
-    def __init__(self, input_size, num_classes):
-        super(OutputLayer, self).__init__()
-        self.label_wise_attn = LabelWiseAttn(input_size, num_classes)
-
-        self.final = nn.Linear(input_size, num_classes)
-        xavier_uniform_(self.final.weight)
-
-    def forward(self, x):
-        m = self.label_wise_attn(x)
-        logits = self.final.weight.mul(m).sum(dim=2).add(self.final.bias)
-        return logits
diff --git a/src/modules/layers/__init__.py b/src/models/dcan/__init__.py
similarity index 100%
rename from src/modules/layers/__init__.py
rename to src/models/dcan/__init__.py
diff --git a/src/models/dcan/model.py b/src/models/dcan/model.py
new file mode 100644
index 0000000..c2cb900
--- /dev/null
+++ b/src/models/dcan/model.py
@@ -0,0 +1,508 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.init import xavier_uniform_
+from torch.nn.utils import weight_norm as weight_norm_
+
+from src.modules.activations import *
+from src.utils.mapper import ConfigMapper
+from src.utils.text_loggers import get_logger
+
+logger = get_logger(__name__)
+
+
+@ConfigMapper.map("models", "dcan")
+class DCAN(nn.Module):
+    """
+    This class is used to create the DCAN model.
+    References:
+        Paper: https://aclanthology.org/2020.clinicalnlp-1.8/
+        GitHub Repository: https://github.com/shaoxiongji/DCAN
+    For the parameters related to convolutional layers, please see this:
+    https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html.
+    Args:
+        num_classes (int): Number of classes (ICD codes).
+        conv_channel_sizes (list): List of lists of integers. Each list
+                                   represents the channel sizes of convolutional
+                                   layers in a `TemporalBlock`. So, for example,
+                                   if the list is [[100, 600, 600],
+                                                   [600, 600, 600]].
+                                   the `TemporalConvNet` layer will have 2
+                                   `TemporalBlock`s, each temporal block have
+                                   2 convolutional layers:
+                                   Conv(100, 600), Conv(600, 600) for the first
+                                   one, and Conv(600, 600), Conv(600, 600). If
+                                   the `add_emb_size_to_channel_sizes`, we don't
+                                   have to pass the input channel size. So, in
+                                   the above case, we can just pass
+                                   [[600, 600], [600, 600, 600]].
+        add_emb_size_to_channel_sizes (bool): If True, you need not specify
+                                              the input channel size. Please
+                                              see the description of
+                                              `conv_channel_sizes`.
+        kernel_sizes (list): List of list of integers (same format as
+                             `conv_channel_sizes`). Each integer represents the
+                             kernel size/filter size of the respective
+                             convolutional layer in `TemporalBlock` layer.
+        strides (list): List of list of integers (same format as
+                        `conv_channel_sizes`). Each integer represents the
+                        stride of the respective convolutional layer in
+                        `TemporalBlock` layer.
+        paddings (list): List of list of integers (same format as
+                         `conv_channel_sizes`). Each integer represents the
+                         padding of the respective convolutional layer in
+                         `TemporalBlock` layer. in DCAN, this value is set to
+                         "(kernel_size - 1) * dilation_size".
+        dilations (list): List of list of integers (same format as
+                          `conv_channel_sizes`). Each integer represents the
+                          dilation size of the respective convolutional layer
+                          `TemporalBlock` layer.` In DCAN, this value is
+                          "2^(temporal_block_level)".
+        dropouts (list): List of list of floats (same format as
+                         `conv_channel_sizes`). Each float represents the
+                         dropout probability of the respective convolutional
+                         `TemporalBlock` layer.
+        weight_norm (bool): If True, apply weight normalization to the
+                            convolutional layers.
+        activation (str): Activation function to use. Should be one of "relu",
+                          "elu", "leaky_relu".
+    """
+
+    def __init__(self, config):
+        super(DCAN, self).__init__()
+        logger.info(f"Initialising {self.__class__.__name__}")
+        logger.debug(
+            f"Initialising {self.__class__.__name__} with " f"config: {config}"
+        )
+
+        self.config = config
+
+        self.word_embedding_layer = WordEmbeddingLayer(
+            **config.word_representation_layer.params.init_params.as_dict()
+        )
+        if config.word_representation_layer.params.freeze_layer:
+            self.freeze_layer(self.word_embedding_layer.embed)
+
+        num_levels = len(config.kernel_sizes)
+        num_inner_conv_levels = len(config.kernel_sizes[0])
+
+        conv_channel_sizes = config.conv_channel_sizes
+        if config.add_emb_size_to_channel_sizes:
+            conv_channel_sizes[0] = [
+                self.word_embedding_layer.embedding_size
+            ] + conv_channel_sizes[0]
+        dropouts = [
+            [config.dropout for _ in range(num_inner_conv_levels)]
+            for _ in range(num_levels)
+        ]
+
+        self.temporal_conv_net = TemporalConvNet(
+            conv_channel_sizes_=conv_channel_sizes,
+            kernel_sizes_=config.kernel_sizes,
+            strides_=config.strides,
+            paddings_=config.paddings,
+            dilations_=config.dilations,
+            dropouts_=dropouts,
+            weight_norm=config.weight_norm,
+            activation=config.activation,
+        )
+
+        self.linear_layer = nn.Linear(
+            conv_channel_sizes[-1][-1], config.projection_size
+        )
+        self.activation = ConfigMapper.get_object(
+            "activations", config.activation
+        )()
+
+        self.output_layer = OutputLayer(
+            config.projection_size, config.num_classes
+        )
+
+        xavier_uniform_(self.linear_layer.weight)
+
+    def forward(self, data):
+        x = self.word_embedding_layer(data)
+        hid_seq = self.temporal_conv_net(x.transpose(1, 2)).transpose(1, 2)
+        hid_seq = self.activation(self.linear_layer(hid_seq))
+        logits = self.output_layer(hid_seq)
+        return logits
+
+    def freeze_layer(self, layer):
+        for param in layer.parameters():
+            param.requires_grad = False
+
+
+class WordEmbeddingLayer(nn.Module):
+    """
+    A Word Embedding Layer. This layer loads a pre-trained word embedding matrix
+    , and copies its weights to an nn.Embedding layer.
+
+    Args:
+        embed_dir (str): A directory containing the pre-trained word embedding
+                         matrix, among other things. Please see
+                         https://github.com/dalgu90/icd-coding-benchmark/blob/main/src/modules/embeddings.py#L17
+                         for more details.
+        dropout (float): The dropout probability.
+    """
+
+    def __init__(self, embed_dir, dropout):
+        super(WordEmbeddingLayer, self).__init__()
+        logger.debug(
+            f"Initialising {self.__class__.__name__} with "
+            f"embed_dir = {embed_dir}, dropout = {dropout}"
+        )
+
+        # Note: This should be changed, since we won't always use Word2Vec.
+        embedding_cls = ConfigMapper.get_object("embeddings", "word2vec")
+
+        W = torch.Tensor(embedding_cls.load_emb_matrix(embed_dir))
+        self.embed = nn.Embedding(W.size()[0], W.size()[1], padding_idx=0)
+        self.embed.weight.data = W.clone()
+
+        self.embedding_size = self.embed.embedding_dim
+
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        embedding = self.embed(x)
+        x = self.dropout(embedding)
+        return x
+
+
+class Chomp1d(nn.Module):
+    def __init__(self, chomp_size):
+        super(Chomp1d, self).__init__()
+        self.chomp_size = chomp_size
+
+    def forward(self, x):
+        return x[:, :, : -self.chomp_size].contiguous()
+
+
+class ConvTemporalSubBlock(nn.Module):
+    """
+    A simple temporal convolutional block. Adapted from
+    https://github.com/shaoxiongji/DCAN/blob/master/models.py#L84-L88. This
+    layer has a dilated convolutional layer, a `chomp1d` layer, followed by
+    activation and dropout. For the parameters related to convolutional layers,
+    please see this:
+    https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html.
+
+    Args:
+        in_channels (int): The number of input channels in the convolutional
+                           layer.
+        out_channels (int): The number of output channels in the convolutional
+                            layer.
+        kernel_size (int): The size of the kernel in the convolutional layer.
+        stride (int): The stride of the convolutional layer.
+        padding (int): The padding of the convolutional layer.
+        dilation (int): The dilation size of the convolutional layer.
+        dropout (float): The dropout probability.
+        weight_norm (bool): Whether to apply weight normalization to the
+                            convolutional layer.
+        activation (str): The activation function to use. DCAN uses "relu".
+                          For all available activations, see
+                          https://github.com/dalgu90/icd-coding-benchmark/blob/main/src/modules/activations.py.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        dilation,
+        dropout=0.2,
+        weight_norm=True,
+        activation="relu",
+    ):
+        super(ConvTemporalSubBlock, self).__init__()
+        logger.debug(
+            f"Initialising {self.__class__.__name__} with "
+            f"in_channels = {in_channels}, out_channels = "
+            f"{out_channels}, kernel_size = {kernel_size}, "
+            f"stride = {stride}, padding = {padding}, "
+            f"dilation = {dilation}, dropout = {dropout}, "
+            f"weight_norm = {weight_norm}, activation = {activation}"
+        )
+
+        self.conv_layer = nn.Conv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+        )
+        if weight_norm:
+            self.conv_layer = weight_norm_(self.conv_layer)
+        self.chomp1d = Chomp1d(padding)
+        self.activation = ConfigMapper.get_object("activations", activation)()
+        self.dropout = nn.Dropout(dropout)
+
+        self.__init_weights__()
+
+    def __init_weights__(self):
+        xavier_uniform_(self.conv_layer.weight)
+
+    def forward(self, x):
+        x = self.conv_layer(x)
+        x = self.chomp1d(x)
+        x = self.activation(x)
+        x = self.dropout(x)
+        return x
+
+
+class TemporalBlock(nn.Module):
+    """
+    A Temporal Block containing stacks of `ConvTemporalSubBlocks`, followed
+    by activation.
+    References:
+        Paper: https://arxiv.org/abs/2009.14578
+        Repository:
+        https://github.com/shaoxiongji/DCAN/blob/master/models.py#L81
+
+    Args:
+        conv_channel_sizes (list): List of integers, with channel sizes of
+                                   convolutional layers. For example, if the
+                                   list is [100, 200, 300], there will be two
+                                   convolutional layers: Conv1d(100, 200) and
+                                   Conv1d(200, 300).
+        kernel_sizes (list): List of integers, with kernel sizes of every
+                             `ConvTemporalSubBlock`.
+        strides (list): List of integers, with strides of convolutional layers.
+        paddings (list): List of integers, with paddings of every
+                         `ConvTemporalSubBlock`.
+        dilations (list): List of integers, with dilation sizes of every
+                          `ConvTemporalSubBlock`.
+        dropouts (list): List of floats, with dropout probabilities of every
+                         `ConvTemporalSubBlock`.
+        weight_norm (bool): Whether to apply weight normalization to every
+                             convolutional layer. DCAN uses weight norm.
+        activation (str): The activation function to use. DCAN uses "relu".
+    """
+
+    def __init__(
+        self,
+        conv_channel_sizes,
+        kernel_sizes,
+        strides,
+        paddings,
+        dilations,
+        dropouts,
+        weight_norm=True,
+        activation="relu",
+    ):
+        super(TemporalBlock, self).__init__()
+        conv_channel_size_pairs = list(
+            zip(conv_channel_sizes[:-1], conv_channel_sizes[1:])
+        )
+
+        self.conv_temporal_sub_blocks = nn.ModuleList(
+            [
+                ConvTemporalSubBlock(
+                    in_channels=conv_channel_size_pair[0],
+                    out_channels=conv_channel_size_pair[1],
+                    kernel_size=kernel_size,
+                    stride=stride,
+                    padding=padding,
+                    dilation=dilation,
+                    dropout=dropout,
+                    weight_norm=weight_norm,
+                    activation=activation,
+                )
+                for (
+                    conv_channel_size_pair,
+                    kernel_size,
+                    stride,
+                    padding,
+                    dilation,
+                    dropout,
+                ) in zip(
+                    conv_channel_size_pairs,
+                    kernel_sizes,
+                    strides,
+                    paddings,
+                    dilations,
+                    dropouts,
+                )
+            ]
+        )
+
+        self.downsample = (
+            nn.Conv1d(conv_channel_sizes[0], conv_channel_sizes[-1], 1)
+            if conv_channel_sizes[0] != conv_channel_sizes[-1]
+            else None
+        )
+        self.output_activation = ConfigMapper.get_object(
+            "activations", activation
+        )()
+
+        self.init_weights()
+
+    def init_weights(self):
+        if self.downsample is not None:
+            xavier_uniform_(self.downsample.weight)
+
+    def forward(self, x):
+        conv_layer_output = x
+        for conv_temporal_sub_block in self.conv_temporal_sub_blocks:
+            conv_layer_output = conv_temporal_sub_block(conv_layer_output)
+        res = x if self.downsample is None else self.downsample(x)
+        return self.output_activation(conv_layer_output + res)
+
+
+class TemporalConvNet(nn.Module):
+    """
+    Stack of `TemporalBlock`s. Used in the DCAN model.
+    References:
+        Paper: https://arxiv.org/abs/2009.14578
+        Repository:
+        https://github.com/shaoxiongji/DCAN/blob/master/models.py#L114
+
+    Args:
+        conv_channel_sizes_ (list): List of lists of integers. Each list
+                                    represents the channel sizes of
+                                    convolutional layers in a `TemporalBlock`.
+                                    So, for example, if the list is
+                                    [
+                                        [100, 600, 600],
+                                        [600, 600, 600]
+                                    ],
+                                    the `TemporalConvNet` layer will have 2
+                                    `TemporalBlock`s, each temporal block have
+                                    2 convolutional layers:
+                                    Conv(100, 600), Conv(600, 600) for the first
+                                    one, and Conv(600, 600), Conv(600, 600). If
+                                    the `add_emb_size_to_channel_sizes`, we
+                                    don't have to pass the input channel size.
+                                    So, in the above case, we can just pass
+                                    [[600, 600], [600, 600, 600]].
+        kernel_sizes_ (list): List of list of integers (same format as
+                              `conv_channel_sizes`). Each integer represents the
+                              kernel size/filter size of the respective
+                              convolutional layer in `TemporalBlock` layer.
+        strides_ (list): List of list of integers (same format as
+                         `conv_channel_sizes`). Each integer represents the
+                         stride of the respective convolutional layer in
+                         `TemporalBlock` layer.
+        paddings_ (list): List of list of integers (same format as
+                          `conv_channel_sizes`). Each integer represents the
+                          padding of the respective convolutional layer in
+                          `TemporalBlock` layer. in DCAN, this value is set to
+                          "(kernel_size - 1) * dilation_size".
+        dilations_ (list): List of list of integers (same format as
+                           `conv_channel_sizes`). Each integer represents the
+                           dilation size of the respective convolutional layer
+                           `TemporalBlock` layer.` In DCAN, this value is
+                           "2^(temporal_block_level)".
+        dropouts_ (list): List of list of floats (same format as
+                          `conv_channel_sizes`). Each float represents the
+                          dropout probability of the respective convolutional
+                          `TemporalBlock` layer.
+        weight_norm (bool): If True, apply weight normalization to the
+                            convolutional layers.
+        activation (str): Activation function to use. DCAN uses "relu".
+    """
+
+    def __init__(
+        self,
+        conv_channel_sizes_,
+        kernel_sizes_,
+        strides_,
+        paddings_,
+        dilations_,
+        dropouts_,
+        weight_norm=True,
+        activation="relu",
+    ):
+        super(TemporalConvNet, self).__init__()
+        logger.debug(
+            f"Initialising {self.__class__.__name__} with "
+            f"conv_channel_sizes_ = {conv_channel_sizes_}, "
+            f"kernel_sizes_ = {kernel_sizes_}, "
+            f"strides_ = {strides_}, paddings_ = {paddings_}, "
+            f"dilations_ = {dilations_}, dropouts_ = {dropouts_}, "
+            f"weight_norm = {weight_norm}, activation = {activation}"
+        )
+
+        self.temporal_blocks = nn.ModuleList(
+            [
+                TemporalBlock(
+                    conv_channel_sizes=conv_channel_sizes,
+                    kernel_sizes=kernel_sizes,
+                    strides=strides,
+                    paddings=paddings,
+                    dilations=dilations,
+                    dropouts=dropouts,
+                    weight_norm=weight_norm,
+                    activation=activation,
+                )
+                for (
+                    conv_channel_sizes,
+                    kernel_sizes,
+                    strides,
+                    paddings,
+                    dilations,
+                    dropouts,
+                ) in zip(
+                    conv_channel_sizes_,
+                    kernel_sizes_,
+                    strides_,
+                    paddings_,
+                    dilations_,
+                    dropouts_,
+                )
+            ]
+        )
+
+    def forward(self, x):
+        for temporal_block in self.temporal_blocks:
+            x = temporal_block(x)
+        return x
+
+
+class LabelWiseAttn(nn.Module):
+    """
+    A Label-wise Attention layer (as implemented in CAML, DCAN, etc.).
+    References:
+        Papers: https://arxiv.org/abs/1802.05695 (Section 2.2)
+        Repository:
+        https://github.com/jamesmullenbach/caml-mimic/blob/master/learn/models.py#L184
+
+    Args:
+        input_size (int): The size of the input, i.e., the number of channels
+                          if the output is from a convolutional layer/embedding
+                          size if the output is from a fully connected layer.
+        num_classes (int): The number of classes.
+    """
+
+    def __init__(self, input_size, num_classes):
+        super(LabelWiseAttn, self).__init__()
+        logger.debug(
+            f"Initialising {self.__class__.__name__} with "
+            f"input size = {input_size}, num_classes = {num_classes}"
+        )
+
+        self.U = nn.Linear(input_size, num_classes)
+        xavier_uniform_(self.U.weight)
+
+    def forward(self, x):
+        att = self.U.weight.matmul(x.transpose(1, 2))  # [bs, Y, seq_len]
+        alpha = F.softmax(att, dim=2)
+        m = alpha.matmul(x)  # [bs, Y, dim]
+        return m
+
+
+class OutputLayer(nn.Module):
+    def __init__(self, input_size, num_classes):
+        super(OutputLayer, self).__init__()
+        self.label_wise_attn = LabelWiseAttn(input_size, num_classes)
+
+        self.final = nn.Linear(input_size, num_classes)
+        xavier_uniform_(self.final.weight)
+
+    def forward(self, x):
+        m = self.label_wise_attn(x)
+        logits = self.final.weight.mul(m).sum(dim=2).add(self.final.bias)
+        return logits
diff --git a/src/modules/layers/label_wise_attn.py b/src/modules/layers/label_wise_attn.py
deleted file mode 100644
index cb2fa5e..0000000
--- a/src/modules/layers/label_wise_attn.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# flake8: noqa
-
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.nn.init import xavier_uniform_
-
-from src.utils.text_loggers import get_logger
-
-logger = get_logger(__name__)
-
-
-class LabelWiseAttn(nn.Module):
-    """
-    A Label-wise Attention layer (as implemented in CAML, DCAN, etc.).
-    References:
-        Papers: https://arxiv.org/abs/1802.05695 (Section 2.2)
-        Repository: https://github.com/jamesmullenbach/caml-mimic/blob/master/learn/models.py#L184
-
-    Args:
-        input_size (int): The size of the input, i.e., the number of channels
-                          if the output is from a convolutional layer/embedding
-                          size if the output is from a fully connected layer.
-        num_classes (int): The number of classes.
-    """
-
-    def __init__(self, input_size, num_classes):
-        super(LabelWiseAttn, self).__init__()
-        logger.debug(
-            f"Initialising {self.__class__.__name__} with "
-            f"input size = {input_size}, num_classes = {num_classes}"
-        )
-
-        self.U = nn.Linear(input_size, num_classes)
-        xavier_uniform_(self.U.weight)
-
-    def forward(self, x):
-        att = self.U.weight.matmul(x.transpose(1, 2))  # [bs, Y, seq_len]
-        alpha = F.softmax(att, dim=2)
-        m = alpha.matmul(x)  # [bs, Y, dim]
-        return m
diff --git a/src/modules/layers/temporal_block.py b/src/modules/layers/temporal_block.py
deleted file mode 100644
index cb4c598..0000000
--- a/src/modules/layers/temporal_block.py
+++ /dev/null
@@ -1,193 +0,0 @@
-# flake8: noqa
-
-import torch.nn as nn
-from torch.nn.init import xavier_uniform_
-from torch.nn.utils import weight_norm as weight_norm_
-
-from src.modules.activations import *
-from src.utils.mapper import ConfigMapper
-from src.utils.text_loggers import get_logger
-
-logger = get_logger(__name__)
-
-
-class Chomp1d(nn.Module):
-    def __init__(self, chomp_size):
-        super(Chomp1d, self).__init__()
-        self.chomp_size = chomp_size
-
-    def forward(self, x):
-        return x[:, :, : -self.chomp_size].contiguous()
-
-
-class ConvTemporalSubBlock(nn.Module):
-    """
-    A simple temporal convolutional block. Adapted from
-    https://github.com/shaoxiongji/DCAN/blob/master/models.py#L84-L88. This
-    layer has a dilated convolutional layer, a `chomp1d` layer, followed by
-    activation and dropout. For the parameters related to convolutional layers,
-    please see this:
-    https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html.
-
-    Args:
-        in_channels (int): The number of input channels in the convolutional
-                           layer.
-        out_channels (int): The number of output channels in the convolutional
-                            layer.
-        kernel_size (int): The size of the kernel in the convolutional layer.
-        stride (int): The stride of the convolutional layer.
-        padding (int): The padding of the convolutional layer.
-        dilation (int): The dilation size of the convolutional layer.
-        dropout (float): The dropout probability.
-        weight_norm (bool): Whether to apply weight normalization to the
-                            convolutional layer.
-        activation (str): The activation function to use. DCAN uses "relu".
-                          For all available activations, see
-                          https://github.com/dalgu90/icd-coding-benchmark/blob/main/src/modules/activations.py.
-    """
-
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        kernel_size,
-        stride,
-        padding,
-        dilation,
-        dropout=0.2,
-        weight_norm=True,
-        activation="relu",
-    ):
-        super(ConvTemporalSubBlock, self).__init__()
-        logger.debug(
-            f"Initialising {self.__class__.__name__} with "
-            f"in_channels = {in_channels}, out_channels = "
-            f"{out_channels}, kernel_size = {kernel_size}, "
-            f"stride = {stride}, padding = {padding}, "
-            f"dilation = {dilation}, dropout = {dropout}, "
-            f"weight_norm = {weight_norm}, activation = {activation}"
-        )
-
-        self.conv_layer = nn.Conv1d(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=padding,
-            dilation=dilation,
-        )
-        if weight_norm:
-            self.conv_layer = weight_norm_(self.conv_layer)
-        self.chomp1d = Chomp1d(padding)
-        self.activation = ConfigMapper.get_object("activations", activation)()
-        self.dropout = nn.Dropout(dropout)
-
-        self.__init_weights__()
-
-    def __init_weights__(self):
-        xavier_uniform_(self.conv_layer.weight)
-
-    def forward(self, x):
-        x = self.conv_layer(x)
-        x = self.chomp1d(x)
-        x = self.activation(x)
-        x = self.dropout(x)
-        return x
-
-
-class TemporalBlock(nn.Module):
-    """
-    A Temporal Block containing stacks of `ConvTemporalSubBlocks`, followed
-    by activation.
-    References:
-        Paper: https://arxiv.org/abs/2009.14578
-        Repository: https://github.com/shaoxiongji/DCAN/blob/master/models.py#L81
-
-    Args:
-        conv_channel_sizes (list): List of integers, with channel sizes of
-                                   convolutional layers. For example, if the
-                                   list is [100, 200, 300], there will be two
-                                   convolutional layers: Conv1d(100, 200) and
-                                   Conv1d(200, 300).
-        kernel_sizes (list): List of integers, with kernel sizes of every
-                             `ConvTemporalSubBlock`.
-        strides (list): List of integers, with strides of convolutional layers.
-        paddings (list): List of integers, with paddings of every
-                         `ConvTemporalSubBlock`.
-        dilations (list): List of integers, with dilation sizes of every
-                          `ConvTemporalSubBlock`.
-        dropouts (list): List of floats, with dropout probabilities of every
-                         `ConvTemporalSubBlock`.
-        weight_norm (bool): Whether to apply weight normalization to every
-                             convolutional layer. DCAN uses weight norm.
-        activation (str): The activation function to use. DCAN uses "relu".
-    """
-
-    def __init__(
-        self,
-        conv_channel_sizes,
-        kernel_sizes,
-        strides,
-        paddings,
-        dilations,
-        dropouts,
-        weight_norm=True,
-        activation="relu",
-    ):
-        super(TemporalBlock, self).__init__()
-        conv_channel_size_pairs = list(
-            zip(conv_channel_sizes[:-1], conv_channel_sizes[1:])
-        )
-
-        self.conv_temporal_sub_blocks = nn.ModuleList(
-            [
-                ConvTemporalSubBlock(
-                    in_channels=conv_channel_size_pair[0],
-                    out_channels=conv_channel_size_pair[1],
-                    kernel_size=kernel_size,
-                    stride=stride,
-                    padding=padding,
-                    dilation=dilation,
-                    dropout=dropout,
-                    weight_norm=weight_norm,
-                    activation=activation,
-                )
-                for (
-                    conv_channel_size_pair,
-                    kernel_size,
-                    stride,
-                    padding,
-                    dilation,
-                    dropout,
-                ) in zip(
-                    conv_channel_size_pairs,
-                    kernel_sizes,
-                    strides,
-                    paddings,
-                    dilations,
-                    dropouts,
-                )
-            ]
-        )
-
-        self.downsample = (
-            nn.Conv1d(conv_channel_sizes[0], conv_channel_sizes[-1], 1)
-            if conv_channel_sizes[0] != conv_channel_sizes[-1]
-            else None
-        )
-        self.output_activation = ConfigMapper.get_object(
-            "activations", activation
-        )()
-
-        self.init_weights()
-
-    def init_weights(self):
-        if self.downsample is not None:
-            xavier_uniform_(self.downsample.weight)
-
-    def forward(self, x):
-        conv_layer_output = x
-        for conv_temporal_sub_block in self.conv_temporal_sub_blocks:
-            conv_layer_output = conv_temporal_sub_block(conv_layer_output)
-        res = x if self.downsample is None else self.downsample(x)
-        return self.output_activation(conv_layer_output + res)
diff --git a/src/modules/layers/temporal_conv_net.py b/src/modules/layers/temporal_conv_net.py
deleted file mode 100644
index 3222a95..0000000
--- a/src/modules/layers/temporal_conv_net.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# flake8: noqa
-
-import torch.nn as nn
-
-from src.modules.activations import *
-from src.modules.layers.temporal_block import TemporalBlock
-from src.utils.text_loggers import get_logger
-
-logger = get_logger(__name__)
-
-
-class TemporalConvNet(nn.Module):
-    """
-    Stack of `TemporalBlock`s. Used in the DCAN model.
-    References:
-        Paper: https://arxiv.org/abs/2009.14578
-        Repository: https://github.com/shaoxiongji/DCAN/blob/master/models.py#L114
-
-    Args:
-        conv_channel_sizes_ (list): List of lists of integers. Each list
-                                    represents the channel sizes of convolutional
-                                    layers in a `TemporalBlock`. So, for
-                                    example, if the list is [[100, 600, 600],
-                                                             [600, 600, 600]].
-                                    the `TemporalConvNet` layer will have 2
-                                    `TemporalBlock`s, each temporal block have
-                                    2 convolutional layers:
-                                    Conv(100, 600), Conv(600, 600) for the first
-                                    one, and Conv(600, 600), Conv(600, 600). If
-                                    the `add_emb_size_to_channel_sizes`, we
-                                    don't have to pass the input channel size.
-                                    So, in the above case, we can just pass
-                                    [[600, 600], [600, 600, 600]].
-        kernel_sizes_ (list): List of list of integers (same format as
-                              `conv_channel_sizes`). Each integer represents the
-                              kernel size/filter size of the respective
-                              convolutional layer in `TemporalBlock` layer.
-        strides_ (list): List of list of integers (same format as
-                         `conv_channel_sizes`). Each integer represents the
-                         stride of the respective convolutional layer in
-                         `TemporalBlock` layer.
-        paddings_ (list): List of list of integers (same format as
-                          `conv_channel_sizes`). Each integer represents the
-                          padding of the respective convolutional layer in
-                          `TemporalBlock` layer. in DCAN, this value is set to
-                          "(kernel_size - 1) * dilation_size".
-        dilations_ (list): List of list of integers (same format as
-                           `conv_channel_sizes`). Each integer represents the
-                           dilation size of the respective convolutional layer
-                           `TemporalBlock` layer.` In DCAN, this value is
-                           "2^(temporal_block_level)".
-        dropouts_ (list): List of list of floats (same format as
-                          `conv_channel_sizes`). Each float represents the
-                          dropout probability of the respective convolutional
-                          `TemporalBlock` layer.
-        weight_norm (bool): If True, apply weight normalization to the
-                            convolutional layers.
-        activation (str): Activation function to use. DCAN uses "relu".
-    """
-
-    def __init__(
-        self,
-        conv_channel_sizes_,
-        kernel_sizes_,
-        strides_,
-        paddings_,
-        dilations_,
-        dropouts_,
-        weight_norm=True,
-        activation="relu",
-    ):
-        super(TemporalConvNet, self).__init__()
-        logger.debug(
-            f"Initialising {self.__class__.__name__} with "
-            f"conv_channel_sizes_ = {conv_channel_sizes_}, "
-            f"kernel_sizes_ = {kernel_sizes_}, "
-            f"strides_ = {strides_}, paddings_ = {paddings_}, "
-            f"dilations_ = {dilations_}, dropouts_ = {dropouts_}, "
-            f"weight_norm = {weight_norm}, activation = {activation}"
-        )
-
-        self.temporal_blocks = nn.ModuleList(
-            [
-                TemporalBlock(
-                    conv_channel_sizes=conv_channel_sizes,
-                    kernel_sizes=kernel_sizes,
-                    strides=strides,
-                    paddings=paddings,
-                    dilations=dilations,
-                    dropouts=dropouts,
-                    weight_norm=weight_norm,
-                    activation=activation,
-                )
-                for (
-                    conv_channel_sizes,
-                    kernel_sizes,
-                    strides,
-                    paddings,
-                    dilations,
-                    dropouts,
-                ) in zip(
-                    conv_channel_sizes_,
-                    kernel_sizes_,
-                    strides_,
-                    paddings_,
-                    dilations_,
-                    dropouts_,
-                )
-            ]
-        )
-
-    def forward(self, x):
-        for temporal_block in self.temporal_blocks:
-            x = temporal_block(x)
-        return x
diff --git a/src/modules/layers/word_embedding_layer.py b/src/modules/layers/word_embedding_layer.py
deleted file mode 100644
index ecdfd43..0000000
--- a/src/modules/layers/word_embedding_layer.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import torch
-import torch.nn as nn
-
-from src.utils.mapper import ConfigMapper
-from src.utils.text_loggers import get_logger
-
-logger = get_logger(__name__)
-
-
-class WordEmbeddingLayer(nn.Module):
-    """
-    A Word Embedding Layer. This layer loads a pre-trained word embedding matrix
-    , and copies its weights to an nn.Embedding layer.
-
-    Args:
-        embed_dir (str): A directory containing the pre-trained word embedding
-                         matrix, among other things. Please see
-                         https://github.com/dalgu90/icd-coding-benchmark/blob/main/src/modules/embeddings.py#L17
-                         for more details.
-        dropout (float): The dropout probability.
-    """
-
-    def __init__(self, embed_dir, dropout):
-        super(WordEmbeddingLayer, self).__init__()
-        logger.debug(
-            f"Initialising {self.__class__.__name__} with "
-            f"embed_dir = {embed_dir}, dropout = {dropout}"
-        )
-
-        # Note: This should be changed, since we won't always use Word2Vec.
-        embedding_cls = ConfigMapper.get_object("embeddings", "word2vec")
-
-        W = torch.Tensor(embedding_cls.load_emb_matrix(embed_dir))
-        self.embed = nn.Embedding(W.size()[0], W.size()[1], padding_idx=0)
-        self.embed.weight.data = W.clone()
-
-        self.embedding_size = self.embed.embedding_dim
-
-        self.dropout = nn.Dropout(dropout)
-
-    def forward(self, x):
-        embedding = self.embed(x)
-        x = self.dropout(embedding)
-        return x